/*
 * Decompiled with CFR 0.152.
 */
package it.unimi.dsi.mg4j.tool;

import gnu.getopt.Getopt;
import gnu.getopt.LongOpt;
import it.unimi.dsi.fastutil.ints.IntArrays;
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import it.unimi.dsi.mg4j.io.FastBufferedInputStream;
import it.unimi.dsi.mg4j.io.FastBufferedReader;
import it.unimi.dsi.mg4j.io.InputBitStream;
import it.unimi.dsi.mg4j.io.OutputBitStream;
import it.unimi.dsi.mg4j.tool.Occurrence;
import it.unimi.dsi.mg4j.util.Fast;
import it.unimi.dsi.mg4j.util.MinimalPerfectHash;
import it.unimi.dsi.mg4j.util.MutableString;
import it.unimi.dsi.mg4j.util.ProgressMeter;
import java.io.BufferedWriter;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.Properties;

/**
 * Command-line tool that builds a set of batches from a list of documents.
 *
 * <p>Documents are read as words from standard input, separated by a delimiter
 * character. Every word occurrence is recorded as a (term, document, position)
 * triple; each time a batch fills up it is count-sorted and written to
 * <code>BASENAME.batchN</code>. Besides the batches the tool writes
 * <code>BASENAME.sizes</code> (one gamma-coded document length per document),
 * <code>BASENAME.frequencies</code> (for each term, the number of documents it
 * appears in), <code>BASENAME.properties</code> and, unless <code>--zeroth</code>
 * is given, <code>BASENAME.terms</code> (terms in order of first appearance).
 */
public final class FirstPass {
    /** Buffer size handed to the buffered streams; parsed from the --buffer-size option. */
    private static int bufferSize;
    /** Scratch array, as long as a batch, used as the other side of the count sorts. */
    private static Occurrence[] temp;
    /** Counting array handed to the count sorts; regrown on demand by {@link #writeOccurrences}. */
    private static int[] count;
    /** The BASENAME argument followed by a dot; every file name is built by appending to it. */
    private static String baseName;
    /** Number of documents between two re-evaluations of the hash-set trim threshold. */
    private static final int TRIM_CHECK = 4096;
    // NOTE(review): the two fractions are not referenced by the code; presumably the integer
    // watermarks below were derived from them (TRIM_CHECK * fraction, truncated) -- confirm.
    private static final double TRIM_HIGH_WATERMARK_FRACTION = 0.0025;
    /** If more than this many documents per window exceed the threshold, the threshold doubles. */
    private static final int TRIM_HIGH_WATERMARK = 10;
    private static final double TRIM_LOW_WATERMARK_FRACTION = 5.0E-4;
    /** If at most this many documents per window exceed half the threshold, the threshold halves. */
    private static final int TRIM_LOW_WATERMARK = 2;
    private static final boolean ASSERTS = false;

    /**
     * Sorts the first <code>numOccurrences</code> occurrences and writes them to the
     * file <code>BASENAME.batch&lt;batch&gt;</code>.
     *
     * @param occurrence the occurrences collected so far (reordered in place when sorting on documents too).
     * @param numOccurrences the number of valid entries at the start of <code>occurrence</code>.
     * @param numDocuments the number of documents passed to the document count sort.
     * @param numTerms the number of terms passed to the term count sort.
     * @param batch the index of the batch, used in the file name.
     * @param sortOnTermsOnly if true only the term sort is performed; otherwise the
     *        occurrences are sorted on documents first and then on terms.
     * @throws IOException if the batch cannot be written; a message naming the batch is
     *         printed to standard error before the exception is rethrown.
     */
    private static final void writeOccurrences(Occurrence[] occurrence, int numOccurrences, int numDocuments, int numTerms, int batch, boolean sortOnTermsOnly) throws IOException {
        FileOutputStream rawOut = null;
        try {
            rawOut = new FileOutputStream(baseName + "batch" + batch);
            OutputBitStream out = new OutputBitStream(rawOut, bufferSize);
            if (sortOnTermsOnly) {
                if (count.length < numTerms) {
                    // Drop the old array before allocating the larger one.
                    count = null;
                    count = new int[numTerms];
                }
                Occurrence.countSortOnTerms(occurrence, temp, numOccurrences, count, numTerms);
                Occurrence.writeOccurrences(temp, numOccurrences, out);
            } else {
                int needed = Math.max(numTerms, numDocuments);
                if (count.length < needed) {
                    count = null;
                    count = new int[needed];
                }
                // Two passes: occurrence -> temp (by document), then temp -> occurrence (by term).
                Occurrence.countSortOnDocuments(occurrence, temp, numOccurrences, count, numDocuments);
                Occurrence.countSortOnTerms(temp, occurrence, numOccurrences, count, numTerms);
                Occurrence.writeOccurrences(occurrence, numOccurrences, out);
            }
            out.close();
        } catch (IOException e) {
            System.err.println("I/O Error on batch " + batch);
            throw e;
        } finally {
            // Releases the file handle when sorting or writing failed; a no-op after a successful close.
            closeQuietly(rawOut);
        }
    }

    /**
     * Closes a resource, ignoring a null argument and any I/O error raised by the close itself.
     * Meant for <code>finally</code> blocks, where a close failure must not mask the original exception.
     */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Best effort: the caller is either done with the resource or already failing.
        }
    }

    /** Prints the help screen to standard error, showing the option values current at the time of the call. */
    private static void printUsage(String charSet, String batchSizeSpec, String bufferSizeSpec, int delimiter, int quantum) {
        System.err.println("Usage: FirstPass [OPTIONS] BASENAME");
        System.err.println("Builds a set of batches from a list of documents.");
        System.err.println("");
        System.err.println("Mandatory arguments:");
        System.err.println("  BASENAME    all file names will be stemmed from this name");
        System.err.println("");
        System.err.println("Optional arguments:");
        System.err.println("  -z, --zeroth              assume ZerothPass has been run and BASENAME.mph");
        System.err.println("                            has been built.");
        System.err.println("  -c, --charset             the input charset encoding (default: " + charSet + ')');
        System.err.println("  -s, --batch-size          the size of a batch (default: " + batchSizeSpec + ')');
        System.err.println("  -b, --buffer-size         the size of an I/O buffer (default: " + bufferSizeSpec + ')');
        System.err.println("  -d, --delimiter           the Unicode index of the document delimiter (default: " + delimiter + ')');
        System.err.println("  -p, --permutation         the name of a permutation file");
        System.err.println("  -q, --dot-meter-quantum   the quantum of the process meter (default: " + quantum + ')');
        System.err.println("  -i, --case-insensitive    create a case-insensitive index (input must be downcase)");
        System.err.println("  --downcase                downcase all words (and create a case-insensitive index)");
        System.err.println("");
        System.err.println("Help:");
        System.err.println("  -h, --help        print this help screen");
        System.err.println("");
    }

    /** Reads the <code>documents</code> property from <code>BASENAME.properties</code>. */
    private static int readDocumentCount() throws IOException {
        Properties properties = new Properties();
        FileInputStream in = new FileInputStream(baseName + "properties");
        try {
            properties.load(in);
        } finally {
            closeQuietly(in);
        }
        return Integer.parseInt(properties.getProperty("documents"));
    }

    /** Deserializes the minimal perfect hash stored in <code>BASENAME.mph</code>. */
    private static MinimalPerfectHash readTermHash() throws IOException, ClassNotFoundException {
        FileInputStream rawIn = new FileInputStream(baseName + "mph");
        try {
            return (MinimalPerfectHash)new ObjectInputStream(new FastBufferedInputStream(rawIn, bufferSize)).readObject();
        } finally {
            closeQuietly(rawIn);
        }
    }

    /**
     * Reads a permutation stored as a sequence of 4-byte integers; the number of
     * integers is the file length divided by four.
     *
     * <p>Each value is only checked to lie in <code>[0, length)</code>; duplicates are not detected.
     *
     * @param permFile the name of the permutation file.
     * @return the array read, or <code>null</code> if an out-of-range value was found
     *         (the offending mapping is reported on standard error).
     */
    private static int[] readPermutation(String permFile) throws IOException {
        System.err.print("Reading permutation...");
        int length = (int)(new File(permFile).length() / 4L);
        int[] perm = new int[length];
        FileInputStream rawIn = new FileInputStream(permFile);
        try {
            DataInputStream in = new DataInputStream(new FastBufferedInputStream(rawIn, bufferSize));
            for (int i = 0; i < length; i++) {
                int target = in.readInt();
                if (target < 0 || target >= length) {
                    System.err.println("The permutation file contains the illegal mapping " + i + " |-> " + target);
                    return null;
                }
                perm[i] = target;
            }
            in.close();
        } finally {
            closeQuietly(rawIn);
        }
        System.err.println(" done.");
        return perm;
    }

    /** Writes the first <code>numTerms</code> entries of <code>frequency</code>, gamma-coded, to <code>BASENAME.frequencies</code>. */
    private static void writeFrequencies(int[] frequency, int numTerms) throws IOException {
        FileOutputStream rawOut = new FileOutputStream(baseName + "frequencies");
        try {
            OutputBitStream out = new OutputBitStream(rawOut);
            for (int i = 0; i < numTerms; i++) {
                out.writeGamma(frequency[i]);
            }
            out.close();
        } finally {
            closeQuietly(rawOut);
        }
    }

    /**
     * Rewrites <code>BASENAME.sizes</code> so that the size read at input position
     * <code>i</code> ends up at position <code>perm[i]</code>.
     *
     * @param perm the document permutation.
     * @param numDocs the number of sizes stored in the file.
     * @throws IllegalStateException if there are more sizes than permutation entries.
     */
    private static void permuteSizes(int[] perm, int numDocs) throws IOException {
        if (numDocs > perm.length) {
            throw new IllegalStateException("Cannot permute the sizes of " + numDocs + " documents using a permutation of " + perm.length + " integers");
        }
        int[] size = new int[numDocs];
        FileInputStream rawIn = new FileInputStream(baseName + "sizes");
        try {
            InputBitStream in = new InputBitStream(rawIn, bufferSize);
            for (int i = 0; i < numDocs; i++) {
                size[i] = in.readGamma();
            }
            in.close();
        } finally {
            closeQuietly(rawIn);
        }
        int[] invPerm = new int[perm.length];
        for (int i = 0; i < numDocs; i++) {
            invPerm[perm[i]] = i;
        }
        FileOutputStream rawOut = new FileOutputStream(baseName + "sizes");
        try {
            OutputBitStream out = new OutputBitStream(rawOut, bufferSize);
            for (int i = 0; i < numDocs; i++) {
                out.writeGamma(size[invPerm[i]]);
            }
            out.close();
        } finally {
            closeQuietly(rawOut);
        }
    }

    /** Stores the summary of this pass in <code>BASENAME.properties</code>; <code>permFile</code> is recorded only when non-null. */
    private static void writeProperties(int documents, int numTerms, int maxDocSize, boolean isCaseSensitive, int batches, long totOccurrences, int occsPerBatch, String permFile) throws IOException {
        Properties properties = new Properties();
        properties.setProperty("documents", String.valueOf(documents));
        properties.setProperty("terms", String.valueOf(numTerms));
        properties.setProperty("maxdocsize", String.valueOf(maxDocSize));
        properties.setProperty("iscasesensitive", Boolean.toString(isCaseSensitive));
        properties.setProperty("batches", String.valueOf(batches));
        properties.setProperty("occurrences", String.valueOf(totOccurrences));
        properties.setProperty("occsperbatch", String.valueOf(occsPerBatch));
        if (permFile != null) {
            properties.setProperty("permutation", permFile);
        }
        FileOutputStream propertyFile = new FileOutputStream(baseName + "properties");
        try {
            properties.store(propertyFile, "FirstPass properties");
            propertyFile.close();
        } finally {
            closeQuietly(propertyFile);
        }
    }

    /**
     * Entry point: parses the options, reads the documents from standard input and
     * writes the batches and the auxiliary files.
     *
     * @param arg the command-line arguments (options followed by exactly one BASENAME).
     * @throws FileNotFoundException if an input file (properties, mph, permutation) is missing.
     * @throws IOException if reading one of those input files fails; I/O errors raised
     *         while indexing are instead printed and swallowed (see the catch block).
     * @throws ClassNotFoundException if the minimal perfect hash cannot be deserialized.
     * @throws IllegalStateException if the permutation disagrees with the ZerothPass
     *         document count, or the input contains more documents than the permutation.
     */
    public static final void main(String[] arg) throws FileNotFoundException, IOException, ClassNotFoundException {
        int ch;
        boolean zeroth = false;
        int delimiter = 10;
        String permFile = null;
        String bufferSizeSpec = "1Mi";
        String batchSizeSpec = "2Mi";
        String charSet = "UTF-8";
        int quantum = 10000;
        int[] perm = null;
        int numDocuments = -1;
        Object2IntOpenHashMap termMap = null;
        MinimalPerfectHash termMph = null;
        boolean isCaseSensitive = true;
        boolean downcase = false;
        LongOpt[] longopts = new LongOpt[]{
            new LongOpt("help", LongOpt.NO_ARGUMENT, null, 'h'),
            new LongOpt("zeroth", LongOpt.NO_ARGUMENT, null, 'z'),
            new LongOpt("batch-size", LongOpt.REQUIRED_ARGUMENT, null, 's'),
            new LongOpt("buffer-size", LongOpt.REQUIRED_ARGUMENT, null, 'b'),
            new LongOpt("delimiter", LongOpt.REQUIRED_ARGUMENT, null, 'd'),
            new LongOpt("permutation", LongOpt.REQUIRED_ARGUMENT, null, 'p'),
            new LongOpt("charset", LongOpt.REQUIRED_ARGUMENT, null, 'c'),
            new LongOpt("dot-meter-quantum", LongOpt.REQUIRED_ARGUMENT, null, 'q'),
            new LongOpt("case-insensitive", LongOpt.NO_ARGUMENT, null, 'i'),
            new LongOpt("downcase", LongOpt.NO_ARGUMENT, null, 1)
        };
        // 'i' is part of the short options so that the documented "-i" flag is accepted.
        Getopt g = new Getopt("FirstPass", arg, "zq:c:p:d:b:s:hi", longopts);
        g.setOpterr(true);
        while ((ch = g.getopt()) != -1) {
            switch (ch) {
                case 'h':
                    printUsage(charSet, batchSizeSpec, bufferSizeSpec, delimiter, quantum);
                    return;
                case 's':
                    batchSizeSpec = g.getOptarg();
                    break;
                case 'b':
                    bufferSizeSpec = g.getOptarg();
                    break;
                case 'c':
                    charSet = g.getOptarg();
                    break;
                case 'z':
                    zeroth = true;
                    break;
                case 'd':
                    delimiter = (char)Integer.parseInt(g.getOptarg());
                    break;
                case 'p':
                    permFile = g.getOptarg();
                    break;
                case 'q':
                    quantum = Integer.parseInt(g.getOptarg());
                    break;
                case 'i':
                    isCaseSensitive = false;
                    break;
                case 1:
                    // --downcase (long option only).
                    isCaseSensitive = false;
                    downcase = true;
                    break;
                case '?':
                    // Invalid option: Getopt has already reported it.
                    return;
            }
        }
        bufferSize = Fast.parseIntSize(bufferSizeSpec);
        int occsPerBatch = Fast.parseIntSize(batchSizeSpec);
        int numArguments = arg.length - g.getOptind();
        if (numArguments != 1) {
            System.err.println("Wrong number (" + numArguments + ") of arguments.");
            return;
        }
        baseName = arg[g.getOptind()] + '.';
        if (zeroth) {
            numDocuments = readDocumentCount();
        }
        System.err.println("Documents will be separated by the Unicode character " + delimiter);
        temp = new Occurrence[occsPerBatch];
        // Terms already seen in the current document, so each term is counted once per document.
        IntOpenHashSet termInDoc = new IntOpenHashSet();
        FastBufferedReader input = new FastBufferedReader(new InputStreamReader(System.in, charSet), bufferSize);
        MutableString word = new MutableString();
        if (zeroth) {
            termMph = readTermHash();
        } else {
            termMap = new Object2IntOpenHashMap();
            termMap.defaultReturnValue(-1);
        }
        long totOccurrences = 0L;
        int n = 0; // index of the current document
        int numTerms = zeroth ? termMph.size() : 0;
        int batch = 0;
        int numOccurrences = 0;
        int pos = 0; // position of the next word inside the current document
        int trimThreshold = 256;
        int aboveThreshold = 0;
        int aboveHalfThreshold = 0;
        int[] frequency = new int[zeroth ? numTerms : 1024];
        count = new int[Math.max(numDocuments, numTerms)];
        int maxDocSize = 0;
        ProgressMeter pm = new ProgressMeter(1000000);
        if (permFile != null) {
            perm = readPermutation(permFile);
            if (perm == null) {
                return;
            }
            if (numDocuments < 0) {
                numDocuments = perm.length;
            } else if (numDocuments != perm.length) {
                throw new IllegalStateException("The permutation file contains " + perm.length + " integers, but the ZerothPass property file claims that there are " + numDocuments + " documents");
            }
        }
        Occurrence[] occurrence = new Occurrence[occsPerBatch];
        System.err.print("Creating occurrences...");
        pm.itemsName("objects");
        pm.start();
        for (int i = occurrence.length - 1; i >= 0; i--) {
            occurrence[i] = new Occurrence();
        }
        pm.stop();
        pm.count(occurrence.length);
        pm.done();
        System.err.print("Indexing documents...");
        pm.quantum(quantum);
        pm.itemsName("documents");
        if (zeroth) {
            pm.expectedUpdates(numDocuments);
        }
        pm.start();
        FileOutputStream rawTerms = null;
        FileOutputStream rawSizes = null;
        try {
            PrintWriter termFile = null;
            if (!zeroth) {
                rawTerms = new FileOutputStream(baseName + "terms");
                termFile = new PrintWriter(new BufferedWriter(new OutputStreamWriter(rawTerms, "UTF-8")));
            }
            rawSizes = new FileOutputStream(baseName + "sizes");
            OutputBitStream sizeFile = new OutputBitStream(rawSizes);
            int c;
            while ((c = input.readWord(word)) != -1) {
                if (word.length() != 0) {
                    if (downcase) {
                        word.toLowerCase();
                    }
                    int t;
                    if (zeroth) {
                        t = termMph.get(word);
                    } else {
                        t = termMap.getInt((Object)word);
                        if (t == -1) {
                            // First time this term is seen: number it and append it to the term file.
                            t = numTerms++;
                            termMap.put((Object)word.copy(), t);
                            word.println(termFile);
                            if (numTerms % 1000000 == 0) {
                                System.err.print("[" + Fast.format(numTerms) + " term(s), " + Fast.format(totOccurrences) + " occ(s)]");
                            }
                        }
                    }
                    if (termInDoc.add(t)) {
                        if (t >= frequency.length) {
                            frequency = IntArrays.grow(frequency, t + 1);
                        }
                        frequency[t]++;
                    }
                    if (perm != null) {
                        if (n >= perm.length) {
                            throw new IllegalStateException("The input stream contains more documents than the " + perm.length + " integers in the permutation");
                        }
                        occurrence[numOccurrences++].set(t, perm[n], pos);
                    } else {
                        occurrence[numOccurrences++].set(t, n, pos);
                    }
                    ++totOccurrences;
                    if (numOccurrences == occsPerBatch) {
                        // Without a permutation the occurrences are already in document order,
                        // so sorting on terms alone is requested.
                        FirstPass.writeOccurrences(occurrence, numOccurrences, numDocuments < 0 ? n + 1 : numDocuments, numTerms, batch++, perm == null);
                        numOccurrences = 0;
                    }
                    ++pos;
                }
                if (c != delimiter) {
                    continue;
                }
                // End of document.
                ++n;
                pm.update();
                sizeFile.writeGamma(pos);
                if (pos > maxDocSize) {
                    maxDocSize = pos;
                }
                pos = 0;
                int distinctTerms = termInDoc.size();
                if (distinctTerms > trimThreshold / 2) {
                    ++aboveHalfThreshold;
                    if (distinctTerms > trimThreshold) {
                        ++aboveThreshold;
                    }
                }
                if (n % TRIM_CHECK == 0) {
                    // Adapt the size the per-document set is trimmed to, based on the last window.
                    if (aboveThreshold > TRIM_HIGH_WATERMARK) {
                        trimThreshold *= 2;
                    } else if (aboveHalfThreshold <= TRIM_LOW_WATERMARK && trimThreshold > 1) {
                        trimThreshold /= 2;
                    }
                    aboveThreshold = 0;
                    aboveHalfThreshold = 0;
                }
                termInDoc.clear();
                termInDoc.trim(trimThreshold);
            }
            if (pos != 0) {
                // The input ended without a final delimiter: account for the last document,
                // including its contribution to the maximum document size.
                ++n;
                pm.update();
                sizeFile.writeGamma(pos);
                if (pos > maxDocSize) {
                    maxDocSize = pos;
                }
            }
            if (numOccurrences > 0) {
                FirstPass.writeOccurrences(occurrence, numOccurrences, numDocuments < 0 ? n + 1 : numDocuments, numTerms, batch++, perm == null);
            }
            pm.done();
            if (numDocuments >= 0 && n != numDocuments) {
                System.err.println("WARNING: The input stream contains " + n + " documents, but the ZerothPass property file claims that there are " + numDocuments + " documents");
            }
            if (perm != null && n != perm.length) {
                System.err.println("WARNING: The input stream contains " + n + " documents, but the permutation contains " + perm.length + " integers");
            }
            if (!zeroth) {
                termFile.close();
            }
            sizeFile.close();
            // Drop the large structures before the size arrays are allocated.
            termMap = null;
            termMph = null;
            temp = null;
            occurrence = null;
            count = null;
            writeFrequencies(frequency, numTerms);
            frequency = null;
            if (perm != null) {
                permuteSizes(perm, n);
            }
            writeProperties(n, numTerms, maxDocSize, isCaseSensitive, batch, totOccurrences, occsPerBatch, perm != null ? permFile : null);
        } catch (IOException dontCare) {
            // NOTE(review): I/O failures while indexing are only reported; main still returns
            // normally, so the process exit status does not reflect the failure.
            dontCare.printStackTrace();
        } finally {
            // No-ops after a successful run; release the handles when the pass failed midway.
            closeQuietly(rawTerms);
            closeQuietly(rawSizes);
        }
    }

    /** Not instantiable: this class only hosts the command-line entry point. */
    private FirstPass() {
    }
}

