/*
 * Decompiled with CFR 0.152.
 */
package it.unimi.dsi.mg4j.tool;

import gnu.getopt.Getopt;
import gnu.getopt.LongOpt;
import it.unimi.dsi.fastutil.objects.ObjectArrays;
import it.unimi.dsi.fastutil.objects.ObjectHeapSemiIndirectPriorityQueue;
import it.unimi.dsi.fastutil.objects.ObjectIterators;
import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet;
import it.unimi.dsi.fastutil.objects.ObjectSet;
import it.unimi.dsi.mg4j.io.FastBufferedReader;
import it.unimi.dsi.mg4j.util.Fast;
import it.unimi.dsi.mg4j.util.MutableString;
import it.unimi.dsi.mg4j.util.ProgressMeter;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Properties;

/**
 * Command-line tool that reads words from standard input and produces the
 * sorted list of distinct terms.
 *
 * <p>Given a basename, the tool writes:
 * <ul>
 * <li><code>BASENAME.terms</code> &mdash; the distinct terms, sorted;</li>
 * <li><code>BASENAME.terms.unsorted</code> (or <code>.unsorted.dups</code> when
 * batching is enabled) &mdash; the terms in order of first appearance, unless
 * disabled with <code>-n</code>;</li>
 * <li><code>BASENAME.properties</code> &mdash; the basename, the document count
 * and the term count.</li>
 * </ul>
 *
 * <p>When a maximum batch size is given, terms are dumped in sorted batches
 * (<code>BASENAME.terms.0</code>, <code>BASENAME.terms.1</code>, ...) that are
 * merged at the end and then deleted.
 */
public final class ZerothPass {
    /** The name of this class, used in the usage message and as the getopt program name. */
    private static final String THIS_CLASS_NAME = ZerothPass.class.getName();

    /**
     * Merges several sorted term files into a single sorted file, dropping duplicates.
     *
     * <p>Every input file is read one word at a time; an empty word is treated as
     * the end of that file. All input files are closed before this method returns,
     * whether it completes normally or not, so that the caller can safely delete them.
     *
     * @param inputFilename the names of the sorted input files (read as UTF-8).
     * @param outputFilename the name of the merged output file (written as UTF-8).
     * @param bufferSize the size of the I/O buffers.
     * @param pm a progress meter updated once per word read, or <code>null</code>.
     * @return the number of distinct terms written to the output file.
     * @throws IOException if a file cannot be opened or read, or if writing the output fails.
     */
    private static final int mergeSortedFiles(String[] inputFilename, String outputFilename, int bufferSize, ProgressMeter pm) throws IOException {
        final int n = inputFilename.length;
        final MutableString[] string = new MutableString[n];
        // The raw streams are kept so that the underlying files can be released in the finally block.
        final InputStream[] stream = new InputStream[n];
        final FastBufferedReader[] reader = new FastBufferedReader[n];
        // The queue orders the indices 0..n-1 by the current content of string[].
        final ObjectHeapSemiIndirectPriorityQueue queue = new ObjectHeapSemiIndirectPriorityQueue(string, n);
        PrintWriter writer = null;
        int numTerms = 0;

        try {
            for (int i = 0; i < n; i++) {
                stream[i] = new FileInputStream(inputFilename[i]);
                reader[i] = new FastBufferedReader(new InputStreamReader(stream[i], "UTF-8"), bufferSize);
                string[i] = new MutableString();
                reader[i].readWord(string[i]);
                // Empty files never enter the queue.
                if (string[i].length() != 0) {
                    queue.enqueue(i);
                }
            }

            writer = new PrintWriter(new BufferedWriter(new OutputStreamWriter((OutputStream)new FileOutputStream(outputFilename), "UTF-8"), bufferSize));
            final MutableString prev = new MutableString();

            if (pm != null) {
                pm.start("Merging...");
                pm.itemsName("items");
            }

            while (!queue.isEmpty()) {
                final int curr = queue.first();
                // Equal terms come out of the queue consecutively: emit only the first one.
                if (!string[curr].equals(prev)) {
                    ++numTerms;
                    string[curr].println(writer);
                    prev.replace(string[curr]);
                }
                reader[curr].readWord(string[curr]);
                if (string[curr].length() == 0) {
                    // This input is exhausted.
                    queue.dequeue();
                } else {
                    // The top element changed in place: restore the heap order.
                    queue.changed();
                }
                if (pm != null) {
                    pm.update();
                }
            }

            if (pm != null) {
                pm.done();
            }
        }
        finally {
            if (writer != null) {
                writer.close();
            }
            for (int i = 0; i < n; i++) {
                if (stream[i] != null) {
                    try {
                        stream[i].close();
                    }
                    catch (IOException ignored) {
                        // The stream was only read from; a failure on close loses no data.
                    }
                }
            }
        }

        // PrintWriter never throws: its error flag is the only way to detect a failed write.
        if (writer.checkError()) {
            throw new IOException("An error occurred while writing " + outputFilename);
        }
        return numTerms;
    }

    /**
     * Sorts a set of terms and writes them, one per line, to a UTF-8 file.
     *
     * @param terms the set of terms to dump.
     * @param filename the name of the output file.
     * @param bufferSize the size of the output buffer.
     * @param support a scratch array used for sorting; it is replaced locally by a
     * larger one if it cannot hold all terms, and the array actually used is
     * filled with <code>null</code> afterwards.
     * @throws IOException if the file cannot be created or writing fails.
     */
    private static final void dumpSortedTerms(ObjectSet terms, String filename, int bufferSize, MutableString[] support) throws IOException {
        final int n = terms.size();
        support = (MutableString[])ObjectArrays.ensureCapacity((Object[])support, (int)n, (int)0);
        ObjectIterators.unwrap((Iterator)terms.iterator(), (Object[])support);
        System.err.print("Sorting terms...");
        Arrays.sort(support, 0, n);
        System.err.print(" saving...");

        final PrintWriter termFile = new PrintWriter(new BufferedWriter(new OutputStreamWriter((OutputStream)new FileOutputStream(filename), "UTF-8"), bufferSize));
        try {
            for (int i = 0; i < n; i++) {
                support[i].println(termFile);
            }
        }
        finally {
            termFile.close();
            // Drop the references to the terms so that they can be collected.
            ObjectArrays.fill((Object[])support, null);
        }

        // PrintWriter never throws: its error flag is the only way to detect a failed write.
        if (termFile.checkError()) {
            throw new IOException("An error occurred while writing " + filename);
        }
        System.err.print(" done.");
    }

    /**
     * Prints the usage message to standard error.
     *
     * <p>The values shown as defaults are the current ones, which may already
     * reflect options parsed before the help option.
     */
    private static void printUsage(String charSet, String bufferSizespec, int delimiter, int quantum) {
        System.err.println("Usage: " + THIS_CLASS_NAME + " [OPTIONS] BASENAME");
        System.err.println("Compute terms from a list of documents.");
        System.err.println("");
        System.err.println("Mandatory arguments:");
        System.err.println("  BASENAME    all file names will be stemmed from this name");
        System.err.println("");
        System.err.println("Optional arguments:");
        System.err.println("  -c, --charset             the input charset encoding (default: " + charSet + ')');
        System.err.println("  -b, --buffer-size         the size of an I/O buffer (default: " + bufferSizespec + ')');
        System.err.println("  -d, --delimiter           the Unicode index of the document delimiter (default: " + delimiter + ')');
        System.err.println("  -n, --no-unsorted         do not generate BASENAME.terms.unsorted");
        System.err.println("  -m, --max-batch-terms     maximum number of terms in a batch (default: unlimited)");
        System.err.println("  -q, --dot-meter-quantum   the quantum of the process meter (default: " + quantum + ')');
        System.err.println("  --downcase                downcase all words");
        System.err.println("");
        System.err.println("Help:");
        System.err.println("  -h, --help        print this help screen");
        System.err.println("");
    }

    /**
     * Entry point: parses the options, reads words from standard input and writes
     * the term files and the property file for the given basename.
     *
     * @param arg the command-line arguments (options followed by exactly one basename).
     * @throws UnsupportedEncodingException if the input charset is not supported.
     * @throws FileNotFoundException if the unsorted term file cannot be created.
     */
    public static final void main(String[] arg) throws UnsupportedEncodingException, FileNotFoundException {
        int delimiter = '\n';
        String charSet = "UTF-8";
        int quantum = 10000;
        int maxBatchTerms = Integer.MAX_VALUE;
        String bufferSizespec = "64Ki";
        boolean doUnsorted = true;
        boolean downcase = false;

        final LongOpt[] longopts = new LongOpt[] {
            new LongOpt("help", LongOpt.NO_ARGUMENT, null, 'h'),
            new LongOpt("buffer-size", LongOpt.REQUIRED_ARGUMENT, null, 'b'),
            new LongOpt("delimiter", LongOpt.REQUIRED_ARGUMENT, null, 'd'),
            new LongOpt("charset", LongOpt.REQUIRED_ARGUMENT, null, 'c'),
            new LongOpt("max-batch-terms", LongOpt.REQUIRED_ARGUMENT, null, 'm'),
            new LongOpt("dot-meter-quantum", LongOpt.REQUIRED_ARGUMENT, null, 'q'),
            new LongOpt("no-unsorted", LongOpt.NO_ARGUMENT, null, 'n'),
            // --downcase has no short form; 1 is its private option code.
            new LongOpt("downcase", LongOpt.NO_ARGUMENT, null, 1)
        };
        final Getopt getopt = new Getopt(THIS_CLASS_NAME, arg, "nq:c:d:b:m:h", longopts);
        getopt.setOpterr(true);

        try {
            int ch;
            while ((ch = getopt.getopt()) != -1) {
                switch (ch) {
                    case 'h':
                        printUsage(charSet, bufferSizespec, delimiter, quantum);
                        return;
                    case 'b':
                        bufferSizespec = getopt.getOptarg();
                        break;
                    case 'c':
                        charSet = getopt.getOptarg();
                        break;
                    case 'd':
                        // The delimiter is a UTF-16 code unit: larger values are truncated.
                        delimiter = (char)Integer.parseInt(getopt.getOptarg());
                        break;
                    case 'q':
                        quantum = Integer.parseInt(getopt.getOptarg());
                        break;
                    case 'm':
                        maxBatchTerms = Integer.parseInt(getopt.getOptarg());
                        break;
                    case 'n':
                        doUnsorted = false;
                        break;
                    case 1:
                        downcase = true;
                        break;
                    case '?':
                        // Invalid option.
                        return;
                    default:
                        break;
                }
            }
        }
        catch (NumberFormatException e) {
            System.err.println("Invalid numeric option argument: " + e.getMessage());
            return;
        }

        final int bufferSize = Fast.parseIntSize(bufferSizespec);
        final int numArgs = arg.length - getopt.getOptind();
        if (numArgs != 1) {
            System.err.println("Wrong number (" + numArgs + ") of arguments.");
            return;
        }
        if (maxBatchTerms < 0) {
            // A negative size would otherwise fail when allocating the batch structures.
            System.err.println("The maximum number of terms in a batch (" + maxBatchTerms + ") cannot be negative.");
            return;
        }

        // Integer.MAX_VALUE is the "unlimited" marker: everything goes in a single in-memory batch.
        final boolean batched = maxBatchTerms != Integer.MAX_VALUE;
        // The trailing dot is part of the basename used to build every file name.
        final String basename = arg[getopt.getOptind()] + '.';
        System.err.println("Documents will be separated by the Unicode character " + delimiter);

        final FastBufferedReader input = new FastBufferedReader(new InputStreamReader(System.in, charSet), bufferSize);
        final MutableString word = new MutableString();
        ObjectOpenHashSet terms = new ObjectOpenHashSet(batched ? maxBatchTerms : 16, 0.5f);
        final MutableString[] support = batched ? new MutableString[maxBatchTerms] : new MutableString[]{};
        int numTerms = 0;
        int numBatches = 0;
        // True when no word has been read since the last delimiter (or since the start).
        boolean firstWordUnseen = true;

        final ProgressMeter pm = new ProgressMeter(quantum);
        pm.itemsName("documents");
        pm.start("Indexing terms...");

        final String unsortedFilename = basename + "terms." + (batched ? "unsorted.dups" : "unsorted");
        PrintWriter termFile = null;
        if (doUnsorted) {
            termFile = new PrintWriter(new BufferedWriter(new OutputStreamWriter((OutputStream)new FileOutputStream(unsortedFilename), "UTF-8"), bufferSize));
        }

        try {
            int t;
            while ((t = input.readWord(word)) != -1) {
                if (downcase) {
                    word.toLowerCase();
                }
                firstWordUnseen = false;

                if (word.length() != 0 && !terms.contains(word)) {
                    // The reader reuses word, so the set must store a copy.
                    terms.add(word.copy());
                    if (++numTerms % 1000000 == 0) {
                        System.err.print("[" + Fast.format(numTerms) + " terms]");
                    }
                    if (numTerms == maxBatchTerms) {
                        // The batch is full: flush it to its own sorted file and start over.
                        System.err.print('[');
                        ZerothPass.dumpSortedTerms(terms, basename + "terms." + numBatches++, bufferSize, support);
                        System.err.print(']');
                        terms.clear();
                        numTerms = 0;
                    }
                    if (doUnsorted) {
                        word.println(termFile);
                    }
                }

                if (t == delimiter) {
                    // A document just ended.
                    pm.update();
                    firstWordUnseen = true;
                }
            }

            // Count a last document that was not closed by a delimiter.
            if (!firstWordUnseen) {
                pm.update();
            }

            if (batched) {
                // Flush the last (possibly empty) batch.
                System.err.print('[');
                ZerothPass.dumpSortedTerms(terms, basename + "terms." + numBatches++, bufferSize, support);
                System.err.print(']');
                terms = null;
            }

            if (doUnsorted) {
                termFile.close();
                // PrintWriter never throws: its error flag is the only way to detect a failed write.
                if (termFile.checkError()) {
                    throw new IOException("An error occurred while writing " + unsortedFilename);
                }
            }
            pm.done();

            if (!batched) {
                ZerothPass.dumpSortedTerms(terms, basename + "terms", bufferSize, support);
                System.err.println();
                terms = null;
            } else {
                final String[] inputFilename = new String[numBatches];
                for (int i = 0; i < numBatches; i++) {
                    inputFilename[i] = basename + "terms." + i;
                }
                pm.quantum(quantum * 100);
                numTerms = ZerothPass.mergeSortedFiles(inputFilename, basename + "terms", bufferSize, pm);
                for (int i = 0; i < numBatches; i++) {
                    if (!new File(inputFilename[i]).delete()) {
                        System.err.println("WARNING: could not delete temporary file " + inputFilename[i]);
                    }
                }
            }

            final Properties properties = new Properties();
            // Strip the trailing dot that was appended to the basename.
            properties.setProperty("basename", basename.substring(0, basename.length() - 1));
            properties.setProperty("documents", String.valueOf(pm.count()));
            properties.setProperty("terms", String.valueOf(numTerms));
            final FileOutputStream propertyFile = new FileOutputStream(basename + "properties");
            try {
                properties.store(propertyFile, "ZerothPass properties");
            }
            finally {
                propertyFile.close();
            }
        }
        catch (IOException e) {
            // Report the failure; there is nothing to recover at this level.
            e.printStackTrace();
        }
        finally {
            // Closing an already closed PrintWriter has no effect.
            if (termFile != null) {
                termFile.close();
            }
        }
    }

    /** This class is a command-line tool and cannot be instantiated. */
    private ZerothPass() {
    }
}

