/*
 * Decompiled with CFR 0.152.
 */
package it.unimi.dsi.law.warc.tool;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.StringParser;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedInputStream;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.io.FileLinesCollection;
import it.unimi.dsi.law.warc.io.GZWarcRecord;
import it.unimi.dsi.law.warc.io.WarcRecord;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.util.StringMap;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.RandomAccessFile;
import java.util.Arrays;
import org.apache.log4j.Logger;

/**
 * Command-line tool that extracts ("cuts") selected records from a WARC file.
 *
 * <p>Records are located through an index file that is read as a sequence of
 * 8-byte values: the value stored at byte offset {@code k * 8} is used as the
 * position in the WARC file at which record {@code k} is read.
 */
public class CutWarc {
    /** Size, in bytes, of the buffers used for the WARC input and for the output stream. */
    static final int IO_BUFFER_SIZE = 65536;

    /**
     * Copies the selected records from {@code warc} to {@code out}.
     *
     * <p>For each of the first {@code recordCount} entries of {@code record}, the
     * method reads a long at offset {@code record[i] * 8} of {@code idx}, positions
     * {@code warc} there, reads one record and writes a copy of it to {@code out}.
     * Neither the streams nor the index are closed (or flushed) by this method.
     *
     * @param warc the WARC input; it is repositioned before each record is read.
     * @param idx the index file, accessed by seeking.
     * @param isGZippedInput whether records are read with a {@link GZWarcRecord}
     *        rather than a plain {@link WarcRecord}.
     * @param isGZippedOutput whether records are written with a {@link GZWarcRecord}
     *        rather than a plain {@link WarcRecord}.
     * @param record the ordinal numbers of the records to extract; only the first
     *        {@code recordCount} entries are used, in array order.
     * @param recordCount the number of valid entries in {@code record}.
     * @param out the stream the extracted records are written to.
     * @throws IOException if reading the index or the WARC file, or writing to
     *         {@code out}, fails.
     * @throws WarcRecord.FormatException declared because of the record read/write
     *         calls.
     */
    public static void run(FastBufferedInputStream warc, RandomAccessFile idx, boolean isGZippedInput, boolean isGZippedOutput, long[] record, int recordCount, OutputStream out) throws IOException, WarcRecord.FormatException {
        WarcRecord inRecord = isGZippedInput ? new GZWarcRecord() : new WarcRecord();
        WarcRecord outRecord = isGZippedOutput ? new GZWarcRecord() : new WarcRecord();
        ProgressLogger logger = new ProgressLogger(Logger.getLogger((String)CutWarc.class.getName()), 10000L, "documents");
        logger.start((CharSequence)"Cutting documents");
        for (int i = 0; i < recordCount; ++i) {
            // Each index entry is one long, hence the factor of eight.
            idx.seek(record[i] * 8L);
            long pos = idx.readLong();
            warc.position(pos);
            inRecord.resetRead();
            inRecord.read(warc);
            outRecord.fromWarcRecord(inRecord);
            outRecord.write(out);
            logger.lightUpdate();
        }
        logger.stop();
    }

    /**
     * Turns a single record specification into a record ordinal.
     *
     * <p>A specification that parses as a long is returned as is (possibly
     * negative); anything else is treated as a URL and looked up in {@code map}.
     *
     * @param spec the specification, either an ordinal number or a URL.
     * @param map the map from URLs to record numbers, or {@code null} if none was provided.
     * @param bePermissive whether a URL for which the map returns a negative value is
     *        tolerated instead of raising an exception.
     * @return the record ordinal; a negative value means the specification must be skipped.
     * @throws RuntimeException if {@code spec} is not a number and {@code map} is
     *         {@code null}, or if the map returns a negative value and
     *         {@code bePermissive} is false.
     */
    private static long resolveRecord(CharSequence spec, StringMap map, boolean bePermissive) {
        try {
            return Long.parseLong(spec.toString());
        }
        catch (NumberFormatException notANumber) {
            if (map == null) {
                throw new RuntimeException("URLs cannot be specified if a map is not provided", notANumber);
            }
            long ordinal = map.getLong((Object)spec);
            if (ordinal < 0L && !bePermissive) {
                throw new RuntimeException("URL " + spec + " cannot be resolved");
            }
            return ordinal;
        }
    }

    /**
     * Parses the command line, resolves the requested records and writes them to
     * standard output.
     *
     * <p>Exactly one of the {@code recordFile} option and the {@code recordSpec}
     * arguments must be given. Specifications that resolve to a negative ordinal
     * are skipped; the remaining ordinals are sorted before extraction. The input
     * is read from {@code <warcFile>.warc} (with a {@code .gz} suffix when the
     * {@code gzip} switch is set) and the index from the same name followed by
     * {@code .idx}.
     *
     * @param arg the command-line arguments.
     * @throws IllegalArgumentException if both or neither of {@code recordFile} and
     *         {@code recordSpec} are specified.
     * @throws Exception if argument parsing, record resolution or I/O fails.
     */
    public static void main(String[] arg) throws Exception {
        SimpleJSAP jsap = new SimpleJSAP(CutWarc.class.getName(), "Cuts (that is, extracts record) from a warc file. It requires an index.", new Parameter[]{
            new Switch("gzip", 'z', "gzip", "Tells if the input warc is compressed."),
            new Switch("outzip", 'Z', "outzip", "Tells if the output warc must be compressed."),
            new Switch("permissive", 'p', "permissive", "Ignore unknown urls instead of throwing an exception"),
            new FlaggedOption("recordFile", (StringParser)JSAP.STRING_PARSER, JSAP.NO_DEFAULT, false, 'r', "recordFile", "A file containing, one per line, the ordinal numbers or URL of records to be output."),
            new FlaggedOption("urlMap", (StringParser)JSAP.STRING_PARSER, JSAP.NO_DEFAULT, false, 'm', "url-map", "The term map from URL to record number."),
            new UnflaggedOption("warcFile", (StringParser)JSAP.STRING_PARSER, JSAP.NO_DEFAULT, true, false, "The Warc file basename."),
            new UnflaggedOption("recordSpec", (StringParser)JSAP.STRING_PARSER, JSAP.NO_DEFAULT, false, true, "The spec (ordinal number or URL) of records to be output.")
        });
        JSAPResult jsapResult = jsap.parse(arg);
        if (jsap.messagePrinted()) {
            return;
        }

        boolean hasRecordSpec = jsapResult.userSpecified("recordSpec");
        boolean hasRecordFile = jsapResult.userSpecified("recordFile");
        if (!hasRecordFile && !hasRecordSpec) {
            throw new IllegalArgumentException("One of the two options recordFile and recordSpec must be set.");
        }
        if (hasRecordSpec && hasRecordFile) {
            throw new IllegalArgumentException("You cannot specify both recordFile and recordSpec options");
        }

        CharSequence[] recordSpec;
        if (hasRecordSpec) {
            recordSpec = jsapResult.getStringArray("recordSpec");
        } else {
            recordSpec = new FileLinesCollection((CharSequence)jsapResult.getString("recordFile"), "UTF-8").allLines().toArray(new CharSequence[0]);
        }

        String warcFile = jsapResult.getString("warcFile");
        boolean isGZippedInput = jsapResult.getBoolean("gzip");
        boolean isGZippedOutput = jsapResult.getBoolean("outzip");
        boolean bePermissive = jsapResult.getBoolean("permissive");
        String urlMap = jsapResult.getString("urlMap");
        StringMap map = urlMap == null ? null : (StringMap)BinIO.loadObject((CharSequence)urlMap);

        // Keep only the specifications that resolve to a nonnegative ordinal.
        long[] record = new long[recordSpec.length];
        int recordCount = 0;
        for (CharSequence spec : recordSpec) {
            long ordinal = CutWarc.resolveRecord(spec, map, bePermissive);
            if (ordinal >= 0L) {
                record[recordCount++] = ordinal;
            }
        }
        Arrays.sort(record, 0, recordCount);

        String warcName = warcFile + ".warc" + (isGZippedInput ? ".gz" : "");
        // try-with-resources guarantees that every handle is released (and the
        // output flushed) even when opening a later resource or the extraction fails.
        try (FastBufferedInputStream warc = new FastBufferedInputStream((InputStream)new FileInputStream(new File(warcName)), IO_BUFFER_SIZE);
             RandomAccessFile idx = new RandomAccessFile(new File(warcName + ".idx"), "r");
             FastBufferedOutputStream out = new FastBufferedOutputStream((OutputStream)System.out, IO_BUFFER_SIZE)) {
            CutWarc.run(warc, idx, isGZippedInput, isGZippedOutput, record, recordCount, (OutputStream)out);
        }
    }
}

