/*
 * Decompiled with CFR 0.152.
 */
package it.unimi.dsi.law.warc.tool;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.StringParser;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
import it.unimi.dsi.fastutil.chars.CharArrays;
import it.unimi.dsi.fastutil.ints.IntArrays;
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedInputStream;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.law.warc.filters.Filter;
import it.unimi.dsi.law.warc.filters.parser.FilterParser;
import it.unimi.dsi.law.warc.io.GZWarcRecord;
import it.unimi.dsi.law.warc.io.HttpResponseFilteredIterator;
import it.unimi.dsi.law.warc.io.WarcRecord;
import it.unimi.dsi.law.warc.parser.Parser;
import it.unimi.dsi.law.warc.util.BURL;
import it.unimi.dsi.law.warc.util.HttpResponse;
import it.unimi.dsi.law.warc.util.WarcHttpResponse;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.util.StringMap;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.security.NoSuchAlgorithmException;
import java.util.Arrays;
import java.util.Set;
import org.apache.log4j.Logger;

/*
 * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
 */
/**
 * Command-line tool that extracts the links contained in the HTTP responses of a WARC file.
 *
 * <p>Two output formats are produced by {@link #run}, depending on whether a URL map is given:
 * <ul>
 * <li>without a URL map, each line contains the page URL followed by its outgoing links,
 * every link being preceded by a tab;</li>
 * <li>with a URL map, each line contains the index of the page followed by a tab and by the
 * sorted, duplicate-free indices of the linked pages, each index being followed by a tab.</li>
 * </ul>
 */
public class ExtractLinks {
    private static final Logger LOGGER = Logger.getLogger(ExtractLinks.class);

    /** Default value of the <code>bufferSize</code> option, as parsed by {@link JSAP#INTSIZE_PARSER}. */
    public static final String DEFAULT_BUFFER_SIZE = "64Ki";

    /** Name of the ANVL header field that marks a record as a duplicate. */
    private static final String IS_DUPLICATE_FIELD = "BUbiNG-isduplicate";

    /**
     * Scans the responses of a WARC stream and prints their outgoing links.
     *
     * @param in the stream the WARC records are read from.
     * @param isGZipped whether the records are read as {@link GZWarcRecord}s instead of plain {@link WarcRecord}s.
     * @param filter the filter handed to the {@link HttpResponseFilteredIterator}.
     * @param pw where the output is printed; it is neither flushed nor closed by this method.
     * @param urls a map from URLs to indices, or <code>null</code> to print URLs instead of indices.
     * @param duplicates a map consulted for link targets that are not in <code>urls</code>, or <code>null</code>;
     * it is used only when <code>urls</code> is not <code>null</code>.
     * @throws NoSuchAlgorithmException declared for the benefit of the underlying WARC/parser classes.
     * @throws IOException if reading or parsing a record fails.
     */
    public static void run(FastBufferedInputStream in, boolean isGZipped, Filter<HttpResponse> filter, PrintWriter pw, StringMap<? extends CharSequence> urls, StringMap<? extends CharSequence> duplicates) throws NoSuchAlgorithmException, IOException {
        final WarcRecord record = isGZipped ? new GZWarcRecord() : new WarcRecord();
        final WarcRecord.Header header = record.header;
        final WarcHttpResponse response = new WarcHttpResponse();
        final HttpResponseFilteredIterator it = new HttpResponseFilteredIterator(in, record, response, filter);
        // NOTE(review): the constructor argument is presumably an initial buffer size (1Mi) -- confirm against Parser.
        final Parser parser = new Parser(0x100000);
        final IntOpenHashSet successors = new IntOpenHashSet();
        // Scratch array reused across pages; only its first successors.size() entries are meaningful.
        int[] successor = IntArrays.EMPTY_ARRAY;
        final ProgressLogger pl = new ProgressLogger(LOGGER, 60000L, "pages");
        pl.start("Extracting...");
        while (it.hasNext()) {
            // The iterator fills the shared record/response objects; its return value is not needed.
            it.next();
            if (urls == null) {
                // Textual output: page URL followed by tab-separated outgoing links.
                pw.print(response.url());
                for (BURL url : parseOutLinks(parser, response)) {
                    pw.print('\t');
                    pw.print(url);
                }
            } else {
                final int node = (int)urls.getLong(response.url().toString());
                if (Boolean.parseBoolean(header.anvlFields.get(IS_DUPLICATE_FIELD))) {
                    // Duplicates are expected to be absent from the URL map and are skipped silently.
                    if (node < 0) continue;
                    LOGGER.error("URL " + response.url() + " is contained in the URL map but it is a duplicate");
                    // The node still gets a line of its own, with no successors.
                    pw.println(node);
                    pl.update();
                    continue;
                }
                if (node == -1) {
                    LOGGER.error("URL " + response.url() + " is not contained in the URL map; this may happen if the original digest/URL file was sorted unstably or if there are several non-duplicate pages with the same digest");
                    continue;
                }
                pw.print(node);
                pw.print('\t');
                successors.clear();
                for (BURL url : parseOutLinks(parser, response)) {
                    int target = (int)urls.getLong(url.toString());
                    if (target != -1) {
                        if (LOGGER.isDebugEnabled()) LOGGER.debug("Adding successor " + url + ":" + target);
                        successors.add(target);
                        continue;
                    }
                    // Fall back to the duplicates map, if any, for targets missing from the URL map.
                    if (duplicates == null) continue;
                    target = (int)duplicates.getLong(url.toString());
                    if (target == -1) continue;
                    if (LOGGER.isDebugEnabled()) LOGGER.debug("Adding duplicate " + url + ":" + target);
                    successors.add(target);
                }
                final int d = successors.size();
                successor = IntArrays.grow(successor, d, 0);
                successors.toArray(successor);
                Arrays.sort(successor, 0, d);
                for (int i = 0; i < d; ++i) {
                    pw.print(successor[i]);
                    pw.print('\t');
                }
            }
            pw.println();
            pl.update();
        }
        pl.done();
    }

    /**
     * Parses the given response and returns the URLs found by the parser.
     *
     * <p>The parser buffer is first grown to the length of the response content.
     *
     * @param parser the parser to use; its buffer may be replaced by a larger one.
     * @param response the response to parse.
     * @return the set returned by {@link Parser#urls()} after parsing.
     */
    private static Set<BURL> parseOutLinks(Parser parser, WarcHttpResponse response) throws NoSuchAlgorithmException, IOException {
        parser.buffer = CharArrays.grow(parser.buffer, (int)response.contentAsStream().length(), 0);
        parser.parse(response);
        return parser.urls();
    }

    /**
     * Loads the serialised map named by a command-line option, if the option was specified.
     *
     * @param jsapResult the parsed command line.
     * @param id the identifier of the option holding the file name.
     * @return the deserialised map, or <code>null</code> if the option was not specified by the user.
     */
    @SuppressWarnings("unchecked")
    private static StringMap<? extends CharSequence> loadMapIfSpecified(JSAPResult jsapResult, String id) throws IOException, ClassNotFoundException {
        if (!jsapResult.userSpecified(id)) return null;
        // Unchecked: the file is trusted to contain a StringMap, exactly as the original raw cast did.
        return (StringMap<? extends CharSequence>)BinIO.loadObject(jsapResult.getString(id));
    }

    /**
     * Parses the command line, opens the WARC file (or standard input) and runs the extraction,
     * writing the result to standard output in ASCII.
     *
     * @param arg the command-line arguments.
     * @throws Exception if option parsing, map loading or the extraction itself fails.
     */
    public static void main(String[] arg) throws Exception {
        SimpleJSAP jsap = new SimpleJSAP(ExtractLinks.class.getName(), "Extract links in pages from a WARC file.", new Parameter[]{
            new FlaggedOption("bufferSize", (StringParser)JSAP.INTSIZE_PARSER, DEFAULT_BUFFER_SIZE, false, 'b', "buffer-size", "The size of an I/O buffer."),
            new Switch("gzip", 'z', "gzip", "Tells if the warc is compressed."),
            new FlaggedOption("filter", (StringParser)JSAP.STRING_PARSER, JSAP.NO_DEFAULT, false, 'f', "filter", "The filter."),
            new FlaggedOption("start", (StringParser)JSAP.LONG_PARSER, JSAP.NO_DEFAULT, false, 's', "start", "The starting offset (in bytes) in the WARC file (mainly for debugging purposes)."),
            new FlaggedOption("duplicates", (StringParser)JSAP.STRING_PARSER, JSAP.NO_DEFAULT, false, 'd', "duplicates", "The (remapped) term map for duplicate URLs. If not present, only links pointing to URLs in <urls> will be used."),
            // The long flag was missing here, so the help text was being registered as the long flag.
            new FlaggedOption("urls", (StringParser)JSAP.STRING_PARSER, JSAP.NO_DEFAULT, false, 'u', "urls", "The term map for the node URLs."),
            new UnflaggedOption("warcFile", (StringParser)JSAP.STRING_PARSER, "-", true, false, "The WARC file basename (if not present, or -, stdin will be used).")
        });
        JSAPResult jsapResult = jsap.parse(arg);
        if (jsap.messagePrinted()) {
            return;
        }
        final boolean isGZipped = jsapResult.getBoolean("gzip");
        final String requestedFilter = jsapResult.getString("filter");
        // With no explicit filter, the filter expression "TRUE" is parsed instead.
        final String filterString = requestedFilter == null ? "TRUE" : requestedFilter;
        final String warcFile = jsapResult.getString("warcFile");
        final int bufferSize = jsapResult.getInt("bufferSize");
        final Filter<HttpResponse> filter = new FilterParser<HttpResponse>(HttpResponse.class).parse(filterString);
        final StringMap<? extends CharSequence> urls = loadMapIfSpecified(jsapResult, "urls");
        final StringMap<? extends CharSequence> duplicates = loadMapIfSpecified(jsapResult, "duplicates");
        final FastBufferedInputStream in = new FastBufferedInputStream("-".equals(warcFile) ? System.in : new FileInputStream(new File(warcFile + ".warc" + (isGZipped ? ".gz" : ""))), bufferSize);
        try {
            if (jsapResult.userSpecified("start")) {
                final long start = jsapResult.getLong("start");
                final long skipped = in.skip(start);
                if (skipped < start) {
                    LOGGER.warn("Requested to skip " + start + " bytes, but only " + skipped + " could be skipped");
                }
            }
            final PrintWriter pw = new PrintWriter(new OutputStreamWriter(new FastBufferedOutputStream(System.out, bufferSize), "ASCII"));
            try {
                ExtractLinks.run(in, isGZipped, filter, pw, urls, duplicates);
            }
            finally {
                // Close (and thus flush) the output first, so a failure closing the input cannot lose it.
                pw.close();
            }
        }
        finally {
            in.close();
        }
    }
}

