package org.apache.sling.samples.webloader.internal;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import javax.jcr.Node;
import javax.jcr.RepositoryException;
import javax.jcr.Session;
import javax.swing.text.BadLocationException;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;
import org.apache.sling.commons.mime.MimeTypeService;
import org.apache.sling.jcr.api.SlingRepository;
import org.apache.sling.samples.webloader.WebloaderException;
import org.apache.sling.samples.webloader.WebloaderJobStatus;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/apache/sling/samples/webloader/internal/WebloaderJob.class */
/**
 * Background job that asks Google for documents matching a web query, once
 * per configured file type, downloads the documents found and stores them in
 * the JCR repository below a configurable storage path.
 *
 * <p>The job is a daemon thread that starts itself from the constructor.
 * Progress is exposed through the {@link WebloaderJobStatus} getters. The
 * status fields are written by the job thread only and are {@code volatile}
 * so that other threads polling the getters see up-to-date values.
 */
class WebloaderJob extends Thread implements WebloaderJobStatus {
    /** Connect and read timeout, in seconds, applied to every remote URL. */
    public static final int URL_RETRIEVE_TIMEOUT_SECONDS = 10;

    /** File types searched for when the caller does not supply any. */
    public static final String[] DEFAULT_FILETYPES = {"pdf", "rtf", "ppt", "doc", "xls"};

    private static final Logger log = LoggerFactory.getLogger(WebloaderJob.class);

    /** Source of job ids; guarded by the {@code WebloaderJob.class} monitor. */
    private static int idCounter;

    // Status fields: single writer (the job thread), any number of readers.
    private volatile Throwable error;
    private volatile int numDocsLoaded;
    private volatile String statusInfo = "initialized";
    private volatile String statusDetails = "";
    private volatile boolean running = true;

    private final int maxDocsToRetrieve;
    /** Maximum document size, in units of 1024 bytes. */
    private final int maxDocSize;
    private final String jobId;
    private final String webQuery;
    /** Storage path; its leading slash, if any, is stripped by {@link #run()}. */
    private String storagePath;
    private final SlingRepository repository;
    private final MimeTypeService mimeTypeService;
    private final String[] filetypes;

    // Only touched by the job thread.
    private Session session;
    private Node storageRoot;

    /** Signals that a remote document exceeds the configured size limit. */
    public static class DocTooBigException extends IOException {
        /**
         * @param url  location of the oversized document
         * @param size size in bytes announced by the server
         */
        DocTooBigException(URL url, int size) {
            super("Document at URL " + url + " too big (" + size + " bytes), will be ignored");
        }
    }

    /**
     * Creates the job and immediately starts it as a daemon thread.
     *
     * @param repository        repository to store documents in, must not be null
     * @param mimeTypeService   used to derive the mime type from the file name, must not be null
     * @param webQuery          search terms sent to Google
     * @param storagePath       repository path below which documents are stored
     * @param fileExtensions    comma-separated file types; {@link #DEFAULT_FILETYPES}
     *                          are used when null or when no non-blank entry is present
     * @param maxDocsToRetrieve the job stops once this many documents are loaded
     * @param maxDocSize        documents announcing more than this many kilobytes are skipped
     * @throws WebloaderException if {@code mimeTypeService} or {@code repository} is null
     */
    public WebloaderJob(SlingRepository repository, MimeTypeService mimeTypeService, String webQuery,
            String storagePath, String fileExtensions, int maxDocsToRetrieve, int maxDocSize) {
        if (mimeTypeService == null) {
            throw new WebloaderException("Missing MimeTypeService");
        }
        if (repository == null) {
            throw new WebloaderException("Missing Repository");
        }
        synchronized (WebloaderJob.class) {
            this.jobId = String.valueOf(idCounter++);
        }
        this.repository = repository;
        this.mimeTypeService = mimeTypeService;
        this.webQuery = webQuery;
        this.storagePath = storagePath;
        this.maxDocsToRetrieve = maxDocsToRetrieve;
        this.maxDocSize = maxDocSize;
        this.filetypes = parseFiletypes(fileExtensions);

        // Kept for backward compatibility: callers expect a running job.
        // All fields are assigned before the thread is started.
        setDaemon(true);
        start();
    }

    /**
     * Splits a comma-separated list of file types into trimmed, lower-case
     * entries, dropping blank ones. Falls back to a copy of the defaults when
     * nothing usable is left.
     */
    private static String[] parseFiletypes(String fileExtensions) {
        if (fileExtensions != null) {
            final List<String> parsed = new ArrayList<String>();
            for (String candidate : fileExtensions.split(",")) {
                // Fixed locale: file extensions must not depend on the JVM default locale.
                final String type = candidate.trim().toLowerCase(Locale.ENGLISH);
                if (type.length() > 0) {
                    parsed.add(type);
                }
            }
            if (!parsed.isEmpty()) {
                return parsed.toArray(new String[parsed.size()]);
            }
        }
        // Copy, so that this job is unaffected by later changes to the public array.
        return DEFAULT_FILETYPES.clone();
    }

    @Override
    public String toString() {
        final StringBuilder types = new StringBuilder();
        for (String type : this.filetypes) {
            if (types.length() > 0) {
                types.append(',');
            }
            types.append(type);
        }
        return getClass().getSimpleName()
                + ", webQuery=" + this.webQuery
                + ", storagePath=" + this.storagePath
                + ", fileTypes=" + types
                + ", maxDocsToRetrieve=" + this.maxDocsToRetrieve
                + ", maxDocSize=" + this.maxDocSize;
    }

    /**
     * Runs the job: logs in, locates or creates the storage root, then queries
     * Google once per file type and loads the documents found until the
     * maximum number of documents is reached. Any exception ends the job and
     * is reported through {@link #getError()} and {@link #getStatusInfo()};
     * in every case the session is closed and {@link #isRunning()} turns false.
     */
    @Override
    public void run() {
        log.debug("Job thread starting: {}", this);
        this.session = null;

        try {
            // Normalised inside the try block so that a null or empty path is
            // reported as a job error instead of killing the thread while
            // isRunning() still returns true.
            if (this.storagePath.startsWith("/")) {
                this.storagePath = this.storagePath.substring(1);
            }
            final String absStoragePath = "/" + this.storagePath;

            this.session = this.repository.loginAdministrative(null);
            if (this.session.itemExists(absStoragePath)) {
                final Object existing = this.session.getItem(absStoragePath);
                if (!(existing instanceof Node)) {
                    throw new WebloaderException("Item at " + this.storagePath + " is not a Node");
                }
                this.storageRoot = (Node) existing;
            } else {
                this.storageRoot = this.session.getRootNode().addNode(this.storagePath);
                this.session.save();
            }

            // NOTE(review): the result offset grows with each file type rather
            // than paging within one type; preserved as-is, confirm intent.
            int offset = 0;
            for (String filetype : this.filetypes) {
                for (URL url : getDocumentUrlsFromGoogle(filetype, offset)) {
                    loadDocument(url);
                    if (this.numDocsLoaded >= this.maxDocsToRetrieve) {
                        break;
                    }
                }
                offset += 10;
                if (this.numDocsLoaded >= this.maxDocsToRetrieve) {
                    break;
                }
            }
            this.statusInfo = "All done.";
        } catch (Exception e) {
            this.error = e;
            log.warn("Exception in WebloaderJob.run()", e);
            this.statusInfo = "Exception while running job: " + e;
        } finally {
            if (this.session != null) {
                this.session.logout();
            }
            this.statusDetails = "";
            this.running = false;
        }

        if (this.numDocsLoaded >= this.maxDocsToRetrieve) {
            log.info("Stopped after retrieving maximum number of documents ({})", this.maxDocsToRetrieve);
        }
        log.info("Job thread ends: {}, {} documents loaded", this, this.numDocsLoaded);
    }

    /**
     * Retrieves and saves a single document. Failures for one document are
     * logged and do not stop the job; pending session changes are always
     * discarded afterwards.
     *
     * @throws RepositoryException if discarding pending changes fails
     */
    private void loadDocument(URL url) throws RepositoryException {
        try {
            getAndStoreDocument(url);
            this.session.save();
            // Not atomic, but the job thread is the only writer.
            this.numDocsLoaded++;
        } catch (DocTooBigException e) {
            log.info(e.getMessage());
        } catch (Exception e) {
            log.warn("Exception while retrieving url " + url, e);
        } finally {
            this.session.refresh(false);
        }
    }

    /**
     * Queries Google for the web query restricted to the given file type and
     * returns the non-Google link targets that end with that file extension.
     *
     * @param filetype file extension, without the dot
     * @param start    value of the {@code start} parameter of the Google query
     * @return document URLs found, possibly empty
     */
    private URL[] getDocumentUrlsFromGoogle(String filetype, int start) throws IOException, BadLocationException {
        final List<URL> found = new ArrayList<URL>();
        final String encodedQuery = URLEncoder.encode(this.webQuery + " filetype:" + filetype, "UTF-8");
        final URL queryUrl = new URL("http://www.google.com/search?q=" + encodedQuery + "&start=" + start);
        log.debug("Querying {}", queryUrl);
        this.statusInfo = "Querying " + queryUrl;
        this.statusDetails = "";

        final URLConnection connection = queryUrl.openConnection();
        // Without timeouts an unresponsive server would block the job forever.
        connection.setConnectTimeout(URL_RETRIEVE_TIMEOUT_SECONDS * 1000);
        connection.setReadTimeout(URL_RETRIEVE_TIMEOUT_SECONDS * 1000);
        // Replaces the default Java user agent with an empty one.
        connection.setRequestProperty("User-Agent", "");

        final InputStream in = connection.getInputStream();
        try {
            final HTMLEditorKit kit = new HTMLEditorKit();
            final HTMLDocument page = new HTMLDocument();
            page.putProperty("IgnoreCharsetDirective", Boolean.TRUE);
            kit.read(new InputStreamReader(in, "UTF-8"), page, 0);

            // Base against which relative links are resolved.
            final URL base = new URL("http", "www.google.com", "dummy");
            final String suffix = "." + filetype;

            for (HTMLDocument.Iterator links = page.getIterator(HTML.Tag.A); links.isValid(); links.next()) {
                if (links.getAttributes() == null) {
                    continue;
                }
                final Object href = links.getAttributes().getAttribute(HTML.Attribute.HREF);
                if (!(href instanceof String) || !((String) href).endsWith(suffix)) {
                    continue;
                }
                final URL target;
                try {
                    target = new URL(base, (String) href);
                } catch (MalformedURLException e) {
                    // A single unusable link must not abort the whole job.
                    log.debug("Ignoring malformed link {}", href);
                    continue;
                }
                if (target.getHost().indexOf("google") == -1) {
                    log.debug("Got document URL from google: {}", target);
                    this.statusDetails = "Got URL " + target;
                    found.add(target);
                }
            }
            return found.toArray(new URL[found.size()]);
        } finally {
            in.close();
        }
    }

    /**
     * Stores the document at the given URL below the storage root, in a folder
     * hierarchy built from the reversed host name segments followed by the URL
     * path segments. Does nothing if the target file node already exists.
     */
    private void getAndStoreDocument(URL url) throws RepositoryException, IOException {
        this.statusInfo = "Retrieving document " + url;
        this.statusDetails = "";

        String urlPath = url.getPath();
        if (urlPath.startsWith("/")) {
            urlPath = urlPath.substring(1);
        }

        // www.example.com/a/b.pdf -> com, example, www, a, b.pdf
        final List<String> segments = new ArrayList<String>(Arrays.asList(url.getHost().split("\\.")));
        Collections.reverse(segments);
        segments.addAll(Arrays.asList(urlPath.split("/", 0)));

        final String filename = toNodeName(segments.remove(segments.size() - 1));
        if (filename.length() == 0 || isDotSegment(filename)) {
            throw new IOException("Cannot derive a document name from URL " + url);
        }

        Node folder = this.storageRoot;
        for (String segment : segments) {
            final String name = toNodeName(segment);
            // "." and ".." can appear after URL-decoding (%2E%2E); as relative
            // JCR paths they would navigate out of the storage root, so they
            // are skipped like empty segments.
            if (name.length() == 0 || isDotSegment(name)) {
                continue;
            }
            if (!folder.hasNode(name)) {
                folder.addNode(name, "nt:folder");
            }
            folder = folder.getNode(name);
        }

        log.debug("Retrieving document {}, will be stored at {}", url, folder.getPath() + "/" + filename);
        if (folder.hasNode(filename)) {
            return;
        }
        final Node content = folder.addNode(filename, "nt:file").addNode("jcr:content", "nt:resource");
        getAndStoreContent(url, content, filename);
    }

    /** URL-decodes a path segment and replaces colons with underscores. */
    private static String toNodeName(String urlSegment) throws IOException {
        return URLDecoder.decode(urlSegment, "UTF-8").replace(':', '_');
    }

    /** Returns true for the relative path segments {@code "."} and {@code ".."}. */
    private static boolean isDotSegment(String name) {
        return ".".equals(name) || "..".equals(name);
    }

    /**
     * Downloads the content at the given URL into the given {@code nt:resource}
     * node, setting its data, mime type and last-modified properties.
     *
     * @param url      location of the document
     * @param node     content node that receives the properties
     * @param filename file name from which the mime type is derived
     * @throws DocTooBigException if the announced content length exceeds the limit
     */
    private void getAndStoreContent(URL url, Node node, String filename) throws RepositoryException, IOException {
        this.statusInfo = "Retrieving content from " + url;
        this.statusDetails = "";

        final URLConnection connection = url.openConnection();
        connection.setConnectTimeout(URL_RETRIEVE_TIMEOUT_SECONDS * 1000);
        connection.setReadTimeout(URL_RETRIEVE_TIMEOUT_SECONDS * 1000);

        InputStream in = connection.getInputStream();
        try {
            final int contentLength = connection.getContentLength();
            if (contentLength != -1) {
                // long arithmetic: maxDocSize * 1024 can overflow an int
                if (contentLength > this.maxDocSize * 1024L) {
                    throw new DocTooBigException(url, contentLength);
                }
                in = new ProgressInputStream(in, contentLength) {
                    int nextReport = 0;

                    @Override
                    protected void reportProgress(int bytesRead, int totalBytes) {
                        // Refresh the status roughly once per 1024 reported bytes.
                        if (bytesRead > this.nextReport) {
                            this.nextReport += 1024;
                            WebloaderJob.this.statusDetails =
                                    "Downloaded " + bytesRead + " bytes out of " + totalBytes;
                        }
                    }
                };
            }

            node.setProperty("jcr:data", in);
            node.setProperty("jcr:mimeType", this.mimeTypeService.getMimeType(filename));

            final Calendar lastModified = Calendar.getInstance();
            final long remoteLastModified = connection.getLastModified();
            // 0 means "not known"; keep the current time instead of the epoch.
            if (remoteLastModified > 0) {
                lastModified.setTimeInMillis(remoteLastModified);
            }
            node.setProperty("jcr:lastModified", lastModified);
        } finally {
            in.close();
        }
    }

    /** Returns the id assigned to this job at construction time. */
    public String getJobId() {
        return this.jobId;
    }

    @Override
    public Throwable getError() {
        return this.error;
    }

    @Override
    public int getNumberOfDocumentsLoaded() {
        return this.numDocsLoaded;
    }

    @Override
    public String getStatusInfo() {
        return this.statusInfo;
    }

    @Override
    public String getStatusDetails() {
        return this.statusDetails;
    }

    @Override
    public boolean isRunning() {
        return this.running;
    }
}
