diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java
index 2f7b8b321..8f15f769d 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java
@@ -141,7 +141,7 @@ public CrawlController(CrawlConfig config, PageFetcher pageFetcher, Parser parse
             env = new Environment(envHome, envConfig);
             docIdServer = new DocIDServer(env, config);
-            frontier = new Frontier(env, config);
+            frontier = createFrontier(config);

             this.pageFetcher = pageFetcher;
             this.parser = parser == null ? new Parser(config, tldList) : parser;
@@ -153,6 +153,15 @@ public CrawlController(CrawlConfig config, PageFetcher pageFetcher, Parser parse
         robotstxtServer.setCrawlConfig(config);
     }

+    /**
+     * Creates the Frontier for this instance. Subclasses can override this to provide a custom Frontier.
+     * @param config configuration provided to the CrawlController
+     * @return the Frontier to use for this crawl
+     */
+    protected Frontier createFrontier(CrawlConfig config) {
+        return new Frontier(env, config);
+    }
+
     public Parser getParser() {
         return parser;
     }
@@ -534,7 +543,7 @@ public void addSeed(String pageUrl, int docId) throws IOException, InterruptedEx
             }
         }

-        WebURL webUrl = new WebURL();
+        WebURL webUrl = createEmptyWebURL(pageUrl);
         webUrl.setTldList(tldList);
         webUrl.setURL(canonicalUrl);
         webUrl.setDocid(docId);
@@ -548,6 +557,15 @@ public void addSeed(String pageUrl, int docId) throws IOException, InterruptedEx
         }
     }

+    /**
+     * Creates an empty WebURL. Subclasses can override this to create subclasses of WebURL instead.
+     * @param nonCanonicalString the URL before canonicalization; ignored in the default implementation
+     * @return a new, empty WebURL
+     */
+    protected WebURL createEmptyWebURL(String nonCanonicalString) {
+        return new WebURL();
+    }
+
     /**
      * This function can called to assign a specific document id to a url. This
      * feature is useful when you have had a previous crawl and have stored the
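Example (not part of the patch): a minimal sketch of how a controller subclass might use the new createEmptyWebURL(String) hook to seed the crawl with a WebURL subclass. SeedAwareWebURL and SeedAwareController are hypothetical names; createFrontier(CrawlConfig) can be overridden in the same way to return a custom Frontier.

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.WebURL;

// Hypothetical WebURL subclass that remembers the raw seed string.
class SeedAwareWebURL extends WebURL {
    private final String originalSeed;

    SeedAwareWebURL(String originalSeed) {
        this.originalSeed = originalSeed;
    }

    public String getOriginalSeed() {
        return originalSeed;
    }
}

public class SeedAwareController extends CrawlController {

    public SeedAwareController(CrawlConfig config, PageFetcher pageFetcher,
                               RobotstxtServer robotstxtServer) throws Exception {
        super(config, pageFetcher, robotstxtServer);
    }

    @Override
    protected WebURL createEmptyWebURL(String nonCanonicalString) {
        // Seeds keep a reference to the URL exactly as it was passed to addSeed().
        return new SeedAwareWebURL(nonCanonicalString);
    }
}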
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
index 6f7c6573b..6ae5efd51 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
@@ -448,33 +448,7 @@ private void processPage(WebURL curURL) throws IOException, InterruptedException
                     onRedirectedStatusCode(page);

                     if (myController.getConfig().isFollowRedirects()) {
-                        int newDocId = docIdServer.getDocId(movedToUrl);
-                        if (newDocId > 0) {
-                            logger.debug("Redirect page: {} is already seen", curURL);
-                            return;
-                        }
-
-                        WebURL webURL = new WebURL();
-                        webURL.setTldList(myController.getTldList());
-                        webURL.setURL(movedToUrl);
-                        webURL.setParentDocid(curURL.getParentDocid());
-                        webURL.setParentUrl(curURL.getParentUrl());
-                        webURL.setDepth(curURL.getDepth());
-                        webURL.setDocid(-1);
-                        webURL.setAnchor(curURL.getAnchor());
-                        if (shouldVisit(page, webURL)) {
-                            if (!shouldFollowLinksIn(webURL) || robotstxtServer.allows(webURL)) {
-                                webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
-                                frontier.schedule(webURL);
-                            } else {
-                                logger.debug(
-                                    "Not visiting: {} as per the server's \"robots.txt\" policy",
-                                    webURL.getURL());
-                            }
-                        } else {
-                            logger.debug("Not visiting: {} as per your \"shouldVisit\" policy",
-                                         webURL.getURL());
-                        }
+                        redirectionPhase(page, curURL, movedToUrl);
                     }
                 } else { // All other http codes other than 3xx & 200
                     String description =
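Example (not part of the patch): a minimal sketch of a crawler that uses the new redirect hooks to keep redirects on the originating domain. SameDomainCrawler is a hypothetical name; performRedirect(WebURL, WebURL) is the protected hook introduced further down in this diff.

import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;

public class SameDomainCrawler extends WebCrawler {

    @Override
    protected void performRedirect(WebURL target, WebURL currURL) {
        // Only schedule redirect targets that stay on the same domain as the redirected URL.
        if (target.getDomain().equals(currURL.getDomain())) {
            super.performRedirect(target, currURL);
        } else {
            logger.debug("Skipping cross-domain redirect to {}", target.getURL());
        }
    }
}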
@@ -513,40 +487,7 @@ private void processPage(WebURL curURL) throws IOException, InterruptedException
                 parser.parse(page, curURL.getURL());

                 if (shouldFollowLinksIn(page.getWebURL())) {
-                    ParseData parseData = page.getParseData();
-                    List<WebURL> toSchedule = new ArrayList<>();
-                    int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
-                    for (WebURL webURL : parseData.getOutgoingUrls()) {
-                        webURL.setParentDocid(curURL.getDocid());
-                        webURL.setParentUrl(curURL.getURL());
-                        int newdocid = docIdServer.getDocId(webURL.getURL());
-                        if (newdocid > 0) {
-                            // This is not the first time that this Url is visited. So, we set the
-                            // depth to a negative number.
-                            webURL.setDepth((short) -1);
-                            webURL.setDocid(newdocid);
-                        } else {
-                            webURL.setDocid(-1);
-                            webURL.setDepth((short) (curURL.getDepth() + 1));
-                            if ((maxCrawlDepth == -1) || (curURL.getDepth() < maxCrawlDepth)) {
-                                if (shouldVisit(page, webURL)) {
-                                    if (robotstxtServer.allows(webURL)) {
-                                        webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
-                                        toSchedule.add(webURL);
-                                    } else {
-                                        logger.debug(
-                                            "Not visiting: {} as per the server's \"robots.txt\" " +
-                                            "policy", webURL.getURL());
-                                    }
-                                } else {
-                                    logger.debug(
-                                        "Not visiting: {} as per your \"shouldVisit\" policy",
-                                        webURL.getURL());
-                                }
-                            }
-                        }
-                    }
-                    frontier.scheduleAll(toSchedule);
+                    scheduleOutgoingUrls(page, curURL);
                 } else {
                     logger.debug("Not looking for links in page {}, " +
                                  "as per your \"shouldFollowLinksInPage\" policy",
@@ -584,6 +525,104 @@ private void processPage(WebURL curURL) throws IOException, InterruptedException
         }
     }

+    protected WebURL createEmptyWebURL() {
+        return new WebURL();
+    }
+
+    protected void scheduleOutgoingUrls(Page page, WebURL curURL) throws IOException, InterruptedException {
+        ParseData parseData = page.getParseData();
+        List<WebURL> toSchedule = new ArrayList<>();
+        int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
+        for (WebURL webURL : parseData.getOutgoingUrls()) {
+            webURL.setParentDocid(curURL.getDocid());
+            webURL.setParentUrl(curURL.getURL());
+            int newdocid = docIdServer.getDocId(webURL.getURL());
+            if (newdocid > 0) {
+                // This is not the first time that this Url is visited. So, we set the
+                // depth to a negative number.
+                webURL.setDepth((short) -1);
+                webURL.setDocid(newdocid);
+            } else {
+                webURL.setDocid(-1);
+                webURL.setDepth((short) (curURL.getDepth() + 1));
+                if ((maxCrawlDepth == -1) || (curURL.getDepth() < maxCrawlDepth)) {
+                    if (shouldVisit(page, webURL)) {
+                        if (robotstxtServer.allows(webURL)) {
+                            webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
+                            toSchedule.add(webURL);
+                        } else {
+                            logger.debug(
+                                "Not visiting: {} as per the server's \"robots.txt\" " +
+                                "policy", webURL.getURL());
+                        }
+                    } else {
+                        logger.debug(
+                            "Not visiting: {} as per your \"shouldVisit\" policy",
+                            webURL.getURL());
+                    }
+                }
+            }
+        }
+        scheduleAll(toSchedule);
+    }
+
+    protected void redirectionPhase(Page page, WebURL curURL, String movedToUrl)
+            throws IOException, InterruptedException {
+        int newDocId = docIdServer.getDocId(movedToUrl);
+        if (newDocId > 0) {
+            logger.debug("Redirect page: {} is already seen", curURL);
+            return;
+        }
+
+        WebURL webURL = createRedirectedWebURL(curURL, movedToUrl);
+        if (shouldVisit(page, webURL)) {
+            if (!shouldFollowLinksIn(webURL) || robotstxtServer.allows(webURL)) {
+                performRedirect(webURL, curURL);
+            } else {
+                logger.debug(
+                    "Not visiting: {} as per the server's \"robots.txt\" policy",
+                    webURL.getURL());
+            }
+        } else {
+            logger.debug("Not visiting: {} as per your \"shouldVisit\" policy",
+                         webURL.getURL());
+        }
+    }
+
+    protected void performRedirect(WebURL target, WebURL currURL) {
+        target.setDocid(docIdServer.getNewDocID(target.getURL()));
+        schedule(target);
+    }
+
+    protected void schedule(WebURL url) {
+        frontier.schedule(url);
+    }
+
+    protected void scheduleAll(List<WebURL> urls) {
+        frontier.scheduleAll(urls);
+    }
+
+    /**
+     * Creates a new WebURL based on the provided WebURL data.
+     *
+     * Subclasses may use additional parameters or return subclasses of WebURL.
+     *
+     * @param curURL the URL whose fetch was redirected
+     * @param movedToUrl the redirect target
+     * @return a WebURL pointing at movedToUrl
+     */
+    protected WebURL createRedirectedWebURL(WebURL curURL, String movedToUrl) {
+        WebURL webURL = createEmptyWebURL();
+        webURL.setTldList(myController.getTldList());
+        webURL.setURL(movedToUrl);
+        webURL.setParentDocid(curURL.getParentDocid());
+        webURL.setParentUrl(curURL.getParentUrl());
+        webURL.setDepth(curURL.getDepth());
+        webURL.setAnchor(curURL.getAnchor());
+        webURL.setDocid(-1);
+        return webURL;
+    }
+
     public Thread getThread() {
         return myThread;
     }
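Example (not part of the patch): a minimal sketch of a crawler whose redirect URLs are instances of a WebURL subclass, using the new createEmptyWebURL()/createRedirectedWebURL hooks. ScoredWebURL and ScoringCrawler are hypothetical names; persisting the extra field across the work queues would additionally need the WorkQueues/binding hook shown at the end of this diff.

import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;

// Hypothetical WebURL subclass carrying a priority score.
class ScoredWebURL extends WebURL {
    private double score;

    public double getScore() {
        return score;
    }

    public void setScore(double score) {
        this.score = score;
    }
}

public class ScoringCrawler extends WebCrawler {

    @Override
    protected WebURL createEmptyWebURL() {
        return new ScoredWebURL();
    }

    @Override
    protected WebURL createRedirectedWebURL(WebURL curURL, String movedToUrl) {
        WebURL webURL = super.createRedirectedWebURL(curURL, movedToUrl);
        if (webURL instanceof ScoredWebURL && curURL instanceof ScoredWebURL) {
            // Carry the parent's score over to the redirect target.
            ((ScoredWebURL) webURL).setScore(((ScoredWebURL) curURL).getScore());
        }
        return webURL;
    }
}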
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/Frontier.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/Frontier.java
index d80ebdf5a..d295550ee 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/Frontier.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/Frontier.java
@@ -55,7 +55,7 @@ public Frontier(Environment env, CrawlConfig config) {
         this.config = config;
         this.counters = new Counters(env, config);
         try {
-            workQueues = new WorkQueues(env, DATABASE_NAME, config.isResumableCrawling());
+            workQueues = createWorkQueues(env, config, DATABASE_NAME);
             if (config.isResumableCrawling()) {
                 scheduledPages = counters.getValue(Counters.ReservedCounterNames.SCHEDULED_PAGES);
                 inProcessPages = new InProcessPagesDB(env);
@@ -208,4 +208,14 @@ public void finish() {
             waitingList.notifyAll();
         }
     }
+
+    /**
+     * Creates the WorkQueues for this frontier. Can be overridden to create
+     * subclasses of WorkQueues instead.
+     * @return the WorkQueues backing this frontier
+     * @see Environment#openDatabase
+     */
+    protected WorkQueues createWorkQueues(Environment env, CrawlConfig config, String databaseName) {
+        return new WorkQueues(env, databaseName, config.isResumableCrawling());
+    }
 }
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/WorkQueues.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/WorkQueues.java
index eeb474d03..dad6d094b 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/WorkQueues.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/WorkQueues.java
@@ -45,6 +45,10 @@ public class WorkQueues {
     protected final Object mutex = new Object();

     public WorkQueues(Environment env, String dbName, boolean resumable) {
+        this(env, dbName, resumable, new WebURLTupleBinding());
+    }
+
+    protected WorkQueues(Environment env, String dbName, boolean resumable, WebURLTupleBinding webURLBinding) {
         this.env = env;
         this.resumable = resumable;
         DatabaseConfig dbConfig = new DatabaseConfig();
@@ -52,7 +56,7 @@ public WorkQueues(Environment env, String dbName, boolean resumable) {
         dbConfig.setTransactional(resumable);
         dbConfig.setDeferredWrite(!resumable);
         urlsDB = env.openDatabase(null, dbName, dbConfig);
-        webURLBinding = new WebURLTupleBinding();
+        this.webURLBinding = webURLBinding;
     }

     protected Transaction beginTransaction() {
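Example (not part of the patch): a minimal sketch of how the Frontier and WorkQueues hooks could be combined to persist a WebURL subclass in the work queues. CustomTupleBinding, CustomWorkQueues and CustomFrontier are hypothetical names; a real CustomTupleBinding would override entryToObject/objectToEntry so the extra fields of the WebURL subclass survive the Berkeley DB round trip. Note that createWorkQueues is invoked from the Frontier constructor, so the override should not rely on subclass fields.

import com.sleepycat.je.Environment;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.frontier.Frontier;
import edu.uci.ics.crawler4j.frontier.WorkQueues;
import edu.uci.ics.crawler4j.url.WebURLTupleBinding;

// Hypothetical binding; a real one would serialize/deserialize the extra fields of a WebURL subclass.
class CustomTupleBinding extends WebURLTupleBinding {
}

// Exposes the new protected WorkQueues constructor with the custom binding plugged in.
class CustomWorkQueues extends WorkQueues {
    CustomWorkQueues(Environment env, String dbName, boolean resumable) {
        super(env, dbName, resumable, new CustomTupleBinding());
    }
}

public class CustomFrontier extends Frontier {

    public CustomFrontier(Environment env, CrawlConfig config) {
        super(env, config);
    }

    @Override
    protected WorkQueues createWorkQueues(Environment env, CrawlConfig config, String databaseName) {
        return new CustomWorkQueues(env, databaseName, config.isResumableCrawling());
    }
}

A controller subclass could then return CustomFrontier from the createFrontier hook added in CrawlController above, assuming the Environment it needs is accessible to that subclass.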