Base classes provide more protected methods for subclasses #432

Open · wants to merge 9 commits into master
crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java
@@ -141,7 +141,7 @@ public CrawlController(CrawlConfig config, PageFetcher pageFetcher, Parser parse

env = new Environment(envHome, envConfig);
docIdServer = new DocIDServer(env, config);
frontier = new Frontier(env, config);
frontier = createFrontier(config);

this.pageFetcher = pageFetcher;
this.parser = parser == null ? new Parser(config, tldList) : parser;
@@ -153,6 +153,15 @@ public CrawlController(CrawlConfig config, PageFetcher pageFetcher, Parser parse
robotstxtServer.setCrawlConfig(config);
}

/**
* Creates the Frontier for this instance. Subclasses can override this to provide a custom Frontier.
* @param config configuration provided to the CrawlController
* @return the Frontier to be used by this CrawlController
*/
protected Frontier createFrontier(CrawlConfig config) {
return new Frontier(env, config);
}

public Parser getParser() {
return parser;
}
@@ -534,7 +543,7 @@ public void addSeed(String pageUrl, int docId) throws IOException, InterruptedEx
}
}

WebURL webUrl = new WebURL();
WebURL webUrl = createEmptyWebURL(pageUrl);
webUrl.setTldList(tldList);
webUrl.setURL(canonicalUrl);
webUrl.setDocid(docId);
@@ -548,6 +557,15 @@ public void addSeed(String pageUrl, int docId) throws IOException, InterruptedEx
}
}

/**
* Creates an empty WebURL. Subclasses can override this to return subclasses of WebURL instead.
* @param nonCanonicalString the URL before canonicalization; ignored by the default implementation
* @return a new, empty WebURL instance
*/
protected WebURL createEmptyWebURL(String nonCanonicalString) {
return new WebURL();
}

/**
* This function can be called to assign a specific document id to a url. This
* feature is useful when you have had a previous crawl and have stored the
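Taken together, the two new CrawlController hooks let a subclass decide which Frontier and WebURL implementations the controller uses without copying the constructor or addSeed logic. Below is a minimal sketch of such a subclass; SeedAwareWebURL and SeedAwareController are hypothetical names introduced only for illustration and are not part of this PR, and the constructor mirrors the existing three-argument CrawlController constructor.

```java
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.frontier.Frontier;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.WebURL;

// Hypothetical WebURL subclass that remembers the seed exactly as the caller passed it.
class SeedAwareWebURL extends WebURL {
    private String rawSeed;

    public void setRawSeed(String rawSeed) {
        this.rawSeed = rawSeed;
    }

    public String getRawSeed() {
        return rawSeed;
    }
}

// Hypothetical controller subclass wired through the new protected hooks.
public class SeedAwareController extends CrawlController {

    public SeedAwareController(CrawlConfig config, PageFetcher pageFetcher,
                               RobotstxtServer robotstxtServer) throws Exception {
        super(config, pageFetcher, robotstxtServer);
    }

    @Override
    protected Frontier createFrontier(CrawlConfig config) {
        // Hook for plugging in a Frontier subclass; this sketch keeps the default.
        return super.createFrontier(config);
    }

    @Override
    protected WebURL createEmptyWebURL(String nonCanonicalString) {
        // Seeds become SeedAwareWebURL instances; the raw, pre-canonicalization
        // URL is captured even though the default implementation ignores it.
        SeedAwareWebURL url = new SeedAwareWebURL();
        url.setRawSeed(nonCanonicalString);
        return url;
    }
}
```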
161 changes: 100 additions & 61 deletions crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
@@ -448,33 +448,7 @@ private void processPage(WebURL curURL) throws IOException, InterruptedException
onRedirectedStatusCode(page);

if (myController.getConfig().isFollowRedirects()) {
int newDocId = docIdServer.getDocId(movedToUrl);
if (newDocId > 0) {
logger.debug("Redirect page: {} is already seen", curURL);
return;
}

WebURL webURL = new WebURL();
webURL.setTldList(myController.getTldList());
webURL.setURL(movedToUrl);
webURL.setParentDocid(curURL.getParentDocid());
webURL.setParentUrl(curURL.getParentUrl());
webURL.setDepth(curURL.getDepth());
webURL.setDocid(-1);
webURL.setAnchor(curURL.getAnchor());
if (shouldVisit(page, webURL)) {
if (!shouldFollowLinksIn(webURL) || robotstxtServer.allows(webURL)) {
webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
frontier.schedule(webURL);
} else {
logger.debug(
"Not visiting: {} as per the server's \"robots.txt\" policy",
webURL.getURL());
}
} else {
logger.debug("Not visiting: {} as per your \"shouldVisit\" policy",
webURL.getURL());
}
redirectionPhase(page, curURL, movedToUrl);
}
} else { // All other http codes other than 3xx & 200
String description =
@@ -513,40 +487,7 @@ private void processPage(WebURL curURL) throws IOException, InterruptedException
parser.parse(page, curURL.getURL());

if (shouldFollowLinksIn(page.getWebURL())) {
ParseData parseData = page.getParseData();
List<WebURL> toSchedule = new ArrayList<>();
int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
for (WebURL webURL : parseData.getOutgoingUrls()) {
webURL.setParentDocid(curURL.getDocid());
webURL.setParentUrl(curURL.getURL());
int newdocid = docIdServer.getDocId(webURL.getURL());
if (newdocid > 0) {
// This is not the first time that this Url is visited. So, we set the
// depth to a negative number.
webURL.setDepth((short) -1);
webURL.setDocid(newdocid);
} else {
webURL.setDocid(-1);
webURL.setDepth((short) (curURL.getDepth() + 1));
if ((maxCrawlDepth == -1) || (curURL.getDepth() < maxCrawlDepth)) {
if (shouldVisit(page, webURL)) {
if (robotstxtServer.allows(webURL)) {
webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
toSchedule.add(webURL);
} else {
logger.debug(
"Not visiting: {} as per the server's \"robots.txt\" " +
"policy", webURL.getURL());
}
} else {
logger.debug(
"Not visiting: {} as per your \"shouldVisit\" policy",
webURL.getURL());
}
}
}
}
frontier.scheduleAll(toSchedule);
scheduleOutgoingUrls(page, curURL);
} else {
logger.debug("Not looking for links in page {}, "
+ "as per your \"shouldFollowLinksInPage\" policy",
@@ -584,6 +525,104 @@ private void processPage(WebURL curURL) throws IOException, InterruptedException
}
}

protected WebURL createEmptyWebURL() {
return new WebURL();
}

protected void scheduleOutgoingUrls(Page page, WebURL curURL) throws IOException, InterruptedException {
ParseData parseData = page.getParseData();
List<WebURL> toSchedule = new ArrayList<>();
int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
for (WebURL webURL : parseData.getOutgoingUrls()) {
webURL.setParentDocid(curURL.getDocid());
webURL.setParentUrl(curURL.getURL());
int newdocid = docIdServer.getDocId(webURL.getURL());
if (newdocid > 0) {
// This is not the first time that this Url is visited. So, we set the
// depth to a negative number.
webURL.setDepth((short) -1);
webURL.setDocid(newdocid);
} else {
webURL.setDocid(-1);
webURL.setDepth((short) (curURL.getDepth() + 1));
if ((maxCrawlDepth == -1) || (curURL.getDepth() < maxCrawlDepth)) {
if (shouldVisit(page, webURL)) {
if (robotstxtServer.allows(webURL)) {
webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
toSchedule.add(webURL);
} else {
logger.debug(
"Not visiting: {} as per the server's \"robots.txt\" " +
"policy", webURL.getURL());
}
} else {
logger.debug(
"Not visiting: {} as per your \"shouldVisit\" policy",
webURL.getURL());
}
}
}
}
scheduleAll(toSchedule);
}

protected void redirectionPhase(Page page, WebURL curURL, String movedToUrl)
throws IOException, InterruptedException {
int newDocId = docIdServer.getDocId(movedToUrl);
if (newDocId > 0) {
logger.debug("Redirect page: {} is already seen", curURL);
return;
}

WebURL webURL = createRedirectedWebURL(curURL, movedToUrl);
if (shouldVisit(page, webURL)) {
if (!shouldFollowLinksIn(webURL) || robotstxtServer.allows(webURL)) {
performRedirect(webURL, curURL);
} else {
logger.debug(
"Not visiting: {} as per the server's \"robots.txt\" policy",
webURL.getURL());
}
} else {
logger.debug("Not visiting: {} as per your \"shouldVisit\" policy",
webURL.getURL());
}
}

protected void performRedirect(WebURL target, WebURL currURL) {
target.setDocid(docIdServer.getNewDocID(target.getURL()));
schedule(target);
}

protected void schedule(WebURL url) {
frontier.schedule(url);
}

protected void scheduleAll(List<WebURL> urls) {
frontier.scheduleAll(urls);
}

/**
* Creates a new WebURL based on provided WebURL data.
*
* Subclasses may use additional parameters or return subclasses of WebURL.
*
* @param curURL the WebURL that was redirected
* @param movedToUrl the URL the server redirected to
* @return a new WebURL pointing at the redirect target
*/
protected WebURL createRedirectedWebURL(WebURL curURL, String movedToUrl) {
WebURL webURL = createEmptyWebURL();
webURL.setTldList(myController.getTldList());
webURL.setURL(movedToUrl);
webURL.setParentDocid(curURL.getParentDocid());
webURL.setParentUrl(curURL.getParentUrl());
webURL.setDepth(curURL.getDepth());
webURL.setAnchor(curURL.getAnchor());
webURL.setDocid(-1);
return webURL;
}

public Thread getThread() {
return myThread;
}
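With processPage now delegating to redirectionPhase, scheduleOutgoingUrls, and the protected schedule/scheduleAll wrappers, a WebCrawler subclass can intercept redirects and filter what reaches the frontier without re-implementing processPage. A minimal sketch follows; the FilteringCrawler name and the .pdf filter are chosen purely for illustration, and logging goes through the logger inherited from WebCrawler.

```java
import java.util.List;
import java.util.stream.Collectors;

import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;

// Hypothetical crawler subclass built on the new protected extension points.
public class FilteringCrawler extends WebCrawler {

    @Override
    protected WebURL createRedirectedWebURL(WebURL curURL, String movedToUrl) {
        // Reuse the default copying of parent/depth/anchor data, then log the hop
        // before the redirect target is scheduled.
        WebURL target = super.createRedirectedWebURL(curURL, movedToUrl);
        logger.info("Following redirect {} -> {}", curURL.getURL(), movedToUrl);
        return target;
    }

    @Override
    protected void schedule(WebURL url) {
        logger.debug("Scheduling {}", url.getURL());
        super.schedule(url);
    }

    @Override
    protected void scheduleAll(List<WebURL> urls) {
        // Single choke point for everything the crawler hands to the frontier.
        List<WebURL> filtered = urls.stream()
                .filter(url -> !url.getURL().endsWith(".pdf"))
                .collect(Collectors.toList());
        super.scheduleAll(filtered);
    }
}
```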
crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/Frontier.java
@@ -55,7 +55,7 @@ public Frontier(Environment env, CrawlConfig config) {
this.config = config;
this.counters = new Counters(env, config);
try {
workQueues = new WorkQueues(env, DATABASE_NAME, config.isResumableCrawling());
workQueues = createWorkQueues(env, config, DATABASE_NAME);
if (config.isResumableCrawling()) {
scheduledPages = counters.getValue(Counters.ReservedCounterNames.SCHEDULED_PAGES);
inProcessPages = new InProcessPagesDB(env);
Expand Down Expand Up @@ -208,4 +208,14 @@ public void finish() {
waitingList.notifyAll();
}
}

/**
* Creates the WorkQueues for this frontier. Can be overridden to create
* subclasses of WorkQueues instead.
* @param env the BerkeleyDB environment backing the queues
* @param config the crawl configuration
* @param databaseName name of the database holding the queued URLs
* @return the WorkQueues used by this Frontier
* @see Environment#openDatabase
*/
protected WorkQueues createWorkQueues(Environment env, CrawlConfig config, String databaseName) {
return new WorkQueues(env, databaseName, config.isResumableCrawling());
}
}
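Because WorkQueues creation is now behind createWorkQueues, a Frontier subclass can store its queues differently or hand them a custom tuple binding. A minimal sketch follows; CustomQueueFrontier is a hypothetical name, and CustomBindingWorkQueues refers to the hypothetical WorkQueues subclass sketched after the WorkQueues hunk below.

```java
import com.sleepycat.je.Environment;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.frontier.Frontier;
import edu.uci.ics.crawler4j.frontier.WorkQueues;

// Hypothetical Frontier subclass that swaps in a custom WorkQueues implementation.
public class CustomQueueFrontier extends Frontier {

    public CustomQueueFrontier(Environment env, CrawlConfig config) {
        super(env, config);
    }

    @Override
    protected WorkQueues createWorkQueues(Environment env, CrawlConfig config, String databaseName) {
        // Called from the Frontier constructor; returns the WorkQueues subclass
        // sketched below instead of the default.
        return new CustomBindingWorkQueues(env, databaseName, config.isResumableCrawling());
    }
}
```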
crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/WorkQueues.java
@@ -45,14 +45,18 @@ public class WorkQueues {
protected final Object mutex = new Object();

public WorkQueues(Environment env, String dbName, boolean resumable) {
this(env, dbName, resumable, new WebURLTupleBinding());
}

protected WorkQueues(Environment env, String dbName, boolean resumable, WebURLTupleBinding webURLBinding) {
this.env = env;
this.resumable = resumable;
DatabaseConfig dbConfig = new DatabaseConfig();
dbConfig.setAllowCreate(true);
dbConfig.setTransactional(resumable);
dbConfig.setDeferredWrite(!resumable);
urlsDB = env.openDatabase(null, dbName, dbConfig);
webURLBinding = new WebURLTupleBinding();
this.webURLBinding = webURLBinding;
}

protected Transaction beginTransaction() {
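Finally, the new protected WorkQueues constructor accepts the WebURLTupleBinding to use, which is what a WebURL subclass needs if its extra fields are to survive (de)serialization in the queue database. A minimal sketch follows, under the assumption that WebURLTupleBinding's entryToObject/objectToEntry methods can be overridden; both class names are hypothetical, and the overrides only mark where additional fields would be read and written.

```java
import com.sleepycat.bind.tuple.TupleInput;
import com.sleepycat.bind.tuple.TupleOutput;
import com.sleepycat.je.Environment;

import edu.uci.ics.crawler4j.frontier.WorkQueues;
import edu.uci.ics.crawler4j.url.WebURL;
import edu.uci.ics.crawler4j.url.WebURLTupleBinding;

// Hypothetical binding that persists extra fields of a WebURL subclass alongside
// everything the default WebURLTupleBinding already writes.
class ExtendedWebURLTupleBinding extends WebURLTupleBinding {

    @Override
    public WebURL entryToObject(TupleInput input) {
        WebURL url = super.entryToObject(input);
        // ... read the additional fields written by objectToEntry ...
        return url;
    }

    @Override
    public void objectToEntry(WebURL url, TupleOutput output) {
        super.objectToEntry(url, output);
        // ... write the additional fields of the WebURL subclass ...
    }
}

// Hypothetical WorkQueues subclass passing the binding through the new
// protected constructor.
public class CustomBindingWorkQueues extends WorkQueues {

    public CustomBindingWorkQueues(Environment env, String dbName, boolean resumable) {
        super(env, dbName, resumable, new ExtendedWebURLTupleBinding());
    }
}
```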