Base classes provide more protected methods for subclasses #432

Open · wants to merge 9 commits into master
crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java
@@ -141,7 +141,7 @@ public CrawlController(CrawlConfig config, PageFetcher pageFetcher, Parser parse

env = new Environment(envHome, envConfig);
docIdServer = new DocIDServer(env, config);
frontier = new Frontier(env, config);
frontier = createFrontier(config);

this.pageFetcher = pageFetcher;
this.parser = parser == null ? new Parser(config, tldList) : parser;
@@ -153,6 +153,15 @@ public CrawlController(CrawlConfig config, PageFetcher pageFetcher, Parser parse
robotstxtServer.setCrawlConfig(config);
}

/**
* Creates the Frontier for this instance. Subclasses can override this to provide a custom Frontier.
* @param config configuration provided to the CrawlController
* @return the Frontier to be used by this CrawlController
*/
protected Frontier createFrontier(CrawlConfig config) {
return new Frontier(env, config);
}

public Parser getParser() {
return parser;
}
@@ -534,7 +543,7 @@ public void addSeed(String pageUrl, int docId) throws IOException, InterruptedEx
}
}

WebURL webUrl = new WebURL();
WebURL webUrl = createEmptyWebURL(pageUrl);
webUrl.setTldList(tldList);
webUrl.setURL(canonicalUrl);
webUrl.setDocid(docId);
@@ -548,6 +557,15 @@ public void addSeed(String pageUrl, int docId) throws IOException, InterruptedEx
}
}

/**
* Creates an empty WebURL. Subclasses can override this to return subclasses of WebURL instead.
* @param nonCanonicalString the URL before canonicalization; ignored by the default implementation
* @return a new, empty WebURL instance
*/
protected WebURL createEmptyWebURL(String nonCanonicalString) {
return new WebURL();
}

/**
* This function can be called to assign a specific document id to a url. This
* feature is useful when you have had a previous crawl and have stored the
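Taken together, the two new CrawlController hooks let a subclass decide which Frontier and WebURL implementations the controller uses without copying the constructor or addSeed logic. Below is a minimal sketch of such a subclass; SeedAwareWebURL and SeedAwareController are hypothetical names introduced only for illustration and are not part of this PR, and the constructor mirrors the existing three-argument CrawlController constructor.

```java
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.frontier.Frontier;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.WebURL;

// Hypothetical WebURL subclass that remembers the seed exactly as the caller passed it.
class SeedAwareWebURL extends WebURL {
    private String rawSeed;

    public void setRawSeed(String rawSeed) {
        this.rawSeed = rawSeed;
    }

    public String getRawSeed() {
        return rawSeed;
    }
}

// Hypothetical controller subclass wired through the new protected hooks.
public class SeedAwareController extends CrawlController {

    public SeedAwareController(CrawlConfig config, PageFetcher pageFetcher,
                               RobotstxtServer robotstxtServer) throws Exception {
        super(config, pageFetcher, robotstxtServer);
    }

    @Override
    protected Frontier createFrontier(CrawlConfig config) {
        // Hook for plugging in a Frontier subclass; this sketch keeps the default.
        return super.createFrontier(config);
    }

    @Override
    protected WebURL createEmptyWebURL(String nonCanonicalString) {
        // Seeds become SeedAwareWebURL instances; the raw, pre-canonicalization
        // URL is captured even though the default implementation ignores it.
        SeedAwareWebURL url = new SeedAwareWebURL();
        url.setRawSeed(nonCanonicalString);
        return url;
    }
}
```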
161 changes: 100 additions & 61 deletions crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
@@ -448,33 +448,7 @@ private void processPage(WebURL curURL) throws IOException, InterruptedException
onRedirectedStatusCode(page);

if (myController.getConfig().isFollowRedirects()) {
int newDocId = docIdServer.getDocId(movedToUrl);
if (newDocId > 0) {
logger.debug("Redirect page: {} is already seen", curURL);
return;
}

WebURL webURL = new WebURL();
webURL.setTldList(myController.getTldList());
webURL.setURL(movedToUrl);
webURL.setParentDocid(curURL.getParentDocid());
webURL.setParentUrl(curURL.getParentUrl());
webURL.setDepth(curURL.getDepth());
webURL.setDocid(-1);
webURL.setAnchor(curURL.getAnchor());
if (shouldVisit(page, webURL)) {
if (!shouldFollowLinksIn(webURL) || robotstxtServer.allows(webURL)) {
webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
frontier.schedule(webURL);
} else {
logger.debug(
"Not visiting: {} as per the server's \"robots.txt\" policy",
webURL.getURL());
}
} else {
logger.debug("Not visiting: {} as per your \"shouldVisit\" policy",
webURL.getURL());
}
redirectionPhase(page, curURL, movedToUrl);
}
} else { // All other http codes other than 3xx & 200
String description =
@@ -513,40 +487,7 @@ private void processPage(WebURL curURL) throws IOException, InterruptedException
parser.parse(page, curURL.getURL());

if (shouldFollowLinksIn(page.getWebURL())) {
ParseData parseData = page.getParseData();
List<WebURL> toSchedule = new ArrayList<>();
int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
for (WebURL webURL : parseData.getOutgoingUrls()) {
webURL.setParentDocid(curURL.getDocid());
webURL.setParentUrl(curURL.getURL());
int newdocid = docIdServer.getDocId(webURL.getURL());
if (newdocid > 0) {
// This is not the first time that this Url is visited. So, we set the
// depth to a negative number.
webURL.setDepth((short) -1);
webURL.setDocid(newdocid);
} else {
webURL.setDocid(-1);
webURL.setDepth((short) (curURL.getDepth() + 1));
if ((maxCrawlDepth == -1) || (curURL.getDepth() < maxCrawlDepth)) {
if (shouldVisit(page, webURL)) {
if (robotstxtServer.allows(webURL)) {
webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
toSchedule.add(webURL);
} else {
logger.debug(
"Not visiting: {} as per the server's \"robots.txt\" " +
"policy", webURL.getURL());
}
} else {
logger.debug(
"Not visiting: {} as per your \"shouldVisit\" policy",
webURL.getURL());
}
}
}
}
frontier.scheduleAll(toSchedule);
scheduleOutgoingUrls(page, curURL);
} else {
logger.debug("Not looking for links in page {}, "
+ "as per your \"shouldFollowLinksInPage\" policy",
@@ -584,6 +525,104 @@ private void processPage(WebURL curURL) throws IOException, InterruptedException
}
}

protected WebURL createEmptyWebURL() {
return new WebURL();
}

protected void scheduleOutgoingUrls(Page page, WebURL curURL) throws IOException, InterruptedException {
ParseData parseData = page.getParseData();
List<WebURL> toSchedule = new ArrayList<>();
int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
for (WebURL webURL : parseData.getOutgoingUrls()) {
webURL.setParentDocid(curURL.getDocid());
webURL.setParentUrl(curURL.getURL());
int newdocid = docIdServer.getDocId(webURL.getURL());
if (newdocid > 0) {
// This is not the first time that this Url is visited. So, we set the
// depth to a negative number.
webURL.setDepth((short) -1);
webURL.setDocid(newdocid);
} else {
webURL.setDocid(-1);
webURL.setDepth((short) (curURL.getDepth() + 1));
if ((maxCrawlDepth == -1) || (curURL.getDepth() < maxCrawlDepth)) {
if (shouldVisit(page, webURL)) {
if (robotstxtServer.allows(webURL)) {
webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
toSchedule.add(webURL);
} else {
logger.debug(
"Not visiting: {} as per the server's \"robots.txt\" " +
"policy", webURL.getURL());
}
} else {
logger.debug(
"Not visiting: {} as per your \"shouldVisit\" policy",
webURL.getURL());
}
}
}
}
scheduleAll(toSchedule);
}

protected void redirectionPhase(Page page, WebURL curURL, String movedToUrl)
throws IOException, InterruptedException {
int newDocId = docIdServer.getDocId(movedToUrl);
if (newDocId > 0) {
logger.debug("Redirect page: {} is already seen", curURL);
return;
}

WebURL webURL = createRedirectedWebURL(curURL, movedToUrl);
if (shouldVisit(page, webURL)) {
if (!shouldFollowLinksIn(webURL) || robotstxtServer.allows(webURL)) {
performRedirect(webURL, curURL);
} else {
logger.debug(
"Not visiting: {} as per the server's \"robots.txt\" policy",
webURL.getURL());
}
} else {
logger.debug("Not visiting: {} as per your \"shouldVisit\" policy",
webURL.getURL());
}
}

protected void performRedirect(WebURL target, WebURL currURL) {
target.setDocid(docIdServer.getNewDocID(target.getURL()));
schedule(target);
}

protected void schedule(WebURL url) {
frontier.schedule(url);
}

protected void scheduleAll(List<WebURL> urls) {
frontier.scheduleAll(urls);
}

/**
* Creates a new WebURL based on provided WebURL data.
*
* Subclasses may use additional parameters or return subclasses of WebURL.
*
* @param curURL the WebURL that was redirected
* @param movedToUrl the URL the server redirected to
* @return a new WebURL pointing at the redirect target
*/
protected WebURL createRedirectedWebURL(WebURL curURL, String movedToUrl) {
WebURL webURL = createEmptyWebURL();
webURL.setTldList(myController.getTldList());
webURL.setURL(movedToUrl);
webURL.setParentDocid(curURL.getParentDocid());
webURL.setParentUrl(curURL.getParentUrl());
webURL.setDepth(curURL.getDepth());
webURL.setAnchor(curURL.getAnchor());
webURL.setDocid(-1);
return webURL;
}

public Thread getThread() {
return myThread;
}
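With processPage now delegating to redirectionPhase, scheduleOutgoingUrls, and the protected schedule/scheduleAll wrappers, a WebCrawler subclass can intercept redirects and filter what reaches the frontier without re-implementing processPage. A minimal sketch follows; the FilteringCrawler name and the .pdf filter are chosen purely for illustration, and logging goes through the logger inherited from WebCrawler.

```java
import java.util.List;
import java.util.stream.Collectors;

import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;

// Hypothetical crawler subclass built on the new protected extension points.
public class FilteringCrawler extends WebCrawler {

    @Override
    protected WebURL createRedirectedWebURL(WebURL curURL, String movedToUrl) {
        // Reuse the default copying of parent/depth/anchor data, then log the hop
        // before the redirect target is scheduled.
        WebURL target = super.createRedirectedWebURL(curURL, movedToUrl);
        logger.info("Following redirect {} -> {}", curURL.getURL(), movedToUrl);
        return target;
    }

    @Override
    protected void schedule(WebURL url) {
        logger.debug("Scheduling {}", url.getURL());
        super.schedule(url);
    }

    @Override
    protected void scheduleAll(List<WebURL> urls) {
        // Single choke point for everything the crawler hands to the frontier.
        List<WebURL> filtered = urls.stream()
                .filter(url -> !url.getURL().endsWith(".pdf"))
                .collect(Collectors.toList());
        super.scheduleAll(filtered);
    }
}
```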
crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/Frontier.java
@@ -55,7 +55,7 @@ public Frontier(Environment env, CrawlConfig config) {
this.config = config;
this.counters = new Counters(env, config);
try {
workQueues = new WorkQueues(env, DATABASE_NAME, config.isResumableCrawling());
workQueues = createWorkQueues(env, config, DATABASE_NAME);
if (config.isResumableCrawling()) {
scheduledPages = counters.getValue(Counters.ReservedCounterNames.SCHEDULED_PAGES);
inProcessPages = new InProcessPagesDB(env);
Expand Down Expand Up @@ -208,4 +208,14 @@ public void finish() {
waitingList.notifyAll();
}
}

/**
* Creates the WorkQueues for this frontier. Can be overridden to create
* subclasses of WorkQueues instead.
* @param env the BerkeleyDB environment backing the queues
* @param config the crawl configuration
* @param databaseName name of the database holding the queued URLs
* @return the WorkQueues used by this Frontier
* @see Environment#openDatabase
*/
protected WorkQueues createWorkQueues(Environment env, CrawlConfig config, String databaseName) {
return new WorkQueues(env, databaseName, config.isResumableCrawling());
}
}
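Because WorkQueues creation is now behind createWorkQueues, a Frontier subclass can store its queues differently or hand them a custom tuple binding. A minimal sketch follows; CustomQueueFrontier is a hypothetical name, and CustomBindingWorkQueues refers to the hypothetical WorkQueues subclass sketched after the WorkQueues hunk below.

```java
import com.sleepycat.je.Environment;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.frontier.Frontier;
import edu.uci.ics.crawler4j.frontier.WorkQueues;

// Hypothetical Frontier subclass that swaps in a custom WorkQueues implementation.
public class CustomQueueFrontier extends Frontier {

    public CustomQueueFrontier(Environment env, CrawlConfig config) {
        super(env, config);
    }

    @Override
    protected WorkQueues createWorkQueues(Environment env, CrawlConfig config, String databaseName) {
        // Called from the Frontier constructor; returns the WorkQueues subclass
        // sketched below instead of the default.
        return new CustomBindingWorkQueues(env, databaseName, config.isResumableCrawling());
    }
}
```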
crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/WorkQueues.java
@@ -45,14 +45,18 @@ public class WorkQueues {
protected final Object mutex = new Object();

public WorkQueues(Environment env, String dbName, boolean resumable) {
this(env, dbName, resumable, new WebURLTupleBinding());
}

protected WorkQueues(Environment env, String dbName, boolean resumable, WebURLTupleBinding webURLBinding) {
this.env = env;
this.resumable = resumable;
DatabaseConfig dbConfig = new DatabaseConfig();
dbConfig.setAllowCreate(true);
dbConfig.setTransactional(resumable);
dbConfig.setDeferredWrite(!resumable);
urlsDB = env.openDatabase(null, dbName, dbConfig);
webURLBinding = new WebURLTupleBinding();
this.webURLBinding = webURLBinding;
}

protected Transaction beginTransaction() {
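Finally, the new protected WorkQueues constructor accepts the WebURLTupleBinding to use, which is what a WebURL subclass needs if its extra fields are to survive (de)serialization in the queue database. A minimal sketch follows, under the assumption that WebURLTupleBinding's entryToObject/objectToEntry methods can be overridden; both class names are hypothetical, and the overrides only mark where additional fields would be read and written.

```java
import com.sleepycat.bind.tuple.TupleInput;
import com.sleepycat.bind.tuple.TupleOutput;
import com.sleepycat.je.Environment;

import edu.uci.ics.crawler4j.frontier.WorkQueues;
import edu.uci.ics.crawler4j.url.WebURL;
import edu.uci.ics.crawler4j.url.WebURLTupleBinding;

// Hypothetical binding that persists extra fields of a WebURL subclass alongside
// everything the default WebURLTupleBinding already writes.
class ExtendedWebURLTupleBinding extends WebURLTupleBinding {

    @Override
    public WebURL entryToObject(TupleInput input) {
        WebURL url = super.entryToObject(input);
        // ... read the additional fields written by objectToEntry ...
        return url;
    }

    @Override
    public void objectToEntry(WebURL url, TupleOutput output) {
        super.objectToEntry(url, output);
        // ... write the additional fields of the WebURL subclass ...
    }
}

// Hypothetical WorkQueues subclass passing the binding through the new
// protected constructor.
public class CustomBindingWorkQueues extends WorkQueues {

    public CustomBindingWorkQueues(Environment env, String dbName, boolean resumable) {
        super(env, dbName, resumable, new ExtendedWebURLTupleBinding());
    }
}
```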