Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extend the four components to every request #808

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
package us.codecraft.webmagic;

import java.util.ArrayList;
import java.util.List;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.model.HttpRequestBody;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.Scheduler;
import us.codecraft.webmagic.utils.Experimental;

import java.io.Serializable;
Expand Down Expand Up @@ -53,6 +59,14 @@ public class Request implements Serializable {

private String charset;

// Optional per-request downloader. Spider.processRequest falls back to the
// spider-level downloader when this is null.
// NOTE(review): Request implements Serializable; confirm that Downloader,
// PageProcessor, Scheduler and Pipeline implementations are serializable,
// or consider whether these fields should be excluded from serialization.
private Downloader downloader;

// Optional per-request page processor. Spider.onDownloadSuccess falls back to
// the spider-level page processor when this is null.
private PageProcessor pageProcessor;

// Optional per-request scheduler. Spider.addRequest falls back to the
// spider-level scheduler when this is null.
private Scheduler scheduler;

// Per-request pipelines; starts empty. When it is empty at processing time,
// Spider.onDownloadSuccess adds the spider-level pipelines into this same list.
private List<Pipeline> pipelines = new ArrayList<Pipeline>();

public Request() {
}

Expand Down Expand Up @@ -188,6 +202,38 @@ public Request setCharset(String charset) {
return this;
}

/**
 * Returns the downloader assigned to this request.
 *
 * @return the per-request downloader, or {@code null} if none has been set
 */
public Downloader getDownloader() {
    return this.downloader;
}

/**
 * Assigns a downloader to be used for this request only.
 *
 * <p>Returns this request so the call can be chained, matching the other
 * setters of this class (for example {@code setCharset}).
 *
 * @param downloader the per-request downloader; {@code null} clears the override
 * @return this request
 */
public Request setDownloader(Downloader downloader) {
    this.downloader = downloader;
    return this;
}

/**
 * Returns the page processor assigned to this request.
 *
 * @return the per-request page processor, or {@code null} if none has been set
 */
public PageProcessor getPageProcessor() {
    return this.pageProcessor;
}

/**
 * Assigns a page processor to be used for this request only.
 *
 * <p>Returns this request so the call can be chained, matching the other
 * setters of this class (for example {@code setCharset}).
 *
 * @param pageProcessor the per-request page processor; {@code null} clears the override
 * @return this request
 */
public Request setPageProcessor(PageProcessor pageProcessor) {
    this.pageProcessor = pageProcessor;
    return this;
}

/**
 * Returns the scheduler assigned to this request.
 *
 * @return the per-request scheduler, or {@code null} if none has been set
 */
public Scheduler getScheduler() {
    return this.scheduler;
}

/**
 * Assigns a scheduler to be used for this request only.
 *
 * <p>Returns this request so the call can be chained, matching the other
 * setters of this class (for example {@code setCharset}).
 *
 * @param scheduler the per-request scheduler; {@code null} clears the override
 * @return this request
 */
public Request setScheduler(Scheduler scheduler) {
    this.scheduler = scheduler;
    return this;
}

/**
 * Returns the pipelines registered on this request.
 *
 * <p>The returned list is the internal list itself, not a copy, so changes
 * made by the caller are visible to this request.
 *
 * @return the live list of per-request pipelines; empty if none were added
 */
public List<Pipeline> getPipelines() {
    return this.pipelines;
}

/**
 * Appends a single pipeline to the pipelines used for this request.
 *
 * <p>Returns this request so the call can be chained, matching the other
 * mutators of this class (for example {@code setCharset}).
 *
 * @param pipeline the pipeline to append
 * @return this request
 */
public Request addPipelines(Pipeline pipeline) {
    this.pipelines.add(pipeline);
    return this;
}

@Override
public String toString() {
return "Request{" +
Expand Down
24 changes: 20 additions & 4 deletions webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
Original file line number Diff line number Diff line change
Expand Up @@ -401,7 +401,11 @@ public void test(String... urls) {
}

private void processRequest(Request request) {
Page page = downloader.download(request, this);
Downloader dl = request.getDownloader();
if(null == dl){
dl = downloader;
}
Page page = dl.download(request, this);
if (page.isDownloadSuccess()){
onDownloadSuccess(request, page);
} else {
Expand All @@ -411,10 +415,18 @@ private void processRequest(Request request) {

private void onDownloadSuccess(Request request, Page page) {
if (site.getAcceptStatCode().contains(page.getStatusCode())){
pageProcessor.process(page);
PageProcessor pp = request.getPageProcessor();
if(null == pp){
pp = pageProcessor;
}
pp.process(page);
extractAndAddRequests(page, spawnUrl);
if (!page.getResultItems().isSkip()) {
for (Pipeline pipeline : pipelines) {
List<Pipeline> ps = request.getPipelines();
if(ps.isEmpty()){
ps.addAll(pipelines);
}
for (Pipeline pipeline : ps) {
pipeline.process(page.getResultItems(), this);
}
}
Expand Down Expand Up @@ -468,7 +480,11 @@ private void addRequest(Request request) {
if (site.getDomain() == null && request != null && request.getUrl() != null) {
site.setDomain(UrlUtils.getDomain(request.getUrl()));
}
scheduler.push(request, this);
Scheduler sc = request.getScheduler();
if(null == sc){
sc = scheduler;
}
sc.push(request, this);
}

protected void checkIfRunning() {
Expand Down