From 8c008563ff1d89b3f327afa12deaf2f8abbe7202 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Sun, 10 Sep 2023 17:06:55 +0800 Subject: [PATCH 1/9] Update for next development version --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 479959a7d..6fea73494 100644 --- a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ us.codecraft - 0.9.1 + 0.9.2-SNAPSHOT 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 983d309b1..9838e1f5f 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.9.1 + 0.9.2-SNAPSHOT 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index 21fa00128..2c5732c6a 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic-parent - 0.9.1 + 0.9.2-SNAPSHOT webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 008d00443..94178bf8f 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.9.1 + 0.9.2-SNAPSHOT 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 783a5e9ea..57b9cbac0 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.9.1 + 0.9.2-SNAPSHOT 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 6982bc22e..138c050cb 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.9.1 + 0.9.2-SNAPSHOT 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 30984e39d..14e495504 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.9.1 + 0.9.2-SNAPSHOT 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 489bbbc95..c37cbe3de 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.9.1 + 0.9.2-SNAPSHOT 4.0.0 From c0d38a6f1a8406bd0723838a8742db118e4f6463 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Sat, 23 Sep 2023 12:04:56 +0800 Subject: [PATCH 2/9] Upgrade maven-fluido-skin from 1.9 to 1.11.1. --- src/site/site.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/site/site.xml b/src/site/site.xml index d2d5caacd..b78651960 100644 --- a/src/site/site.xml +++ b/src/site/site.xml @@ -5,7 +5,7 @@ org.apache.maven.skins maven-fluido-skin - 1.9 + 1.11.1 From 73f60f809e30d56dec130811407814595a09a103 Mon Sep 17 00:00:00 2001 From: Maciej Walkowiak Date: Tue, 24 Oct 2023 01:50:14 +0200 Subject: [PATCH 3/9] Fix typos (#1131) --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 750a76841..89536c927 100644 --- a/README.md +++ b/README.md @@ -118,9 +118,9 @@ The architecture of webmagic (refered to [Scrapy](http://scrapy.org/)) There are more examples in `webmagic-samples` package. -### Lisence: +### License: -Lisenced under [Apache 2.0 lisence](http://opensource.org/licenses/Apache-2.0) +Licensed under [Apache 2.0 license](http://opensource.org/licenses/Apache-2.0) ### Thanks: From eda3be9432663951f42a96bf790987bb7dd6c530 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Mon, 6 Nov 2023 22:44:53 +0800 Subject: [PATCH 4/9] Fix log format. --- .../codecraft/webmagic/downloader/HttpClientDownloader.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 2f3ef58ed..92c770236 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -85,13 +85,13 @@ public Page download(Request request, Task task) { page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task); onSuccess(request, task); - logger.info("downloading page success {}", request.getUrl()); + logger.info("download page success {}", request.getUrl()); return page; } catch (IOException e) { onError(request, task, e); - logger.info("download page {} error", request.getUrl(), e); + logger.info("download page error {}", request.getUrl(), e); return page; } finally { From 67644de3d9540611ef494f4bb595688a47a541a6 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Mon, 20 Nov 2023 18:26:45 +0800 Subject: [PATCH 5/9] Expose Page to onSuccess & onError. --- .../main/java/us/codecraft/webmagic/Page.java | 21 ++++++++++- .../downloader/AbstractDownloader.java | 36 +++++++++++++++++++ .../downloader/HttpClientDownloader.java | 10 +++--- .../downloader/PhantomJSDownloader.java | 6 ++-- .../selenium/SeleniumDownloader.java | 6 ++-- 5 files changed, 67 insertions(+), 12 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index 6370171df..e48d4cb00 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -56,8 +56,27 @@ public class Page { public Page() { } - public static Page fail(){ + /** + * Returns a {@link Page} with {@link #downloadSuccess} is {@code false}. + * + * @return the page. + * @deprecated Use {@link #fail(Request)} instead. + */ + @Deprecated + public static Page fail() { + return fail(null); + } + + /** + * Returns a {@link Page} with {@link #downloadSuccess} is {@code false}, + * and {@link #request} is specified. + * + * @return the page. + * @since 0.10.0 + */ + public static Page fail(Request request){ Page page = new Page(); + page.setRequest(request); page.setDownloadSuccess(false); return page; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java index ea3bbc590..6a400e321 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java @@ -36,26 +36,62 @@ public Html download(String url, String charset) { return (Html) page.getHtml(); } + /** + * @param request the {@link Request}. + * @deprecated Use {@link #onSuccess(Page, Task)} instead. + */ @Deprecated protected void onSuccess(Request request) { } /** + * @param request the {@link Request}. + * @param task the {@link Task}. * @since 0.7.6 + * @deprecated Use {@link #onSuccess(Page, Task)} instead. */ + @Deprecated protected void onSuccess(Request request, Task task) { this.onSuccess(request); } + /** + * @param page the {@link Page}. + * @param task the {@link Task}. + * @since 0.10.0 + */ + protected void onSuccess(Page page, Task task) { + this.onSuccess(page.getRequest(), task); + } + + /** + * @param request the {@link Request}. + * @deprecated Use {@link #onError(Page, Task, Throwable)} instead. + */ @Deprecated protected void onError(Request request) { } /** + * @param request the {@link Request}. + * @param task the {@link Task}. + * @param e the exception. * @since 0.7.6 + * @deprecated Use {@link #onError(Page, Task, Throwable)} instead. */ + @Deprecated protected void onError(Request request, Task task, Throwable e) { this.onError(request); } + /** + * @param page the {@link Page}. + * @param task the {@link Task}. + * @param e the exception. + * @since 0.10.0 + */ + protected void onError(Page page, Task task, Throwable e) { + this.onError(page.getRequest(), task, e); + } + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 92c770236..80e7b72c9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -79,19 +79,19 @@ public Page download(Request request, Task task) { CloseableHttpClient httpClient = getHttpClient(task.getSite()); Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(request, task) : null; HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy); - Page page = Page.fail(); + Page page = Page.fail(request); try { httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext()); page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task); - onSuccess(request, task); - logger.info("download page success {}", request.getUrl()); + onSuccess(page, task); + logger.info("downloading page success {}", request.getUrl()); return page; } catch (IOException e) { - onError(request, task, e); - logger.info("download page error {}", request.getUrl(), e); + onError(page, task, e); + logger.info("download page {} error", request.getUrl(), e); return page; } finally { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java index 4f1eee8e6..31dfca75a 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java @@ -88,7 +88,7 @@ public Page download(Request request, Task task) { logger.info("downloading page: " + request.getUrl()); } - Page page = Page.fail(); + Page page = Page.fail(request); try { String content = getPage(request); if (!content.contains("HTTP request failed")) { @@ -98,9 +98,9 @@ public Page download(Request request, Task task) { page.setRequest(request); page.setStatusCode(200); } - onSuccess(request, task); + onSuccess(page, task); } catch (Exception e) { - onError(request, task, e); + onError(page, task, e); logger.warn("download page {} error", request.getUrl(), e); } return page; diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java index 39b3bc914..874f8aef7 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java @@ -74,7 +74,7 @@ public SeleniumDownloader setSleepTime(int sleepTime) { public Page download(Request request, Task task) { checkInit(); WebDriver webDriver = null; - Page page = Page.fail(); + Page page = Page.fail(request); try { webDriver = webDriverPool.get(); @@ -111,10 +111,10 @@ public Page download(Request request, Task task) { page.setHtml(new Html(content, request.getUrl())); page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); - onSuccess(request, task); + onSuccess(page, task); } catch (Exception e) { logger.warn("download page {} error", request.getUrl(), e); - onError(request, task, e); + onError(page, task, e); } finally { if (webDriver != null) { webDriverPool.returnToPool(webDriver); From 622ed5a17f98ee1625222452096741ebe16dfe85 Mon Sep 17 00:00:00 2001 From: Harikrishna Date: Fri, 24 Nov 2023 10:07:04 +0530 Subject: [PATCH 6/9] Refactor compareLong method using Long.compare, corrected the local variable name (#1136) --- .../java/us/codecraft/webmagic/utils/NumberUtils.java | 8 +------- .../java/us/codecraft/webmagic/utils/WMCollections.java | 6 +++--- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java index 55e185105..fbeb8ed3b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java @@ -6,12 +6,6 @@ public abstract class NumberUtils { public static int compareLong(long o1, long o2) { - if (o1 < o2) { - return -1; - } else if (o1 == o2) { - return 0; - } else { - return 1; - } + return Long.compare(o1, o2); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/WMCollections.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/WMCollections.java index 23e1644ce..a2ca5afd0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/WMCollections.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/WMCollections.java @@ -21,10 +21,10 @@ public static Set newHashSet(T... t){ } public static List newArrayList(T... t){ - List set = new ArrayList(t.length); + List list = new ArrayList(t.length); for (T t1 : t) { - set.add(t1); + list.add(t1); } - return set; + return list; } } From a9111040763f1c078e67e5d4d2434fce9992ed5a Mon Sep 17 00:00:00 2001 From: Harikrishna Date: Fri, 24 Nov 2023 17:39:32 +0530 Subject: [PATCH 7/9] Refactored to remove multiple calls of getSourceTexts() api (#1137) --- .../webmagic/selector/AbstractSelectable.java | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java index 8775af108..1fb35f1a8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java @@ -3,6 +3,7 @@ import java.util.ArrayList; import java.util.List; + import org.apache.commons.collections4.CollectionUtils; /** @@ -55,11 +56,12 @@ public Selectable jsonPath(String jsonPath) { @Override public String get() { - if (CollectionUtils.isNotEmpty(all())) { - return all().get(0); - } else { - return null; - } + List sourceTexts = all(); + if (CollectionUtils.isNotEmpty(sourceTexts)) { + return sourceTexts.get(0); + } + return null; + } @Override @@ -91,8 +93,9 @@ public Selectable replace(String regex, String replacement) { } public String getFirstSourceText() { - if (getSourceTexts() != null && getSourceTexts().size() > 0) { - return getSourceTexts().get(0); + List sourceTexts = getSourceTexts(); + if (CollectionUtils.isNotEmpty(sourceTexts)) { + return sourceTexts.get(0); } return null; } @@ -104,6 +107,6 @@ public String toString() { @Override public boolean match() { - return getSourceTexts() != null && getSourceTexts().size() > 0; + return CollectionUtils.isNotEmpty(getSourceTexts()); } } From 7c20290ce4be0c642e9bd02edb82d235e39b761c Mon Sep 17 00:00:00 2001 From: Harikrishna Date: Sun, 26 Nov 2023 08:26:06 +0530 Subject: [PATCH 8/9] Refactor addTargetRequests method to eliminate redundant code. (#1138) --- .../main/java/us/codecraft/webmagic/Page.java | 41 ++++++++++++------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index e48d4cb00..17f8b03dd 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -49,7 +49,7 @@ public class Page { private byte[] bytes; - private List targetRequests = new ArrayList(); + private List targetRequests = new ArrayList<>(); private String charset; @@ -142,13 +142,7 @@ public List getTargetRequests() { * @param requests requests */ public void addTargetRequests(Iterable requests) { - for (String s : requests) { - if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { - continue; - } - s = UrlUtils.canonicalizeUrl(s, url.toString()); - targetRequests.add(new Request(s)); - } + addTargetRequests(requests, 0); // Default priority is 0 } /** @@ -158,13 +152,32 @@ public void addTargetRequests(Iterable requests) { * @param priority priority */ public void addTargetRequests(Iterable requests, long priority) { - for (String s : requests) { - if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { - continue; - } - s = UrlUtils.canonicalizeUrl(s, url.toString()); - targetRequests.add(new Request(s).setPriority(priority)); + if(requests == null) { + return; + } + + for (String req : requests) { + addRequestIfValid(req, priority); + } + } + + /** + * Helper method to add a request if it's valid. + * + * @param url URL to add + * @param priority Priority for the URL + */ + private void addRequestIfValid(String url, long priority) { + if (StringUtils.isBlank(url) || url.equals("#") || url.startsWith("javascript:")) { + return; + } + + String canonicalizedUrl = UrlUtils.canonicalizeUrl(url, this.url.toString()); + Request req = new Request(canonicalizedUrl); + if(priority > 0) { + req.setPriority(priority); } + targetRequests.add(req); } /** From 73dd2ebbac6f7712c59155027906f7441d693935 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Tue, 5 Dec 2023 12:28:05 +0800 Subject: [PATCH 9/9] Update versions for release --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 6fea73494..700d5c426 100644 --- a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ us.codecraft - 0.9.2-SNAPSHOT + 0.10.0 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 9838e1f5f..021a83f3e 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.9.2-SNAPSHOT + 0.10.0 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index 2c5732c6a..4109c49fc 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic-parent - 0.9.2-SNAPSHOT + 0.10.0 webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 94178bf8f..b47ae3614 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.9.2-SNAPSHOT + 0.10.0 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 57b9cbac0..08e70c161 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.9.2-SNAPSHOT + 0.10.0 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 138c050cb..4a2b358d0 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.9.2-SNAPSHOT + 0.10.0 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 14e495504..92914655a 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.9.2-SNAPSHOT + 0.10.0 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index c37cbe3de..5c2e50b2a 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.9.2-SNAPSHOT + 0.10.0 4.0.0