From 25c2d95e961ccb686f5286a1aa603d511ad93b55 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Fri, 5 Jul 2024 00:27:54 +0800 Subject: [PATCH 1/6] Update for next development version --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index b96c9a829..696839f2f 100644 --- a/pom.xml +++ b/pom.xml @@ -12,7 +12,7 @@ 2.2.1 us.codecraft - 1.0.0 + 1.0.1-SNAPSHOT pom UTF-8 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 6e1d3c896..4299d4b3b 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0 + 1.0.1-SNAPSHOT 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index 19cdc33d7..e179e2a37 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -10,7 +10,7 @@ us.codecraft webmagic - 1.0.0 + 1.0.1-SNAPSHOT webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 15f94cf5e..c76263a05 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0 + 1.0.1-SNAPSHOT 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 921161362..d52f78304 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0 + 1.0.1-SNAPSHOT 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 2530bd81d..b7682bf7d 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0 + 1.0.1-SNAPSHOT 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 3c03aaf8e..131ad5ef2 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0 + 1.0.1-SNAPSHOT 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index a0dc13861..f84c97997 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0 + 1.0.1-SNAPSHOT 4.0.0 From 2c135dadce1fcb102084ca222da4d9ade0e3b541 Mon Sep 17 00:00:00 2001 From: xiezcGitHub <765150816@qq.com> Date: Tue, 6 Aug 2024 19:29:41 +0800 Subject: [PATCH 2/6] =?UTF-8?q?#1172=20=E9=97=AE=E9=A2=98=E7=9A=84?= =?UTF-8?q?=E8=A7=A3=E5=86=B3=20(#1173)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: xiezhicheng --- webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index a35af70af..e47a61f22 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -187,7 +187,7 @@ public Spider scheduler(Scheduler scheduler) { */ public Spider setScheduler(Scheduler updateScheduler) { checkIfRunning(); - SpiderScheduler oldScheduler = this.scheduler; + Scheduler oldScheduler = scheduler.getScheduler(); scheduler.setScheduler(updateScheduler); if (oldScheduler != null) { Request request; From 15ec80fcf1b8327b7bc780409aeab03f198384b9 Mon Sep 17 00:00:00 2001 From: xiezcGitHub <765150816@qq.com> Date: Mon, 19 Aug 2024 13:05:28 +0800 Subject: [PATCH 3/6] =?UTF-8?q?FileCacheQueueScheduler=E4=BD=BF=E7=94=A8Bl?= =?UTF-8?q?oomFilter=E8=BF=9B=E8=A1=8C=E5=8E=BB=E9=87=8D=20(#1176)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: xiezc --- .../java/us/codecraft/webmagic/Spider.java | 1 - .../scheduler/FileCacheQueueScheduler.java | 103 +++++------------- 2 files changed, 29 insertions(+), 75 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index e47a61f22..a71166421 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -458,7 +458,6 @@ private void onDownloadSuccess(Request request, Page page) { logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode()); } sleep(site.getSleepTime()); - return; } private void onDownloaderFail(Request request) { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java index fec3c1db9..0dabdd954 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java @@ -1,29 +1,13 @@ package us.codecraft.webmagic.scheduler; -import java.io.BufferedReader; -import java.io.Closeable; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileReader; -import java.io.FileWriter; -import java.io.IOException; -import java.io.PrintWriter; -import java.util.LinkedHashSet; -import java.util.Set; -import java.util.concurrent.BlockingQueue; -import java.util.concurrent.Executors; -import java.util.concurrent.LinkedBlockingQueue; -import java.util.concurrent.ScheduledExecutorService; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicBoolean; -import java.util.concurrent.atomic.AtomicInteger; - -import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.math.NumberUtils; - import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.scheduler.component.DuplicateRemover; + +import java.io.*; +import java.util.concurrent.*; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; /** @@ -32,7 +16,7 @@ * @author code4crafter@gmail.com
* @since 0.2.0 */ -public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler,Closeable { +public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler, Closeable { private String filePath = System.getProperty("java.io.tmpdir"); @@ -52,8 +36,6 @@ public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implement private BlockingQueue queue; - private Set urls; - private ScheduledExecutorService flushThreadPool; public FileCacheQueueScheduler(String filePath) { @@ -83,36 +65,13 @@ private void init(Task task) { } private void initDuplicateRemover() { - setDuplicateRemover( - new DuplicateRemover() { - @Override - public boolean isDuplicate(Request request, Task task) { - if (!inited.get()) { - init(task); - } - return !urls.add(request.getUrl()); - } - - @Override - public void resetDuplicateCheck(Task task) { - urls.clear(); - } - - @Override - public int getTotalRequestsCount(Task task) { - return urls.size(); - } - }); + BloomFilterDuplicateRemover bloomFilterDuplicateRemover = new BloomFilterDuplicateRemover(this.filePath.hashCode()); + setDuplicateRemover(bloomFilterDuplicateRemover); } private void initFlushThread() { - flushThreadPool = Executors.newScheduledThreadPool(1); - flushThreadPool.scheduleAtFixedRate(new Runnable() { - @Override - public void run() { - flush(); - } - }, 10, 10, TimeUnit.SECONDS); + flushThreadPool = Executors.newScheduledThreadPool(1); + flushThreadPool.scheduleAtFixedRate(this::flush, 10, 10, TimeUnit.SECONDS); } private void initWriter() { @@ -127,7 +86,6 @@ private void initWriter() { private void readFile() { try { queue = new LinkedBlockingQueue(); - urls = new LinkedHashSet(); readCursorFile(); readUrlFile(); // initDuplicateRemover(); @@ -140,46 +98,43 @@ private void readFile() { } private void readUrlFile() throws IOException { - String line; - BufferedReader fileUrlReader = null; - try { - fileUrlReader = new BufferedReader(new FileReader(getFileName(fileUrlAllName))); + try (BufferedReader fileUrlReader = new BufferedReader(new FileReader(getFileName(fileUrlAllName)))) { + String line; int lineReaded = 0; while ((line = fileUrlReader.readLine()) != null) { - urls.add(line.trim()); + Request request = deserializeRequest(line); + this.getDuplicateRemover().isDuplicate(request, null); lineReaded++; if (lineReaded > cursor.get()) { - queue.add(deserializeRequest(line)); + queue.add(request); } } - } finally { - if (fileUrlReader != null) { - IOUtils.closeQuietly(fileUrlReader); - } } } private void readCursorFile() throws IOException { - BufferedReader fileCursorReader = null; - try { - fileCursorReader = new BufferedReader(new FileReader(getFileName(fileCursor))); + String fileName = getFileName(fileCursor); + try (BufferedReader fileCursorReader = new BufferedReader(new FileReader(fileName))) { String line; + String lastLine = null; //read the last number while ((line = fileCursorReader.readLine()) != null) { - cursor = new AtomicInteger(NumberUtils.toInt(line)); + line = line.trim(); + if (!line.isEmpty()) { + lastLine = line; + } } - } finally { - if (fileCursorReader != null) { - IOUtils.closeQuietly(fileCursorReader); + if (lastLine != null) { + cursor.set(NumberUtils.toInt(line)); } } } - + public void close() throws IOException { - flushThreadPool.shutdown(); - fileUrlWriter.close(); - fileCursorWriter.close(); - } + flushThreadPool.shutdown(); + fileUrlWriter.close(); + fileCursorWriter.close(); + } private String getFileName(String filename) { return filePath + task.getUUID() + filename; From 541ced9eeaa55d14d2f9496b741d08a9ea42cb9a Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Wed, 9 Oct 2024 23:36:02 +0800 Subject: [PATCH 4/6] Change the default status code from 200 to 0 & downloadSuccess from true to false, for Page. --- webmagic-core/src/main/java/us/codecraft/webmagic/Page.java | 5 ++--- .../codecraft/webmagic/downloader/MockGithubDownloader.java | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index b4c161a9a..e8c75ccf1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -4,7 +4,6 @@ import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Json; import us.codecraft.webmagic.selector.Selectable; -import us.codecraft.webmagic.utils.HttpConstant; import us.codecraft.webmagic.utils.UrlUtils; import java.util.ArrayList; @@ -43,9 +42,9 @@ public class Page { private Map> headers; - private int statusCode = HttpConstant.StatusCode.CODE_200; + private int statusCode; - private boolean downloadSuccess = true; + private boolean downloadSuccess; private byte[] bytes; diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java index 91e3698cf..bb18aa2c5 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java @@ -938,6 +938,7 @@ public Page download(Request request, Task task) { Page page = new Page(); page.setRawText(html); page.setStatusCode(200); + page.setDownloadSuccess(true); page.setRequest(new Request("https://github.com/code4craft/webmagic")); page.setUrl(new PlainText("https://github.com/code4craft/webmagic")); return page; From 50026ff937a5af26179ee4daab8ab93d541e38ab Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 9 Oct 2024 23:38:56 +0800 Subject: [PATCH 5/6] Bump commons-io:commons-io from 2.11.0 to 2.14.0 (#1179) Bumps commons-io:commons-io from 2.11.0 to 2.14.0. --- updated-dependencies: - dependency-name: commons-io:commons-io dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 696839f2f..8fd0dbf6b 100644 --- a/pom.xml +++ b/pom.xml @@ -22,7 +22,7 @@ 3.23.1 1.5.0 4.4 - 2.11.0 + 2.14.0 3.12.0 2.0.19.graal 3.0.13 From 6eab7a4155163f8b4a0dbbd2f69b8ce452bef500 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Sat, 26 Oct 2024 01:02:00 +0800 Subject: [PATCH 6/6] Update versions for release --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 8fd0dbf6b..d0abd3568 100644 --- a/pom.xml +++ b/pom.xml @@ -12,7 +12,7 @@ 2.2.1 us.codecraft - 1.0.1-SNAPSHOT + 1.0.1 pom UTF-8 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 4299d4b3b..52cd7ba2c 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.1-SNAPSHOT + 1.0.1 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index e179e2a37..98db3f826 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -10,7 +10,7 @@ us.codecraft webmagic - 1.0.1-SNAPSHOT + 1.0.1 webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index c76263a05..1fe18e066 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.1-SNAPSHOT + 1.0.1 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index d52f78304..76105d330 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.1-SNAPSHOT + 1.0.1 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index b7682bf7d..c206d21a2 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.1-SNAPSHOT + 1.0.1 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 131ad5ef2..123ac6699 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.1-SNAPSHOT + 1.0.1 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index f84c97997..d09deef50 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.1-SNAPSHOT + 1.0.1 4.0.0