Skip to content

Commit

Permalink
Fix no-page-number
Browse files Browse the repository at this point in the history
Signed-off-by: Ching Yi, Chan <[email protected]>
  • Loading branch information
qrtt1 committed Oct 10, 2024
1 parent a865f3f commit 1cf147d
Show file tree
Hide file tree
Showing 9 changed files with 72,846 additions and 17,565 deletions.
74,401 changes: 60,308 additions & 14,093 deletions data.json

Large diffs are not rendered by default.

32 changes: 29 additions & 3 deletions src/main/java/org/qty/crawler/Crawler.java
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,34 @@ private static int getMaxPageBySelector(Document document, String pageListSelect
return maxPage;
}

public int getMaxPageInTopic(Document document) {
return getMaxPageBySelector(document, ".pagination a");
/**
 * Determines how many pages of articles a topic has by probing its paginated URLs.
 *
 * <p>Candidate pages are requested from the highest one downwards, using
 * {@code url + "?page=" + page}. The first probed page that has posts and whose first
 * {@code ul > li.disabled} element is the "下一頁" (next page) control is reported as the
 * last page.
 *
 * @param url the topic URL; {@code "?page=N"} is appended to it as-is
 * @param document the already-parsed topic page; not consulted by this method
 * @return the detected last page number, or 1 when no later page could be confirmed
 */
public int getMaxPageInTopic(String url, Document document) {
    // In theory a topic has at most 30 articles, but some authors add an extra
    // wrap-up post, which pushes the topic past 3 pages, so page 4 is probed too.
    final int highestPageToProbe = 4;

    // Page 1 is never probed: every outcome of probing it would be "1" anyway,
    // so stopping at page 2 saves one fetch without changing the result.
    for (int page = highestPageToProbe; page >= 2; page--) {
        Document pageDocument = Jsoup.parse(fetch.get(url + "?page=" + page));
        Elements disabledItems = pageDocument.select("ul > li.disabled");

        if (disabledItems.isEmpty()) {
            // No disabled pagination item was found. This is not expected to happen,
            // so fall back to treating the topic as a single page.
            return 1;
        }

        if (pageDocument.toString().contains("還沒有任何文章哦")) {
            // This page has no posts yet; try the previous page.
            continue;
        }

        // The "next page" control being the first disabled item marks the last page.
        boolean nextPageDisabled = "下一頁".equals(disabledItems.first().select("span").text());
        if (nextPageDisabled) {
            return page;
        }
    }
    return 1;
}

public void update(Topic topic) {
Expand All @@ -99,7 +125,7 @@ private void executeUpdate(Topic topic) {
List<Article> articles = extractArticles(document);
topic.articles.addAll(articles);

int maxPageInTopic = getMaxPageInTopic(document);
int maxPageInTopic = getMaxPageInTopic(topic.getUrl(), document);
if (maxPageInTopic > 1) {
for (int i = 2; i <= maxPageInTopic; i++) {
PageSampler.save(document, String.format("topic_page_%d.html", i));
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/qty/crawler/DebugViewCrawler.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ public class DebugViewCrawler {
public static void main(String[] args) throws IOException {
Crawler crawler = new Crawler(new DefaultFetch());
List<Topic> savedTopics = loadPreviousTopics();
savedTopics.stream().filter(t -> t.url.equals("https://ithelp.ithome.com.tw/users/20161290/ironman/7070")).forEach(topic -> {
savedTopics.stream().filter(t -> t.url.equals("https://ithelp.ithome.com.tw/users/20152821/ironman/7072")).forEach(topic -> {
crawler.update(topic);
System.out.println(topic);
topic.getArticles().forEach(a->{
Expand Down
36 changes: 18 additions & 18 deletions src/test/java/org/qty/crawler/CrawlerTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class CrawlerTest {
@Test
public void testCrawler_parseMaxPage() {
Crawler crawler = new Crawler(null);
assertEquals(13, crawler.getMaxPage(Jsoup.parse(createFakeFetch().get(Crawler.CONTENT_LIST))));
assertEquals(106, crawler.getMaxPage(Jsoup.parse(createFakeFetch().get(Crawler.CONTENT_LIST))));
}

@Test
Expand All @@ -33,47 +33,47 @@ public int getMaxPage(Document document) {
List<Topic> topics = crawler.topics();

List<String> expectedTitles = Arrays.asList(
"Leetcode 解題之旅:逐日攻克",
"關於寫react 那二三事",
"運用生成式 AI 服務 所提供的API 實做應用開發(以Gemini及ChatGPT為例)",
"30天整頓職場",
"30 Days of AI Research",
"從 SwiftUI 到 Apple Vision Pro - SwiftUI 從零開始",
"asp.net可以變出那些功能",
"時空序列分析-關鍵籌碼分析",
"Flutter 開發實戰 - 30 天逃離新手村",
"Web仔常見的面試問題",
"Python入門基礎語法與應用",
"繁體中文的第一本CC書─Certified in Cybersecurity",
"使用 Spring AI 打造企業 RAG 知識庫",
"從 SwiftUI 到 Apple Vision Pro - SwiftUI 從零開始",
"用React Native打造找餐店APP");
"關於新手會想知道Kubernetes的幾件事情");


List<String> titles = topics.stream().map(Topic::getTitle).collect(Collectors.toList());
assertEquals(expectedTitles, titles);

List<String> expectedUrls = Arrays.asList(
"https://ithelp.ithome.com.tw/users/20162696/ironman/7080",
"https://ithelp.ithome.com.tw/users/20168266/ironman/7079",
"https://ithelp.ithome.com.tw/users/20046160/ironman/7100",
"https://ithelp.ithome.com.tw/users/20168339/ironman/7097",
"https://ithelp.ithome.com.tw/users/20152821/ironman/7074",
"https://ithelp.ithome.com.tw/users/20162607/ironman/7073",
"https://ithelp.ithome.com.tw/users/20119035/ironman/7064",
"https://ithelp.ithome.com.tw/users/20168322/ironman/7065",
"https://ithelp.ithome.com.tw/users/20059915/ironman/7066",
"https://ithelp.ithome.com.tw/users/20161704/ironman/7067",
"https://ithelp.ithome.com.tw/users/20168211/ironman/7068",
"https://ithelp.ithome.com.tw/users/20021644/ironman/7069",
"https://ithelp.ithome.com.tw/users/20161290/ironman/7070",
"https://ithelp.ithome.com.tw/users/20162607/ironman/7073",
"https://ithelp.ithome.com.tw/users/20132295/ironman/7083"
"https://ithelp.ithome.com.tw/users/20152821/ironman/7072"
);
List<String> urls = topics.stream().map(Topic::getUrl).collect(Collectors.toList());
assertEquals(expectedUrls, urls);

List<String> expectedCategories = Arrays.asList(
"AI/ ML & Data",
"Mobile Development",
"自我挑戰組",
"Modern Web",
"生成式 AI",
"佛心分享-IT 人的工作軟技能",
"Python",
"Mobile Development",
"JavaScript",
"Python",
"Security",
"生成式 AI",
"Mobile Development",
"佛心分享-SideProject30"
"Kubernetes"
);
List<String> categories = topics.stream().map(Topic::getCategory).collect(Collectors.toList());
assertEquals(expectedCategories, categories);
Expand Down
6 changes: 3 additions & 3 deletions src/test/java/org/qty/crawler/CrawlerTopicUpdaterTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,11 @@ public void testCrawler_viewCount() {
crawler.update(topic);

// NOTE(review): this comment said the pre-calculated view count was 421, which does not match the asserted value — confirm and update
assertEquals(279, topic.getView());
assertEquals(1253, topic.getView());

// body > div.container.index-top > div > div > div:nth-child(1) > div.profile-header.clearfix > div.profile-header__content > div.profile-header__name
assertEquals("ellelee (ellelee)", topic.getAuthor());
assertEquals("https://ithelp.ithome.com.tw/users/20168266/profile", topic.getProfileUrl());
assertEquals("香草 (mudream)", topic.getAuthor());
assertEquals("https://ithelp.ithome.com.tw/users/20151240/profile", topic.getProfileUrl());

}

Expand Down
Loading

0 comments on commit 1cf147d

Please sign in to comment.