Skip to content

Commit

Permalink
article parser
Browse files (browse the repository at this point in the history)
  • Loading branch information
caodongping committed Apr 15, 2016
1 parent cb3fccd commit acf4fbf
Show file tree
Hide file tree
Showing 6 changed files with 181 additions and 98 deletions.
104 changes: 6 additions & 98 deletions app/src/main/java/com/github/mzule/androidweekly/api/ArticleApi.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import android.os.Handler;
import android.text.TextUtils;

import com.github.mzule.androidweekly.api.parser.ArticleParsers;
import com.github.mzule.androidweekly.dao.ArticleDao;
import com.github.mzule.androidweekly.dao.IssueListKeeper;
import com.github.mzule.androidweekly.entity.Article;
Expand Down Expand Up @@ -100,106 +101,13 @@ private Response<List<Issue>> doGetArchive() throws Exception {
}

// NOTE(review): this listing is a commit diff rendered without +/- markers. The body below
// (up to the closing brace after the return statement) appears to be the pre-commit
// implementation that the commit removes - confirm against the repository history.
/**
 * Downloads one Android Weekly page and flattens it into a list that mixes section
 * titles (String) with Article objects, in page order.
 *
 * @param issue path appended verbatim to "http://androidweekly.net"; when null the site
 *              root is fetched instead
 * @return a Response built from the parsed list and the flag value false
 * @throws Exception any failure raised while downloading or walking the document
 */
private Response<List<Object>> doGetPage(String issue) throws Exception {
String url = "http://androidweekly.net";
if (issue != null) {
url += issue;
}
final List<Object> articles = new ArrayList<>();
// 30000 is the timeout argument handed to jsoup (milliseconds per the jsoup API).
Document doc = Jsoup.parse(new URL(url), 30000);
// The site root and issues numbered 103 or higher go through the table-based parse();
// everything else is handled inline by walking the element with class "issue".
if (issue == null || isBiggerThan100(issue)) {
parse(doc, articles, issue);
} else {
Element root = doc.getElementsByClass("issue").get(0);
// Step down through wrappers that have exactly one child until an element with
// several (or zero) children is reached; its children are iterated below.
while (root.children().size() == 1) {
root = root.child(0);
}
String currentSection = null;
for (Element e : root.children()) {
// An h2 starts a new section; its text is added to the list as a plain String.
if (e.tagName().equals("h2")) {
currentSection = e.text();
articles.add(currentSection);
continue;
}
if (e.tagName().equals("div")) {
// A div only yields an article when it contains an image; divs without one are ignored.
Elements img = e.getElementsByTag("img");
if (!img.isEmpty()) {
Article article = new Article();
article.setImageUrl(img.get(0).attr("src"));
// Title and link come from the second anchor in the div
// (presumably the first anchor wraps the image - TODO confirm).
article.setTitle(e.getElementsByTag("a").get(1).text());
article.setLink(e.getElementsByTag("a").get(1).attr("href"));
article.setBrief(e.getElementsByTag("p").get(0).text());
Elements span = e.getElementsByTag("span");
if (!span.isEmpty()) {
// The domain is the first span's text with the parentheses removed.
article.setDomain(span.get(0).text().replace("(", "").replace(")", ""));
}
article.setIssue(issue);
article.setSection(currentSection);
articles.add(article);
// Persisting is disabled on this path: the call below is commented out.
//articleDao.save(article);
}
} else {
// Any other element counts as an article as soon as it contains an anchor.
Article article = new Article();
Elements title = e.getElementsByTag("a");
if (title.isEmpty()) {
continue;
}
article.setTitle(title.get(0).text());
Elements span = e.getElementsByTag("span");
if (!span.isEmpty()) {
article.setDomain(span.get(0).text().replace("(", "").replace(")", ""));
}
article.setLink(e.getElementsByTag("a").get(0).attr("href"));
// The brief is the full text of the element, which includes the anchor text.
article.setBrief(e.text());
article.setIssue(issue);
article.setSection(currentSection);
articles.add(article);
// Persisting is disabled on this path: the call below is commented out.
//articleDao.save(article);
}
}
}
return new Response<>(articles, false);
}

/**
 * Tells whether the issue path refers to issue number 103 or later.
 * Note that, despite the method name, the threshold compared against is 103, not 100.
 *
 * @param issue issue path; the number is read from its second '-'-separated segment
 * @return true when that number is at least 103
 */
private boolean isBiggerThan100(String issue) {
// Indexing [1] fails when the path has no '-', and parseInt fails when the segment is not numeric.
String s = issue.split("-")[1];
return Integer.parseInt(s) >= 103;
}

/**
 * Table-based page parser: appends section titles (String) and Article objects found in
 * the document's table elements to the given list, and saves each article via articleDao.
 *
 * @param doc      downloaded page
 * @param articles list that receives the section titles and articles, in page order
 * @param issue    issue path, or null to derive it from the page's "issue-header" element
 */
private void parse(Document doc, List<Object> articles, String issue) {
Elements tables = doc.getElementsByTag("table");
String currentSection = null;
for (Element e : tables) {
// A table containing an h2 is treated as a section heading, never as an article.
Elements h2 = e.getElementsByTag("h2");
if (!h2.isEmpty()) {
currentSection = h2.get(0).text();
articles.add(currentSection);
} else {
Elements tds = e.getElementsByTag("td");
// The second-to-last cell is the one read for headline, brief and domain.
Element td = tds.get(tds.size() - 2);
String imageUrl = null;
// Only tables with exactly four cells are read for an image, taken from the first cell.
if (tds.size() == 4) {
imageUrl = tds.get(0).getElementsByTag("img").get(0).attr("src");
}
String title = td.getElementsByClass("article-headline").get(0).text();
String brief = td.getElementsByTag("p").get(0).text();
String link = td.getElementsByClass("article-headline").get(0).attr("href");
String domain = td.getElementsByTag("span").get(0).text().replace("(", "").replace(")", "");
// Without an explicit issue, build the path from the first span of the "issue-header"
// element: its text with '#' removed is appended to "/issues/issue-".
if (issue == null) {
String number = doc.getElementsByClass("issue-header").get(0).getElementsByTag("span").get(0).text();
issue = "/issues/issue-" + number.replace("#", "");
}
Article article = new Article();
article.setTitle(title);
article.setBrief(brief);
article.setLink(link);
article.setDomain(domain);
article.setIssue(issue);
article.setImageUrl(imageUrl);
article.setSection(currentSection);
articles.add(article);
articleDao.save(article);
// NOTE(review): this listing is a diff view without +/- markers. The statements from here to
// the end appear to be the post-commit body of doGetPage(String issue) that the commit adds,
// sharing the closing braces with the removed parse() above - confirm against the repository.
// They delegate parsing to the parser chosen by ArticleParsers, save every Article in the
// result through articleDao, and wrap the result in a Response with the flag value false.
List<Object> result = ArticleParsers.get(issue).parse(issue);
for (Object obj : result) {
if (obj instanceof Article) {
articleDao.save((Article) obj);
}
}
return new Response<>(result, false);
}

private <T> void postSuccess(final Response<T> result, final ApiCallback<T> callback) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
package com.github.mzule.androidweekly.api.parser;

import java.io.IOException;
import java.util.List;

/**
 * Contract for turning one Android Weekly issue page into a flat list of items.
 * Created by CaoDongping on 4/15/16.
 */
public interface ArticleParser {
/**
 * Parses the page for the given issue.
 *
 * @param issue issue path such as "/issues/issue-103"; the callers and implementations
 *              shown with this interface also pass and accept null
 * @return the parsed items in page order; the implementations shown with this interface
 *         put section titles in as String and articles in as Article
 * @throws IOException declared for failures while obtaining the page
 */
List<Object> parse(String issue) throws IOException;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
package com.github.mzule.androidweekly.api.parser;

import android.support.annotation.WorkerThread;

/**
 * Factory that picks the {@link ArticleParser} matching an issue's page layout.
 * Created by CaoDongping on 4/15/16.
 */
public class ArticleParsers {

    /** Highest issue number that is still handled by {@link OlderArticlesParser}. */
    private static final int LAST_OLDER_ISSUE = 102;

    /**
     * Chooses a parser for the given issue path.
     *
     * @param issue issue path such as "/issues/issue-103", or null
     * @return an {@link OlderArticlesParser} for issue numbers up to 102, otherwise
     *         (including a null issue) a {@link FresherArticlesParser}
     * @throws ArrayIndexOutOfBoundsException if a non-null issue contains no '-'
     * @throws NumberFormatException          if the segment after the first '-' is not an integer
     */
    @WorkerThread
    public static ArticleParser get(String issue) {
        if (issue != null && issueNumberOf(issue) <= LAST_OLDER_ISSUE) {
            return new OlderArticlesParser();
        }
        return new FresherArticlesParser();
    }

    /** Reads the issue number from the second '-'-separated segment of the path. */
    private static int issueNumberOf(String issue) {
        String[] segments = issue.split("-");
        return Integer.parseInt(segments[1]);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
package com.github.mzule.androidweekly.api.parser;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.IOException;
import java.net.URL;

/**
 * Downloads Android Weekly pages and hands them back as jsoup documents.
 * Created by CaoDongping on 4/15/16.
 */
public class DocumentProvider {

    /** Site root without a trailing slash; the separator is added when the URL is built. */
    private static final String BASE_URL = "http://androidweekly.net";

    /** Timeout passed to jsoup, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 30000;

    /**
     * Fetches and parses the page for the given issue.
     *
     * @param issue issue path such as "/issues/issue-103" (a leading slash is optional),
     *              or null / empty for the site root
     * @return the parsed document
     * @throws IOException if the page cannot be downloaded or read
     */
    public static Document get(String issue) throws IOException {
        return Jsoup.parse(new URL(urlFor(issue)), TIMEOUT_MILLIS);
    }

    /**
     * Builds the page URL with exactly one slash between host and path. Issue paths in this
     * code base start with "/", so blindly appending them to a base that already ends with
     * "/" would request "http://androidweekly.net//issues/...".
     */
    private static String urlFor(String issue) {
        if (issue == null || issue.isEmpty()) {
            return BASE_URL + "/";
        }
        if (issue.startsWith("/")) {
            return BASE_URL + issue;
        }
        return BASE_URL + "/" + issue;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
package com.github.mzule.androidweekly.api.parser;

import com.github.mzule.androidweekly.entity.Article;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Parser for the table-based page layout (selected by {@link ArticleParsers} for a null
 * issue and for issue numbers above 102).
 * Created by CaoDongping on 4/15/16.
 */
public class FresherArticlesParser implements ArticleParser {

    /**
     * Downloads the issue page and converts each table into either a section title
     * (tables containing an h2) or an {@link Article}.
     *
     * <p>Tables that cannot be an article (fewer than two cells, or no element with class
     * "article-headline" in the second-to-last cell) are skipped instead of aborting the
     * whole page. Image and domain are optional and stay null when absent; a missing
     * paragraph yields an empty brief.
     *
     * @param issue issue path such as "/issues/issue-103", or null to fetch the site root
     *              and derive the path from the page's "issue-header" element
     * @return section titles (String) and articles (Article) in page order
     * @throws IOException if the page cannot be downloaded
     */
    @Override
    public List<Object> parse(String issue) throws IOException {
        Document doc = DocumentProvider.get(issue);
        List<Object> articles = new ArrayList<>();
        String resolvedIssue = issue;
        String currentSection = null;
        for (Element table : doc.getElementsByTag("table")) {
            Elements h2 = table.getElementsByTag("h2");
            if (!h2.isEmpty()) {
                currentSection = h2.get(0).text();
                articles.add(currentSection);
                continue;
            }

            Elements tds = table.getElementsByTag("td");
            if (tds.size() < 2) {
                // Not enough cells to hold an article; skip instead of indexing at -1.
                continue;
            }
            // The second-to-last cell carries headline, brief and domain.
            Element td = tds.get(tds.size() - 2);
            Elements headline = td.getElementsByClass("article-headline");
            if (headline.isEmpty()) {
                continue;
            }

            // Resolve the issue lazily, only once an article actually needs it.
            if (resolvedIssue == null) {
                resolvedIssue = readIssueFromHeader(doc);
            }

            Article article = new Article();
            article.setTitle(headline.get(0).text());
            article.setLink(headline.get(0).attr("href"));
            article.setBrief(firstText(td.getElementsByTag("p"), ""));
            Elements span = td.getElementsByTag("span");
            if (!span.isEmpty()) {
                // Domain is rendered as "(example.com)"; strip the parentheses.
                article.setDomain(span.get(0).text().replace("(", "").replace(")", ""));
            }
            article.setIssue(resolvedIssue);
            article.setImageUrl(imageUrlOf(tds));
            article.setSection(currentSection);
            articles.add(article);
        }
        return articles;
    }

    /** Builds "/issues/issue-N" from the first span inside the "issue-header" element. */
    private static String readIssueFromHeader(Document doc) {
        String number = doc.getElementsByClass("issue-header").get(0)
                .getElementsByTag("span").get(0).text();
        return "/issues/issue-" + number.replace("#", "");
    }

    /** Returns the image source from the first cell of a four-cell table, or null. */
    private static String imageUrlOf(Elements tds) {
        if (tds.size() != 4) {
            return null;
        }
        Elements img = tds.get(0).getElementsByTag("img");
        return img.isEmpty() ? null : img.get(0).attr("src");
    }

    /** Returns the text of the first element, or the fallback when there is none. */
    private static String firstText(Elements elements, String fallback) {
        return elements.isEmpty() ? fallback : elements.get(0).text();
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
package com.github.mzule.androidweekly.api.parser;

import com.github.mzule.androidweekly.entity.Article;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Parser for the older page layout, in which the element with class "issue" holds h2
 * section headings followed by article elements.
 * Created by CaoDongping on 4/15/16.
 */
public class OlderArticlesParser implements ArticleParser {

    /**
     * Downloads the issue page and walks the children of its content root.
     *
     * @param issue issue path, stored unchanged on every article produced
     * @return section titles (String) and articles (Article) in page order
     * @throws IOException if the page cannot be downloaded
     */
    @Override
    public List<Object> parse(String issue) throws IOException {
        Document page = DocumentProvider.get(issue);
        List<Object> items = new ArrayList<>();
        String section = null;
        for (Element child : findContentRoot(page).children()) {
            String tag = child.tagName();
            if (tag.equals("h2")) {
                // A heading opens a new section and is itself part of the result.
                section = child.text();
                items.add(section);
            } else if (tag.equals("div")) {
                // Divs only count as articles when they carry an image.
                Elements images = child.getElementsByTag("img");
                if (!images.isEmpty()) {
                    items.add(illustratedArticle(child, images, issue, section));
                }
            } else {
                // Anything else is an article as soon as it contains an anchor.
                Elements anchors = child.getElementsByTag("a");
                if (!anchors.isEmpty()) {
                    items.add(plainArticle(child, anchors, issue, section));
                }
            }
        }
        return items;
    }

    /** Starts at the first "issue" element and descends while there is exactly one child. */
    private static Element findContentRoot(Document page) {
        Element node = page.getElementsByClass("issue").get(0);
        while (node.children().size() == 1) {
            node = node.child(0);
        }
        return node;
    }

    /** Builds an article from a div with an image; title and link come from its second anchor. */
    private static Article illustratedArticle(Element block, Elements images, String issue, String section) {
        Article article = new Article();
        article.setImageUrl(images.get(0).attr("src"));
        Element headline = block.getElementsByTag("a").get(1);
        article.setTitle(headline.text());
        article.setLink(headline.attr("href"));
        article.setBrief(block.getElementsByTag("p").get(0).text());
        applyDomain(article, block);
        article.setIssue(issue);
        article.setSection(section);
        return article;
    }

    /** Builds an article from a non-div element; the element's whole text becomes the brief. */
    private static Article plainArticle(Element block, Elements anchors, String issue, String section) {
        Article article = new Article();
        Element headline = anchors.get(0);
        article.setTitle(headline.text());
        applyDomain(article, block);
        article.setLink(headline.attr("href"));
        article.setBrief(block.text());
        article.setIssue(issue);
        article.setSection(section);
        return article;
    }

    /** Sets the domain from the first span, minus parentheses; leaves it unset without a span. */
    private static void applyDomain(Article article, Element block) {
        Elements spans = block.getElementsByTag("span");
        if (spans.isEmpty()) {
            return;
        }
        String raw = spans.get(0).text();
        article.setDomain(raw.replace("(", "").replace(")", ""));
    }
}

0 comments on commit acf4fbf

Please sign in to comment.