Skip to content

Commit

Permalink
Merge branch 'release/0.9.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
sutra committed Jun 22, 2023
2 parents 9e59b37 + 3688226 commit fd4a136
Show file tree
Hide file tree
Showing 25 changed files with 418 additions and 187 deletions.
4 changes: 2 additions & 2 deletions README-zh.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.5</version>
<version>${webmagic.version}</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.5</version>
<version>${webmagic.version}</version>
</dependency>
```
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@ Add dependencies to your pom.xml:
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.5</version>
<version>${webmagic.version}</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.5</version>
<version>${webmagic.version}</version>
</dependency>
```
Expand Down
4 changes: 2 additions & 2 deletions pom.xml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<groupId>us.codecraft</groupId>
<version>0.8.0</version>
<version>0.9.0</version>
<modelVersion>4.0.0</modelVersion>
<packaging>pom</packaging>
<properties>
Expand Down Expand Up @@ -124,7 +124,7 @@
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>xsoup</artifactId>
<version>0.3.6</version>
<version>0.3.7</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
Expand Down
2 changes: 1 addition & 1 deletion webmagic-core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId>
<version>0.8.0</version>
<version>0.9.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>

Expand Down
11 changes: 6 additions & 5 deletions webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
* {@link #getHtml()} get content of current page <br>
* {@link #putField(String, Object)} save extracted result <br>
* {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}<br>
* {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to fetch <br>
* {@link #addTargetRequests(Iterable)} {@link #addTargetRequest(String)} add urls to fetch <br>
*
* @author [email protected] <br>
* @see us.codecraft.webmagic.downloader.Downloader
Expand Down Expand Up @@ -52,7 +52,7 @@ public class Page {
private List<Request> targetRequests = new ArrayList<Request>();

private String charset;

public Page() {
}

Expand Down Expand Up @@ -108,7 +108,8 @@ public Json getJson() {
* @deprecated since 0.4.0
* The html is parse just when first time of calling {@link #getHtml()}, so use {@link #setRawText(String)} instead.
*/
public void setHtml(Html html) {
@Deprecated
public void setHtml(Html html) {
this.html = html;
}

Expand All @@ -121,7 +122,7 @@ public List<Request> getTargetRequests() {
*
* @param requests requests
*/
public void addTargetRequests(List<String> requests) {
public void addTargetRequests(Iterable<String> requests) {
for (String s : requests) {
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
continue;
Expand All @@ -137,7 +138,7 @@ public void addTargetRequests(List<String> requests) {
* @param requests requests
* @param priority priority
*/
public void addTargetRequests(List<String> requests, long priority) {
public void addTargetRequests(Iterable<String> requests, long priority) {
for (String s : requests) {
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
continue;
Expand Down
26 changes: 26 additions & 0 deletions webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ public class Site {

private String charset;

private String defaultCharset;

private int sleepTime = 5000;

private int retryTimes = 0;
Expand Down Expand Up @@ -168,6 +170,30 @@ public String getCharset() {
return charset;
}

/**
* Sets the default charset of the page.
* <p>
* The value is stored on this site and is meant to be used as the fallback
* charset when charset detection fails.
*
* @param defaultCharset the default charset
* @return this
* @since 0.9.0
*/
public Site setDefaultCharset(String defaultCharset) {
this.defaultCharset = defaultCharset;
return this;
}

/**
* Returns the default charset to use when charset detection fails.
*
* @return the default charset, or {@code null} if none has been set
* @since 0.9.0
*/
public String getDefaultCharset() {
return defaultCharset;
}

/**
* Returns the value of this site's {@code timeOut} setting.
*
* @return the configured timeout (unit is not visible in this excerpt; presumably milliseconds - TODO confirm)
*/
public int getTimeOut() {
return timeOut;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.http.HttpResponse;
Expand Down Expand Up @@ -76,7 +77,7 @@ public Page download(Request request, Task task) {
}
CloseableHttpResponse httpResponse = null;
CloseableHttpClient httpClient = getHttpClient(task.getSite());
Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(task) : null;
Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(request, task) : null;
HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy);
Page page = Page.fail();
try {
Expand Down Expand Up @@ -116,7 +117,7 @@ protected Page handleResponse(Request request, String charset, HttpResponse http
page.setBytes(bytes);
if (!request.isBinaryContent()) {
if (charset == null) {
charset = getHtmlCharset(contentType, bytes);
charset = getHtmlCharset(contentType, bytes, task);
}
page.setCharset(charset);
page.setRawText(new String(bytes, charset));
Expand All @@ -131,11 +132,11 @@ protected Page handleResponse(Request request, String charset, HttpResponse http
return page;
}

private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException {
private String getHtmlCharset(String contentType, byte[] contentBytes, Task task) throws IOException {
String charset = CharsetUtils.detectCharset(contentType, contentBytes);
if (charset == null) {
charset = Charset.defaultCharset().name();
logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
charset = Optional.ofNullable(task.getSite().getDefaultCharset()).orElseGet(Charset.defaultCharset()::name);
logger.info("Charset autodetect failed, use {} as charset.", task.getSite().getDefaultCharset());
}
return charset;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,16 +1,5 @@
package us.codecraft.webmagic.downloader;

import java.io.IOException;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.Map;

import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;

import org.apache.commons.lang3.JavaVersion;
import org.apache.commons.lang3.SystemUtils;
import org.apache.http.HttpException;
Expand All @@ -22,28 +11,32 @@
import org.apache.http.config.SocketConfig;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.DefaultHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.client.*;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.protocol.HttpContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import us.codecraft.webmagic.Site;

import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import java.io.IOException;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.Map;

/**
* @author [email protected] <br>
* @since 0.4.0
*/
public class HttpClientGenerator {

private transient Logger logger = LoggerFactory.getLogger(getClass());
private transient Logger logger = LoggerFactory.getLogger(getClass());

private PoolingHttpClientConnectionManager connectionManager;

Expand All @@ -61,21 +54,20 @@ private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() {
SSLContext sslContext = createIgnoreVerifySSL();
String[] supportedProtocols;
if (SystemUtils.isJavaVersionAtLeast(JavaVersion.JAVA_11)) {
supportedProtocols = new String[] { "SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3" };
supportedProtocols = new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3"};
} else {
supportedProtocols = new String[] { "SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2" };
supportedProtocols = new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2"};
}
logger.debug("supportedProtocols: {}", String.join(", ", supportedProtocols));
return new SSLConnectionSocketFactory(sslContext, supportedProtocols,
null,
new DefaultHostnameVerifier()); // 优先绕过安全证书
} catch (KeyManagementException e) {
logger.error("ssl connection fail", e);
} catch (NoSuchAlgorithmException e) {
//不进行主机校验
(host, sslSession) -> true); // 优先绕过安全证书
} catch (KeyManagementException | NoSuchAlgorithmException e) {
logger.error("ssl connection fail", e);
}
return SSLConnectionSocketFactory.getSocketFactory();
}
}

private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException {
// 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法
Expand All @@ -97,9 +89,9 @@ public X509Certificate[] getAcceptedIssuers() {
};

SSLContext sc = SSLContext.getInstance("TLS");
sc.init(null, new TrustManager[] { trustManager }, null);
sc.init(null, new TrustManager[]{trustManager}, null);
return sc;
}
}

public HttpClientGenerator setPoolSize(int poolSize) {
connectionManager.setMaxTotal(poolSize);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package us.codecraft.webmagic.proxy;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;

/**
Expand All @@ -23,7 +24,23 @@ public interface ProxyProvider {
* Get a proxy for task by some strategy.
* @param task the download task
* @return proxy
* @deprecated Use {@link #getProxy(Request, Task)} instead.
*/
Proxy getProxy(Task task);
@Deprecated
default Proxy getProxy(Task task) {
    // Reached only when an implementation overrides neither getProxy overload;
    // say so explicitly instead of failing with a message-less exception.
    throw new UnsupportedOperationException(
            "getProxy(Task) is not implemented; override getProxy(Request, Task) instead");
}

/**
 * Selects the proxy to use for the given request.
 * <p>
 * The default implementation does not look at {@code request}; it simply
 * delegates to {@link #getProxy(Task)}.
 *
 * @param request the request a proxy is needed for
 * @param task the download task
 * @return proxy
 * @since 0.9.0
 */
default Proxy getProxy(Request request, Task task) {
    return getProxy(task);
}

}
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package us.codecraft.webmagic.proxy;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;

import java.util.ArrayList;
Expand Down Expand Up @@ -44,7 +45,7 @@ public void returnProxy(Proxy proxy, Page page, Task task) {
}

@Override
public Proxy getProxy(Task task) {
public Proxy getProxy(Request request, Task task) {
return proxies.get(incrForLoop());
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import us.codecraft.webmagic.utils.BaseSelectorUtils;

import java.util.ArrayList;
import java.util.List;
Expand All @@ -13,16 +14,9 @@
*/
public abstract class BaseElementSelector implements Selector, ElementSelector {
private Document parse(String text) {
if (text == null) {
return null;
}

// Jsoup could not parse <tr></tr> or <td></td> tag directly
// https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag
if ((text.startsWith("<tr>") && text.endsWith("</tr>"))
|| (text.startsWith("<td>") && text.endsWith("</td>"))) {
text = "<table>" + text + "</table>";
}
text = BaseSelectorUtils.preParse(text);
return Jsoup.parse(text);
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
package us.codecraft.webmagic.utils;

/**
 * Helpers for preparing HTML text before it is handed to a parser.
 *
 * @author hooy
 */
public class BaseSelectorUtils {

    /**
     * Wraps a standalone table row or cell in a {@code <table>} element.
     * <p>
     * Jsoup/HtmlCleaner could not parse "tr" or "td" tag directly, see
     * https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag
     *
     * @param text - the html string, may be {@code null}
     * @return the text wrapped in {@code <table>...</table>} when it starts with a
     *         {@code tr}/{@code td} opening tag and ends with the matching closing tag;
     *         otherwise the text unchanged ({@code null} for {@code null} input)
     */
    public static String preParse(String text) {
        // Null-safe: a null input is passed through instead of failing on startsWith.
        if (text == null) {
            return null;
        }
        if (isEnclosedBy(text, "tr") || isEnclosedBy(text, "td")) {
            return "<table>" + text + "</table>";
        }
        return text;
    }

    /**
     * Tells whether {@code text} begins with an opening {@code tag} (with or without
     * attributes) and ends with the matching closing tag.
     */
    private static boolean isEnclosedBy(String text, String tag) {
        boolean opens = text.startsWith("<" + tag + ">") || text.startsWith("<" + tag + " ");
        return opens && text.endsWith("</" + tag + ">");
    }

}
17 changes: 17 additions & 0 deletions webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
package us.codecraft.webmagic;

import static org.junit.Assert.assertEquals;

import java.nio.charset.StandardCharsets;

import org.junit.Test;

/**
 * Unit tests for {@link Site}.
 */
public class SiteTest {

    /**
     * Checks that the value given to {@code setDefaultCharset} is returned by
     * {@code getDefaultCharset}.
     */
    @Test
    public void test() {
        String utf8 = StandardCharsets.UTF_8.name();
        Site site = Site.me().setDefaultCharset(utf8);
        assertEquals(StandardCharsets.UTF_8.name(), site.getDefaultCharset());
    }

}
Loading

0 comments on commit fd4a136

Please sign in to comment.