-
Notifications
You must be signed in to change notification settings - Fork 4.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
25 changed files
with
418 additions
and
187 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,7 +20,7 @@ | |
* {@link #getHtml()} get content of current page <br> | ||
* {@link #putField(String, Object)} save extracted result <br> | ||
* {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}<br> | ||
* {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to fetch <br> | ||
* {@link #addTargetRequests(Iterable)} {@link #addTargetRequest(String)} add urls to fetch <br> | ||
* | ||
* @author [email protected] <br> | ||
* @see us.codecraft.webmagic.downloader.Downloader | ||
|
@@ -52,7 +52,7 @@ public class Page { | |
private List<Request> targetRequests = new ArrayList<Request>(); | ||
|
||
private String charset; | ||
|
||
public Page() { | ||
} | ||
|
||
|
@@ -108,7 +108,8 @@ public Json getJson() { | |
* @deprecated since 0.4.0 | ||
* The html is parsed only on the first call to {@link #getHtml()}, so use {@link #setRawText(String)} instead. | ||
*/ | ||
public void setHtml(Html html) { | ||
@Deprecated | ||
public void setHtml(Html html) { | ||
this.html = html; | ||
} | ||
|
||
|
@@ -121,7 +122,7 @@ public List<Request> getTargetRequests() { | |
* | ||
* @param requests requests | ||
*/ | ||
public void addTargetRequests(List<String> requests) { | ||
public void addTargetRequests(Iterable<String> requests) { | ||
for (String s : requests) { | ||
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { | ||
continue; | ||
|
@@ -137,7 +138,7 @@ public void addTargetRequests(List<String> requests) { | |
* @param requests requests | ||
* @param priority priority | ||
*/ | ||
public void addTargetRequests(List<String> requests, long priority) { | ||
public void addTargetRequests(Iterable<String> requests, long priority) { | ||
for (String s : requests) { | ||
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { | ||
continue; | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,16 +1,5 @@ | ||
package us.codecraft.webmagic.downloader; | ||
|
||
import java.io.IOException; | ||
import java.security.KeyManagementException; | ||
import java.security.NoSuchAlgorithmException; | ||
import java.security.cert.CertificateException; | ||
import java.security.cert.X509Certificate; | ||
import java.util.Map; | ||
|
||
import javax.net.ssl.SSLContext; | ||
import javax.net.ssl.TrustManager; | ||
import javax.net.ssl.X509TrustManager; | ||
|
||
import org.apache.commons.lang3.JavaVersion; | ||
import org.apache.commons.lang3.SystemUtils; | ||
import org.apache.http.HttpException; | ||
|
@@ -22,28 +11,32 @@ | |
import org.apache.http.config.SocketConfig; | ||
import org.apache.http.conn.socket.ConnectionSocketFactory; | ||
import org.apache.http.conn.socket.PlainConnectionSocketFactory; | ||
import org.apache.http.conn.ssl.DefaultHostnameVerifier; | ||
import org.apache.http.conn.ssl.SSLConnectionSocketFactory; | ||
import org.apache.http.impl.client.BasicCookieStore; | ||
import org.apache.http.impl.client.CloseableHttpClient; | ||
import org.apache.http.impl.client.DefaultHttpRequestRetryHandler; | ||
import org.apache.http.impl.client.HttpClientBuilder; | ||
import org.apache.http.impl.client.HttpClients; | ||
import org.apache.http.impl.client.*; | ||
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; | ||
import org.apache.http.impl.cookie.BasicClientCookie; | ||
import org.apache.http.protocol.HttpContext; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
import us.codecraft.webmagic.Site; | ||
|
||
import javax.net.ssl.SSLContext; | ||
import javax.net.ssl.TrustManager; | ||
import javax.net.ssl.X509TrustManager; | ||
import java.io.IOException; | ||
import java.security.KeyManagementException; | ||
import java.security.NoSuchAlgorithmException; | ||
import java.security.cert.CertificateException; | ||
import java.security.cert.X509Certificate; | ||
import java.util.Map; | ||
|
||
/** | ||
* @author [email protected] <br> | ||
* @since 0.4.0 | ||
*/ | ||
public class HttpClientGenerator { | ||
|
||
private transient Logger logger = LoggerFactory.getLogger(getClass()); | ||
private transient Logger logger = LoggerFactory.getLogger(getClass()); | ||
|
||
private PoolingHttpClientConnectionManager connectionManager; | ||
|
||
|
@@ -61,21 +54,20 @@ private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { | |
SSLContext sslContext = createIgnoreVerifySSL(); | ||
String[] supportedProtocols; | ||
if (SystemUtils.isJavaVersionAtLeast(JavaVersion.JAVA_11)) { | ||
supportedProtocols = new String[] { "SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3" }; | ||
supportedProtocols = new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3"}; | ||
} else { | ||
supportedProtocols = new String[] { "SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2" }; | ||
supportedProtocols = new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2"}; | ||
} | ||
logger.debug("supportedProtocols: {}", String.join(", ", supportedProtocols)); | ||
return new SSLConnectionSocketFactory(sslContext, supportedProtocols, | ||
null, | ||
new DefaultHostnameVerifier()); // 优先绕过安全证书 | ||
} catch (KeyManagementException e) { | ||
logger.error("ssl connection fail", e); | ||
} catch (NoSuchAlgorithmException e) { | ||
//不进行主机校验 | ||
(host, sslSession) -> true); // 优先绕过安全证书 | ||
} catch (KeyManagementException | NoSuchAlgorithmException e) { | ||
logger.error("ssl connection fail", e); | ||
} | ||
return SSLConnectionSocketFactory.getSocketFactory(); | ||
} | ||
} | ||
|
||
private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException { | ||
// 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法 | ||
|
@@ -97,9 +89,9 @@ public X509Certificate[] getAcceptedIssuers() { | |
}; | ||
|
||
SSLContext sc = SSLContext.getInstance("TLS"); | ||
sc.init(null, new TrustManager[] { trustManager }, null); | ||
sc.init(null, new TrustManager[]{trustManager}, null); | ||
return sc; | ||
} | ||
} | ||
|
||
public HttpClientGenerator setPoolSize(int poolSize) { | ||
connectionManager.setMaxTotal(poolSize); | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
23 changes: 23 additions & 0 deletions
23
webmagic-core/src/main/java/us/codecraft/webmagic/utils/BaseSelectorUtils.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
package us.codecraft.webmagic.utils; | ||
|
||
/**
 * Static helpers that adjust raw HTML text before it is parsed.
 *
 * @author hooy
 */
public class BaseSelectorUtils {

    /**
     * Wraps a bare table-row or table-cell fragment in a {@code <table>} element.
     * <p>
     * Jsoup/HtmlCleaner could not parse a "tr" or "td" tag directly, see
     * https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag
     * <p>
     * The fragment is wrapped only when it both starts with the opening tag
     * ({@code <tr>}, {@code <tr }, {@code <td>} or {@code <td }) and ends with the
     * matching closing tag. Any other text is returned unchanged.
     *
     * @param text the html string, may be {@code null}
     * @return the wrapped fragment, the original text when no wrapping applies,
     *         or {@code null} when {@code text} is {@code null}
     */
    public static String preParse(String text) {
        // Nothing to inspect; hand null back instead of failing with a NullPointerException.
        if (text == null) {
            return null;
        }
        if (isEnclosedBy(text, "tr") || isEnclosedBy(text, "td")) {
            return "<table>" + text + "</table>";
        }
        return text;
    }

    /**
     * Tells whether {@code text} starts with the opening form of {@code tag}
     * (with or without attributes) and ends with its closing form.
     */
    private static boolean isEnclosedBy(String text, String tag) {
        // "<tag " covers an opening tag that carries attributes.
        boolean opens = text.startsWith("<" + tag + ">") || text.startsWith("<" + tag + " ");
        return opens && text.endsWith("</" + tag + ">");
    }

}
17 changes: 17 additions & 0 deletions
17
webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
package us.codecraft.webmagic; | ||
|
||
import static org.junit.Assert.assertEquals; | ||
|
||
import java.nio.charset.StandardCharsets; | ||
|
||
import org.junit.Test; | ||
|
||
public class SiteTest { | ||
|
||
@Test | ||
public void test() { | ||
Site site = Site.me().setDefaultCharset(StandardCharsets.UTF_8.name()); | ||
assertEquals(StandardCharsets.UTF_8.name(), site.getDefaultCharset()); | ||
} | ||
|
||
} |
Oops, something went wrong.