Skip to content

Commit

Permalink
User Agent and Referer in FileUtils.copyURLToFile(URL, File)
Browse files Browse the repository at this point in the history
  • Loading branch information
iArthurTsai committed Jan 9, 2022
1 parent 5066acc commit 2dfcade
Show file tree
Hide file tree
Showing 6 changed files with 97 additions and 13 deletions.
4 changes: 4 additions & 0 deletions Spider/Spider.iml
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,9 @@
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" name="Maven: org.jsoup:jsoup:1.14.3" level="project" />
<orderEntry type="library" name="Maven: commons-io:commons-io:2.11.0" level="project" />
<orderEntry type="library" name="Maven: org.apache.httpcomponents:httpclient:4.5.13" level="project" />
<orderEntry type="library" name="Maven: org.apache.httpcomponents:httpcore:4.4.13" level="project" />
<orderEntry type="library" name="Maven: commons-logging:commons-logging:1.2" level="project" />
<orderEntry type="library" name="Maven: commons-codec:commons-codec:1.11" level="project" />
</component>
</module>
7 changes: 7 additions & 0 deletions Spider/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,13 @@
<artifactId>commons-io</artifactId>
<version>2.11.0</version>
</dependency>
<dependency>
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.13</version>
</dependency>

</dependencies>

</project>
60 changes: 47 additions & 13 deletions Spider/src/main/java/GOTOP_book_attachment_downloader.java
Original file line number Diff line number Diff line change
@@ -1,4 +1,16 @@
//碁峰圖書附件下載器
import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

import java.io.File;
import java.io.IOException;
import java.net.URL;

import java.io.*;
import java.util.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
Expand Down Expand Up @@ -30,10 +42,7 @@ public static void main(String[] args) throws IOException {
String Source = ("http://books.gotop.com.tw/download/" + str); //下載來源網址
System.out.print("\nSource : " + Source);

Document doc = Jsoup.connect(Source)
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36")
.referrer("http://www.google.com")
.get(); //https://stackoverflow.com/questions/6581655/jsoup-useragent-how-to-set-it-right
Document doc = Jsoup.connect(Source).get();
//System.out.println("書名: " + doc.title());

//Elements Title = doc.select("#Label1"); //書名
Expand All @@ -48,14 +57,27 @@ public static void main(String[] args) throws IOException {
for (Element corver : Image) {
System.out.println("封面下載網址:" + corver.absUrl("src")); //封面下載網址

String src =(corver.absUrl("src"));
String src = (corver.absUrl("src"));

//https://www.delftstack.com/zh-tw/howto/java/java-remove-character-from-string/#%E5%9C%A8-java-%E4%B8%AD%E4%BD%BF%E7%94%A8-replace-%E5%87%BD%E5%BC%8F%E5%BE%9E%E5%AD%97%E4%B8%B2%E4%B8%AD%E5%88%AA%E9%99%A4%E5%AD%97%E5%85%83
String front_cover = src.replace("http://www.gotop.com.tw/Waweb2004/WawebImages/bookXL/", "");

System.out.println("File name : " + front_cover);

CloseableHttpClient httpClient = HttpClients.createDefault(); //https://stackoverflow.com/questions/35995431/how-to-specify-user-agent-and-referer-in-fileutils-copyurltofileurl-file-meth

HttpGet Getcorver = new HttpGet(src);
Getcorver.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36");
Getcorver.addHeader("Referer", Source);

CloseableHttpResponse httpResponse = httpClient.execute(Getcorver);
HttpEntity Entity = httpResponse.getEntity();

if (Entity != null) {
FileUtils.copyInputStreamToFile(Entity.getContent(), new File("C:\\Users\\Public\\Downloads\\" + front_cover));
} //C:\Users\Public\Downloads

Getcorver.releaseConnection();

//https://stackoverflow.com/questions/28840604/how-to-initiate-a-file-download-in-the-browser-using-java
//String fileName = front_cover;
Expand All @@ -67,7 +89,7 @@ public static void main(String[] args) throws IOException {
//int n = 0;
//while (-1!=(n=in.read(buf)))
//{
//out.write(buf, 0, n);
//out.write(buf, 0, n);
//}
//out.close();
//in.close();
Expand All @@ -82,7 +104,7 @@ public static void main(String[] args) throws IOException {

//Path cover = Paths.get(front_cover); //"front_cover.jpg"
//try (InputStream inputStream = fetchWebsite.openStream()) {
//Files.copy(inputStream, cover, StandardCopyOption.REPLACE_EXISTING);
//Files.copy(inputStream, cover, StandardCopyOption.REPLACE_EXISTING);
//}
break;
}
Expand All @@ -103,7 +125,7 @@ public static void main(String[] args) throws IOException {
for (Element attachment : newsHeadlines) {
System.out.println("附件下載網址:" + attachment.absUrl("href")); //附件下載網址

String href =(attachment.absUrl("href"));
String href = (attachment.absUrl("href"));

String file_name = href.replace("http://dlcenter.gotop.com.tw/SampleFiles/" + str + "/download/", "");
System.out.println("File name : " + file_name);
Expand All @@ -112,9 +134,22 @@ public static void main(String[] args) throws IOException {
//System.out.print("Enter file format : ");
//String file_format = format.next(); //reads string before the space
//System.out.print("你輸入的檔案格式:" + file_format + "\n"); //檔案格式
//System.out.print("Downloading..." + "\n");
System.out.print("Downloading..." + "\n");

CloseableHttpClient httpClient = HttpClients.createDefault(); //https://stackoverflow.com/questions/35995431/how-to-specify-user-agent-and-referer-in-fileutils-copyurltofileurl-file-meth

HttpGet Getattachment = new HttpGet(href);
Getattachment.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36");
Getattachment.addHeader("Referer", Source);

CloseableHttpResponse httpResponse = httpClient.execute(Getattachment);
HttpEntity Entity = httpResponse.getEntity();

if (Entity != null) {
FileUtils.copyInputStreamToFile(Entity.getContent(), new File("C:\\Users\\Public\\Downloads\\" + file_name));
} //C:\Users\Public\Downloads

Getattachment.releaseConnection();

//String fileName = file_name;
//URL link = new URL(href);
Expand All @@ -125,7 +160,7 @@ public static void main(String[] args) throws IOException {
//int n = 0;
//while (-1!=(n=in.read(buf)))
//{
//out.write(buf, 0, n);
//out.write(buf, 0, n);
//}
//out.close();
//in.close();
Expand All @@ -139,12 +174,11 @@ public static void main(String[] args) throws IOException {

//Path path = Paths.get(file_name); // + "." + file_format
//try (InputStream inputStream = fetchWebsite.openStream()) {
//Files.copy(inputStream, path, StandardCopyOption.REPLACE_EXISTING);
//Files.copy(inputStream, path, StandardCopyOption.REPLACE_EXISTING);
//}
break;
}
}
catch (Exception e) {
} catch (Exception e) {
System.out.println("\nerror: " + e);
System.out.println("\n書號有誤或無附件或檔案名有中文名");
}
Expand Down
39 changes: 39 additions & 0 deletions Spider/src/main/java/saveFile.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

import java.io.File;
import java.io.IOException;
import java.net.URL;

/**
 * Helper for downloading a remote resource over HTTP and storing it on disk.
 *
 * <p>The request is sent with a browser-like {@code User-Agent} and a
 * {@code Referer} header, which plain {@code FileUtils.copyURLToFile(URL, File)}
 * cannot set.
 */
public class saveFile {

    /** Browser-like User-Agent sent with every download request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36";

    /** Referer header sent with every download request. */
    private static final String REFERER = "https://www.google.com";

    /**
     * Downloads the resource at {@code imgURL} and writes it to {@code imgSavePath}.
     *
     * @param imgURL      location of the resource to download
     * @param imgSavePath path of the file to create or overwrite
     * @return {@code true} if the server answered with a 2xx status and the body was
     *         written to disk; {@code false} on an I/O error, a non-2xx status, or a
     *         response without a body
     */
    public static boolean saveFile(URL imgURL, String imgSavePath) {
        HttpGet httpGet = new HttpGet(imgURL.toString());
        httpGet.addHeader("User-Agent", USER_AGENT);
        httpGet.addHeader("Referer", REFERER);

        // try-with-resources closes the response (releasing the connection) and then
        // the client (shutting down its connection pool) on every exit path,
        // including exceptions; the original leaked both.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse httpResponse = httpClient.execute(httpGet)) {

            // Do not store an HTML error page under the target file name.
            int statusCode = httpResponse.getStatusLine().getStatusCode();
            if (statusCode < 200 || statusCode >= 300) {
                return false;
            }

            HttpEntity imageEntity = httpResponse.getEntity();
            if (imageEntity == null) {
                // Nothing was written, so this must not be reported as success.
                return false;
            }

            // copyInputStreamToFile closes the source stream when it is done.
            FileUtils.copyInputStreamToFile(imageEntity.getContent(), new File(imgSavePath));
            return true;
        } catch (IOException e) {
            // Failure is reported through the return value, as in the original contract.
            return false;
        }
    }
}
Binary file modified Spider/target/classes/GOTOP_book_attachment_downloader.class
Binary file not shown.
Binary file added Spider/target/classes/saveFile.class
Binary file not shown.

0 comments on commit 2dfcade

Please sign in to comment.