-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathspider.js
336 lines (314 loc) · 9.74 KB
/
spider.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
// await-based version, written with native ES8 (ES2017) async functions
var http = require("http");
var fs = require("fs");
var cheerio = require("cheerio");
var request = require("request");
var concurrencyCount = 0;
// Path of the JSON file whose `urls` array lists the pages to crawl.
var json = "./data.json";
// Path of the aggregated library index; its `prod` array is read below.
var config = "./jsonconfig/index.json";
// var url = "http://www.bootcdn.cn/react/";
var mkdirs = require("jm-mkdirs");
// Image URL stored on library entries that are newly added to the index.
var defaut_image =
"http://iuap-design-cdn.oss-cn-beijing.aliyuncs.com/static/cdnconfig/default.png";
// Both files are read and parsed synchronously while the module loads, so a
// missing or malformed file throws right here.
var json_urls = JSON.parse(fs.readFileSync(json, "utf8")).urls;
var config_all_data = JSON.parse(fs.readFileSync(config, "utf8"));
var config_data = config_all_data.prod;
// Module-level flag; fetchPage reassigns it on every call.
var flag = false;
const OSS = require('ali-oss');
const ossConfig = require('./ossConfig.json');
// OSS client shared by the head/put calls in this file.
let client = new OSS(ossConfig);
/**
 * Downloads a remote resource and resolves with the body handed to the
 * `request` callback.
 *
 * Whatever scheme `fileName` carries is replaced with plain `http://` before
 * the request is issued.
 *
 * @param {string} fileName - Absolute URL of the resource.
 * @returns {Promise<string>} Resolves with the response body; rejects with the
 *   error reported by `request` when the transfer fails.
 */
var readFileFunPromise = function(fileName) {
  return new Promise(function(resolve, reject) {
    var link = "http://" + fileName.split("://")[1];
    request(link, function(error, response, html) {
      if (error) {
        // The error used to be ignored and the promise resolved with
        // `undefined`, which only surfaced later as a confusing write failure.
        reject(error);
        return;
      }
      resolve(html);
    });
  });
};
// Upload
/**
 * Uploads a local file through the shared OSS client and records the outcome.
 *
 * On success the object key is appended to ./update.txt; on failure a line is
 * appended to ./cdnError.txt. The returned promise never rejects, so callers
 * may await it without their own error handling.
 *
 * @param {string} putUrl - Object key to upload to.
 * @param {string} filePath - Path of the local file to upload.
 * @returns {Promise<void>} Settles once the upload attempt has been recorded.
 */
function putCDN(putUrl, filePath) {
  return client
    .put(putUrl, filePath)
    .then(() => {
      fs.appendFileSync('./update.txt', `${putUrl} \n`, 'utf8');
      console.log(`😀${filePath} 上传成功`);
    })
    .catch(function (err) {
      console.error(`❌ ${filePath} 上传失败`, err);
      try {
        // fs.appendFile without a callback throws on current Node versions,
        // so the failure log is written synchronously instead.
        fs.appendFileSync('./cdnError.txt', `❌ ${filePath} 上传失败\n`, 'utf8');
      } catch (logErr) {
        // Failing to write the log must not turn into an unhandled rejection.
        console.error(logErr);
      }
    });
}
/**
 * Makes sure `item` is present on the CDN under `cdnPath`.
 *
 * The object is probed with `client.head`. When the probe reports status 200
 * nothing else happens; when it reports another status or rejects, the file
 * is downloaded to `path` and then uploaded.
 *
 * The returned promise settles only after that work has finished, so
 * `await checkUpload(...)` really waits for the file.
 *
 * @param {string} cdnPath - Object key on the CDN.
 * @param {string} path - Local path the downloaded file is written to.
 * @param {string} item - Source URL of the file.
 * @returns {Promise<void>}
 */
var checkUpload = async (cdnPath, path, item) => {
  let result;
  try {
    result = await client.head(cdnPath);
  } catch (e) {
    // Any rejection of head() is treated as "not on the CDN yet".
    console.log(`😀${cdnPath} CDN上没有开始下载 `, cdnPath);
    await downloadAndUpload(cdnPath, path, item);
    return;
  }
  const status = result && result.res ? result.res.status : undefined;
  if (Number(status) === 200) {
    // The CDN already has this file.
    console.log(`😀${cdnPath} CDN上已存在,跳过 `);
    return;
  }
  console.log('开始写入文件:' + path);
  await downloadAndUpload(cdnPath, path, item);
};

// Downloads `item` to `path` and, once the write succeeded, uploads it to `cdnPath`.
async function downloadAndUpload(cdnPath, path, item) {
  try {
    const fileData = await readFileFunPromise(item);
    // writeFileSync reports failure by throwing; it takes no callback, which is
    // why the upload that used to live in a callback never ran.
    fs.writeFileSync(path, fileData, "utf-8");
  } catch (err) {
    console.log(`写入文件 ${path} 失败✌️`);
    console.log(err);
    return;
  }
  console.log(`写入文件 ${path} 成功✌️ 开始上传`);
  await putCDN(cdnPath, path);
}
// Entry point for one library page
/**
 * Prepares the local directories for one library page and starts crawling it.
 *
 * The library name is the second-to-last "/"-separated segment of `url`
 * (lower-cased), so `url` is expected to end with "<name>/".
 *
 * @param {string} url - Page URL of the library.
 * @param {string[]} array - Names already present in the index; a library
 *   whose name is not in this list is treated as new.
 * @param {boolean} [bool] - Forwarded to startRequest unchanged.
 * @returns {Promise<void>} Settles when startRequest has finished; a failure is
 *   logged here and never propagates as a rejection.
 */
var fetchPage = function(url, array, bool) {
  var segments = url.split("/");
  var DIRNAME = segments[segments.length - 2].toLowerCase();
  ["./data/", "./jsonconfig/", "./data/" + DIRNAME + "/"].forEach(function(dir) {
    if (!fs.existsSync(dir)) {
      fs.mkdirSync(dir);
    }
  });
  try {
    flag = array.indexOf(DIRNAME) === -1;
  } catch (error) {
    // `array` is missing or has no indexOf: treat the library as already known.
    flag = false;
  }
  // startRequest must be invoked synchronously because `flag` is module-level
  // state that the next fetchPage call overwrites.
  return Promise.resolve(startRequest(url, DIRNAME, flag, bool)).catch(function(err) {
    // The promise used to be dropped, so a failed page became an unhandled rejection.
    console.error("fetchPage failed for " + url, err);
  });
};
// Request a library page, record its metadata and mirror its files
/**
 * Scrapes one library page and processes everything listed on it.
 *
 * Steps:
 *  1. Fetch the page and parse it with cheerio.
 *  2. When `flag` is truthy, append an entry for the library to `config_data`.
 *  3. When `bool` is truthy, rewrite ./jsonconfig/index.json.
 *  4. Collect every version heading (".container>h3") with its ".library-url" links.
 *  5. Write ./jsonconfig/<name>/index.json with the versions and file names.
 *  6. Pass each linked file that sits below a sub-directory to checkUpload.
 *
 * NOTE(review): getUrl starts all pages without waiting, so the index written
 * in step 3 may miss entries from pages that finish loading later — confirm
 * whether that is acceptable.
 *
 * @param {string} url - Page URL, expected to end with "<name>/".
 * @param {string} DIRNAME - Directory name used under ./data/ and static/.
 * @param {boolean} flag - Truthy when the library must be added to the index.
 * @param {boolean} [bool] - Truthy when the aggregated index must be written.
 * @returns {Promise<void>} Settles once every file has been handed to checkUpload.
 */
var startRequest = async function(url, DIRNAME, flag, bool) {
  // Issue a GET request for the page and parse the markup with cheerio.
  var html = await getHtml(url);
  var $ = cheerio.load(html);
  var segments = url.split("/");
  var name = segments[segments.length - 2].toLowerCase();

  if (flag) {
    config_data.push({
      name: name,
      image: defaut_image,
      desc: $(".container.jumbotron>p").html()
    });
  }
  if (bool) {
    config_all_data["prod"] = config_data;
    // writeFileSync throws on failure; it does not accept a callback.
    fs.writeFileSync(
      "./jsonconfig/index.json",
      JSON.stringify(config_all_data),
      "utf-8"
    );
  }

  var releases = collectReleases($);
  var json = {
    name: name,
    download: {},
    version: []
  };
  releases.forEach(function(release) {
    json["download"][release.version] = release.fileNames;
    json["version"].push(release.version);
  });

  // Start the downloads first (as before) and wait for them at the very end.
  var downloads = mirrorReleaseFiles(DIRNAME, releases);

  var configDir = "./jsonconfig/" + name + "/";
  if (!fs.existsSync(configDir)) {
    fs.mkdirSync(configDir);
    console.log("jsonconfig更新目录已创建成功\n");
  }
  // Write the per-library metadata to disk.
  fs.writeFileSync(
    "./jsonconfig/" + name + "/index.json",
    JSON.stringify(json),
    "utf-8"
  );

  await downloads;
};

// Reads every version heading and the file links listed under it from the parsed page.
function collectReleases($) {
  var releases = [];
  $(".container>h3").each(function() {
    var $heading = $(this);
    // The version is the text after the first ":" of the heading.
    var version = $heading.text().trim().split(":")[1];
    var links = [];
    var fileNames = [];
    $heading
      .next()
      .find(".library-url")
      .each(function() {
        var link = $(this).html();
        var parts = link.split(version + "/");
        links.push(link);
        fileNames.push("/" + parts[parts.length - 1]);
      });
    releases.push({ version: version, url: links, fileNames: fileNames });
  });
  return releases;
}

// Hands every linked file that sits below a sub-directory to checkUpload, one at a time.
async function mirrorReleaseFiles(DIRNAME, releases) {
  for (var r = 0; r < releases.length; r++) {
    var version = releases[r].version;
    var links = releases[r].url;
    for (var k = 0; k < links.length; k++) {
      var item = links[k];
      try {
        var link = "http://" + item.split("://")[1];
        var parts = link.split(version + "/");
        var relative = parts[parts.length - 1].split("/");
        if (relative.length <= 1) {
          // NOTE(review): files located directly in the version root are
          // skipped (the code that handled them had been commented out) even
          // though they are listed in the metadata — confirm this is intended.
          continue;
        }
        var fileName = relative.pop();
        var subDir = relative.join("/");
        var localDir = "./data/" + DIRNAME + "/" + version + "/" + subDir + "/";
        if (!fs.existsSync(localDir)) {
          mkdirs.sync(localDir);
        }
        var cdnPath = `static/${DIRNAME}/${version}/${subDir}/${fileName}`;
        await checkUpload(cdnPath, localDir + fileName, item);
      } catch (error) {
        // One broken file must not stop the rest, but the error is no longer hidden.
        console.error("failed to mirror " + item, error);
      }
    }
  }
}
// Fetch the HTML of a page
/**
 * Requests `url` and resolves with the body handed to the `request` callback.
 *
 * @param {string} url - Address of the page to fetch.
 * @returns {Promise<string>} Resolves with the response body; rejects with the
 *   error reported by `request` when the transfer fails.
 */
var getHtml = function(url) {
  return new Promise(function(resolve, reject) {
    request(url, function(error, response, html) {
      if (error) {
        // Previously ignored: the promise resolved with `undefined` and the
        // failure only showed up later while parsing.
        reject(error);
        return;
      }
      // The body is returned as-is; parsing is left to the caller.
      resolve(html);
    });
  });
};
/**
 * Requests `link` over plain http and parses the returned markup with cheerio.
 * The parsed document is neither used nor returned.
 *
 * @param {string} link - URL of the resource; its scheme is replaced by "http://".
 */
function savedContent(link) {
  const [, afterScheme] = link.split("://");
  const target = `http://${afterScheme}`;
  request(target, (error, response, html) => {
    // Parse the HTML with cheerio.
    cheerio.load(html);
  });
}
//该函数的作用:在本地存储所爬取到的图片资源
// function savedImg($, news_title) {
// var i = 0;
// $('.main_img.img-hover').each(function(index, item) {
// var img_title = $(this).closest('li').attr('data-title').trim(); //获取图片的标题
// if (img_title.length > 35 || img_title == "") {
// img_title = "Null";
// }
// var img_filename = img_title + '.jpg';
// var img_src = $(this).attr('src'); //获取图片的url
//
// //采用request模块,向服务器发起一次请求,获取图片资源
// request.head(img_src, function(err, res, body) {
// if (err) {
// console.log(err);
// }
// });
// request(img_src).pipe(fs.createWriteStream('./image/' + img_filename)); //通过流的方式,把图片写到本地/image目录下,并用新闻的标题和图片的标题作为图片的名称。
// if (i > 10) {
// return false;
// }
// })
// }
// function openImage(url) {
// console.log(url);
// request(url, function(error, response, body) {
// if (!error && response.statusCode == 200) {
// var $ = cheerio.load(body); //采用cheerio模块解析html
// console.log(body);
// savedImg($);
// // console.log(body);
// }
// })
//
// }
// openImage(url);
// fetchPage(url); //主程序开始运行
/**
 * Starts fetchPage for every URL in `json_urls`.
 *
 * The names currently in the index are collected once and passed to every
 * call; only the call for the last URL additionally receives `true`.
 *
 * @param {string[]} json_urls - Page URLs to crawl.
 */
var getUrl = function(json_urls) {
  const knownNames = getConfigName();
  const lastPosition = json_urls.length - 1;
  json_urls.forEach((pageUrl, position) => {
    if (position == lastPosition) {
      fetchPage(pageUrl, knownNames, true);
      return;
    }
    fetchPage(pageUrl, knownNames);
  });
};
/**
 * Lists the `name` of every entry currently held in `config_data`, in order.
 *
 * @returns {Array} A new array with one name per entry.
 */
var getConfigName = function() {
  return Array.from(config_data, (entry) => entry.name);
};
// Script entry point: start crawling every URL read from ./data.json at load time.
getUrl(json_urls);