-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspider.js
156 lines (121 loc) · 3.61 KB
/
spider.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
/*
* @Author: Yingying
* @Date: 2016-05-24 23:45:59
* @Last Modified by: Yingying
* @Last Modified time: 2016-08-01 00:26:24
*/
"use strict";
// mysql模块
var mysql = require('mysql');
// http模块
var http = require("http");
// 开启路由
var express = require('express');
var app = express();
// Site to crawl.
var url = "http://zyingying.github.io";
// Author value stored alongside every catalog row.
var author = "zyingying";
// Buffer that accumulates the response body as chunks arrive.
var data = "";

// MySQL connection shared by the query helpers in this file.
var dbSettings = {
  host: '127.0.0.1',
  user: 'root',
  password: 'root',
  database: 'nodesample'
};
var connection = mysql.createConnection(dbSettings);
/**
 * Extract post links from the HTML of the crawled page.
 *
 * Only anchors whose `href` starts with `/` followed by four digits and that
 * carry both a `title` and an `itemprop` attribute (in that order) are
 * matched.
 *
 * @param {string} html - Raw page markup.
 * @returns {Array<{Url: string, Title: string}>} One entry per matched anchor.
 */
function extractPostLinks(html) {
  // The title and itemprop values are matched lazily up to their own closing
  // quote (back-references \3 and \5). A greedy capture here would swallow
  // the closing quote and any whitespace before `itemprop=`.
  const linkPattern = /<a(?= )[^>]* href=(['"])(\/\d{4}.*?)\1[^>]*\s*title=(['"])(.*?)\3\s*itemprop=(['"])(.*?)\5[^>]*>.*?<\/a>/g;
  const links = [];
  let match;
  while ((match = linkPattern.exec(html)) !== null) {
    links.push({
      "Url": match[2],
      "Title": match[4]
    });
  }
  return links;
}

// Create the request for the page to crawl.
var req = http.request(url, function (res) {
  // Decode the body as UTF-8 text.
  res.setEncoding("utf8");
  // The body arrives in chunks; append each one to the shared buffer.
  res.on('data', function (chunk) {
    data += chunk;
  });
  // Once the response is complete, parse the buffered markup and print it.
  res.on('end', function () {
    var links = extractPostLinks(data);
    console.log(links);
    // NOTE: persisting the links with main(links) is currently disabled.
  });
});
// Without an 'error' listener a failed request (DNS, refused connection, ...)
// is emitted as an unhandled 'error' event and terminates the process.
req.on('error', function (err) {
  console.log('[REQUEST ERROR] - ', err.message);
});
// Send the request.
req.end();
/**
 * Shared helper that runs one SQL statement on the module-level `connection`.
 *
 * The statement is issued exactly once. Failures are logged with the
 * `options` label and are NOT forwarded to `fun`, so callers only ever see
 * successful results.
 *
 * @param {string} userGetSql - SQL text; may contain `?` placeholders.
 * @param {string} options - Label prefixed to the logged error message.
 * @param {Function} [fun] - Optional callback, invoked as `fun(null, result)`
 *   on success.
 * @param {Array} [userAddSql_Params] - Optional values bound to the `?`
 *   placeholders in `userGetSql`.
 */
const query = (userGetSql, options, fun, userAddSql_Params) => {
  const onDone = (err, result) => {
    if (err) {
      console.log(options +'- ', err.message);
      return;
    }
    console.log('result----->', result);
    // The callback is optional (writes are fire-and-forget).
    if (typeof fun === 'function') {
      fun(null, result);
    }
  };

  if (userAddSql_Params === undefined) {
    connection.query(userGetSql, onDone);
  } else {
    connection.query(userGetSql, userAddSql_Params, onDone);
  }
};
/**
 * Store the scraped links in the `catalog` table, then start the web server.
 *
 * For every entry a SELECT by `Url` decides between updating the existing
 * row (only when the entry has a non-empty `Title`) and inserting a new one.
 * Afterwards the Express app serves `public/` and answers `GET /` with all
 * catalog rows on port 3000.
 *
 * @param {Array<{Url: string, Title: string}>} data - Scraped link entries.
 */
const main = (data) => {
  console.log('++++++++++++++++++++++++++++++++++++++', data);
  const entries = Array.isArray(data) ? data : [];

  // forEach gives each asynchronous callback its own `entry`; a shared loop
  // index would already be past the end of the array when the callbacks run.
  entries.forEach((entry) => {
    // Check whether this URL is already stored. The value comes from scraped
    // markup, so it is escaped instead of being concatenated raw.
    const selectSql = `SELECT * FROM catalog WHERE Url = ${mysql.escape(entry.Url)}`;
    query(selectSql, "SelectData", function (err, result) {
      // Defensive: nothing to decide without a result set.
      if (err || !result) {
        return;
      }
      if (result.length !== 0) {
        // Row exists: refresh it, but only when there is a title to write.
        if (entry.Title) {
          query('UPDATE catalog SET Author = ? ,Title = ? WHERE Url = ?', "update", undefined, [author, entry.Title, entry.Url]);
        }
        return;
      }
      // No matching row: this is new data, insert it.
      query('INSERT INTO catalog(Author,Title,Url) VALUES(?,?,?)', "insert", undefined, [author, entry.Title, entry.Url]);
    });
  });

  app.use('/', express.static('public'));
  app.get('/', function (req, res) {
    SelectAll(function (err, result) {
      if (err) {
        res.status(500).send(err.message);
        return;
      }
      res.send(result);
    });
  });
  app.listen(3000, function () {
    console.log('Example app listening on port 3000!');
  });
};
/**
 * Read every row of the `catalog` table through the shared `connection`.
 *
 * @param {Function} done - Node-style callback: `done(err)` on failure,
 *   `done(null, result)` with the returned rows on success.
 */
const SelectAll = (done) => {
  const selectEverything = 'SELECT * FROM catalog ';

  // Handles the driver's response: log, then hand the outcome to `done`.
  const onRows = (err, result) => {
    if (!err) {
      console.log('--------------------------SELECT----------------------------');
      console.log('result----->', result);
      done(null, result);
      return;
    }
    console.log('[SELECT ERROR] - ', err.message);
    return done(err);
  };

  connection.query(selectEverything, onRows);
};