-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.ts
112 lines (103 loc) · 3.21 KB
/
index.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
const puppeteer = require('puppeteer');
const fs = require('fs');
// Google result crawler.
//
// Usage: <script> <query> <regexp> [filter] [subSearch]
//   query     - search terms; spaces are replaced by '+' before being put in the URL
//   regexp    - pattern (compiled with flags 'gm') applied to every result page
//   filter    - 'true' appends '&filter=0' to the search URL
//   subSearch - 'true' matches against the rendered text (document.body.innerText)
//               of each result; otherwise the raw HTML from page.content() is used
//
// Matches are collected in `data` and written to data.json; the browser cookies
// are read from and written back to cookies.json.
const [, , queryArg, patternArg, filterArg, subSearchArg] = process.argv;
if (!queryArg || !patternArg) {
  // Without this guard a missing query throws a TypeError and a missing pattern
  // silently becomes an empty regex that matches at every position.
  console.error('Usage: <script> <query> <regexp> [filter: true|false] [subSearch: true|false]');
  process.exit(1);
}

// NOTE(review): apart from spaces, the query is interpolated into the URL as-is.
const query = queryArg.replace(/ /g, '+');
const regexp = new RegExp(patternArg, 'gm');
const filter = filterArg === 'true';
// Previously referenced but never declared, which raised a ReferenceError on the
// first search result. Defaults to false (raw HTML matching) when omitted.
const subSearch = subSearchArg === 'true';
const data = [];

const COOKIES_FILE = 'cookies.json';
const DATA_FILE = 'data.json';
const MAX_RESULT_PAGES = 20;
const RESULT_LINK_SELECTOR = 'div[data-header-feature] > div > a';

/**
 * Resolve after the given delay.
 * Used instead of page.waitForTimeout, which recent Puppeteer releases no longer provide.
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Read the cookies saved by a previous run.
 * @returns {Array<object>} Parsed content of cookies.json, or [] when the file does not exist yet.
 * @throws {Error} On any read error other than a missing file, or on invalid JSON.
 */
function readSavedCookies() {
  let raw;
  try {
    raw = fs.readFileSync(COOKIES_FILE, 'utf8');
  } catch (err) {
    if (err.code === 'ENOENT') {
      return [];
    }
    throw err;
  }
  return JSON.parse(raw);
}

/**
 * Open one search result in its own tab and collect the regexp matches.
 * The tab is always closed, including on the ReCAPTCHA and error paths.
 * @param {object} browser - Puppeteer browser instance.
 * @param {string} url - Address of the search result.
 * @returns {Promise<{blocked: boolean, matches: string[]}>} `blocked` is true when
 *   an element with id "recaptcha" is present on the page.
 */
async function scanResultPage(browser, url) {
  const tab = await browser.newPage();
  try {
    await tab.goto(url);
    if ((await tab.$('#recaptcha')) !== null) {
      return { blocked: true, matches: [] };
    }
    if (subSearch) {
      // A RegExp object does not survive argument serialization, so send its
      // source and flags and rebuild it inside the page.
      const found = await tab.evaluate(
        (source, flags) => document.body.innerText.match(new RegExp(source, flags)) || [],
        regexp.source,
        regexp.flags
      );
      return { blocked: false, matches: found };
    }
    const html = await tab.content();
    return { blocked: false, matches: html.match(regexp) || [] };
  } finally {
    await tab.close();
  }
}

(async () => {
  const browser = await puppeteer.launch({ headless: false, timeout: 5000 });
  try {
    const page = await browser.newPage();

    const savedCookies = readSavedCookies();
    if (savedCookies.length !== 0) {
      await page.setCookie(...savedCookies);
    }

    // Construct Google search URL with search query and "filter" parameter
    let searchUrl = `https://www.google.fr/search?q=intext%3A%28${query}%29`;
    if (filter) {
      searchUrl += '&filter=0';
    }
    await page.goto(searchUrl);

    let blocked = false;
    for (let pageIndex = 0; pageIndex < MAX_RESULT_PAGES; pageIndex++) {
      const links = await page.evaluate(
        (selector) => Array.from(document.querySelectorAll(selector), (anchor) => anchor.href),
        RESULT_LINK_SELECTOR
      );

      for (const link of links) {
        let outcome;
        try {
          outcome = await scanResultPage(browser, link);
        } catch (err) {
          // One unreachable or slow result should not abort the whole crawl.
          console.error(`Skipping ${link}: ${err.message}`);
          continue;
        }
        if (outcome.blocked) {
          console.log('ReCAPTCHA detected. Pausing loop.');
          blocked = true;
          break;
        }
        if (outcome.matches.length > 0) {
          data.push(outcome.matches);
          console.log(outcome.matches);
        }
      }

      // Stop on ReCAPTCHA (previously the "next page" branch re-enabled the loop),
      // after the last allowed page, or when there is no "next" link.
      const isLastAllowedPage = pageIndex === MAX_RESULT_PAGES - 1;
      if (blocked || isLastAllowedPage || (await page.$('#pnnext')) === null) {
        break;
      }

      console.log('Next page');
      // Start waiting before clicking so the navigation triggered by this click
      // is the one that gets awaited.
      await Promise.all([page.waitForNavigation(), page.click('#pnnext')]);
      // Random 1000-2999 ms pause between result pages.
      await sleep(Math.floor(Math.random() * 2000) + 1000);
    }

    fs.writeFileSync(COOKIES_FILE, JSON.stringify(await page.cookies()));
  } finally {
    // Close the browser even when the crawl fails part-way through.
    await browser.close();
  }

  fs.writeFileSync(DATA_FILE, JSON.stringify(data));
  console.log('Crawling complete.');
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});