diff --git a/crawler.js b/crawler.js
index b248dbc..be59d05 100644
--- a/crawler.js
+++ b/crawler.js
@@ -1,5 +1,5 @@
 /**
- * Created by tushar on 13/09/17.
+ * Created by Sneha on 16/09/17.
  */
 
 'use strict'
@@ -9,10 +9,157 @@
  * @param url
  * @return {Promise.<string>}
  */
+
+const http = require('http');
+const https = require('https');
+const { URL } = require('url');
+const request = require('request');
+
+// Attempts allowed per page before the whole crawl is rejected.
+const MAX_ATTEMPTS = 10;
+// Base back-off in milliseconds; the n-th retry waits n times this long.
+const RETRY_DELAY_MS = 100;
+
+// Both patterns expose their payload as capture group 1.
+const LINK_REGEX = /href="(.*?)"/g;
+const TAG_REGEX = /<h1>(.*?)<\/h1>/g;
+
+// Keep-alive agents, one per protocol, so sockets are reused between requests.
+const AGENT_OPTIONS = { keepAlive: true, keepAliveMsecs: 30000, maxSockets: 50 };
+const agents = {
+  'http:': new http.Agent(AGENT_OPTIONS),
+  'https:': new https.Agent(AGENT_OPTIONS)
+};
+
+/**
+ * Collects the first capture group of every match of `regex` in `str`.
+ * @param {string} str text to scan
+ * @param {RegExp} regex pattern with at least one capture group
+ * @return {Array.<string>} captured substrings in order of appearance
+ */
+const getMatches = (str, regex) => {
+  // Without the `g` flag exec() never advances and the loop would not end.
+  const scanner = regex.global ? regex : new RegExp(regex.source, regex.flags + 'g');
+  scanner.lastIndex = 0;
+  const captures = [];
+  let m;
+  while ((m = scanner.exec(str)) !== null) {
+    // Step past zero-width matches so exec() cannot return the same one again.
+    if (m.index === scanner.lastIndex) {
+      scanner.lastIndex++;
+    }
+    if (m.length > 1) {
+      captures.push(m[1]);
+    }
+  }
+  return captures;
+};
+
+/**
+ * Resolves an href found on `pageUrl` to an absolute, fragment-free URL.
+ * @param {string} href raw attribute value
+ * @param {string} pageUrl absolute URL of the page containing the link
+ * @param {string} origin origin the crawl is confined to
+ * @return {?string} absolute URL, or null when the link is malformed or off-site
+ */
+const resolveLink = (href, pageUrl, origin) => {
+  let target;
+  try {
+    target = new URL(href, pageUrl);
+  } catch (err) {
+    // Unparseable hrefs cannot be fetched, so they are skipped.
+    return null;
+  }
+  if (target.origin !== origin) {
+    return null;
+  }
+  // Fragments address the same document, so drop them before de-duplication.
+  target.hash = '';
+  return target.href;
+};
+
+/**
+ * Crawls every same-origin page reachable from `url` and resolves with the
+ * lexicographically smallest `<h1>` text found, or `undefined` when no page has one.
+ * Rejects when `url` is invalid or any page ultimately fails to load.
+ * @param {string} url absolute http(s) URL to start from
+ * @return {Promise.<string>}
+ */
 module.exports = url =>
   new Promise((resolve, reject) => {
-    /**
-     * TODO: Write your high performance code here.
-     */
-    reject(new Error('NotImplemented'))
-  })
+    let startUrl;
+    try {
+      startUrl = new URL(url);
+    } catch (err) {
+      reject(new Error(`Invalid start URL: ${url}`));
+      return;
+    }
+    const agent = agents[startUrl.protocol];
+    if (!agent) {
+      reject(new Error(`Unsupported protocol: ${startUrl.protocol}`));
+      return;
+    }
+    startUrl.hash = '';
+    const origin = startUrl.origin;
+
+    // Every URL ever queued, so each page is fetched once.
+    const visited = new Set([startUrl.href]);
+    // Pages queued but not yet fully processed; the crawl is done at zero.
+    let pending = 1;
+    // Smallest heading seen so far; stays undefined until one is found.
+    let best;
+    let settled = false;
+
+    const fail = err => {
+      if (!settled) {
+        settled = true;
+        reject(err);
+      }
+    };
+
+    const crawl = (pageUrl, attempt) => {
+      if (settled) {
+        return;
+      }
+      request({ url: pageUrl, agent }, (error, response, body) => {
+        if (settled) {
+          return;
+        }
+        const status = response ? response.statusCode : undefined;
+        if (error || status !== 200) {
+          const reason = error ? error.message : `HTTP ${status}`;
+          // Transport errors, rate limiting and server errors may be transient.
+          const retryable = Boolean(error) || status === 429 || status >= 500;
+          if (!retryable || attempt >= MAX_ATTEMPTS) {
+            fail(new Error(`Failed to fetch ${pageUrl}: ${reason}`));
+            return;
+          }
+          setTimeout(() => crawl(pageUrl, attempt + 1), RETRY_DELAY_MS * attempt);
+          return;
+        }
+
+        getMatches(body, TAG_REGEX).forEach(heading => {
+          if (best === undefined || heading < best) {
+            best = heading;
+          }
+        });
+
+        getMatches(body, LINK_REGEX).forEach(href => {
+          const link = resolveLink(href, pageUrl, origin);
+          if (link !== null && !visited.has(link)) {
+            visited.add(link);
+            pending++;
+            crawl(link, 1);
+          }
+        });
+
+        pending--;
+        if (pending === 0) {
+          settled = true;
+          resolve(best);
+        }
+      });
+    };
+
+    crawl(startUrl.href, 1);
+  });
diff --git a/package.json b/package.json
index 0631069..8756b81 100644
--- a/package.json
+++ b/package.json
@@ -14,6 +14,7 @@
     "express-rate-limit": "^2.9.0",
     "mocha": "^3.5.3",
     "nodemon": "^1.12.0",
-    "pug": "^2.0.0-rc.4"
+    "pug": "^2.0.0-rc.4",
+    "request": "^2.81.0"
   }
 }