diff --git a/crawler.js b/crawler.js
index b248dbc..e6de6e7 100644
--- a/crawler.js
+++ b/crawler.js
@@ -2,17 +2,101 @@
  * Created by tushar on 13/09/17.
  */
 
-'use strict'
+"use strict";
+const request = require("superagent");
+const cheerio = require("cheerio");
 
-/**
- * Crawls a website using a start {url}, and returns the lexicographically smallest string.
- * @param url
- * @return {Promise.}
- */
-module.exports = url =>
-  new Promise((resolve, reject) => {
-    /**
-     * TODO: Write your high performance code here.
-     */
-    reject(new Error('NotImplemented'))
-  })
+/**
+ * Folds the codes found on one page into the running minimum.
+ *
+ * @param {string[]} codes - Code strings extracted from one page.
+ * @param {string|undefined} currentSmallest - Smallest code seen so far, or
+ *   `undefined` when no code has been seen yet.
+ * @return {string|undefined} The smallest of `currentSmallest` and `codes`.
+ */
+function getSmallestString(codes, currentSmallest) {
+  return codes.reduce(
+    (smallest, code) =>
+      smallest === undefined || code < smallest ? code : smallest,
+    currentSmallest
+  );
+}
+
+/**
+ * Collects the `href` attribute of every `.link` element on the page.
+ *
+ * @param {Function} $ - Cheerio instance loaded with the page markup.
+ * @return {string[]} The hrefs found on the page.
+ */
+function getPaths($) {
+  return $(".link")
+    .map((i, elem) => $(elem).attr("href"))
+    .get();
+}
+
+/**
+ * Collects the text of every `h1` nested under a `.codes` element.
+ *
+ * @param {Function} $ - Cheerio instance loaded with the page markup.
+ * @return {string[]} The code strings found on the page.
+ */
+function getCodes($) {
+  return $(".codes h1")
+    .map((i, elem) => $(elem).text())
+    .get();
+}
+
+/**
+ * Fetches `url` with superagent and returns the response text.
+ *
+ * @param {string} url - URL to request.
+ * @return {Promise<string>} Resolves with `response.text`; rejects if the
+ *   request fails.
+ */
+async function makeReq(url) {
+  const response = await request.get(url);
+  return response.text;
+}
+
+/**
+ * Crawls the page at `rootUrl + hash`, then every not-yet-visited page it
+ * links to, folding each page's codes into `state.smallest`.
+ *
+ * @param {string} hash - Path appended verbatim to `rootUrl`.
+ * @param {string} rootUrl - URL the crawl started from.
+ * @param {{visited: Set<string>, smallest: (string|undefined)}} state -
+ *   Per-crawl state; mutated in place.
+ * @return {Promise<void>} Rejects if any request fails.
+ */
+async function crawlPage(hash, rootUrl, state) {
+  // Record the page before following its links so link cycles terminate.
+  state.visited.add(hash);
+  const html = await makeReq(`${rootUrl}${hash}`);
+  const $ = cheerio.load(html);
+  state.smallest = getSmallestString(getCodes($), state.smallest);
+
+  // Children are awaited one at a time, so requests never overlap.
+  for (const childPath of getPaths($)) {
+    if (!state.visited.has(childPath)) {
+      await crawlPage(childPath, rootUrl, state);
+    }
+  }
+}
+
+/**
+ * Crawls a website using a start {url}, and returns the lexicographically
+ * smallest string.
+ *
+ * All state is created per call, so concurrent crawls do not interfere.
+ *
+ * @param {string} url - Start URL; link hrefs are appended to it verbatim.
+ * @return {Promise<string|undefined>} Resolves with the smallest code found,
+ *   or `undefined` if no page contained one. Rejects if any request fails.
+ */
+module.exports = async function(url) {
+  // A Set rather than a plain object: `"constructor" in {}` is true, so
+  // hrefs matching Object.prototype keys would be skipped as "visited".
+  const state = { visited: new Set(), smallest: undefined };
+  await crawlPage("", url, state);
+  return state.smallest;
+};
diff --git a/package.json b/package.json
index 0631069..81d31b3 100644
--- a/package.json
+++ b/package.json
@@ -10,10 +10,12 @@
   "author": "",
   "license": "ISC",
   "dependencies": {
+    "cheerio": "^1.0.0-rc.2",
     "express": "^4.15.4",
     "express-rate-limit": "^2.9.0",
     "mocha": "^3.5.3",
     "nodemon": "^1.12.0",
-    "pug": "^2.0.0-rc.4"
+    "pug": "^2.0.0-rc.4",
+    "superagent": "^3.6.0"
   }
 }