# Language: Ruby, Level: Level 3
# A web crawler in Ruby
#
# This script provides a generic Spider class for crawling URLs and
# recording data scraped from websites. The Spider is meant to be used in
# collaboration with a "processor" class that defines which pages to
# visit and how data from those pages should be consumed.
#
# Usage:
# spider = Techpoint.new
# spider.results.take(10)
# => [{...}, {...}, ...]
#
# Requirements:
# Ruby 2.0+
#
# Based on Rossta's blog post about web crawlers
#
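# A processor is any object that responds to #root (the starting URL),
# #handler (the symbol of the first handler method), and one method per
# handler that accepts a Mechanize page plus a data hash. A minimal,
# hypothetical sketch (ExampleProcessor is not part of this script):
#
#   class ExampleProcessor
#     def root
#       "https://example.com/"
#     end
#
#     def handler
#       :process_index
#     end
#
#     def process_index(page, data = {})
#       spider.record(title: page.title)
#     end
#
#     def spider
#       @spider ||= Spider.new(self)
#     end
#   end
#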
require 'mechanize'
require 'pry'
require 'fast_gettext'
class Spider
  REQUEST_INTERVAL = 1
  MAX_URLS = 10

  attr_reader :handlers

  def initialize(processor, options = {})
    @processor = processor
    @urls = []
    @results = []
    @handlers = {}
    @interval = options.fetch(:interval, REQUEST_INTERVAL)
    @max_urls = options.fetch(:max_urls, MAX_URLS)
    enqueue(@processor.root, @processor.handler)
  end

  # Attach a URL to its handler and add it to the running list.
  # This builds the queue of URLs we will iterate through to grab data.
  #
  def enqueue(url, method, data = {})
    return if @handlers[url]
    @urls << url
    @handlers[url] ||= { method: method, data: data }
  end
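
  # For example, a processor's handler can grow the queue while crawling
  # (a sketch; the URL and :process_page handler here are hypothetical):
  #
  #   spider.enqueue("https://example.com/page/2", :process_page, source: :pagination)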

  # Record a hash of scraped data.
  def record(data = {})
    @results << data
  end

  # Visit each enqueued URL, dispatch the fetched page to its handler on the
  # processor, and yield each newly recorded result as it arrives. Without a
  # block, returns an Enumerator.
  def results
    return enum_for(:results) unless block_given?
    i = @results.length
    enqueued_urls.each do |url, handler|
      begin
        log "Handling", url.inspect
        @processor.send(handler[:method], agent.get(url), handler[:data])
        if block_given? && @results.length > i
          yield @results.last
          i += 1
        end
      rescue => ex
        log "Error", "#{url.inspect}, #{ex}"
      end
      sleep @interval if @interval > 0
    end
  end
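
  # Usage sketch: results can be consumed as an Enumerator or streamed with a
  # block as each record is scraped.
  #
  #   spider.results.take(3)                     # first three recorded hashes
  #   spider.results { |row| warn row.inspect }  # handle rows as they arrive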

  private

  # Lazily yield each enqueued URL with its handler. The queue is re-read on
  # every pass, so URLs appended by handlers during the crawl are picked up,
  # subject to the @max_urls cap.
  def enqueued_urls
    Enumerator.new do |y|
      index = 0
      while index < @urls.count && index <= @max_urls
        url = @urls[index]
        index += 1
        next unless url
        y.yield url, @handlers[url]
      end
    end
  end

  # Log progress and errors to stderr.
  def log(label, info)
    warn "%-10s: %s" % [label, info]
  end

  # Shared Mechanize agent for all requests.
  def agent
    @agent ||= Mechanize.new
  end
end
# Wrap Spider and extract data from the crawled pages.
class Techpoint
  attr_reader :root, :handler

  def initialize(root: "https://techpoint.org/", handler: :process_index, **options)
    @root = root
    @handler = handler
    @options = options
  end

  # Alternative root for crawling ProgrammableWeb's API directory:
  #
  # def initialize(root: "https://programmableweb.com/apis/directory", handler: :process_index, **options)
  #   @root = root
  #   @handler = handler
  #   @options = options
  # end

  # Follow pagination links and enqueue each Twitter link for processing.
  def process_index(page, data = {})
    page.links_with(href: /\?page=\d+/).each do |link|
      spider.enqueue(link.href, :process_index)
    end

    page.links_with(href: %r{/twitter\b}).each do |link|
      spider.enqueue(link.href, :process_twitter, name: link.text)
    end
  end

  # def process_api(page, data = {})
  #   categories = page.search("article.node-api .tags").first.text.strip.split(/\s+/)
  #   fields = page.search("#tabs-content .field").each_with_object({}) do |tag, results|
  #     key = tag.search("label").text.strip.downcase.gsub(/[^\w]+/, ' ').gsub(/\s+/, "_").to_sym
  #     val = tag.search("span").text
  #     results[key] = val
  #   end
  # end

  # Planning notes:
  # Hit Google and search for DemandWell.
  # Go through the top 100 result sites and count occurrences of the keywords
  # (demandwell, demand well, SEO, distance from home base) on each; that
  # count should give a picture of the content, or at least I think it might.
  # List out the URLs that contain SEO.
  # Figure out what type of company each website belongs to, potential
  # customer or competitor, and only keep potential customers.
  #
  # Alternatively, search for SEO and see how many links contain DemandWell.
  # Set this to run automatically at a ____ interval.

  # Scrape category tags and labeled fields from a page and record them.
  def process_twitter(page, data = {})
    categories = page.search("article.node-api .tags").first.text.strip.split(/\s+/)
    fields = page.search("#tabs-content .field").each_with_object({}) do |tag, results|
      key = tag.search("label").text.strip.downcase.gsub(/[^\w]+/, ' ').gsub(/\s+/, "_").to_sym
      val = tag.search("span").text
      results[key] = val
    end
    spider.record data.merge(fields).merge(categories: categories)
  end

  # Enumerate (or stream) the spider's results.
  def results(&block)
    spider.results(&block)
  end

  private

  def spider
    @spider ||= Spider.new(self, @options)
  end
end
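
# Options are passed through to Spider, so a slower crawl with a larger queue
# might look like this (values are illustrative):
#
#   spider = Techpoint.new(interval: 2, max_urls: 50)
#   spider.results.take(10)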

if __FILE__ == $0
  spider = Techpoint.new

  # Print the first five results to stderr as they are scraped.
  spider.results.lazy.take(5).each_with_index do |result, i|
    warn "%-2d: %s" % [i, result.inspect]
  end
end