"""
This is an example web scraper for wellfound.com.
To run this scraper, set the env variable $SCRAPFLY_KEY with your Scrapfly API key:
$ export SCRAPFLY_KEY="your key from https://scrapfly.io/dashboard"
"""
import os
import json
from typing import Dict, List, Optional, TypedDict
from copy import deepcopy
from loguru import logger as log
from scrapfly import ScrapeConfig, ScrapflyClient, ScrapeApiResponse
SCRAPFLY = ScrapflyClient(key=os.environ["SCRAPFLY_KEY"])
BASE_CONFIG = {
# bypass wellfound.com web scraping blocking
"asp": True,
# set the proxy country to US
"country": "US",
}


class JobData(TypedDict):
"""type hint for scraped job result data"""
id: str
title: str
slug: str
    remote: bool
primaryRoleTitle: str
locationNames: Dict
liveStartAt: int
jobType: str
description: str
# there are more fields, but these are basic ones


class CompanyData(TypedDict):
"""type hint for scraped company result data"""
id: str
badges: list
companySize: str
highConcept: str
highlightedJobListings: List[JobData]
logoUrl: str
name: str
slug: str
# there are more fields, but these are basic ones


def extract_apollo_state(result: ScrapeApiResponse):
"""extract apollo state graph from a page"""
data = result.selector.css("script#__NEXT_DATA__::text").get()
    if data is None:
return
data = json.loads(data)
graph = data["props"]["pageProps"]["apolloState"]["data"]
return graph


def unpack_node_references(node, graph, debug=False):
"""
unpacks references in a graph node to a flat node structure:
>>> unpack_node_references({"field": {"id": "reference1", "type": "id"}}, graph={"reference1": {"foo": "bar"}})
{'field': {'foo': 'bar'}}
"""
def flatten(value):
try:
if value["type"] != "id":
return value
except (KeyError, TypeError):
return value
data = deepcopy(graph[value["id"]])
# flatten nodes too:
if data.get("node"):
data = flatten(data["node"])
if debug:
data["__reference"] = value["id"]
return data
node = flatten(node)
for key, value in node.items():
if isinstance(value, list):
node[key] = [flatten(v) for v in value]
elif isinstance(value, dict):
node[key] = unpack_node_references(value, graph)
return node


def parse_company(result: ScrapeApiResponse) -> CompanyData:
"""parse company data from wellfound.com company page"""
graph = extract_apollo_state(result)
company = None
for key in graph:
if key.startswith("Startup:"):
company = graph[key]
break
else:
raise ValueError("no embedded company data could be found")
return unpack_node_references(company, graph)


async def retry_failure(url: str, _retries: int = 0):
"""retry failed requests with a maximum number of retries"""
max_retries = 3
try:
response = await SCRAPFLY.async_scrape(
ScrapeConfig(url, **BASE_CONFIG, render_js=True, proxy_pool="public_residential_pool")
)
if response.status_code == 403:
if _retries < max_retries:
log.debug("Retrying failed request")
return await retry_failure(url, _retries=_retries + 1)
else:
                raise Exception("Unable to scrape the first search page, max retries exceeded")
return response
except Exception as e:
if _retries < max_retries:
log.debug("Retrying failed request")
return await retry_failure(url, _retries=_retries + 1)
else:
            raise Exception("Unable to scrape the first search page, max retries exceeded")


async def scrape_search(role: str = "", location: str = "", max_pages: Optional[int] = None) -> List[CompanyData]:
"""scrape wellfound.com search"""
    # wellfound.com has 3 types of search URLs: by role, by location, and by role + location combined
if role and location:
url = f"https://wellfound.com/role/l/{role}/{location}"
elif role:
url = f"https://wellfound.com/role/{role}"
elif location:
url = f"https://wellfound.com/location/{location}"
else:
raise ValueError("need to pass either role or location argument to scrape search")
companies = []
log.info(f"scraping first page of search, {role} in {location}")
first_page = await retry_failure(url)
graph = extract_apollo_state(first_page)
companies.extend([unpack_node_references(graph[key], graph) for key in graph if key.startswith("StartupResult")])
    # find the total page count from the embedded search metadata
    seo_landing_key = next(key for key in graph["ROOT_QUERY"]["talent"] if "seoLandingPageJobSearchResults" in key)
    total_pages = graph["ROOT_QUERY"]["talent"][seo_landing_key]["pageCount"]
    # limit the crawl to the max_pages argument if provided
    if max_pages and max_pages < total_pages:
        total_pages = max_pages
# next, scrape the remaining search pages directly from the API
    log.info(f"scraping remaining {total_pages - 1} pages of search results")
other_pages = [ScrapeConfig(url + f"?page={page}", **BASE_CONFIG) for page in range(2, total_pages + 1)]
async for response in SCRAPFLY.concurrent_scrape(other_pages):
try:
graph = extract_apollo_state(response)
companies.extend([unpack_node_references(graph[key], graph) for key in graph if key.startswith("StartupResult")])
except Exception as e:
            log.debug(f"Error occurred while crawling search: {e}")
    log.success(f"scraped {len(companies)} company results from search pages")
return companies


async def scrape_companies(urls: List[str]) -> List[CompanyData]:
"""scrape wellfound.com companies"""
to_scrape = [ScrapeConfig(url, **BASE_CONFIG) for url in urls]
companies = []
async for response in SCRAPFLY.concurrent_scrape(to_scrape):
try:
companies.append(parse_company(response))
        except Exception as e:
            log.debug(f"Error occurred while parsing company page: {e}")
    log.success(f"scraped {len(companies)} company listings from company pages")
return companies
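

# Example usage (a minimal sketch, not part of the original scraper): run a small
# search crawl, then scrape the first few company pages it found. The role,
# location, result counts, output file name, and the assumed company URL pattern
# (https://wellfound.com/company/<slug>) below are illustrative assumptions.
if __name__ == "__main__":
    import asyncio

    async def example_run():
        # scrape the first two search result pages for an assumed example query
        search_results = await scrape_search(role="python-developer", location="san-francisco", max_pages=2)
        # follow up on the first three companies found (company URL pattern is assumed)
        company_urls = [f"https://wellfound.com/company/{company['slug']}" for company in search_results[:3]]
        companies = await scrape_companies(company_urls)
        # dump everything to a local JSON file for inspection
        with open("results.json", "w", encoding="utf-8") as file:
            json.dump({"search": search_results, "companies": companies}, file, indent=2, ensure_ascii=False)

    asyncio.run(example_run())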