base_spider.py
import datetime
import logging
from abc import ABC
from datetime import timedelta
from typing import Dict, List

import scrapy
from rows.utils.date import date_range

from covid19br.common.constants import State, ReportQuality
from covid19br.common.data_normalization_utils import NormalizationUtils
from covid19br.common.models.full_report import BulletinModel, FullReportModel
from covid19br.common.warnings import WarningType

logger = logging.getLogger(__name__)


class BaseCovid19Spider(scrapy.Spider, ABC):
    """
    Responsible for defining the interface with the crawler engine (scrapy)
    and the brasil.io backend.
    Knows how to gather the covid19 data for a state on any date.
    """

    normalizer = NormalizationUtils
    custom_settings = {
        "USER_AGENT": (
            "Brasil.IO - Scraping para libertacao de dados da Covid 19 | "
            "Mais infos em: https://brasil.io/dataset/covid19/"
        )
    }

    # Your spider should override these values
    state: State = None
    report_qualities: List[ReportQuality]
    # The information delay is the number of days between the date the data
    # refers to and the date the official source publishes it. For example, a
    # report published on day 20 may contain data referring to day 19, which
    # means 1 day of information delay. If the data gathered by your scraper
    # really represents the numbers for the day the bulletin was made
    # available, keep this value at 0.
    information_delay_in_days: int = 0
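
    # Illustrative example (hypothetical dates): with information_delay_in_days
    # set to 1, a user asking for 2021-05-19 makes the spider fetch the
    # bulletin published on 2021-05-20 (requested date + delay), and the
    # resulting report's reference_date is shifted back to 2021-05-19 in
    # _get_or_create_report.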

    def __init__(
        self,
        reports: Dict[datetime.date, FullReportModel],
        start_date: datetime.date = None,
        end_date: datetime.date = None,
        dates_list: List[datetime.date] = None,
        *args,
        **kwargs
    ):
        super().__init__(*args, **kwargs)
        if dates_list and (start_date or end_date):
            raise ValueError(
                "The parameter 'dates_list' cannot be combined with "
                "'start_date' or 'end_date'."
            )
        self.today = datetime.date.today()
        tomorrow = self.today + timedelta(days=1)
        if not dates_list and not start_date and not end_date:
            # If no date filter is passed, by default we gather the most recent
            # data, whatever its reference date is.
            self.start_date = self.today
            self.end_date = tomorrow
            self.requested_dates = [self.today]
        else:
            # If there are date filters, the user is searching for data from
            # specific dates, so we add the information delay to the dates to
            # account for the difference between the bulletin's publishing date
            # and the date it refers to, in order to get the correct data.
            if dates_list:
                self.start_date = min(dates_list)
                self.end_date = max(dates_list)
                requested_dates = dates_list
            else:
                self.start_date = start_date if start_date else self.today
                self.end_date = (
                    tomorrow if not end_date or end_date > tomorrow else end_date
                )
                requested_dates = date_range(self.start_date, self.end_date)
            delay = timedelta(days=self.information_delay_in_days)
            self.requested_dates = [date + delay for date in requested_dates]

        # The following variable stores the results of the scraping so they can
        # be used outside the spider - it will hold one report per date, except
        # for dates whose report doesn't exist.
        if reports is None:
            raise ValueError("'reports' must not be None")
        self.reports = reports

        self.pre_init()
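
    # Illustrative call (hypothetical runner code and dates): a runner
    # collecting a two-day window could instantiate a concrete subclass like
    #     reports = {}
    #     SomeStateSpider(
    #         reports=reports,
    #         start_date=datetime.date(2021, 5, 18),
    #         end_date=datetime.date(2021, 5, 20),
    #     )
    # and read the FullReportModel instances out of `reports` after the crawl.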

    def add_new_bulletin_to_report(self, bulletin: BulletinModel, date: datetime.date):
        report = self._get_or_create_report(date)
        report.add_new_bulletin(bulletin)

    def add_warning_in_report(
        self, date: datetime.date, slug: WarningType, description: str = None
    ):
        report = self._get_or_create_report(date)
        report.add_warning(slug, description)

    def _get_or_create_report(self, date: datetime.date):
        report = self.reports.get(date)
        if not report:
            report = FullReportModel(
                reference_date=date - timedelta(days=self.information_delay_in_days),
                published_at=date,
                state=self.state,
                qualities=self.report_qualities,
            )
            self.reports[date] = report
        return report

    def pre_init(self) -> None:
        """
        You can [optionally] implement this method if you need to do something
        before the crawler starts.
        """