-
Notifications
You must be signed in to change notification settings - Fork 1
/
check_progress.py
145 lines (111 loc) · 6.56 KB
/
check_progress.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import pandas as pd
from email.mime.text import MIMEText
import smtplib
import sys
import os
def check_job_card_urls(out_dir=None, gp_path=None):
    """Report whether every study panchayat has at least one job card url.

    Reads ``<out_dir>/job_card_urls.csv`` (header-less, columns
    ``job_card,url``) and the panchayat list at ``gp_path`` (header-less,
    columns ``district_name,block_name,panchayat_name,panchayat_code``),
    counts the distinct job cards found per ``panchayat_code`` and lists the
    panchayats with none.

    Args:
        out_dir: Directory holding ``job_card_urls.csv``. Defaults to the
            module-level ``output_dir`` assigned in the ``__main__`` block.
        gp_path: Path of the panchayat list. Defaults to the module-level
            ``gp_file`` assigned in the ``__main__`` block.

    Returns:
        A CRLF-delimited text section for the progress report.

    Raises:
        SystemExit: If the panchayat list is missing or empty.
    """
    # Fall back to the script-level settings so existing zero-argument
    # callers keep working.
    if out_dir is None:
        out_dir = output_dir
    if gp_path is None:
        gp_path = gp_file

    msg = 'Checking the list of job card urls...\r\n'

    urls_path = out_dir + '/job_card_urls.csv'
    if os.path.isfile(urls_path) and os.path.getsize(urls_path) > 0:
        job_card_urls = pd.read_csv(urls_path,
                                    header=None,
                                    names=['job_card', 'url'])
    else:
        job_card_urls = pd.DataFrame({'job_card': [], 'url': []})

    if os.path.isfile(gp_path) and os.path.getsize(gp_path) > 0:
        gp_list = pd.read_csv(gp_path,
                              names=['district_name', 'block_name', 'panchayat_name', 'panchayat_code'],
                              usecols=['district_name', 'block_name', 'panchayat_name', 'panchayat_code'],
                              dtype={'panchayat_code': object})
    else:
        sys.exit('GP input file not found or empty')

    # Pull the panchayat_code query parameter out of each url. A url without
    # the parameter (or a missing url) yields NaN instead of raising, and the
    # groupby below silently drops NaN keys.
    codes = job_card_urls['url'].astype(str).str.extract(r'panchayat_code=([^&]*)', expand=False)
    # Keep the key as object dtype so it matches gp_list.panchayat_code.
    job_card_urls['panchayat_code'] = codes.astype(object)

    # Number of distinct job cards seen for each panchayat code.
    cards_per_gp = (job_card_urls[['panchayat_code', 'job_card']]
                    .drop_duplicates()
                    .groupby(['panchayat_code'])
                    .count()
                    .reset_index())

    # Left join from the study list: panchayats with no urls come out as NaN,
    # which is turned into a count of 0.
    job_card_counts = pd.merge(gp_list[['panchayat_code']], cards_per_gp, how='left', on='panchayat_code')
    job_card_counts = job_card_counts.fillna(0)

    missing = job_card_counts[job_card_counts.job_card == 0]
    if len(missing.index) == 0:
        msg += 'List of job card urls was populated for all panchayats\r\n'
    else:
        msg += 'WARNING: list of job card urls doesn\'t contain all the study panchayats\r\nNeed to restart the scrape\r\n\r\n'
        msg += missing.to_string()
        msg += '\r\n'
    msg += '\r\n'
    return msg
def check_job_card_scrape(out_dir=None):
    """Report how many of the listed job cards have been scraped so far.

    Compares ``<out_dir>/job_card_urls.csv`` (header-less, columns
    ``job_card,url``) against the ``job_card_number`` column of
    ``<out_dir>/jobcard.csv``. A missing or empty file is treated as having
    no rows.

    Args:
        out_dir: Directory holding both files. Defaults to the module-level
            ``output_dir`` assigned in the ``__main__`` block.

    Returns:
        A CRLF-delimited text section for the progress report.
    """
    # Fall back to the script-level setting so existing zero-argument callers
    # keep working.
    if out_dir is None:
        out_dir = output_dir

    msg = 'Checking the progress of the job card scrape against the list of job card urls...\r\n'

    scraped_path = out_dir + '/jobcard.csv'
    if os.path.isfile(scraped_path) and os.path.getsize(scraped_path) > 0:
        jobcards = pd.read_csv(scraped_path, encoding='utf-8', usecols=['job_card_number'], dtype={'job_card_number': object})
        # Headers get appended every time the scraper runs; drop those rows.
        jobcards = jobcards[jobcards['job_card_number'] != 'job_card_number']
    else:
        jobcards = pd.DataFrame({'job_card_number': []}, dtype=object)

    urls_path = out_dir + '/job_card_urls.csv'
    if os.path.isfile(urls_path) and os.path.getsize(urls_path) > 0:
        # Read the key as object so it has the same dtype as job_card_number;
        # otherwise all-numeric ids are inferred as integers and cannot be
        # merged against the string ids from jobcard.csv.
        job_card_urls = pd.read_csv(urls_path, header=None, names=['job_card', 'url'], dtype={'job_card': object})
    else:
        job_card_urls = pd.DataFrame({'job_card': [], 'url': []}, dtype=object)

    # Left join from the url list: rows with no scraped counterpart have a
    # null job_card_number.
    jc_df = pd.merge(job_card_urls, jobcards.drop_duplicates(), how='left', left_on='job_card', right_on='job_card_number')
    jc_total = len(jc_df.index)
    jc_pending = int(pd.isnull(jc_df.job_card_number).sum())

    if jc_pending == 0:
        msg += 'All {} of the job cards have been scraped\r\n'.format(jc_total)
    else:
        # jc_total is at least 1 here because at least one row is pending.
        jc_scraped = jc_total - jc_pending
        jc_pct = (float(jc_scraped) / float(jc_total)) * 100
        msg += '{} of {} job cards have been scraped ({:.1f}%)\r\n'.format(jc_scraped, jc_total, jc_pct)
    msg += '\r\n'
    return msg
def check_muster_scrape(out_dir=None):
    """Report how many of the encountered muster roll urls have been scraped.

    Compares ``<out_dir>/encountered_muster_links.csv`` (header-less, columns
    ``job_card,url,msr_no,muster_url,work_code``) against the
    ``work_code``/``msr_no`` pairs in ``<out_dir>/muster.csv``. A missing or
    empty file is treated as having no rows.

    Args:
        out_dir: Directory holding both files. Defaults to the module-level
            ``output_dir`` assigned in the ``__main__`` block.

    Returns:
        A CRLF-delimited text section for the progress report.
    """
    # Fall back to the script-level setting so existing zero-argument callers
    # keep working.
    if out_dir is None:
        out_dir = output_dir

    msg = 'Checking the progress of the muster roll scrape against the list of encountered muster urls...\r\n'
    msg += 'Note: list of encountered muster roll urls is populated from the job card pages and will grow until all job cards are scraped\r\n'

    muster_path = out_dir + '/muster.csv'
    if os.path.isfile(muster_path) and os.path.getsize(muster_path) > 0:
        musters = pd.read_csv(muster_path, encoding='utf-8', usecols=['work_code', 'msr_no'], dtype={'work_code': object, 'msr_no': object})
        # When the scraper restarts it writes an extra header row; drop those.
        musters = musters[musters.work_code != 'work_code']
    else:
        musters = pd.DataFrame({'work_code': [], 'msr_no': []}, dtype=object)
    # Marker column: after the left join below it is null exactly for the
    # encountered musters that have no scraped counterpart.
    musters = musters.drop_duplicates().assign(right=1)

    links_path = out_dir + '/encountered_muster_links.csv'
    # Same guard as the other inputs: a zero-byte file would make read_csv
    # raise, so treat it like a missing file.
    if os.path.isfile(links_path) and os.path.getsize(links_path) > 0:
        encountered_muster_links = pd.read_csv(links_path,
                                               header=None,
                                               names=['job_card', 'url', 'msr_no', 'muster_url', 'work_code'],
                                               usecols=['msr_no', 'work_code', 'muster_url'],
                                               encoding='utf-8',
                                               dtype={'work_code': object, 'msr_no': object, 'muster_url': object})
    else:
        encountered_muster_links = pd.DataFrame({'msr_no': [], 'muster_url': [], 'work_code': []}, dtype=object)

    mr_df = pd.merge(encountered_muster_links, musters, how='left', on=['msr_no', 'work_code'])
    mr_total = len(mr_df.index)
    mr_pending = int(pd.isnull(mr_df.right).sum())

    if mr_pending == 0:
        msg += 'All {} of the encountered muster roll urls have been scraped\r\n'.format(mr_total)
    else:
        # mr_total is at least 1 here because at least one row is pending.
        mr_scraped = mr_total - mr_pending
        mr_pct = (float(mr_scraped) / float(mr_total)) * 100
        msg += '{} of {} encountered muster roll urls have been scraped ({:.1f}%)\r\n'.format(mr_scraped, mr_total, mr_pct)
    msg += '\r\n'
    return msg
def send_email(email_recipients, msg_string, password=None):
    """Email the progress report through the Mailgun SMTP relay.

    Args:
        email_recipients: List of recipient addresses; must not be empty.
        msg_string: Plain-text body of the message.
        password: SMTP password. When omitted it is read from
            ``password.txt`` in the current working directory.

    Raises:
        ValueError: If ``email_recipients`` is empty.
        IOError: If no password is given and ``password.txt`` cannot be read.
        smtplib.SMTPException: If the SMTP conversation fails.
    """
    # Fail before touching the network: smtplib only rejects an empty
    # recipient list after the connection and login have already happened.
    if not email_recipients:
        raise ValueError('email_recipients must contain at least one address')

    # NOTE(review): a literal credential used to sit in a comment here. Keep
    # secrets out of the source file, and rotate that one if it was ever real.
    if password is None:
        with open('password.txt', 'r') as f:
            # Strip the trailing newline an editor usually leaves behind.
            password = f.read().strip()

    msg = MIMEText(msg_string)
    msg['Subject'] = 'FBA Scrape Progress'
    msg['From'] = 'FBA Progress Tracker <[email protected]>'
    msg['To'] = ','.join(email_recipients)

    s = smtplib.SMTP('smtp.mailgun.org', 587)
    try:
        # Port 587 is the submission port: upgrade to TLS before sending
        # credentials.
        s.starttls()
        s.login('[email protected]', password)
        s.sendmail('[email protected]', email_recipients, msg.as_string())
    finally:
        try:
            s.quit()
        except smtplib.SMTPServerDisconnected:
            # The connection is already gone; do not mask the original error.
            pass
if __name__ == '__main__':
    # Script-level settings. output_dir and gp_file are read as globals by
    # the check_* functions, so they must keep these names.
    input_dir = './input'
    output_dir = './full_output'
    gp_file = input_dir + '/gp_list.csv'

    # Addresses that receive the progress report.
    email_recipients = []

    # Build the report one section at a time, in the order it should read.
    report_sections = [
        check_job_card_urls(),
        check_job_card_scrape(),
        check_muster_scrape(),
    ]
    msg_string = ''.join(report_sections)

    send_email(email_recipients, msg_string)