-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
137 lines (112 loc) · 6.98 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# main.py -- main
import argparse
import os
import logging
from collections import namedtuple
from datetime import datetime
# list of available jobs
from Jobs.AmbitofinancieroJob import AmbitofinancieroJob
from Jobs.ClarinJob import ClarinJob
from Jobs.InfobaeJob import InfobaeJob
from Jobs.La100Job import La100Job
from Jobs.LanacionJob import LanacionJob
from Jobs.Pagina12Job import Pagina12Job
from Jobs.RadiomitreJob import RadiomitreJob
from Jobs.TNJob import TNJob
from Jobs.DatabaseTestJob import DatabaseTestJob
from Jobs.ClarinReprocessJob import ClarinReprocessJob
# this dict() is a jump table look-alike used to parse and any new job added to the
# script should be added in here for the command line parser to know what to do.
# FIXME: There should be a more elegant way of doing this, by iterating over a list
# jobs, and accessing their .__name__ members
# Registry of runnable jobs, keyed by the lowercase name used on the command
# line with -j.  The key is derived from the class name minus its 'Job'
# suffix (e.g. ClarinJob -> 'clarin'), so registering a new job only requires
# appending its class to _job_classes below.  This resolves the old FIXME
# about iterating over the classes' __name__ members instead of maintaining
# a hand-written jump table.
_job_classes = (AmbitofinancieroJob,
                ClarinJob,
                InfobaeJob,
                La100Job,
                LanacionJob,
                Pagina12Job,
                RadiomitreJob,
                TNJob,
                DatabaseTestJob,
                ClarinReprocessJob,
                )
valid_jobs = {cls.__name__[:-3].lower(): cls for cls in _job_classes}
# ---- Command-line interpretation ------------------------------------------
# Build the epilog from the registered job classes directly (strip the 'Job'
# suffix) instead of looking each key back up in the dict.
epilog = "Jobs disponibles: " + ' '.join(cls.__name__[:-3] for cls in valid_jobs.values())
cmdline = argparse.ArgumentParser(description = "BSscrapper - Scrapper basado en Beautiful Soup v4", epilog = epilog)
cmdline.add_argument("-dc",
                     action = "store_true",
                     dest = "debug_commit",
                     help = "Toggle debugging mode on for database commits -- Database commits get tagged")
cmdline.add_argument("-dp",
                     action = "store_true",
                     dest = "debug_program",
                     help = "Toggle debugging mode on for program. Logger mode gets set to DEBUG")
cmdline.add_argument("-f",
                     action = "store_true",
                     dest = "file_commit",
                     help = "Commit result of scrapping to file, not to database. Here for debugging purposes")
cmdline.add_argument("-j",
                     action = "append",
                     dest = "jobs",
                     help = "Jobs a disparar. Uno por '-j'")
# Resulting namespace fields and their meaning:
#   debug_commit  : database commits get tagged as debugging data
#   debug_program : program becomes much more verbose (DEBUG log level)
#   file_commit   : dump scraped data to a file for inspection, not the DB
#   jobs          : list of job names to execute (one per -j occurrence)
params = cmdline.parse_args()
# Bundle the three debugging switches into one immutable record:
#   commit      -> flag database commits as debug (MongoDB: a dbg flag;
#                  SQL: commit to a separate *_dbg table)
#   program     -> verbose program logging
#   file_commit -> persist to a file instead of the database
DebugTuple = namedtuple('DebugTuple', ['commit', 'program', 'file_commit'])
dbg = DebugTuple(params.debug_commit, params.debug_program, params.file_commit)
if dbg.program:
    print(f"cmdline:\n\t{cmdline}\nparams:\n\t{params}")
# Nothing requested on the command line -> point at the help text and leave.
if not params.jobs:
    print("Nada para hacer...")
    print(cmdline.prog, "-h para ver opciones y jobs disponibles")
    exit(0)
# ---- Logging configuration ------------------------------------------------
# One log file per run, stored under ./logs/<YYYYMMDD>/<YYYYMMDDHHMMSS>.log
d = datetime.now()                      # current local date and time of this run
# Build the day's log directory path portably instead of concatenating with '/'.
logpath = os.path.join(os.getcwd(), "logs", d.strftime('%Y%m%d'))
os.makedirs(logpath, exist_ok = True)   # recursive creation; no error if it already exists
logfilename = os.path.join(logpath, d.strftime('%Y%m%d%H%M%S') + ".log")
print('Logging in: ' + logfilename + '\n')
# Log level follows the -dp switch; direct truthiness test replaces the
# '== False' comparison of the original.
logging.basicConfig(level = logging.DEBUG if params.debug_program else logging.INFO,
                    encoding = 'utf-8',
                    filename = logfilename,
                    format = '%(levelname)s:%(asctime)s:%(funcName)s:%(lineno)d - %(message)s',
                    datefmt = '%Y-%m-%d %H:%M:%S')
# ---- Prepare and run the queue of jobs ------------------------------------
# Keep only the names that exist as keys in valid_jobs; warn about the rest.
job_list = []
for j in params.jobs:
    name = j.lower()                    # normalize once instead of per use
    if name in valid_jobs:
        # Instantiate the job, forwarding the debug-commit flag.
        job_list.append(valid_jobs[name](dbg = params.debug_commit))
        if params.debug_program:
            # NOTE(review): this commit-tagging message is gated on the
            # *program*-debug flag; confirm whether debug_commit was intended.
            logging.info("Database commits to be flagged as debug")
        logging.info('Adding job ' + valid_jobs[name].__name__ + ' to queue')
    else:
        print('Job ' + j + ' is unknown!!!')
        logging.warning('Job ' + j + ' is unknown!!!')
# No recognized job at all -> report on both channels and leave.
if not job_list:
    logging.error("Ninguno de los trabajos especificados es reconocido")
    logging.error("Nada para hacer...")
    print("Ninguno de los trabajos especificados es reconocido")
    print("Nada para hacer...")
    print(cmdline.prog, "-h para ver opciones y jobs disponibles")
    exit(0)
# Announce the recognized jobs, underlined for readability; build the banner
# once instead of recomputing the list-of-names string three times.
jobs_banner = "Jobs reconocidos :" + str([job.__class__.__name__ for job in job_list])[1:-1]
logging.info(jobs_banner)
logging.info('-' * len(jobs_banner))
print(jobs_banner)
print('-' * len(jobs_banner))
# Run each job: launch() does the scraping, store() persists the results
# (to a file when -f was given -- only store() cares about that switch).
for job in job_list:
    logging.info("Launching job " + job.__class__.__name__)
    print("Launching job " + job.__class__.__name__)
    job.launch()
    logging.info("Storing job " + job.__class__.__name__ + '\'s collected stuff')
    print("Storing job " + job.__class__.__name__ + '\'s collected stuff')
    job.store(params.file_commit)