-
Notifications
You must be signed in to change notification settings - Fork 1
/
somali.py
executable file
·239 lines (194 loc) · 7.61 KB
/
somali.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
#!/usr/bin/python
# encoding: utf-8
# author: Juliano Fischer Naves
# contact: julianofischer at gmail dot com
import sys
import os
import glob
import getpass
from datetime import datetime
import requests
import sqlite3
import argparse
from BeautifulSoup import BeautifulSoup
import lucene
from java.io import File
from java.io import StringReader
from java.lang import System
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.document import Document, Field, StringField, TextField
from org.apache.lucene.index import IndexWriter
from org.apache.lucene.index import IndexWriterConfig
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.index import IndexReader
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.util import Version
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search.highlight import Highlighter
from org.apache.lucene.search.highlight import QueryScorer
from org.apache.lucene.search.highlight import SimpleHTMLFormatter
from org.apache.lucene.search.highlight import SimpleSpanFragmenter
# Name of the SQLite database file (a relative path).
db_filename = 'one.db'
# Schema of the monthly_archive table: an auto-incremented integer id and
# the archive link as text. "if not exists" makes the statement repeatable.
db_schema = """create table if not exists monthly_archive (
id integer primary key autoincrement,
link text)
"""
class DBConnection(object):
    """Factory for connections to the SQLite database used by the script.

    Args:
        filename: Path of the SQLite file to open. When omitted, the
            module-level ``db_filename`` is used.
    """

    def __init__(self, filename=None):
        # The default is resolved at call time so callers may point the
        # factory at another database (for instance ':memory:').
        self.filename = db_filename if filename is None else filename

    def get_connection(self):
        """Open and return a new sqlite3 connection to ``self.filename``.

        Every call opens a fresh connection; the caller is responsible for
        closing it.
        """
        return sqlite3.connect(self.filename)
class MonthlyArchive(object):
    """A monthly mailing-list archive, identified by its link."""

    def __init__(self, link):
        # Link (href) of the archive.
        self.link = link
        # Database row id; stays None until something assigns it, so that
        # __repr__ never hits a missing attribute.
        self.id = None

    def __repr__(self):
        # %s instead of %d: renders integer ids identically and also copes
        # with an id that has not been set yet.
        return '[Doc id: %s - :%s]' % (self.id, self.link)
class MonthlyArchiveDAO(object):
    """Data-access object for rows of the ``monthly_archive`` table."""

    # Qmark placeholders: sqlite3 binds the link value itself, so a quote
    # (or SQL) inside a link can neither break nor alter the statement.
    SQL_SELECT = "SELECT * FROM monthly_archive WHERE link=?"
    SQL_INSERT = "INSERT INTO monthly_archive (link) VALUES (?)"

    def __init__(self):
        self.db_connection = DBConnection()

    def is_in_db(self, monthly_archive):
        """Check whether the entry is already stored.

        Args:
            monthly_archive: Object exposing the archive link as ``link``.

        Returns:
            True if a row with the same link exists, False otherwise.
        """
        conn = self.db_connection.get_connection()
        try:
            cur = conn.cursor()
            cur.execute(self.SQL_SELECT, (monthly_archive.link,))
            return bool(cur.fetchall())
        finally:
            # "with conn" would only commit/rollback; close explicitly.
            conn.close()

    def insert(self, monthly_archive):
        """Insert the entry's link as a new row.

        Args:
            monthly_archive: Object exposing the archive link as ``link``.
        """
        conn = self.db_connection.get_connection()
        try:
            # As a context manager the connection commits on success and
            # rolls back if the statement raises.
            with conn:
                conn.execute(self.SQL_INSERT, (monthly_archive.link,))
        finally:
            conn.close()
def createDB():
    """Create the SQLite database file and its table if they are missing.

    The schema statement is "create table if not exists", so it is executed
    unconditionally: checking only for the file's existence would skip the
    table creation when the file is present but the table is not.
    """
    conn = sqlite3.connect(db_filename)
    try:
        # Commits on success, rolls back if the statement raises.
        with conn:
            conn.execute(db_schema)
    finally:
        conn.close()
def successfully_logged(req):
    """Tell whether the login response looks like a successful login.

    Args:
        req: Response object exposing the page body as ``req.text``.

    Returns:
        True when the body does not contain the word 'Authentication',
        False when it does.
    """
    return 'Authentication' not in req.text
def do_login():
    """Ask for the credentials and post them to the private archive page.

    The e-mail is read from standard input and the password through
    ``getpass`` (no echo).

    Returns:
        The response object returned by ``requests.post``.
    """
    email = raw_input('Enter your registered e-mail\n')
    secret = getpass.getpass('Enter your password\n')
    # NOTE(review): verify=False turns off TLS certificate verification for
    # a request that carries the password - confirm this is still needed.
    return requests.post(
        'https://www.netlab.tkk.fi/mailman/private/theone/',
        data={'username': email, 'password': secret},
        verify=False)
def retrieve_monthly_archives(r):
    """Collect the monthly archive entries listed on the archive page.

    Args:
        r: Response whose ``text`` holds the HTML of the archive page.

    Returns:
        A list of MonthlyArchive objects, one for each anchor found inside
        a ``<td>`` whose ``href`` contains "txt", in document order.
    """
    soup = BeautifulSoup(r.text)
    archives = []
    for cell in soup.findAll("td"):
        for anchor in cell.findAll("a"):
            try:
                href = anchor["href"]
            except KeyError:
                # An anchor without href is not a link; skip it instead of
                # aborting the whole scan.
                continue
            if "txt" in href:
                archives.append(MonthlyArchive(href))
    return archives
def downloadDocument(monthly_archive, r):
    """Download one monthly archive and save it in the working directory.

    Args:
        monthly_archive: Entry whose ``link`` is appended to the archive
            base URL and also determines the local file name.
        r: Login response; its cookies are sent with the download request.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error
            status, so that an error page is never stored as an archive.
    """
    url = "https://www.netlab.tkk.fi/mailman/private/theone/"
    url += monthly_archive.link
    response = requests.get(url, verify=False, cookies=r.cookies)
    response.raise_for_status()
    # Local name: the link up to (not including) its first '.gz'.
    target = monthly_archive.link.split('.gz')[0]
    # Binary mode, because the payload written below is already encoded.
    with open(target, 'wb') as out:
        out.write(response.text.encode('utf-8'))
def lucene_indexing():
    """Build the Lucene index from every *.txt file of the working directory.

    The index is stored in the working directory as well. Each file becomes
    one document with two stored, analyzed fields: "text" (the file content)
    and "title" (the file path). The number of indexed documents is printed
    at the end.
    """
    lucene.initVM()
    index_dir = os.getcwd()
    directory = SimpleFSDirectory(File(index_dir))
    analyzer = StandardAnalyzer(Version.LUCENE_48)
    index_writer_config = IndexWriterConfig(Version.LUCENE_48, analyzer)
    # Every run adds all *.txt files again, so start from an empty index;
    # appending to the existing one would store each document once per run.
    index_writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    index_writer = IndexWriter(directory, index_writer_config)
    try:
        for tfile in glob.glob(os.path.join(index_dir, '*.txt')):
            # Two spaces on purpose: same output as the Python 2 statement
            # `print "Indexing: ", tfile`.
            print("Indexing:  %s" % tfile)
            document = Document()
            with open(tfile, 'r') as f:
                content = f.read()
            document.add(Field("text", content, Field.Store.YES,
                               Field.Index.ANALYZED))
            document.add(Field("title", tfile, Field.Store.YES,
                               Field.Index.ANALYZED))
            index_writer.addDocument(document)
        print(index_writer.numDocs())
    finally:
        # Always close the writer, even if indexing fails part-way.
        index_writer.close()
def lucene_search(query, MAX, showHighlight):
    """Search the index in the working directory and print the hits.

    Args:
        query: Query string, parsed against the "text" field.
        MAX: Maximum number of hits to retrieve.
        showHighlight: When true, the output is wrapped in HTML tags and
            each title is followed by up to three highlighted fragments of
            the matching text; otherwise only the titles are printed.
    """
    lucene.initVM()
    index_dir = SimpleFSDirectory(File(os.getcwd()))
    index_reader = DirectoryReader.open(index_dir)
    try:
        lucene_searcher = IndexSearcher(index_reader)
        lucene_analyzer = StandardAnalyzer(Version.LUCENE_48)
        my_query = QueryParser(Version.LUCENE_48, "text",
                               lucene_analyzer).parse(query)
        total_hits = lucene_searcher.search(my_query, MAX)

        query_scorer = QueryScorer(my_query)
        highlighter = Highlighter(SimpleHTMLFormatter(), query_scorer)
        # Break the text into fragments of 50 characters.
        highlighter.setTextFragmenter(SimpleSpanFragmenter(query_scorer, 50))

        print("Only shows at most %s documents" % MAX)
        if showHighlight:
            print("<br>")
        for hit in total_hits.scoreDocs:
            doc = lucene_searcher.doc(hit.doc)
            if showHighlight:
                print("<p>")
            print(doc.get("title"))
            if showHighlight:
                print("<br>")
                text = doc.get("text")
                # Build the token stream only here, where the highlighter
                # consumes it; plain listings never need one.
                ts = lucene_analyzer.tokenStream("text", StringReader(text))
                print(highlighter.getBestFragments(ts, text, 3, "..."))
                print("</p>")
    finally:
        # Release the reader even when parsing or searching fails.
        index_reader.close()
def main():
    """Parse the command line, then update the archive or run a search.

    With ``-update`` the user logs in, every monthly archive not yet
    recorded in the database is downloaded and recorded, and the index is
    rebuilt. Without it, ``-query`` is searched, showing at most
    ``-maxresults`` hits (10 when absent or 0), with HTML highlighting when
    ``-highlight`` is given.
    """
    parse = argparse.ArgumentParser()
    # Adjacent literals instead of a backslash inside the string, which
    # glued the continuation line (and its indentation) into the help text.
    parse.add_argument("-update", action='store_true',
                       help="Update the archive by "
                            "downloading the last monthly entries")
    # Not marked required: -update alone is a complete command. The
    # requirement for the search mode is enforced right after parsing.
    parse.add_argument("-query", help="The query")
    parse.add_argument("-maxresults", metavar='N', type=int,
                       help='an integer for the max number of results to show')
    parse.add_argument("-highlight", action='store_true',
                       help='show the highlighted query in context with html format')
    args = parse.parse_args(sys.argv[1:])
    if args.query is None and not args.update:
        parse.error("argument -query is required unless -update is given")

    createDB()

    if args.update:
        r = do_login()
        if not successfully_logged(r):
            print("Wrong username/password...")
            return
        print("Login successfully...")
        dao = MonthlyArchiveDAO()
        for e in retrieve_monthly_archives(r):
            if not dao.is_in_db(e):
                print("Downloading: %s" % e.link)
                downloadDocument(e, r)
                dao.insert(e)
        lucene_indexing()
    elif args.query:
        # A missing (or zero) -maxresults falls back to 10 hits.
        lucene_search(args.query, args.maxresults or 10, args.highlight)
if __name__ == "__main__":
    try:
        main()
    except Exception as exc:
        # Top-level boundary: report any failure as a single line.
        exc_tb = sys.exc_info()[2]
        # The first traceback entry is always the `main()` call above, so
        # its line number says nothing. Report instead the deepest line of
        # this script that the exception passed through.
        script = exc_tb.tb_frame.f_code.co_filename
        lineno = exc_tb.tb_lineno
        while exc_tb is not None:
            if exc_tb.tb_frame.f_code.co_filename == script:
                lineno = exc_tb.tb_lineno
            exc_tb = exc_tb.tb_next
        msg = "Exception occured in main: " + str(type(exc)) + \
            " args:" + str(exc) + " line:" + str(lineno)
        print(msg)