forked from andyjaramillo/AIPolicyWebCrawler
main.py · 78 lines (72 loc) · 2.8 KB

from crawler import run
from parser_1 import parse_output
from model import model, Document
import ast


def save_document_array(document_array):
    # Save document_array to a file
    with open("document_array.txt", "w") as f:
        for document in document_array:
            f.write(str(document) + "\n")


def load_document_array():
    # Rebuild Document objects from the file written by save_document_array
    document_array = []
    with open("document_array.txt", "r") as f:
        lines = f.readlines()  # Read all lines from the file
        index = 0
        while index < len(lines):
            split_line = lines[index].split()
            # print(split_line)
            if split_line and split_line[0] == "Document":
                # We are at a new document; collect its fields, then append it to document_array
                doc_id = ""
                text = []
                labels = []
                date_created = ""
                date_modified = ""
                title = ""
                author = ""
                url = ""
                # The third token on the header line is the document id
                doc_id = split_line[2]
                index += 1
                # Accumulate the (possibly multi-line) Text: block until the Labels: line
                while index < len(lines) and not lines[index].startswith("Labels:"):
                    if "Text:" in lines[index].strip():
                        text.append(lines[index].strip().replace("Text:", ""))
                    else:
                        text.append(lines[index].strip())
                    index += 1
                text = " ".join(text)
                text = ast.literal_eval(text)
                split_line = lines[index]
                labels = split_line.split(":")[1].strip()
                index += 1
                split_line = lines[index]
                date_created = split_line.split(":")[1].strip()
                index += 1
                split_line = lines[index]
                date_modified = split_line.split(":")[1].strip()
                index += 1
                split_line = lines[index]
                title = split_line.split(":")[1].strip()
                index += 1
                split_line = lines[index]
                author = split_line.split(":")[1].strip()
                index += 1
                split_line = lines[index]
                url = split_line.split("URL:")[1].strip()
                document_array.append(Document(doc_id=doc_id, text=text, created=date_created, modified=date_modified, title=title, author=author, url=url))
            index += 1
    return document_array


def main():
    # call crawl function
    # run()
    # print("Crawling complete")
    document_array = parse_output()
    print("Parsing complete")
    save_document_array(document_array)
    # document_array = load_document_array()
    # print(document_array)
    result = model(document_array)
    return result


if __name__ == "__main__":
    main()