#!/usr/bin/env python3.8
'''
Written by: Saksham Consul 04/05/2023
Script to extract text from PDFs: reads every PDF in the papers/ directory,
writes the extracted text of each paper to papers_parse/, and combines the
results into processed/scraped.csv.
'''
import os

import pandas as pd
import tqdm
from PyPDF2 import PdfReader


def convert_pdf_text(dir_name):
    '''Extracts the text of every PDF in dir_name into papers_parse/.'''
    os.makedirs('papers_parse', exist_ok=True)
    dir_list = os.listdir(dir_name)
    paper_list = tqdm.tqdm(dir_list, desc='Extracting text from PDFs')
    for file in paper_list:
        # Skip files that have already been parsed
        if file[:-3] + 'txt' in os.listdir('papers_parse'):
            continue
        file_path = os.path.join(dir_name, file)
        get_pdf_text(file_path)


def get_pdf_text(file_path):
    '''Extracts the text from the PDF file and saves it to a text file'''
    try:
        pdf = PdfReader(file_path)
    except Exception as e:
        print(file_path, '\n', e)
        return
    file = os.path.basename(file_path)
    # write the extracted text of every page to a new text file
    with open('papers_parse/' + file[:-3] + 'txt', 'w') as f:
        for page in pdf.pages:
            try:
                txt = page.extract_text()
            except Exception as e:
                print(e)
                continue
            try:
                txt = txt.encode('UTF-8', 'ignore').decode('UTF-8')
            except Exception as e:
                print(e)
                continue
            f.write(txt)
            f.write('\n')


def remove_newlines(serie):
    '''Replaces newlines and double spaces in a pandas Series of strings.'''
    serie = serie.str.replace('\n', ' ')
    serie = serie.str.replace('\\n', ' ')
    serie = serie.str.replace('  ', ' ')
    serie = serie.str.replace('  ', ' ')
    return serie
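

# Quick illustrative sketch of remove_newlines (hypothetical values, not part of
# the pipeline): remove_newlines(pd.Series(['line one\nline two', 'double  space']))
# returns a Series equal to pd.Series(['line one line two', 'double space']).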


def saving_csv(dir_name):
    '''Combines the extracted text files in dir_name into processed/scraped.csv.'''
    os.makedirs('processed', exist_ok=True)
    # Create a list to store the text files
    texts = []
    # Get all the text files in the text directory
    dir_list = os.listdir(dir_name)
    paper_list = tqdm.tqdm(dir_list, desc='Saving as csv')
    for file in paper_list:
        # Open the file and read the text
        with open(os.path.join(dir_name, file), "r", encoding="UTF-8") as f:
            try:
                text = f.read()
                # Replace - and _ with spaces and drop '#update' from the file name
                texts.append(
                    (file.replace('-', ' ').replace('_', ' ').replace('#update', ''), text))
            except Exception:
                pass
    # Create a dataframe from the list of texts
    df = pd.DataFrame(texts, columns=['fname', 'text'])
    # Set the text column to the file name followed by the raw text with the newlines removed
    df['text'] = df.fname + ". " + remove_newlines(df.text)
    df.to_csv('processed/scraped.csv')
    print(df.head())
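

# Sketch of one resulting row in processed/scraped.csv, assuming a parsed file
# named 'my-paper.txt' (hypothetical name):
#   fname: 'my paper.txt'
#   text:  'my paper.txt. <full paper text with newlines collapsed>'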


def main():
    convert_pdf_text('papers')
    saving_csv('papers_parse')


# def reset_eof_of_pdf_return_stream(pdf_stream_in: list):
#     # find the line position of the EOF
#     for i, x in enumerate(pdf_stream_in[::-1]):
#         if b'%%EOF' in x:
#             actual_line = len(pdf_stream_in)-i
#             print(
#                 f'EOF found at line position {-i} = actual {actual_line}, with value {x}')
#             break
#     # return the list up to that point
#     return pdf_stream_in[:actual_line]


# def correct_pdf(dir_name):
#     '''Corrects the EOF of the PDF file and saves it to a pdf file'''
#     try:
#         os.mkdir('papers_fixed')
#     except:
#         pass
#     dir_list = os.listdir(dir_name)
#     paper_list = tqdm.tqdm(dir_list, desc='Correcting EOF of PDFs')
#     for file in paper_list:
#         # Skip files that have already been corrected
#         if file[:-3]+'txt' in os.listdir('papers_fixed'):
#             continue
#         # opens the file for reading
#         with open(dir_name + '/'+file, 'rb') as p:
#             txt = (p.readlines())
#         # get the new list terminating correctly
#         txtx = reset_eof_of_pdf_return_stream(txt)
#         print(type(txtx))
#         file = file.split('/')[-1]
#         with open('papers_fixed/'+file[:-3]+'txt', 'w') as f:
#             for txt in txtx:
#                 print(txt)
#                 txt = txt.encode('ascii', 'ignore').decode('ascii')
#                 f.write(txt)


if __name__ == "__main__":
    main()