-
Notifications
You must be signed in to change notification settings - Fork 0
/
LIMS_download.py
98 lines (84 loc) · 3.77 KB
/
LIMS_download.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
from time import sleep
import requests
import re
import json
from subprocess import PIPE, run,check_call, CalledProcessError
import os.path
import pytesseract
from PIL import Image
import glob
from PyPDF2 import PdfFileReader
# Read the candidate document URLs, one per line.  Blank lines and surrounding
# whitespace (e.g. the '\r' left by Windows line endings) are dropped so they
# never turn into bogus download targets.
with open('doclinks.txt') as links_file:
    urls = [line.strip() for line in links_file if line.strip()]

# Working directories, kept as plain strings with a trailing '/' because the
# rest of the script builds paths by concatenation (e.g. pdfs + filename).
pdfs = os.getcwd()+'/pdfs/'
text = os.getcwd()+'/text/'

# Create both directories on first run; without this os.listdir() below (and
# every later write) fails with FileNotFoundError on a fresh checkout.
for directory in (pdfs, text):
    os.makedirs(directory, exist_ok=True)

# Lowercased names of the PDFs already on disk.  Stored as a set because it is
# only used for membership tests while filtering the URL list.
files = {name.lower() for name in os.listdir(pdfs)}
# Keep only the URLs whose PDF is not on disk yet.  The URL's file name is
# lowercased before the lookup and `files` is assumed to hold lowercased names
# as well, so an all-caps name such as
# http://lims.dccouncil.us/Download/29298/B20-0142-COMMITTEEREPORT.pdf
# should still be recognised as already downloaded.
toDownload = [link for link in urls if link.split('/')[-1].lower() not in files]

# Bucket the pending URLs by council period.  The period number is looked for
# as a substring of the part of the file name before the first '-'
# (e.g. "B20", "GBM20"); a URL lands in every bucket whose number it contains.
councilPeriod20 = []
councilPeriod21 = []
councilPeriod22 = []
for link in toDownload:
    prefix = link.split("/")[-1].split('-')[0]
    if '20' in prefix:
        councilPeriod20.append(link)
    if '21' in prefix:
        councilPeriod21.append(link)
    if '22' in prefix:
        councilPeriod22.append(link)
# Known problem downloads (cause not yet identified):
#   http://lims.dccouncil.us/Download/29565/B20-0409-Engrossment.pdf
#   http://lims.dccouncil.us/Download/30020/GBM20-0050-Introduction.pdf
#
# Download every pending council-period-22 document and produce a matching
# .txt file in the text directory.  pdftotext is tried first; when it fails or
# yields fewer than _MIN_TEXT_CHARS characters (typically a scanned document),
# the PDF is OCR'd with extractText instead.
#
# Only councilPeriod22 is processed here; councilPeriod20 / councilPeriod21
# are built earlier but not consumed by this loop.
#
# NOTE(review): extractText is defined further down in this file.  When the
# file is executed top to bottom as a script, this loop runs before that
# definition exists and the first OCR fallback raises NameError -- confirm the
# intended execution order (e.g. notebook cells) or move this loop below the
# helper definitions.
_MIN_TEXT_CHARS = 1000

for url in councilPeriod22:
    print(url)
    filename = url.split('/')[-1]
    pdf_path = pdfs + filename
    txt_path = text + os.path.splitext(filename)[0] + '.txt'

    # Check before downloading so an existing PDF costs no network round trip.
    if os.path.isfile(pdf_path):
        continue

    # A timeout keeps one stalled connection from hanging the whole batch, and
    # a failed request skips this document instead of aborting the run.
    try:
        response = requests.get(url, timeout=60)
    except requests.RequestException as err:
        print('download failed for ' + url + ': ' + str(err))
        continue
    if not response.content:
        continue

    with open(pdf_path, 'wb') as pdf_file:
        pdf_file.write(response.content)

    # First attempt: plain text extraction.  The original author noted that
    # pdftotext sometimes could not find very small, freshly written PDFs, so
    # any failure here simply falls through to OCR.
    try:
        run(['pdftotext', '-enc', 'UTF-8', '-layout', pdf_path, txt_path],
            stdout=PIPE, check=True)
        # pdftotext was told to emit UTF-8, so read it back as UTF-8.
        with open(txt_path, 'r', encoding='utf-8') as f:
            extracted = f.read()
    except (CalledProcessError, OSError, UnicodeDecodeError):
        extracted = ''

    if len(extracted) >= _MIN_TEXT_CHARS:
        continue

    # Too little text: fall back to OCR.  The .txt file is only (re)written
    # once OCR has actually produced something, so a usable pdftotext result
    # is never replaced by an empty file.
    content = extractText(pdf_path, pdfs)
    if content == '':
        # Marker printed by the original script when OCR yields nothing.
        print('true')
        continue
    with open(txt_path, 'w', encoding='utf-8') as f:
        f.write(content)
def convertGrayscale(imageLocation,i):
    """Render page ``i`` of a PDF to a grayscale PNG via the ``convert`` command.

    The input argument is ``imageLocation`` followed by ``str([i])`` (e.g.
    ``doc.pdf[0]``) and the output is ``imageLocation`` minus its last four
    characters plus ``-[i].png`` (e.g. ``doc-[0].png``).
    NOTE(review): ``convert`` is presumably ImageMagick and the ``[i]`` suffix
    its page selector -- confirm against the installed tool.

    Parameters:
        imageLocation: path of the source PDF; expected to end in a
            four-character extension such as ``.pdf``.
        i: page index to render.

    Raises:
        subprocess.CalledProcessError: if ``convert`` exits with a non-zero
            status.
    """
    page_tag = str([i])
    source = imageLocation + page_tag
    target = imageLocation[:-4] + '-' + page_tag + '.png'
    command = ['convert']
    command += ['-density', '300']
    command += ['-units', 'PixelsPerInch']
    command += ['-type', 'Grayscale']
    command += [source, target]
    check_call(command)
def convertColor(imageLocation,i):
    """Render page ``i`` of a PDF to a PNG via ``convert``, keeping its colours.

    Same naming scheme as the grayscale variant: the input argument is
    ``imageLocation`` followed by ``str([i])`` (e.g. ``doc.pdf[0]``) and the
    output is ``imageLocation`` minus its last four characters plus
    ``-[i].png``.  No ``-type`` option is passed here.
    NOTE(review): ``convert`` is presumably ImageMagick -- confirm.

    Parameters:
        imageLocation: path of the source PDF; expected to end in a
            four-character extension such as ``.pdf``.
        i: page index to render.

    Raises:
        subprocess.CalledProcessError: if ``convert`` exits with a non-zero
            status.
    """
    page_tag = str([i])
    source = imageLocation + page_tag
    target = imageLocation[:-4] + '-' + page_tag + '.png'
    command = ['convert']
    command += ['-density', '300']
    command += ['-units', 'PixelsPerInch']
    command += [source, target]
    check_call(command)
# This doesn't deal with encrypted PDFs (why are there encrypted docs?) - if
# there turn out to be more than just the one, something will have to be
# figured out.
def extractText(imageLocation,path):
    """OCR every page of a PDF and return the concatenated text.

    Each page is rendered to ``<pdf path minus extension>-[i].png`` with
    convertGrayscale (falling back to convertColor if the grayscale attempt or
    its OCR fails), read with pytesseract, and the page texts are joined in
    page order.  The temporary PNGs are deleted afterwards, even when a page
    fails, because they are large.

    Parameters:
        imageLocation: '/'-separated path of the PDF; expected to end in a
            four-character extension such as ``.pdf``.
        path: directory searched for this PDF's ``-*.png`` page images during
            cleanup (callers in this file pass the PDF's own directory).

    Returns:
        The OCR text of all pages, or '' when the page count cannot be
        determined (unreadable, missing or encrypted PDF).

    Raises:
        Whatever convertColor / pytesseract raise when the colour fallback
        for a page fails as well.
    """
    # One PNG is produced per page, so the page count is needed up front.
    # NOTE(review): PdfFileReader / getNumPages are the legacy PyPDF2 names;
    # newer releases reportedly replace them (PdfReader, len(reader.pages)).
    # If the installed version rejects the old names, this guard would make
    # every call return '' -- confirm the pinned PyPDF2 version.
    try:
        with open(imageLocation, 'rb') as pdf_file:
            numPages = PdfFileReader(pdf_file).getNumPages()
    except Exception:
        return ''

    def read_page(png_path):
        # Close the image as soon as OCR is done instead of leaking the handle.
        with Image.open(png_path) as page_image:
            return pytesseract.image_to_string(page_image)

    stem = imageLocation[:-4]
    page_texts = []
    try:
        for i in range(numPages):
            png_path = stem + '-' + str([i]) + '.png'
            try:
                convertGrayscale(imageLocation, i)
                page_texts.append(read_page(png_path))
            except Exception:
                # Grayscale conversion (or OCR of its output) failed; retry
                # the page without forcing grayscale.
                convertColor(imageLocation, i)
                page_texts.append(read_page(png_path))
    finally:
        # Delete the PNGs after text extraction: they are pretty large.
        pattern = os.path.join(path, imageLocation.split('/')[-1][:-4] + '-*.png')
        for leftover in glob.glob(pattern):
            os.remove(leftover)
    return ''.join(page_texts)