-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdfplumber.py
34 lines (28 loc) · 892 Bytes
/
pdfplumber.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import pdfplumber
import pytesseract
import pandas as pd
import sys
import os
inputfile = sys.argv[1]
outputfile = '/home/erpnext/covenantTrackingCVS/'
#inputfile = inputfile.replace(' ','\ ')
#print(os.path.basename(inputfile))
os.system("mkdir -p "+os.path.dirname(outputfile+inputfile.replace("./","")))
#pdf = pdfplumber.open("Term Sheet 3 - STFC.pdf")
pdf = pdfplumber.open(inputfile)
countpage=len(pdf.pages)
#p0 = pdf.pages[0]
flag = False
for page1 in range(0,countpage):
p0 = pdf.pages[page1]
table = p0.extract_table()
#print(table)
if(table is not None):
flag = True
#with open('2_new.csv', 'a',newline='') as csvfile:
df=pd.DataFrame(table)
df.to_csv(outputfile+inputfile.replace("./",""),encoding='utf-8', mode='a',header=None)
#if flag == False :
# outputfile+inputfile.replace("./","")
#im = p0.to_image()
#print(table)