-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconcat_text.py
34 lines (31 loc) · 1.02 KB
/
concat_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
#-*- encoding: utf-8 -*-
from bs4 import BeautifulSoup
import os
import pdb
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
def concat_texts(file_name, ex_type):
    """Append the contents of one text file to its section's combined file.

    ``file_name`` is expected to look like ``<patent_dir>/<section>/<name>``.
    The text is appended to ``<patent_dir>/<ex_type>.txt``, so calling this
    once per input file builds up a single concatenated document. The path
    of each processed file is printed as a progress trace.

    Args:
        file_name: Path of the text file to read.
        ex_type: Base name (without ``.txt``) of the combined output file.
    """
    # The patent directory is the grandparent of the input file. Deriving it
    # from the position of the first "/" broke on absolute paths (empty
    # prefix), on nested patent directories and on paths without any slash.
    patent_dir = os.path.dirname(os.path.dirname(file_name))
    with open(file_name, "r") as source:
        text = source.read()
    # Single-argument call form prints the same under Python 2 and Python 3.
    print(file_name)
    # Append mode keeps text written by earlier calls; the context manager
    # closes the handle even if the write fails.
    with open(os.path.join(patent_dir, ex_type + ".txt"), "a") as combined:
        combined.write(text)
if __name__ == "__main__":
    # Usage: <script> <patent directory> <section>
    # Rebuilds <patent directory>/<section>.txt from every file whose name
    # ends with "txt" inside <patent directory>/<section>/.
    if len(sys.argv) < 3:
        print("run with 2 arguments for patent directory(searched_patents, timed_patents) and section(abstract, description, claims, sentences)")
        sys.exit()
    text_dir = sys.argv[1] + "/" + sys.argv[2] + "/"
    # os.listdir raises OSError on a missing directory, which would bypass
    # the friendly message below; treat "missing" the same as "empty".
    # Sorting makes the concatenation order reproducible, since os.listdir
    # returns entries in arbitrary order.
    if os.path.isdir(text_dir):
        files = sorted(os.listdir(text_dir))
    else:
        files = []
    if len(files) < 1:
        print("wrong patent directory: " + text_dir)
        sys.exit()
    # Start from a clean output file because concat_texts only appends.
    concat_file = sys.argv[1] + "/" + sys.argv[2] + ".txt"
    if os.path.exists(concat_file):
        os.remove(concat_file)
    for entry in files:
        if entry.endswith("txt"):
            concat_texts(text_dir + entry, sys.argv[2])