-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract.py
74 lines (60 loc) · 1.85 KB
/
extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import pandas as pd
import re
import glob
import os
def collect_all_txt_file_name(path="./"):
    """Switch into *path* and list the ``.txt`` files found there.

    The process working directory is changed to *path* as a side effect, so
    the bare file names that are returned can be opened directly afterwards.
    The path and the resulting list are echoed to stdout.

    Args:
        path: Directory to scan; defaults to the current directory.

    Returns:
        Sorted list of file names in *path* matching ``*.txt``.
    """
    print("current path:" + path)
    os.chdir(path)
    # glob yields names in arbitrary, filesystem-dependent order; sorting
    # makes the processing order (and the output row order) reproducible.
    txt_files = sorted(glob.glob('*.txt'))
    print(txt_files)
    return txt_files
def brake_down_txt(name):
    """Split a text file into records delimited by lines of asterisks.

    A line consisting solely of one or more ``*`` characters closes the
    current record; every other line is added to the current record. Each
    line read is echoed to stdout.

    Args:
        name: Path of the text file to read.

    Returns:
        A list of records, each a non-empty list of the lines (without line
        endings) found between two separator lines. An empty file yields an
        empty list.
    """
    # Context manager so the handle is closed even if reading fails.
    with open(name, "r") as handle:
        lines = handle.read().splitlines()

    data = []
    data_point = []
    for line in lines:
        print(line)
        if re.match(r"^\*+$", line):
            # A leading separator or several in a row enclose no record.
            if data_point:
                data.append(data_point)
                data_point = []
                print("adding data point...")
        else:
            data_point.append(line)
    # Flush the trailing record only when it holds lines: a file that is
    # empty or ends with a separator must not contribute a blank record.
    if data_point:
        data.append(data_point)
    return data
def construct_data_frame(data):
    """Build a DataFrame with one row per record of text lines.

    The lines of each record are joined with newlines and scanned for
    ``Image <word>:...`` captions, which are combined with " & " into the
    ``name`` column, and for ``key=value`` tokens, each of which becomes a
    column named after the key and holding the value as a string.
    Intermediate results are echoed to stdout.

    Args:
        data: Iterable of records, each a list of strings.

    Returns:
        A pandas DataFrame with one row per record; a key that a record does
        not contain is left as NaN in that row.
    """
    # Rows are collected as dicts and the frame is built once at the end:
    # DataFrame.append no longer exists in pandas 2.x and copied the whole
    # frame on every call in the versions that had it.
    records = []
    for row in data:
        data_txt = '\n'.join(row)
        name = re.findall(r'Image\s\w+:.+', data_txt)
        name_txt = '& '.join(name)
        # NOTE(review): this pattern skips any value whose first character
        # is "1" and requires at least three characters after "=" --
        # confirm that is intended before relying on it.
        numbers = re.findall(r'\w+=[^1][\w\\.]+\S', data_txt)
        attributes = {'name': ' & '.join(name)}
        print(name_txt, numbers)
        for item in numbers:
            pair = item.split("=")
            print(pair)
            attributes[pair[0]] = pair[1]
        print(attributes)
        records.append(attributes)
    return pd.DataFrame(records)
# print(dataframe)
# if pair[0] not in dataframe.columns:
# dataframe.ins
def main():
    """Convert every ``.txt`` file in the current directory into one workbook.

    Each file is split into records, each record becomes a row, and the rows
    of all files are written to ``test.xlsx`` in the working directory. The
    file count and the combined table are echoed to stdout.
    """
    txt_collection = collect_all_txt_file_name()
    # Gather the per-file frames first and concatenate once; concatenating
    # inside the loop re-copies everything accumulated so far each time.
    frames = [construct_data_frame(brake_down_txt(txt)) for txt in txt_collection]
    # pd.concat raises on an empty list, so fall back to an empty frame when
    # no text files were found.
    df = pd.concat(frames) if frames else pd.DataFrame()
    print("pre process done, total " + str(len(txt_collection)) + " file(s)...")
    print(df)
    # Build the path with the platform separator: a hard-coded backslash
    # would be taken as part of the file name on non-Windows systems.
    df.to_excel(os.path.join(os.curdir, 'test.xlsx'), index=False, header=True)
# Run the extraction only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()