-
Notifications
You must be signed in to change notification settings - Fork 1
/
testPreprocess.py
104 lines (80 loc) · 2.88 KB
/
testPreprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import os
import re
# C-style integer truth values used as flags and return codes by the
# functions in this script (1 = enabled/success, 0 = disabled/failure).
true, false = 1, 0

# Separator inserted between a directory and a file name when paths are
# assembled by string concatenation.
delimiter = "/"
def preProcess():
    """Run doPreProcessing with this script's fixed settings.

    Reads from data/TE3-platinum-test and writes to
    stanford-ner-2015-12-09/data/TE3-platinum-test-text, with the output
    folder creation and overwrite flags switched on and extension-mismatch
    tolerance switched off.

    Returns:
        Whatever doPreProcessing returns for those settings.
    """
    return doPreProcessing(
        r"data/TE3-platinum-test",                               # readPath
        r"stanford-ner-2015-12-09/data/TE3-platinum-test-text",  # writePath
        true,                                                    # forceWriteFolder
        true,                                                    # overwriteFlag
        "tml.TE3input.xml",                                      # inExt
        "txt",                                                   # outExt
        false,                                                   # ignoreExtMismatch
    )
def checkReadPath(readPath):
    """Report whether readPath is an existing directory.

    A diagnostic line is printed when the check fails.

    Args:
        readPath: Path of the folder the input files are read from.

    Returns:
        True when readPath is a directory, False otherwise. These compare
        equal to the integer 1/0 flags used elsewhere in this script.
    """
    # isdir rather than exists: a regular file at readPath would otherwise be
    # accepted here and only fail later, when the folder contents are listed.
    if os.path.isdir(readPath):
        return True
    # A single pre-formatted string keeps the output identical whether print
    # is the Python 2 statement or the Python 3 function.
    print("Directory  %s  does not exist or is inaccessible." % (readPath,))
    return False
def checkWritePath(writePath, forceWrite):
    """Make sure the output directory exists, optionally creating it.

    Progress and failure messages are printed along the way.

    Args:
        writePath: Path of the folder the output files are written to.
        forceWrite: When truthy, a missing directory (including missing
            parent directories) is created; when falsy, a missing
            directory is reported as an error.

    Returns:
        True when writePath is a directory once the function finishes,
        False otherwise. These compare equal to the integer 1/0 flags used
        elsewhere in this script.
    """
    # isdir rather than exists: a regular file sitting at writePath cannot be
    # used as an output folder.
    if os.path.isdir(writePath):
        return True

    # Each message is a single pre-formatted string so the output is the same
    # whether print is the Python 2 statement or the Python 3 function.
    if not forceWrite:
        print("Could not find directory  %s" % (writePath,))
        return False

    print("Directory  %s  does not exist. Attempting to create" % (writePath,))
    try:
        os.makedirs(writePath)
    except OSError:
        # makedirs signals failure by raising. The isdir check below decides
        # the outcome, which also covers the directory having appeared in
        # the meantime.
        pass

    if os.path.isdir(writePath):
        print("Creation successful.")
        return True

    print("Could not create directory  %s" % (writePath,))
    return False
def stripTags(content):
    """Return the text between the first <TEXT> tag and its closing tag.

    Only the enclosing <TEXT>...</TEXT> pair is removed; any markup inside
    the element is returned unchanged. Leading and trailing whitespace of
    the extracted text is stripped.

    Args:
        content: Full contents of an input document as a string.

    Returns:
        The stripped text of the first TEXT element, or "" when there is no
        <TEXT> tag or no </TEXT> tag after it.
    """
    openTag = "<TEXT>"
    closeTag = "</TEXT>"

    textStart = content.find(openTag)
    if textStart == -1:
        return ""
    textStart += len(openTag)

    # Search for the closing tag only after the opening one, so a stray
    # </TEXT> earlier in the document cannot produce an empty slice.
    textEnd = content.find(closeTag, textStart)
    if textEnd == -1:
        return ""

    return content[textStart:textEnd].strip()
def doPreProcessing(readPath, writePath, forceWriteFolder, overwriteFlag, inExt, outExt, ignoreExtMismatch):
    """Extract the TEXT body of each input file in readPath into writePath.

    Every selected file is read, passed through stripTags, and the result
    is written to writePath under the same name with inExt replaced by
    outExt. One status line is printed per selected file.

    Args:
        readPath: Folder containing the input files.
        writePath: Folder receiving the output files.
        forceWriteFolder: Passed to checkWritePath; when truthy a missing
            output folder is created.
        overwriteFlag: When truthy, existing output files are overwritten;
            otherwise they are left alone and reported as skipped.
        inExt: Text a file name must contain to be selected; it is also
            the part of the name replaced by outExt.
        outExt: Replacement for inExt in the output file name.
        ignoreExtMismatch: When truthy, files whose name does not contain
            inExt are processed as well.

    Returns:
        false when either folder check fails or readPath is empty, true
        otherwise (even if no file was selected or written).
    """
    if not checkReadPath(readPath) or not checkWritePath(writePath, forceWriteFolder):
        return false

    fileList = os.listdir(readPath)
    fileCount = len(fileList)
    if fileCount == 0:
        print("Empty folder.")
        return false

    # Each message is a single pre-formatted string (with the spacing the
    # original comma-separated print statements produced) so the output is
    # the same whether print is the Python 2 statement or the Python 3
    # function.
    print("%s  files found in directory  %s" % (fileCount, readPath))
    print("Writing output files to directory  %s \n" % (writePath,))

    # The file ID shown in the messages is the position in the directory
    # listing, so skipped entries leave gaps in the numbering.
    for index, name in enumerate(fileList):
        if inExt not in name and not ignoreExtMismatch:
            continue

        sourcePath = os.path.join(readPath, name)
        if not os.path.isfile(sourcePath):
            # Sub-directories and other non-file entries cannot be read.
            continue

        with open(sourcePath, 'r') as currFile:
            content = currFile.read()

        prefix = "\tFile ID:  %s \t\tName:  %s \t\t" % (index, name)

        plainContent = stripTags(content)
        if plainContent == "":
            print(prefix + "Input badly formed. Cannot process.")
            # Skip the file: writing would create (or overwrite an existing
            # output with) an empty file.
            continue

        # Literal replacement: inExt is plain text, not a regular
        # expression, so characters such as "." must match only themselves.
        newFile = name.replace(inExt, outExt)
        nfPath = os.path.join(writePath, newFile)

        if not os.path.exists(nfPath):
            action = "Writing to:"
        elif overwriteFlag:
            action = "Overwriting:"
        else:
            print(prefix + "Output file exists. Skipping.")
            continue

        print(prefix + action + " " + newFile)
        with open(nfPath, "w") as outFile:
            outFile.write(plainContent)

    return true