-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtrain.py
154 lines (124 loc) · 6.83 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/usr/bin/python
__author__ = 'Chuang Ma'
__date__ = "$2015-9-1$"
import sys
import os
import smtplib
from email.mime.text import MIMEText
from email.header import Header
# Module-level alias for sys.argv (the interpreter's command-line argument
# list); no live statement in the functions below reads it.
args = sys.argv
def psource(module):
    """Import the Python source file at path ``module`` and reload it.

    The module name is the file's base name up to its first ``'.'``.  The
    file's directory is appended to ``sys.path`` when it exists and is not
    already listed there.  The imported module is bound under its name in
    this module's globals (so later code can refer to it directly) and is
    then reloaded so that edits made on disk since a previous import are
    picked up.

    :param module: path to a Python source file, e.g. ``/some/dir/source.py``.
    :return: the module *name* as a string (not the module object).
    :raises ImportError: if no module of that name can be found.
    """
    # Function-scope import keeps this block self-contained; importlib is
    # available on Python 2.7 as well as Python 3.
    import importlib

    module_dir = os.path.dirname(module)
    modname = os.path.basename(module).split('.')[0]

    # Extend sys.path only with a location that exists and is not yet listed.
    if os.path.exists(module_dir) and module_dir not in sys.path:
        sys.path.append(module_dir)

    # Import by name rather than exec'ing text assembled from the path, so
    # a crafted file name can never be executed as code.  The explicit
    # globals() assignment reproduces the effect of a module-level
    # ``import <modname>`` statement.
    module_obj = importlib.import_module(modname)
    globals()[modname] = module_obj

    # Reload so the in-memory module matches the file currently on disk.
    try:
        reload_module = reload  # builtin on Python 2
    except NameError:
        reload_module = importlib.reload  # Python 3.4+
    reload_module(module_obj)

    return modname
# end
def train(fullname, email):
    """Train a miRLocator prediction model on one uploaded training file.

    The training file is looked up at ``<cwd>/<email>/<name>``, where
    ``<name>`` is the second ``'/'``-separated component of ``fullname``.
    The helper module ``source.py`` is loaded from the current working
    directory through ``psource``, which binds it as the global ``source``.
    The trained model is written by ``source.train_prediction_model`` to
    ``<cwd>/result_<email>/trained_prediction_model``; that directory is
    created if it does not exist yet.

    :param fullname: '/'-separated path whose second component is the
        training data file name (miRNAs and pre-miRNAs for training).
    :param email: name of the per-user sub-directory; also used as the
        suffix of the ``result_<email>`` output directory.
    :return: full path of the trained prediction model file.
    :raises IndexError: if ``fullname`` contains no ``'/'``.
    """
    # ------------------------- parameters -------------------------------
    # Directory that contains the RNAfold executable.
    RNAFoldDic = "/usr/bin/"
    # Directory of source.py (the current working directory).
    sourceDic = os.getcwd() + "/"
    # Shared result directory; also receives the temporary result file.
    resultDic = sourceDic + "results/"
    # Whether to run cross validation on the training data set first.
    cross_validation_flag = False
    # File with miRNAs and pre-miRNAs for training.
    trainDataFileName = fullname.split('/')[1]
    # Temp file used by the helpers to record temporary results.
    tempFileDir = resultDic + "tempResult.txt"

    # Load source.py; afterwards the module is reachable as ``source``.
    psource(sourceDic + "source.py")

    trainDataFileDir = sourceDic + email + '/' + trainDataFileName

    # Directories recording the dp/ss files for training and prediction.
    dpSSFileDic_train = sourceDic + email + '/' + "dp_ss_train/"
    dpSSFileDic_pred = sourceDic + email + '/' + "dp_ss_pred/"
    source.createDict(dpSSFileDic_train)
    source.createDict(dpSSFileDic_pred)

    # Check the input file and obtain the refined training data.  This is
    # done before the cross-validation branch because that branch consumes
    # the refined data; previously the branch would have read the variable
    # before it was assigned.
    # NOTE(review): per the original notes dpSSFlag=True means the dp/ss
    # files already exist and are not regenerated -- confirm in source.py.
    trainDataFileDir_Ref = source.checkFileForTraining(
        trainDataFileDir, RNAFoldDic, sourceDic, dpSSFileDic_train,
        dpSSFlag=True)

    if cross_validation_flag:
        print("start cross validation.\n")
        source.cross_validation_function(
            source.cv, trainDataFileDir_Ref, source.minCandidateMiRNALen,
            source.maxCandidateMiRNALen, source.upOffSet, source.downOffSet,
            source.minPredScore,
            resultDic, dpSSFileDic_train, RNAFoldDic, tempFileDir)

    # Per-user output directory.  The working directory is queried again
    # here (instead of reusing sourceDic) in case a helper changed it.
    newdir = os.getcwd() + '/' + 'result_' + email
    if not os.path.exists(newdir):
        os.mkdir(newdir)
    newdir = newdir + '/'

    # Train the prediction model on all miRNAs in the input file.
    print("start to train prediction models.\n")
    predModelFileDir = newdir + "trained_prediction_model"
    source.train_prediction_model(
        trainDataFileDir_Ref, dpSSFileDic_train, RNAFoldDic, tempFileDir,
        source.upOffSet, source.downOffSet, predModelFileDir, resultDic)
    return predModelFileDir