forked from fechitheodore/GenomeGit3.1
-
Notifications
You must be signed in to change notification settings - Fork 0
/
update.py
289 lines (254 loc) · 14.4 KB
/
update.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
#!/usr/bin/env python
"""
INSTRUCTIONS: The script will update the coordinates of the variants and annotation files of a given
genome assembly to match those of a new one. Arguments:
update_dependent_datasets.py -new_file- -threads- -file_size- -template_length-
"""
# Make the imports
import os
import sys
import datetime
import glob
from subprocess import Popen, check_output
from update_functions import detect_updates
from reconstruct_functions import reconstruct_dataset
from ObtainAlignment_functions import obtain_alignment
from StoreAlignment import store_variables, load_variables, obtain_alignment_pickle
from InterpretAlignment_functions import interpret_alignment
# Load arguments: new_assembly, threads size, tlength, which aligner to use, and associated mashmap args
new_assembly = str(sys.argv[1])        # path of the new assembly FASTA file
number_threads = int(sys.argv[2])      # number of threads passed to the aligner
file_size = str(sys.argv[3])           # placeholder: recomputed below from the actual file on disk
template_length = int(sys.argv[4])     # maximum insert (template) size for interpret_alignment
aligner_switch = int(sys.argv[5])      # 1 = MashMap + Nucmer, any other value = Nucmer only
aligner = ""                           # human-readable aligner name (set inside aligner_status)
c_flag = int(sys.argv[6])              # Nucmer -c parameter
b_flag = int(sys.argv[7])              # Nucmer -b parameter
percent_identity = int(sys.argv[8])    # MashMap percent-identity threshold
kmer = int(sys.argv[9])                # MashMap k-mer size
segLength = int(sys.argv[10])          # MashMap segment length
ms_flag = int(sys.argv[11])            # 1 = handle splits only, 2 = handle merges only
file_size_bytes = os.path.getsize(new_assembly)  # size of the new assembly, in bytes
def format_bytes(size):
    """Convert a size in bytes to a human-readable (value, unit) pair.

    Parameters:
        size: file size in bytes (int or float).

    Returns:
        tuple(str, str): the scaled size (rounded to 2 decimal places)
        and its unit label, e.g. ('14.4', 'MB').
    """
    # 2**10 = 1024
    power = 2 ** 10
    n = 0
    power_labels = {0: '', 1: 'K', 2: 'M', 3: 'G', 4: 'T'}
    # Use >= so an exact power of 1024 is reported in the larger unit
    # (1024 bytes -> '1.0 KB', not '1024 B'); clamp n so sizes beyond
    # the largest known label cannot raise a KeyError.
    while size >= power and n < max(power_labels):
        size /= power
        n += 1
    # Round for display: the original returned the raw float repr,
    # producing strings like '14.362398147583008'.
    return str(round(size, 2)), power_labels[n] + 'B'
# Human-readable size of the new assembly, used only in progress messages.
file_size, label = format_bytes(file_size_bytes)
# NOTE(review): format_bytes already returns the size as a string, so this
# str() call is redundant (kept for safety).
file_size = str(file_size)
def aligner_status(ToUpdate):
    """Report which aligner configuration was selected and return its name.

    Parameters:
        ToUpdate: the dataset map produced by detect_updates, or the
            string "empty" when nothing needs updating.

    Returns:
        str: "MashMap + Nucmer" or "Nucmer".

    Side effects:
        Prints the chosen aligner and its parameters; calls sys.exit()
        when ToUpdate is "empty".
    Reads the module-level aligner_switch / ms_flag / percent_identity /
    kmer / segLength / b_flag / c_flag globals defined at script start.
    """
    if (ToUpdate == "empty"):
        print("No datasets to be updated were detected.\n")
        sys.exit()
    if aligner_switch == 1 and ms_flag in (1, 2):
        aligner = "MashMap + Nucmer"
        # The two MashMap modes print identical reports except for the
        # "splits"/"merges" word, so the duplicated branches are merged.
        mode = "splits" if ms_flag == 1 else "merges"
        print("\n\n**Aligner chosen is MashMap + Nucmer (GenomeGit 2.1) with the following "
              "parameters for MashMap:**\n\n\t -percent identity: {}"
              "\n\n\t -Kmer: {}\n\n\t -Segment length: {}"
              "\n\nParameters for Nucmer:\n\n\t -b: {}\n\n\t -c: {}\n\n"
              "\n\nHandles {} only.\n\n"
              .format(percent_identity, kmer, segLength, b_flag, c_flag, mode))
    else:
        aligner = "Nucmer"
        print("\n\n**Aligner chosen is Nucmer only (GenomeGit 2.0)\n\n"
              "Parameters chosen:\n\n-b: {} and -c: {}".format(b_flag, c_flag))
    return aligner
def update_inform_user(ToUpdate):
    """Print a summary of every file that is scheduled for updating.

    Calls aligner_status first (which aborts the whole run when there is
    nothing to update), then lists each non-genome dataset with its files
    and sizes, and finally announces the genome update about to start.
    Requires: ToUpdate ({dataset: [[filename.extension, directory, size], ...]})
    plus the module-level number_threads / template_length / new_assembly /
    file_size / label variables.
    """
    aligner_status(ToUpdate)  # exits the script when ToUpdate == "empty"
    header = ("Number of threads selected: {}\nMaximum insert size: {}\nThe following files that "
              "need to be updated have been detected:\n")
    print(header.format(str(number_threads), str(template_length)))
    for name, entries in ToUpdate.items():
        # The genome dataset is reported separately in the banner below.
        if name == "Genome":
            continue
        print("\t-Files detected in the {} dataset\n".format(name))
        if entries:
            for entry in entries:
                print("\t\t--{} ({})\n".format(entry[0], entry[2]))
        else:
            print("\t\t--No files detected in this {} dataset.\n".format(name))
    # Announce old assembly (name, size) -> new assembly (name, size, unit).
    genome_entry = ToUpdate["Genome"][0]
    print("\n*** NOW STARTING UPDATE OF THE REPOSITORY: {} ({}) ---> {} ({} {})***\n"
          .format(genome_entry[0], genome_entry[2],
                  os.path.basename(new_assembly), file_size, label))
def _tabix_index(filename, suffix):
    """Sort, bgzip-compress and tabix-index one reconstructed pseudo file.

    Takes ./temporary_directory/<filename>_<suffix>, emits its header lines
    (those starting with '#') first, then the data lines sorted by sequence
    name (column 1, version sort) and position (column 2, numeric), writes
    the result to <path>.gz with bgzip, and builds a tabix index on column 2.
    """
    path = "./temporary_directory/{0}_{1}".format(filename, suffix)
    # Quote the '^#' pattern explicitly: the original command built it with
    # doubled quotes ("grep ""^#"""), which Python concatenates into an
    # UNQUOTED `grep ^#` in the shell command.
    Popen("(grep '^#' {0}; grep -v '^#' {0} | "
          "sort -V -k1,1 -k2,2n) | bgzip > {0}.gz; "
          "tabix -b 2 -e 2 {0}.gz;".format(path), shell=True).wait()


def reconstruct_annotation_variants(ToUpdate):
    """
    Reconstructs the variant and annotation datasets in the temporary directory.

    Requires: ToUpdate ({dataset:[[filename.extension,directory,size],[],...]}) which contains
    INFORMATION of what requires updating.  Genome entries are skipped; every other
    file is rebuilt under ./temporary_directory with its original name and indexed
    with tabix for fast region queries.
    """
    for dataset in ToUpdate.keys():
        # Only non-genome datasets that actually contain files are processed.
        if dataset != "Genome" and len(ToUpdate[dataset]) != 0:
            for subfile in ToUpdate[dataset]:
                # Rebuild the file in the temporary directory under its original name.
                reconstruct_dataset(
                    size=1, directory=subfile[1], output_file="./temporary_directory/{}".format(subfile[0]),
                    mode=dataset, update=True, seqID="0", region="0")
                # Annotation/Alignment datasets are stored as two pseudo files
                # (suffixes _A and _B); Variants datasets only have _A.  The
                # previously triplicated shell pipeline now lives in _tabix_index.
                if dataset == "Annotation" or dataset == "Alignment":
                    _tabix_index(subfile[0], "A")
                    _tabix_index(subfile[0], "B")
                elif dataset == "Variants":
                    _tabix_index(subfile[0], "A")
                else:
                    print(
                        "***INTERNAL ERROR*** DATASET NOT RECOGNIZED: {}".format(dataset))
def obtain_variables(alignment_pickle):
    """Load a cached genome alignment, or compute a fresh one.

    When the alignment_pickle directory already exists, the variables saved
    by a previous run are loaded from its pickle; otherwise obtain_alignment
    is invoked to align the old and new assemblies from scratch.
    Requires: alignment_pickle (path of the cached-alignment directory) plus
    the module-level new_assembly / number_threads / ToUpdate / aligner
    parameter globals.
    Returns the alignment variables [tabix_queries, alignment_pickle, summary_Dict, ...].
    """
    # A directory at alignment_pickle means this assembly pair was aligned before.
    if os.path.isdir(alignment_pickle):
        print("\n\t*PART II. OBTAINING GENOME ALIGNMENT: STORED ALIGNMENT DETECTED, NOW LOADING SAVED DATA.*")
        print("\t" + str(datetime.datetime.now()))
        return load_variables(alignment_pickle + "/pickle")
    # No cache: run the aligner(s) and build the variables now.
    print("\n\t*PART II. OBTAINING GENOME ALIGNMENT: NO STORED ALIGNMENT DETECTED, NOW CREATING NEW ALIGNMENT*")
    print("\t" + str(datetime.datetime.now()))
    return obtain_alignment(old_assembly="./temporary_directory/genome_old.fa",
                            new_assembly=new_assembly, directory="./temporary_directory",
                            threads=number_threads, ToUpdate=ToUpdate, alignment_pickle=alignment_pickle,
                            aligner_switch=aligner_switch, percent_identity=percent_identity,
                            kmer=kmer, segLength=segLength, c_flag=c_flag, b_flag=b_flag, ms_flag=ms_flag)
####################################################
#   PART 0. DETERMINE FILES THAT NEED TO BE UPDATED #
####################################################
# Determine the files to be updated. ToUpdate = {dataset:[[filename.extension,directory,size],[],...]}
ToUpdate = detect_updates("./RepoMap.txt")
# Report what will be updated; this exits the whole script (via
# aligner_status) when there is nothing to update.
update_inform_user(ToUpdate)
###############################################
#   PART 1. RECONSTRUCTION OF REPOSITORY DATA #
###############################################
# Inform the user
print("\n\t*PART I. RECONSTRUCTION OF REPOSITORY DATA.*")
print("\t {}".format(str(datetime.datetime.now())))
# Scratch space for all reconstructed files.  NOTE(review): os.mkdir raises
# if a previous run left ./temporary_directory behind — confirm the caller
# cleans it up between runs.
os.mkdir("./temporary_directory")
# Reconstruct the old genome first (60 characters per FASTA line).
reconstruct_dataset(size=60, directory="./Genome", output_file="./temporary_directory/genome_old.fa",
                    mode="Genome", seqID="0", region="0")
# Reconstruct all the files related to the variants and annotation datasets.
# reconstruct_annotation_variants() already iterates over every non-genome
# dataset internally, so it must be invoked at most once; the original loop
# called it once per non-empty dataset, repeating the full reconstruction
# and indexing work for each extra dataset present.
if any(dataset != "Genome" and len(ToUpdate[dataset]) != 0 for dataset in ToUpdate):
    reconstruct_annotation_variants(ToUpdate)
##############################################
#   PART 2. OBTAIN THE ALIGNMENT INFORMATION #
##############################################
# Obtain the information of the alignment between both assemblies.
# Determine the alignment pickle: the directory name under which the
# alignment of this assembly pair would be cached.
alignment_pickle = obtain_alignment_pickle("./temporary_directory/genome_old.fa",
                                           new_assembly)
print(alignment_pickle)
# Check if this FASTA file has already been added and stored in the repository.
# NOTE(review): this assumes the pickle path has the shape a/b/<hash>_... so
# that split("_")[0] and split("/")[2] isolate the assembly hash — confirm
# against obtain_alignment_pickle.
new_assembly_hash = alignment_pickle.split("_")[0]
comp_hash = new_assembly_hash.split("/")[2]
# Any stored delta whose filename contains this hash means the assembly is known.
stored_alignments = [hash for hash in glob.glob("Delta/*"+comp_hash+"*")]
# to check new hash for comparing first and second commits: a pickle name of
# "<hash>_<hash>" means the new assembly equals the old one.
if (alignment_pickle == new_assembly_hash+"_"+comp_hash):
    stored_alignments.append(alignment_pickle)
if (len(stored_alignments) != 0):
    print("\n\t *** THE REPOSITORY ALREADY CONTAINS THIS FASTA FILE.***\n\tNow aborting.")
    sys.exit(1)
# Load the cached alignment or compute it, then persist it for later updates.
variables = obtain_variables(alignment_pickle)
store_variables(variables=variables, alignment_pickle=alignment_pickle)
queries, alignment_pickle, summary_Dict, oldSeqs, newSeqs = variables
###################################################################
#   PART 3. START THE INTERPRETATION OF THE ALIGNMENT INFORMATION #
###################################################################
# Only run interpretation when a dependent dataset exists to be lifted over.
if ("Annotation" in ToUpdate or "Alignment" in ToUpdate or "Variants" in ToUpdate):
    # Inform the user
    print("\n\t*PART III. INTERPRETATION OF THE ALIGNMENT INFORMATION AND CREATION OF UPDATED FILES")
    print("\t{}".format(str(datetime.datetime.now())))
    # Interpret the information contained in the delta_dict and obtain the updated files.
    interpret_alignment(queries=queries, threads=number_threads, ToUpdate=ToUpdate,
                        tlength=template_length, new_assembly=new_assembly)
# ###################################################################
# #   PART 4. OBTAIN ALIGNMENT INFORMATION FOR OLDER COMMITS        #
# ###################################################################
# NOTE(review): disabled feature, kept for reference; check_output is only
# imported for this block.
# if (ggw != 0):
#     # Get list of commits and head commit
#     commitList = check_output(
#         "git log --format=%H", shell=True).decode("utf-8").split()
#     print(commitList)
#     # first commit is HEAD commit
#     commitList.pop(0)
#     # headCommit = check_output(
#     #     "git rev-parse HEAD", shell=True).decode("utf-8").rstrip()
#     if (len(commitList) != 0):
#         print("\n\t*PART IV. OBTAIN ALIGNMENT DATA FOR OLDER COMMITS")
#         # remove previous assembly so it will not align to old assembly
#         for commit in commitList:
#             # if (commit != headCommit):
#             ShellCommand = Popen("git checkout " + commit +
#                                  " > /dev/null", shell=True).wait()
#             # reconstruct
#             reconstruct_dataset(size=60, directory="./Genome",
#                                 output_file="./temporary_directory/genome_{}.fa".format(commit), mode="Genome")
#             ShellCommand = Popen(
#                 "git checkout master > /dev/null", shell=True).wait()
#             alignment_pickle = obtain_alignment_pickle(
#                 "./temporary_directory/genome_{}.fa".format(commit), new_assembly)
#             print(alignment_pickle, commit)
#             variables = obtain_alignment(old_assembly="./temporary_directory/genome_{}.fa".format(commit),
#                                          new_assembly=new_assembly,
#                                          directory="./temporary_directory",
#                                          threads=number_threads, ToUpdate=ToUpdate, alignment_pickle=alignment_pickle,
#                                          aligner_switch=aligner_switch, percent_identity=percent_identity,
#                                          kmer=kmer, segLength=segLength, c_flag=c_flag, b_flag=b_flag, ms_flag=ms_flag)
#             store_variables(variables=variables,
#                             alignment_pickle=alignment_pickle)
#     os.remove("./temporary_directory/genome_old.fa")
# Inform the user the update is completed
print("\n\t***UPDATE COMPLETED: NOW PARSING THE GENOME DATASET***")