-
Notifications
You must be signed in to change notification settings - Fork 0
/
module_5.py
87 lines (70 loc) · 3.02 KB
/
module_5.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 1 16:03:57 2021
@author: Yihang Zhou
Contact: [email protected]
https://github.com/Y-H-Joe/
####============================ description ==============================####
## module 5
Process the repeatmaster output tables and store into filtered_tables.
CRPG_input.tsv:
code,read_len,num_reads,avg_genome_cov,genome_size
SUSSC,151,693665606,42.36,2472461935
#================================== input =====================================
#================================== output ====================================
#================================ parameters ==================================
#================================== example ===================================
python3 current_script_dir/module_5.py CRPG_loc name name_folder_dir output_dir
## Cal_Repeats_Per_Genome_and_Percent_of_Len_Linux.py will automatically find
## the name_repeatsummary.tsv file in the name_folder_dir given the CRPG_input.tsv
python3 cal_repeats_per_genome_and_percent_of_len_linux.py name_folder_dir CRPG_input.tsv output_tables output_figs
#================================== warning ===================================
####=======================================================================####
"""
import os
import subprocess
import sys

import pandas as pd
#os.system('taskset -p %s' %os.getpid())
def read_stat(stat_file):
    """Return ``(reads_num, bases_num)`` read from a ``<name>.stat`` file.

    The file is parsed as a headerless, tab-separated table. The number of
    reads is taken from the second column of the first row and the number of
    bases from the second column of the second row, e.g.::

        reads   7008213708
        bases   707829584508

    Both values are returned as floats.
    """
    stat_df = pd.read_csv(stat_file, header=None, sep="\t")
    reads_num = float(stat_df.loc[0, 1])
    bases_num = float(stat_df.loc[1, 1])
    return reads_num, bases_num


def write_crpg_input(name, name_folder_dir):
    """Write ``CRPG_input.tsv`` into *name_folder_dir* and return its path.

    Inputs read from *name_folder_dir*:
      * ``<name>.stat``  -- read and base counts (see ``read_stat``)
      * ``genome_size``  -- the genome size as an integer on the first line

    Columns written (tab-separated, one data row):
      code, read_len, num_reads, avg_genome_cov, genome_size
    where ``read_len = bases / reads`` and
    ``avg_genome_cov = bases / genome_size``, both rounded to two decimals.

    Raises:
        ValueError: if the read count or the genome size is not positive,
            or if the genome_size file does not start with an integer.
    """
    stat_file = os.path.join(name_folder_dir, name + ".stat")
    reads_num, bases_num = read_stat(stat_file)

    genome_size_loc = os.path.join(name_folder_dir, "genome_size")
    with open(genome_size_loc, "r") as f:
        genome_size = int(f.readline())

    # Fail with a message that names the offending file instead of a bare
    # ZeroDivisionError from the divisions below.
    if reads_num <= 0:
        raise ValueError("non-positive read count in %s" % stat_file)
    if genome_size <= 0:
        raise ValueError("non-positive genome size in %s" % genome_size_loc)

    read_len = float("%.2f" % (bases_num / reads_num))
    avg_genome_cov = float("%.2f" % (bases_num / genome_size))

    CRPG_input_df = pd.DataFrame.from_dict({
        "code": [name],
        "read_len": [read_len],
        "num_reads": [reads_num],
        "avg_genome_cov": [avg_genome_cov],
        "genome_size": [genome_size],
    })

    CRPG_input_loc = os.path.join(name_folder_dir, "CRPG_input.tsv")
    CRPG_input_df.to_csv(CRPG_input_loc, sep="\t", index=False)
    return CRPG_input_loc


def build_crpg_cmd(CRPG_loc, name_folder_dir, CRPG_input_loc, output_dir):
    """Return the argument list that launches the CRPG script.

    The script receives, in order: the sample folder, the CRPG_input.tsv
    path, ``<output_dir>/TRIP_results/filtered_tables`` and
    ``<output_dir>/TRIP_results/barplots``.
    """
    results_dir = os.path.join(output_dir, "TRIP_results")
    filtered_tables_dir = os.path.join(results_dir, "filtered_tables")
    barplots_dir = os.path.join(results_dir, "barplots")
    return [
        "python3",
        CRPG_loc,
        name_folder_dir,
        CRPG_input_loc,
        filtered_tables_dir,
        barplots_dir,
    ]


def main(argv=None):
    """Prepare CRPG_input.tsv, then run the CRPG script on it.

    Args:
        argv: ``[CRPG_loc, name, name_folder_dir, output_dir]``; defaults to
            ``sys.argv[1:]``. Extra trailing arguments are ignored.

    Returns:
        2 when fewer than four arguments are given, otherwise the exit
        status of the CRPG script.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 4:
        print(
            "usage: python3 module_5.py CRPG_loc name name_folder_dir output_dir",
            file=sys.stderr,
        )
        return 2
    CRPG_loc, name, name_folder_dir, output_dir = args[:4]

    ## prepare the CRPG_input.tsv for Cal_Repeats_Per_Genome_and_Percent_of_Len_Linux.py
    CRPG_input_loc = write_crpg_input(name, name_folder_dir)

    ## pass the CRPG_input.tsv to Cal_Repeats_Per_Genome_and_Percent_of_Len_Linux.py
    CRPG_cmd = build_crpg_cmd(CRPG_loc, name_folder_dir, CRPG_input_loc, output_dir)
    # Flush so this line appears before any output of the child process.
    print("module 5: ", " ".join(CRPG_cmd), flush=True)
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact.
    return subprocess.run(CRPG_cmd).returncode


if __name__ == "__main__":
    sys.exit(main())