-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsequencing.py
127 lines (96 loc) · 4.36 KB
/
sequencing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 14 12:10:24 2023
@author: rjovelin
"""
import json
import os
from utilities import connect_to_db
def collect_sequence_info(project_name, database):
    '''
    (str, str) -> list

    Returns a list with sequence file information for a project of interest.
    Each element is a row with the file, workflow, run/lane/platform and
    library fields selected by the query below, restricted to files produced
    by the fastq-generating or import workflows listed in the query

    Parameters
    ----------
    - project_name (str): Project of interest
    - database (str): Path to the sqlite database
    '''

    # the project name is bound as a query parameter instead of being formatted
    # into the SQL text, so a quote in the name cannot alter the statement
    query = (
        "SELECT Files.file, Files.workflow, Files.version, Files.wfrun_id, Files.attributes, "
        "Workflow_Inputs.run, Workflow_Inputs.lane, Workflow_Inputs.platform, "
        "Libraries.library, Libraries.case_id, Libraries.ext_id, Libraries.group_id, "
        "Libraries.group_id_description, Libraries.library_type, Libraries.tissue_origin, "
        "Libraries.tissue_type "
        "FROM Files JOIN Workflow_Inputs JOIN Libraries "
        "WHERE Files.project_id = :project_id "
        "AND Workflow_Inputs.project_id = :project_id "
        "AND Libraries.project_id = :project_id "
        "AND Files.wfrun_id = Workflow_Inputs.wfrun_id "
        "AND Workflow_Inputs.library = Libraries.library "
        "AND LOWER(Files.workflow) IN "
        "('casava', 'bcl2fastq', 'fileimportforanalysis', 'fileimport', 'import_fastq');"
    )

    conn = connect_to_db(database)
    try:
        files = conn.execute(query, {'project_id': project_name}).fetchall()
    finally:
        # release the connection even when the query fails
        conn.close()

    return files
def get_sequences(L):
    '''
    (list) -> list

    Returns a list of dictionaries with sequence file information, one
    dictionary per read 1 fastq, sorted by case.
    Pre-condition: all fastqs are paired-fastqs. Only records with read number
    '1' are reported; records with any other read number are dropped.
    Note: L is sorted in place by file path

    Parameters
    ----------
    - L (list): List of records (e.g. sqlite3.Row extracted from the database)
                containing sequence file information and indexable by column name
    '''

    # sort list according to files (in place, so the caller's list is reordered)
    L.sort(key=lambda x: x['file'])

    F = []
    for row in L:
        # parse the json attributes once per record
        attributes = json.loads(row['attributes'])
        # keep only read1
        if attributes['read_number'] != '1':
            continue

        case = row['case_id']
        tissue_origin = row['tissue_origin']
        tissue_type = row['tissue_type']
        group_id = row['group_id']

        # read count is optional; format it with thousands separators when present
        read_count = attributes.get('read_count', 'NA')
        if read_count != 'NA':
            read_count = '{:,}'.format(int(read_count))

        # file prefix is the file name without its last "_"-separated field
        fileprefix = '_'.join(os.path.basename(row['file']).split('_')[:-1])

        F.append({
            'case': case,
            'sample': row['ext_id'],
            'sample_id': '_'.join([case, tissue_origin, tissue_type, group_id]),
            'library': row['library'],
            'run': row['run'] + '_' + str(row['lane']),
            'read_count': read_count,
            'workflow': row['workflow'] + '_' + row['version'],
            'prefix': fileprefix,
            'platform': row['platform'],
            'group_id': group_id,
            'group_description': row['group_id_description'],
            'tissue_type': tissue_type,
            'library_type': row['library_type'],
            'tissue_origin': tissue_origin,
        })

    # stable sort: records of the same case stay ordered by file path
    F.sort(key=lambda x: x['case'])

    return F
def platform_name(project_name, database):
    '''
    (str, str) -> dict

    Returns a dictionary mapping each sequencing platform recorded for the
    project of interest to its short name. The short name is the last
    non-empty "_"-separated field of the platform once numeric characters are
    removed, in lower case (e.g. Illumina_NovaSeq_6000 -> novaseq)

    Parameters
    ----------
    - project_name (str): Project of interest
    - database (str): Path to the sqlite database
    '''

    # the project name is bound as a query parameter instead of being formatted
    # into the SQL text, so a quote in the name cannot alter the statement
    query = ("SELECT DISTINCT Workflow_Inputs.platform FROM Workflow_Inputs "
             "WHERE Workflow_Inputs.project_id = ?;")

    conn = connect_to_db(database)
    try:
        data = conn.execute(query, (project_name,)).fetchall()
    finally:
        # release the connection even when the query fails
        conn.close()

    D = {}
    for row in data:
        platform = row['platform']
        # drop numeric characters (model numbers), then split into fields
        letters = ''.join(ch for ch in platform if not ch.isnumeric())
        fields = [field for field in letters.split('_') if field]
        # short name is the last remaining field
        D[platform] = fields[-1].lower()

    return D