-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrm_expression.py
123 lines (97 loc) · 3.54 KB
/
crm_expression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 15 13:37:54 2015
@author: Richard
"""
import numpy as np
# use this function to het the expression level in remanei males and females
def expression_male_female(expression_male_female_file):
'''
(file) -> dict
Returns a dictionnary for each gene in the expression file with
a tuple including the average expression in fermale, in male, and p-value
of the mean difference between female and male
'''
# create dictionnary {gene : (female, male, p-val)}
expression = {}
# open file for reading
infile = open(expression_male_female_file, 'r')
# skip header
infile.readline()
# read file and store relevant info in dict
for line in infile:
line = line.rstrip()
if line != '':
line = line.split()
# modify gene so that it matches in the GFF file
gene = 'CRE_PX356_' + line[0]
# get expression level in males and females
female = float(line[4])
male = float(line[8])
# get the p-value comparing expression difference between males and females
p_val = float(line[9])
expression[gene] = (female, male, p_val)
infile.close()
return expression
# use this function to assign gene names to remanei gene IDs
def crem_wormbase_gene_ID(gene_ID_file):
'''
(file) -> dict
Returns a dictionnary with the wormbase gene ID as key and
the remanei gene name as value
'''
# create dictionnary {WBGene : CRE0000}
identifiers = {}
# open file for reading
ID = open(gene_ID_file, 'r')
# go through the file
for line in ID:
line = line.rstrip()
if line != '':
line = line.split(',')
# use only live genes
if line[-1] == 'Live':
if line[1].startswith('WBG') and line[-2].startswith('CRE'):
identifiers[line[1]] = line[-2]
ID.close()
return identifiers
# use this function to get average expression level across development
def expression_developmental_stages(expression_file, gene_ID_file):
'''
(file, file) -> dict
Returns a dictionnary with cremanei gene name and average expression
level measured across embryonic development
'''
# get the cremanei gene names for each remanei Wormbase ID
identifiers = crem_wormbase_gene_ID(gene_ID_file)
# make a dict with average expression for each WBGene ID
expression_WBG = {}
# open expression file for reading
infile = open(expression_file, 'r')
infile.readline()
# go through the file
for line in infile:
line = line.rstrip()
if line != '':
# convert str to list
line = line.split()
# extract WBGene ID from list
gene = line.pop(0)
# convert strings in list to floats
line = list(map(lambda x: float(x), line))
# compute mean expression
mean = np.mean(line)
expression_WBG[gene] = mean
# create dict {CRE_gene_ID: mean eexpression}
expression = {}
for gene in identifiers:
# check if gene has expression
if gene in expression_WBG:
# get the remanei gene name
gene_name = identifiers[gene]
# modify remanei gene name to match with gene names in the GFF file
gene_name = 'CRE_PX356_' + gene_name
expression[gene_name] = expression_WBG[gene]
# close file after reading
infile.close()
return expression