Bug fix for parsing gene names from GTF

francois-a · Oct 24, 2017 · ccf1845
1 parent fd21f3c
commit ccf1845
Showing 1 changed file with 3 additions and 4 deletions.
diff --git a/python/annotate_outputs.py b/python/annotate_outputs.py
@@ -30,10 +30,9 @@
     for row in gtf:
         row = row.strip().split('\t')
         if row[0][0]=='#' or row[2]!='gene': continue
-        attributes = row[8].split('; ',5)
-        gene_id = attributes[0].split()[1].replace('"','')
-        gene_name = attributes[4].split()[1].replace('"','')
-        gene_dict[gene_id] = [gene_name, row[0], row[3], row[4], row[6]]
+        # get gene_id and gene_name from attributes
+        attr = dict([i.split() for i in row[8].replace('"','').split(';') if i!=''])
+        gene_dict[attr['gene_id']] = [attr['gene_name'], row[0], row[3], row[4], row[6]]
 
 print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Annotating permutation results (eGenes)', flush=True)
 gene_df = pd.read_csv(args.permutation_results, sep='\t', index_col=0)