-
Notifications
You must be signed in to change notification settings - Fork 9
/
utilities.py
55 lines (46 loc) · 1.55 KB
/
utilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 4 14:55:56 2020
@author:
Lewis Moffat
Github: limitloss
"""
from Bio import SeqIO
def aas2int(seq):
    '''
    Encode a sequence of one-letter amino acid codes as integers.

    The 20 uppercase residue letters map to 0-19 in the order
    A R N D C Q E G H I L K M F P S T W Y V. Every other element of `seq`
    (lowercase letters, gaps, ambiguity codes, ...) maps to 20.

    Returns a list with one integer per element of `seq`.
    '''
    alphabet = "ARNDCQEGHILKMFPSTWYV"
    index_of = {residue: position for position, residue in enumerate(alphabet)}
    unknown = len(alphabet)  # 20: shared index for anything not in the table

    encoded = []
    for res in seq:
        encoded.append(index_of.get(res, unknown))
    return encoded
def LEGACY_loadfasta(fasta_file="test_seqs.fas"):
    '''
    Assumes a fasta file containing a single sequence.

    Any line containing ">" is taken as the header (kept verbatim, including
    the ">"); every other line is concatenated into the sequence. If the file
    holds several headers, the last one wins and all non-header lines are
    still joined into one sequence.

    Returns a list [name, iseqs, seqs]: the header line, the sequence encoded
    by aas2int, and the sequence string.

    Raises ValueError if the file contains no header line.
    '''
    name = None
    chunks = []
    with open(fasta_file, "r") as f:
        for line in f:
            line = line.rstrip("\n")
            if ">" in line:
                name = line
            else:
                chunks.append(line)
    # Without this guard a headerless or empty file used to fail later with
    # an UnboundLocalError on `name`, which hides the real problem.
    if name is None:
        raise ValueError(
            "no FASTA header line ('>') found in {!r}".format(fasta_file)
        )
    seqs = ''.join(chunks)
    iseqs = aas2int(seqs)
    return [name, iseqs, seqs]
def loadfasta(fasta_file="test_seqs.fas"):
    '''
    Takes a single FASTA file containing a variable number of sequences.

    Returns a list of each example, where each example is a list itself of the
    Biopython-determined name string (unmodified), the sequence as a list of
    integers, and the normalised sequence as a string.

    The sequence is normalised before encoding: trailing whitespace is
    stripped, letters are uppercased and '-' gap characters are removed, so
    the integer list and the returned string always have the same length.
    '''
    sequences = []
    # Iterate the parser directly; there is no need to hold every record in
    # memory before processing the first one.
    for record in SeqIO.parse(fasta_file, "fasta"):
        name = record.name
        # Sanity checks for lowercase characters, gaps, etc. These belong on
        # the sequence: aas2int only knows uppercase residues, so lowercase
        # letters or gaps would otherwise all be encoded as unknown (20).
        seq = str(record.seq).rstrip().upper().replace('-', '')
        iseq = aas2int(seq)
        sequences.append([name, iseq, seq])
    return sequences