Commit

Add support for annotation files
alinja committed May 20, 2021
1 parent fc7806a commit 8669436
Showing 7 changed files with 335 additions and 21 deletions.
66 changes: 54 additions & 12 deletions haploy.py
@@ -6,8 +6,10 @@
from bs4 import BeautifulSoup
import urllib.request
import json
import glob

def print_uptree(snpset, ut, do_print=True, b3x='b37'):
prev_gl=[]
rep=''
y=snpset['Y'];
pos=0
@@ -17,11 +19,33 @@ def print_uptree(snpset, ut, do_print=True, b3x='b37'):
txt=''
if 'txt' in mut:
txt=mut['txt']
otherg=mut['isog']
if mut['ftg'] != '?':
if mut['isog']:
otherg+=', '
otherg+=mut['ftg']
if mut[b3x] in y:
rep += "%-1s%-11s%s %-30s %-32s %s\n"%(mut['tag'], mut['g'], y[mut[b3x]]['gen'], mut['raw'], mut['isog'], txt)
rep += "%-1s%-11s%s %-30s %-32s %s\n"%(mut['tag'], mut['g'], y[mut[b3x]]['gen'], mut['raw'], otherg, txt)
else:
rep += "%-1s%-11s%s %-30s %-32s %s\n"%(mut['tag'], mut['g'], ' ', mut['raw'], mut['isog'], txt)
rep += "%-1s%-11s%s %-30s %-32s %s\n"%(mut['tag'], mut['g'], ' ', mut['raw'], otherg, txt)
pass

if not mut['g'] in prev_gl:
if mut['g'] in annotations_by_g:
for anno in annotations_by_g[mut['g']]:
rep += "%45s[Y] %s\n"%('',anno['txt'])
prev_gl.append(mut['g'])
if not mut['ftg'] in prev_gl:
if mut['ftg'] in annotations_by_g and not mut['g'] in annotations_by_g:
for anno in annotations_by_g[mut['ftg']]:
rep += "%45s[F] %s\n"%('',anno['txt'])
if mut['ftg'] != '?':
prev_gl.append(mut['ftg'])
for m in mut['raw'].split('/'):
m2=m.replace('(H)','')
if m2 in annotations_by_m:
for anno in annotations_by_m[m2]:
rep += "%45s[M] %s\n"%('',anno['txt'])
if do_print:
print(rep)
return rep
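
For reference, the lookup added above keys annotations on three things: the node's own group name (mut['g'], printed with a [Y] tag), its FTDNA group name (mut['ftg'], printed as [F] only when the own name has no annotation of its own), and the individual SNP names split out of mut['raw'] with any '(H)' suffix stripped (printed as [M]). A minimal sketch of the in-memory layout it expects, with made-up annotation texts; the real dicts are filled by load_annotations() further down:

# Illustration only - texts are hypothetical, keys follow the example DB in this commit
annotations_by_g = {'N-VL29': [{'g': 'N-VL29', 'txt': 'YFULL: SAMPLE123: Finland'}]}
annotations_by_m = {'VL29':   [{'m': 'VL29',   'txt': 'North European'}]}
# For a tree node with mut['g'] == 'N-VL29' and mut['raw'] == 'VL29', the loop appends
# one indented line per match, e.g.
#     [Y] YFULL: SAMPLE123: Finland
#     [M] North European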
@@ -522,7 +546,6 @@ def load_yfull_snp(pages):

haplo_ybrowse_info = ''
haplo_ybrowse_muts_by_name = {}
haplo_ybrowse_muts_by_b38 = {}

# Source: http://www.ybrowse.org/gbrowse2/gff/
def load_ybrowse_snp():
@@ -568,8 +591,6 @@ def load_ybrowse_snp():
}
if mname not in haplo_ybrowse_muts_by_name:
haplo_ybrowse_muts_by_name[mname] = mut
if b38 not in haplo_ybrowse_muts_by_b38:
haplo_ybrowse_muts_by_b38[b38] = mut
print("Lines in YBrowse snp DB: ", len(haplo_ybrowse_muts_by_name))

# Convert formats with CrossMap and chain file in crossmap/
@@ -777,13 +798,8 @@ def decode_entry(e):
else:
if mut['t'] != m['t']:
print('FTDNA der mismatch:', e, mut['t'], m['t'], mut, m)
#if not 'isog' in m:
# m['isog']=''
#else:
if 'isog' in m:
if m['isog']:
m['isog']+=', '
m['isog']+=mut['g']+''
if 'b38' in m:
m['ftg']=mut['g']
return m

def yfull_fname(group):
@@ -894,6 +910,14 @@ def yfull_recurse_list(ul_in, level, fileroot):
#muts['f']=dec['f']
mutse['t']=dec['t']
mutse['isog']=dec['isog']
mutse['ftg']='?'
if 'ftg' in dec:
mutse['ftg']=dec['ftg']
#discard far matches
if not mutse['isog'].startswith(mutse['g'][0]):
mutse['isog']=''
if not mutse['ftg'].startswith(mutse['g'][0]):
mutse['ftg']='?'
mutse['b36']=dec['b36']
mutse['b37']=dec['b37']
mutse['b38']=dec['b38']
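
The 'far matches' filter above only compares the leading haplogroup letter, so an ISOGG or FTDNA label from another branch is discarded rather than shown next to an unrelated node. A runnable illustration with made-up values:

# Hypothetical entry: the ISOGG label points to another branch, the FTDNA label does not
mutse = {'g': 'N1a1', 'isog': 'R1a1b', 'ftg': 'N-VL29'}
if not mutse['isog'].startswith(mutse['g'][0]):
    mutse['isog'] = ''    # 'R1a1b' does not start with 'N' -> dropped
if not mutse['ftg'].startswith(mutse['g'][0]):
    mutse['ftg'] = '?'    # 'N-VL29' starts with 'N' -> kept unchanged
print(mutse)              # {'g': 'N1a1', 'isog': '', 'ftg': 'N-VL29'}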
@@ -1030,5 +1054,23 @@ def import_ftdna_tree():
skip=1
print('FTDNA Tree database size is %d nodes'%len(haplo_ftdna_muts_list))

annotations_by_g = {}
annotations_by_m = {}
def load_annotations(fname):
files = glob.glob(fname)
for fn in files:
with open(fn, 'r') as f:
print('Loading annotation file %s'%fn)
jdata = json.load(f)
for anno in jdata['annotation']:
#print(anno)
if 'g' in anno and anno['g']:
if not anno['g'] in annotations_by_g:
annotations_by_g[anno['g']] = []
annotations_by_g[anno['g']].append(anno)
if 'm' in anno and anno['m']:
if not anno['m'] in annotations_by_m:
annotations_by_m[anno['m']] = []
annotations_by_m[anno['m']].append(anno)
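
A minimal usage sketch, mirroring the call added to haploy_find.py at the end of this commit; the input file name and the second report() argument are placeholders, and load_db2j() is shown without the keyword arguments haploy_find.py passes:

import haploy

haploy.load_db2j()                              # main haplogroup DB, loaded as before
haploy.load_annotations('haploy_annodb_*.txt')  # every annotation DB matching the glob
rep = haploy.report('my_raw_data.txt', 20)      # annotation lines now appear in the report
print(rep)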


241 changes: 241 additions & 0 deletions haploy_anno_import.py
@@ -0,0 +1,241 @@
#!/usr/bin/python3
import re
import csv
import os
from bs4 import BeautifulSoup
import urllib.request
import json
import glob




def yfull_fname(group):
if group:
return 'yfull/yfull-ytree-'+group+'.html'
else:
return 'yfull/yfull-ytree.html'

def yfull_url(group):
if group:
return 'https://www.yfull.com/tree/' + group + '/'
else:
return 'https://www.yfull.com/tree/'

# YFull ytree import (experimental)
def download_yfull_file(group):
try:
os.mkdir('yfull')
except OSError:
pass
fname = yfull_fname(group)
url = yfull_url(group)
print('Downloading ' + url + ' to file: ' + fname)
#urllib.request.urlretrieve("https://www.yfull.com/tree/"+group+"/", fname);

def yfull_parse_muts(li):
s=''
snpforhg=li.find('span', class_='yf-snpforhg', recursive=False)
if snpforhg:
s+=snpforhg.text
plussnps=li.find('span', class_='yf-plus-snps', recursive=False)
if plussnps:
s += ' * ' + plussnps['title']
o=[]
if len(s) > 0:
for m in s.split('*'):
o.append(m.strip())
return o

def yfull_parse_age(li):
s=''
agespan=li.find('span', class_='yf-age', recursive=False)
if agespan:
s+=agespan.text
return s

def yfull_parse_person(li):
sams=[]
ul = li.find('ul', recursive=False)
if ul:
lis = ul.find_all('li', recursive=False)
else:
return sams
if not lis:
return sams
for li in lis:
has_sample=0
sam=''
if li.has_attr('valsampleid'):
sam+=li['valsampleid']+ ': '
has_sample=1
for geo in li.find_all('b', recursive=False):
if geo.has_attr('class') and 'yf-geo' in geo['class'] and 'fl' in geo['class']:
if geo.has_attr('title'):
sam+=geo['title']
if geo.has_attr('original-title'):
sam+=geo['original-title']
sam+=' '
if geo.has_attr('class') and 'yf-geo' in geo['class'] and 'yf-lang' in geo['class']:
if geo.has_attr('title'):
sam+=geo['title']
if geo.has_attr('original-title'):
sam+=geo['original-title']
for geo in li.find_all('span', recursive=False):
if geo.has_attr('class') and 'yf-a-age' in geo['class']:
if geo.has_attr('title'):
sam+=geo['title']
if geo.has_attr('original-title'):
sam+=geo['original-title']
if has_sample:
sams.append(sam)
#sam+=' '
#print(sam)
return sams

def yfull_is_tree_quirk(group_name, fileroot):
if fileroot:
return False
if group_name=='R-P312':
return True
if group_name=='R-Z2118':
return True
return False

def yfull_recurse_list(ul_in, level, fileroot):
lis = ul_in.find_all('li', recursive=False)
for li in lis:
#print(li.get_text())
muts={}
muts['l']=level
g=li.find('a', recursive=False)
group_name=''
if g:
group_name=g.text
muts['g']=g.text
txts = yfull_parse_person(li)
grp=g.text.strip('*')
for txt in txts:
print(grp, txt)
anno = {
"g": grp,
"txt": 'YFULL: %s'%(txt)
}
annos.append(anno)
l=li.find('a', href=True, recursive=False)
if l:
muts['link']=l['href']


ul = li.find('ul', recursive=False)
if ul and not yfull_is_tree_quirk(group_name, fileroot):
#print('->')
yfull_recurse_list(ul, level+1, False)
#print('<-')
else:
if 'g' in muts and muts['g'].endswith('*'):
continue
if 'link' in muts:
group=muts['link'].split('/')[-2]
#print('FILE: ' +fname)
yfull_recurse_file(group, level)
#print('END: ' +fname)
return 0

def yfull_recurse_file(group, level):
fname = yfull_fname(group)
try:
with open(fname) as f:
pass
except OSError:
print('File not found: ' +fname)
download_yfull_file(group)

with open(fname) as f:
print('Importing file: ' +fname)
soup = BeautifulSoup(f.read(), features="html.parser")
ul = soup.find('ul', id='tree')
yfull_recurse_list(ul, level, True)
#yfull_get_info(soup)

def import_yfull_tree(gr):
yfull_recurse_file(gr, 0)





# FTDNA project chart import: scrape a saved project page's member table (kit, paternal ancestor, country, haplogroup) into annotations keyed by haplogroup
def import_ftdna_chart(fname, info=''):
with open(fname) as f:
print('Importing file: ' +fname)
soup = BeautifulSoup(f.read(), features="html.parser")

#rows = soup.find('div', id='MainContent_color1_GridView1').find('table').find_all("tr")
#rows = soup.find('table').find_all("tr")
rows = soup.find('div', {"id" : re.compile('MainContent.*')}).find('table').find_all("tr")

kiti = -1
pati = -1
coui = -1
gri = -1
row = rows[0]
ths = row.find_all("th")
for i, th in enumerate(ths):
if 'Kit' in th.get_text():
kiti = i
if 'Paternal' in th.get_text():
pati = i
if 'Country' in th.get_text():
coui = i
if 'Haplogroup' in th.get_text():
gri = i
for row in rows:
tds = row.find_all("td")
if len(tds)>1:
kit=''
pat=''
cou=''
gr=''
kit = tds[kiti].get_text().strip()
if pati >= 0:
pat = tds[pati].get_text().strip()
if coui >= 0:
cou = tds[coui].get_text().strip()
gr = tds[gri].get_text().strip()
if not gr:
continue
#if 'MIN' in kit or 'MAX' in kit or 'MODE' in kit:
# continue
print(kit, pat, gr)
anno = {
"g": gr,
"txt": 'FTDNA: %s %s %s'%(kit, pat, info)
}
annos.append(anno)

def save_anno(fname):
jroot={
'info': 'haploy_anno_import.py',
'annotation': annos }
with open(fname, 'w') as f:
json.dump(jroot, f, indent=1)


# Example annotations - it probably doesn't make sense for everyone to import every project

annos=[]
import_yfull_tree('A00')
import_yfull_tree('A0-T')
#import_yfull_tree('N-FGC28435')
#import_yfull_tree('N')
save_anno('haploy_annodb_yfull.txt')

annos=[]
import_ftdna_chart('ftdna/FamilyTreeDNA - Estonia.htm', '[Estonia]')
import_ftdna_chart('ftdna/FamilyTreeDNA - Saami Project.htm', '[Saami]')
import_ftdna_chart('ftdna/FamilyTreeDNA - I1 Suomi Finland & N-CTS8565 -projekti.htm', '[I1 Suomi]')
import_ftdna_chart('ftdna/FamilyTreeDNA - Finland DNA Project.htm', '[FinlandDNA]')
import_ftdna_chart('ftdna/FamilyTreeDNA - RussiaDNA Project.htm', '[RussiaDNA]')
import_ftdna_chart('ftdna/FamilyTreeDNA - R1a1a and Subclades Y-DNA Project.htm', '[R1a1a]')
save_anno('haploy_annodb_ftdnatest.txt')
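
A rough end-to-end sketch of how these generated files are meant to be used; the commands are illustrative, and the YFull tree pages under yfull/ and the saved FTDNA project pages under ftdna/ must already be present locally:

# 1. Build the annotation DBs (writes haploy_annodb_yfull.txt and haploy_annodb_ftdnatest.txt):
#        python3 haploy_anno_import.py
# 2. Run the finder; the glob added to haploy_find.py below picks the DBs up automatically:
#        python3 haploy_find.py <raw_dna_file>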
17 changes: 17 additions & 0 deletions haploy_annodb_example.txt
@@ -0,0 +1,17 @@
{
"info": "Test file",
"annotation": [
{"m": "M2019", "txt": "Yakuts"},
{"m": "Y13850", "txt": "Ugric"},
{"m": "Y10932", "txt": "Rurikids"},
{"m": "L550", "txt": "Scandic/Baltic"},
{"m": "Z1933", "txt": "Savo/Karjala"},
{"m": "CTS9976", "txt": "Finns"},
{"m": "VL62", "txt": "Karjala"},
{"m": "CTS8565", "txt": "Savo"},
{"m": "M7414", "txt": "Kärsä-Laitinen"},
{"m": "VL29", "txt": "North European"},
{"m": "PH521", "txt": "Lapland"},
{"m": "L1022", "txt": "Häme/Länsi-Suomi"}
]
}
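
Entries in these files are keyed either by a mutation name ('m', as in this example) or by a haplogroup name ('g', as written by haploy_anno_import.py above); load_annotations() buckets them into annotations_by_m and annotations_by_g respectively. A hypothetical group-keyed entry, for illustration only:

{"g": "N-VL29", "txt": "North European (group-level note)"}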
1 change: 1 addition & 0 deletions haploy_find.py
@@ -41,6 +41,7 @@
print("Loading DB2...")
haploy.load_db2j(min_tree_load_level=min_tree_load_level)
print("DB loaded!")
haploy.load_annotations('haploy_annodb_*.txt')
rep = haploy.report(args.file[0], n_single, do_all=all, filt=filt, force=force, min_match_level=min_match_level)
print(rep)
else:
