From 86694365227f34b8fe34e29a26b9ad357e0efb52 Mon Sep 17 00:00:00 2001 From: Jani Alinikula Date: Thu, 20 May 2021 22:48:22 +0300 Subject: [PATCH] Add support for annotation files --- haploy.py | 66 +++++++++-- haploy_anno_import.py | 241 ++++++++++++++++++++++++++++++++++++++ haploy_annodb_example.txt | 17 +++ haploy_find.py | 1 + mk_snipsawin.sh | 19 ++- snipsa-gui.py | 10 +- snipsa.bat | 2 +- 7 files changed, 335 insertions(+), 21 deletions(-) create mode 100755 haploy_anno_import.py create mode 100755 haploy_annodb_example.txt diff --git a/haploy.py b/haploy.py index 1870448..f928ee4 100755 --- a/haploy.py +++ b/haploy.py @@ -6,8 +6,10 @@ from bs4 import BeautifulSoup import urllib.request import json +import glob def print_uptree(snpset, ut, do_print=True, b3x='b37'): + prev_gl=[] rep='' y=snpset['Y']; pos=0 @@ -17,11 +19,33 @@ def print_uptree(snpset, ut, do_print=True, b3x='b37'): txt='' if 'txt' in mut: txt=mut['txt'] + otherg=mut['isog'] + if mut['ftg'] != '?': + if mut['isog']: + otherg+=', ' + otherg+=mut['ftg'] if mut[b3x] in y: - rep += "%-1s%-11s%s %-30s %-32s %s\n"%(mut['tag'], mut['g'], y[mut[b3x]]['gen'], mut['raw'], mut['isog'], txt) + rep += "%-1s%-11s%s %-30s %-32s %s\n"%(mut['tag'], mut['g'], y[mut[b3x]]['gen'], mut['raw'], otherg, txt) else: - rep += "%-1s%-11s%s %-30s %-32s %s\n"%(mut['tag'], mut['g'], ' ', mut['raw'], mut['isog'], txt) + rep += "%-1s%-11s%s %-30s %-32s %s\n"%(mut['tag'], mut['g'], ' ', mut['raw'], otherg, txt) pass + + if not mut['g'] in prev_gl: + if mut['g'] in annotations_by_g: + for anno in annotations_by_g[mut['g']]: + rep += "%45s[Y] %s\n"%('',anno['txt']) + prev_gl.append(mut['g']) + if not mut['ftg'] in prev_gl: + if mut['ftg'] in annotations_by_g and not mut['g'] in annotations_by_g: + for anno in annotations_by_g[mut['ftg']]: + rep += "%45s[F] %s\n"%('',anno['txt']) + if mut['ftg'] != '?': + prev_gl.append(mut['ftg']) + for m in mut['raw'].split('/'): + m2=m.replace('(H)','') + if m2 in annotations_by_m: + for anno in annotations_by_m[m2]: + rep += "%45s[M] %s\n"%('',anno['txt']) if do_print: print(rep) return rep @@ -522,7 +546,6 @@ def load_yfull_snp(pages): haplo_ybrowse_info = '' haplo_ybrowse_muts_by_name = {} -haplo_ybrowse_muts_by_b38 = {} # Source: http://www.ybrowse.org/gbrowse2/gff/ def load_ybrowse_snp(): @@ -568,8 +591,6 @@ def load_ybrowse_snp(): } if mname not in haplo_ybrowse_muts_by_name: haplo_ybrowse_muts_by_name[mname] = mut - if b38 not in haplo_ybrowse_muts_by_b38: - haplo_ybrowse_muts_by_b38[b38] = mut print("Lines in YBrowse snp DB: ", len(haplo_ybrowse_muts_by_name)) # Convert formats with CrossMap and chain file in crossmap/ @@ -777,13 +798,8 @@ def decode_entry(e): else: if mut['t'] != m['t']: print('FTDNA der mismatch:', e, mut['t'], m['t'], mut, m) - #if not 'isog' in m: - # m['isog']='' - #else: - if 'isog' in m: - if m['isog']: - m['isog']+=', ' - m['isog']+=mut['g']+'' + if 'b38' in m: + m['ftg']=mut['g'] return m def yfull_fname(group): @@ -894,6 +910,14 @@ def yfull_recurse_list(ul_in, level, fileroot): #muts['f']=dec['f'] mutse['t']=dec['t'] mutse['isog']=dec['isog'] + mutse['ftg']='?' + if 'ftg' in dec: + mutse['ftg']=dec['ftg'] + #discard far matches + if not mutse['isog'].startswith(mutse['g'][0]): + mutse['isog']='' + if not mutse['ftg'].startswith(mutse['g'][0]): + mutse['ftg']='?' mutse['b36']=dec['b36'] mutse['b37']=dec['b37'] mutse['b38']=dec['b38'] @@ -1030,5 +1054,23 @@ def import_ftdna_tree(): skip=1 print('FTDNA Tree database size is %d nodes'%len(haplo_ftdna_muts_list)) +annotations_by_g = {} +annotations_by_m = {} +def load_annotations(fname): + files = glob.glob(fname) + for fn in files: + with open(fn, 'r') as f: + print('Loading annotation file %s'%fn) + jdata = json.load(f) + for anno in jdata['annotation']: + #print(anno) + if 'g' in anno and anno['g']: + if not anno['g'] in annotations_by_g: + annotations_by_g[anno['g']] = [] + annotations_by_g[anno['g']].append(anno) + if 'm' in anno and anno['m']: + if not anno['m'] in annotations_by_m: + annotations_by_m[anno['m']] = [] + annotations_by_m[anno['m']].append(anno) diff --git a/haploy_anno_import.py b/haploy_anno_import.py new file mode 100755 index 0000000..293aa95 --- /dev/null +++ b/haploy_anno_import.py @@ -0,0 +1,241 @@ +#!/usr/bin/python3 +import re +import csv +import os +from bs4 import BeautifulSoup +import urllib.request +import json +import glob + + + + +def yfull_fname(group): + if group: + return 'yfull/yfull-ytree-'+group+'.html' + else: + return 'yfull/yfull-ytree.html' + +def yfull_url(group): + if group: + return 'https://www.yfull.com/tree/' + group + '/' + else: + return 'https://www.yfull.com/tree/' + +# YFull mtree import (experimental) +def download_yfull_file(group): + try: + os.mkdir('yfull') + except OSError: + pass + fname = yfull_fname(group) + url = yfull_url(group) + print('Downloading ' + url + 'to file: ' + fname) + #urllib.request.urlretrieve("https://www.yfull.com/tree/"+group+"/", fname); + +def yfull_parse_muts(li): + s='' + snpforhg=li.find('span', class_='yf-snpforhg', recursive=False) + if snpforhg: + s+=snpforhg.text + plussnps=li.find('span', class_='yf-plus-snps', recursive=False) + if plussnps: + s += ' * ' + plussnps['title'] + o=[] + if len(s) > 0: + for m in s.split('*'): + o.append(m.strip()) + return o + +def yfull_parse_age(li): + s='' + agespan=li.find('span', class_='yf-age', recursive=False) + if agespan: + s+=agespan.text + return s + +def yfull_parse_person(li): + sams=[] + ul = li.find('ul', recursive=False) + if ul: + lis = ul.find_all('li', recursive=False) + else: + return sams + if not lis: + return sams + for li in lis: + has_sample=0 + sam='' + if li.has_attr('valsampleid'): + sam+=li['valsampleid']+ ': ' + has_sample=1 + for geo in li.find_all('b', recursive=False): + if geo.has_attr('class') and 'yf-geo' in geo['class'] and 'fl' in geo['class']: + if geo.has_attr('title'): + sam+=geo['title'] + if geo.has_attr('original-title'): + sam+=geo['original-title'] + sam+=' ' + if geo.has_attr('class') and 'yf-geo' in geo['class'] and 'yf-lang' in geo['class']: + if geo.has_attr('title'): + sam+=geo['title'] + if geo.has_attr('original-title'): + sam+=geo['original-title'] + for geo in li.find_all('span', recursive=False): + if geo.has_attr('class') and 'yf-a-age' in geo['class']: + if geo.has_attr('title'): + sam+=geo['title'] + if geo.has_attr('original-title'): + sam+=geo['original-title'] + if has_sample: + sams.append(sam) + #sam+=' ' + #print(sam) + return sams + +def yfull_is_tree_quirk(group_name, fileroot): + if fileroot: + return False + if group_name=='R-P312': + return True + if group_name=='R-Z2118': + return True + return False + +def yfull_recurse_list(ul_in, level, fileroot): + lis = ul_in.find_all('li', recursive=False) + for li in lis: + #print(li.get_text()) + muts={} + muts['l']=level + g=li.find('a', recursive=False) + group_name='' + if g: + group_name=g.text + muts['g']=g.text + txts = yfull_parse_person(li) + grp=g.text.strip('*') + for txt in txts: + print(grp, txt) + anno = { + "g": grp, + "txt": 'YFULL: %s'%(txt) + } + annos.append(anno) + l=li.find('a', href=True, recursive=False) + if l: + muts['link']=l['href'] + + + ul = li.find('ul', recursive=False) + if ul and not yfull_is_tree_quirk(group_name, fileroot): + #print('->') + yfull_recurse_list(ul, level+1, False) + #print('<-') + else: + if 'g' in muts and muts['g'].endswith('*'): + continue + if 'link' in muts: + group=muts['link'].split('/')[-2] + #print('FILE: ' +fname) + yfull_recurse_file(group, level) + #print('END: ' +fname) + return 0 + +def yfull_recurse_file(group, level): + fname = yfull_fname(group) + try: + with open(fname) as f: + pass + except OSError: + print('File not found: ' +fname) + download_yfull_file(group) + + with open(fname) as f: + print('Importing file: ' +fname) + soup = BeautifulSoup(f.read(), features="html.parser") + ul = soup.find('ul', id='tree') + yfull_recurse_list(ul, level, True) + #yfull_get_info(soup) + +def import_yfull_tree(gr): + yfull_recurse_file(gr, 0) + + + + + +# +def import_ftdna_chart(fname, info=''): + with open(fname) as f: + print('Importing file: ' +fname) + soup = BeautifulSoup(f.read(), features="html.parser") + + #rows = soup.find('div', id='MainContent_color1_GridView1').find('table').find_all("tr") + #rows = soup.find('table').find_all("tr") + rows = soup.find('div', {"id" : re.compile('MainContent.*')}).find('table').find_all("tr") + + kiti = -1 + pati = -1 + coui = -1 + gri = -1 + row = rows[0] + ths = row.find_all("th") + for i, th in enumerate(ths): + if 'Kit' in th.get_text(): + kiti = i + if 'Paternal' in th.get_text(): + pati = i + if 'Country' in th.get_text(): + coui = i + if 'Haplogroup' in th.get_text(): + gri = i + for row in rows: + tds = row.find_all("td") + if len(tds)>1: + kit='' + pat='' + cou='' + gr='' + kit = tds[kiti].get_text().strip() + if pati >= 0: + pat = tds[pati].get_text().strip() + if coui >= 0: + cou = tds[coui].get_text().strip() + gr = tds[gri].get_text().strip() + if not gr: + continue + #if 'MIN' in kit or 'MAX' in kit or 'MODE' in kit: + # continue + print(kit, pat, gr) + anno = { + "g": gr, + "txt": 'FTDNA: %s %s %s'%(kit, pat, info) + } + annos.append(anno) + +def save_anno(fname): + jroot={ + 'info': 'haploy_anno_iport.py', + 'annotation': annos } + with open(fname, 'w') as f: + json.dump(jroot, f, indent=1); + + +# Example annotations - it probably doesn't make sense for everyone to import every project + +annos=[] +import_yfull_tree('A00') +import_yfull_tree('A0-T') +#import_yfull_tree('N-FGC28435') +#import_yfull_tree('N') +save_anno('haploy_annodb_yfull.txt') + +annos=[] +import_ftdna_chart('ftdna/FamilyTreeDNA - Estonia.htm', '[Estonia]') +import_ftdna_chart('ftdna/FamilyTreeDNA - Saami Project.htm', '[Saami]') +import_ftdna_chart('ftdna/FamilyTreeDNA - I1 Suomi Finland & N-CTS8565 -projekti.htm', '[I1 Suomi]') +import_ftdna_chart('ftdna/FamilyTreeDNA - Finland DNA Project.htm', '[FinlandDNA]') +import_ftdna_chart('ftdna/FamilyTreeDNA - RussiaDNA Project.htm', '[RussiaDNA]') +import_ftdna_chart('ftdna/FamilyTreeDNA - R1a1a and Subclades Y-DNA Project.htm', '[R1a1a]') +save_anno('haploy_annodb_ftdnatest.txt') diff --git a/haploy_annodb_example.txt b/haploy_annodb_example.txt new file mode 100755 index 0000000..d67ef62 --- /dev/null +++ b/haploy_annodb_example.txt @@ -0,0 +1,17 @@ +{ + "info": "Test file", + "annotation": [ + {"m": "M2019", "txt": "Yakuts"}, + {"m": "Y13850", "txt": "Ugric"}, + {"m": "Y10932", "txt": "Rurikids"}, + {"m": "L550", "txt": "Scandic/Baltic"}, + {"m": "Z1933", "txt": "Savo/Karjala"}, + {"m": "CTS9976", "txt": "Finns"}, + {"m": "VL62", "txt": "Karjala"}, + {"m": "CTS8565", "txt": "Savo"}, + {"m": "M7414", "txt": "Kärsä-Laitinen"}, + {"m": "VL29", "txt": "North European"}, + {"m": "PH521", "txt": "Lapland"}, + {"m": "L1022", "txt": "Häme/Länsi-Suomi"} + ] +} \ No newline at end of file diff --git a/haploy_find.py b/haploy_find.py index 5d130d6..aaf0d2f 100755 --- a/haploy_find.py +++ b/haploy_find.py @@ -41,6 +41,7 @@ print("Loading DB2...") haploy.load_db2j(min_tree_load_level=min_tree_load_level) print("DB loaded!") + haploy.load_annotations('haploy_annodb_*.txt') rep = haploy.report(args.file[0], n_single, do_all=all, filt=filt, force=force, min_match_level=min_match_level) print(rep) else: diff --git a/mk_snipsawin.sh b/mk_snipsawin.sh index 2330003..f6f3dd2 100755 --- a/mk_snipsawin.sh +++ b/mk_snipsawin.sh @@ -13,7 +13,7 @@ set -e #python embed and pre-installed tk/other libs as a corresponding zip is needed in the folder SNIPSAWIN_DIR=snipsawin -PYTHON_EMBED=python-3.9.2-embed-amd64 +PYTHON_EMBED=python-3.8.5-embed-amd64-preinstalled echo Preparing Snipsa Windows package... @@ -24,10 +24,9 @@ mkdir $SNIPSAWIN_DIR #python echo Installing python environment ... mkdir $SNIPSAWIN_DIR/$PYTHON_EMBED -unzip -q $PYTHON_EMBED.zip -d $SNIPSAWIN_DIR/$PYTHON_EMBED -unzip -q python-3.9.2-libs.zip -d $SNIPSAWIN_DIR/$PYTHON_EMBED +unzip -q $PYTHON_EMBED.zip -d $SNIPSAWIN_DIR -echo python39.zip > temp._pth +echo python38.zip > temp._pth echo . >> temp._pth echo .\\Lib >> temp._pth echo .\\DLLs >> temp._pth @@ -41,12 +40,22 @@ mkdir $SNIPSAWIN_DIR/snipsa cp \ haplomt.py haplomt_find.py haplomt_db_import.py \ haploy.py haploy_find.py haploy_db_import.py \ - haplomt_map.txt haploy_map2j.txt \ snpload.py \ bamload.py \ snipsa-gui.py \ $SNIPSAWIN_DIR/snipsa/ +#DB licenses included in implementation +#TODO: FTDNA +cp haplomt_map.txt haploy_map2j.txt haploy_annodb_example.txt haploy_annodb_yfull.txt $SNIPSAWIN_DIR/snipsa/ + +# Ybrowse (CC BY-NC-SA 3.0) +# https://isogg.org/wiki/ISOGG_Wiki:Terms_of_Use +#cp str_hg19.gff3 str_hg38.gff3 snps_hg38.gff3 $SNIPSAWIN_DIR/snipsa/ +cp str_hg19.gff3 str_hg38.gff3 $SNIPSAWIN_DIR/snipsa/ + +#http://May2021.archive.ensembl.org/info/about/legal/disclaimer.html free use license +cp -a crossmap $SNIPSAWIN_DIR/snipsa/ #starter cp snipsa.bat $SNIPSAWIN_DIR/ diff --git a/snipsa-gui.py b/snipsa-gui.py index 1dd46e2..1a948cf 100755 --- a/snipsa-gui.py +++ b/snipsa-gui.py @@ -7,7 +7,7 @@ import haplomt import haploy -bam_support=0 +bam_support=1 if bam_support: import bamload @@ -15,15 +15,18 @@ def handle_bam_select(): bamname = tkinter.filedialog.askopenfile().name - bamload.get_build(bamname) + in_build = bamload.get_build(bamname) + bamload.setup_conv(in_build) snpset = bamload.full_convert(bamname) snpload.save(bamname+'.snp.txt', snpset, 38) - message='SNP file written to ' + bamname+'.snp.txt' + snpname = bamname+'.snp.txt' + message = 'SNP file written to ' + snpname scr.config(state=tk.NORMAL) scr.delete('1.0', tk.END) scr.insert('1.0', message) scr.config(state=tk.DISABLED) scr.see("end") + fnamevar.set(snpname) def handle_file_select(): @@ -57,6 +60,7 @@ def handle_findy(): if not dby_loaded: dby_loaded = 1 haploy.load_db2j() + haploy.load_annotations('haploy_annodb_*.txt') do_uptree = report_snps.get() do_all = report_all.get() force = pathvar.get() diff --git a/snipsa.bat b/snipsa.bat index 22545ae..79ec429 100755 --- a/snipsa.bat +++ b/snipsa.bat @@ -1,2 +1,2 @@ @cd snipsa -@..\python-3.9.2-embed-amd64\python.exe snipsa-gui.py +@..\python-3.8.5-embed-amd64-preinstalled\python.exe snipsa-gui.py