Commit

Add support for annotation files
alinja committed May 20, 2021
1 parent fc7806a commit 8669436
Showing 7 changed files with 335 additions and 21 deletions.
66 changes: 54 additions & 12 deletions haploy.py
@@ -6,8 +6,10 @@
from bs4 import BeautifulSoup
import urllib.request
import json
import glob

def print_uptree(snpset, ut, do_print=True, b3x='b37'):
prev_gl=[]
rep=''
y=snpset['Y'];
pos=0
@@ -17,11 +19,33 @@ def print_uptree(snpset, ut, do_print=True, b3x='b37'):
txt=''
if 'txt' in mut:
txt=mut['txt']
otherg=mut['isog']
if mut['ftg'] != '?':
if mut['isog']:
otherg+=', '
otherg+=mut['ftg']
if mut[b3x] in y:
rep += "%-1s%-11s%s %-30s %-32s %s\n"%(mut['tag'], mut['g'], y[mut[b3x]]['gen'], mut['raw'], mut['isog'], txt)
rep += "%-1s%-11s%s %-30s %-32s %s\n"%(mut['tag'], mut['g'], y[mut[b3x]]['gen'], mut['raw'], otherg, txt)
else:
rep += "%-1s%-11s%s %-30s %-32s %s\n"%(mut['tag'], mut['g'], ' ', mut['raw'], mut['isog'], txt)
rep += "%-1s%-11s%s %-30s %-32s %s\n"%(mut['tag'], mut['g'], ' ', mut['raw'], otherg, txt)
pass

if not mut['g'] in prev_gl:
if mut['g'] in annotations_by_g:
for anno in annotations_by_g[mut['g']]:
rep += "%45s[Y] %s\n"%('',anno['txt'])
prev_gl.append(mut['g'])
if not mut['ftg'] in prev_gl:
if mut['ftg'] in annotations_by_g and not mut['g'] in annotations_by_g:
for anno in annotations_by_g[mut['ftg']]:
rep += "%45s[F] %s\n"%('',anno['txt'])
if mut['ftg'] != '?':
prev_gl.append(mut['ftg'])
for m in mut['raw'].split('/'):
m2=m.replace('(H)','')
if m2 in annotations_by_m:
for anno in annotations_by_m[m2]:
rep += "%45s[M] %s\n"%('',anno['txt'])
if do_print:
print(rep)
return rep
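
For reference, the lookup added above keys annotations on three things: the node's own group name (mut['g'], printed with a [Y] tag), its FTDNA group name (mut['ftg'], printed as [F] only when the own name has no annotation of its own), and the individual SNP names split out of mut['raw'] with any '(H)' suffix stripped (printed as [M]). A minimal sketch of the in-memory layout it expects, with made-up annotation texts; the real dicts are filled by load_annotations() further down:

# Illustration only - texts are hypothetical, keys follow the example DB in this commit
annotations_by_g = {'N-VL29': [{'g': 'N-VL29', 'txt': 'YFULL: SAMPLE123: Finland'}]}
annotations_by_m = {'VL29':   [{'m': 'VL29',   'txt': 'North European'}]}
# For a tree node with mut['g'] == 'N-VL29' and mut['raw'] == 'VL29', the loop appends
# one indented line per match, e.g.
#     [Y] YFULL: SAMPLE123: Finland
#     [M] North European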
@@ -522,7 +546,6 @@ def load_yfull_snp(pages):

haplo_ybrowse_info = ''
haplo_ybrowse_muts_by_name = {}
haplo_ybrowse_muts_by_b38 = {}

# Source: http://www.ybrowse.org/gbrowse2/gff/
def load_ybrowse_snp():
@@ -568,8 +591,6 @@ def load_ybrowse_snp():
}
if mname not in haplo_ybrowse_muts_by_name:
haplo_ybrowse_muts_by_name[mname] = mut
if b38 not in haplo_ybrowse_muts_by_b38:
haplo_ybrowse_muts_by_b38[b38] = mut
print("Lines in YBrowse snp DB: ", len(haplo_ybrowse_muts_by_name))

# Convert formats with CrossMap and chain file in crossmap/
@@ -777,13 +798,8 @@ def decode_entry(e):
else:
if mut['t'] != m['t']:
print('FTDNA der mismatch:', e, mut['t'], m['t'], mut, m)
#if not 'isog' in m:
# m['isog']=''
#else:
if 'isog' in m:
if m['isog']:
m['isog']+=', '
m['isog']+=mut['g']+''
if 'b38' in m:
m['ftg']=mut['g']
return m

def yfull_fname(group):
@@ -894,6 +910,14 @@ def yfull_recurse_list(ul_in, level, fileroot):
#muts['f']=dec['f']
mutse['t']=dec['t']
mutse['isog']=dec['isog']
mutse['ftg']='?'
if 'ftg' in dec:
mutse['ftg']=dec['ftg']
#discard far matches
if not mutse['isog'].startswith(mutse['g'][0]):
mutse['isog']=''
if not mutse['ftg'].startswith(mutse['g'][0]):
mutse['ftg']='?'
mutse['b36']=dec['b36']
mutse['b37']=dec['b37']
mutse['b38']=dec['b38']
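
The 'far matches' filter above only compares the leading haplogroup letter, so an ISOGG or FTDNA label from another branch is discarded rather than shown next to an unrelated node. A runnable illustration with made-up values:

# Hypothetical entry: the ISOGG label points to another branch, the FTDNA label does not
mutse = {'g': 'N1a1', 'isog': 'R1a1b', 'ftg': 'N-VL29'}
if not mutse['isog'].startswith(mutse['g'][0]):
    mutse['isog'] = ''    # 'R1a1b' does not start with 'N' -> dropped
if not mutse['ftg'].startswith(mutse['g'][0]):
    mutse['ftg'] = '?'    # 'N-VL29' starts with 'N' -> kept unchanged
print(mutse)              # {'g': 'N1a1', 'isog': '', 'ftg': 'N-VL29'}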
@@ -1030,5 +1054,23 @@ def import_ftdna_tree():
skip=1
print('FTDNA Tree database size is %d nodes'%len(haplo_ftdna_muts_list))

annotations_by_g = {}
annotations_by_m = {}
def load_annotations(fname):
files = glob.glob(fname)
for fn in files:
with open(fn, 'r') as f:
print('Loading annotation file %s'%fn)
jdata = json.load(f)
for anno in jdata['annotation']:
#print(anno)
if 'g' in anno and anno['g']:
if not anno['g'] in annotations_by_g:
annotations_by_g[anno['g']] = []
annotations_by_g[anno['g']].append(anno)
if 'm' in anno and anno['m']:
if not anno['m'] in annotations_by_m:
annotations_by_m[anno['m']] = []
annotations_by_m[anno['m']].append(anno)
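
A minimal usage sketch, mirroring the call added to haploy_find.py at the end of this commit; the input file name and the second report() argument are placeholders, and load_db2j() is shown without the keyword arguments haploy_find.py passes:

import haploy

haploy.load_db2j()                              # main haplogroup DB, loaded as before
haploy.load_annotations('haploy_annodb_*.txt')  # every annotation DB matching the glob
rep = haploy.report('my_raw_data.txt', 20)      # annotation lines now appear in the report
print(rep)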


241 changes: 241 additions & 0 deletions haploy_anno_import.py
@@ -0,0 +1,241 @@
#!/usr/bin/python3
import re
import csv
import os
from bs4 import BeautifulSoup
import urllib.request
import json
import glob




def yfull_fname(group):
if group:
return 'yfull/yfull-ytree-'+group+'.html'
else:
return 'yfull/yfull-ytree.html'

def yfull_url(group):
if group:
return 'https://www.yfull.com/tree/' + group + '/'
else:
return 'https://www.yfull.com/tree/'

# YFull ytree import (experimental)
def download_yfull_file(group):
try:
os.mkdir('yfull')
except OSError:
pass
fname = yfull_fname(group)
url = yfull_url(group)
print('Downloading ' + url + ' to file: ' + fname)
#urllib.request.urlretrieve("https://www.yfull.com/tree/"+group+"/", fname);

def yfull_parse_muts(li):
s=''
snpforhg=li.find('span', class_='yf-snpforhg', recursive=False)
if snpforhg:
s+=snpforhg.text
plussnps=li.find('span', class_='yf-plus-snps', recursive=False)
if plussnps:
s += ' * ' + plussnps['title']
o=[]
if len(s) > 0:
for m in s.split('*'):
o.append(m.strip())
return o

def yfull_parse_age(li):
s=''
agespan=li.find('span', class_='yf-age', recursive=False)
if agespan:
s+=agespan.text
return s

def yfull_parse_person(li):
sams=[]
ul = li.find('ul', recursive=False)
if ul:
lis = ul.find_all('li', recursive=False)
else:
return sams
if not lis:
return sams
for li in lis:
has_sample=0
sam=''
if li.has_attr('valsampleid'):
sam+=li['valsampleid']+ ': '
has_sample=1
for geo in li.find_all('b', recursive=False):
if geo.has_attr('class') and 'yf-geo' in geo['class'] and 'fl' in geo['class']:
if geo.has_attr('title'):
sam+=geo['title']
if geo.has_attr('original-title'):
sam+=geo['original-title']
sam+=' '
if geo.has_attr('class') and 'yf-geo' in geo['class'] and 'yf-lang' in geo['class']:
if geo.has_attr('title'):
sam+=geo['title']
if geo.has_attr('original-title'):
sam+=geo['original-title']
for geo in li.find_all('span', recursive=False):
if geo.has_attr('class') and 'yf-a-age' in geo['class']:
if geo.has_attr('title'):
sam+=geo['title']
if geo.has_attr('original-title'):
sam+=geo['original-title']
if has_sample:
sams.append(sam)
#sam+=' '
#print(sam)
return sams

def yfull_is_tree_quirk(group_name, fileroot):
if fileroot:
return False
if group_name=='R-P312':
return True
if group_name=='R-Z2118':
return True
return False

def yfull_recurse_list(ul_in, level, fileroot):
lis = ul_in.find_all('li', recursive=False)
for li in lis:
#print(li.get_text())
muts={}
muts['l']=level
g=li.find('a', recursive=False)
group_name=''
if g:
group_name=g.text
muts['g']=g.text
txts = yfull_parse_person(li)
grp=g.text.strip('*')
for txt in txts:
print(grp, txt)
anno = {
"g": grp,
"txt": 'YFULL: %s'%(txt)
}
annos.append(anno)
l=li.find('a', href=True, recursive=False)
if l:
muts['link']=l['href']


ul = li.find('ul', recursive=False)
if ul and not yfull_is_tree_quirk(group_name, fileroot):
#print('->')
yfull_recurse_list(ul, level+1, False)
#print('<-')
else:
if 'g' in muts and muts['g'].endswith('*'):
continue
if 'link' in muts:
group=muts['link'].split('/')[-2]
#print('FILE: ' +fname)
yfull_recurse_file(group, level)
#print('END: ' +fname)
return 0

def yfull_recurse_file(group, level):
fname = yfull_fname(group)
try:
with open(fname) as f:
pass
except OSError:
print('File not found: ' +fname)
download_yfull_file(group)

with open(fname) as f:
print('Importing file: ' +fname)
soup = BeautifulSoup(f.read(), features="html.parser")
ul = soup.find('ul', id='tree')
yfull_recurse_list(ul, level, True)
#yfull_get_info(soup)

def import_yfull_tree(gr):
yfull_recurse_file(gr, 0)





# FTDNA project chart import: scrape a saved project page's member table (kit, paternal ancestor, country, haplogroup) into annotations keyed by haplogroup
def import_ftdna_chart(fname, info=''):
with open(fname) as f:
print('Importing file: ' +fname)
soup = BeautifulSoup(f.read(), features="html.parser")

#rows = soup.find('div', id='MainContent_color1_GridView1').find('table').find_all("tr")
#rows = soup.find('table').find_all("tr")
rows = soup.find('div', {"id" : re.compile('MainContent.*')}).find('table').find_all("tr")

kiti = -1
pati = -1
coui = -1
gri = -1
row = rows[0]
ths = row.find_all("th")
for i, th in enumerate(ths):
if 'Kit' in th.get_text():
kiti = i
if 'Paternal' in th.get_text():
pati = i
if 'Country' in th.get_text():
coui = i
if 'Haplogroup' in th.get_text():
gri = i
for row in rows:
tds = row.find_all("td")
if len(tds)>1:
kit=''
pat=''
cou=''
gr=''
kit = tds[kiti].get_text().strip()
if pati >= 0:
pat = tds[pati].get_text().strip()
if coui >= 0:
cou = tds[coui].get_text().strip()
gr = tds[gri].get_text().strip()
if not gr:
continue
#if 'MIN' in kit or 'MAX' in kit or 'MODE' in kit:
# continue
print(kit, pat, gr)
anno = {
"g": gr,
"txt": 'FTDNA: %s %s %s'%(kit, pat, info)
}
annos.append(anno)

def save_anno(fname):
jroot={
'info': 'haploy_anno_import.py',
'annotation': annos }
with open(fname, 'w') as f:
json.dump(jroot, f, indent=1)


# Example annotations - it probably doesn't make sense for everyone to import every project

annos=[]
import_yfull_tree('A00')
import_yfull_tree('A0-T')
#import_yfull_tree('N-FGC28435')
#import_yfull_tree('N')
save_anno('haploy_annodb_yfull.txt')

annos=[]
import_ftdna_chart('ftdna/FamilyTreeDNA - Estonia.htm', '[Estonia]')
import_ftdna_chart('ftdna/FamilyTreeDNA - Saami Project.htm', '[Saami]')
import_ftdna_chart('ftdna/FamilyTreeDNA - I1 Suomi Finland & N-CTS8565 -projekti.htm', '[I1 Suomi]')
import_ftdna_chart('ftdna/FamilyTreeDNA - Finland DNA Project.htm', '[FinlandDNA]')
import_ftdna_chart('ftdna/FamilyTreeDNA - RussiaDNA Project.htm', '[RussiaDNA]')
import_ftdna_chart('ftdna/FamilyTreeDNA - R1a1a and Subclades Y-DNA Project.htm', '[R1a1a]')
save_anno('haploy_annodb_ftdnatest.txt')
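
A rough end-to-end sketch of how these generated files are meant to be used; the commands are illustrative, and the YFull tree pages under yfull/ and the saved FTDNA project pages under ftdna/ must already be present locally:

# 1. Build the annotation DBs (writes haploy_annodb_yfull.txt and haploy_annodb_ftdnatest.txt):
#        python3 haploy_anno_import.py
# 2. Run the finder; the glob added to haploy_find.py below picks the DBs up automatically:
#        python3 haploy_find.py <raw_dna_file>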
17 changes: 17 additions & 0 deletions haploy_annodb_example.txt
@@ -0,0 +1,17 @@
{
"info": "Test file",
"annotation": [
{"m": "M2019", "txt": "Yakuts"},
{"m": "Y13850", "txt": "Ugric"},
{"m": "Y10932", "txt": "Rurikids"},
{"m": "L550", "txt": "Scandic/Baltic"},
{"m": "Z1933", "txt": "Savo/Karjala"},
{"m": "CTS9976", "txt": "Finns"},
{"m": "VL62", "txt": "Karjala"},
{"m": "CTS8565", "txt": "Savo"},
{"m": "M7414", "txt": "Kärsä-Laitinen"},
{"m": "VL29", "txt": "North European"},
{"m": "PH521", "txt": "Lapland"},
{"m": "L1022", "txt": "Häme/Länsi-Suomi"}
]
}
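
Entries in these files are keyed either by a mutation name ('m', as in this example) or by a haplogroup name ('g', as written by haploy_anno_import.py above); load_annotations() buckets them into annotations_by_m and annotations_by_g respectively. A hypothetical group-keyed entry, for illustration only:

{"g": "N-VL29", "txt": "North European (group-level note)"}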
1 change: 1 addition & 0 deletions haploy_find.py
@@ -41,6 +41,7 @@
print("Loading DB2...")
haploy.load_db2j(min_tree_load_level=min_tree_load_level)
print("DB loaded!")
haploy.load_annotations('haploy_annodb_*.txt')
rep = haploy.report(args.file[0], n_single, do_all=all, filt=filt, force=force, min_match_level=min_match_level)
print(rep)
else:
