diff --git a/README.md b/README.md index b0e8cd5..d23b0c7 100755 --- a/README.md +++ b/README.md @@ -34,6 +34,13 @@ CrossMap (`pip3 install CrossMap`), conversion chain file (`crossmap/GRCh38_to_N the ISOGG spreadsheet in csv format (`'SNP Index - Human.csv'`). Outputs a `haploy_map.txt` file which is used by haploy_find.py. See haploy.py for details. +## haploy_anno_import.py + +This example script is used to import your own annotations that can be attached to the reported tree nodes. As an example the script will +import YFull person IDs, and also some selected FTDNA project files. Open project chart tables in a browser, select page +size so that everything fits in one page, and edit the corresponding lines at the end of the script. Snipsa will load any +files starting with haploy_annodb. An example file is included. + ## haplomt_find.py This small tool reads a raw SNP data file and lists MT chromosome haplogroup information. You must first initialize the mutation diff --git a/haplomt.py b/haplomt.py index e546d79..a05b504 100755 --- a/haplomt.py +++ b/haplomt.py @@ -64,9 +64,9 @@ def print_data(do_print=True): return rep -def report(fname, n, do_uptree=True, do_extra=True, do_all=False, filt='', force=''): +def report(fname, n, do_uptree=True, do_extra=True, do_all=False, filt='', force='', vcf_sample='', force_build=0): rep='' - snpset, meta = snpload.load(fname, ['MT']) + snpset, meta = snpload.load(fname, ['MT'], vcf_sample=vcf_sample, force_build=force_build) if 'MT' not in snpset: return "No MT data found\n" diff --git a/haplomt_find.py b/haplomt_find.py index 7902e01..5c8bfb8 100755 --- a/haplomt_find.py +++ b/haplomt_find.py @@ -11,11 +11,15 @@ force='' filt='' all=False +force_build=0 +vcf_sample='' parser = argparse.ArgumentParser() parser.add_argument('-s', '--single', help='Analyse a path for single group') parser.add_argument('-a', '--all', action='store_true', help='Show listing of all found mutations') parser.add_argument('-n', '--num', help='Show num best matches') +parser.add_argument('-v', '--vcf-sample', help='VCF sample select (regexp)') +parser.add_argument('-b', '--build', help='Force build36/37&38 input') parser.add_argument('file', nargs='+') args = parser.parse_args() @@ -27,6 +31,10 @@ n_multi = int(args.num) if args.all: all=True +if args.vcf_sample: + vcf_sample = args.vcf_sample +if args.build: + force_build = int(args.build) if len(args.file) < 1: print(sys.argv[0]+" ") @@ -37,7 +45,7 @@ print("DB loaded!") print("Loading chr data...") - rep = haplomt.report(args.file[0], n_single, do_all=all, filt=filt, force=force) + rep = haplomt.report(args.file[0], n_single, do_all=all, filt=filt, force=force, vcf_sample=vcf_sample, force_build=force_build) print(rep) else: @@ -48,7 +56,7 @@ lookfor = args.file[0].split(',') for fname in args.file[1:]: #try: - snpset, meta = snpload.load(fname, ['MT']) + snpset, meta = snpload.load(fname, ['MT'], vcf_sample=vcf_sample, force_build=force_build) if 'MT' not in snpset: print('%s: no MT data'%fname) diff --git a/haploy.py b/haploy.py index f928ee4..da1bac8 100755 --- a/haploy.py +++ b/haploy.py @@ -133,9 +133,9 @@ def path_str(ut, n): return rep -def report(fname, n, do_uptree=True, do_extra=True, do_all=False, filt='', force='', min_match_level=0): +def report(fname, n, do_uptree=True, do_extra=True, do_all=False, filt='', force='', min_match_level=0, vcf_sample='', force_build=0): rep='' - snpset, meta = snpload.load(fname, ['Y']) + snpset, meta = snpload.load(fname, ['Y'], vcf_sample=vcf_sample, force_build=force_build) if 'Y' not in snpset: return "No Y data found\n" @@ -714,7 +714,7 @@ def show_db2(): print(m) blacklist_etc='M8990' #str etc -blacklist_yb='Z1908 Y13952 Z6171 PF1401 M11813 YSC0001289 BY2821 M11836 M3745 M11838 M11843' +blacklist_yb='M3745' blacklist_yf='Z8834 Z7451 YP1757 YP2129 YP1822 YP1795 YP2228 YP1809 YP2229 YP1948 YP2226 YP1827 L508' blacklist_yf+=' Y125394 Y125393 Y125392 Y125391 Y125390 Y125389 Y125396 Y125397 Y125408 [report-spacer] V1896 PAGE65.1 Y2363 PF3515 PF3512 PF3507 PF3596 Z6023 M547 A3073 Z1716 PF5827 PF1534 PF6011 PF2634 PF2635' blacklist_rootambi='BY229589 Z2533 DFZ77 M11801 FT227770 Y3946 Y125419 FT227767 Y1578 CTS12490 FT227774 YP1740 Y125394 Y125393 Y125392 Y125391 Y125390' #TODO diff --git a/haploy_find.py b/haploy_find.py index aaf0d2f..1b3f4c3 100755 --- a/haploy_find.py +++ b/haploy_find.py @@ -14,12 +14,16 @@ min_match_level=0 min_tree_load_level=0 new_yfind=1 +force_build=0 +vcf_sample='' parser = argparse.ArgumentParser() parser.add_argument('-s', '--single', help='Analyse a path for single group') parser.add_argument('-a', '--all', action='store_true', help='Show listing of all found mutations') parser.add_argument('-n', '--num', help='Show num best matches') parser.add_argument('-q', '--quick', help='Quick mode') +parser.add_argument('-v', '--vcf-sample', help='VCF sample select (regexp)') +parser.add_argument('-b', '--build', help='Force build36/37&38 input') parser.add_argument('file', nargs='+') args = parser.parse_args() @@ -34,6 +38,10 @@ if args.quick: min_match_level=int(args.quick) min_tree_load_level=int(args.quick) +if args.vcf_sample: + vcf_sample = args.vcf_sample +if args.build: + force_build = int(args.build) if len(args.file) < 2: @@ -42,7 +50,7 @@ haploy.load_db2j(min_tree_load_level=min_tree_load_level) print("DB loaded!") haploy.load_annotations('haploy_annodb_*.txt') - rep = haploy.report(args.file[0], n_single, do_all=all, filt=filt, force=force, min_match_level=min_match_level) + rep = haploy.report(args.file[0], n_single, do_all=all, filt=filt, force=force, min_match_level=min_match_level, vcf_sample=vcf_sample, force_build=force_build) print(rep) else: # keep old one available @@ -69,7 +77,8 @@ haploy.load_db2j(min_tree_load_level=min_tree_load_level) print("DB loaded!") for fname in args.file[1:]: - snpset, meta = snpload.load(fname, ['Y']) + #TODO: loop over vcf samples + snpset, meta = snpload.load(fname, ['Y'], vcf_sample=vcf_sample, force_build=force_build) if 'Y' not in snpset: print('%s: no Y data'%fname) diff --git a/snipsa-gui.py b/snipsa-gui.py index 1a948cf..f3f193f 100755 --- a/snipsa-gui.py +++ b/snipsa-gui.py @@ -44,10 +44,16 @@ def handle_findmt(): do_uptree = report_snps.get() do_all = report_all.get() force = pathvar.get() + vcf_sample = vcfvar.get() + buildstr = buildvar.get() + force_build=0 + if buildstr == 'Build36': force_build = 36 + if buildstr == 'Build37': force_build = 37 + if buildstr == 'Build38': force_build = 38 fname = fnamevar.get() print("Reporting file: "+fname) num = int(numbox.get()) - rep = haplomt.report(fname, num, do_uptree=do_uptree, do_extra=do_uptree, do_all=do_all, filt=force, force=force) + rep = haplomt.report(fname, num, do_uptree=do_uptree, do_extra=do_uptree, do_all=do_all, filt=force, force=force, vcf_sample=vcf_sample, force_build=force_build) print("Done") scr.config(state=tk.NORMAL) scr.delete('1.0', tk.END) @@ -64,10 +70,16 @@ def handle_findy(): do_uptree = report_snps.get() do_all = report_all.get() force = pathvar.get() + vcf_sample = vcfvar.get() + buildstr = buildvar.get() + force_build=0 + if buildstr == 'Build36': force_build = 36 + if buildstr == 'Build37': force_build = 37 + if buildstr == 'Build38': force_build = 38 fname = fnamevar.get() print("Reporting file: "+fname) num = int(numbox.get()) - rep = haploy.report(fname, num, do_uptree=do_uptree, do_extra=do_uptree, do_all=do_all, filt=force, force=force) + rep = haploy.report(fname, num, do_uptree=do_uptree, do_extra=do_uptree, do_all=do_all, filt=force, force=force, vcf_sample=vcf_sample, force_build=force_build) print("Done") scr.config(state=tk.NORMAL) scr.delete('1.0', tk.END) @@ -86,12 +98,25 @@ def handle_findy(): # File fframe=tk.Frame(hdrframe) fframe.pack(fill='both') +if bam_support: + cbutton3 = tk.Button(fframe, text="Import BAM", command=handle_bam_select) + cbutton3.pack(side=tk.LEFT) button = tk.Button(fframe, text="Choose file", command=handle_file_select) button.pack(side=tk.LEFT) fnamevar = tk.StringVar() fnamevar.set("No file selected") fnamelabel=tk.Label(fframe, textvariable=fnamevar, anchor='w') -fnamelabel.pack(side=tk.RIGHT, fill='both', expand=True) +fnamelabel.pack(side=tk.LEFT, fill='both', expand=True) +vcflabel=tk.Label(fframe, text='VCF sample:', anchor='w') +vcflabel.pack(side=tk.LEFT, fill='both') +vcfvar = tk.StringVar() +vcfbox = tk.Entry(fframe, textvariable=vcfvar,width=10) +vcfbox.pack(side=tk.LEFT) +buildvar = tk.StringVar() +buildchoices = {'Auto', 'Build36', 'Build37', 'Build38'} +buildvar.set('Auto') +builddropdown = tk.OptionMenu(fframe, buildvar, *buildchoices) +builddropdown.pack(side=tk.LEFT) # Settings sframe=tk.Frame(hdrframe) @@ -123,9 +148,6 @@ def handle_findy(): cbutton1.pack(side=tk.LEFT, fill='x', expand=True) cbutton2 = tk.Button(cframe, text="Find Y", command=handle_findy) cbutton2.pack(side=tk.LEFT, fill='x', expand=True) -if bam_support: - cbutton3 = tk.Button(cframe, text="Import BAM", command=handle_bam_select) - cbutton3.pack(side=tk.LEFT, fill='x', expand=True) # Result area scr=scrolledtext.ScrolledText(window, wrap=tk.WORD) diff --git a/snpload.py b/snpload.py index d6bd470..8e0be03 100755 --- a/snpload.py +++ b/snpload.py @@ -6,19 +6,28 @@ #fname: filename #crs: chromosomes to read -def load(fname, crs=[]): +def load(fname, crs=[], vcf_sample='', force_build=0): snpset = {} meta = {} + vcf_idx = 0 try: tmpfile = preprocess_file(fname) (build, fmt) = detect_file_format(tmpfile) + if fmt == 'vcf': + idxs = get_vcf_sample_idx(tmpfile, vcf_sample) + #print(idxs) + if len(idxs) > 0: + vcf_idx = idxs[0] + print('VCF sample idx:', vcf_idx) except: print("FORMAT AUTODETECT FAILED!!!!!") meta['build'] = 0 meta['format'] = '' meta['total'] = 0 return (snpset, meta) - + + if force_build: + build = force_build meta['build'] = build meta['format'] = fmt meta['total'] = 0 @@ -31,6 +40,8 @@ def load(fname, crs=[]): importer = import_line_ftdna elif fmt == 'myheritage': importer = import_line_ftdna #same + elif fmt == 'vcf': + importer = import_line_vcf else: print("Undetected format: build%d, %s"%(build, fmt)) meta['total'] = 0 @@ -41,8 +52,8 @@ def load(fname, crs=[]): with open(tmpfile) as f: for line in f: try: - (snp, pos, cr, gen) = importer(line, build) - except: + (snp, pos, cr, gen) = importer(line, build, vcf_idx) + except RuntimeError: continue if gen[0] == "-": @@ -79,14 +90,14 @@ def load(fname, crs=[]): meta['total'] = n_total return (snpset, meta) -def import_line_23andme(line, build): +def import_line_23andme(line, build, idx): if len(line) < 7: - raise Exception() + raise RuntimeError() if line.startswith('#'): - raise Exception() + raise RuntimeError() sline = line.split() if len(sline) < 4: - raise Exception() + raise RuntimeError() cr = sline[1]; pos = sline[2]; gen = sline[3]; @@ -104,16 +115,16 @@ def import_line_23andme(line, build): return (snp, pos, cr, gen) -def import_line_ancestry(line, build): +def import_line_ancestry(line, build, idx): if 'allele1' in line and 'allele2' in line: - raise Exception() + raise RuntimeError() if len(line) < 7: - raise Exception() + raise RuntimeError() if line.startswith('#'): - raise Exception() + raise RuntimeError() sline = line.split() if len(sline) < 4: - raise Exception() + raise RuntimeError() cr = sline[1]; if cr == '23': cr = 'X' @@ -140,16 +151,16 @@ def import_line_ancestry(line, build): return (snp, pos, cr, gen) -def import_line_ftdna(line, build): +def import_line_ftdna(line, build, idx): if len(line) < 7: - raise Exception() + raise RuntimeError() if line.startswith('#'): - raise Exception() + raise RuntimeError() if line.startswith('RSID'): - raise Exception() + raise RuntimeError() sline = line.split(',') if len(sline) < 4: - raise Exception() + raise RuntimeError() cr = sline[1].strip('"'); pos = sline[2].strip('"'); gen = sline[3].strip().strip('"'); @@ -167,6 +178,82 @@ def import_line_ftdna(line, build): return (snp, pos, cr, gen) +def import_line_vcf(line, build, idx): + if len(line) < 7: + raise RuntimeError() + if line.startswith('#'): + raise RuntimeError() + sline = line.split() + if len(sline) < 4: + raise RuntimeError() + cr = sline[0]; + if cr == 'chrY': + cr = 'Y' + elif cr == 'chrMT': + cr = 'MT' + elif cr == 'M': + cr = 'MT' + elif cr == 'chrM': + cr = 'MT' + else: + cr.replace('chr','') + + pos = sline[1] + ref = sline[3] + alt = sline[4] + alls = [ref] + alls+=alt.split(',') + + #skip indel + if len(ref) > 1 or len(alt) > 1: + raise RuntimeError() + + #GT index from format string + form = sline[8] + gti=0 + for i,gtstr in enumerate(form.split(':')): + if gtstr == 'GT': + gti=i + + s = sline[9+idx].split(':') + + #print(pos, alls, gti, s) + if s[gti][0] == '.': + raise RuntimeError() + g = alls[int(s[gti][0])] + + #print(pos, alls, g) + gen = g+g; + + snp = {'id': sline[2], + 'cr': cr, + 'gen': gen } + + if build == 36: + snp['b36'] = pos + elif build == 37: + snp['b37'] = pos + elif build == 38: + snp['b38'] = pos + + return (snp, pos, cr, gen) + +def get_vcf_sample_idx(fname, sname): + lc=0 + idxs=[] + with open(fname) as f: + for line in f: + #print(line) + if line.startswith('#CHROM'): + for i, sam in enumerate(line.split('\t')[9:]): + if re.search(sname, sam): + idxs.append(i) + break + lc+=1 + if lc > 10000: + break + return idxs + def detect_file_format(fname): lc=0 build=0 @@ -174,14 +261,14 @@ def detect_file_format(fname): with open(fname) as f: for line in f: lc += 1 - if lc > 3000: + if lc > 4000: break if fmt != '' and build != 0: break if line.startswith('#'): - if 'build ' in line: + if fmt != 'vcf' and 'build ' in line: build = int(re.findall(r'build \d+', line)[0].split()[1]) - if 'Build: ' in line: + if fmt != 'vcf' and 'Build: ' in line: build = int(re.findall(r'Build: \d+', line)[0].split()[1]) if "23andMe" in line: fmt = '23andme' @@ -191,7 +278,12 @@ def detect_file_format(fname): fmt = 'myheritage' if "##fileformat=VCF" in line: fmt = 'vcf' - return (build, fmt) + if "##reference" in line: + #hacky autodetect may work for many files + if '37' in line: + build = 37 + if '38' in line: + build = 38 continue if lc == 1 and line.startswith('RSID'): fmt = 'ftdna'