diff --git a/.appveyor.yml b/.appveyor.yml index fa8bed9dc..580498394 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -30,10 +30,10 @@ build_script: - set HOME=. - set MSYSTEM=MINGW64 - set PATH=C:/msys64/usr/bin;C:/msys64/mingw64/bin;%PATH% - - "sh -lc \"aclocal && autoheader && autoconf && ./configure && make -j2\"" + - "sh -lc \"aclocal && autoheader && autoconf && ./configure --enable-werror CFLAGS='-g -O3' && make -j2\"" #build_script: # - make test_script: - - "sh -lc \"make test\"" + - "sh -lc \"make test-shlib-exports && make test\"" diff --git a/.gitignore b/.gitignore index d53f239f3..939714d2e 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ *.dSYM *.exe *.dll +*.dll.a *.pc.tmp *-uninstalled.pc /version.h @@ -22,6 +23,7 @@ hfile_*.cygdll hfile_*.dll hfile_*.so +hts-object-files htslib_static.mk cyg*.dll @@ -31,24 +33,33 @@ lib*.dylib lib*.so lib*.so.* +header-exports.txt +shlib-exports-*.txt + /bgzip /htsfile /tabix /test/fieldarith /test/hfile /test/hts_endian +/test/longrefs/*.tmp.* +/test/pileup /test/sam /test/tabix/*.tmp.* /test/tabix/FAIL* /test/test-bcf-sr /test/test-bcf-translate /test/test_bgzf +/test/test_index +/test/test_kstring +/test/test-parse-reg /test/test_realn /test/test-regidx +/test/test_str2int /test/test-vcf-api /test/test-vcf-sweep /test/test_view -/test/thrash_threads[1-6] +/test/thrash_threads[1-7] /test/*.tmp /test/*.tmp.* diff --git a/.travis.yml b/.travis.yml index 550298fcb..7fcb8f2eb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,41 +1,96 @@ # Control file for continuous integration testing at http://travis-ci.org/ language: c -compiler: - - clang - - gcc - -os: - - linux - - osx - -env: - - USE_CONFIG=no - - USE_CONFIG=yes matrix: include: - compiler: gcc os: linux - env: USE_CONFIG=yes USE_LIBDEFLATE=yes + env: DO_MAINTAINER_CHECKS=yes USE_CONFIG=no + + - compiler: gcc-8 + os: linux + env: USE_CONFIG=yes CC=gcc-8 AR=gcc-ar-8 + addons: + apt: + sources: + - ubuntu-toolchain-r-test + packages: + - gcc-8 + + # An 
optimised build with address and leak checking, also using libdeflate + - compiler: gcc-8 + os: linux + dist: xenial + env: USE_CONFIG=yes USE_LIBDEFLATE=yes CC=gcc-8 AR=gcc-ar-8 CFLAGS="-g -Wall -O3 -fsanitize=address" LDFLAGS="-fsanitize=address" + addons: + apt: + sources: + - ubuntu-toolchain-r-test + packages: + - gcc-8 + + - compiler: clang + os: osx + env: USE_CONFIG=no + + - compiler: clang + os: osx + env: USE_CONFIG=yes + - compiler: clang os: osx env: USE_CONFIG=yes USE_LIBDEFLATE=yes -# For linux systems -addons: - apt: - packages: - - liblzma-dev - - libbz2-dev + - compiler: gcc + os: linux + env: USE_CONFIG=yes + + - compiler: clang + os: linux + env: USE_CONFIG=yes + + - compiler: gcc + os: linux + env: CFLAGS="-std=c99 -pedantic" USE_CONFIG=yes # For MacOSX systems before_install: - - if [[ "$TRAVIS_OS_NAME" == "osx" && "$USE_CONFIG" == "no" ]]; then HOMEBREW_NO_AUTO_UPDATE=1 brew install xz || ( brew update && brew install xz ); fi + - | + if [[ "$TRAVIS_OS_NAME" == "osx" && "$USE_CONFIG" == "no" ]]; then + HOMEBREW_NO_AUTO_UPDATE=1 brew install xz || ( brew update && brew install xz ) + fi before_script: - - if test "x$USE_LIBDEFLATE" == "xyes" ; then ( cd "$HOME" && git clone --depth 1 https://github.com/ebiggers/libdeflate.git && cd libdeflate && make -j 2 CFLAGS='-fPIC -O3' libdeflate.a ); fi + - | + if test "x$USE_LIBDEFLATE" == "xyes"; then + pushd "$HOME" && \ + git clone --depth 1 https://github.com/ebiggers/libdeflate.git && \ + pushd libdeflate && \ + make -j 2 CFLAGS='-fPIC -O3' libdeflate.a && \ + popd && \ + popd + fi script: - - if test "x$USE_LIBDEFLATE" = "xyes" ; then CONFIG_OPTS='CPPFLAGS="-I$HOME/libdeflate" LDFLAGS="-L$HOME/libdeflate" --with-libdeflate' ; else CONFIG_OPTS='--without-libdeflate' ; fi - - if test "$USE_CONFIG" = "yes" ; then autoreconf && eval ./configure $CONFIG_OPTS || { cat config.log ; false ; } ; fi && make -j 2 -e && make test + - | + if test "x$USE_LIBDEFLATE" = "xyes"; then + 
CONFIG_OPTS='CPPFLAGS="-I$HOME/libdeflate" LDFLAGS="$LDFLAGS -L$HOME/libdeflate" --with-libdeflate' + else + CONFIG_OPTS='--without-libdeflate' + fi + - | + if test "$USE_CONFIG" = "yes"; then + MAKE_OPTS= ; + autoreconf && \ + eval ./configure --enable-werror $CONFIG_OPTS CFLAGS=\"-g -O3 $CFLAGS\" || \ + ( cat config.log; false ) + else + MAKE_OPTS=-e + fi && \ + if test "x$DO_MAINTAINER_CHECKS" = "xyes"; then + make maintainer-check + fi && \ + make -j 2 $MAKE_OPTS && \ + make test-shlib-exports && \ + make test diff --git a/LICENSE b/LICENSE index 86f782b14..f70e757ee 100644 --- a/LICENSE +++ b/LICENSE @@ -3,7 +3,7 @@ according to the terms of the following MIT/Expat license.] The MIT/Expat License -Copyright (C) 2012-2018 Genome Research Ltd. +Copyright (C) 2012-2019 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -29,7 +29,7 @@ according to the terms of the following Modified 3-Clause BSD license.] The Modified-BSD License -Copyright (C) 2012-2018 Genome Research Ltd. +Copyright (C) 2012-2019 Genome Research Ltd. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/Makefile b/Makefile index ef86c83e6..72195c496 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # Makefile for htslib, a C library for high-throughput sequencing data formats. # -# Copyright (C) 2013-2017 Genome Research Ltd. +# Copyright (C) 2013-2019 Genome Research Ltd. 
# # Author: John Marshall # @@ -27,15 +27,15 @@ AR = ar RANLIB = ranlib # Default libraries to link if configure is not used -htslib_default_libs = -lz -lm -lbz2 -llzma +htslib_default_libs = -lz -lm -lbz2 -llzma -lcurl CPPFLAGS = # TODO: probably update cram code to make it compile cleanly with -Wc++-compat # For testing strict C99 support add -std=c99 -D_XOPEN_SOURCE=600 -#CFLAGS = -g -Wall -O2 -pedantic -std=c99 -D_XOPEN_SOURCE=600 -D__FUNCTION__=__func__ -CFLAGS = -g -Wall -O2 +#CFLAGS = -g -Wall -O2 -pedantic -std=c99 -D_XOPEN_SOURCE=600 +CFLAGS = -g -Wall -O2 -fvisibility=hidden EXTRA_CFLAGS_PIC = -fpic -LDFLAGS = +LDFLAGS = -fvisibility=hidden LIBS = $(htslib_default_libs) prefix = /usr/local @@ -48,6 +48,7 @@ datarootdir = $(prefix)/share mandir = $(datarootdir)/man man1dir = $(mandir)/man1 man5dir = $(mandir)/man5 +man7dir = $(mandir)/man7 pkgconfigdir= $(libdir)/pkgconfig MKDIR_P = mkdir -p @@ -70,15 +71,21 @@ BUILT_TEST_PROGRAMS = \ test/hts_endian \ test/fieldarith \ test/hfile \ + test/pileup \ test/sam \ test/test_bgzf \ + test/test_kstring \ test/test_realn \ test/test-regidx \ + test/test_str2int \ test/test_view \ + test/test_index \ test/test-vcf-api \ test/test-vcf-sweep \ test/test-bcf-sr \ - test/test-bcf-translate + test/fuzz/hts_open_fuzzer.o \ + test/test-bcf-translate \ + test/test-parse-reg BUILT_THRASH_PROGRAMS = \ test/thrash_threads1 \ @@ -86,7 +93,8 @@ BUILT_THRASH_PROGRAMS = \ test/thrash_threads3 \ test/thrash_threads4 \ test/thrash_threads5 \ - test/thrash_threads6 + test/thrash_threads6 \ + test/thrash_threads7 all: lib-static lib-shared $(BUILT_PROGRAMS) plugins $(BUILT_TEST_PROGRAMS) @@ -96,7 +104,9 @@ include htslib_vars.mk # If not using GNU make, you need to copy the version number from version.sh # into here. 
PACKAGE_VERSION := $(shell ./version.sh) -LIBHTS_SOVERSION = 2 + +LIBHTS_SOVERSION = 3 +MACH_O_COMPATIBILITY_VERSION = $(LIBHTS_SOVERSION) # $(NUMERIC_VERSION) is for items that must have a numeric X.Y.Z string # even if this is a dirty or untagged Git working tree. @@ -106,7 +116,7 @@ NUMERIC_VERSION := $(shell ./version.sh numeric) version.h: $(if $(wildcard version.h),$(if $(findstring "$(PACKAGE_VERSION)",$(shell cat version.h)),,force)) version.h: - echo '#define HTS_VERSION "$(PACKAGE_VERSION)"' > $@ + echo '#define HTS_VERSION_TEXT "$(PACKAGE_VERSION)"' > $@ print-version: @echo $(PACKAGE_VERSION) @@ -132,6 +142,7 @@ LIBHTS_OBJS = \ bgzf.o \ errmod.o \ faidx.o \ + header.o \ hfile.o \ hfile_net.o \ hts.o \ @@ -141,6 +152,7 @@ LIBHTS_OBJS = \ probaln.o \ realn.o \ regidx.o \ + region.o \ sam.o \ synced_bcf_reader.o \ vcf_sweep.o \ @@ -157,28 +169,33 @@ LIBHTS_OBJS = \ cram/cram_io.o \ cram/cram_samtools.o \ cram/cram_stats.o \ - cram/files.o \ cram/mFILE.o \ cram/open_trace_file.o \ cram/pooled_alloc.o \ cram/rANS_static.o \ - cram/sam_header.o \ - cram/string_alloc.o + cram/string_alloc.o \ + $(NONCONFIGURE_OBJS) + +# Without configure we wish to have a rich set of default figures, +# but we still need conditional inclusion as we wish to still +# support ./configure --disable-blah. 
+NONCONFIGURE_OBJS = hfile_libcurl.o PLUGIN_EXT = PLUGIN_OBJS = -cram_h = cram/cram.h $(cram_samtools_h) $(cram_sam_header_h) $(cram_structs_h) $(cram_io_h) cram/cram_encode.h cram/cram_decode.h cram/cram_stats.h cram/cram_codecs.h cram/cram_index.h $(htslib_cram_h) +cram_h = cram/cram.h $(cram_samtools_h) $(header_h) $(cram_structs_h) $(cram_io_h) cram/cram_encode.h cram/cram_decode.h cram/cram_stats.h cram/cram_codecs.h cram/cram_index.h $(htslib_cram_h) cram_io_h = cram/cram_io.h $(cram_misc_h) -cram_misc_h = cram/misc.h $(cram_os_h) +cram_misc_h = cram/misc.h cram_os_h = cram/os.h $(htslib_hts_endian_h) -cram_sam_header_h = cram/sam_header.h cram/string_alloc.h cram/pooled_alloc.h $(htslib_khash_h) $(htslib_kstring_h) -cram_samtools_h = cram/cram_samtools.h $(htslib_sam_h) $(cram_sam_header_h) -cram_structs_h = cram/cram_structs.h $(htslib_thread_pool_h) cram/string_alloc.h cram/mFILE.h $(htslib_khash_h) +cram_samtools_h = cram/cram_samtools.h $(htslib_sam_h) +cram_structs_h = cram/cram_structs.h $(htslib_thread_pool_h) $(htslib_cram_h) cram/string_alloc.h cram/mFILE.h $(htslib_khash_h) cram_open_trace_file_h = cram/open_trace_file.h cram/mFILE.h bcf_sr_sort_h = bcf_sr_sort.h $(htslib_synced_bcf_reader_h) $(htslib_kbitset_h) -hfile_internal_h = hfile_internal.h $(htslib_hfile_h) $(textutils_internal_h) +header_h = header.h cram/string_alloc.h cram/pooled_alloc.h $(htslib_khash_h) $(htslib_kstring_h) $(htslib_sam_h) +hfile_internal_h = hfile_internal.h $(htslib_hts_defs_h) $(htslib_hfile_h) $(textutils_internal_h) hts_internal_h = hts_internal.h $(htslib_hts_h) $(textutils_internal_h) +sam_internal_h = sam_internal.h $(htslib_sam_h) textutils_internal_h = textutils_internal.h $(htslib_kstring_h) thread_pool_internal_h = thread_pool_internal.h $(htslib_thread_pool_h) @@ -198,9 +215,11 @@ config.h: echo '/* Default config.h generated by Makefile */' > $@ echo '#define HAVE_LIBBZ2 1' >> $@ echo '#define HAVE_LIBLZMA 1' >> $@ + echo '#ifndef __APPLE__' >> $@ echo 
'#define HAVE_LZMA_H 1' >> $@ - echo '#define HAVE_FSEEKO 1' >> $@ + echo '#endif' >> $@ echo '#define HAVE_DRAND48 1' >> $@ + echo '#define HAVE_LIBCURL 1' >> $@ # And similarly for htslib.pc.tmp ("pkg-config template"). No dependency # on htslib.pc.in listed, as if that file is newer the usual way to regenerate @@ -231,6 +250,9 @@ lib-shared: cyghts-$(LIBHTS_SOVERSION).dll else ifeq "$(findstring MSYS,$(PLATFORM))" "MSYS" SHLIB_FLAVOUR = dll lib-shared: hts-$(LIBHTS_SOVERSION).dll +else ifeq "$(findstring MINGW,$(PLATFORM))" "MINGW" +SHLIB_FLAVOUR = dll +lib-shared: hts-$(LIBHTS_SOVERSION).dll else SHLIB_FLAVOUR = so lib-shared: libhts.so @@ -266,15 +288,19 @@ libhts.so: $(LIBHTS_OBJS:.o=.pico) # includes this project's build directory). libhts.dylib: $(LIBHTS_OBJS) - $(CC) -dynamiclib -install_name $(libdir)/libhts.$(LIBHTS_SOVERSION).dylib -current_version $(NUMERIC_VERSION) -compatibility_version $(LIBHTS_SOVERSION) $(LDFLAGS) -o $@ $(LIBHTS_OBJS) $(LIBS) + $(CC) -dynamiclib -install_name $(libdir)/libhts.$(LIBHTS_SOVERSION).dylib -current_version $(NUMERIC_VERSION) -compatibility_version $(MACH_O_COMPATIBILITY_VERSION) $(LDFLAGS) -o $@ $(LIBHTS_OBJS) $(LIBS) ln -sf $@ libhts.$(LIBHTS_SOVERSION).dylib -cyghts-$(LIBHTS_SOVERSION).dll: $(LIBHTS_OBJS) - $(CC) -shared -Wl,--out-implib=libhts.dll.a -Wl,--export-all-symbols -Wl,--enable-auto-import $(LDFLAGS) -o $@ -Wl,--whole-archive $(LIBHTS_OBJS) -Wl,--no-whole-archive $(LIBS) -lpthread +cyghts-$(LIBHTS_SOVERSION).dll libhts.dll.a: $(LIBHTS_OBJS) + $(CC) -shared -Wl,--out-implib=libhts.dll.a -Wl,--enable-auto-import $(LDFLAGS) -o $@ -Wl,--whole-archive $(LIBHTS_OBJS) -Wl,--no-whole-archive $(LIBS) -lpthread -hts-$(LIBHTS_SOVERSION).dll: $(LIBHTS_OBJS) - $(CC) -shared -Wl,--out-implib=hts.dll.a -Wl,--export-all-symbols -Wl,--enable-auto-import $(LDFLAGS) -o $@ -Wl,--whole-archive $(LIBHTS_OBJS) -Wl,--no-whole-archive $(LIBS) -lpthread +hts-$(LIBHTS_SOVERSION).dll hts.dll.a: $(LIBHTS_OBJS) + $(CC) -shared 
-Wl,--out-implib=hts.dll.a -Wl,--enable-auto-import -Wl,--exclude-all-symbols $(LDFLAGS) -o $@ -Wl,--whole-archive $(LIBHTS_OBJS) -Wl,--no-whole-archive $(LIBS) -lpthread +# Target to allow htslib.mk to build all the object files before it +# links the shared and static libraries. +hts-object-files: $(LIBHTS_OBJS) + touch $@ .pico.so: $(CC) -shared -Wl,-E $(LDFLAGS) -o $@ $< $(LIBS) -lpthread @@ -282,40 +308,43 @@ hts-$(LIBHTS_SOVERSION).dll: $(LIBHTS_OBJS) .o.bundle: $(CC) -bundle -Wl,-undefined,dynamic_lookup $(LDFLAGS) -o $@ $< $(LIBS) -.o.cygdll: +%.cygdll: %.o libhts.dll.a $(CC) -shared $(LDFLAGS) -o $@ $< libhts.dll.a $(LIBS) -.o.dll: +%.dll: %.o hts.dll.a $(CC) -shared $(LDFLAGS) -o $@ $< hts.dll.a $(LIBS) -bgzf.o bgzf.pico: bgzf.c config.h $(htslib_hts_h) $(htslib_bgzf_h) $(htslib_hfile_h) $(htslib_thread_pool_h) $(htslib_hts_endian_h) cram/pooled_alloc.h $(htslib_khash_h) +bgzf.o bgzf.pico: bgzf.c config.h $(htslib_hts_h) $(htslib_bgzf_h) $(htslib_hfile_h) $(htslib_thread_pool_h) $(htslib_hts_endian_h) cram/pooled_alloc.h $(hts_internal_h) $(htslib_khash_h) errmod.o errmod.pico: errmod.c config.h $(htslib_hts_h) $(htslib_ksort_h) $(htslib_hts_os_h) kstring.o kstring.pico: kstring.c config.h $(htslib_kstring_h) knetfile.o knetfile.pico: knetfile.c config.h $(htslib_hts_log_h) $(htslib_knetfile_h) -hfile.o hfile.pico: hfile.c config.h $(htslib_hfile_h) $(hfile_internal_h) $(hts_internal_h) $(htslib_khash_h) +header.o header.pico: header.c config.h $(textutils_internal_h) $(header_h) +hfile.o hfile.pico: hfile.c config.h $(htslib_hfile_h) $(hfile_internal_h) $(htslib_kstring_h) $(hts_internal_h) $(htslib_khash_h) hfile_gcs.o hfile_gcs.pico: hfile_gcs.c config.h $(htslib_hts_h) $(htslib_kstring_h) $(hfile_internal_h) hfile_libcurl.o hfile_libcurl.pico: hfile_libcurl.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) hfile_net.o hfile_net.pico: hfile_net.c config.h $(hfile_internal_h) $(htslib_knetfile_h) +hfile_s3_write.o 
hfile_s3_write.pico: hfile_s3_write.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) -hts.o hts.pico: hts.c config.h $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h $(hts_internal_h) $(hfile_internal_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) -hts_os.o hts_os.pico: hts_os.c config.h os/rand.c -vcf.o vcf.pico: vcf.c config.h $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) -sam.o sam.pico: sam.c config.h $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(htslib_hfile_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h) $(htslib_hts_endian_h) +hts.o hts.pico: hts.c config.h $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) +hts_os.o hts_os.pico: hts_os.c config.h $(htslib_hts_defs_h) os/rand.c +vcf.o vcf.pico: vcf.c config.h $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) +sam.o sam.pico: sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(sam_internal_h) $(htslib_hfile_h) $(htslib_hts_endian_h) $(header_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h) tbx.o tbx.pico: tbx.c config.h $(htslib_tbx_h) $(htslib_bgzf_h) $(htslib_hts_endian_h) $(hts_internal_h) $(htslib_khash_h) faidx.o faidx.pico: faidx.c config.h $(htslib_bgzf_h) $(htslib_faidx_h) $(htslib_hfile_h) $(htslib_khash_h) $(htslib_kstring_h) 
$(hts_internal_h) -bcf_sr_sort.o bcf_sr_sort.pico: bcf_sr_sort.c config.h $(bcf_sr_sort_h) $(htslib_khash_str2int_h) +bcf_sr_sort.o bcf_sr_sort.pico: bcf_sr_sort.c config.h $(bcf_sr_sort_h) $(htslib_khash_str2int_h) $(htslib_kbitset_h) synced_bcf_reader.o synced_bcf_reader.pico: synced_bcf_reader.c config.h $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_khash_str2int_h) $(htslib_bgzf_h) $(htslib_thread_pool_h) $(bcf_sr_sort_h) vcf_sweep.o vcf_sweep.pico: vcf_sweep.c config.h $(htslib_vcf_sweep_h) $(htslib_bgzf_h) vcfutils.o vcfutils.pico: vcfutils.c config.h $(htslib_vcfutils_h) $(htslib_kbitset_h) kfunc.o kfunc.pico: kfunc.c config.h $(htslib_kfunc_h) regidx.o regidx.pico: regidx.c config.h $(htslib_hts_h) $(htslib_kstring_h) $(htslib_kseq_h) $(htslib_khash_str2int_h) $(htslib_regidx_h) $(hts_internal_h) +region.o region.pico: region.c config.h $(htslib_hts_h) $(htslib_khash_h) md5.o md5.pico: md5.c config.h $(htslib_hts_h) $(htslib_hts_endian_h) multipart.o multipart.pico: multipart.c config.h $(htslib_kstring_h) $(hts_internal_h) $(hfile_internal_h) plugin.o plugin.pico: plugin.c config.h $(hts_internal_h) $(htslib_kstring_h) probaln.o probaln.pico: probaln.c config.h $(htslib_hts_h) realn.o realn.pico: realn.c config.h $(htslib_hts_h) $(htslib_sam_h) -textutils.o textutils.pico: textutils.c config.h $(htslib_hfile_h) $(htslib_kstring_h) $(hts_internal_h) +textutils.o textutils.pico: textutils.c config.h $(htslib_hfile_h) $(htslib_kstring_h) $(htslib_sam_h) $(hts_internal_h) cram/cram_codecs.o cram/cram_codecs.pico: cram/cram_codecs.c config.h $(cram_h) cram/cram_decode.o cram/cram_decode.pico: cram/cram_decode.c config.h $(cram_h) $(cram_os_h) $(htslib_hts_h) @@ -323,14 +352,12 @@ cram/cram_encode.o cram/cram_encode.pico: cram/cram_encode.c config.h $(cram_h) cram/cram_external.o cram/cram_external.pico: cram/cram_external.c config.h $(htslib_hfile_h) $(cram_h) cram/cram_index.o cram/cram_index.pico: cram/cram_index.c config.h $(htslib_bgzf_h) 
$(htslib_hfile_h) $(hts_internal_h) $(cram_h) $(cram_os_h) cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h os/lzma_stub.h $(cram_h) $(cram_os_h) $(htslib_hts_h) $(cram_open_trace_file_h) cram/rANS_static.h $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h) -cram/cram_samtools.o cram/cram_samtools.pico: cram/cram_samtools.c config.h $(cram_h) $(htslib_sam_h) +cram/cram_samtools.o cram/cram_samtools.pico: cram/cram_samtools.c config.h $(cram_h) $(htslib_sam_h) $(sam_internal_h) cram/cram_stats.o cram/cram_stats.pico: cram/cram_stats.c config.h $(cram_h) $(cram_os_h) -cram/files.o cram/files.pico: cram/files.c config.h $(cram_misc_h) cram/mFILE.o cram/mFILE.pico: cram/mFILE.c config.h $(htslib_hts_log_h) $(cram_os_h) cram/mFILE.h -cram/open_trace_file.o cram/open_trace_file.pico: cram/open_trace_file.c config.h $(cram_os_h) $(cram_open_trace_file_h) $(cram_misc_h) $(htslib_hfile_h) $(htslib_hts_log_h) +cram/open_trace_file.o cram/open_trace_file.pico: cram/open_trace_file.c config.h $(cram_os_h) $(cram_open_trace_file_h) $(cram_misc_h) $(htslib_hfile_h) $(htslib_hts_log_h) $(htslib_hts_h) cram/pooled_alloc.o cram/pooled_alloc.pico: cram/pooled_alloc.c config.h cram/pooled_alloc.h $(cram_misc_h) cram/rANS_static.o cram/rANS_static.pico: cram/rANS_static.c config.h cram/rANS_static.h cram/rANS_byte.h -cram/sam_header.o cram/sam_header.pico: cram/sam_header.c config.h $(htslib_hts_log_h) $(cram_sam_header_h) cram/string_alloc.h cram/string_alloc.o cram/string_alloc.pico: cram/string_alloc.c config.h cram/string_alloc.h thread_pool.o thread_pool.pico: thread_pool.c config.h $(thread_pool_internal_h) @@ -348,6 +375,12 @@ bgzip.o: bgzip.c config.h $(htslib_bgzf_h) $(htslib_hts_h) htsfile.o: htsfile.c config.h $(htslib_hfile_h) $(htslib_hts_h) $(htslib_sam_h) $(htslib_vcf_h) tabix.o: tabix.c config.h $(htslib_tbx_h) $(htslib_sam_h) $(htslib_vcf_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_hts_h) $(htslib_regidx_h) +# Maintainer source code 
checks +# - copyright boilerplate presence +# - tab and trailing space detection +maintainer-check: + test/maintainer/check_copyright.pl . + test/maintainer/check_spaces.pl . # For tests that might use it, set $REF_PATH explicitly to use only reference # areas within the test suite (or set it to ':' to use no reference areas). @@ -356,10 +389,14 @@ tabix.o: tabix.c config.h $(htslib_tbx_h) $(htslib_sam_h) $(htslib_vcf_h) $(htsl # MSYS2_ARG_CONV_EXCL="*" make check check test: $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS) test/hts_endian + test/test_kstring + test/test_str2int test/fieldarith test/fieldarith.sam test/hfile test/test_bgzf test/bgziptest.txt + test/test-parse-reg -t test/colons.bam cd test/tabix && ./test-tabix.sh tabix.tst + cd test/mpileup && ./test-pileup.sh mpileup.tst REF_PATH=: test/sam test/ce.fa test/faidx.fa test/fastqs.fq test/test-regidx cd test && REF_PATH=: ./test.pl $${TEST_OPTS:-} @@ -367,27 +404,45 @@ check test: $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS) test/hts_endian: test/hts_endian.o $(CC) $(LDFLAGS) -o $@ test/hts_endian.o $(LIBS) +test/fuzz/hts_open_fuzzer: test/fuzz/hts_open_fuzzer.o + $(CC) $(LDFLAGS) -o $@ test/fuzz/hts_open_fuzzer.o libhts.a $(LIBS) -lpthread + test/fieldarith: test/fieldarith.o libhts.a $(CC) $(LDFLAGS) -o $@ test/fieldarith.o libhts.a $(LIBS) -lpthread test/hfile: test/hfile.o libhts.a $(CC) $(LDFLAGS) -o $@ test/hfile.o libhts.a $(LIBS) -lpthread +test/pileup: test/pileup.o libhts.a + $(CC) $(LDFLAGS) -o $@ test/pileup.o libhts.a $(LIBS) -lpthread + test/sam: test/sam.o libhts.a $(CC) $(LDFLAGS) -o $@ test/sam.o libhts.a $(LIBS) -lpthread test/test_bgzf: test/test_bgzf.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_bgzf.o libhts.a -lz $(LIBS) -lpthread +test/test_kstring: test/test_kstring.o libhts.a + $(CC) $(LDFLAGS) -o $@ test/test_kstring.o libhts.a -lz $(LIBS) -lpthread + test/test_realn: test/test_realn.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_realn.o libhts.a $(LIBS) -lpthread test/test-regidx: 
test/test-regidx.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test-regidx.o libhts.a $(LIBS) -lpthread +test/test-parse-reg: test/test-parse-reg.o libhts.a + $(CC) $(LDFLAGS) -o $@ test/test-parse-reg.o libhts.a $(LIBS) -lpthread + +test/test_str2int: test/test_str2int.o + $(CC) $(LDFLAGS) -o $@ test/test_str2int.o + test/test_view: test/test_view.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_view.o libhts.a $(LIBS) -lpthread +test/test_index: test/test_index.o libhts.a + $(CC) $(LDFLAGS) -o $@ test/test_index.o libhts.a $(LIBS) -lpthread + test/test-vcf-api: test/test-vcf-api.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test-vcf-api.o libhts.a $(LIBS) -lpthread @@ -401,13 +456,19 @@ test/test-bcf-translate: test/test-bcf-translate.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test-bcf-translate.o libhts.a -lz $(LIBS) -lpthread test/hts_endian.o: test/hts_endian.c config.h $(htslib_hts_endian_h) +test/fuzz/hts_open_fuzzer.o: test/fuzz/hts_open_fuzzer.c config.h $(htslib_hfile_h) $(htslib_hts_h) $(htslib_sam_h) $(htslib_vcf_h) test/fieldarith.o: test/fieldarith.c config.h $(htslib_sam_h) -test/hfile.o: test/hfile.c config.h $(htslib_hfile_h) $(htslib_hts_defs_h) -test/sam.o: test/sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_faidx_h) $(htslib_kstring_h) +test/hfile.o: test/hfile.c config.h $(htslib_hfile_h) $(htslib_hts_defs_h) $(htslib_kstring_h) +test/pileup.o: test/pileup.c config.h $(htslib_sam_h) $(htslib_kstring_h) +test/sam.o: test/sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_faidx_h) $(htslib_khash_h) $(htslib_hts_log_h) test/test_bgzf.o: test/test_bgzf.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(hfile_internal_h) -test/test-realn.o: test/test_realn.c config.h $(htslib_hts_h) $(htslib_sam_h) $(htslib_faidx_h) -test/test-regidx.o: test/test-regidx.c config.h $(htslib_regidx_h) $(hts_internal_h) -test/test_view.o: test/test_view.c config.h $(cram_h) $(htslib_sam_h) +test/test_kstring.o: test/test_kstring.c config.h $(htslib_kstring_h) 
+test/test-parse-reg.o: test/test-parse-reg.c config.h $(htslib_hts_h) $(htslib_sam_h) +test/test_realn.o: test/test_realn.c config.h $(htslib_hts_h) $(htslib_sam_h) $(htslib_faidx_h) +test/test-regidx.o: test/test-regidx.c config.h $(htslib_kstring_h) $(htslib_regidx_h) $(htslib_hts_defs_h) $(textutils_internal_h) +test/test_str2int.o: test/test_str2int.c config.h $(textutils_internal_h) +test/test_view.o: test/test_view.c config.h $(cram_h) $(htslib_sam_h) $(htslib_vcf_h) $(htslib_hts_log_h) +test/test_index.o: test/test_index.c config.h $(htslib_sam_h) $(htslib_vcf_h) test/test-vcf-api.o: test/test-vcf-api.c config.h $(htslib_hts_h) $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_kseq_h) test/test-vcf-sweep.o: test/test-vcf-sweep.c config.h $(htslib_vcf_sweep_h) test/test-bcf-sr.o: test/test-bcf-sr.c config.h $(htslib_synced_bcf_reader_h) @@ -431,9 +492,36 @@ test/thrash_threads5: test/thrash_threads5.o libhts.a test/thrash_threads6: test/thrash_threads6.o libhts.a $(CC) $(LDFLAGS) -o $@ test/thrash_threads6.o libhts.a -lz $(LIBS) -lpthread - +test/thrash_threads7: test/thrash_threads7.o libhts.a + $(CC) $(LDFLAGS) -o $@ test/thrash_threads7.o libhts.a -lz $(LIBS) -lpthread test_thrash: $(BUILT_THRASH_PROGRAMS) +# Test to ensure the functions in the header files are exported by the shared +# library. This currently works by comparing the output from ctags on +# the headers with the list of functions exported by the shared library. +# Note that functions marked as exported in the .c files and not the public +# headers will be missed by this test. +test-shlib-exports: header-exports.txt shlib-exports-$(SHLIB_FLAVOUR).txt + @echo "Checking shared library exports" + @if test ! -s header-exports.txt ; then echo "Error: header-exports.txt empty" ; false ; fi + @if test ! -s shlib-exports-$(SHLIB_FLAVOUR).txt ; then echo "Error: shlib-exports-$(SHLIB_FLAVOUR).txt empty" ; false ; fi + @! comm -23 header-exports.txt shlib-exports-$(SHLIB_FLAVOUR).txt | grep . 
|| \ + ( echo "Error: Found unexported symbols (listed above)" ; false ) + +# Extract symbols that should be exported from public headers using ctags +# Filter out macros in htslib/hts_defs.h, and knet_win32_ functions that +# aren't needed on non-Windows platforms. +header-exports.txt: test/header_syms.pl htslib/*.h + test/header_syms.pl htslib/*.h | sort -u -o $@ + +shlib-exports-so.txt: libhts.so + nm -D -g libhts.so | awk '$$2 == "T" { print $$3 }' | sort -u -o $@ + +shlib-exports-dylib.txt: libhts.dylib + nm -Ug libhts.dylib | awk '$$2 == "T" { sub("^_", "", $$3); print $$3 }' | sort -u -o $@ + +shlib-exports-dll.txt: hts.dll.a + nm -g hts.dll.a | awk '$$2 == "T" { print $$3 }' | sort -u -o $@ install: libhts.a $(BUILT_PROGRAMS) $(BUILT_PLUGINS) installdirs install-$(SHLIB_FLAVOUR) install-pkgconfig $(INSTALL_PROGRAM) $(BUILT_PROGRAMS) $(DESTDIR)$(bindir) @@ -442,9 +530,10 @@ install: libhts.a $(BUILT_PROGRAMS) $(BUILT_PLUGINS) installdirs install-$(SHLIB $(INSTALL_DATA) libhts.a $(DESTDIR)$(libdir)/libhts.a $(INSTALL_MAN) bgzip.1 htsfile.1 tabix.1 $(DESTDIR)$(man1dir) $(INSTALL_MAN) faidx.5 sam.5 vcf.5 $(DESTDIR)$(man5dir) + $(INSTALL_MAN) htslib-s3-plugin.7 $(DESTDIR)$(man7dir) installdirs: - $(INSTALL_DIR) $(DESTDIR)$(bindir) $(DESTDIR)$(includedir) $(DESTDIR)$(includedir)/htslib $(DESTDIR)$(libdir) $(DESTDIR)$(man1dir) $(DESTDIR)$(man5dir) $(DESTDIR)$(pkgconfigdir) + $(INSTALL_DIR) $(DESTDIR)$(bindir) $(DESTDIR)$(includedir) $(DESTDIR)$(includedir)/htslib $(DESTDIR)$(libdir) $(DESTDIR)$(man1dir) $(DESTDIR)$(man5dir) $(DESTDIR)$(man7dir) $(DESTDIR)$(pkgconfigdir) if test -n "$(plugindir)"; then $(INSTALL_DIR) $(DESTDIR)$(plugindir); fi # After installation, the real file in $(libdir) will be libhts.so.X.Y.Z, @@ -482,10 +571,11 @@ htslib-uninstalled.pc: htslib.pc.tmp testclean: - -rm -f test/*.tmp test/*.tmp.* test/tabix/*.tmp.* test/tabix/FAIL* + -rm -f test/*.tmp test/*.tmp.* test/longrefs/*.tmp.* test/tabix/*.tmp.* test/tabix/FAIL* header-exports.txt 
shlib-exports-$(SHLIB_FLAVOUR).txt mostlyclean: testclean -rm -f *.o *.pico cram/*.o cram/*.pico test/*.o test/*.dSYM version.h + -rm -f hts-object-files clean: mostlyclean clean-$(SHLIB_FLAVOUR) -rm -f libhts.a $(BUILT_PROGRAMS) $(BUILT_PLUGINS) $(BUILT_TEST_PROGRAMS) $(BUILT_THRASH_PROGRAMS) diff --git a/NEWS b/NEWS index c895ea1de..48279b591 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,487 @@ +Noteworthy changes in release 1.10 (6th December 2019) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Brief summary +------------- + +There are many changes in this release, so the executive summary is: + +* Addition of support for references longer than 2Gb (NB: SAM and VCF + formats only, not their binary counterparts). This may need changes + in code using HTSlib. See README.large_positions.md for more information. + +* Added a SAM header API. + +* Major speed up to SAM reading and writing. This also now supports + multi-threading. + +* We can now auto-index on-the-fly while writing a file. This also + includes to bgzipped SAM.gz. + +* Overhaul of the S3 interface, which now supports version 4 + signatures. This also makes writing to S3 work. + +These also required some ABI changes. See below for full details. + + +Features / updates +------------------ + +* A new SAM/BAM/CRAM header API has been added to HTSlib, allowing header + data to be updated without having to parse or rewrite large parts of the + header text. See htslib/sam.h for function definitions and + documentation. (#812) + + The header typedef and several pre-existing functions have been renamed + to have a sam_hdr_ prefix: sam_hdr_t, sam_hdr_init(), sam_hdr_destroy(), + and sam_hdr_dup(). (The existing bam_hdr_-prefixed names are still + provided for compatibility with existing code.) (#887, thanks to + John Marshall) + +* Changes to hfile_s3, which provides support for the AWS S3 API. (#839) + + - hfile_s3 now uses version 4 signatures by default. 
Attempting to write to + an S3 bucket will also now work correctly. It is possible to force + version 2 signatures by creating the environment variable HTS_S3_V2 (the exact + value does not matter; it just has to exist). Note that writing depends + on features that need version 4 signatures, so forcing version 2 will + disable writes. + + - hfile_s3 will automatically retry requests where the region endpoint + was not specified correctly, either by following the 301 redirect (when + using path-style requests) or reading the 400 response (when using + virtual-hosted style requests and version 4 signatures). The first + region to try can be set by using the AWS_DEFAULT_REGION environment + variable, by setting "region" in ".aws/credentials" or by setting + "bucket_location" in ".s3cfg". + + - hfile_s3 now percent-escapes the path component of s3:// URLs. For + backwards compatibility it will ignore any paths that have already + been escaped (detected by looking for '%' followed by two hexadecimal + digits). + + - New environment variables HTS_S3_V2, HTS_S3_HOST, HTS_S3_S3CFG + and HTS_S3_PART_SIZE to force version-2 signatures, control the + S3 server hostname, the configuration file and upload chunk + sizes respectively. + +* Numerous SAM format improvements. + + - Bgzipped SAM files can now be indexed and queried. The library now + recognises sam.gz as a format name to ease this usage. (#718, #916) + + - The SAM reader and writer now support multi-threading via the + thread-pool. (#916) + + Note that the multi-threaded SAM reader does not currently support seek + operations. Trying to do this (for example with an iterator range request) + will result in the SAM readers dropping back to single-threaded mode. + + - Major speed up of SAM decoding and encoding, by around 2x. (#722) + + - SAM format can now handle 64-bit coordinates and references. This + has implications for the ABI too (see below).
Note BAM and CRAM + currently cannot handle references longer than 2Gb, however given + the speed and threading improvements SAM.gz is a viable workaround. (#709) + +* We can now automatically build indices on-the-fly while writing + SAM, BAM, CRAM, VCF and BCF files. (Note for SAM and VCF this only + works when bgzipped.) (#718) + +* HTSlib now supports the @SQ-AN header field, which lists alternative names + for reference sequences. This means given "@SQ SN:1 AN:chr1", tools like + samtools can accept requests for "1" or "chr1" equivalently. (#931) + +* Zero-length files are no longer considered to be valid SAM files + (with no header and no alignments). This has been changed so that pipelines + such as `somecmd | samtools ...` with `somecmd` aborting before outputting + anything will now propagate the error to the second command. (#721, thanks + to John Marshall; #261 reported by Adrian Tan) + +* Added support for use of non-standard index names by pasting the + data filename and index filename with ##idx##. For example + "/path1/my_data.bam##idx##/path2/my_index.csi" will open bam file + "/path1/my_data.bam" and index file "/path2/my_index.csi". (#884) + + This affects hts_idx_load() and hts_open() functions. + +* Improved the region parsing code to handle colons in reference + names. Strings can be disambiguated by the use of braces, so for + example when reference sequences called "chr1" and "chr1:100-200" + are both present, the regions "{chr1}:100-200" and "{chr1:100-200}" + unambiguously indicate which reference is being used. (#708) + + A new function hts_parse_region() has been added along with + specialisations for sam_parse_region() and fai_parse_region(). + +* CRAM encoding now has additional checks for MD/NM validity. If + they are incorrect, it stores the (incorrect copy) verbatim so + round-trips "work". (#792) + +* Sped up decoding of CRAM by around 10% when the MD tag is being + generated. 
(#874) + +* CRAM REF_PATH now supports %Ns (where N is a single digit) + expansion in http URLs, similar to how it already supported this + for directories. (#791) + +* BGZF now permits indexing and seeking using virtual offsets in + completely uncompressed streams. (#904, thanks to Adam Novak) + +* bgzip now asks for extra confirmation before decompressing files + that don't have a known compression extension (e.g. .gz). This avoids + `bgzip -d foo.bam.bai` producing a foo.bam file that is very much not + a BAM-formatted file. (#927, thanks to John Marshall) + +* The htsfile utility can now copy files (including to/from URLs using + HTSlib's remote access facilities) with the --copy option, in + addition to its existing uses of identifying file formats and + displaying sequence or variant data. (#756, thanks to John Marshall) + +* Added tabix --min-shift option. (#752, thanks to Garrett Stevens) + +* Tabix now has an -D option to disable storing a local copy of a + remote index. (#870) + +* Improved support for MSYS Windows compiler environment. (#966) + +* External htslib plugins are now supported on Windows. (#966) + + +API additions and improvements +------------------------------ + +* New API functions bam_set_mempolicy() and bam_get_mempolicy() have + been added. These allow more control over the ownership of bam1_t + alignment record data; see documentation in htslib/sam.h for more + information. (#922) + +* Added more HTS_RESULT_USED checks, this time for VCF I/O. (#805) + +* khash can now hash kstrings. This makes it easier to hash + non-NUL-terminated strings. (#713) + +* New haddextension() filename extension API function. (#788, thanks to + John Marshall) + +* New hts_resize() macro, designed to replace uses of hts_expand() + and hts_expand0(). (#805) + +* Added way of cleaning up unused jobs in the thread pool via the new + hts_tpool_dispatch3() function. 
(#830) + +* New API functions hts_reglist_create() and sam_itr_regarray() are added + to create hts_reglist_t region lists from `chr:<from>-<to>` type region + specifiers. (#836) + +* Ksort has been improved to facilitate library use. See KSORT_INIT2 + (adds scope / namespace capabilities) and KSORT_INIT_STATIC interfaces. + (#851, thanks to John Marshall) + +* New kstring functions (#879): + KS_INITIALIZE - Initializer for structure assignment + ks_initialize() - Initializer for pointed-to kstrings + ks_expand() - Increase kstring capacity by a given amount + ks_clear() - Set kstring length to zero + ks_free() - Free the underlying buffer + ks_c_str() - Returns the kstring buffer as a const char *, + or an empty string if the length is zero. + +* New API functions hts_idx_load3(), sam_index_load3(), tbx_index_load3() + and bcf_index_load3() have been added. These allow control of whether + remote indexes should be cached locally, and allow the error message + printed when the index does not exist to be suppressed. (#870) + +* Improved hts_detect_format() so it no longer assumes all text is + SAM unless positively identified otherwise. It also makes a stab + at detecting bzip2 format and identifying BED, FASTA and FASTQ + files. (#721, thanks to John Marshall; #200, #719 both reported by + Torsten Seemann) + +* File format errors now set errno to EFTYPE (BSD, MacOS) when + available instead of ENOEXEC. (#721) + +* New API function bam_set_qname (#942) + +* In addition to the existing hts_version() function, which reflects the + HTSlib version being used at runtime, <htslib/hts.h> now also provides + HTS_VERSION, a preprocessor macro reflecting the HTSlib version that + a program is being compiled against. (#951, thanks to John Marshall; #794) + + +ABI changes +----------- + +This release contains a number of things which change the Application +Binary Interface (ABI). This means code compiled against an earlier +library will require recompiling.
The shared library soversion has +been bumped. + +* On systems that support it, the default symbol visibility has been + changed to hidden and the only exported symbols are ones that form part + of the officially supported ABI. This is to make clear exactly which + symbols are considered parts of the library interface. It also + helps packagers who want to check compatibility between HTSlib versions. + (#946; see for example issues #311, #616, and #695) + +* HTSlib now supports 64 bit reference positions. This means several + structures, function parameters, and return values have been made bigger + to allow larger values to be stored. While most code that uses + HTSlib interfaces should still build after this change, some alterations + may be needed - notably to printf() formats where the values of structure + members are being printed. (#709) + + Due to file format limitations, large positions are only supported + when reading and writing SAM and VCF files. + + See README.large_positions.md for more information. + +* An extra field has been added to the kbitset_t struct so bitsets can + be made smaller (and later enlarged) without involving memory allocation. + (#710, thanks to John Marshall) + +* A new field has been added to the bam_pileup1_t structure to keep track + of which CIGAR operator is being processed. This is used by a new + bam_plp_insertion() function which can be used to return the sequence of + any inserted bases at a given pileup location. If the alignment includes + CIGAR P operators, the returned sequence will include pads. (#699) + +* The hts_itr_t and hts_itr_multi_t structures have been merged and can be + used interchangeably. Extra fields have been added to hts_itr_t to support + this. hts_itr_multi_t is now a typedef for hts_itr_t; sam_itr_multi_next() + is now an alias for sam_itr_next() and hts_itr_multi_destroy() is an alias + for hts_itr_destroy(). (#836) + +* An improved regidx interface has been added. 
To allow this, struct + reg_t has been removed, regitr_t has been modified and various new + API functions have been added to htslib/regidx.h. While parts of + the old regidx API have been retained for backwards compatibility, + it is recommended that all code using regidx should be changed to use + the new interface. (#761) + +* Elements in the hts_reglist_t structure have been reordered slightly + so that they pack together better. (#761) + +* bgzf_utell() and bgzf_useek() now use type off_t instead of long for + the offset. This allows them to work correctly on files longer than + 2G bytes on Windows and 32-bit Linux. (#868) + +* A number of functions that used to return void now return int so that + they can report problems like memory allocation failures. Callers + should take care to check the return values from these functions. (#834) + + The affected functions are: + ksort.h: ks_introsort(), ks_mergesort() + sam.h: bam_mplp_init_overlaps() + synced_bcf_reader.h: bcf_sr_regions_flush() + vcf.h: bcf_format_gt(), bcf_fmt_array(), + bcf_enc_int1(), bcf_enc_size(), + bcf_enc_vchar(), bcf_enc_vfloat(), bcf_enc_vint(), + bcf_hdr_set_version(), bcf_hrec_format() + vcfutils.h: bcf_remove_alleles() + +* bcf_set_variant_type() now outputs VCF_OVERLAP for spanning + deletions (ALT=*). (#726) + +* A new field (hrecs) has been added to the bam_hdr_t structure for + use by the new header API. The old sdict field is now not used and + marked as deprecated. The l_text field has been changed from uint32_t + to size_t, to allow for very large headers in SAM files. The text + and l_text fields have been left for backwards compatibility, but + should not be accessed directly in code that uses the new header API. + To access the header text, the new functions sam_hdr_length() and + sam_hdr_str() should be used instead. (#812) + +* The old cigar_tab field is now marked as deprecated; use the new + bam_cigar_table[] instead. 
(#891, thanks to John Marshall) + +* The bam1_core_t structure's l_qname and l_extranul fields have been + rearranged and enlarged; l_qname still includes the extra NULs. + (Almost all code should use bam_get_qname(), bam_get_cigar(), etc, + and has no need to use these fields directly.) HTSlib now supports + the SAM specification's full 254 QNAME length again. (#900, thanks + to John Marshall; #520) + +* bcf_index_load() no longer tries the '.tbi' suffix when looking for + BCF index files (.tbi indexes are for text files, not binary BCF). (#870) + +* htsFile has a new 'state' member to support SAM multi-threading. (#916) + +* A new field has been added to the bam1_t structure, and others + have been rearranged to remove structure holes. (#709; #922) + + +Bug fixes +--------- + +* Several BGZF format fixes: + + - Support for multi-member gzip files. (#744, thanks to Adam Novak; #742) + + - Fixed error handling code for native gzip formatted files. (64c4927) + + - CRCs checked when threading too (previously only when non-threaded). (#745) + + - Made bgzf_useek function work with threads. (#818) + + - Fixed rare threading deadlocks. (#831) + + - Reading of very short files (<28 bytes) that do not contain an EOF block. + (#910) + +* Fixed some thread pool deadlocks caused by race conditions. (#746, #906) + +* Many additional memory allocation checks in VCF, BCF, SAM and CRAM + code. This also changes the return type of some functions. See ABI + changes above. (#920 amongst others) + +* Replace some sam parsing abort() calls with proper errors. + (#721, thanks to John Marshall; #576) + +* Fixed to permit SAM read names of length 252 to 254 (the maximum + specified by the SAM specification). (#900, thanks to John Marshall) + +* Fixed mpileup overlap detection heuristic to work with BAMs having + long CIGARs (more than 65536 operations). (#802) + +* Security fix: CIGAR strings starting with the "N" operation can no + longer cause underflow on the bam CIGAR structure. 
Similarly CIGAR + strings that are entirely "D" ops could leak the contents of + uninitialised variables. (#699) + +* Fixed bug where alignments starting 0M could cause an invalid + memory access in sam_prob_realn(). (#699) + +* Fixed out of bounds memory access in mpileup when given a reference + with binary characters (top-bit set). (#808, thanks to John Marshall) + +* Fixed crash in mpileup overlap_push() function. (#882; #852 reported + by Pierre Lindenbaum) + +* Fixed various potential CRAM memory leaks when recovering from + error cases. + +* Fixed CRAM index queries for unmapped reads (#911; samtools/samtools#958 + reported by @acorvelo) + +* Fixed the combination of CRAM embedded references and multiple + slices per container. This was incorrectly setting the header + MD5sum. (No impact on default CRAM behaviour.) (b2552fd) + +* Removed unwanted explicit data flushing in CRAM writing, which on + some OSes caused major slowdowns. (#883) + +* Fixed inefficiencies in CRAM encoding when many small references + occur within the middle of large chromosomes. Previously it + switched into multi-ref mode, but not back out of it which caused + the read POS field to be stored poorly. (#896) + +* Fixed CRAM handling of references when the order of sequences in a + supplied fasta file differs to the order of the @SQ headers. (#935) + +* Fixed BAM and CRAM multi-threaded decoding when used in conjunction + with the multi-region iterator. (#830; #577, #822, #926 all reported by + Brent Pedersen) + +* Removed some unaligned memory accesses in CRAM encoder and + undefined behaviour in BCF reading (#867, thanks to David Seifert) + +* Repeated calling of bcf_empty() no longer crashes. (#741) + +* Fixed bug where some 8 or 16-bit negative integers were stored using values + reserved by the BCF specification. These numbers are now promoted to the + next size up, so -121 to -128 are stored using at least 16 bits, and -32761 + to -32768 are stored using 32 bits. 
+ + Note that while BCF files affected by this bug are technically incorrect, + it is still possible to read them. When converting to VCF format, + HTSlib (and therefore bcftools) will interpret the values as intended + and write out the correct negative numbers. (#766, thanks to John Marshall; + samtools/bcftools#874) + +* Allow repeated invocations of bcf_update_info() and bcf_update_format_*() + functions. (#856, thanks to John Marshall; #813 reported by Steffen Möller) + +* Memory leak removed in knetfile's kftp_parse_url() function. (#759, thanks + to David Alexander) + +* Fixed various crashes found by libfuzzer (invalid data leading to + errors), mostly but not exclusively in CRAM, VCF and BCF decoding. (#805) + +* Improved robustness of BAI and CSI index creation and loading. (#870; #967) + +* Prevent (invalid) creation of TBI indices for BCF files. + (#837; samtools/bcftools#707) + +* Better parsing and handling of remote URLs with ?param=val + components and their interaction with remote index URLs. (#790; #784 + reported by Mark Ebbert) + +* hts_idx_load() now checks locally for all possible index names before + attempting to download a remote index. It also checks that the remote + file it downloads is actually an index before trying to save and use + it. (#870; samtools/samtools#1045 reported by Albert Vilella) + +* hts_open_format() now honours the compression field, no longer also + requiring an explicit "z" in the mode string. Also fixed a 1 byte + buffer overrun. (#880) + +* Removed duplicate hts_tpool_process_flush prototype. (#816, reported by + James S Blachly) + +* Deleted defunct cram_tell declaration. (66c41e2; #915 reported by + Martin Morgan) + +* Fixed overly aggressive filename suffix checking in bgzip. (#927, thanks to + John Marshall; #129, reported by @hguturu) + +* Tabix and bgzip --help output now goes to standard output. (#754, thanks to + John Marshall) + +* Fixed bgzip index creation when using multiple threads.
(#817) + +* Made bgzip -b option honour -I (index filename). (#817) + +* Bgzip -d no longer attempts to unlink(NULL) when decompressing stdin. (#718) + + +Miscellaneous other changes +--------------------------- + +* Integration with Google OSS fuzzing for automatic detection of + more bugs. (Thanks to Google for their assistance and the bugs it + has found.) (#796, thanks to Markus Kusano) + +* aclocal.m4 now has the pkg-config macros. (6ec3b94d; #733 reported by + Thomas Hickman) + +* Improved C++ compatibility of some header files. (#772; #771 reported + by @cwrussell) + +* Improved strict C99 compatibility. (#860, thanks to John Marshall) + +* Travis and AppVeyor improvements to aid testing. (#747; #773 thanks to + Lennard Berger; #781; #809; #804; #860; #909) + +* Various minor compiler warnings fixed. (#708; #765; #846, #860, thanks to + John Marshall; #865; #966; #973) + +* Various new and improved error messages. + +* Documentation updates (mostly in the header files). + +* Even more testing with "make check". + +* Corrected many copyright dates. (#979) + +* The default non-configure Makefile now uses libcurl instead of + knet, so it can support https. (#895) + + + + + + Noteworthy changes in release 1.9 (18th July 2018) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -60,8 +544,8 @@ Noteworthy changes in release 1.9 (18th July 2018) - sam_hdr_read() and sam_hdr_write() will now return an error code if passed a NULL file pointer, instead of crashing. - - Fixed possible negative array look-up in sam_parse1() that somehow - escaped previous fuzz testing. (#731, reported by @fCorleone) + - Fixed possible negative array look-up in sam_parse1() that somehow escaped + previous fuzz testing. (CVE-2018-13845, #731, reported by @fCorleone) - Fixed bug where cram range queries could incorrectly report an error when using multiple threads. 
(#734, reported by Brent Pedersen) @@ -214,7 +698,8 @@ Noteworthy changes in release 1.4.1 (8th May 2017) This is primarily a security bug fix update. -* Fixed SECURITY (CVE-2017-1000206) issue with buffer overruns with malicious data. (#514). +* Fixed SECURITY (CVE-2017-1000206) issue with buffer overruns with + malicious data. (#514) * S3 support for non Amazon AWS endpoints. (#506) diff --git a/README.large_positions.md b/README.large_positions.md new file mode 100644 index 000000000..b0ce7ae71 --- /dev/null +++ b/README.large_positions.md @@ -0,0 +1,232 @@ +# HTSlib 64 bit reference positions + +HTSlib version 1.10 onwards internally use 64 bit reference positions. This +is to support analysis of species like axolotl, tulip and marbled lungfish +which have, or are expected to have, chromosomes longer than two gigabases. + +# File format support + +Currently 64 bit positions can only be stored in SAM and VCF format files. +Binary BAM, CRAM and BCF cannot be used due to limitations in the formats +themselves. As SAM and VCF are text formats, they have no limit on the +size of numeric values. + +# Compatibility issues to check + +Various data structure members, function parameters, and return values have +been expanded from 32 to 64 bits. As a result, some changes may be needed to +code that uses the library, even if it does not support long references. + +## Variadic functions taking format strings + +The type of various structure members (e.g. `bam1_core_t::pos`) and return +values from some functions (e.g. `bam_cigar2rlen()`) have been changed to +`hts_pos_t`, which is a 64-bit signed integer. Using these in 32-bit +code will generally work (as long as the stored positions are within range), +however care needs to be taken when these values are passed directly +to functions like `printf()` which take a variable-length argument list and +a format string. 
+ +Header file `htslib/hts.h` defines macro `PRIhts_pos` which can be +used in `printf()` format strings to get the correct format specifier for +an `hts_pos_t` value. Code that needs to print positions should be +changed from: + +```c +printf("Position is %d\n", bam->core.pos); +``` + +to: + +```c +printf("Position is %"PRIhts_pos"\n", bam->core.pos); +``` + +If for some reason compatibility with older versions of HTSlib (which do +not have `hts_pos_t` or `PRIhts_pos`) is needed, the value can be cast to +`int64_t` and printed as an explicitly 64-bit value: + +```c +#include <inttypes.h> // For PRId64 and int64_t + +printf("Position is %" PRId64 "\n", (int64_t) bam->core.pos); +``` + +Passing incorrect types to variadic functions like `printf()` can lead +to incorrect behaviour and security risks, so it is important to track down +and fix all of the places where this may happen. Modern C compilers like +gcc (version 3.0 onwards) and clang can check `printf()` and `scanf()` +parameter types for compatibility against the format string. To +enable this, build code with `-Wall` or `-Wformat` and fix all the +reported warnings. + +Where functions that take `printf`-style format strings are implemented, +they should use the appropriate gcc attributes to enable format string +checking. `htslib/hts_defs.h` includes macros `HTS_FORMAT` and +`HTS_PRINTF_FMT` which can be used to provide the attribute declaration +in a portable way. For example, `test/sam.c` uses them for a function +that prints error messages: + +``` +void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) fail(const char *fmt, ...) { /* ... */ } +``` + +## Implicit type conversions + +Conversion of signed `int` or `int32_t` to `hts_pos_t` will always work. + +Conversion of `hts_pos_t` to `int` or `int32_t` will work as long as the value +converted is within the range that can be stored in the destination.
+ +Code that casts unsigned `uint32_t` values to signed with the expectation +that the result may be negative will no longer work as `hts_pos_t` can store +values over UINT32_MAX. Such code should be changed to use signed values. + +Functions hts_parse_region() and hts_parse_reg64() return special value +`HTS_POS_MAX` for regions which extend to the end of the reference. +This value is slightly smaller than INT64_MAX, but should be larger than +any reference that is likely to be used. When cast to `int32_t` the +result should be `INT32_MAX`. + +# Upgrading code to work with 64 bit positions + +Variables used to store reference positions should be changed to +type `hts_pos_t`. Use `PRIhts_pos` in format strings when printing them. + +When converting positions stored in strings, use `strtoll()` in place of +`atoi()` or `strtol()` (which produces a 32 bit value on 64-bit Windows and +all 32-bit platforms). + +Programs which need to look up a reference sequence length from a `sam_hdr_t` +structure should use `sam_hdr_tid2len()` instead of the old +`sam_hdr_t::target_len` array (which is left as 32-bit for reasons of +compatibility). `sam_hdr_tid2len()` returns `hts_pos_t`, so works correctly +for large references. + +Various functions which take pointer arguments have new versions which +support `hts_pos_t *` arguments. Code supporting 64-bit positions should +use the new versions. These are: + +Original function | 64-bit version +------------------ | -------------------- +fai_fetch() | fai_fetch64() +fai_fetchqual() | fai_fetchqual64() +faidx_fetch_seq() | faidx_fetch_seq64() +faidx_fetch_qual() | faidx_fetch_qual64() +hts_parse_reg() | hts_parse_reg64() or hts_parse_region() +bam_plp_auto() | bam_plp64_auto() +bam_plp_next() | bam_plp64_next() +bam_mplp_auto() | bam_mplp64_auto() + +Limited support has been added for 64-bit INFO values in VCF files, for large +values in structural variant END tags. 
New functions `bcf_update_info_int64()` +and `bcf_get_info_int64()` can be used to set and fetch 64-bit INFO values. +They both take arrays of `int64_t`. `bcf_int64_missing` and +`bcf_int64_vector_end` can be used to set missing and vector end values in +these arrays. The INFO data is stored in the minimum size needed, so there +is no harm in using these functions to store smaller integer values. + +# Structure members that have changed size + +``` +File htslib/hts.h: + hts_pair32_t::begin + hts_pair32_t::end + + (typedef hts_pair_pos_t is provided as a better-named replacement for hts_pair32_t) + + hts_reglist_t::min_beg + hts_reglist_t::max_end + + hts_itr_t::beg + hts_itr_t::end + hts_itr_t::curr_beg + hts_itr_t::curr_end + +File htslib/regidx.h: + reg_t::start + reg_t::end + +File htslib/sam.h: + bam1_core_t::pos + bam1_core_t::mpos + bam1_core_t::isize + +File htslib/synced_bcf_reader.h: + bcf_sr_regions_t::start + bcf_sr_regions_t::end + bcf_sr_regions_t::prev_start + +File htslib/vcf.h: + bcf_idinfo_t::info + + bcf_info_t::v1::i + + bcf1_t::pos + bcf1_t::rlen +``` + +# Functions where parameters or the return value have changed size + +Functions are annotated as follows: + +* `[new]` The function has been added since version 1.9 +* `[parameters]` Function parameters have changed size +* `[return]` Function return value has changed size + +``` +File htslib/faidx.h: + + [new] fai_fetch64() + [new] fai_fetchqual64() + [new] faidx_fetch_seq64() + [new] faidx_fetch_qual64() + [new] fai_parse_region() + +File htslib/hts.h: + + [parameters] hts_idx_push() + [new] hts_parse_reg64() + [parameters] hts_itr_query() + [parameters] hts_reg2bin() + +File htslib/kstring.h: + + [new] kputll() + +File htslib/regidx.h: + + [parameters] regidx_overlap() + +File htslib/sam.h: + + [new] sam_hdr_tid2len() + [return] bam_cigar2qlen() + [return] bam_cigar2rlen() + [return] bam_endpos() + [parameters] bam_itr_queryi() + [parameters] sam_itr_queryi() + [new] bam_plp64_next() + [new] 
bam_plp64_auto() + [new] bam_mplp64_auto() + [parameters] sam_cap_mapq() + [parameters] sam_prob_realn() + +File htslib/synced_bcf_reader.h: + + [parameters] bcf_sr_seek() + [parameters] bcf_sr_regions_overlap() + +File htslib/tbx.h: + + [parameters] tbx_readrec() + +File htslib/vcf.h: + + [parameters] bcf_readrec() + [new] bcf_update_info_int64() + [new] bcf_get_info_int64() + [return] bcf_dec_int1() + [return] bcf_dec_typed_int1() + +``` diff --git a/README.md b/README.md index b69d41032..1b01d7271 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,7 @@ +[![Build Status](https://travis-ci.org/samtools/htslib.svg?branch=develop)](https://travis-ci.org/samtools/htslib) +[![Build status](https://ci.appveyor.com/api/projects/status/v46hkwyfjp3l8nd3/branch/develop?svg=true)](https://ci.appveyor.com/project/samtools/htslib/branch/develop) +[![Github All Releases](https://img.shields.io/github/downloads/samtools/htslib/total.svg)](https://github.com/samtools/htslib) + HTSlib is an implementation of a unified C library for accessing common file formats, such as [SAM, CRAM and VCF][1], used for high-throughput sequencing data, and is the core library used by [samtools][2] and [bcftools][3]. @@ -26,7 +30,7 @@ requires extra steps: ```sh autoheader # If using configure, generate the header template... autoconf # ...and configure script (or use autoreconf to do both) -./configure # Optional, needed for choosing optional functionality +./configure # Optional but recommended, for choosing extra functionality make make install ``` diff --git a/aclocal.m4 b/aclocal.m4 new file mode 100644 index 000000000..7c76ad5c9 --- /dev/null +++ b/aclocal.m4 @@ -0,0 +1,174 @@ +# generated automatically by aclocal 1.14.1 -*- Autoconf -*- + +# Copyright (C) 1996-2013 Free Software Foundation, Inc. + +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. 
+ +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +m4_ifndef([AC_CONFIG_MACRO_DIRS], [m4_defun([_AM_CONFIG_MACRO_DIRS], [])m4_defun([AC_CONFIG_MACRO_DIRS], [_AM_CONFIG_MACRO_DIRS($@)])]) +# pkg.m4 - Macros to locate and utilise pkg-config. -*- Autoconf -*- +# serial 1 (pkg-config-0.24) +# +# Copyright © 2004 Scott James Remnant <scott@netsplit.com>. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program.
+ +# PKG_PROG_PKG_CONFIG([MIN-VERSION]) +# ---------------------------------- +AC_DEFUN([PKG_PROG_PKG_CONFIG], +[m4_pattern_forbid([^_?PKG_[A-Z_]+$]) +m4_pattern_allow([^PKG_CONFIG(_(PATH|LIBDIR|SYSROOT_DIR|ALLOW_SYSTEM_(CFLAGS|LIBS)))?$]) +m4_pattern_allow([^PKG_CONFIG_(DISABLE_UNINSTALLED|TOP_BUILD_DIR|DEBUG_SPEW)$]) +AC_ARG_VAR([PKG_CONFIG], [path to pkg-config utility]) +AC_ARG_VAR([PKG_CONFIG_PATH], [directories to add to pkg-config's search path]) +AC_ARG_VAR([PKG_CONFIG_LIBDIR], [path overriding pkg-config's built-in search path]) + +if test "x$ac_cv_env_PKG_CONFIG_set" != "xset"; then + AC_PATH_TOOL([PKG_CONFIG], [pkg-config]) +fi +if test -n "$PKG_CONFIG"; then + _pkg_min_version=m4_default([$1], [0.9.0]) + AC_MSG_CHECKING([pkg-config is at least version $_pkg_min_version]) + if $PKG_CONFIG --atleast-pkgconfig-version $_pkg_min_version; then + AC_MSG_RESULT([yes]) + else + AC_MSG_RESULT([no]) + PKG_CONFIG="" + fi +fi[]dnl +])# PKG_PROG_PKG_CONFIG + +# PKG_CHECK_EXISTS(MODULES, [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) +# +# Check to see whether a particular set of modules exists. Similar +# to PKG_CHECK_MODULES(), but does not set variables or print errors. 
+# +# Please remember that m4 expands AC_REQUIRE([PKG_PROG_PKG_CONFIG]) +# only at the first occurence in configure.ac, so if the first place +# it's called might be skipped (such as if it is within an "if", you +# have to call PKG_CHECK_EXISTS manually +# -------------------------------------------------------------- +AC_DEFUN([PKG_CHECK_EXISTS], +[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl +if test -n "$PKG_CONFIG" && \ + AC_RUN_LOG([$PKG_CONFIG --exists --print-errors "$1"]); then + m4_default([$2], [:]) +m4_ifvaln([$3], [else + $3])dnl +fi]) + +# _PKG_CONFIG([VARIABLE], [COMMAND], [MODULES]) +# --------------------------------------------- +m4_define([_PKG_CONFIG], +[if test -n "$$1"; then + pkg_cv_[]$1="$$1" + elif test -n "$PKG_CONFIG"; then + PKG_CHECK_EXISTS([$3], + [pkg_cv_[]$1=`$PKG_CONFIG --[]$2 "$3" 2>/dev/null` + test "x$?" != "x0" && pkg_failed=yes ], + [pkg_failed=yes]) + else + pkg_failed=untried +fi[]dnl +])# _PKG_CONFIG + +# _PKG_SHORT_ERRORS_SUPPORTED +# ----------------------------- +AC_DEFUN([_PKG_SHORT_ERRORS_SUPPORTED], +[AC_REQUIRE([PKG_PROG_PKG_CONFIG]) +if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then + _pkg_short_errors_supported=yes +else + _pkg_short_errors_supported=no +fi[]dnl +])# _PKG_SHORT_ERRORS_SUPPORTED + + +# PKG_CHECK_MODULES(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND], +# [ACTION-IF-NOT-FOUND]) +# +# +# Note that if there is a possibility the first call to +# PKG_CHECK_MODULES might not happen, you should be sure to include an +# explicit call to PKG_PROG_PKG_CONFIG in your configure.ac +# +# +# -------------------------------------------------------------- +AC_DEFUN([PKG_CHECK_MODULES], +[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl +AC_ARG_VAR([$1][_CFLAGS], [C compiler flags for $1, overriding pkg-config])dnl +AC_ARG_VAR([$1][_LIBS], [linker flags for $1, overriding pkg-config])dnl + +pkg_failed=no +AC_MSG_CHECKING([for $1]) + +_PKG_CONFIG([$1][_CFLAGS], [cflags], [$2]) +_PKG_CONFIG([$1][_LIBS], [libs], [$2]) + 
+m4_define([_PKG_TEXT], [Alternatively, you may set the environment variables $1[]_CFLAGS +and $1[]_LIBS to avoid the need to call pkg-config. +See the pkg-config man page for more details.]) + +if test $pkg_failed = yes; then + AC_MSG_RESULT([no]) + _PKG_SHORT_ERRORS_SUPPORTED + if test $_pkg_short_errors_supported = yes; then + $1[]_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "$2" 2>&1` + else + $1[]_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "$2" 2>&1` + fi + # Put the nasty error message in config.log where it belongs + echo "$$1[]_PKG_ERRORS" >&AS_MESSAGE_LOG_FD + + m4_default([$4], [AC_MSG_ERROR( +[Package requirements ($2) were not met: + +$$1_PKG_ERRORS + +Consider adjusting the PKG_CONFIG_PATH environment variable if you +installed software in a non-standard prefix. + +_PKG_TEXT])[]dnl + ]) +elif test $pkg_failed = untried; then + AC_MSG_RESULT([no]) + m4_default([$4], [AC_MSG_FAILURE( +[The pkg-config script could not be found or is too old. Make sure it +is in your PATH or set the PKG_CONFIG environment variable to the full +path to pkg-config. + +_PKG_TEXT + +To get pkg-config, see .])[]dnl + ]) +else + $1[]_CFLAGS=$pkg_cv_[]$1[]_CFLAGS + $1[]_LIBS=$pkg_cv_[]$1[]_LIBS + AC_MSG_RESULT([yes]) + $3 +fi[]dnl +])# PKG_CHECK_MODULES + diff --git a/bcf_sr_sort.c b/bcf_sr_sort.c index 1b7649695..e8c5c5053 100644 --- a/bcf_sr_sort.c +++ b/bcf_sr_sort.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2017 Genome Research Ltd. + Copyright (C) 2017-2019 Genome Research Ltd. Author: Petr Danecek @@ -22,12 +22,14 @@ THE SOFTWARE. */ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include #include "bcf_sr_sort.h" #include "htslib/khash_str2int.h" +#include "htslib/kbitset.h" #define SR_REF 1 #define SR_SNP 2 @@ -35,22 +37,6 @@ #define SR_OTHER 8 #define SR_SCORE(srt,a,b) (srt)->score[((a)<<4)|(b)] -// Resize a bit set. 
-static inline kbitset_t *kbs_resize(kbitset_t *bs, size_t ni) -{ - if ( !bs ) return kbs_init(ni); - size_t n = (ni + KBS_ELTBITS-1) / KBS_ELTBITS; - if ( n==bs->n ) return bs; - - bs = (kbitset_t *) realloc(bs, sizeof(kbitset_t) + n * sizeof(unsigned long)); - if ( bs==NULL ) return NULL; - if ( n > bs->n ) - memset(bs->b + bs->n, 0, (n - bs->n) * sizeof (unsigned long)); - bs->n = n; - bs->b[n] = ~0UL; - return bs; -} - // Logical AND static inline int kbs_logical_and(kbitset_t *bs1, kbitset_t *bs2) { @@ -162,7 +148,7 @@ static int multi_is_subset(var_t *avar, var_t *bvar) } return 0; } -int32_t pairing_score(sr_sort_t *srt, int ivset, int jvset) +static uint32_t pairing_score(sr_sort_t *srt, int ivset, int jvset) { varset_t *iv = &srt->vset[ivset]; varset_t *jv = &srt->vset[jvset]; @@ -200,9 +186,9 @@ int32_t pairing_score(sr_sort_t *srt, int ivset, int jvset) for (i=0; invar; i++) cnt += srt->var[iv->var[i]].nvcf; for (j=0; jnvar; j++) cnt += srt->var[jv->var[j]].nvcf; - return (1<<(28+min)) + cnt; + return (1u<<(28+min)) + cnt; } -void remove_vset(sr_sort_t *srt, int jvset) +static void remove_vset(sr_sort_t *srt, int jvset) { if ( jvset+1 < srt->nvset ) { @@ -217,7 +203,7 @@ void remove_vset(sr_sort_t *srt, int jvset) } srt->nvset--; } -int merge_vsets(sr_sort_t *srt, int ivset, int jvset) +static int merge_vsets(sr_sort_t *srt, int ivset, int jvset) { int i,j; if ( ivset > jvset ) { i = ivset; ivset = jvset; jvset = i; } @@ -241,7 +227,8 @@ int merge_vsets(sr_sort_t *srt, int ivset, int jvset) return ivset; } -void push_vset(sr_sort_t *srt, int ivset) + +static int push_vset(sr_sort_t *srt, int ivset) { varset_t *iv = &srt->vset[ivset]; int i,j; @@ -263,6 +250,7 @@ void push_vset(sr_sort_t *srt, int ivset) } } remove_vset(srt, ivset); + return 0; // FIXME: check for errs in this function } static int cmpstringp(const void *p1, const void *p2) @@ -301,14 +289,14 @@ void debug_vbuf(sr_sort_t *srt) for (i=0; isr->nreaders; i++) { vcf_buf_t *buf = 
&srt->vcf_buf[i]; - fprintf(stderr,"\t%d", buf->rec[j] ? buf->rec[j]->pos+1 : 0); + fprintf(stderr,"\t%"PRIhts_pos, buf->rec[j] ? buf->rec[j]->pos+1 : 0); } fprintf(stderr,"\n"); } } #endif -char *grp_create_key(sr_sort_t *srt) +static char *grp_create_key(sr_sort_t *srt) { if ( !srt->str.l ) return strdup(""); int i; @@ -334,16 +322,16 @@ int bcf_sr_sort_set_active(sr_sort_t *srt, int idx) hts_expand(int,idx+1,srt->mactive,srt->active); srt->nactive = 1; srt->active[srt->nactive - 1] = idx; - return 0; + return 0; // FIXME: check for errs in this function } int bcf_sr_sort_add_active(sr_sort_t *srt, int idx) { hts_expand(int,idx+1,srt->mactive,srt->active); srt->nactive++; srt->active[srt->nactive - 1] = idx; - return 0; + return 0; // FIXME: check for errs in this function } -static void bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int min_pos) +static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, hts_pos_t min_pos) { if ( !srt->grp_str2int ) { @@ -469,7 +457,11 @@ static void bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, // initialize bitmask - which groups is the variant present in for (ivar=0; ivarnvar; ivar++) { - srt->var[ivar].mask = kbs_resize(srt->var[ivar].mask, srt->ngrp); + if ( kbs_resize(&srt->var[ivar].mask, srt->ngrp) < 0 ) + { + fprintf(stderr, "[%s:%d %s] kbs_resize failed\n", __FILE__,__LINE__,__func__); + exit(1); + } kbs_clear(srt->var[ivar].mask); } for (igrp=0; igrpngrp; igrp++) @@ -493,7 +485,11 @@ static void bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, vset->var[vset->nvar-1] = ivar; var_t *var = &srt->var[ivar]; vset->cnt = var->nvcf; - vset->mask = kbs_resize(vset->mask, srt->ngrp); + if ( kbs_resize(&vset->mask, srt->ngrp) < 0 ) + { + fprintf(stderr, "[%s:%d %s] kbs_resize failed\n", __FILE__,__LINE__,__func__); + exit(1); + } kbs_clear(vset->mask); kbs_bitwise_or(vset->mask, var->mask); @@ -557,9 +553,11 @@ static void 
bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, srt->chr = chr; srt->pos = min_pos; + + return 0; // FIXME: check for errs in this function } -int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int min_pos) +int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, hts_pos_t min_pos) { int i,j; assert( srt->nactive>0 ); diff --git a/bcf_sr_sort.h b/bcf_sr_sort.h index b903ae400..c8bd787a1 100644 --- a/bcf_sr_sort.h +++ b/bcf_sr_sort.h @@ -31,8 +31,8 @@ */ -#ifndef __BCF_SR_SORT_H__ -#define __BCF_SR_SORT_H__ +#ifndef BCF_SR_SORT_H +#define BCF_SR_SORT_H #include "htslib/synced_bcf_reader.h" #include "htslib/kbitset.h" @@ -90,7 +90,8 @@ typedef struct int moff, noff, *off, mcharp; char **charp; const char *chr; - int pos, nsr, msr; + hts_pos_t pos; + int nsr, msr; int pair; int nactive, mactive, *active; // list of readers with lines at the current pos } @@ -98,7 +99,7 @@ sr_sort_t; sr_sort_t *bcf_sr_sort_init(sr_sort_t *srt); void bcf_sr_sort_reset(sr_sort_t *srt); -int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int pos); +int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, hts_pos_t pos); int bcf_sr_sort_set_active(sr_sort_t *srt, int i); int bcf_sr_sort_add_active(sr_sort_t *srt, int i); void bcf_sr_sort_destroy(sr_sort_t *srt); diff --git a/bgzf.c b/bgzf.c index bd98ed42b..0a76676f8 100644 --- a/bgzf.c +++ b/bgzf.c @@ -2,7 +2,7 @@ Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology 2011, 2012 Attractive Chaos - Copyright (C) 2009, 2013-2017 Genome Research Ltd + Copyright (C) 2009, 2013-2019 Genome Research Ltd Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -23,6 +23,7 @@ THE SOFTWARE. 
*/ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include @@ -34,6 +35,7 @@ #include #include #include +#include #ifdef HAVE_LIBDEFLATE #include @@ -45,6 +47,7 @@ #include "htslib/thread_pool.h" #include "htslib/hts_endian.h" #include "cram/pooled_alloc.h" +#include "hts_internal.h" #define BGZF_CACHE #define BGZF_MT @@ -101,10 +104,25 @@ typedef struct bgzf_job { enum mtaux_cmd { NONE = 0, SEEK, + SEEK_DONE, HAS_EOF, + HAS_EOF_DONE, CLOSE, }; +// When multi-threaded bgzf_tell won't work, so we delay the hts_idx_push +// until we've written the last block. +typedef struct { + hts_pos_t beg, end; + int tid, is_mapped; // args for hts_idx_push + uint64_t offset, block_number; +} hts_idx_cache_entry; + +typedef struct { + int nentries, mentries; // used and allocated + hts_idx_cache_entry *e; // hts_idx elements +} hts_idx_cache_t; + typedef struct bgzf_mtaux_t { // Memory pool for bgzf_job structs, to avoid many malloc/free pool_alloc_t *job_pool; @@ -133,6 +151,12 @@ typedef struct bgzf_mtaux_t { pthread_mutex_t command_m; // Set whenever fp is being updated pthread_cond_t command_c; enum mtaux_cmd command; + + // For multi-threaded on-the-fly indexing. See bgzf_idx_push below. + pthread_mutex_t idx_m; + hts_idx_t *hts_idx; + uint64_t block_number, block_written; + hts_idx_cache_t idx_cache; } mtaux_t; #endif @@ -150,9 +174,93 @@ struct __bgzidx_t uint64_t ublock_addr; // offset of the current block (uncompressed data) }; +/* + * Buffers up arguments to hts_idx_push for later use, once we've written all bar + * this block. This is necessary when multiple blocks are in flight (threading) + * and fp->block_address isn't known at the time of call as we have in-flight + * blocks that haven't yet been compressed. + * + * NB: this only matters when we're indexing on the fly (writing). 
+ * Normal indexing is threaded reads, but we already know block sizes + * so it's a simpler process + * + * Returns 0 on success, + * -1 on failure + */ +int bgzf_idx_push(BGZF *fp, hts_idx_t *hidx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t offset, int is_mapped) { + hts_idx_cache_entry *e; + mtaux_t *mt = fp->mt; + + if (!mt) + return hts_idx_push(hidx, tid, beg, end, offset, is_mapped); + + // Early check for out of range positions which would fail in hts_idx_push() + if (hts_idx_check_range(hidx, tid, beg, end) < 0) + return -1; + + pthread_mutex_lock(&mt->idx_m); + + mt->hts_idx = hidx; + hts_idx_cache_t *ic = &mt->idx_cache; + + if (ic->nentries >= ic->mentries) { + int new_sz = ic->mentries ? ic->mentries*2 : 1024; + if (!(e = realloc(ic->e, new_sz * sizeof(*ic->e)))) { + pthread_mutex_unlock(&mt->idx_m); + return -1; + } + ic->e = e; + ic->mentries = new_sz; + } + + e = &ic->e[ic->nentries++]; + e->tid = tid; + e->beg = beg; + e->end = end; + e->is_mapped = is_mapped; + e->offset = offset & 0xffff; + e->block_number = mt->block_number; + + pthread_mutex_unlock(&mt->idx_m); + + return 0; +} + +static int bgzf_idx_flush(BGZF *fp) { + mtaux_t *mt = fp->mt; + + if (!mt->idx_cache.e) { + mt->block_written++; + return 0; + } + + pthread_mutex_lock(&mt->idx_m); + + hts_idx_cache_entry *e = mt->idx_cache.e; + int i; + + assert(mt->idx_cache.nentries == 0 || mt->block_written >= e[0].block_number); + + for (i = 0; i < mt->idx_cache.nentries && e[i].block_number == mt->block_written; i++) { + if (hts_idx_push(mt->hts_idx, e[i].tid, e[i].beg, e[i].end, + (mt->block_address << 16) + e[i].offset, + e[i].is_mapped) < 0) { + pthread_mutex_unlock(&mt->idx_m); + return -1; + } + } + + memmove(&e[0], &e[i], (mt->idx_cache.nentries - i) * sizeof(*e)); + mt->idx_cache.nentries -= i; + mt->block_written++; + + pthread_mutex_unlock(&mt->idx_m); + return 0; +} + void bgzf_index_destroy(BGZF *fp); int bgzf_index_add_block(BGZF *fp); -static void mt_destroy(mtaux_t *mt); 
+static int mt_destroy(mtaux_t *mt); static inline void packInt16(uint8_t *buffer, uint16_t value) { @@ -201,6 +309,8 @@ static const char *bgzf_zerr(int errnum, z_stream *zs) return "progress temporarily not possible, or in() / out() returned an error"; case Z_VERSION_ERROR: return "zlib version mismatch"; + case Z_NEED_DICT: + return "data was compressed using a dictionary"; case Z_OK: // 0: maybe gzgets error Z_NULL default: snprintf(buffer, sizeof(buffer), "[%d] unknown", errnum); @@ -514,7 +624,9 @@ static int deflate_block(BGZF *fp, int block_length) #ifdef HAVE_LIBDEFLATE -static int bgzf_uncompress(uint8_t *dst, size_t *dlen, const uint8_t *src, size_t slen) { +static int bgzf_uncompress(uint8_t *dst, size_t *dlen, + const uint8_t *src, size_t slen, + uint32_t expected_crc) { struct libdeflate_decompressor *z = libdeflate_alloc_decompressor(); if (!z) { hts_log_error("Call to libdeflate_alloc_decompressor failed"); @@ -529,20 +641,29 @@ static int bgzf_uncompress(uint8_t *dst, size_t *dlen, const uint8_t *src, size_ return -1; } + uint32_t crc = libdeflate_crc32(0, (unsigned char *)dst, *dlen); + if (crc != expected_crc) { + hts_log_error("CRC32 checksum mismatch"); + return -2; + } + return 0; } #else -static int bgzf_uncompress(uint8_t *dst, size_t *dlen, const uint8_t *src, size_t slen) { - z_stream zs; - zs.zalloc = NULL; - zs.zfree = NULL; - zs.msg = NULL; - zs.next_in = (Bytef*)src; - zs.avail_in = slen; - zs.next_out = (Bytef*)dst; - zs.avail_out = *dlen; +static int bgzf_uncompress(uint8_t *dst, size_t *dlen, + const uint8_t *src, size_t slen, + uint32_t expected_crc) { + z_stream zs = { + .zalloc = NULL, + .zfree = NULL, + .msg = NULL, + .next_in = (Bytef*)src, + .avail_in = slen, + .next_out = (Bytef*)dst, + .avail_out = *dlen + }; int ret = inflateInit2(&zs, -15); if (ret != Z_OK) { @@ -561,6 +682,13 @@ static int bgzf_uncompress(uint8_t *dst, size_t *dlen, const uint8_t *src, size_ return -1; } *dlen = *dlen - zs.avail_out; + + uint32_t crc = 
crc32(crc32(0L, NULL, 0L), (unsigned char *)dst, *dlen); + if (crc != expected_crc) { + hts_log_error("CRC32 checksum mismatch"); + return -2; + } + return 0; } #endif // HAVE_LIBDEFLATE @@ -569,60 +697,87 @@ static int bgzf_uncompress(uint8_t *dst, size_t *dlen, const uint8_t *src, size_ static int inflate_block(BGZF* fp, int block_length) { size_t dlen = BGZF_MAX_BLOCK_SIZE; + uint32_t crc = le_to_u32((uint8_t *)fp->compressed_block + block_length-8); int ret = bgzf_uncompress(fp->uncompressed_block, &dlen, - (Bytef*)fp->compressed_block + 18, block_length - 18); + (Bytef*)fp->compressed_block + 18, + block_length - 18, crc); if (ret < 0) { - fp->errcode |= BGZF_ERR_ZLIB; - return -1; - } - - // Check CRC of uncompressed block matches the gzip header. - // NB: we may wish to switch out the zlib crc32 for something more performant. - // See PR#361 and issue#467 -#ifdef HAVE_LIBDEFLATE - uint32_t c1 = libdeflate_crc32(0L, (unsigned char *)fp->uncompressed_block, dlen); -#else - uint32_t c1 = crc32(0L, (unsigned char *)fp->uncompressed_block, dlen); -#endif - uint32_t c2 = le_to_u32((uint8_t *)fp->compressed_block + block_length-8); - if (c1 != c2) { - fp->errcode |= BGZF_ERR_CRC; + if (ret == -2) + fp->errcode |= BGZF_ERR_CRC; + else + fp->errcode |= BGZF_ERR_ZLIB; return -1; } return dlen; } -static int inflate_gzip_block(BGZF *fp, int cached) +// Decompress the next part of a non-blocked GZIP file. +// Return the number of uncompressed bytes read, 0 on EOF, or a negative number on error. +// Will fill the output buffer unless the end of the GZIP file is reached. 
+static int inflate_gzip_block(BGZF *fp) { - int ret = Z_OK; - do - { - if ( !cached && fp->gz_stream->avail_out!=0 ) - { - fp->gz_stream->avail_in = hread(fp->fp, fp->compressed_block, BGZF_BLOCK_SIZE); - if ( fp->gz_stream->avail_in<=0 ) return fp->gz_stream->avail_in; - if ( fp->gz_stream->avail_in==0 ) break; + // we will set this to true when we detect EOF, so we don't bang against the EOF more than once per call + int input_eof = 0; + + // write to the part of the output buffer after block_offset + fp->gz_stream->next_out = (Bytef*)fp->uncompressed_block + fp->block_offset; + fp->gz_stream->avail_out = BGZF_MAX_BLOCK_SIZE - fp->block_offset; + + while ( fp->gz_stream->avail_out != 0 ) { + // until we fill the output buffer (or hit EOF) + + if ( !input_eof && fp->gz_stream->avail_in == 0 ) { + // we are out of input data in the buffer. Get more. fp->gz_stream->next_in = fp->compressed_block; + int ret = hread(fp->fp, fp->compressed_block, BGZF_BLOCK_SIZE); + if ( ret < 0 ) { + // hread had an error. Pass it on. + return ret; + } + fp->gz_stream->avail_in = ret; + if ( fp->gz_stream->avail_in < BGZF_BLOCK_SIZE ) { + // we have reached EOF but the decompressor hasn't necessarily + input_eof = 1; + } } - else cached = 0; - do - { - fp->gz_stream->next_out = (Bytef*)fp->uncompressed_block + fp->block_offset; - fp->gz_stream->avail_out = BGZF_MAX_BLOCK_SIZE - fp->block_offset; - fp->gz_stream->msg = NULL; - ret = inflate(fp->gz_stream, Z_NO_FLUSH); - if (ret < 0 && ret != Z_BUF_ERROR) { - hts_log_error("Inflate operation failed: %s", bgzf_zerr(ret, ret == Z_DATA_ERROR ? fp->gz_stream : NULL)); - fp->errcode |= BGZF_ERR_ZLIB; - return -1; + + fp->gz_stream->msg = NULL; + // decompress as much data as we can + int ret = inflate(fp->gz_stream, Z_SYNC_FLUSH); + + if ( (ret < 0 && ret != Z_BUF_ERROR) || ret == Z_NEED_DICT ) { + // an error occurred, other than running out of space + hts_log_error("Inflate operation failed: %s", bgzf_zerr(ret, ret == Z_DATA_ERROR ? 
fp->gz_stream : NULL)); + fp->errcode |= BGZF_ERR_ZLIB; + return -1; + } else if ( ret == Z_STREAM_END ) { + // we finished a GZIP member + + // scratch for peeking to see if the file is over + char c; + if (fp->gz_stream->avail_in > 0 || hpeek(fp->fp, &c, 1) == 1) { + // there is more data; try and read another GZIP member in the remaining data + int reset_ret = inflateReset(fp->gz_stream); + if (reset_ret != Z_OK) { + hts_log_error("Call to inflateReset failed: %s", bgzf_zerr(reset_ret, NULL)); + fp->errcode |= BGZF_ERR_ZLIB; + return -1; + } + } else { + // we consumed all the input data and hit Z_STREAM_END + // so stop looping, even if we never fill the output buffer + break; } - unsigned int have = BGZF_MAX_BLOCK_SIZE - fp->gz_stream->avail_out; - if ( have ) return have; + } else if ( ret == Z_BUF_ERROR && input_eof && fp->gz_stream->avail_out > 0 ) { + // the gzip file has ended prematurely + hts_log_error("Gzip file truncated"); + fp->errcode |= BGZF_ERR_IO; + return -1; } - while ( fp->gz_stream->avail_out == 0 ); } - while (ret != Z_STREAM_END); + + // when we get here, the buffer is full or there is an EOF after a complete gzip member return BGZF_MAX_BLOCK_SIZE - fp->gz_stream->avail_out; } @@ -664,7 +819,7 @@ static int load_block_from_cache(BGZF *fp, int64_t block_address) if ( hseek(fp->fp, p->end_offset, SEEK_SET) < 0 ) { // todo: move the error up - hts_log_error("Could not hseek to %"PRId64"", p->end_offset); + hts_log_error("Could not hseek to %" PRId64, p->end_offset); exit(1); } return p->size; @@ -759,7 +914,8 @@ int bgzf_read_block(BGZF *fp) if (fp->uncompressed_block == NULL) return -1; fp->compressed_block = (char *)fp->uncompressed_block + BGZF_MAX_BLOCK_SIZE; } // else it's already allocated with malloc, maybe even in-use. 
- mt_destroy(fp->mt); + if (mt_destroy(fp->mt) < 0) + fp->errcode = BGZF_ERR_IO; fp->mt = NULL; hts_tpool_delete_result(r, 0); @@ -826,6 +982,9 @@ int bgzf_read_block(BGZF *fp) single_threaded: size = 0; + int64_t block_address; + block_address = bgzf_htell(fp); + // Reading an uncompressed file if ( !fp->is_compressed ) { @@ -841,17 +1000,15 @@ int bgzf_read_block(BGZF *fp) return 0; } if (fp->block_length != 0) fp->block_offset = 0; - fp->block_address += count; + fp->block_address = block_address; fp->block_length = count; return 0; } // Reading compressed file - int64_t block_address; - block_address = bgzf_htell(fp); if ( fp->is_gzip && fp->gz_stream ) // is this is an initialized gzip stream? { - count = inflate_gzip_block(fp, 0); + count = inflate_gzip_block(fp); if ( count<0 ) { fp->errcode |= BGZF_ERR_ZLIB; @@ -887,47 +1044,20 @@ int bgzf_read_block(BGZF *fp) uint8_t *cblock = (uint8_t*)fp->compressed_block; memcpy(cblock, header, sizeof(header)); count = hread(fp->fp, cblock+sizeof(header), BGZF_BLOCK_SIZE - sizeof(header)) + sizeof(header); - int nskip = 10; - - // Check optional fields to skip: FLG.FNAME,FLG.FCOMMENT,FLG.FHCRC,FLG.FEXTRA - // Note: Some of these fields are untested, I did not have appropriate data available - if ( header[3] & 0x4 ) // FLG.FEXTRA - { - nskip += unpackInt16(&cblock[nskip]) + 2; - } - if ( header[3] & 0x8 ) // FLG.FNAME - { - while ( nskip= count ) - { - fp->errcode |= BGZF_ERR_HEADER; - return -1; - } fp->is_gzip = 1; fp->gz_stream = (z_stream*) calloc(1,sizeof(z_stream)); - int ret = inflateInit2(fp->gz_stream, -15); + // Set up zlib, using a window size of 15, and its built-in GZIP header processing (+16). 
+ int ret = inflateInit2(fp->gz_stream, 15 + 16); if (ret != Z_OK) { hts_log_error("Call to inflateInit2 failed: %s", bgzf_zerr(ret, fp->gz_stream)); fp->errcode |= BGZF_ERR_ZLIB; return -1; } - fp->gz_stream->avail_in = count - nskip; - fp->gz_stream->next_in = cblock + nskip; - count = inflate_gzip_block(fp, 1); + fp->gz_stream->avail_in = count; + fp->gz_stream->next_in = cblock; + count = inflate_gzip_block(fp); if ( count<0 ) { fp->errcode |= BGZF_ERR_ZLIB; @@ -1012,6 +1142,23 @@ ssize_t bgzf_read(BGZF *fp, void *data, size_t length) return bytes_read; } +// -1 for EOF, -2 for error, 0-255 for byte. +int bgzf_peek(BGZF *fp) { + int available = fp->block_length - fp->block_offset; + if (available <= 0) { + if (bgzf_read_block(fp) < 0) { + hts_log_error("Read block operation failed with error %d", fp->errcode); + fp->errcode = BGZF_ERR_ZLIB; + return -2; + } + } + available = fp->block_length - fp->block_offset; + if (available) + return ((unsigned char *)fp->uncompressed_block)[fp->block_offset]; + + return -1; +} + ssize_t bgzf_raw_read(BGZF *fp, void *data, size_t length) { ssize_t ret = hread(fp->fp, data, length); @@ -1021,7 +1168,18 @@ ssize_t bgzf_raw_read(BGZF *fp, void *data, size_t length) #ifdef BGZF_MT -void *bgzf_encode_func(void *arg) { +/* Function to clean up when jobs are discarded (e.g. during seek) + * This works for results too, as results are the same struct with + * decompressed data stored in it. */ +static void job_cleanup(void *arg) { + bgzf_job *j = (bgzf_job *)arg; + mtaux_t *mt = j->fp->mt; + pthread_mutex_lock(&mt->job_pool_m); + pool_free(mt->job_pool, j); + pthread_mutex_unlock(&mt->job_pool_m); +} + +static void *bgzf_encode_func(void *arg) { bgzf_job *j = (bgzf_job *)arg; j->comp_len = BGZF_MAX_BLOCK_SIZE; @@ -1036,7 +1194,7 @@ void *bgzf_encode_func(void *arg) { // Optimisation for compression level 0 (uncompressed deflate blocks) // Avoids memcpy of the data from uncompressed to compressed buffer. 
-void *bgzf_encode_level0_func(void *arg) { +static void *bgzf_encode_level0_func(void *arg) { bgzf_job *j = (bgzf_job *)arg; uint32_t crc; j->comp_len = j->uncomp_len + BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH + 5; @@ -1070,12 +1228,13 @@ void *bgzf_encode_level0_func(void *arg) { // Our input block has already been decoded by bgzf_mt_read_block(). // We need to split that into a fetch block (compressed) and make this // do the actual decompression step. -void *bgzf_decode_func(void *arg) { +static void *bgzf_decode_func(void *arg) { bgzf_job *j = (bgzf_job *)arg; j->uncomp_len = BGZF_MAX_BLOCK_SIZE; + uint32_t crc = le_to_u32((uint8_t *)j->comp_data + j->comp_len-8); int ret = bgzf_uncompress(j->uncomp_data, &j->uncomp_len, - j->comp_data+18, j->comp_len-18); + j->comp_data+18, j->comp_len-18, crc); if (ret != 0) j->errcode |= BGZF_ERR_ZLIB; @@ -1086,7 +1245,7 @@ void *bgzf_decode_func(void *arg) { * Nul function so we can dispatch a job with the correct serial * to mark failure or to indicate an empty read (EOF). */ -void *bgzf_nul_func(void *arg) { return arg; } +static void *bgzf_nul_func(void *arg) { return arg; } /* * Takes compressed blocks off the results queue and calls hwrite to @@ -1099,16 +1258,43 @@ static void *bgzf_mt_writer(void *vp) { mtaux_t *mt = fp->mt; hts_tpool_result *r; + if (fp->idx_build_otf) { + fp->idx->moffs = fp->idx->noffs = 1; + fp->idx->offs = (bgzidx1_t*) calloc(fp->idx->moffs, sizeof(bgzidx1_t)); + if (!fp->idx->offs) goto err; + } + // Iterates until result queue is shutdown, where it returns NULL. 
while ((r = hts_tpool_next_result_wait(mt->out_queue))) { bgzf_job *j = (bgzf_job *)hts_tpool_result_data(r); assert(j); - if (hwrite(fp->fp, j->comp_data, j->comp_len) != j->comp_len) { - fp->errcode |= BGZF_ERR_IO; - goto err; + if (fp->idx_build_otf) { + fp->idx->noffs++; + if ( fp->idx->noffs > fp->idx->moffs ) + { + fp->idx->moffs = fp->idx->noffs; + kroundup32(fp->idx->moffs); + fp->idx->offs = (bgzidx1_t*) realloc(fp->idx->offs, fp->idx->moffs*sizeof(bgzidx1_t)); + if ( !fp->idx->offs ) goto err; + } + fp->idx->offs[ fp->idx->noffs-1 ].uaddr = fp->idx->offs[ fp->idx->noffs-2 ].uaddr + j->uncomp_len; + fp->idx->offs[ fp->idx->noffs-1 ].caddr = fp->idx->offs[ fp->idx->noffs-2 ].caddr + j->comp_len; } + // Flush any cached hts_idx_push calls + if (bgzf_idx_flush(fp) < 0) + goto err; + + if (hwrite(fp->fp, j->comp_data, j->comp_len) != j->comp_len) + goto err; + + // Update our local block_address. Cannot be fp->block_address due to no + // locking in bgzf_tell. + pthread_mutex_lock(&mt->idx_m); + mt->block_address += j->comp_len; + pthread_mutex_unlock(&mt->idx_m); + /* * Periodically call hflush (which calls fsync when on a file). * This avoids the fsync being done at the bgzf_close stage, @@ -1219,8 +1405,13 @@ static int bgzf_check_EOF_common(BGZF *fp) if (errno == ESPIPE) { hclearerr(fp->fp); return 2; } #ifdef _WIN32 if (errno == EINVAL) { hclearerr(fp->fp); return 2; } +#else + // Assume that EINVAL was due to the file being less than 28 bytes + // long, rather than being a random error return from an hfile backend. + // This should be reported as "no EOF block" rather than an error. 
+ if (errno == EINVAL) { hclearerr(fp->fp); return 0; } #endif - else return -1; + return -1; } if ( hread(fp->fp, buf, 28) != 28 ) return -1; if ( hseek(fp->fp, offset, SEEK_SET) < 0 ) return -1; @@ -1236,6 +1427,7 @@ static void bgzf_mt_eof(BGZF *fp) { pthread_mutex_lock(&mt->job_pool_m); mt->eof = bgzf_check_EOF_common(fp); pthread_mutex_unlock(&mt->job_pool_m); + mt->command = HAS_EOF_DONE; pthread_cond_signal(&mt->command_c); } @@ -1251,13 +1443,13 @@ static void bgzf_mt_seek(BGZF *fp) { hts_tpool_process_reset(mt->out_queue, 0); pthread_mutex_lock(&mt->job_pool_m); - mt->command = NONE; mt->errcode = 0; if (hseek(fp->fp, mt->block_address, SEEK_SET) < 0) mt->errcode = BGZF_ERR_IO; pthread_mutex_unlock(&mt->job_pool_m); + mt->command = SEEK_DONE; pthread_cond_signal(&mt->command_c); } @@ -1269,25 +1461,40 @@ static void *bgzf_mt_reader(void *vp) { pthread_mutex_lock(&mt->job_pool_m); bgzf_job *j = pool_alloc(mt->job_pool); pthread_mutex_unlock(&mt->job_pool_m); + if (!j) { + hts_tpool_process_destroy(mt->out_queue); + return NULL; + } j->errcode = 0; j->comp_len = 0; j->uncomp_len = 0; j->hit_eof = 0; + j->fp = fp; while (bgzf_mt_read_block(fp, j) == 0) { // Dispatch - hts_tpool_dispatch(mt->pool, mt->out_queue, bgzf_decode_func, j); + if (hts_tpool_dispatch3(mt->pool, mt->out_queue, bgzf_decode_func, j, + job_cleanup, job_cleanup, 0) < 0) { + job_cleanup(j); + hts_tpool_process_destroy(mt->out_queue); + return NULL; + } // Check for command pthread_mutex_lock(&mt->command_m); switch (mt->command) { case SEEK: - bgzf_mt_seek(fp); + bgzf_mt_seek(fp); // Sets mt->command to SEEK_DONE pthread_mutex_unlock(&mt->command_m); goto restart; case HAS_EOF: - bgzf_mt_eof(fp); + bgzf_mt_eof(fp); // Sets mt->command to HAS_EOF_DONE + break; + + case SEEK_DONE: + case HAS_EOF_DONE: + pthread_cond_signal(&mt->command_c); break; case CLOSE: @@ -1305,16 +1512,26 @@ static void *bgzf_mt_reader(void *vp) { pthread_mutex_lock(&mt->job_pool_m); j = pool_alloc(mt->job_pool); 
pthread_mutex_unlock(&mt->job_pool_m); + if (!j) { + hts_tpool_process_destroy(mt->out_queue); + return NULL; + } j->errcode = 0; j->comp_len = 0; j->uncomp_len = 0; j->hit_eof = 0; + j->fp = fp; } if (j->errcode == BGZF_ERR_MT) { // Attempt to multi-thread decode a raw gzip stream cannot be done. // We tear down the multi-threaded decoder and revert to the old code. - hts_tpool_dispatch(mt->pool, mt->out_queue, bgzf_nul_func, j); + if (hts_tpool_dispatch3(mt->pool, mt->out_queue, bgzf_nul_func, j, + job_cleanup, job_cleanup, 0) < 0) { + job_cleanup(j); + hts_tpool_process_destroy(mt->out_queue); + return NULL; + } hts_tpool_process_ref_decr(mt->out_queue); return &j->errcode; } @@ -1324,7 +1541,12 @@ static void *bgzf_mt_reader(void *vp) { // j->errcode is set already. j->hit_eof = 1; - hts_tpool_dispatch(mt->pool, mt->out_queue, bgzf_nul_func, j); + if (hts_tpool_dispatch3(mt->pool, mt->out_queue, bgzf_nul_func, j, + job_cleanup, job_cleanup, 0) < 0) { + job_cleanup(j); + hts_tpool_process_destroy(mt->out_queue); + return NULL; + } if (j->errcode != 0) { hts_tpool_process_destroy(mt->out_queue); return &j->errcode; @@ -1350,9 +1572,15 @@ static void *bgzf_mt_reader(void *vp) { goto restart; case HAS_EOF: - bgzf_mt_eof(fp); + bgzf_mt_eof(fp); // Sets mt->command to HAS_EOF_DONE pthread_mutex_unlock(&mt->command_m); - continue; + break; + + case SEEK_DONE: + case HAS_EOF_DONE: + pthread_cond_signal(&mt->command_c); + pthread_mutex_unlock(&mt->command_m); + break; case CLOSE: pthread_cond_signal(&mt->command_c); @@ -1387,10 +1615,12 @@ int bgzf_thread_pool(BGZF *fp, hts_tpool *pool, int qsize) { pthread_mutex_init(&mt->job_pool_m, NULL); pthread_mutex_init(&mt->command_m, NULL); + pthread_mutex_init(&mt->idx_m, NULL); pthread_cond_init(&mt->command_c, NULL); mt->flush_pending = 0; mt->jobs_pending = 0; mt->free_block = fp->uncompressed_block; // currently in-use block + mt->block_address = fp->block_address; pthread_create(&mt->io_task, NULL, fp->is_write ? 
bgzf_mt_writer : bgzf_mt_reader, fp); @@ -1418,8 +1648,10 @@ int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks) return 0; } -static void mt_destroy(mtaux_t *mt) +static int mt_destroy(mtaux_t *mt) { + int ret = 0; + pthread_mutex_lock(&mt->command_m); mt->command = CLOSE; pthread_cond_signal(&mt->command_c); @@ -1428,10 +1660,13 @@ static void mt_destroy(mtaux_t *mt) // Destroying the queue first forces the writer to exit. hts_tpool_process_destroy(mt->out_queue); - pthread_join(mt->io_task, NULL); + void *retval = NULL; + pthread_join(mt->io_task, &retval); + ret = retval != NULL ? -1 : 0; pthread_mutex_destroy(&mt->job_pool_m); pthread_mutex_destroy(&mt->command_m); + pthread_mutex_destroy(&mt->idx_m); pthread_cond_destroy(&mt->command_c); if (mt->curr_job) pool_free(mt->job_pool, mt->curr_job); @@ -1441,19 +1676,27 @@ static void mt_destroy(mtaux_t *mt) pool_destroy(mt->job_pool); + if (mt->idx_cache.e) + free(mt->idx_cache.e); + free(mt); fflush(stderr); + + return ret; } static int mt_queue(BGZF *fp) { mtaux_t *mt = fp->mt; + mt->block_number++; + // Also updated by writer thread pthread_mutex_lock(&mt->job_pool_m); bgzf_job *j = pool_alloc(mt->job_pool); - mt->jobs_pending++; + if (j) mt->jobs_pending++; pthread_mutex_unlock(&mt->job_pool_m); + if (!j) return -1; j->fp = fp; j->errcode = 0; @@ -1461,16 +1704,30 @@ static int mt_queue(BGZF *fp) if (fp->compress_level == 0) { memcpy(j->comp_data + BLOCK_HEADER_LENGTH + 5, fp->uncompressed_block, j->uncomp_len); - hts_tpool_dispatch(mt->pool, mt->out_queue, bgzf_encode_level0_func, j); + if (hts_tpool_dispatch3(mt->pool, mt->out_queue, + bgzf_encode_level0_func, j, + job_cleanup, job_cleanup, 0) < 0) { + goto fail; + } } else { memcpy(j->uncomp_data, fp->uncompressed_block, j->uncomp_len); // Need non-block vers & job_pending? 
- hts_tpool_dispatch(mt->pool, mt->out_queue, bgzf_encode_func, j); + if (hts_tpool_dispatch3(mt->pool, mt->out_queue, bgzf_encode_func, j, + job_cleanup, job_cleanup, 0) < 0) { + goto fail; + } } fp->block_offset = 0; return 0; + + fail: + job_cleanup(j); + pthread_mutex_lock(&mt->job_pool_m); + mt->jobs_pending--; + pthread_mutex_unlock(&mt->job_pool_m); + return -1; } static int mt_flush_queue(BGZF *fp) @@ -1526,7 +1783,17 @@ int bgzf_flush(BGZF *fp) if (fp->mt) { int ret = 0; if (fp->block_offset) ret = mt_queue(fp); - return ret ? ret : mt_flush_queue(fp); + if (!ret) ret = mt_flush_queue(fp); + + // We maintain mt->block_address when threading as the + // main code can call bgzf_tell without any locks. + // (The results from tell are wrong, but we only care about the last + // 16 bits' worth, except for the final flush process.) + pthread_mutex_lock(&fp->mt->idx_m); + fp->block_address = fp->mt->block_address; + pthread_mutex_unlock(&fp->mt->idx_m); + + return ret; } #endif while (fp->block_offset > 0) { @@ -1559,8 +1826,12 @@ int bgzf_flush_try(BGZF *fp, ssize_t size) ssize_t bgzf_write(BGZF *fp, const void *data, size_t length) { - if ( !fp->is_compressed ) + if ( !fp->is_compressed ) { + size_t push = length + (size_t) fp->block_offset; + fp->block_offset = push % BGZF_MAX_BLOCK_SIZE; + fp->block_address += (push - fp->block_offset); return hwrite(fp->fp, data, length); + } const uint8_t *input = (const uint8_t*)data; ssize_t remaining = length; @@ -1582,8 +1853,13 @@ ssize_t bgzf_write(BGZF *fp, const void *data, size_t length) ssize_t bgzf_block_write(BGZF *fp, const void *data, size_t length) { - if ( !fp->is_compressed ) + if ( !fp->is_compressed ) { + size_t push = length + (size_t) fp->block_offset; + fp->block_offset = push % BGZF_MAX_BLOCK_SIZE; + fp->block_address += (push - fp->block_offset); return hwrite(fp->fp, data, length); + } + const uint8_t *input = (const uint8_t*)data; ssize_t remaining = length; assert(fp->is_write); @@ -1616,16 +1892,30
@@ ssize_t bgzf_raw_write(BGZF *fp, const void *data, size_t length) return ret; } +// Helper function for tidying up fp->mt and setting errcode +static void bgzf_close_mt(BGZF *fp) { + if (fp->mt) { + if (!fp->mt->free_block) + fp->uncompressed_block = NULL; + if (mt_destroy(fp->mt) < 0) + fp->errcode = BGZF_ERR_IO; + } +} + int bgzf_close(BGZF* fp) { int ret, block_length; if (fp == 0) return -1; if (fp->is_write && fp->is_compressed) { - if (bgzf_flush(fp) != 0) return -1; + if (bgzf_flush(fp) != 0) { + bgzf_close_mt(fp); + return -1; + } fp->compress_level = -1; block_length = deflate_block(fp, 0); // write an empty block if (block_length < 0) { hts_log_debug("Deflate block operation failed: %s", bgzf_zerr(block_length, NULL)); + bgzf_close_mt(fp); return -1; } if (hwrite(fp->fp, fp->compressed_block, block_length) < 0 @@ -1635,13 +1925,9 @@ int bgzf_close(BGZF* fp) return -1; } } -#ifdef BGZF_MT - if (fp->mt) { - if (!fp->mt->free_block) - fp->uncompressed_block = NULL; - mt_destroy(fp->mt); - } -#endif + + bgzf_close_mt(fp); + if ( fp->is_gzip ) { if (fp->gz_stream == NULL) ret = Z_OK; @@ -1663,6 +1949,7 @@ int bgzf_close(BGZF* fp) void bgzf_set_cache_size(BGZF *fp, int cache_size) { + if (fp && fp->mt) return; // Not appropriate when multi-threading if (fp && fp->cache) fp->cache_size = cache_size; } @@ -1671,10 +1958,25 @@ int bgzf_check_EOF(BGZF *fp) { if (fp->mt) { pthread_mutex_lock(&fp->mt->command_m); + // fp->mt->command state transitions should be: + // NONE -> HAS_EOF -> HAS_EOF_DONE -> NONE + // (HAS_EOF -> HAS_EOF_DONE happens in bgzf_mt_reader thread) fp->mt->command = HAS_EOF; pthread_cond_signal(&fp->mt->command_c); hts_tpool_wake_dispatch(fp->mt->out_queue); - pthread_cond_wait(&fp->mt->command_c, &fp->mt->command_m); + do { + pthread_cond_wait(&fp->mt->command_c, &fp->mt->command_m); + switch (fp->mt->command) { + case HAS_EOF_DONE: break; + case HAS_EOF: + // Resend signal intended for bgzf_mt_reader() + 
pthread_cond_signal(&fp->mt->command_c); + break; + default: + abort(); // Should not get to any other state + } + } while (fp->mt->command != HAS_EOF_DONE); + fp->mt->command = NONE; has_eof = fp->mt->eof; pthread_mutex_unlock(&fp->mt->command_m); } else { @@ -1686,18 +1988,9 @@ int bgzf_check_EOF(BGZF *fp) { return has_eof; } -int64_t bgzf_seek(BGZF* fp, int64_t pos, int where) +static inline int64_t bgzf_seek_common(BGZF* fp, + int64_t block_address, int block_offset) { - int block_offset; - int64_t block_address; - - if (fp->is_write || where != SEEK_SET || fp->is_gzip) { - fp->errcode |= BGZF_ERR_MISUSE; - return -1; - } - block_offset = pos & 0xFFFF; - block_address = pos >> 16; - if (fp->mt) { // The reader runs asynchronous and does loops of: // Read block @@ -1714,11 +2007,26 @@ int64_t bgzf_seek(BGZF* fp, int64_t pos, int where) // know the seek succeeded. pthread_mutex_lock(&fp->mt->command_m); fp->mt->hit_eof = 0; + // fp->mt->command state transitions should be: + // NONE -> SEEK -> SEEK_DONE -> NONE + // (SEEK -> SEEK_DONE happens in bgzf_mt_reader thread) fp->mt->command = SEEK; fp->mt->block_address = block_address; pthread_cond_signal(&fp->mt->command_c); hts_tpool_wake_dispatch(fp->mt->out_queue); - pthread_cond_wait(&fp->mt->command_c, &fp->mt->command_m); + do { + pthread_cond_wait(&fp->mt->command_c, &fp->mt->command_m); + switch (fp->mt->command) { + case SEEK_DONE: break; + case SEEK: + // Resend signal intended for bgzf_mt_reader() + pthread_cond_signal(&fp->mt->command_c); + break; + default: + abort(); // Should not get to any other state + } + } while (fp->mt->command != SEEK_DONE); + fp->mt->command = NONE; fp->block_length = 0; // indicates current block has not been loaded fp->block_address = block_address; @@ -1738,6 +2046,25 @@ int64_t bgzf_seek(BGZF* fp, int64_t pos, int where) return 0; } +int64_t bgzf_seek(BGZF* fp, int64_t pos, int where) +{ + if (fp->is_write || where != SEEK_SET || fp->is_gzip) { + fp->errcode |= 
BGZF_ERR_MISUSE; + return -1; + } + + // This is a flag to indicate we've jumped elsewhere in the stream, to act + // as a hint to any other code which is wrapping up bgzf for its own + // purposes. We may not be able to tell when seek happens as it can be + // done on our behalf, eg by the iterator. + // + // This is never cleared here. Any tool that needs to handle it is also + // responsible for clearing it. + fp->seeked = pos; + + return bgzf_seek_common(fp, pos >> 16, pos & 0xFFFF); +} + int bgzf_is_bgzf(const char *fn) { uint8_t buf[16]; @@ -1880,6 +2207,10 @@ int bgzf_index_dump_hfile(BGZF *fp, struct hFILE *idx, const char *name) if (bgzf_flush(fp) != 0) return -1; + // discard the entry marking the end of the file + if (fp->mt && fp->idx) + fp->idx->noffs--; + if (hwrite_uint64(fp->idx->noffs - 1, idx) < 0) goto fail; for (i=1; iidx->noffs; i++) { @@ -2015,8 +2346,19 @@ int bgzf_index_load(BGZF *fp, const char *bname, const char *suffix) return -1; } -int bgzf_useek(BGZF *fp, long uoffset, int where) +int bgzf_useek(BGZF *fp, off_t uoffset, int where) { + if (fp->is_write || where != SEEK_SET || fp->is_gzip) { + fp->errcode |= BGZF_ERR_MISUSE; + return -1; + } + if (uoffset >= fp->uncompressed_address - fp->block_offset && + uoffset < fp->uncompressed_address + fp->block_length - fp->block_offset) { + // Can seek into existing data + fp->block_offset += uoffset - fp->uncompressed_address; + fp->uncompressed_address = uoffset; + return 0; + } if ( !fp->is_compressed ) { if (hseek(fp->fp, uoffset, SEEK_SET) < 0) @@ -2051,14 +2393,9 @@ int bgzf_useek(BGZF *fp, long uoffset, int where) else break; } int i = ilo-1; - if (hseek(fp->fp, fp->idx->offs[i].caddr, SEEK_SET) < 0) - { - fp->errcode |= BGZF_ERR_IO; + if (bgzf_seek_common(fp, fp->idx->offs[i].caddr, 0) < 0) return -1; - } - fp->block_length = 0; // indicates current block has not been loaded - fp->block_address = fp->idx->offs[i].caddr; - fp->block_offset = 0; + if ( bgzf_read_block(fp) < 0 ) { 
fp->errcode |= BGZF_ERR_IO; return -1; @@ -2072,7 +2409,7 @@ int bgzf_useek(BGZF *fp, long uoffset, int where) return 0; } -long bgzf_utell(BGZF *fp) +off_t bgzf_utell(BGZF *fp) { return fp->uncompressed_address; // currently maintained only when reading } diff --git a/bgzip.1 b/bgzip.1 index 71eddde42..6a89280c5 100644 --- a/bgzip.1 +++ b/bgzip.1 @@ -1,4 +1,4 @@ -.TH bgzip 1 "18 July 2018" "htslib-1.9" "Bioinformatics tools" +.TH bgzip 1 "6 December 2019" "htslib-1.10" "Bioinformatics tools" .SH NAME .PP bgzip \- Block compression/decompression utility @@ -86,7 +86,9 @@ Write to standard output, keep original files unchanged. Decompress. .TP .B "-f, --force" -Overwrite files without asking. +Overwrite files without asking, or decompress files that don't have a known +compression filename extension (e.g., \fI.gz\fR) without asking. +Use \fB--force\fR twice to do both without asking. .TP .B "-h, --help" Displays a help message. diff --git a/bgzip.c b/bgzip.c index 6ddd61fa0..c3ff929ae 100644 --- a/bgzip.c +++ b/bgzip.c @@ -1,7 +1,7 @@ /* bgzip.c -- Block compression/decompression utility. Copyright (C) 2008, 2009 Broad Institute / Massachusetts Institute of Technology - Copyright (C) 2010, 2013-2018 Genome Research Ltd. + Copyright (C) 2010, 2013-2019 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -26,6 +26,7 @@ #include #include +#include #include #include #include @@ -33,7 +34,6 @@ #include #include #include -#include #include "htslib/bgzf.h" #include "htslib/hts.h" @@ -53,42 +53,76 @@ static void error(const char *format, ...) 
exit(EXIT_FAILURE); } +static int ask_yn() +{ + char line[1024]; + if (fgets(line, sizeof line, stdin) == NULL) + return 0; + return line[0] == 'Y' || line[0] == 'y'; +} + static int confirm_overwrite(const char *fn) { int save_errno = errno; int ret = 0; if (isatty(STDIN_FILENO)) { - char c; fprintf(stderr, "[bgzip] %s already exists; do you wish to overwrite (y or n)? ", fn); - if (scanf("%c", &c) == 1 && (c == 'Y' || c == 'y')) ret = 1; + if (ask_yn()) ret = 1; } errno = save_errno; return ret; } -static int bgzip_main_usage(void) +static int known_extension(const char *ext) +{ + static const char *known[] = { + "gz", "bgz", "bgzf", + NULL + }; + + const char **p; + for (p = known; *p; p++) + if (strcasecmp(ext, *p) == 0) return 1; + return 0; +} + +static int confirm_filename(int *is_forced, const char *name, const char *ext) { - fprintf(stderr, "\n"); - fprintf(stderr, "Version: %s\n", hts_version()); - fprintf(stderr, "Usage: bgzip [OPTIONS] [FILE] ...\n"); - fprintf(stderr, "Options:\n"); - fprintf(stderr, " -b, --offset INT decompress at virtual file pointer (0-based uncompressed offset)\n"); - fprintf(stderr, " -c, --stdout write on standard output, keep original files unchanged\n"); - fprintf(stderr, " -d, --decompress decompress\n"); - fprintf(stderr, " -f, --force overwrite files without asking\n"); - fprintf(stderr, " -h, --help give this help\n"); - fprintf(stderr, " -i, --index compress and create BGZF index\n"); - fprintf(stderr, " -I, --index-name FILE name of BGZF index file [file.gz.gzi]\n"); - fprintf(stderr, " -l, --compress-level INT Compression level to use when compressing; 0 to 9, or -1 for default [-1]\n"); - fprintf(stderr, " -r, --reindex (re)index compressed file\n"); - fprintf(stderr, " -g, --rebgzip use an index file to bgzip a file\n"); - fprintf(stderr, " -s, --size INT decompress INT bytes (uncompressed size)\n"); - fprintf(stderr, " -@, --threads INT number of compression threads to use [1]\n"); - fprintf(stderr, " -t, --test test 
integrity of compressed file"); - fprintf(stderr, "\n"); - return 1; + if (*is_forced) { + (*is_forced)--; + return 1; + } + + if (!isatty(STDIN_FILENO)) + return 0; + + fprintf(stderr, "[bgzip] .%s is not a known extension; do you wish to decompress to %s (y or n)? ", ext, name); + return ask_yn(); +} + +static int bgzip_main_usage(FILE *fp, int status) +{ + fprintf(fp, "\n"); + fprintf(fp, "Version: %s\n", hts_version()); + fprintf(fp, "Usage: bgzip [OPTIONS] [FILE] ...\n"); + fprintf(fp, "Options:\n"); + fprintf(fp, " -b, --offset INT decompress at virtual file pointer (0-based uncompressed offset)\n"); + fprintf(fp, " -c, --stdout write on standard output, keep original files unchanged\n"); + fprintf(fp, " -d, --decompress decompress\n"); + fprintf(fp, " -f, --force overwrite files without asking\n"); + fprintf(fp, " -h, --help give this help\n"); + fprintf(fp, " -i, --index compress and create BGZF index\n"); + fprintf(fp, " -I, --index-name FILE name of BGZF index file [file.gz.gzi]\n"); + fprintf(fp, " -l, --compress-level INT Compression level to use when compressing; 0 to 9, or -1 for default [-1]\n"); + fprintf(fp, " -r, --reindex (re)index compressed file\n"); + fprintf(fp, " -g, --rebgzip use an index file to bgzip a file\n"); + fprintf(fp, " -s, --size INT decompress INT bytes (uncompressed size)\n"); + fprintf(fp, " -@, --threads INT number of compression threads to use [1]\n"); + fprintf(fp, " -t, --test test integrity of compressed file"); + fprintf(fp, "\n"); + return status; } int main(int argc, char **argv) @@ -126,7 +160,7 @@ int main(int argc, char **argv) case 'c': pstdout = 1; break; case 'b': start = atol(optarg); compress = 0; pstdout = 1; break; case 's': size = atol(optarg); pstdout = 1; break; - case 'f': is_forced = 1; break; + case 'f': is_forced++; break; case 'i': index = 1; break; case 'I': index_fname = optarg; break; case 'l': compress_level = atol(optarg); break; @@ -137,10 +171,10 @@ int main(int argc, char **argv) case 1: 
printf( "bgzip (htslib) %s\n" -"Copyright (C) 2018 Genome Research Ltd.\n", hts_version()); +"Copyright (C) 2019 Genome Research Ltd.\n", hts_version()); return EXIT_SUCCESS; - case 'h': - case '?': return bgzip_main_usage(); + case 'h': return bgzip_main_usage(stdout, EXIT_SUCCESS); + case '?': return bgzip_main_usage(stderr, EXIT_FAILURE); } } if (size >= 0) end = start + size; @@ -149,7 +183,6 @@ int main(int argc, char **argv) return 1; } if (compress == 1) { - struct stat sbuf; int f_src = fileno(stdin); char out_mode[3] = "w\0"; char out_mode_exclusive[4] = "wx\0"; @@ -165,12 +198,6 @@ int main(int argc, char **argv) if ( argc>optind ) { - if ( stat(argv[optind],&sbuf)<0 ) - { - fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); - return 1; - } - if ((f_src = open(argv[optind], O_RDONLY)) < 0) { fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); return 1; @@ -195,7 +222,7 @@ int main(int argc, char **argv) } } else if (!pstdout && isatty(fileno((FILE *)stdout)) ) - return bgzip_main_usage(); + return bgzip_main_usage(stderr, EXIT_FAILURE); else if ( index && !index_fname ) { fprintf(stderr, "[bgzip] Index file name expected when writing to stdout\n"); @@ -216,10 +243,10 @@ int main(int argc, char **argv) return 1; } + if ( index ) bgzf_index_build_init(fp); if (threads > 1) bgzf_mt(fp, threads, 256); - if ( index ) bgzf_index_build_init(fp); buffer = malloc(WINDOW_SIZE); #ifdef _WIN32 _setmode(f_src, O_BINARY); @@ -284,26 +311,18 @@ int main(int argc, char **argv) } else { - struct stat sbuf; int f_dst; if ( argc>optind ) { - if ( stat(argv[optind],&sbuf)<0 ) - { - fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); - return 1; - } - char *name; - int len = strlen(argv[optind]); - if ( strcmp(argv[optind]+len-3,".gz") && !test) - { - fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]); - return 1; - } fp = bgzf_open(argv[optind], "r"); if (fp == NULL) { - fprintf(stderr, "[bgzip] Could not 
open file: %s\n", argv[optind]); + fprintf(stderr, "[bgzip] Could not open %s: %s\n", argv[optind], strerror(errno)); + return 1; + } + if (bgzf_compression(fp) == no_compression) { + fprintf(stderr, "[bgzip] %s: not a compressed file -- ignored\n", argv[optind]); + bgzf_close(fp); return 1; } @@ -312,8 +331,24 @@ int main(int argc, char **argv) } else { const int wrflags = O_WRONLY | O_CREAT | O_TRUNC; + char *name = argv[optind], *ext; + size_t pos; + for (pos = strlen(name); pos > 0; --pos) + if (name[pos] == '.' || name[pos] == '/') break; + if (pos == 0 || name[pos] != '.') { + fprintf(stderr, "[bgzip] can't remove an extension from %s -- please rename\n", argv[optind]); + bgzf_close(fp); + return 1; + } name = strdup(argv[optind]); - name[strlen(name) - 3] = '\0'; + name[pos] = '\0'; + ext = &name[pos+1]; + if (! (known_extension(ext) || confirm_filename(&is_forced, name, ext))) { + fprintf(stderr, "[bgzip] unknown extension .%s -- declining to decompress to %s\n", ext, name); + bgzf_close(fp); + free(name); + return 1; + } f_dst = open(name, is_forced? 
wrflags : wrflags|O_EXCL, 0666); if (f_dst < 0 && errno == EEXIST && confirm_overwrite(name)) f_dst = open(name, wrflags, 0666); @@ -326,7 +361,7 @@ int main(int argc, char **argv) } } else if (!pstdout && isatty(fileno((FILE *)stdin)) ) - return bgzip_main_usage(); + return bgzip_main_usage(stderr, EXIT_FAILURE); else { f_dst = fileno(stdout); @@ -335,22 +370,33 @@ int main(int argc, char **argv) fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno)); return 1; } + if (bgzf_compression(fp) == no_compression) { + fprintf(stderr, "[bgzip] stdin is not compressed -- ignored\n"); + bgzf_close(fp); + return 1; + } } - if (!fp->is_compressed) { - fprintf(stderr, "[bgzip] Expected compressed file -- ignored\n"); - return 1; - } - - if (threads > 1) - bgzf_mt(fp, threads, 256); - buffer = malloc(WINDOW_SIZE); if ( start>0 ) { - if ( bgzf_index_load(fp, argv[optind], ".gzi") < 0 ) error("Could not load index: %s.gzi\n", argv[optind]); + if (index_fname) { + if ( bgzf_index_load(fp, index_fname, NULL) < 0 ) + error("Could not load index: %s\n", index_fname); + } else { + if (optind >= argc) { + error("The -b option requires -I when reading from stdin " + "(and stdin must be seekable)\n"); + } + if ( bgzf_index_load(fp, argv[optind], ".gzi") < 0 ) + error("Could not load index: %s.gzi\n", argv[optind]); + } if ( bgzf_useek(fp, start, SEEK_SET) < 0 ) error("Could not seek to %d-th (uncompressd) byte\n", start); } + + if (threads > 1) + bgzf_mt(fp, threads, 256); + #ifdef _WIN32 _setmode(f_dst, O_BINARY); #endif @@ -370,7 +416,7 @@ int main(int argc, char **argv) } free(buffer); if (bgzf_close(fp) < 0) error("Close failed: Error %d\n",fp->errcode); - if (!pstdout && !test) unlink(argv[optind]); + if (argc > optind && !pstdout && !test) unlink(argv[optind]); return 0; } } diff --git a/config.mk.in b/config.mk.in index 55da9c019..976e557df 100644 --- a/config.mk.in +++ b/config.mk.in @@ -1,6 +1,6 @@ # Optional configure Makefile overrides for htslib. 
# -# Copyright (C) 2015-2017 Genome Research Ltd. +# Copyright (C) 2015-2017, 2019 Genome Research Ltd. # # Author: John Marshall # @@ -48,6 +48,10 @@ LIBS = @LIBS@ PLATFORM = @PLATFORM@ PLUGIN_EXT = @PLUGIN_EXT@ +# The default Makefile enables some of the optional files, but we blank +# them so they can be controlled by configure instead. +NONCONFIGURE_OBJS = + # Lowercase here indicates these are "local" to config.mk plugin_OBJS = noplugin_LDFLAGS = @@ -74,10 +78,12 @@ endif ifeq "s3-@s3@" "s3-enabled" plugin_OBJS += hfile_s3.o +plugin_OBJS += hfile_s3_write.o CRYPTO_LIBS = @CRYPTO_LIBS@ noplugin_LIBS += $(CRYPTO_LIBS) hfile_s3$(PLUGIN_EXT): LIBS += $(CRYPTO_LIBS) +hfile_s3_write$(PLUGIN_EXT): LIBS += $(CRYPTO_LIBS) $(LIBCURL_LIBS) endif ifeq "plugins-@enable_plugins@" "plugins-yes" @@ -94,6 +100,7 @@ plugin.o plugin.pico: CPPFLAGS += -DPLUGINPATH=\"$(pluginpath)\" hfile_gcs.o hfile_gcs.pico: version.h hfile_libcurl.o hfile_libcurl.pico: version.h hfile_s3.o hfile_s3.pico: version.h +hfile_s3_write.o hfile_s3_write.pico: version.h # Windows DLL plugins depend on the import library, built as a byproduct. $(plugin_OBJS:.o=.cygdll): cyghts-$(LIBHTS_SOVERSION).dll diff --git a/configure.ac b/configure.ac index d53bf9935..b17526c7a 100644 --- a/configure.ac +++ b/configure.ac @@ -30,6 +30,7 @@ AC_CONFIG_SRCDIR(hts.c) AC_CONFIG_HEADERS(config.h) m4_include([m4/hts_prog_cc_warnings.m4]) +m4_include([m4/hts_hide_dynamic_syms.m4]) dnl Copyright notice to be copied into the generated configure script AC_COPYRIGHT([Portions copyright (C) 2018 Genome Research Ltd. @@ -89,7 +90,6 @@ AC_ARG_ENABLE([gcs], [], [enable_gcs=check]) AC_SYS_LARGEFILE -AC_FUNC_FSEEKO AC_ARG_ENABLE([libcurl], [AS_HELP_STRING([--enable-libcurl], @@ -167,6 +167,10 @@ esac AC_MSG_RESULT([$host_result]) AC_SUBST([PLATFORM]) +dnl Try to get more control over which symbols are exported in the shared +dnl library. 
+HTS_HIDE_DYNAMIC_SYMBOLS + dnl FIXME This pulls in dozens of standard header checks AC_FUNC_MMAP AC_CHECK_FUNCS([gmtime_r fsync drand48]) @@ -180,9 +184,12 @@ if test $enable_plugins != no; then Plugin support requires dynamic linking facilities from the operating system. Either configure with --disable-plugins or resolve this error to build HTSlib.])]) + # Check if the compiler understands -rdynamic # TODO Test whether this is required and/or needs tweaking per-platform - LDFLAGS="$LDFLAGS -rdynamic" - static_LDFLAGS="$static_LDFLAGS -rdynamic" + HTS_TEST_CC_C_LD_FLAG([-rdynamic],[rdynamic_flag]) + AS_IF([test x"$rdynamic_flag" != "xno"], + [LDFLAGS="$LDFLAGS $rdynamic_flag" + static_LDFLAGS="$static_LDFLAGS $rdynamic_flag"]) case "$ac_cv_search_dlopen" in -l*) static_LIBS="$static_LIBS $ac_cv_search_dlopen" ;; esac @@ -275,9 +282,10 @@ fi AS_IF([test "x$with_libdeflate" != "xno"], [libdeflate=ok AC_CHECK_HEADER([libdeflate.h],[],[libdeflate='missing header'],[;]) - AC_CHECK_LIB([deflate], [libdeflate_deflate_compress],[],[libdeflate='missing library']) + AC_CHECK_LIB([deflate], [libdeflate_deflate_compress],[:],[libdeflate='missing library']) AS_IF([test "$libdeflate" = "ok"], [AC_DEFINE([HAVE_LIBDEFLATE], 1, [Define if libdeflate is available.]) + LIBS="-ldeflate $LIBS" private_LIBS="$private_LIBS -ldeflate" static_LIBS="$static_LIBS -ldeflate"], [AS_IF([test "x$with_libdeflate" != "xcheck"], diff --git a/cram/cram.h b/cram/cram.h index 5b7057e46..42aa13595 100644 --- a/cram/cram.h +++ b/cram/cram.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2013 Genome Research Ltd. +Copyright (c) 2012-2013, 2015, 2018 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -39,11 +39,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * although these should not be included directly (use this file instead). 
*/ -#ifndef _CRAM_H_ -#define _CRAM_H_ +#ifndef CRAM_ALL_H +#define CRAM_ALL_H #include "cram/cram_samtools.h" -#include "cram/sam_header.h" +#include "header.h" #include "cram_structs.h" #include "cram_io.h" #include "cram_encode.h" diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index 2dcc5a876..cfc0f883b 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2013 Genome Research Ltd. +Copyright (c) 2012-2019 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -33,6 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * {codec,type} tuples. */ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include @@ -318,7 +319,7 @@ int cram_external_decode_int(cram_slice *slice, cram_codec *c, cram_block *b; /* Find the external block */ - b = cram_get_block_by_id(slice, c->external.content_id); + b = cram_get_block_by_id(slice, c->u.external.content_id); if (!b) return *out_size?-1:0; @@ -331,6 +332,26 @@ int cram_external_decode_int(cram_slice *slice, cram_codec *c, return l > 0 ? 0 : -1; } +int cram_external_decode_long(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + int64_t l; + char *cp; + cram_block *b; + + /* Find the external block */ + b = cram_get_block_by_id(slice, c->u.external.content_id); + if (!b) + return *out_size?-1:0; + + cp = (char *)b->data + b->idx; + // E_INT and E_LONG are guaranteed single item queries + l = safe_ltf8_get(cp, (char *)b->data + b->uncomp_size, (int64_t *)out); + b->idx += l; + *out_size = 1; + + return l > 0 ? 
0 : -1; +} + int cram_external_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { @@ -338,7 +359,7 @@ int cram_external_decode_char(cram_slice *slice, cram_codec *c, cram_block *b; /* Find the external block */ - b = cram_get_block_by_id(slice, c->external.content_id); + b = cram_get_block_by_id(slice, c->u.external.content_id); if (!b) return *out_size?-1:0; @@ -359,7 +380,7 @@ static int cram_external_decode_block(cram_slice *slice, cram_codec *c, cram_block *b = NULL; /* Find the external block */ - b = cram_get_block_by_id(slice, c->external.content_id); + b = cram_get_block_by_id(slice, c->u.external.content_id); if (!b) return *out_size?-1:0; @@ -369,6 +390,9 @@ static int cram_external_decode_block(cram_slice *slice, cram_codec *c, BLOCK_APPEND(out, cp, *out_size); return 0; + + block_err: + return -1; } void cram_external_decode_free(cram_codec *c) { @@ -389,20 +413,22 @@ cram_codec *cram_external_decode_init(char *data, int size, return NULL; c->codec = E_EXTERNAL; - if (option == E_INT || option == E_LONG) + if (option == E_INT) c->decode = cram_external_decode_int; + else if (option == E_LONG) + c->decode = cram_external_decode_long; else if (option == E_BYTE_ARRAY || option == E_BYTE) c->decode = cram_external_decode_char; else c->decode = cram_external_decode_block; c->free = cram_external_decode_free; - cp += safe_itf8_get(cp, data + size, &c->external.content_id); + cp += safe_itf8_get(cp, data + size, &c->u.external.content_id); if (cp - data != size) goto malformed; - c->external.type = option; + c->u.external.type = option; return c; @@ -416,7 +442,14 @@ int cram_external_encode_int(cram_slice *slice, cram_codec *c, char *in, int in_size) { uint32_t *i32 = (uint32_t *)in; - itf8_put_blk(c->out, *i32); + return itf8_put_blk(c->out, *i32) >= 0 ? 
0 : -1; +} + +int cram_external_encode_long(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + uint64_t *i64 = (uint64_t *)in; + + ltf8_put_blk(c->out, *i64); return 0; } @@ -424,6 +457,9 @@ int cram_external_encode_char(cram_slice *slice, cram_codec *c, char *in, int in_size) { BLOCK_APPEND(c->out, in, in_size); return 0; + + block_err: + return -1; } void cram_external_encode_free(cram_codec *c) { @@ -435,7 +471,7 @@ void cram_external_encode_free(cram_codec *c) { int cram_external_encode_store(cram_codec *c, cram_block *b, char *prefix, int version) { char tmp[99], *tp = tmp; - int len = 0; + int len = 0, r = 0, n; if (prefix) { size_t l = strlen(prefix); @@ -443,13 +479,17 @@ int cram_external_encode_store(cram_codec *c, cram_block *b, char *prefix, len += l; } - tp += itf8_put(tp, c->e_external.content_id); - len += itf8_put_blk(b, c->codec); - len += itf8_put_blk(b, tp-tmp); + tp += itf8_put(tp, c->u.e_external.content_id); + len += (n = itf8_put_blk(b, c->codec)); r |= n; + len += (n = itf8_put_blk(b, tp-tmp)); r |= n; BLOCK_APPEND(b, tmp, tp-tmp); len += tp-tmp; - return len; + if (r > 0) + return len; + + block_err: + return -1; } cram_codec *cram_external_encode_init(cram_stats *st, @@ -463,15 +503,17 @@ cram_codec *cram_external_encode_init(cram_stats *st, return NULL; c->codec = E_EXTERNAL; c->free = cram_external_encode_free; - if (option == E_INT || option == E_LONG) + if (option == E_INT) c->encode = cram_external_encode_int; + else if (option == E_LONG) + c->encode = cram_external_encode_long; else if (option == E_BYTE_ARRAY || option == E_BYTE) c->encode = cram_external_encode_char; else abort(); c->store = cram_external_encode_store; - c->e_external.content_id = (size_t)dat; + c->u.e_external.content_id = (size_t)dat; return c; } @@ -480,19 +522,37 @@ cram_codec *cram_external_encode_init(cram_stats *st, * --------------------------------------------------------------------------- * BETA */ +int cram_beta_decode_long(cram_slice *slice, 
cram_codec *c, cram_block *in, char *out, int *out_size) { + int64_t *out_i = (int64_t *)out; + int i, n = *out_size; + + if (c->u.beta.nbits) { + if (cram_not_enough_bits(in, c->u.beta.nbits * n)) + return -1; + + for (i = 0; i < n; i++) + out_i[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset; + } else { + for (i = 0; i < n; i++) + out_i[i] = -c->u.beta.offset; + } + + return 0; +} + int cram_beta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { int32_t *out_i = (int32_t *)out; int i, n = *out_size; - if (c->beta.nbits) { - if (cram_not_enough_bits(in, c->beta.nbits * n)) + if (c->u.beta.nbits) { + if (cram_not_enough_bits(in, c->u.beta.nbits * n)) return -1; for (i = 0; i < n; i++) - out_i[i] = get_bits_MSB(in, c->beta.nbits) - c->beta.offset; + out_i[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset; } else { for (i = 0; i < n; i++) - out_i[i] = -c->beta.offset; + out_i[i] = -c->u.beta.offset; } return 0; @@ -502,20 +562,20 @@ int cram_beta_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char int i, n = *out_size; - if (c->beta.nbits) { - if (cram_not_enough_bits(in, c->beta.nbits * n)) + if (c->u.beta.nbits) { + if (cram_not_enough_bits(in, c->u.beta.nbits * n)) return -1; if (out) for (i = 0; i < n; i++) - out[i] = get_bits_MSB(in, c->beta.nbits) - c->beta.offset; + out[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset; else for (i = 0; i < n; i++) - get_bits_MSB(in, c->beta.nbits); + get_bits_MSB(in, c->u.beta.nbits); } else { if (out) for (i = 0; i < n; i++) - out[i] = -c->beta.offset; + out[i] = -c->u.beta.offset; } return 0; @@ -536,23 +596,26 @@ cram_codec *cram_beta_decode_init(char *data, int size, return NULL; c->codec = E_BETA; - if (option == E_INT || option == E_LONG) + if (option == E_INT) c->decode = cram_beta_decode_int; + else if (option == E_LONG) + c->decode = cram_beta_decode_long; else if (option == E_BYTE_ARRAY || option == E_BYTE) c->decode = 
cram_beta_decode_char; else { hts_log_error("BYTE_ARRAYs not supported by this codec"); + free(c); return NULL; } c->free = cram_beta_decode_free; - c->beta.nbits = -1; - cp += safe_itf8_get(cp, data + size, &c->beta.offset); + c->u.beta.nbits = -1; + cp += safe_itf8_get(cp, data + size, &c->u.beta.offset); if (cp < data + size) // Ensure test below works - cp += safe_itf8_get(cp, data + size, &c->beta.nbits); + cp += safe_itf8_get(cp, data + size, &c->u.beta.nbits); if (cp - data != size - || c->beta.nbits < 0 || c->beta.nbits > 8 * sizeof(int)) { + || c->u.beta.nbits < 0 || c->u.beta.nbits > 8 * sizeof(int)) { hts_log_error("Malformed beta header stream"); free(c); return NULL; @@ -563,7 +626,7 @@ cram_codec *cram_beta_decode_init(char *data, int size, int cram_beta_encode_store(cram_codec *c, cram_block *b, char *prefix, int version) { - int len = 0; + int len = 0, r = 0, n; if (prefix) { size_t l = strlen(prefix); @@ -571,13 +634,29 @@ int cram_beta_encode_store(cram_codec *c, cram_block *b, len += l; } - len += itf8_put_blk(b, c->codec); - len += itf8_put_blk(b, itf8_size(c->e_beta.offset) - + itf8_size(c->e_beta.nbits)); // codec length - len += itf8_put_blk(b, c->e_beta.offset); - len += itf8_put_blk(b, c->e_beta.nbits); + len += (n = itf8_put_blk(b, c->codec)); r |= n; + len += (n = itf8_put_blk(b, itf8_size(c->u.e_beta.offset) + + itf8_size(c->u.e_beta.nbits))); // codec length + r |= n; + len += (n = itf8_put_blk(b, c->u.e_beta.offset)); r |= n; + len += (n = itf8_put_blk(b, c->u.e_beta.nbits)); r |= n; - return len; + if (r > 0) return len; + + block_err: + return -1; +} + +int cram_beta_encode_long(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + int64_t *syms = (int64_t *)in; + int i, r = 0; + + for (i = 0; i < in_size; i++) + r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset, + c->u.e_beta.nbits); + + return r; } int cram_beta_encode_int(cram_slice *slice, cram_codec *c, @@ -586,8 +665,8 @@ int cram_beta_encode_int(cram_slice 
*slice, cram_codec *c, int i, r = 0; for (i = 0; i < in_size; i++) - r |= store_bits_MSB(c->out, syms[i] + c->e_beta.offset, - c->e_beta.nbits); + r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset, + c->u.e_beta.nbits); return r; } @@ -598,8 +677,8 @@ int cram_beta_encode_char(cram_slice *slice, cram_codec *c, int i, r = 0; for (i = 0; i < in_size; i++) - r |= store_bits_MSB(c->out, syms[i] + c->e_beta.offset, - c->e_beta.nbits); + r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset, + c->u.e_beta.nbits); return r; } @@ -623,6 +702,8 @@ cram_codec *cram_beta_encode_init(cram_stats *st, c->free = cram_beta_encode_free; if (option == E_INT) c->encode = cram_beta_encode_int; + else if (option == E_LONG) + c->encode = cram_beta_encode_long; else c->encode = cram_beta_encode_char; c->store = cram_beta_encode_store; @@ -658,13 +739,13 @@ cram_codec *cram_beta_encode_init(cram_stats *st, } assert(max_val >= min_val); - c->e_beta.offset = -min_val; + c->u.e_beta.offset = -min_val; range = (int64_t) max_val - min_val; while (range) { len++; range >>= 1; } - c->e_beta.nbits = len; + c->u.e_beta.nbits = len; return c; } @@ -676,7 +757,7 @@ cram_codec *cram_beta_encode_init(cram_stats *st, int cram_subexp_decode(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { int32_t *out_i = (int32_t *)out; int n, count; - int k = c->subexp.k; + int k = c->u.subexp.k; for (count = 0, n = *out_size; count < n; count++) { int i = 0, tail; @@ -711,7 +792,7 @@ int cram_subexp_decode(cram_slice *slice, cram_codec *c, cram_block *in, char *o } } - out_i[count] = val - c->subexp.offset; + out_i[count] = val - c->u.subexp.offset; } return 0; @@ -739,12 +820,12 @@ cram_codec *cram_subexp_decode_init(char *data, int size, c->codec = E_SUBEXP; c->decode = cram_subexp_decode; c->free = cram_subexp_decode_free; - c->subexp.k = -1; + c->u.subexp.k = -1; - cp += safe_itf8_get(cp, data + size, &c->subexp.offset); - cp += safe_itf8_get(cp, data + size, &c->subexp.k); 
+ cp += safe_itf8_get(cp, data + size, &c->u.subexp.offset); + cp += safe_itf8_get(cp, data + size, &c->u.subexp.k); - if (cp - data != size || c->subexp.k < 0) { + if (cp - data != size || c->u.subexp.k < 0) { hts_log_error("Malformed subexp header stream"); free(c); return NULL; @@ -775,7 +856,7 @@ int cram_gamma_decode(cram_slice *slice, cram_codec *c, cram_block *in, char *ou nz--; } - out_i[i] = val - c->gamma.offset; + out_i[i] = val - c->u.gamma.offset; } return 0; @@ -807,7 +888,7 @@ cram_codec *cram_gamma_decode_init(char *data, int size, c->decode = cram_gamma_decode; c->free = cram_gamma_decode_free; - cp += safe_itf8_get(cp, data + size, &c->gamma.offset); + cp += safe_itf8_get(cp, data + size, &c->u.gamma.offset); if (cp - data != size) goto malformed; @@ -839,8 +920,8 @@ void cram_huffman_decode_free(cram_codec *c) { if (!c) return; - if (c->huffman.codes) - free(c->huffman.codes); + if (c->u.huffman.codes) + free(c->u.huffman.codes); free(c); } @@ -858,15 +939,15 @@ int cram_huffman_decode_char0(cram_slice *slice, cram_codec *c, /* Special case of 0 length codes */ for (i = 0, n = *out_size; i < n; i++) { - out[i] = c->huffman.codes[0].symbol; + out[i] = c->u.huffman.codes[0].symbol; } return 0; } int cram_huffman_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { - int i, n, ncodes = c->huffman.ncodes; - const cram_huffman_code * const codes = c->huffman.codes; + int i, n, ncodes = c->u.huffman.ncodes; + const cram_huffman_code * const codes = c->u.huffman.codes; for (i = 0, n = *out_size; i < n; i++) { int idx = 0; @@ -902,7 +983,7 @@ int cram_huffman_decode_int0(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { int32_t *out_i = (int32_t *)out; int i, n; - const cram_huffman_code * const codes = c->huffman.codes; + const cram_huffman_code * const codes = c->u.huffman.codes; /* Special case of 0 length codes */ for (i = 0, n = *out_size; i < n; i++) { @@ -914,8 +995,58 @@ int 
cram_huffman_decode_int0(cram_slice *slice, cram_codec *c, int cram_huffman_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { int32_t *out_i = (int32_t *)out; - int i, n, ncodes = c->huffman.ncodes; - const cram_huffman_code * const codes = c->huffman.codes; + int i, n, ncodes = c->u.huffman.ncodes; + const cram_huffman_code * const codes = c->u.huffman.codes; + + for (i = 0, n = *out_size; i < n; i++) { + int idx = 0; + int val = 0, len = 0, last_len = 0; + + // Now one bit at a time for remaining checks + for (;;) { + int dlen = codes[idx].len - last_len; + if (cram_not_enough_bits(in, dlen)) + return -1; + + //val <<= dlen; + //val |= get_bits_MSB(in, dlen); + //last_len = (len += dlen); + + last_len = (len += dlen); + for (; dlen; dlen--) GET_BIT_MSB(in, val); + + idx = val - codes[idx].p; + if (idx >= ncodes || idx < 0) + return -1; + + if (codes[idx].code == val && codes[idx].len == len) { + out_i[i] = codes[idx].symbol; + break; + } + } + } + + return 0; +} + +int cram_huffman_decode_long0(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + int64_t *out_i = (int64_t *)out; + int i, n; + const cram_huffman_code * const codes = c->u.huffman.codes; + + /* Special case of 0 length codes */ + for (i = 0, n = *out_size; i < n; i++) { + out_i[i] = codes[0].symbol; + } + return 0; +} + +int cram_huffman_decode_long(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + int64_t *out_i = (int64_t *)out; + int i, n, ncodes = c->u.huffman.ncodes; + const cram_huffman_code * const codes = c->u.huffman.codes; for (i = 0, n = *out_size; i < n; i++) { int idx = 0; @@ -985,20 +1116,28 @@ cram_codec *cram_huffman_decode_init(char *data, int size, h->codec = E_HUFFMAN; h->free = cram_huffman_decode_free; - h->huffman.ncodes = ncodes; + h->u.huffman.ncodes = ncodes; if (ncodes) { - codes = h->huffman.codes = malloc(ncodes * sizeof(*codes)); + codes = h->u.huffman.codes = 
malloc(ncodes * sizeof(*codes)); if (!codes) { free(h); return NULL; } } else { - codes = h->huffman.codes = NULL; + codes = h->u.huffman.codes = NULL; } /* Read symbols and bit-lengths */ - for (i = 0, l = 1; i < ncodes && l > 0; i++, cp += l) { - l = safe_itf8_get(cp, data_end, &codes[i].symbol); + if (option == E_LONG) { + for (i = 0, l = 1; i < ncodes && l > 0; i++, cp += l) { + l = safe_ltf8_get(cp, data_end, &codes[i].symbol); + } + } else { + for (i = 0, l = 1; i < ncodes && l > 0; i++, cp += l) { + int32_t i32; + l = safe_itf8_get(cp, data_end, &i32); + codes[i].symbol = i32; + } } if (l < 1) @@ -1068,7 +1207,7 @@ cram_codec *cram_huffman_decode_init(char *data, int size, // puts("==HUFF LEN=="); // for (i = 0; i <= last_len+1; i++) { - // printf("len %d=%d prefix %d\n", i, h->huffman.lengths[i], h->huffman.prefix[i]); + // printf("len %d=%d prefix %d\n", i, h->u.huffman.lengths[i], h->u.huffman.prefix[i]); // } // puts("===HUFFMAN CODES==="); // for (i = 0; i < ncodes; i++) { @@ -1082,17 +1221,22 @@ cram_codec *cram_huffman_decode_init(char *data, int size, // } if (option == E_BYTE || option == E_BYTE_ARRAY) { - if (h->huffman.codes[0].len == 0) + if (h->u.huffman.codes[0].len == 0) h->decode = cram_huffman_decode_char0; else h->decode = cram_huffman_decode_char; - } else if (option == E_BYTE_ARRAY_BLOCK) { - abort(); - } else { - if (h->huffman.codes[0].len == 0) + } else if (option == E_LONG) { + if (h->u.huffman.codes[0].len == 0) + h->decode = cram_huffman_decode_long0; + else + h->decode = cram_huffman_decode_long; + } else if (option == E_INT) { + if (h->u.huffman.codes[0].len == 0) h->decode = cram_huffman_decode_int0; else h->decode = cram_huffman_decode_int; + } else { + return NULL; } return (cram_codec *)h; @@ -1117,21 +1261,21 @@ int cram_huffman_encode_char(cram_slice *slice, cram_codec *c, while (in_size--) { int sym = *syms++; if (sym >= -1 && sym < MAX_HUFF) { - i = c->e_huffman.val2code[sym+1]; - assert(c->e_huffman.codes[i].symbol == 
sym); - code = c->e_huffman.codes[i].code; - len = c->e_huffman.codes[i].len; + i = c->u.e_huffman.val2code[sym+1]; + assert(c->u.e_huffman.codes[i].symbol == sym); + code = c->u.e_huffman.codes[i].code; + len = c->u.e_huffman.codes[i].len; } else { /* Slow - use a lookup table for when sym < MAX_HUFF? */ - for (i = 0; i < c->e_huffman.nvals; i++) { - if (c->e_huffman.codes[i].symbol == sym) + for (i = 0; i < c->u.e_huffman.nvals; i++) { + if (c->u.e_huffman.codes[i].symbol == sym) break; } - if (i == c->e_huffman.nvals) + if (i == c->u.e_huffman.nvals) return -1; - code = c->e_huffman.codes[i].code; - len = c->e_huffman.codes[i].len; + code = c->u.e_huffman.codes[i].code; + len = c->u.e_huffman.codes[i].len; } r |= store_bits_MSB(c->out, code, len); @@ -1154,21 +1298,58 @@ int cram_huffman_encode_int(cram_slice *slice, cram_codec *c, int sym = *syms++; if (sym >= -1 && sym < MAX_HUFF) { - i = c->e_huffman.val2code[sym+1]; - assert(c->e_huffman.codes[i].symbol == sym); - code = c->e_huffman.codes[i].code; - len = c->e_huffman.codes[i].len; + i = c->u.e_huffman.val2code[sym+1]; + assert(c->u.e_huffman.codes[i].symbol == sym); + code = c->u.e_huffman.codes[i].code; + len = c->u.e_huffman.codes[i].len; + } else { + /* Slow - use a lookup table for when sym < MAX_HUFFMAN_SYM? 
*/ + for (i = 0; i < c->u.e_huffman.nvals; i++) { + if (c->u.e_huffman.codes[i].symbol == sym) + break; + } + if (i == c->u.e_huffman.nvals) + return -1; + + code = c->u.e_huffman.codes[i].code; + len = c->u.e_huffman.codes[i].len; + } + + r |= store_bits_MSB(c->out, code, len); + } + + return r; +} + +int cram_huffman_encode_long0(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + return 0; +} + +int cram_huffman_encode_long(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + int i, code, len, r = 0; + int64_t *syms = (int64_t *)in; + + while (in_size--) { + int sym = *syms++; + + if (sym >= -1 && sym < MAX_HUFF) { + i = c->u.e_huffman.val2code[sym+1]; + assert(c->u.e_huffman.codes[i].symbol == sym); + code = c->u.e_huffman.codes[i].code; + len = c->u.e_huffman.codes[i].len; } else { /* Slow - use a lookup table for when sym < MAX_HUFFMAN_SYM? */ - for (i = 0; i < c->e_huffman.nvals; i++) { - if (c->e_huffman.codes[i].symbol == sym) + for (i = 0; i < c->u.e_huffman.nvals; i++) { + if (c->u.e_huffman.codes[i].symbol == sym) break; } - if (i == c->e_huffman.nvals) + if (i == c->u.e_huffman.nvals) return -1; - code = c->e_huffman.codes[i].code; - len = c->e_huffman.codes[i].len; + code = c->u.e_huffman.codes[i].code; + len = c->u.e_huffman.codes[i].len; } r |= store_bits_MSB(c->out, code, len); @@ -1181,8 +1362,8 @@ void cram_huffman_encode_free(cram_codec *c) { if (!c) return; - if (c->e_huffman.codes) - free(c->e_huffman.codes); + if (c->u.e_huffman.codes) + free(c->u.e_huffman.codes); free(c); } @@ -1192,8 +1373,8 @@ void cram_huffman_encode_free(cram_codec *c) { */ int cram_huffman_encode_store(cram_codec *c, cram_block *b, char *prefix, int version) { - int i, len = 0; - cram_huffman_code *codes = c->e_huffman.codes; + int i, len = 0, r = 0, n; + cram_huffman_code *codes = c->u.e_huffman.codes; /* * Up to code length 127 means 2.5e+26 bytes of data required (worst * case huffman tree needs symbols with freqs matching the Fibonacci @@ 
-1203,7 +1384,7 @@ int cram_huffman_encode_store(cram_codec *c, cram_block *b, char *prefix, * * Therefore 6*ncodes + 5 + 5 + 1 + 5 is max memory */ - char *tmp = malloc(6*c->e_huffman.nvals+16); + char *tmp = malloc(6*c->u.e_huffman.nvals+16); char *tp = tmp; if (!tmp) @@ -1215,31 +1396,42 @@ int cram_huffman_encode_store(cram_codec *c, cram_block *b, char *prefix, len += l; } - tp += itf8_put(tp, c->e_huffman.nvals); - for (i = 0; i < c->e_huffman.nvals; i++) { - tp += itf8_put(tp, codes[i].symbol); + tp += itf8_put(tp, c->u.e_huffman.nvals); + if (c->u.e_huffman.option == E_LONG) { + for (i = 0; i < c->u.e_huffman.nvals; i++) { + tp += ltf8_put(tp, codes[i].symbol); + } + } else { + for (i = 0; i < c->u.e_huffman.nvals; i++) { + tp += itf8_put(tp, codes[i].symbol); + } } - tp += itf8_put(tp, c->e_huffman.nvals); - for (i = 0; i < c->e_huffman.nvals; i++) { + tp += itf8_put(tp, c->u.e_huffman.nvals); + for (i = 0; i < c->u.e_huffman.nvals; i++) { tp += itf8_put(tp, codes[i].len); } - len += itf8_put_blk(b, c->codec); - len += itf8_put_blk(b, tp-tmp); + len += (n = itf8_put_blk(b, c->codec)); r |= n; + len += (n = itf8_put_blk(b, tp-tmp)); r |= n; BLOCK_APPEND(b, tmp, tp-tmp); len += tp-tmp; free(tmp); - return len; + if (r > 0) + return len; + + block_err: + return -1; } cram_codec *cram_huffman_encode_init(cram_stats *st, enum cram_external_type option, void *dat, int version) { - int *vals = NULL, *freqs = NULL, vals_alloc = 0, *lens, code, len; + int *vals = NULL, *freqs = NULL, vals_alloc = 0, *lens = NULL, code, len; + int *new_vals, *new_freqs; int nvals, i, ntot = 0, max_val = 0, min_val = INT_MAX, k; cram_codec *c; cram_huffman_code *codes; @@ -1255,14 +1447,12 @@ cram_codec *cram_huffman_encode_init(cram_stats *st, continue; if (nvals >= vals_alloc) { vals_alloc = vals_alloc ? 
vals_alloc*2 : 1024; - vals = realloc(vals, vals_alloc * sizeof(int)); - freqs = realloc(freqs, vals_alloc * sizeof(int)); - if (!vals || !freqs) { - if (vals) free(vals); - if (freqs) free(freqs); - free(c); - return NULL; - } + new_vals = realloc(vals, vals_alloc * sizeof(int)); + if (!new_vals) goto nomem; + vals = new_vals; + new_freqs = realloc(freqs, vals_alloc * sizeof(int)); + if (!new_freqs) goto nomem; + freqs = new_freqs; } vals[nvals] = i; freqs[nvals] = st->freqs[i]; @@ -1280,10 +1470,12 @@ cram_codec *cram_huffman_encode_init(cram_stats *st, continue; if (nvals >= vals_alloc) { vals_alloc = vals_alloc ? vals_alloc*2 : 1024; - vals = realloc(vals, vals_alloc * sizeof(int)); - freqs = realloc(freqs, vals_alloc * sizeof(int)); - if (!vals || !freqs) - return NULL; + new_vals = realloc(vals, vals_alloc * sizeof(int)); + if (!new_vals) goto nomem; + vals = new_vals; + new_freqs = realloc(freqs, vals_alloc * sizeof(int)); + if (!new_freqs) goto nomem; + freqs = new_freqs; } vals[nvals]= kh_key(st->h, k); freqs[nvals] = kh_val(st->h, k); @@ -1297,10 +1489,11 @@ cram_codec *cram_huffman_encode_init(cram_stats *st, assert(nvals > 0); - freqs = realloc(freqs, 2*nvals*sizeof(*freqs)); + new_freqs = realloc(freqs, 2*nvals*sizeof(*freqs)); + if (!new_freqs) goto nomem; + freqs = new_freqs; lens = calloc(2*nvals, sizeof(*lens)); - if (!lens || !freqs) - return NULL; + if (!lens) goto nomem; /* Inefficient, use pointers to form chain so we can insert and maintain * a sorted list? This is currently O(nvals^2) complexity. 
@@ -1341,7 +1534,7 @@ cram_codec *cram_huffman_encode_init(cram_stats *st, /* Sort, need in a struct */ if (!(codes = malloc(nvals * sizeof(*codes)))) - return NULL; + goto nomem; for (i = 0; i < nvals; i++) { codes[i].symbol = vals[i]; codes[i].len = lens[i]; @@ -1377,7 +1570,7 @@ cram_codec *cram_huffman_encode_init(cram_stats *st, codes[i].code = code++; if (codes[i].symbol >= -1 && codes[i].symbol < MAX_HUFF) - c->e_huffman.val2code[codes[i].symbol+1] = i; + c->u.e_huffman.val2code[codes[i].symbol+1] = i; //fprintf(stderr, "sym %d, code %d, len %d\n", // codes[i].symbol, codes[i].code, codes[i].len); @@ -1387,24 +1580,38 @@ cram_codec *cram_huffman_encode_init(cram_stats *st, free(vals); free(freqs); - c->e_huffman.codes = codes; - c->e_huffman.nvals = nvals; + c->u.e_huffman.codes = codes; + c->u.e_huffman.nvals = nvals; + c->u.e_huffman.option = option; c->free = cram_huffman_encode_free; if (option == E_BYTE || option == E_BYTE_ARRAY) { - if (c->e_huffman.codes[0].len == 0) + if (c->u.e_huffman.codes[0].len == 0) c->encode = cram_huffman_encode_char0; else c->encode = cram_huffman_encode_char; - } else { - if (c->e_huffman.codes[0].len == 0) + } else if (option == E_INT) { + if (c->u.e_huffman.codes[0].len == 0) c->encode = cram_huffman_encode_int0; else c->encode = cram_huffman_encode_int; + } else if (option == E_LONG) { + if (c->u.e_huffman.codes[0].len == 0) + c->encode = cram_huffman_encode_long0; + else + c->encode = cram_huffman_encode_long; } c->store = cram_huffman_encode_store; return c; + + nomem: + hts_log_error("Out of memory"); + free(vals); + free(freqs); + free(lens); + free(c); + return NULL; } /* @@ -1418,14 +1625,14 @@ int cram_byte_array_len_decode(cram_slice *slice, cram_codec *c, int32_t len = 0, one = 1; int r; - r = c->byte_array_len.len_codec->decode(slice, c->byte_array_len.len_codec, - in, (char *)&len, &one); + r = c->u.byte_array_len.len_codec->decode(slice, c->u.byte_array_len.len_codec, + in, (char *)&len, &one); 
//printf("ByteArray Len=%d\n", len); - if (!r && c->byte_array_len.val_codec && len >= 0) { - r = c->byte_array_len.val_codec->decode(slice, - c->byte_array_len.val_codec, - in, out, &len); + if (!r && c->u.byte_array_len.val_codec && len >= 0) { + r = c->u.byte_array_len.val_codec->decode(slice, + c->u.byte_array_len.val_codec, + in, out, &len); } else { return -1; } @@ -1438,11 +1645,11 @@ int cram_byte_array_len_decode(cram_slice *slice, cram_codec *c, void cram_byte_array_len_decode_free(cram_codec *c) { if (!c) return; - if (c->byte_array_len.len_codec) - c->byte_array_len.len_codec->free(c->byte_array_len.len_codec); + if (c->u.byte_array_len.len_codec) + c->u.byte_array_len.len_codec->free(c->u.byte_array_len.len_codec); - if (c->byte_array_len.val_codec) - c->byte_array_len.val_codec->free(c->byte_array_len.val_codec); + if (c->u.byte_array_len.val_codec) + c->u.byte_array_len.val_codec->free(c->u.byte_array_len.val_codec); free(c); } @@ -1462,14 +1669,16 @@ cram_codec *cram_byte_array_len_decode_init(char *data, int size, c->codec = E_BYTE_ARRAY_LEN; c->decode = cram_byte_array_len_decode; c->free = cram_byte_array_len_decode_free; + c->u.byte_array_len.len_codec = NULL; + c->u.byte_array_len.val_codec = NULL; cp += safe_itf8_get(cp, endp, &encoding); cp += safe_itf8_get(cp, endp, &sub_size); if (sub_size < 0 || endp - cp < sub_size) goto malformed; - c->byte_array_len.len_codec = cram_decoder_init(encoding, cp, sub_size, - E_INT, version); - if (c->byte_array_len.len_codec == NULL) + c->u.byte_array_len.len_codec = cram_decoder_init(encoding, cp, sub_size, + E_INT, version); + if (c->u.byte_array_len.len_codec == NULL) goto no_codec; cp += sub_size; @@ -1478,9 +1687,9 @@ cram_codec *cram_byte_array_len_decode_init(char *data, int size, cp += safe_itf8_get(cp, endp, &sub_size); if (sub_size < 0 || endp - cp < sub_size) goto malformed; - c->byte_array_len.val_codec = cram_decoder_init(encoding, cp, sub_size, - option, version); - if 
(c->byte_array_len.val_codec == NULL) + c->u.byte_array_len.val_codec = cram_decoder_init(encoding, cp, sub_size, + option, version); + if (c->u.byte_array_len.val_codec == NULL) goto no_codec; cp += sub_size; @@ -1492,7 +1701,7 @@ cram_codec *cram_byte_array_len_decode_init(char *data, int size, malformed: hts_log_error("Malformed byte_array_len header stream"); no_codec: - free(c); + cram_byte_array_len_decode_free(c); return NULL; } @@ -1501,12 +1710,12 @@ int cram_byte_array_len_encode(cram_slice *slice, cram_codec *c, int32_t i32 = in_size; int r = 0; - r |= c->e_byte_array_len.len_codec->encode(slice, - c->e_byte_array_len.len_codec, - (char *)&i32, 1); - r |= c->e_byte_array_len.val_codec->encode(slice, - c->e_byte_array_len.val_codec, - in, in_size); + r |= c->u.e_byte_array_len.len_codec->encode(slice, + c->u.e_byte_array_len.len_codec, + (char *)&i32, 1); + r |= c->u.e_byte_array_len.val_codec->encode(slice, + c->u.e_byte_array_len.val_codec, + in, in_size); return r; } @@ -1514,20 +1723,20 @@ void cram_byte_array_len_encode_free(cram_codec *c) { if (!c) return; - if (c->e_byte_array_len.len_codec) - c->e_byte_array_len.len_codec->free(c->e_byte_array_len.len_codec); + if (c->u.e_byte_array_len.len_codec) + c->u.e_byte_array_len.len_codec->free(c->u.e_byte_array_len.len_codec); - if (c->e_byte_array_len.val_codec) - c->e_byte_array_len.val_codec->free(c->e_byte_array_len.val_codec); + if (c->u.e_byte_array_len.val_codec) + c->u.e_byte_array_len.val_codec->free(c->u.e_byte_array_len.val_codec); free(c); } int cram_byte_array_len_encode_store(cram_codec *c, cram_block *b, char *prefix, int version) { - int len = 0, len2, len3; + int len = 0, len2, len3, r = 0, n; cram_codec *tc; - cram_block *b_len, *b_val; + cram_block *b_len = NULL, *b_val = NULL; if (prefix) { size_t l = strlen(prefix); @@ -1535,23 +1744,33 @@ int cram_byte_array_len_encode_store(cram_codec *c, cram_block *b, len += l; } - tc = c->e_byte_array_len.len_codec; + tc = 
c->u.e_byte_array_len.len_codec; b_len = cram_new_block(0, 0); + if (!b_len) goto block_err; len2 = tc->store(tc, b_len, NULL, version); + if (len2 < 0) goto block_err; - tc = c->e_byte_array_len.val_codec; + tc = c->u.e_byte_array_len.val_codec; b_val = cram_new_block(0, 0); + if (!b_val) goto block_err; len3 = tc->store(tc, b_val, NULL, version); + if (len3 < 0) goto block_err; - len += itf8_put_blk(b, c->codec); - len += itf8_put_blk(b, len2+len3); + len += (n = itf8_put_blk(b, c->codec)); r |= n; + len += (n = itf8_put_blk(b, len2+len3)); r |= n; BLOCK_APPEND(b, BLOCK_DATA(b_len), BLOCK_SIZE(b_len)); BLOCK_APPEND(b, BLOCK_DATA(b_val), BLOCK_SIZE(b_val)); cram_free_block(b_len); cram_free_block(b_val); - return len + len2 + len3; + if (r > 0) + return len + len2 + len3; + + block_err: + if (b_len) cram_free_block(b_len); + if (b_val) cram_free_block(b_val); + return -1; } cram_codec *cram_byte_array_len_encode_init(cram_stats *st, @@ -1569,14 +1788,20 @@ cram_codec *cram_byte_array_len_encode_init(cram_stats *st, c->encode = cram_byte_array_len_encode; c->store = cram_byte_array_len_encode_store; - c->e_byte_array_len.len_codec = cram_encoder_init(e->len_encoding, - st, E_INT, - e->len_dat, - version); - c->e_byte_array_len.val_codec = cram_encoder_init(e->val_encoding, - NULL, E_BYTE_ARRAY, - e->val_dat, - version); + c->u.e_byte_array_len.len_codec = cram_encoder_init(e->len_encoding, + st, E_INT, + e->len_dat, + version); + c->u.e_byte_array_len.val_codec = cram_encoder_init(e->val_encoding, + NULL, E_BYTE_ARRAY, + e->val_dat, + version); + + if (!c->u.e_byte_array_len.len_codec || + !c->u.e_byte_array_len.val_codec) { + cram_byte_array_len_encode_free(c); + return NULL; + } return c; } @@ -1591,7 +1816,7 @@ static int cram_byte_array_stop_decode_char(cram_slice *slice, cram_codec *c, char *cp, ch; cram_block *b = NULL; - b = cram_get_block_by_id(slice, c->byte_array_stop.content_id); + b = cram_get_block_by_id(slice, c->u.byte_array_stop.content_id); if (!b) 
return *out_size?-1:0; @@ -1600,7 +1825,7 @@ static int cram_byte_array_stop_decode_char(cram_slice *slice, cram_codec *c, cp = (char *)b->data + b->idx; if (out) { - while ((ch = *cp) != (char)c->byte_array_stop.stop) { + while ((ch = *cp) != (char)c->u.byte_array_stop.stop) { if (cp - (char *)b->data >= b->uncomp_size) return -1; *out++ = ch; @@ -1608,7 +1833,7 @@ static int cram_byte_array_stop_decode_char(cram_slice *slice, cram_codec *c, } } else { // Consume input, but produce no output - while ((ch = *cp) != (char)c->byte_array_stop.stop) { + while ((ch = *cp) != (char)c->u.byte_array_stop.stop) { if (cp - (char *)b->data >= b->uncomp_size) return -1; cp++; @@ -1629,7 +1854,7 @@ int cram_byte_array_stop_decode_block(cram_slice *slice, cram_codec *c, char *cp, *out_cp, *cp_end; char stop; - b = cram_get_block_by_id(slice, c->byte_array_stop.content_id); + b = cram_get_block_by_id(slice, c->u.byte_array_stop.content_id); if (!b) return *out_size?-1:0; @@ -1639,7 +1864,7 @@ int cram_byte_array_stop_decode_block(cram_slice *slice, cram_codec *c, cp_end = (char *)b->data + b->uncomp_size; out_cp = (char *)BLOCK_END(out); - stop = c->byte_array_stop.stop; + stop = c->u.byte_array_stop.stop; if (cp_end - cp < out->alloc - out->byte) { while (cp != cp_end && *cp != stop) *out_cp++ = *cp++; @@ -1656,6 +1881,9 @@ int cram_byte_array_stop_decode_block(cram_slice *slice, cram_codec *c, b->idx = cp - (char *)b->data + 1; return 0; + + block_err: + return -1; } void cram_byte_array_stop_decode_free(cram_codec *c) { @@ -1691,14 +1919,14 @@ cram_codec *cram_byte_array_stop_decode_init(char *data, int size, } c->free = cram_byte_array_stop_decode_free; - c->byte_array_stop.stop = *cp++; + c->u.byte_array_stop.stop = *cp++; if (CRAM_MAJOR_VERS(version) == 1) { - c->byte_array_stop.content_id = cp[0] + (cp[1]<<8) + (cp[2]<<16) + c->u.byte_array_stop.content_id = cp[0] + (cp[1]<<8) + (cp[2]<<16) + (cp[3]<<24); cp += 4; } else { cp += safe_itf8_get((char *) cp, data + size, - 
&c->byte_array_stop.content_id); + &c->u.byte_array_stop.content_id); } if ((char *)cp - data != size) @@ -1715,8 +1943,11 @@ cram_codec *cram_byte_array_stop_decode_init(char *data, int size, int cram_byte_array_stop_encode(cram_slice *slice, cram_codec *c, char *in, int in_size) { BLOCK_APPEND(c->out, in, in_size); - BLOCK_APPEND_CHAR(c->out, c->e_byte_array_stop.stop); + BLOCK_APPEND_CHAR(c->out, c->u.e_byte_array_stop.stop); return 0; + + block_err: + return -1; } void cram_byte_array_stop_encode_free(cram_codec *c) { @@ -1740,21 +1971,24 @@ int cram_byte_array_stop_encode_store(cram_codec *c, cram_block *b, if (CRAM_MAJOR_VERS(version) == 1) { cp += itf8_put(cp, 5); - *cp++ = c->e_byte_array_stop.stop; - *cp++ = (c->e_byte_array_stop.content_id >> 0) & 0xff; - *cp++ = (c->e_byte_array_stop.content_id >> 8) & 0xff; - *cp++ = (c->e_byte_array_stop.content_id >> 16) & 0xff; - *cp++ = (c->e_byte_array_stop.content_id >> 24) & 0xff; + *cp++ = c->u.e_byte_array_stop.stop; + *cp++ = (c->u.e_byte_array_stop.content_id >> 0) & 0xff; + *cp++ = (c->u.e_byte_array_stop.content_id >> 8) & 0xff; + *cp++ = (c->u.e_byte_array_stop.content_id >> 16) & 0xff; + *cp++ = (c->u.e_byte_array_stop.content_id >> 24) & 0xff; } else { - cp += itf8_put(cp, 1 + itf8_size(c->e_byte_array_stop.content_id)); - *cp++ = c->e_byte_array_stop.stop; - cp += itf8_put(cp, c->e_byte_array_stop.content_id); + cp += itf8_put(cp, 1 + itf8_size(c->u.e_byte_array_stop.content_id)); + *cp++ = c->u.e_byte_array_stop.stop; + cp += itf8_put(cp, c->u.e_byte_array_stop.content_id); } BLOCK_APPEND(b, buf, cp-buf); len += cp-buf; return len; + + block_err: + return -1; } cram_codec *cram_byte_array_stop_encode_init(cram_stats *st, @@ -1771,8 +2005,8 @@ cram_codec *cram_byte_array_stop_encode_init(cram_stats *st, c->encode = cram_byte_array_stop_encode; c->store = cram_byte_array_stop_encode_store; - c->e_byte_array_stop.stop = ((int *)dat)[0]; - c->e_byte_array_stop.content_id = ((int *)dat)[1]; + 
c->u.e_byte_array_stop.stop = ((int *)dat)[0]; + c->u.e_byte_array_stop.content_id = ((int *)dat)[1]; return c; } @@ -1871,7 +2105,7 @@ int cram_codec_to_id(cram_codec *c, int *id2) { switch (c->codec) { case E_HUFFMAN: - bnum1 = c->huffman.ncodes == 1 ? -2 : -1; + bnum1 = c->u.huffman.ncodes == 1 ? -2 : -1; break; case E_GOLOMB: case E_BETA: @@ -1881,14 +2115,14 @@ int cram_codec_to_id(cram_codec *c, int *id2) { bnum1 = -1; break; case E_EXTERNAL: - bnum1 = c->external.content_id; + bnum1 = c->u.external.content_id; break; case E_BYTE_ARRAY_LEN: - bnum1 = cram_codec_to_id(c->byte_array_len.len_codec, NULL); - bnum2 = cram_codec_to_id(c->byte_array_len.val_codec, NULL); + bnum1 = cram_codec_to_id(c->u.byte_array_len.len_codec, NULL); + bnum2 = cram_codec_to_id(c->u.byte_array_len.val_codec, NULL); break; case E_BYTE_ARRAY_STOP: - bnum1 = c->byte_array_stop.content_id; + bnum1 = c->u.byte_array_stop.content_id; break; case E_NULL: bnum1 = -2; @@ -1925,6 +2159,8 @@ int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) { c->store = cram_external_encode_store; if (c->decode == cram_external_decode_int) c->encode = cram_external_encode_int; + if (c->decode == cram_external_decode_long) + c->encode = cram_external_encode_long; else if (c->decode == cram_external_decode_char) c->encode = cram_external_encode_char; else @@ -1939,12 +2175,12 @@ int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) { t->codec = E_HUFFMAN; t->free = cram_huffman_encode_free; t->store = cram_huffman_encode_store; - t->e_huffman.codes = c->huffman.codes; - t->e_huffman.nvals = c->huffman.ncodes; - for (j = 0; j < t->e_huffman.nvals; j++) { - int32_t sym = t->e_huffman.codes[j].symbol; + t->u.e_huffman.codes = c->u.huffman.codes; + t->u.e_huffman.nvals = c->u.huffman.ncodes; + for (j = 0; j < t->u.e_huffman.nvals; j++) { + int32_t sym = t->u.e_huffman.codes[j].symbol; if (sym >= -1 && sym < MAX_HUFF) - t->e_huffman.val2code[sym+1] = j; + t->u.e_huffman.val2code[sym+1] = j; } if 
(c->decode == cram_huffman_decode_char0) @@ -1955,6 +2191,10 @@ int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) { t->encode = cram_huffman_encode_int0; else if (c->decode == cram_huffman_decode_int) t->encode = cram_huffman_encode_int; + else if (c->decode == cram_huffman_decode_long0) + t->encode = cram_huffman_encode_long0; + else if (c->decode == cram_huffman_decode_long) + t->encode = cram_huffman_encode_long; else { free(t); return -1; @@ -1970,6 +2210,8 @@ int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) { c->store = cram_beta_encode_store; if (c->decode == cram_beta_decode_int) c->encode = cram_beta_encode_int; + else if (c->decode == cram_beta_decode_long) + c->encode = cram_beta_encode_long; else if (c->decode == cram_beta_decode_char) c->encode = cram_beta_encode_char; else @@ -1982,10 +2224,10 @@ int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) { t->free = cram_byte_array_len_encode_free; t->store = cram_byte_array_len_encode_store; t->encode = cram_byte_array_len_encode; - t->e_byte_array_len.len_codec = c->byte_array_len.len_codec; - t->e_byte_array_len.val_codec = c->byte_array_len.val_codec; - if (cram_codec_decoder2encoder(fd, t->e_byte_array_len.len_codec) == -1 || - cram_codec_decoder2encoder(fd, t->e_byte_array_len.val_codec) == -1) { + t->u.e_byte_array_len.len_codec = c->u.byte_array_len.len_codec; + t->u.e_byte_array_len.val_codec = c->u.byte_array_len.val_codec; + if (cram_codec_decoder2encoder(fd, t->u.e_byte_array_len.len_codec) == -1 || + cram_codec_decoder2encoder(fd, t->u.e_byte_array_len.val_codec) == -1) { t->free(t); return -1; } diff --git a/cram/cram_codecs.h b/cram/cram_codecs.h index e79a4a744..31a170031 100644 --- a/cram/cram_codecs.h +++ b/cram/cram_codecs.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2013 Genome Research Ltd. +Copyright (c) 2012-2015, 2018 Genome Research Ltd. 
Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -28,10 +28,10 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef _CRAM_ENCODINGS_H_ -#define _CRAM_ENCODINGS_H_ +#ifndef CRAM_CODECS_H +#define CRAM_CODECS_H -#include +#include #ifdef __cplusplus extern "C" { @@ -49,7 +49,7 @@ struct cram_codec; * appears. */ typedef struct { - int32_t symbol; + int64_t symbol; int32_t p; // next code start value, minus index to codes[] int32_t code; int32_t len; @@ -65,6 +65,7 @@ typedef struct { cram_huffman_code *codes; int nvals; int val2code[MAX_HUFF+1]; // value to code lookup for small values + int option; } cram_huffman_encoder; typedef struct { @@ -108,9 +109,6 @@ typedef struct { /* * A generic codec structure. */ -#ifdef __SUNPRO_C -# pragma error_messages(off, E_ANONYMOUS_UNION_DECL) -#endif typedef struct cram_codec { enum cram_encoding codec; cram_block *out; @@ -136,11 +134,8 @@ typedef struct cram_codec { cram_byte_array_stop_decoder e_byte_array_stop; cram_byte_array_len_encoder e_byte_array_len; cram_beta_decoder e_beta; - }; + } u; } cram_codec; -#ifdef __SUNPRO_C -# pragma error_messages(default, E_ANONYMOUS_UNION_DECL) -#endif const char *cram_encoding2str(enum cram_encoding t); @@ -198,4 +193,4 @@ int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c); } #endif -#endif /* _CRAM_ENCODINGS_H_ */ +#endif /* CRAM_CODECS_H */ diff --git a/cram/cram_decode.c b/cram/cram_decode.c index 8a831b145..9b19b8704 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2014 Genome Research Ltd. +Copyright (c) 2012-2019 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -33,6 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - Iterator for reading CRAM record by record. 
*/ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include @@ -45,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include "cram/cram.h" #include "cram/os.h" @@ -74,11 +76,18 @@ int cram_decode_TD(char *cp, const char *endp, cram_block_compression_hdr *h) { if (!(b = cram_new_block(0, 0))) return -1; + if (h->TD_blk || h->TL) { + hts_log_warning("More than one TD block found in compression header"); + cram_free_block(h->TD_blk); + free(h->TL); + h->TD_blk = NULL; + h->TL = NULL; + } + /* Decode */ cp += safe_itf8_get(cp, endp, &blk_size); if (!blk_size) { h->nTL = 0; - h->TL = NULL; cram_free_block(b); return cp - op; } @@ -106,8 +115,7 @@ int cram_decode_TD(char *cp, const char *endp, cram_block_compression_hdr *h) { } // Copy - h->nTL = nTL; - if (!(h->TL = calloc(h->nTL, sizeof(unsigned char *)))) { + if (!(h->TL = calloc(nTL, sizeof(*h->TL)))) { cram_free_block(b); return -1; } @@ -117,8 +125,13 @@ int cram_decode_TD(char *cp, const char *endp, cram_block_compression_hdr *h) { i++; } h->TD_blk = b; + h->nTL = nTL; return sz; + + block_err: + cram_free_block(b); + return -1; } /* @@ -147,13 +160,30 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, endp = cp + b->uncomp_size; if (CRAM_MAJOR_VERS(fd->version) == 1) { + int32_t i32; cp += safe_itf8_get(cp, endp, &hdr->ref_seq_id); - cp += safe_itf8_get(cp, endp, &hdr->ref_seq_start); - cp += safe_itf8_get(cp, endp, &hdr->ref_seq_span); +/* + * LARGE_POS used in this code is purely a debugging mechanism for testing + * whether the htslib API can cope with 64-bit quantities. These are + * possible in SAM, but not *yet* in BAM or CRAM. + * + * DO NOT ENABLE LARGE_POS for anything other than debugging / testing. + * + * At some point it is expected these ifdefs will become a version check + * instead. 
+ */ +#ifdef LARGE_POS + cp += safe_ltf8_get(cp, endp, &hdr->ref_seq_start); + cp += safe_ltf8_get(cp, endp, &hdr->ref_seq_span); +#else + cp += safe_itf8_get(cp, endp, &i32); hdr->ref_seq_start=i32; + cp += safe_itf8_get(cp, endp, &i32); hdr->ref_seq_span=i32; +#endif cp += safe_itf8_get(cp, endp, &hdr->num_records); cp += safe_itf8_get(cp, endp, &hdr->num_landmarks); - if ((hdr->num_landmarks < 0 || - hdr->num_landmarks >= SIZE_MAX / sizeof(int32_t))) { + if (hdr->num_landmarks < 0 || + hdr->num_landmarks >= SIZE_MAX / sizeof(int32_t) || + endp - cp < hdr->num_landmarks) { free(hdr); return NULL; } @@ -179,10 +209,6 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, } /* Initialise defaults for preservation map */ - hdr->mapped_qs_included = 0; - hdr->unmapped_qs_included = 0; - hdr->unmapped_placed = 0; - hdr->qs_included = 0; hdr->read_names_included = 0; hdr->AP_delta = 1; memcpy(hdr->substitution_matrix, "CGTNAGTNACTNACGNACGT", 20); @@ -201,40 +227,10 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, } cp += 2; switch(CRAM_KEY(cp[-2],cp[-1])) { - case CRAM_KEY('M','I'): - hd.i = *cp++; - k = kh_put(map, hdr->preservation_map, "MI", &r); - if (-1 == r) { - cram_free_compression_header(hdr); - return NULL; - } - - kh_val(hdr->preservation_map, k) = hd; - hdr->mapped_qs_included = hd.i; - break; - - case CRAM_KEY('U','I'): - hd.i = *cp++; - k = kh_put(map, hdr->preservation_map, "UI", &r); - if (-1 == r) { - cram_free_compression_header(hdr); - return NULL; - } - - kh_val(hdr->preservation_map, k) = hd; - hdr->unmapped_qs_included = hd.i; - break; - - case CRAM_KEY('P','I'): + case CRAM_KEY('M','I'): // was mapped QS included in V1.0 + case CRAM_KEY('U','I'): // was unmapped QS included in V1.0 + case CRAM_KEY('P','I'): // was unmapped placed in V1.0 hd.i = *cp++; - k = kh_put(map, hdr->preservation_map, "PI", &r); - if (-1 == r) { - cram_free_compression_header(hdr); - return NULL; - } - - 
kh_val(hdr->preservation_map, k) = hd; - hdr->unmapped_placed = hd.i; break; case CRAM_KEY('R','N'): @@ -352,10 +348,12 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, char *key = cp; int32_t encoding = E_NULL; int32_t size = 0; - cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc + ptrdiff_t offset; + cram_map *m; + enum cram_DS_ID ds_id; + enum cram_external_type type; - if (!m || endp - cp < 4) { - free(m); + if (endp - cp < 4) { cram_free_compression_header(hdr); return NULL; } @@ -364,18 +362,12 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, cp += safe_itf8_get(cp, endp, &encoding); cp += safe_itf8_get(cp, endp, &size); - // Fill out cram_map purely for cram_dump to dump out. - m->key = (key[0]<<8)|key[1]; - m->encoding = encoding; - m->size = size; - m->offset = cp - (char *)b->data; - m->codec = NULL; + offset = cp - (char *)b->data; - if (m->encoding == E_NULL) + if (encoding == E_NULL) continue; if (size < 0 || endp - cp < size) { - free(m); cram_free_compression_header(hdr); return NULL; } @@ -391,225 +383,116 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, * Neither this C code nor Java reference implementations did this, * so we gloss over it and treat them as int. 
*/ - + ds_id = DS_CORE; if (key[0] == 'B' && key[1] == 'F') { - if (!(hdr->codecs[DS_BF] = cram_decoder_init(encoding, cp, size, - E_INT, - fd->version))) { - cram_free_compression_header(hdr); - return NULL; - } + ds_id = DS_BF; type = E_INT; } else if (key[0] == 'C' && key[1] == 'F') { - if (!(hdr->codecs[DS_CF] = cram_decoder_init(encoding, cp, size, - E_INT, - fd->version))) { - cram_free_compression_header(hdr); - return NULL; - } + ds_id = DS_CF; type = E_INT; } else if (key[0] == 'R' && key[1] == 'I') { - if (!(hdr->codecs[DS_RI] = cram_decoder_init(encoding, cp, size, - E_INT, - fd->version))) { - cram_free_compression_header(hdr); - return NULL; - } + ds_id = DS_RI; type = E_INT; } else if (key[0] == 'R' && key[1] == 'L') { - if (!(hdr->codecs[DS_RL] = cram_decoder_init(encoding, cp, size, - E_INT, - fd->version))) { - cram_free_compression_header(hdr); - return NULL; - } + ds_id = DS_RL; type = E_INT; } else if (key[0] == 'A' && key[1] == 'P') { - if (!(hdr->codecs[DS_AP] = cram_decoder_init(encoding, cp, size, - E_INT, - fd->version))) { - cram_free_compression_header(hdr); - return NULL; - } + ds_id = DS_AP; +#ifdef LARGE_POS + type = E_LONG, +#else + type = E_INT; +#endif } else if (key[0] == 'R' && key[1] == 'G') { - if (!(hdr->codecs[DS_RG] = cram_decoder_init(encoding, cp, size, - E_INT, - fd->version))) { - cram_free_compression_header(hdr); - return NULL; - } + ds_id = DS_RG; type = E_INT; } else if (key[0] == 'M' && key[1] == 'F') { - if (!(hdr->codecs[DS_MF] = cram_decoder_init(encoding, cp, size, - E_INT, - fd->version))) { - cram_free_compression_header(hdr); - return NULL; - } + ds_id = DS_MF; type = E_INT; } else if (key[0] == 'N' && key[1] == 'S') { - if (!(hdr->codecs[DS_NS] = cram_decoder_init(encoding, cp, size, - E_INT, - fd->version))) { - cram_free_compression_header(hdr); - return NULL; - } + ds_id = DS_NS; type = E_INT; } else if (key[0] == 'N' && key[1] == 'P') { - if (!(hdr->codecs[DS_NP] = cram_decoder_init(encoding, cp, size, - 
E_INT, - fd->version))) { - cram_free_compression_header(hdr); - return NULL; - } + ds_id = DS_NP; +#ifdef LARGE_POS + type = E_LONG, +#else + type = E_INT; +#endif } else if (key[0] == 'T' && key[1] == 'S') { - if (!(hdr->codecs[DS_TS] = cram_decoder_init(encoding, cp, size, - E_INT, - fd->version))) { - cram_free_compression_header(hdr); - return NULL; - } + ds_id = DS_TS; +#ifdef LARGE_POS + type = E_LONG, +#else + type = E_INT; +#endif } else if (key[0] == 'N' && key[1] == 'F') { - if (!(hdr->codecs[DS_NF] = cram_decoder_init(encoding, cp, size, - E_INT, - fd->version))) { - cram_free_compression_header(hdr); - return NULL; - } + ds_id = DS_NF; type = E_INT; } else if (key[0] == 'T' && key[1] == 'C') { - if (!(hdr->codecs[DS_TC] = cram_decoder_init(encoding, cp, size, - E_BYTE, - fd->version))) { - cram_free_compression_header(hdr); - return NULL; - } + ds_id = DS_TC; type = E_BYTE; } else if (key[0] == 'T' && key[1] == 'N') { - if (!(hdr->codecs[DS_TN] = cram_decoder_init(encoding, cp, size, - E_INT, - fd->version))) { - cram_free_compression_header(hdr); - return NULL; - } + ds_id = DS_TN; type = E_INT; } else if (key[0] == 'F' && key[1] == 'N') { - if (!(hdr->codecs[DS_FN] = cram_decoder_init(encoding, cp, size, - E_INT, - fd->version))) { - cram_free_compression_header(hdr); - return NULL; - } + ds_id = DS_FN; type = E_INT; } else if (key[0] == 'F' && key[1] == 'C') { - if (!(hdr->codecs[DS_FC] = cram_decoder_init(encoding, cp, size, - E_BYTE, - fd->version))) { - cram_free_compression_header(hdr); - return NULL; - } + ds_id = DS_FC; type = E_BYTE; } else if (key[0] == 'F' && key[1] == 'P') { - if (!(hdr->codecs[DS_FP] = cram_decoder_init(encoding, cp, size, - E_INT, - fd->version))) { - cram_free_compression_header(hdr); - return NULL; - } + ds_id = DS_FP; type = E_INT; } else if (key[0] == 'B' && key[1] == 'S') { - if (!(hdr->codecs[DS_BS] = cram_decoder_init(encoding, cp, size, - E_BYTE, - fd->version))) { - cram_free_compression_header(hdr); - return 
NULL; - } + ds_id = DS_BS; type = E_BYTE; } else if (key[0] == 'I' && key[1] == 'N') { - if (!(hdr->codecs[DS_IN] = cram_decoder_init(encoding, cp, size, - E_BYTE_ARRAY, - fd->version))) { - cram_free_compression_header(hdr); - return NULL; - } + ds_id = DS_IN; type = E_BYTE_ARRAY; } else if (key[0] == 'S' && key[1] == 'C') { - if (!(hdr->codecs[DS_SC] = cram_decoder_init(encoding, cp, size, - E_BYTE_ARRAY, - fd->version))) { - cram_free_compression_header(hdr); - return NULL; - } + ds_id = DS_SC; type = E_BYTE_ARRAY; } else if (key[0] == 'D' && key[1] == 'L') { - if (!(hdr->codecs[DS_DL] = cram_decoder_init(encoding, cp, size, - E_INT, - fd->version))) { - cram_free_compression_header(hdr); - return NULL; - } + ds_id = DS_DL; type = E_INT; } else if (key[0] == 'B' && key[1] == 'A') { - if (!(hdr->codecs[DS_BA] = cram_decoder_init(encoding, cp, size, - E_BYTE, - fd->version))) { - cram_free_compression_header(hdr); - return NULL; - } + ds_id = DS_BA; type = E_BYTE; } else if (key[0] == 'B' && key[1] == 'B') { - if (!(hdr->codecs[DS_BB] = cram_decoder_init(encoding, cp, size, - E_BYTE_ARRAY, - fd->version))) { - cram_free_compression_header(hdr); - return NULL; - } + ds_id = DS_BB; type = E_BYTE_ARRAY; } else if (key[0] == 'R' && key[1] == 'S') { - if (!(hdr->codecs[DS_RS] = cram_decoder_init(encoding, cp, size, - E_INT, - fd->version))) { - cram_free_compression_header(hdr); - return NULL; - } + ds_id = DS_RS; type = E_INT; } else if (key[0] == 'P' && key[1] == 'D') { - if (!(hdr->codecs[DS_PD] = cram_decoder_init(encoding, cp, size, - E_INT, - fd->version))) { - cram_free_compression_header(hdr); - return NULL; - } + ds_id = DS_PD; type = E_INT; } else if (key[0] == 'H' && key[1] == 'C') { - if (!(hdr->codecs[DS_HC] = cram_decoder_init(encoding, cp, size, - E_INT, - fd->version))) { - cram_free_compression_header(hdr); - return NULL; - } + ds_id = DS_HC; type = E_INT; } else if (key[0] == 'M' && key[1] == 'Q') { - if (!(hdr->codecs[DS_MQ] = 
cram_decoder_init(encoding, cp, size, - E_INT, - fd->version))) { - cram_free_compression_header(hdr); - return NULL; - } + ds_id = DS_MQ; type = E_INT; } else if (key[0] == 'R' && key[1] == 'N') { - if (!(hdr->codecs[DS_RN] = cram_decoder_init(encoding, cp, size, - E_BYTE_ARRAY_BLOCK, - fd->version))) { - cram_free_compression_header(hdr); - return NULL; - } + ds_id = DS_RN; type = E_BYTE_ARRAY_BLOCK; } else if (key[0] == 'Q' && key[1] == 'S') { - if (!(hdr->codecs[DS_QS] = cram_decoder_init(encoding, cp, size, - E_BYTE, - fd->version))) { - cram_free_compression_header(hdr); - return NULL; - } + ds_id = DS_QS; type = E_BYTE; } else if (key[0] == 'Q' && key[1] == 'Q') { - if (!(hdr->codecs[DS_QQ] = cram_decoder_init(encoding, cp, size, - E_BYTE_ARRAY, - fd->version))) { - cram_free_compression_header(hdr); - return NULL; - } + ds_id = DS_QQ; type = E_BYTE_ARRAY; } else if (key[0] == 'T' && key[1] == 'L') { - if (!(hdr->codecs[DS_TL] = cram_decoder_init(encoding, cp, size, - E_INT, - fd->version))) { - cram_free_compression_header(hdr); - return NULL; - } + ds_id = DS_TL; type = E_INT; } else if (key[0] == 'T' && key[1] == 'M') { } else if (key[0] == 'T' && key[1] == 'V') { } else { hts_log_warning("Unrecognised key: %.2s", key); } + if (ds_id != DS_CORE) { + if (hdr->codecs[ds_id] != NULL) { + hts_log_warning("Codec for key %.2s defined more than once", + key); + hdr->codecs[ds_id]->free(hdr->codecs[ds_id]); + } + hdr->codecs[ds_id] = cram_decoder_init(encoding, cp, size, + type, fd->version); + if (!hdr->codecs[ds_id]) { + cram_free_compression_header(hdr); + return NULL; + } + } + cp += size; + // Fill out cram_map purely for cram_dump to dump out. 
+ m = malloc(sizeof(*m)); + if (!m) { + cram_free_compression_header(hdr); + return NULL; + } + m->key = CRAM_KEY(key[0], key[1]); + m->encoding = encoding; + m->size = size; + m->offset = offset; + m->codec = NULL; + m->next = hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])]; hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])] = m; } @@ -625,7 +508,7 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, int32_t encoding = E_NULL; int32_t size = 0; cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc - char *key; + uint8_t *key; if (!m || endp - cp < 6) { free(m); @@ -633,7 +516,7 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, return NULL; } - key = cp + 1; + key = (uint8_t *) cp + 1; m->key = (key[0]<<16)|(key[1]<<8)|key[2]; cp += 4; // Strictly ITF8, but this suffices @@ -1094,8 +977,16 @@ cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) { if (b->content_type == MAPPED_SLICE) { cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->ref_seq_id); - cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->ref_seq_start); - cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->ref_seq_span); +#ifdef LARGE_POS + cp += safe_ltf8_get((char *)cp, (char *)cp_end, &hdr->ref_seq_start); + cp += safe_ltf8_get((char *)cp, (char *)cp_end, &hdr->ref_seq_span); +#else + int32_t i32; + cp += safe_itf8_get((char *)cp, (char *)cp_end, &i32); + hdr->ref_seq_start = i32; + cp += safe_itf8_get((char *)cp, (char *)cp_end, &i32); + hdr->ref_seq_span = i32; +#endif } cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->num_records); hdr->record_counter = 0; @@ -1184,12 +1075,16 @@ static int sort_freqs(const void *vp1, const void *vp2) { * Primary CRAM sequence decoder */ -static inline void add_md_char(cram_slice *s, int decode_md, char c, int32_t *md_dist) { +static inline int add_md_char(cram_slice *s, int decode_md, char c, int32_t *md_dist) { if (decode_md) { BLOCK_APPEND_UINT(s->aux_blk, *md_dist); 
BLOCK_APPEND_CHAR(s->aux_blk, c); *md_dist = 0; } + return 0; + + block_err: + return -1; } /* @@ -1197,12 +1092,13 @@ static inline void add_md_char(cram_slice *s, int decode_md, char c, int32_t *md * Generates the sequence, quality and cigar components. */ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, - cram_block *blk, cram_record *cr, SAM_hdr *bfd, + cram_block *blk, cram_record *cr, sam_hdr_t *sh, int cf, char *seq, char *qual, int has_MD, int has_NM) { int prev_pos = 0, f, r = 0, out_sz = 1; int seq_pos = 1; - int cig_len = 0, ref_pos = cr->apos; + int cig_len = 0; + int64_t ref_pos = cr->apos; int32_t fn, i32; enum cigar_op cig_op = BAM_CMATCH; uint32_t *cigar = s->cigar; @@ -1214,6 +1110,7 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, int decode_md = s->decode_md && s->ref && !has_MD && cr->ref_id >= 0; int decode_nm = s->decode_md && s->ref && !has_NM && cr->ref_id >= 0; uint32_t ds = s->data_series; + sam_hrecs_t *bfd = sh->hrecs; if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) { memset(qual, 255, cr->len); @@ -1316,9 +1213,10 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, // FIXME: not N, but nt16 lookup == 15? 
char base = s->ref[ref_pos - s->ref_start + 1 + i]; if (base == 'N') { - add_md_char(s, decode_md, - s->ref[ref_pos - s->ref_start + 1 + i], - &md_dist); + if (add_md_char(s, decode_md, + s->ref[ref_pos - s->ref_start + 1 + i], + &md_dist) < 0) + return -1; nm++; } else { md_dist++; @@ -1452,7 +1350,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, if (pos-1 < cr->len) seq[pos-1] = c->comp_hdr-> substitution_matrix[ref_base][base]; - add_md_char(s, decode_md, ref_call, &md_dist); + if (add_md_char(s, decode_md, ref_call, &md_dist) < 0) + return -1; } } cig_op = BAM_CMATCH; @@ -1791,23 +1690,29 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, if (ref_pos + cr->len-seq_pos +1 > s->ref_end) goto beyond_slice; if (decode_md || decode_nm) { - int i; - for (i = 0; i < cr->len - seq_pos + 1; i++) { - // FIXME: not N, but nt16 lookup == 15? - char base = s->ref[ref_pos - s->ref_start + 1 + i]; - if (base == 'N') { - add_md_char(s, decode_md, - s->ref[ref_pos - s->ref_start + 1 + i], - &md_dist); - nm++; - } else { - md_dist++; + int i, j = ref_pos - s->ref_start + 1; + // FIXME: Update this to match spec once we're also + // ready to update samtools calmd. 
(N vs any ambig) + if (memchr(&s->ref[j], 'N', cr->len - (seq_pos-1))) { + for (i = seq_pos-1, j -= i; i < cr->len; i++) { + char base = s->ref[j+i]; + if (base == 'N') { + if (add_md_char(s, decode_md, 'N', &md_dist) < 0) + return -1; + nm++; + } else { + md_dist++; + } + seq[i] = base; } - seq[seq_pos-1+i] = base; + } else { + // faster than above code + memcpy(&seq[seq_pos-1], &s->ref[j], cr->len - (seq_pos-1)); + md_dist += cr->len - (seq_pos-1); } } else { memcpy(&seq[seq_pos-1], &s->ref[ref_pos - s->ref_start +1], - cr->len - seq_pos + 1); + cr->len - (seq_pos-1)); } } ref_pos += cr->len - seq_pos + 1; @@ -1896,13 +1801,27 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, if (decode_nm) { char buf[7]; - buf[0] = 'N'; buf[1] = 'M'; buf[2] = 'I'; - buf[3] = (nm>> 0) & 0xff; - buf[4] = (nm>> 8) & 0xff; - buf[5] = (nm>>16) & 0xff; - buf[6] = (nm>>24) & 0xff; - BLOCK_APPEND(s->aux_blk, buf, 7); - cr->aux_size += 7; + size_t buf_size; + buf[0] = 'N'; buf[1] = 'M'; + if (nm <= UINT8_MAX) { + buf_size = 4; + buf[2] = 'C'; + buf[3] = (nm>> 0) & 0xff; + } else if (nm <= UINT16_MAX) { + buf_size = 5; + buf[2] = 'S'; + buf[3] = (nm>> 0) & 0xff; + buf[4] = (nm>> 8) & 0xff; + } else { + buf_size = 7; + buf[2] = 'I'; + buf[3] = (nm>> 0) & 0xff; + buf[4] = (nm>> 8) & 0xff; + buf[5] = (nm>>16) & 0xff; + buf[6] = (nm>>24) & 0xff; + } + BLOCK_APPEND(s->aux_blk, buf, buf_size); + cr->aux_size += buf_size; } return r; @@ -1914,6 +1833,9 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, // error reporting in only one. 
hts_log_error("CRAM CIGAR extends beyond slice reference extents"); return -1; + + block_err: + return -1; } /* @@ -1977,6 +1899,9 @@ static int cram_decode_aux_1_0(cram_container *c, cram_slice *s, } return r; + + block_err: + return -1; } static int cram_decode_aux(cram_container *c, cram_slice *s, @@ -2037,6 +1962,9 @@ static int cram_decode_aux(cram_container *c, cram_slice *s, } return r; + + block_err: + return -1; } /* Resolve mate pair cross-references between recs within this slice */ @@ -2075,8 +2003,8 @@ static int cram_decode_slice_xref(cram_slice *s, int required_fields) { */ if (cr->tlen == INT_MIN) { int id1 = rec, id2 = rec; - int aleft = cr->apos, aright = cr->aend; - int tlen; + int64_t aleft = cr->apos, aright = cr->aend; + int64_t tlen; int ref = cr->ref_id; // number of segments starting at the same point. @@ -2205,7 +2133,7 @@ static char *md5_print(unsigned char *md5, char *out) { * -1 on failure */ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, - SAM_hdr *bfd) { + sam_hdr_t *sh) { cram_block *blk = s->block[0]; int32_t bf, ref_id; unsigned char cf; @@ -2216,6 +2144,7 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, int embed_ref; char **refs = NULL; uint32_t ds; + sam_hrecs_t *bfd = sh->hrecs; if (cram_dependent_data_series(fd, c->comp_hdr, s) != 0) return -1; @@ -2286,11 +2215,12 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, //s->ref = cram_get_ref(fd, s->hdr->ref_seq_id, 1, 0); //s->ref_start = 1; - if (fd->required_fields & SAM_SEQ) + if (fd->required_fields & SAM_SEQ) { s->ref = cram_get_ref(fd, s->hdr->ref_seq_id, s->hdr->ref_seq_start, s->hdr->ref_seq_start + s->hdr->ref_seq_span -1); + } s->ref_start = s->hdr->ref_seq_start; s->ref_end = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1; @@ -2313,7 +2243,7 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, if ((fd->required_fields & SAM_SEQ) && s->ref == NULL && s->hdr->ref_seq_id >= 0 && 
!c->comp_hdr->no_ref) { - hts_log_error("Unable to fetch reference #%d %d..%d", + hts_log_error("Unable to fetch reference #%d %"PRId64"..%"PRId64"\n", s->hdr->ref_seq_id, s->hdr->ref_seq_start, s->hdr->ref_seq_start + s->hdr->ref_seq_span-1); return -1; @@ -2458,13 +2388,17 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, } pthread_mutex_lock(&fd->range_lock); - int discard_last_ref = (!fd->unsorted && - last_ref_id >= 0 && + int discard_last_ref = (last_ref_id >= 0 && refs[last_ref_id] && (fd->range.refid == -2 || last_ref_id == fd->range.refid)); pthread_mutex_unlock(&fd->range_lock); - if (discard_last_ref) { + if (discard_last_ref) { + pthread_mutex_lock(&fd->ref_lock); + discard_last_ref = !fd->unsorted; + pthread_mutex_unlock(&fd->ref_lock); + } + if (discard_last_ref) { cram_ref_decr(fd->refs, last_ref_id); refs[last_ref_id] = NULL; } @@ -2503,9 +2437,17 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, if (ds & CRAM_AP) { if (!c->comp_hdr->codecs[DS_AP]) return -1; +#ifdef LARGE_POS r |= c->comp_hdr->codecs[DS_AP] ->decode(s, c->comp_hdr->codecs[DS_AP], blk, (char *)&cr->apos, &out_sz); +#else + int32_t i32; + r |= c->comp_hdr->codecs[DS_AP] + ->decode(s, c->comp_hdr->codecs[DS_AP], blk, + (char *)&i32, &out_sz); + cr->apos = i32; +#endif if (r) return r; if (c->comp_hdr->AP_delta) cr->apos += s->last_apos; @@ -2602,17 +2544,33 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, if (ds & CRAM_NP) { if (!c->comp_hdr->codecs[DS_NP]) return -1; +#ifdef LARGE_POS r |= c->comp_hdr->codecs[DS_NP] ->decode(s, c->comp_hdr->codecs[DS_NP], blk, (char *)&cr->mate_pos, &out_sz); +#else + int32_t i32; + r |= c->comp_hdr->codecs[DS_NP] + ->decode(s, c->comp_hdr->codecs[DS_NP], blk, + (char *)&i32, &out_sz); + cr->mate_pos = i32; +#endif if (r) return r; } if (ds & CRAM_TS) { if (!c->comp_hdr->codecs[DS_TS]) return -1; +#ifdef LARGE_POS r |= c->comp_hdr->codecs[DS_TS] ->decode(s, c->comp_hdr->codecs[DS_TS], 
blk, (char *)&cr->tlen, &out_sz); +#else + int32_t i32; + r |= c->comp_hdr->codecs[DS_TS] + ->decode(s, c->comp_hdr->codecs[DS_TS], blk, + (char *)&i32, &out_sz); + cr->tlen = i32; +#endif if (r) return r; } else { cr->tlen = INT_MIN; @@ -2683,13 +2641,14 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, if (!(bf & BAM_FUNMAP)) { if ((ds & CRAM_AP) && cr->apos <= 0) { - hts_log_error("Read has alignment position %d but no unmapped flag", + hts_log_error("Read has alignment position %"PRId64 + " but no unmapped flag", cr->apos); return -1; } /* Decode sequence and generate CIGAR */ if (ds & (CRAM_SEQ | CRAM_MQ)) { - r |= cram_decode_seq(fd, c, s, blk, cr, bfd, cf, seq, qual, + r |= cram_decode_seq(fd, c, s, blk, cr, sh, cf, seq, qual, has_MD, has_NM); if (r) return r; } else { @@ -2772,13 +2731,16 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, BLOCK_RESIZE_EXACT(s->aux_blk, BLOCK_SIZE(s->aux_blk)+1); return r; + + block_err: + return -1; } typedef struct { cram_fd *fd; cram_container *c; cram_slice *s; - SAM_hdr *h; + sam_hdr_t *h; int exit_code; } cram_decode_job; @@ -2794,7 +2756,7 @@ void *cram_decode_slice_thread(void *arg) { * Spawn a multi-threaded version of cram_decode_slice(). */ int cram_decode_slice_mt(cram_fd *fd, cram_container *c, cram_slice *s, - SAM_hdr *bfd) { + sam_hdr_t *bfd) { cram_decode_job *j; int nonblock; @@ -2841,13 +2803,14 @@ int cram_decode_slice_mt(cram_fd *fd, cram_container *c, cram_slice *s, * Returns the used size of the bam record on success * -1 on failure. 
*/ -static int cram_to_bam(SAM_hdr *bfd, cram_fd *fd, cram_slice *s, +static int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s, cram_record *cr, int rec, bam_seq_t **bam) { int bam_idx, rg_len; char name_a[1024], *name; int name_len; char *aux, *aux_orig; char *seq, *qual; + sam_hrecs_t *bfd = sh->hrecs; /* Assign names if not explicitly set */ if (fd->required_fields & SAM_QNAME) { @@ -2951,6 +2914,9 @@ static cram_container *cram_first_slice(cram_fd *fd) { cram_container *c; do { + if (fd->ctr) + cram_free_container(fd->ctr); + if (!(c = fd->ctr = cram_read_container(fd))) return NULL; c->curr_slice_mt = c->curr_slice; @@ -2991,7 +2957,7 @@ static cram_container *cram_first_slice(cram_fd *fd) { if (!c->comp_hdr) return NULL; if (!c->comp_hdr->AP_delta && - sam_hdr_sort_order(fd->header) != ORDER_COORD) { + sam_hrecs_sort_order(fd->header->hrecs) != ORDER_COORD) { pthread_mutex_lock(&fd->ref_lock); fd->unsorted = 1; pthread_mutex_unlock(&fd->ref_lock); @@ -3119,7 +3085,7 @@ static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) { return NULL; if (!c_next->comp_hdr->AP_delta && - sam_hdr_sort_order(fd->header) != ORDER_COORD) { + sam_hrecs_sort_order(fd->header->hrecs) != ORDER_COORD) { pthread_mutex_lock(&fd->ref_lock); fd->unsorted = 1; pthread_mutex_unlock(&fd->ref_lock); @@ -3127,9 +3093,13 @@ static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) { } if (c_next->num_records == 0) { - cram_free_container(c_next); + if (fd->ctr == c_next) + fd->ctr = NULL; + if (c_curr == c_next) + c_curr = NULL; if (fd->ctr_mt == c_next) fd->ctr_mt = NULL; + cram_free_container(c_next); c_next = NULL; goto empty_container; } diff --git a/cram/cram_decode.h b/cram/cram_decode.h index 8de0edf66..400eb6beb 100644 --- a/cram/cram_decode.h +++ b/cram/cram_decode.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2013 Genome Research Ltd. +Copyright (c) 2012-2013, 2018 Genome Research Ltd. 
Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -38,8 +38,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * cram_codecs.[ch] for the actual encoding functions themselves. */ -#ifndef _CRAM_READ_H_ -#define _CRAM_READ_H_ +#ifndef CRAM_DECODE_H +#define CRAM_DECODE_H #ifdef __cplusplus extern "C" { @@ -102,7 +102,7 @@ cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b); * -1 on failure */ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, - SAM_hdr *hdr); + sam_hdr_t *hdr); /* diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 08f0e431c..4a5d18189 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2013 Genome Research Ltd. +Copyright (c) 2012-2019 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -28,6 +28,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include @@ -35,10 +36,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include #include +#include #include "cram/cram.h" #include "cram/os.h" @@ -54,7 +57,7 @@ KHASH_MAP_INIT_STR(m_s2u64, uint64_t) static int process_one_read(cram_fd *fd, cram_container *c, cram_slice *s, cram_record *cr, - bam_seq_t *b, int rnum); + bam_seq_t *b, int rnum, kstring_t *MD); /* * Returns index of val into key. 
@@ -77,7 +80,7 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, cram_block_compression_hdr *h) { cram_block *cb = cram_new_block(COMPRESSION_HEADER, 0); cram_block *map = cram_new_block(COMPRESSION_HEADER, 0); - int i, mc; + int i, mc, r = 0; if (!cb || !map) return NULL; @@ -91,15 +94,31 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, * the total size (stored as a variable length string). */ +/* + * LARGE_POS used in this code is purely a debugging mechanism for testing + * whether the htslib API can cope with 64-bit quantities. These are + * possible in SAM, but not *yet* in BAM or CRAM. + * + * DO NOT ENABLE LARGE_POS for anything other than debugging / testing. + * + * At some point it is expected these ifdefs will become a version check + * instead. + */ + // Duplicated from container itself, and removed in 1.1 if (CRAM_MAJOR_VERS(fd->version) == 1) { - itf8_put_blk(cb, h->ref_seq_id); - itf8_put_blk(cb, h->ref_seq_start); - itf8_put_blk(cb, h->ref_seq_span); - itf8_put_blk(cb, h->num_records); - itf8_put_blk(cb, h->num_landmarks); + r |= itf8_put_blk(cb, h->ref_seq_id); +#ifdef LARGE_POS + r |= ltf8_put_blk(cb, h->ref_seq_start); + r |= ltf8_put_blk(cb, h->ref_seq_span); +#else + r |= itf8_put_blk(cb, h->ref_seq_start); + r |= itf8_put_blk(cb, h->ref_seq_span); +#endif + r |= itf8_put_blk(cb, h->num_records); + r |= itf8_put_blk(cb, h->num_landmarks); for (i = 0; i < h->num_landmarks; i++) { - itf8_put_blk(cb, h->landmark[i]); + r |= itf8_put_blk(cb, h->landmark[i]); } } @@ -231,7 +250,7 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, } case CRAM_KEY('T','D'): { - itf8_put_blk(map, BLOCK_SIZE(h->TD_blk)); + r |= itf8_put_blk(map, BLOCK_SIZE(h->TD_blk)); BLOCK_APPEND(map, BLOCK_DATA(h->TD_blk), BLOCK_SIZE(h->TD_blk)); @@ -246,8 +265,8 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, mc++; } } - itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc)); 
- itf8_put_blk(cb, mc); + r |= itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc)); + r |= itf8_put_blk(cb, mc); BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map)); /* rec encoding map */ @@ -447,8 +466,8 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, return NULL; mc++; } - itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc)); - itf8_put_blk(cb, mc); + r |= itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc)); + r |= itf8_put_blk(cb, mc); BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map)); /* tag encoding map */ @@ -487,7 +506,7 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, key = kh_key(c->tags_used, k); cram_codec *cd = kh_val(c->tags_used, k)->codec; - itf8_put_blk(map, key); + r |= itf8_put_blk(map, key); if (-1 == cd->store(cd, map, NULL, fd->version)) return NULL; @@ -495,8 +514,8 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, } } #endif - itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc)); - itf8_put_blk(cb, mc); + r |= itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc)); + r |= itf8_put_blk(cb, mc); BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map)); hts_log_info("Wrote compression block header in %d bytes", (int)BLOCK_SIZE(cb)); @@ -505,7 +524,11 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, cram_free_block(map); - return cb; + if (r >= 0) + return cb; + + block_err: + return NULL; } @@ -530,8 +553,13 @@ cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s) { } cp += itf8_put(cp, s->hdr->ref_seq_id); +#ifdef LARGE_POS + cp += ltf8_put(cp, s->hdr->ref_seq_start); + cp += ltf8_put(cp, s->hdr->ref_seq_span); +#else cp += itf8_put(cp, s->hdr->ref_seq_start); cp += itf8_put(cp, s->hdr->ref_seq_span); +#endif cp += itf8_put(cp, s->hdr->num_records); if (CRAM_MAJOR_VERS(fd->version) == 2) cp += itf8_put(cp, s->hdr->record_counter); @@ -569,7 +597,7 @@ static int cram_encode_slice_read(cram_fd *fd, cram_block_compression_hdr *h, cram_slice 
*s, cram_record *cr, - int *last_pos) { + int64_t *last_pos) { int r = 0; int32_t i32; unsigned char uc; @@ -590,12 +618,24 @@ static int cram_encode_slice_read(cram_fd *fd, r |= h->codecs[DS_RL]->encode(s, h->codecs[DS_RL], (char *)&cr->len, 1); if (c->pos_sorted) { +#ifdef LARGE_POS + int64_t i64; + i64 = cr->apos - *last_pos; + r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1); +#else i32 = cr->apos - *last_pos; r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1); +#endif *last_pos = cr->apos; } else { +#ifdef LARGE_POS + int64_t i64; + i64 = cr->apos; + r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1); +#else i32 = cr->apos; r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1); +#endif } r |= h->codecs[DS_RG]->encode(s, h->codecs[DS_RG], (char *)&cr->rg, 1); @@ -607,11 +647,20 @@ static int cram_encode_slice_read(cram_fd *fd, r |= h->codecs[DS_NS]->encode(s, h->codecs[DS_NS], (char *)&cr->mate_ref_id, 1); +#ifdef LARGE_POS r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP], (char *)&cr->mate_pos, 1); r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS], (char *)&cr->tlen, 1); +#else + i32 = cr->mate_pos; + r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP], + (char *)&i32, 1); + i32 = cr->tlen; + r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS], + (char *)&i32, 1); +#endif } else if (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) { r |= h->codecs[DS_NF]->encode(s, h->codecs[DS_NF], (char *)&cr->mate_line, 1); @@ -905,7 +954,8 @@ static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) { */ static int cram_encode_slice(cram_fd *fd, cram_container *c, cram_block_compression_hdr *h, cram_slice *s) { - int rec, r = 0, last_pos; + int rec, r = 0; + int64_t last_pos; int embed_ref; enum cram_DS_ID id; @@ -942,11 +992,10 @@ static int cram_encode_slice(cram_fd *fd, cram_container *c, if (CRAM_MAJOR_VERS(fd->version) == 1) { if (h->codecs[DS_TN]->codec == E_EXTERNAL) { if 
(!(s->block[DS_TN] = cram_new_block(EXTERNAL,DS_TN))) return -1; - h->codecs[DS_TN]->external.content_id = DS_TN; + h->codecs[DS_TN]->u.external.content_id = DS_TN; } else { s->block[DS_TN] = s->block[0]; } - s->block[DS_TN] = s->block[DS_TN]; } // Embedded reference @@ -955,8 +1004,8 @@ static int cram_encode_slice(cram_fd *fd, cram_container *c, return -1; s->ref_id = DS_ref; // needed? BLOCK_APPEND(s->block[DS_ref], - c->ref + c->first_base - c->ref_start, - c->last_base - c->first_base + 1); + c->ref + s->hdr->ref_seq_start - c->ref_start, + s->hdr->ref_seq_span); } /* @@ -970,34 +1019,34 @@ static int cram_encode_slice(cram_fd *fd, cram_container *c, case E_EXTERNAL: if (!(s->block[id] = cram_new_block(EXTERNAL, id))) return -1; - h->codecs[id]->external.content_id = id; + h->codecs[id]->u.external.content_id = id; break; case E_BYTE_ARRAY_STOP: if (!(s->block[id] = cram_new_block(EXTERNAL, id))) return -1; - h->codecs[id]->byte_array_stop.content_id = id; + h->codecs[id]->u.byte_array_stop.content_id = id; break; case E_BYTE_ARRAY_LEN: { cram_codec *cc; - cc = h->codecs[id]->e_byte_array_len.len_codec; + cc = h->codecs[id]->u.e_byte_array_len.len_codec; if (cc->codec == E_EXTERNAL) { - int eid = cc->external.content_id; + int eid = cc->u.external.content_id; if (!(s->block[eid] = cram_new_block(EXTERNAL, eid))) return -1; - cc->external.content_id = eid; + cc->u.external.content_id = eid; cc->out = s->block[eid]; } - cc = h->codecs[id]->e_byte_array_len.val_codec; + cc = h->codecs[id]->u.e_byte_array_len.val_codec; if (cc->codec == E_EXTERNAL) { - int eid = cc->external.content_id; + int eid = cc->u.external.content_id; if (!s->block[eid]) if (!(s->block[eid] = cram_new_block(EXTERNAL, eid))) return -1; - cc->external.content_id = eid; + cc->u.external.content_id = eid; cc->out = s->block[eid]; } break; @@ -1082,6 +1131,9 @@ static int cram_encode_slice(cram_fd *fd, cram_container *c, } return r ? 
-1 : 0; + + block_err: + return -1; } /* @@ -1219,8 +1271,8 @@ static int lossy_read_names(cram_fd *fd, cram_container *c, cram_slice *s, * Output is an update s->name_blk, and cr->name / cr->name_len * fields. */ -static void add_read_names(cram_fd *fd, cram_container *c, cram_slice *s, - int bam_start) { +static int add_read_names(cram_fd *fd, cram_container *c, cram_slice *s, + int bam_start) { int r1, r2; int keep_names = !fd->lossy_read_names; @@ -1238,8 +1290,14 @@ static void add_read_names(cram_fd *fd, cram_container *c, cram_slice *s, // Can only discard duplicate names if not detached cr->name_len = 0; } - cram_stats_add(c->stats[DS_RN], cr->name_len); + if (cram_stats_add(c->stats[DS_RN], cr->name_len) < 0) + goto block_err; } + + return 0; + + block_err: + return -1; } /* @@ -1255,6 +1313,9 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { int r1, r2, sn, nref; spare_bams *spares; +//#define goto_err {fprintf(stderr, "ERR at %s:%d\n", __FILE__, __LINE__);goto err;} +#define goto_err goto err + /* Cache references up-front if we have unsorted access patterns */ pthread_mutex_lock(&fd->ref_lock); nref = fd->refs->nref; @@ -1270,10 +1331,11 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { /* To create M5 strings */ /* Fetch reference sequence */ if (!fd->no_ref) { + if (!c->bams || !c->bams[0]) + goto_err; bam_seq_t *b = c->bams[0]; - char *ref; - ref = cram_get_ref(fd, bam_ref(b), 1, 0); + char *ref = cram_get_ref(fd, bam_ref(b), 1, 0); if (!ref && bam_ref(b) >= 0) { hts_log_error("Failed to load reference #%d", bam_ref(b)); return -1; @@ -1295,7 +1357,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { /* Turn bams into cram_records and gather basic stats */ for (r1 = sn = 0; r1 < c->curr_c_rec; sn++) { cram_slice *s = c->slices[sn]; - int first_base = INT_MAX, last_base = INT_MIN; + int64_t first_base = INT64_MAX, last_base = INT64_MIN; int r1_start = r1; @@ -1306,6 +1368,12 @@ int cram_encode_container(cram_fd 
*fd, cram_container *c) { if (lossy_read_names(fd, c, s, r1_start) != 0) return -1; + // Tracking of MD tags so we can spot when the auto-generated values + // will differ from the current stored ones. The kstring here is + // simply to avoid excessive malloc and free calls. All initialisation + // is done within process_one_read(). + kstring_t MD = {0}; + // Iterate through records creating the cram blocks for some // fields and just gathering stats for others. for (r2 = 0; r1 < c->curr_c_rec && r2 < s->hdr->num_records; r1++, r2++) { @@ -1320,6 +1388,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { if (!cram_get_ref(fd, bam_ref(b), 1, 0)) { hts_log_error("Failed to load reference #%d", bam_ref(b)); + free(MD.s); return -1; } @@ -1332,8 +1401,10 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { } } - if (process_one_read(fd, c, s, cr, b, r2) != 0) + if (process_one_read(fd, c, s, cr, b, r2, &MD) != 0) { + free(MD.s); return -1; + } if (first_base > cr->apos) first_base = cr->apos; @@ -1341,13 +1412,15 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { if (last_base < cr->aend) last_base = cr->aend; } + free(MD.s); // Process_one_read doesn't add read names as it can change // its mind during the loop on the CRAM_FLAG_DETACHED setting // of earlier records (if it detects the auto-generation of // TLEN is incorrect). This affects which read-names can be // lossily compressed, so we do these in another pass. 
- add_read_names(fd, c, s, r1_start); + if (add_read_names(fd, c, s, r1_start) < 0) + return -1; if (c->multi_seq) { s->hdr->ref_seq_id = -2; @@ -1376,6 +1449,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { continue; cram_tag_map *tm = kh_val(c->tags_used, k); + if (!tm) goto_err; if (!tm->blk) continue; s->aux_block[s->naux_block++] = tm->blk; tm->blk = NULL; @@ -1391,6 +1465,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { /* Link our bams[] array onto the spare bam list for reuse */ spares = malloc(sizeof(*spares)); + if (!spares) goto_err; pthread_mutex_lock(&fd->bam_list_lock); spares->bams = c->bams; spares->next = fd->bl; @@ -1401,6 +1476,10 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { /* Detect if a multi-seq container */ cram_stats_encoding(fd, c->stats[DS_RI]); multi_ref = c->stats[DS_RI]->nvals > 1; + pthread_mutex_lock(&fd->metrics_lock); + fd->last_RI_count = c->stats[DS_RI]->nvals; + pthread_mutex_unlock(&fd->metrics_lock); + if (multi_ref) { hts_log_info("Multi-ref container"); @@ -1438,11 +1517,14 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { h->codecs[DS_BF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BF]), c->stats[DS_BF], E_INT, NULL, fd->version); + if (c->stats[DS_BF]->nvals && !h->codecs[DS_BF]) goto_err; //fprintf(stderr, "=== CF ===\n"); h->codecs[DS_CF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_CF]), c->stats[DS_CF], E_INT, NULL, fd->version); + if (c->stats[DS_CF]->nvals && !h->codecs[DS_CF]) goto_err; + //fprintf(stderr, "=== RN ===\n"); //h->codecs[DS_RN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RN]), // c->stats[DS_RN], E_BYTE_ARRAY, NULL, @@ -1451,76 +1533,107 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { //fprintf(stderr, "=== AP ===\n"); if (c->pos_sorted) { h->codecs[DS_AP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_AP]), - c->stats[DS_AP], E_INT, NULL, - fd->version); + c->stats[DS_AP], +#ifdef 
LARGE_POS + E_LONG, +#else + E_INT, +#endif + NULL, fd->version); } else { int p[2] = {0, c->max_apos}; h->codecs[DS_AP] = cram_encoder_init(E_BETA, NULL, E_INT, p, fd->version); } + if (!h->codecs[DS_AP]) goto_err; //fprintf(stderr, "=== RG ===\n"); h->codecs[DS_RG] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RG]), c->stats[DS_RG], E_INT, NULL, fd->version); + if (c->stats[DS_RG]->nvals && !h->codecs[DS_RG]) goto_err; //fprintf(stderr, "=== MQ ===\n"); h->codecs[DS_MQ] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MQ]), c->stats[DS_MQ], E_INT, NULL, fd->version); + if (c->stats[DS_MQ]->nvals && !h->codecs[DS_MQ]) goto_err; //fprintf(stderr, "=== NS ===\n"); h->codecs[DS_NS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NS]), c->stats[DS_NS], E_INT, NULL, fd->version); + if (c->stats[DS_NS]->nvals && !h->codecs[DS_NS]) goto_err; //fprintf(stderr, "=== MF ===\n"); h->codecs[DS_MF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MF]), c->stats[DS_MF], E_INT, NULL, fd->version); + if (c->stats[DS_MF]->nvals && !h->codecs[DS_MF]) goto_err; //fprintf(stderr, "=== TS ===\n"); h->codecs[DS_TS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TS]), - c->stats[DS_TS], E_INT, NULL, - fd->version); + c->stats[DS_TS], +#ifdef LARGE_POS + E_LONG, +#else + E_INT, +#endif + NULL, fd->version); + if (c->stats[DS_TS]->nvals && !h->codecs[DS_TS]) goto_err; + //fprintf(stderr, "=== NP ===\n"); h->codecs[DS_NP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NP]), - c->stats[DS_NP], E_INT, NULL, - fd->version); + c->stats[DS_NP], +#ifdef LARGE_POS + E_LONG, +#else + E_INT, +#endif + NULL, fd->version); + if (c->stats[DS_NP]->nvals && !h->codecs[DS_NP]) goto_err; + //fprintf(stderr, "=== NF ===\n"); h->codecs[DS_NF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NF]), c->stats[DS_NF], E_INT, NULL, fd->version); + if (c->stats[DS_NF]->nvals && !h->codecs[DS_NF]) goto_err; //fprintf(stderr, "=== RL ===\n"); h->codecs[DS_RL] 
= cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RL]), c->stats[DS_RL], E_INT, NULL, fd->version); + if (c->stats[DS_RL]->nvals && !h->codecs[DS_RL]) goto_err; //fprintf(stderr, "=== FN ===\n"); h->codecs[DS_FN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FN]), c->stats[DS_FN], E_INT, NULL, fd->version); + if (c->stats[DS_FN]->nvals && !h->codecs[DS_FN]) goto_err; //fprintf(stderr, "=== FC ===\n"); h->codecs[DS_FC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FC]), c->stats[DS_FC], E_BYTE, NULL, fd->version); + if (c->stats[DS_FC]->nvals && !h->codecs[DS_FC]) goto_err; //fprintf(stderr, "=== FP ===\n"); h->codecs[DS_FP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FP]), c->stats[DS_FP], E_INT, NULL, fd->version); + if (c->stats[DS_FP]->nvals && !h->codecs[DS_FP]) goto_err; //fprintf(stderr, "=== DL ===\n"); h->codecs[DS_DL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_DL]), c->stats[DS_DL], E_INT, NULL, fd->version); + if (c->stats[DS_DL]->nvals && !h->codecs[DS_DL]) goto_err; //fprintf(stderr, "=== BA ===\n"); h->codecs[DS_BA] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BA]), c->stats[DS_BA], E_BYTE, NULL, fd->version); + if (c->stats[DS_BA]->nvals && !h->codecs[DS_BA]) goto_err; if (CRAM_MAJOR_VERS(fd->version) >= 3) { cram_byte_array_len_encoder e; @@ -1535,6 +1648,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { h->codecs[DS_BB] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL, E_BYTE_ARRAY, (void *)&e, fd->version); + if (!h->codecs[DS_BB]) goto_err; } else { h->codecs[DS_BB] = NULL; } @@ -1543,6 +1657,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { h->codecs[DS_BS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BS]), c->stats[DS_BS], E_BYTE, NULL, fd->version); + if (c->stats[DS_BS]->nvals && !h->codecs[DS_BS]) goto_err; if (CRAM_MAJOR_VERS(fd->version) == 1) { h->codecs[DS_TL] = NULL; @@ -1556,11 +1671,13 @@ int cram_encode_container(cram_fd *fd, 
cram_container *c) { h->codecs[DS_TC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TC]), c->stats[DS_TC], E_BYTE, NULL, fd->version); + if (c->stats[DS_TC]->nvals && !h->codecs[DS_TC]) goto_err; //fprintf(stderr, "=== TN ===\n"); h->codecs[DS_TN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TN]), c->stats[DS_TN], E_INT, NULL, fd->version); + if (c->stats[DS_TN]->nvals && !h->codecs[DS_TN]) goto_err; } else { h->codecs[DS_TC] = NULL; h->codecs[DS_TN] = NULL; @@ -1569,27 +1686,32 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { h->codecs[DS_TL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TL]), c->stats[DS_TL], E_INT, NULL, fd->version); + if (c->stats[DS_TL]->nvals && !h->codecs[DS_TL]) goto_err; //fprintf(stderr, "=== RI ===\n"); h->codecs[DS_RI] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RI]), c->stats[DS_RI], E_INT, NULL, fd->version); + if (c->stats[DS_RI]->nvals && !h->codecs[DS_RI]) goto_err; //fprintf(stderr, "=== RS ===\n"); h->codecs[DS_RS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RS]), c->stats[DS_RS], E_INT, NULL, fd->version); + if (c->stats[DS_RS]->nvals && !h->codecs[DS_RS]) goto_err; //fprintf(stderr, "=== PD ===\n"); h->codecs[DS_PD] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_PD]), c->stats[DS_PD], E_INT, NULL, fd->version); + if (c->stats[DS_PD]->nvals && !h->codecs[DS_PD]) goto_err; //fprintf(stderr, "=== HC ===\n"); h->codecs[DS_HC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_HC]), c->stats[DS_HC], E_INT, NULL, fd->version); + if (c->stats[DS_HC]->nvals && !h->codecs[DS_HC]) goto_err; //fprintf(stderr, "=== SC ===\n"); if (1) { @@ -1614,6 +1736,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { E_BYTE_ARRAY, (void *)&e, fd->version); } + if (!h->codecs[DS_SC]) goto_err; } //fprintf(stderr, "=== IN ===\n"); @@ -1622,16 +1745,19 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { h->codecs[DS_IN] = 
cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, E_BYTE_ARRAY, (void *)i2, fd->version); + if (!h->codecs[DS_IN]) goto_err; } h->codecs[DS_QS] = cram_encoder_init(E_EXTERNAL, NULL, E_BYTE, (void *)DS_QS, fd->version); + if (!h->codecs[DS_QS]) goto_err; { int i2[2] = {0, DS_RN}; h->codecs[DS_RN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, E_BYTE_ARRAY, (void *)i2, fd->version); + if (!h->codecs[DS_RN]) goto_err; } @@ -1649,11 +1775,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { h->ref_seq_start = c->ref_seq_start; h->ref_seq_span = c->ref_seq_span; h->num_records = c->num_records; - - h->mapped_qs_included = 0; // fixme - h->unmapped_qs_included = 0; // fixme h->AP_delta = c->pos_sorted; - // h->... fixme memcpy(h->substitution_matrix, CRAM_SUBST_MATRIX, 20); if (!(c_hdr = cram_encode_compression_header(fd, c, h))) @@ -1733,6 +1855,9 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { } return 0; + + err: + return -1; } @@ -1756,12 +1881,16 @@ static int cram_add_feature(cram_container *c, cram_slice *s, if (!r->nfeature++) { r->feature = s->nfeatures; - cram_stats_add(c->stats[DS_FP], f->X.pos); + if (cram_stats_add(c->stats[DS_FP], f->X.pos) < 0) + return -1; } else { - cram_stats_add(c->stats[DS_FP], - f->X.pos - s->features[r->feature + r->nfeature-2].X.pos); + if (cram_stats_add(c->stats[DS_FP], + f->X.pos - s->features[r->feature + r->nfeature-2].X.pos) < 0) + return -1; + } - cram_stats_add(c->stats[DS_FC], f->X.code); + if (cram_stats_add(c->stats[DS_FC], f->X.code) < 0) + return -1; s->features[s->nfeatures++] = *f; @@ -1778,17 +1907,21 @@ static int cram_add_substitution(cram_fd *fd, cram_container *c, f.X.pos = pos+1; f.X.code = 'X'; f.X.base = fd->cram_sub_matrix[ref&0x1f][base&0x1f]; - cram_stats_add(c->stats[DS_BS], f.X.base); + if (cram_stats_add(c->stats[DS_BS], f.X.base) < 0) + return -1; } else { f.B.pos = pos+1; f.B.code = 'B'; f.B.base = base; f.B.qual = qual; - cram_stats_add(c->stats[DS_BA], f.B.base); - 
cram_stats_add(c->stats[DS_QS], f.B.qual); + if (cram_stats_add(c->stats[DS_BA], f.B.base) < 0) return -1; + if (cram_stats_add(c->stats[DS_QS], f.B.qual) < 0) return -1; BLOCK_APPEND_CHAR(s->qual_blk, qual); } return cram_add_feature(c, s, r, &f); + + block_err: + return -1; } static int cram_add_bases(cram_fd *fd, cram_container *c, @@ -1812,10 +1945,13 @@ static int cram_add_base(cram_fd *fd, cram_container *c, f.B.code = 'B'; f.B.base = base; f.B.qual = qual; - cram_stats_add(c->stats[DS_BA], base); - cram_stats_add(c->stats[DS_QS], qual); + if (cram_stats_add(c->stats[DS_BA], base) < 0) return -1; + if (cram_stats_add(c->stats[DS_QS], qual) < 0) return -1; BLOCK_APPEND_CHAR(s->qual_blk, qual); return cram_add_feature(c, s, r, &f); + + block_err: + return -1; } static int cram_add_quality(cram_fd *fd, cram_container *c, @@ -1825,9 +1961,12 @@ static int cram_add_quality(cram_fd *fd, cram_container *c, f.Q.pos = pos+1; f.Q.code = 'Q'; f.Q.qual = qual; - cram_stats_add(c->stats[DS_QS], qual); + if (cram_stats_add(c->stats[DS_QS], qual) < 0) return -1; BLOCK_APPEND_CHAR(s->qual_blk, qual); return cram_add_feature(c, s, r, &f); + + block_err: + return -1; } static int cram_add_deletion(cram_container *c, cram_slice *s, cram_record *r, @@ -1836,7 +1975,7 @@ static int cram_add_deletion(cram_container *c, cram_slice *s, cram_record *r, f.D.pos = pos+1; f.D.code = 'D'; f.D.len = len; - cram_stats_add(c->stats[DS_DL], len); + if (cram_stats_add(c->stats[DS_DL], len) < 0) return -1; return cram_add_feature(c, s, r, &f); } @@ -1871,6 +2010,9 @@ static int cram_add_softclip(cram_container *c, cram_slice *s, cram_record *r, // f.S.seq_idx = BLOCK_SIZE(s->soft_blk); } return cram_add_feature(c, s, r, &f); + + block_err: + return -1; } static int cram_add_hardclip(cram_container *c, cram_slice *s, cram_record *r, @@ -1879,7 +2021,7 @@ static int cram_add_hardclip(cram_container *c, cram_slice *s, cram_record *r, f.S.pos = pos+1; f.S.code = 'H'; f.S.len = len; - 
cram_stats_add(c->stats[DS_HC], len); + if (cram_stats_add(c->stats[DS_HC], len) < 0) return -1; return cram_add_feature(c, s, r, &f); } @@ -1889,7 +2031,7 @@ static int cram_add_skip(cram_container *c, cram_slice *s, cram_record *r, f.S.pos = pos+1; f.S.code = 'N'; f.S.len = len; - cram_stats_add(c->stats[DS_RS], len); + if (cram_stats_add(c->stats[DS_RS], len) < 0) return -1; return cram_add_feature(c, s, r, &f); } @@ -1899,7 +2041,7 @@ static int cram_add_pad(cram_container *c, cram_slice *s, cram_record *r, f.S.pos = pos+1; f.S.code = 'P'; f.S.len = len; - cram_stats_add(c->stats[DS_PD], len); + if (cram_stats_add(c->stats[DS_PD], len) < 0) return -1; return cram_add_feature(c, s, r, &f); } @@ -1911,7 +2053,7 @@ static int cram_add_insertion(cram_container *c, cram_slice *s, cram_record *r, char b = base ? *base : 'N'; f.i.code = 'i'; f.i.base = b; - cram_stats_add(c->stats[DS_BA], b); + if (cram_stats_add(c->stats[DS_BA], b) < 0) return -1; } else { f.I.code = 'I'; f.I.len = len; @@ -1926,6 +2068,9 @@ static int cram_add_insertion(cram_container *c, cram_slice *s, cram_record *r, BLOCK_APPEND_CHAR(s->base_blk, '\0'); } return cram_add_feature(c, s, r, &f); + + block_err: + return -1; } /* @@ -1983,7 +2128,8 @@ static char *cram_encode_aux_1_0(cram_fd *fd, bam_seq_t *b, cram_container *c, return NULL; } s->TN[s->nTN++] = i32; - cram_stats_add(c->stats[DS_TN], i32); + if (cram_stats_add(c->stats[DS_TN], i32) < 0) + goto block_err; switch(aux[2]) { case 'A': case 'C': case 'c': @@ -2054,7 +2200,8 @@ static char *cram_encode_aux_1_0(cram_fd *fd, bam_seq_t *b, cram_container *c, return NULL; } } - cram_stats_add(c->stats[DS_TC], cr->ntags); + if (cram_stats_add(c->stats[DS_TC], cr->ntags) < 0) + goto block_err; cr->aux = BLOCK_SIZE(s->aux_blk); cr->aux_size = (uc *)tmp - (BLOCK_DATA(s->aux_blk) + cr->aux); @@ -2062,6 +2209,9 @@ static char *cram_encode_aux_1_0(cram_fd *fd, bam_seq_t *b, cram_container *c, assert(s->aux_blk->byte <= s->aux_blk->alloc); return rg; + 
+ block_err: + return NULL; } /* @@ -2069,11 +2219,13 @@ static char *cram_encode_aux_1_0(cram_fd *fd, bam_seq_t *b, cram_container *c, * keep it simple and avoid a myriad of version ifs. * * Returns the read-group parsed out of the BAM aux fields on success - * NULL on failure or no rg present (FIXME) + * NULL on failure or no rg present, also sets "*err" to non-zero */ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, cram_slice *s, cram_record *cr, - int verbatim_NM, int verbatim_MD) { + int verbatim_NM, int verbatim_MD, + int NM, kstring_t *MD, + int *err) { char *aux, *orig, *rg = NULL; int aux_size = bam_get_l_aux(b); cram_block *td_b = c->comp_hdr->TD_blk; @@ -2081,6 +2233,8 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, char *key; khint_t k; + if (err) *err = 1; + orig = aux = (char *)bam_aux(b); // Copy aux keys to td_b and aux values to slice aux blocks @@ -2097,23 +2251,28 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, // MD:Z if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') { if (cr->len && !fd->no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_MD) { - while (*aux++); - continue; + if (MD && MD->s && strncasecmp(MD->s, aux+3, orig + aux_size - (aux+3)) == 0) { + while (*aux++); + continue; + } } } // NM:i if (aux[0] == 'N' && aux[1] == 'M') { if (cr->len && !fd->no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_NM) { - switch(aux[2]) { - case 'A': case 'C': case 'c': aux+=4; break; - case 'S': case 's': aux+=5; break; - case 'I': case 'i': case 'f': aux+=7; break; - default: - hts_log_error("Unhandled type code for NM tag"); - return NULL; + int NM_ = bam_aux2i((uint8_t *)aux+2); + if (NM_ == NM) { + switch(aux[2]) { + case 'A': case 'C': case 'c': aux+=4; break; + case 'S': case 's': aux+=5; break; + case 'I': case 'i': case 'f': aux+=7; break; + default: + hts_log_error("Unhandled type code for NM tag"); + return NULL; + } + continue; } - continue; } } @@ -2125,6 
+2284,8 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, k = kh_put(m_tagmap, c->tags_used, key, &r); if (-1 == r) return NULL; + else if (r != 0) + kh_val(c->tags_used, k) = NULL; if (r == 1) { khint_t k_global; @@ -2132,16 +2293,23 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, // Global tags_used for cram_metrics support pthread_mutex_lock(&fd->metrics_lock); k_global = kh_put(m_metrics, fd->tags_used, key, &r); - if (-1 == r) + if (-1 == r) { + pthread_mutex_unlock(&fd->metrics_lock); return NULL; - if (r == 1) + } + if (r == 1) { kh_val(fd->tags_used, k_global) = cram_new_metrics(); + if (!kh_val(fd->tags_used, k_global)) + goto err; + } pthread_mutex_unlock(&fd->metrics_lock); int i2[2] = {'\t',key}; size_t sk = key; cram_tag_map *m = calloc(1, sizeof(*m)); + if (!m) + goto_err; kh_val(c->tags_used, k) = m; cram_codec *c; @@ -2164,7 +2332,7 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, e.len_encoding = E_HUFFMAN; e.len_dat = NULL; memset(&st, 0, sizeof(st)); - cram_stats_add(&st, 1); + if (cram_stats_add(&st, 1) < 0) goto block_err; cram_stats_encoding(fd, &st); e.val_encoding = E_EXTERNAL; @@ -2184,7 +2352,7 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, e.len_encoding = E_HUFFMAN; e.len_dat = NULL; memset(&st, 0, sizeof(st)); - cram_stats_add(&st, 2); + if (cram_stats_add(&st, 2) < 0) goto block_err; cram_stats_encoding(fd, &st); e.val_encoding = E_EXTERNAL; @@ -2203,7 +2371,7 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, e.len_encoding = E_HUFFMAN; e.len_dat = NULL; memset(&st, 0, sizeof(st)); - cram_stats_add(&st, 4); + if (cram_stats_add(&st, 4) < 0) goto block_err; cram_stats_encoding(fd, &st); e.val_encoding = E_EXTERNAL; @@ -2240,6 +2408,9 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, c = NULL; } + if (!c) + goto_err; + m->codec = c; // Link to fd-global tag metrics 
@@ -2249,14 +2420,16 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, } cram_tag_map *tm = (cram_tag_map *)kh_val(c->tags_used, k); + if (!tm) goto_err; cram_codec *codec = tm->codec; + if (!tm->codec) goto_err; switch(aux[2]) { case 'A': case 'C': case 'c': if (!tm->blk) { if (!(tm->blk = cram_new_block(EXTERNAL, key))) return NULL; - codec->e_byte_array_len.val_codec->out = tm->blk; + codec->u.e_byte_array_len.val_codec->out = tm->blk; } aux+=3; @@ -2270,7 +2443,7 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, if (!tm->blk) { if (!(tm->blk = cram_new_block(EXTERNAL, key))) return NULL; - codec->e_byte_array_len.val_codec->out = tm->blk; + codec->u.e_byte_array_len.val_codec->out = tm->blk; } aux+=3; @@ -2283,7 +2456,7 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, if (!tm->blk) { if (!(tm->blk = cram_new_block(EXTERNAL, key))) return NULL; - codec->e_byte_array_len.val_codec->out = tm->blk; + codec->u.e_byte_array_len.val_codec->out = tm->blk; } aux+=3; @@ -2296,7 +2469,7 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, if (!tm->blk) { if (!(tm->blk = cram_new_block(EXTERNAL, key))) return NULL; - codec->e_byte_array_len.val_codec->out = tm->blk; + codec->u.e_byte_array_len.val_codec->out = tm->blk; } aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; @@ -2317,7 +2490,8 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, aux += 3; aux_s = aux; while (*aux++); - codec->encode(s, codec, aux_s, aux - aux_s); + if (codec->encode(s, codec, aux_s, aux - aux_s) < 0) + return NULL; } break; @@ -2330,8 +2504,8 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, if (!tm->blk) { if (!(tm->blk = cram_new_block(EXTERNAL, key))) return NULL; - codec->e_byte_array_len.len_codec->out = tm->blk; - codec->e_byte_array_len.val_codec->out = tm->blk; + codec->u.e_byte_array_len.len_codec->out = tm->blk; + 
codec->u.e_byte_array_len.val_codec->out = tm->blk; } // skip TN field @@ -2355,7 +2529,8 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, blen += 5; // sub-type & length - codec->encode(s, codec, aux, blen); + if (codec->encode(s, codec, aux, blen) < 0) + return NULL; aux += blen; break; } @@ -2386,9 +2561,15 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, } cr->TL = kh_val(c->comp_hdr->TD_hash, k); - cram_stats_add(c->stats[DS_TL], cr->TL); + if (cram_stats_add(c->stats[DS_TL], cr->TL) < 0) + goto block_err; + if (err) *err = 0; return rg; + + err: + block_err: + return NULL; } /* @@ -2444,7 +2625,7 @@ static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) { if (c->curr_slice == c->max_slice || (bam_ref(b) != c->curr_ref && !c->multi_seq)) { c->ref_seq_span = fd->last_base - c->ref_seq_start + 1; - hts_log_info("Flush container %d/%d..%d", + hts_log_info("Flush container %d/%"PRId64"..%"PRId64, c->ref_seq_id, c->ref_seq_start, c->ref_seq_start + c->ref_seq_span -1); @@ -2503,6 +2684,44 @@ static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) { return c; } +/* + * Convert a nibble encoded BAM sequence to a string of bases. + * + * We do this 2 bp at a time for speed. 
Equiv to: + * + * for (i = 0; i < len; i++) + * seq[i] = seq_nt16_str[bam_seqi(nib, i)]; + */ +static void nibble2base(uint8_t *nib, char *seq, int len) { + static const char code2base[512] = + "===A=C=M=G=R=S=V=T=W=Y=H=K=D=B=N" + "A=AAACAMAGARASAVATAWAYAHAKADABAN" + "C=CACCCMCGCRCSCVCTCWCYCHCKCDCBCN" + "M=MAMCMMMGMRMSMVMTMWMYMHMKMDMBMN" + "G=GAGCGMGGGRGSGVGTGWGYGHGKGDGBGN" + "R=RARCRMRGRRRSRVRTRWRYRHRKRDRBRN" + "S=SASCSMSGSRSSSVSTSWSYSHSKSDSBSN" + "V=VAVCVMVGVRVSVVVTVWVYVHVKVDVBVN" + "T=TATCTMTGTRTSTVTTTWTYTHTKTDTBTN" + "W=WAWCWMWGWRWSWVWTWWWYWHWKWDWBWN" + "Y=YAYCYMYGYRYSYVYTYWYYYHYKYDYBYN" + "H=HAHCHMHGHRHSHVHTHWHYHHHKHDHBHN" + "K=KAKCKMKGKRKSKVKTKWKYKHKKKDKBKN" + "D=DADCDMDGDRDSDVDTDWDYDHDKDDDBDN" + "B=BABCBMBGBRBSBVBTBWBYBHBKBDBBBN" + "N=NANCNMNGNRNSNVNTNWNYNHNKNDNBNN"; + + int i, len2 = len/2; + seq[0] = 0; + + for (i = 0; i < len2; i++) + // Note size_t cast helps gcc optimiser. + memcpy(&seq[i*2], &code2base[(size_t)nib[i]*2], 2); + + if ((i *= 2) < len) + seq[i] = seq_nt16_str[bam_seqi(nib, i)]; +} + /* * Converts a single bam record into a cram record. * Possibly used within a thread. @@ -2512,8 +2731,8 @@ static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) { */ static int process_one_read(cram_fd *fd, cram_container *c, cram_slice *s, cram_record *cr, - bam_seq_t *b, int rnum) { - int i, fake_qual = -1; + bam_seq_t *b, int rnum, kstring_t *MD) { + int i, fake_qual = -1, NM = 0; char *cp, *rg; char *ref, *seq, *qual; @@ -2530,11 +2749,18 @@ static int process_one_read(cram_fd *fd, cram_container *c, ref = c->ref; cr->flags = bam_flag(b); cr->len = bam_seq_len(b); + if (!bam_aux_get(b, "MD")) + MD = NULL; + else + MD->l = 0; //fprintf(stderr, "%s => %d\n", rg ? 
rg : "\"\"", cr->rg); - cr->ref_id = bam_ref(b); cram_stats_add(c->stats[DS_RI], cr->ref_id); - cram_stats_add(c->stats[DS_BF], fd->cram_flag_swap[cr->flags & 0xfff]); + cr->ref_id = bam_ref(b); + if (cram_stats_add(c->stats[DS_RI], cr->ref_id) < 0) + goto block_err; + if (cram_stats_add(c->stats[DS_BF], fd->cram_flag_swap[cr->flags & 0xfff]) < 0) + goto block_err; // Non reference based encoding means storing the bases verbatim as features, which in // turn means every base also has a quality already stored. @@ -2551,7 +2777,8 @@ static int process_one_read(cram_fd *fd, cram_container *c, if (cr->apos < s->last_apos) { c->pos_sorted = 0; } else { - cram_stats_add(c->stats[DS_AP], cr->apos - s->last_apos); + if (cram_stats_add(c->stats[DS_AP], cr->apos - s->last_apos) < 0) + goto block_err; s->last_apos = cr->apos; } } else { @@ -2568,60 +2795,11 @@ static int process_one_read(cram_fd *fd, cram_container *c, cr->qual = BLOCK_SIZE(s->qual_blk); BLOCK_GROW(s->seqs_blk, cr->len+1); BLOCK_GROW(s->qual_blk, cr->len); - seq = cp = (char *)BLOCK_END(s->seqs_blk); + // Convert BAM nibble encoded sequence to string of base pairs + seq = cp = (char *)BLOCK_END(s->seqs_blk); *seq = 0; -#if HTS_ALLOW_UNALIGNED != 0 - { - // Convert seq 2 bases at a time for speed. 
- static const uint16_t code2base[256] = { - 15677, 16701, 17213, 19773, 18237, 21053, 21309, 22077, - 21565, 22333, 22845, 18493, 19261, 17469, 16957, 20029, - 15681, 16705, 17217, 19777, 18241, 21057, 21313, 22081, - 21569, 22337, 22849, 18497, 19265, 17473, 16961, 20033, - 15683, 16707, 17219, 19779, 18243, 21059, 21315, 22083, - 21571, 22339, 22851, 18499, 19267, 17475, 16963, 20035, - 15693, 16717, 17229, 19789, 18253, 21069, 21325, 22093, - 21581, 22349, 22861, 18509, 19277, 17485, 16973, 20045, - 15687, 16711, 17223, 19783, 18247, 21063, 21319, 22087, - 21575, 22343, 22855, 18503, 19271, 17479, 16967, 20039, - 15698, 16722, 17234, 19794, 18258, 21074, 21330, 22098, - 21586, 22354, 22866, 18514, 19282, 17490, 16978, 20050, - 15699, 16723, 17235, 19795, 18259, 21075, 21331, 22099, - 21587, 22355, 22867, 18515, 19283, 17491, 16979, 20051, - 15702, 16726, 17238, 19798, 18262, 21078, 21334, 22102, - 21590, 22358, 22870, 18518, 19286, 17494, 16982, 20054, - 15700, 16724, 17236, 19796, 18260, 21076, 21332, 22100, - 21588, 22356, 22868, 18516, 19284, 17492, 16980, 20052, - 15703, 16727, 17239, 19799, 18263, 21079, 21335, 22103, - 21591, 22359, 22871, 18519, 19287, 17495, 16983, 20055, - 15705, 16729, 17241, 19801, 18265, 21081, 21337, 22105, - 21593, 22361, 22873, 18521, 19289, 17497, 16985, 20057, - 15688, 16712, 17224, 19784, 18248, 21064, 21320, 22088, - 21576, 22344, 22856, 18504, 19272, 17480, 16968, 20040, - 15691, 16715, 17227, 19787, 18251, 21067, 21323, 22091, - 21579, 22347, 22859, 18507, 19275, 17483, 16971, 20043, - 15684, 16708, 17220, 19780, 18244, 21060, 21316, 22084, - 21572, 22340, 22852, 18500, 19268, 17476, 16964, 20036, - 15682, 16706, 17218, 19778, 18242, 21058, 21314, 22082, - 21570, 22338, 22850, 18498, 19266, 17474, 16962, 20034, - 15694, 16718, 17230, 19790, 18254, 21070, 21326, 22094, - 21582, 22350, 22862, 18510, 19278, 17486, 16974, 20046 - }; - - int l2 = cr->len / 2; - unsigned char *from = (unsigned char *)bam_seq(b); - uint16_t *cpi = 
(uint16_t *)cp; - cp[0] = 0; - for (i = 0; i < l2; i++) - cpi[i] = le_int2(code2base[from[i]]); - if ((i *= 2) < cr->len) - cp[i] = seq_nt16_str[bam_seqi(bam_seq(b), i)]; - } -#else - for (i = 0; i < cr->len; i++) - cp[i] = seq_nt16_str[bam_seqi(bam_seq(b), i)]; -#endif + nibble2base(bam_seq(b), cp, cr->len); BLOCK_SIZE(s->seqs_blk) += cr->len; qual = cp = (char *)bam_qual(b); @@ -2629,7 +2807,8 @@ static int process_one_read(cram_fd *fd, cram_container *c, /* Copy and parse */ if (!(cr->flags & BAM_FUNMAP)) { uint32_t *cig_to, *cig_from; - int apos = cr->apos-1, spos = 0; + int64_t apos = cr->apos-1, spos = 0; + int64_t MD_last = apos; // last position of edit in MD tag cr->cigar = s->ncigar; cr->ncigar = bam_cigar_len(b); @@ -2673,9 +2852,20 @@ static int process_one_read(cram_fd *fd, cram_container *c, return -1; } for (l = 0; l < end; l++) { + // This case is just too disputed and different tools + // interpret these in different ways. We give up and + // store verbatim. if (rp[l] == 'N' && sp[l] == 'N') verbatim_NM = verbatim_MD = 1; if (rp[l] != sp[l]) { + // Build our own MD tag if one is on the sequence, so + // we can ensure it matches and thus can be discarded. 
+ if (MD && ref) { + if (kputuw(apos+l - MD_last, MD) < 0) goto err; + if (kputc(rp[l], MD) < 0) goto err; + MD_last = apos+l+1; + } + NM++; if (!sp[l]) break; if (0 && CRAM_MAJOR_VERS(fd->version) >= 3) { @@ -2740,15 +2930,27 @@ static int process_one_read(cram_fd *fd, cram_container *c, break; case BAM_CDEL: + if (MD && ref) { + if (kputuw(apos - MD_last, MD) < 0) goto err; + if (apos < c->ref_end) { + if (kputc_('^', MD) < 0) goto err; + if (kputsn(&ref[apos], MIN(c->ref_end - apos, cig_len), MD) < 0) + goto err; + } + } + NM += cig_len; + if (cram_add_deletion(c, s, cr, spos, cig_len, &seq[spos])) return -1; apos += cig_len; + MD_last = apos; break; case BAM_CREF_SKIP: if (cram_add_skip(c, s, cr, spos, cig_len, &seq[spos])) return -1; apos += cig_len; + MD_last += cig_len; break; case BAM_CINS: @@ -2762,6 +2964,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, } else { spos += cig_len; } + NM += cig_len; break; case BAM_CSOFT_CLIP: @@ -2807,7 +3010,11 @@ static int process_one_read(cram_fd *fd, cram_container *c, } fake_qual = spos; cr->aend = fd->no_ref ? 
apos : MIN(apos, c->ref_end); - cram_stats_add(c->stats[DS_FN], cr->nfeature); + if (cram_stats_add(c->stats[DS_FN], cr->nfeature) < 0) + goto block_err; + + if (MD && ref) + if (kputuw(apos - MD_last, MD) < 0) goto err; } else { // Unmapped cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES; @@ -2816,27 +3023,32 @@ static int process_one_read(cram_fd *fd, cram_container *c, cr->nfeature = 0; cr->aend = cr->apos; for (i = 0; i < cr->len; i++) - cram_stats_add(c->stats[DS_BA], seq[i]); + if (cram_stats_add(c->stats[DS_BA], seq[i]) < 0) + goto block_err; fake_qual = 0; } cr->ntags = 0; //cram_stats_add(c->stats[DS_TC], cr->ntags); + int err = 0; if (CRAM_MAJOR_VERS(fd->version) == 1) rg = cram_encode_aux_1_0(fd, b, c, s, cr); else - rg = cram_encode_aux(fd, b, c, s, cr, verbatim_NM, verbatim_MD); + rg = cram_encode_aux(fd, b, c, s, cr, verbatim_NM, verbatim_MD, NM, MD, &err); + if (err) + goto block_err; /* Read group, identified earlier */ if (rg) { - SAM_RG *brg = sam_hdr_find_rg(fd->header, rg); + sam_hrec_rg_t *brg = sam_hrecs_find_rg(fd->header->hrecs, rg); cr->rg = brg ? brg->id : -1; } else if (CRAM_MAJOR_VERS(fd->version) == 1) { - SAM_RG *brg = sam_hdr_find_rg(fd->header, "UNKNOWN"); + sam_hrec_rg_t *brg = sam_hrecs_find_rg(fd->header->hrecs, "UNKNOWN"); assert(brg); } else { cr->rg = -1; } - cram_stats_add(c->stats[DS_RG], cr->rg); + if (cram_stats_add(c->stats[DS_RG], cr->rg) < 0) + goto block_err; /* * Append to the qual block now. We do this here as @@ -2864,7 +3076,8 @@ static int process_one_read(cram_fd *fd, cram_container *c, cr->len = fake_qual >= 0 ? 
fake_qual : cr->aend - cr->apos + 1; } - cram_stats_add(c->stats[DS_RL], cr->len); + if (cram_stats_add(c->stats[DS_RL], cr->len) < 0) + goto block_err; /* Now we know apos and aend both, update mate-pair information */ { @@ -2891,7 +3104,8 @@ static int process_one_read(cram_fd *fd, cram_container *c, if (new == 0) { cram_record *p = &s->crecs[kh_val(s->pair[sec], k)]; - int aleft, aright, sign; + int64_t aleft, aright; + int sign; aleft = MIN(cr->apos, p->apos); aright = MAX(cr->aend, p->aend); @@ -2908,7 +3122,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, // This vs p: tlen, matepos, flags. Permit TLEN 0 and/or TLEN +/- // a small amount, if appropriate options set. if ((bam_ins_size(b) && - abs(bam_ins_size(b) - sign*(aright-aleft+1)) > fd->tlen_approx) || + llabs(bam_ins_size(b) - sign*(aright-aleft+1)) > fd->tlen_approx) || (!bam_ins_size(b) && !fd->tlen_zero)) goto detached; @@ -2930,7 +3144,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, !(fd->tlen_zero && p->ref_id == -1)) goto detached; - if ((p->tlen && abs(p->tlen - -sign*(aright-aleft+1)) > fd->tlen_approx) || + if ((p->tlen && llabs(p->tlen - -sign*(aright-aleft+1)) > fd->tlen_approx) || (!p->tlen && !fd->tlen_zero)) goto detached; @@ -2994,7 +3208,8 @@ static int process_one_read(cram_fd *fd, cram_container *c, // Clear detached from cr flags cr->cram_flags &= ~CRAM_FLAG_DETACHED; - cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK); + if (cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK) < 0) + goto block_err; // Clear detached from p flags and set downstream if (p->cram_flags & CRAM_FLAG_STATS_ADDED) { @@ -3004,10 +3219,12 @@ static int process_one_read(cram_fd *fd, cram_container *c, p->cram_flags &= ~CRAM_FLAG_DETACHED; p->cram_flags |= CRAM_FLAG_MATE_DOWNSTREAM; - cram_stats_add(c->stats[DS_CF], p->cram_flags & CRAM_FLAG_MASK); + if (cram_stats_add(c->stats[DS_CF], p->cram_flags & CRAM_FLAG_MASK) < 0) + goto block_err; p->mate_line 
= rnum - (kh_val(s->pair[sec], k) + 1); - cram_stats_add(c->stats[DS_NF], p->mate_line); + if (cram_stats_add(c->stats[DS_NF], p->mate_line) < 0) + goto block_err; kh_val(s->pair[sec], k) = rnum; } else { @@ -3021,24 +3238,30 @@ static int process_one_read(cram_fd *fd, cram_container *c, if (bam_flag(b) & BAM_FMREVERSE) cr->mate_flags |= CRAM_M_REVERSE; - cram_stats_add(c->stats[DS_MF], cr->mate_flags); + if (cram_stats_add(c->stats[DS_MF], cr->mate_flags) < 0) + goto block_err; cr->mate_pos = MAX(bam_mate_pos(b)+1, 0); - cram_stats_add(c->stats[DS_NP], cr->mate_pos); + if (cram_stats_add(c->stats[DS_NP], cr->mate_pos) < 0) + goto block_err; cr->tlen = bam_ins_size(b); - cram_stats_add(c->stats[DS_TS], cr->tlen); + if (cram_stats_add(c->stats[DS_TS], cr->tlen) < 0) + goto block_err; cr->cram_flags |= CRAM_FLAG_DETACHED; - cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK); - cram_stats_add(c->stats[DS_NS], bam_mate_ref(b)); + if (cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK) < 0) + goto block_err; + if (cram_stats_add(c->stats[DS_NS], bam_mate_ref(b)) < 0) + goto block_err; cr->cram_flags |= CRAM_FLAG_STATS_ADDED; } } cr->mqual = bam_map_qual(b); - cram_stats_add(c->stats[DS_MQ], cr->mqual); + if (cram_stats_add(c->stats[DS_MQ], cr->mqual) < 0) + goto block_err; cr->mate_ref_id = bam_mate_ref(b); @@ -3051,6 +3274,10 @@ static int process_one_read(cram_fd *fd, cram_container *c, } return 0; + + block_err: + err: + return -1; } /* @@ -3083,13 +3310,23 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { * * This option isn't available if we choose to embed references * since we can only have one per slice. + * + * The multi_seq var here refers to our intention for the next slice. + * This slice has already been encoded so we output as-is. 
*/ if (fd->multi_seq == -1 && c->curr_rec < c->max_rec/4+10 && fd->last_slice && fd->last_slice < c->max_rec/4+10 && !fd->embed_ref) { if (!c->multi_seq) - hts_log_info("Multi-ref enabled for this container"); + hts_log_info("Multi-ref enabled for next container"); multi_seq = 1; + } else if (fd->multi_seq == 1) { + pthread_mutex_lock(&fd->metrics_lock); + if (fd->last_RI_count <= c->max_slice && fd->multi_seq_user != 1) { + multi_seq = 0; + hts_log_info("Multi-ref disabled for next container"); + } + pthread_mutex_unlock(&fd->metrics_lock); } slice_rec = c->slice_rec; @@ -3114,7 +3351,12 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { * multiple sequences per container we emit the small partial * container as-is and then start a fresh one in a different mode. */ - if (multi_seq) { + if (multi_seq == 0 && fd->multi_seq == 1 && fd->multi_seq_user == -1) { + // User selected auto-mode, we're currently using multi-seq, but + // have detected we don't need to. Switch back to auto. + fd->multi_seq = -1; + } else if (multi_seq) { + // We detected we need multi-seq fd->multi_seq = 1; c->multi_seq = 1; c->pos_sorted = 0; // required atm for multi_seq slices @@ -3163,18 +3405,23 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { free(spare); } else { c->bams = calloc(c->max_c_rec, sizeof(bam_seq_t *)); - if (!c->bams) + if (!c->bams) { + pthread_mutex_unlock(&fd->bam_list_lock); return -1; + } } pthread_mutex_unlock(&fd->bam_list_lock); } /* Copy or alloc+copy the bam record, for later encoding */ - if (c->bams[c->curr_c_rec]) - bam_copy1(c->bams[c->curr_c_rec], b); - else - c->bams[c->curr_c_rec] = bam_dup(b); - + if (c->bams[c->curr_c_rec]) { + if (bam_copy1(c->bams[c->curr_c_rec], b) == NULL) + return -1; + } else { + c->bams[c->curr_c_rec] = bam_dup1(b); + if (c->bams[c->curr_c_rec] == NULL) + return -1; + } c->curr_rec++; c->curr_c_rec++; c->s_num_bases += bam_seq_len(b); diff --git a/cram/cram_encode.h b/cram/cram_encode.h index 1b1b0c385..c779b46a7 100644 --- 
a/cram/cram_encode.h +++ b/cram/cram_encode.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2013 Genome Research Ltd. +Copyright (c) 2012-2013, 2018 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -38,8 +38,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * cram_codecs.[ch] for the actual encoding functions themselves. */ -#ifndef _CRAM_WRITE_H_ -#define _CRAM_WRITE_H_ +#ifndef CRAM_ENCODE_H +#define CRAM_ENCODE_H #ifdef __cplusplus extern "C" { diff --git a/cram/cram_external.c b/cram/cram_external.c index b55c83dda..664dfa69b 100644 --- a/cram/cram_external.c +++ b/cram/cram_external.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2015 Genome Research Ltd. +Copyright (c) 2015, 2018-2019 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * containers and blocks in a robust manner. */ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include "htslib/hfile.h" @@ -47,8 +48,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*----------------------------------------------------------------------------- * cram_fd */ -SAM_hdr *cram_fd_get_header(cram_fd *fd) { return fd->header; } -void cram_fd_set_header(cram_fd *fd, SAM_hdr *hdr) { fd->header = hdr; } +sam_hdr_t *cram_fd_get_header(cram_fd *fd) { return fd->header; } +void cram_fd_set_header(cram_fd *fd, sam_hdr_t *hdr) { fd->header = hdr; } int cram_fd_get_version(cram_fd *fd) { return fd->version; } void cram_fd_set_version(cram_fd *fd, int vers) { fd->version = vers; } @@ -130,15 +131,15 @@ static int cram_block_compression_hdr_set_DS(cram_block_compression_hdr *ch, switch (ch->codecs[ds]->codec) { case E_HUFFMAN: - if (ch->codecs[ds]->huffman.ncodes != 1) + if (ch->codecs[ds]->u.huffman.ncodes != 1) return -1; - ch->codecs[ds]->huffman.codes[0].symbol = new_rg; + ch->codecs[ds]->u.huffman.codes[0].symbol = new_rg; return 0; case E_BETA: - if (ch->codecs[ds]->beta.nbits != 0) + if (ch->codecs[ds]->u.beta.nbits != 0) return -1; - ch->codecs[ds]->beta.offset = -new_rg; + ch->codecs[ds]->u.beta.offset = -new_rg; return 0; default: @@ -209,9 +210,12 @@ void cram_block_set_crc32(cram_block *b, int32_t crc) { b->crc32 = crc; } void cram_block_set_data(cram_block *b, void *data) { BLOCK_DATA(b) = data; } void cram_block_set_size(cram_block *b, int32_t size) { BLOCK_SIZE(b) = size; } -int cram_block_append(cram_block *b, void *data, int size) { +int cram_block_append(cram_block *b, const void *data, int size) { BLOCK_APPEND(b, data, size); - return BLOCK_DATA(b) ? 0 : -1; // It'll do for now... + return 0; + + block_err: + return -1; } void cram_block_update_size(cram_block *b) { BLOCK_UPLEN(b); } diff --git a/cram/cram_index.c b/cram/cram_index.c index 1a52b5701..4aae52742 100644 --- a/cram/cram_index.c +++ b/cram/cram_index.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2013-2014 Genome Research Ltd. +Copyright (c) 2013-2019 Genome Research Ltd. 
Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -48,11 +48,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * binary search to find the first range which overlaps any given coordinate. */ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include #include #include +#include #include #include #include @@ -98,8 +100,10 @@ static int kget_int32(kstring_t *k, size_t *pos, int32_t *val_p) { if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9')) return -1; - while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9') - val = val*10 + k->s[p++]-'0'; + while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9') { + int digit = k->s[p++]-'0'; + val = val*10 + digit; + } *pos = p; *val_p = sign*val; @@ -121,8 +125,10 @@ static int kget_int64(kstring_t *k, size_t *pos, int64_t *val_p) { if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9')) return -1; - while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9') - val = val*10 + k->s[p++]-'0'; + while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9') { + int digit = k->s[p++]-'0'; + val = val*10 + digit; + } *pos = p; *val_p = sign*val; @@ -137,7 +143,8 @@ static int kget_int64(kstring_t *k, size_t *pos, int64_t *val_p) { * -1 for failure */ int cram_index_load(cram_fd *fd, const char *fn, const char *fn_idx) { - char *fn2 = NULL; + + char *tfn_idx = NULL; char buf[65536]; ssize_t len; kstring_t kstr = {0}; @@ -167,15 +174,18 @@ int cram_index_load(cram_fd *fd, const char *fn, const char *fn_idx) { idx_stack[idx_stack_ptr] = idx; if (!fn_idx) { - fn2 = hts_idx_getfn(fn, ".crai"); - if (!fn2) - goto fail; + if (hts_idx_check_local(fn, HTS_FMT_CRAI, &tfn_idx) == 0 && hisremote(fn)) + tfn_idx = hts_idx_getfn(fn, ".crai"); - fn_idx = fn2; + if (!tfn_idx) { + hts_log_error("Could not retrieve index file for '%s'", fn); + goto fail; + } + fn_idx = tfn_idx; } if (!(fp = hopen(fn_idx, "r"))) { - perror(fn_idx); + hts_log_error("Could not open index file 
'%s'", fn_idx); goto fail; } @@ -229,7 +239,7 @@ int cram_index_load(cram_fd *fd, const char *fn, const char *fn_idx) { goto fail; e.end += e.start-1; - //printf("%d/%d..%d\n", e.refid, e.start, e.end); + //printf("%d/%d..%d-offset=%" PRIu64 ",len=%d,slice=%d\n", e.refid, e.start, e.end, e.offset, e.len, e.slice); if (e.refid < -1) { hts_log_error("Malformed index file, refid %d", e.refid); @@ -296,7 +306,7 @@ int cram_index_load(cram_fd *fd, const char *fn, const char *fn_idx) { free(idx_stack); free(kstr.s); - free(fn2); + free(tfn_idx); // dump_index(fd); @@ -305,7 +315,7 @@ int cram_index_load(cram_fd *fd, const char *fn, const char *fn_idx) { fail: free(kstr.s); free(idx_stack); - free(fn2); + free(tfn_idx); cram_index_free(fd); // Also sets fd->index = NULL return -1; } @@ -364,6 +374,7 @@ cram_index *cram_index_query(cram_fd *fd, int refid, int pos, case HTS_IDX_NOCOOR: refid = -1; + pos = 0; break; case HTS_IDX_START: { @@ -491,8 +502,12 @@ int cram_seek_to_refpos(cram_fd *fd, cram_range *r) { pthread_mutex_lock(&fd->range_lock); fd->range = *r; - if (r->refid == HTS_IDX_START || r->refid == HTS_IDX_REST) + if (r->refid == HTS_IDX_NOCOOR) { + fd->range.refid = -1; + fd->range.start = 0; + } else if (r->refid == HTS_IDX_START || r->refid == HTS_IDX_REST) { fd->range.refid = -2; // special case in cram_next_slice + } pthread_mutex_unlock(&fd->range_lock); if (fd->ctr) { @@ -524,6 +539,7 @@ int cram_seek_to_refpos(cram_fd *fd, cram_range *r) { * * Returns 0 on success * -1 on read failure + * -2 on wrong sort order * -4 on write failure */ static int cram_index_build_multiref(cram_fd *fd, @@ -533,14 +549,27 @@ static int cram_index_build_multiref(cram_fd *fd, off_t cpos, int32_t landmark, int sz) { - int i, ref = -2, ref_start = 0, ref_end; + int i, ref = -2; + int64_t ref_start = 0, ref_end; char buf[1024]; - if (0 != cram_decode_slice(fd, c, s, fd->header)) - return -1; + if (fd->mode != 'w') { + if (0 != cram_decode_slice(fd, c, s, fd->header)) + return -1; 
+ } ref_end = INT_MIN; + + int32_t last_ref = -9; + int32_t last_pos = -9; for (i = 0; i < s->hdr->num_records; i++) { + if (s->crecs[i].ref_id == last_ref && s->crecs[i].apos < last_pos) { + hts_log_error("CRAM file is not sorted by chromosome / position"); + return -2; + } + last_ref = s->crecs[i].ref_id; + last_pos = s->crecs[i].apos; + if (s->crecs[i].ref_id == ref) { if (ref_end < s->crecs[i].aend) ref_end = s->crecs[i].aend; @@ -548,7 +577,7 @@ static int cram_index_build_multiref(cram_fd *fd, } if (ref != -2) { - sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n", + sprintf(buf, "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n", ref, ref_start, ref_end - ref_start + 1, (int64_t)cpos, landmark, sz); if (bgzf_write(fp, buf, strlen(buf)) < 0) @@ -561,7 +590,7 @@ static int cram_index_build_multiref(cram_fd *fd, } if (ref != -2) { - sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n", + sprintf(buf, "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n", ref, ref_start, ref_end - ref_start + 1, (int64_t)cpos, landmark, sz); if (bgzf_write(fp, buf, strlen(buf)) < 0) @@ -571,6 +600,80 @@ static int cram_index_build_multiref(cram_fd *fd, return 0; } +/* + * Adds a single slice to the index. + */ +int cram_index_slice(cram_fd *fd, + cram_container *c, + cram_slice *s, + BGZF *fp, + off_t cpos, + off_t spos, // relative to cpos + off_t sz) { + int ret; + char buf[1024]; + + if (sz > INT_MAX) { + hts_log_error("CRAM slice is too big (%"PRId64" bytes)", + (int64_t) sz); + return -1; + } + + if (s->hdr->ref_seq_id == -2) { + ret = cram_index_build_multiref(fd, c, s, fp, cpos, spos, sz); + } else { + sprintf(buf, "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n", + s->hdr->ref_seq_id, s->hdr->ref_seq_start, + s->hdr->ref_seq_span, (int64_t)cpos, (int)spos, (int)sz); + ret = (bgzf_write(fp, buf, strlen(buf)) >= 0)? 0 : -4; + } + + return ret; +} + +/* + * Adds a single container to the index. 
+ */ +static +int cram_index_container(cram_fd *fd, + cram_container *c, + BGZF *fp, + off_t cpos) { + int j; + off_t spos; + + // 2.0 format + for (j = 0; j < c->num_landmarks; j++) { + cram_slice *s; + off_t sz; + int ret; + + spos = htell(fd->fp); + if (spos - cpos - c->offset != c->landmark[j]) { + hts_log_error("CRAM slice offset %"PRId64" does not match" + " landmark %d in container header (%d)", + spos - cpos - c->offset, j, c->landmark[j]); + return -1; + } + + if (!(s = cram_read_slice(fd))) { + return -1; + } + + sz = htell(fd->fp) - spos; + ret = cram_index_slice(fd, c, s, fp, cpos, c->landmark[j], sz); + + cram_free_slice(s); + + if (ret < 0) { + return ret; + } + } + + return 0; +} + + /* * Builds an index file. * @@ -584,9 +687,10 @@ static int cram_index_build_multiref(cram_fd *fd, */ int cram_index_build(cram_fd *fd, const char *fn_base, const char *fn_idx) { cram_container *c; - off_t cpos, spos, hpos; + off_t cpos, hpos; BGZF *fp; kstring_t fn_idx_str = {0}; + int64_t last_ref = -9, last_start = -9; // Useful for cram_index_build_multiref cram_set_option(fd, CRAM_OPT_REQUIRED_FIELDS, SAM_RNAME | SAM_POS | SAM_CIGAR); @@ -607,8 +711,6 @@ int cram_index_build(cram_fd *fd, const char *fn_base, const char *fn_idx) { cpos = htell(fd->fp); while ((c = cram_read_container(fd))) { - int j; - if (fd->err) { perror("Cram container read"); return -1; @@ -624,39 +726,16 @@ int cram_index_build(cram_fd *fd, const char *fn_base, const char *fn_idx) { if (!c->comp_hdr) return -1; - // 2.0 format - for (j = 0; j < c->num_landmarks; j++) { - char buf[1024]; - cram_slice *s; - int sz, ret; - - spos = htell(fd->fp); - assert(spos - cpos - c->offset == c->landmark[j]); - - if (!(s = cram_read_slice(fd))) { - bgzf_close(fp); - return -1; - } - - sz = (int)(htell(fd->fp) - spos); - - if (s->hdr->ref_seq_id == -2) { - ret = cram_index_build_multiref(fd, c, s, fp, - cpos, c->landmark[j], sz); - } else { - sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n", - 
s->hdr->ref_seq_id, s->hdr->ref_seq_start, - s->hdr->ref_seq_span, (int64_t)cpos, - c->landmark[j], sz); - ret = (bgzf_write(fp, buf, strlen(buf)) >= 0)? 0 : -4; - } - - cram_free_slice(s); + if (c->ref_seq_id == last_ref && c->ref_seq_start < last_start) { + hts_log_error("CRAM file is not sorted by chromosome / position"); + return -2; + } + last_ref = c->ref_seq_id; + last_start = c->ref_seq_start; - if (ret < 0) { - bgzf_close(fp); - return ret; - } + if (cram_index_container(fd, c, fp, cpos) < 0) { + bgzf_close(fp); + return -1; } cpos = htell(fd->fp); diff --git a/cram/cram_index.h b/cram/cram_index.h index f5c0f7dfc..a3a8050cc 100644 --- a/cram/cram_index.h +++ b/cram/cram_index.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2013 Genome Research Ltd. +Copyright (c) 2013, 2018 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -28,8 +28,8 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef _CRAM_INDEX_H_ -#define _CRAM_INDEX_H_ +#ifndef CRAM_INDEX_H +#define CRAM_INDEX_H #ifdef __cplusplus extern "C" { @@ -93,6 +93,20 @@ int cram_seek_to_refpos(cram_fd *fd, cram_range *r); */ int cram_index_build(cram_fd *fd, const char *fn_base, const char *fn_idx); +/* + * Adds a single slice to the index. + * + * Returns 0 on success, + * -1 on failure + */ +int cram_index_slice(cram_fd *fd, + cram_container *c, + cram_slice *s, + BGZF *fp, + off_t cpos, + off_t spos, // relative to cpos + off_t sz); + #ifdef __cplusplus } #endif diff --git a/cram/cram_io.c b/cram/cram_io.c index 6bbffc9e9..261342c41 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2016 Genome Research Ltd. +Copyright (c) 2012-2019 Genome Research Ltd. 
Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -39,11 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - Reference sequence handling */ -/* - * TODO: BLOCK_GROW, BLOCK_RESIZE, BLOCK_APPEND and itf8_put_blk all need - * a way to return errors for when malloc fails. - */ - +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include @@ -204,6 +200,11 @@ int itf8_decode_crc(cram_fd *fd, int32_t *val_p, uint32_t *crc) { int i = nbytes[val>>4]; val &= nbits[val>>4]; + if (i > 0) { + if (hread(fd->fp, &c[1], i) < i) + return -1; + } + switch(i) { case 0: *val_p = val; @@ -211,22 +212,22 @@ int itf8_decode_crc(cram_fd *fd, int32_t *val_p, uint32_t *crc) { return 1; case 1: - val = (val<<8) | (c[1]=hgetc(fd->fp)); + val = (val<<8) | c[1]; *val_p = val; *crc = crc32(*crc, c, 2); return 2; case 2: - val = (val<<8) | (c[1]=hgetc(fd->fp)); - val = (val<<8) | (c[2]=hgetc(fd->fp)); + val = (val<<8) | c[1]; + val = (val<<8) | c[2]; *val_p = val; *crc = crc32(*crc, c, 3); return 3; case 3: - val = (val<<8) | (c[1]=hgetc(fd->fp)); - val = (val<<8) | (c[2]=hgetc(fd->fp)); - val = (val<<8) | (c[3]=hgetc(fd->fp)); + val = (val<<8) | c[1]; + val = (val<<8) | c[2]; + val = (val<<8) | c[3]; *val_p = val; *crc = crc32(*crc, c, 4); return 4; @@ -234,11 +235,12 @@ int itf8_decode_crc(cram_fd *fd, int32_t *val_p, uint32_t *crc) { case 4: // really 3.5 more, why make it different? { uint32_t uv = val; - uv = (uv<<8) | (c[1]=hgetc(fd->fp)); - uv = (uv<<8) | (c[2]=hgetc(fd->fp)); - uv = (uv<<8) | (c[3]=hgetc(fd->fp)); - uv = (uv<<4) | (((c[4]=hgetc(fd->fp))) & 0x0f); - *val_p = uv < 0x80000000UL ? uv : -((int32_t) (0xffffffffUL - uv)) - 1; + uv = (uv<<8) | c[1]; + uv = (uv<<8) | c[2]; + uv = (uv<<8) | c[3]; + uv = (uv<<4) | (c[4] & 0x0f); + // Avoid implementation-defined behaviour on negative values + *val_p = uv < 0x80000000UL ? 
(int32_t) uv : -((int32_t) (0xffffffffUL - uv)) - 1; *crc = crc32(*crc, c, 5); } } @@ -372,8 +374,8 @@ int ltf8_decode(cram_fd *fd, int64_t *val_p) { int ltf8_decode_crc(cram_fd *fd, int64_t *val_p, uint32_t *crc) { unsigned char c[9]; - int64_t val = (unsigned char)hgetc(fd->fp); - if (val == -1) + int64_t val = hgetc(fd->fp); + if (val < 0) return -1; c[0] = val; @@ -384,79 +386,99 @@ int ltf8_decode_crc(cram_fd *fd, int64_t *val_p, uint32_t *crc) { return 1; } else if (val < 0xc0) { - val = (val<<8) | (c[1]=hgetc(fd->fp));; + int v = hgetc(fd->fp); + if (v < 0) + return -1; + val = (val<<8) | (c[1]=v); *val_p = val & (((1LL<<(6+8)))-1); *crc = crc32(*crc, c, 2); return 2; } else if (val < 0xe0) { - val = (val<<8) | (c[1]=hgetc(fd->fp));; - val = (val<<8) | (c[2]=hgetc(fd->fp));; + if (hread(fd->fp, &c[1], 2) < 2) + return -1; + val = (val<<8) | c[1]; + val = (val<<8) | c[2]; *val_p = val & ((1LL<<(5+2*8))-1); *crc = crc32(*crc, c, 3); return 3; } else if (val < 0xf0) { - val = (val<<8) | (c[1]=hgetc(fd->fp));; - val = (val<<8) | (c[2]=hgetc(fd->fp));; - val = (val<<8) | (c[3]=hgetc(fd->fp));; + if (hread(fd->fp, &c[1], 3) < 3) + return -1; + val = (val<<8) | c[1]; + val = (val<<8) | c[2]; + val = (val<<8) | c[3]; *val_p = val & ((1LL<<(4+3*8))-1); *crc = crc32(*crc, c, 4); return 4; } else if (val < 0xf8) { - val = (val<<8) | (c[1]=hgetc(fd->fp));; - val = (val<<8) | (c[2]=hgetc(fd->fp));; - val = (val<<8) | (c[3]=hgetc(fd->fp));; - val = (val<<8) | (c[4]=hgetc(fd->fp));; + if (hread(fd->fp, &c[1], 4) < 4) + return -1; + val = (val<<8) | c[1]; + val = (val<<8) | c[2]; + val = (val<<8) | c[3]; + val = (val<<8) | c[4]; *val_p = val & ((1LL<<(3+4*8))-1); *crc = crc32(*crc, c, 5); return 5; } else if (val < 0xfc) { - val = (val<<8) | (c[1]=hgetc(fd->fp));; - val = (val<<8) | (c[2]=hgetc(fd->fp));; - val = (val<<8) | (c[3]=hgetc(fd->fp));; - val = (val<<8) | (c[4]=hgetc(fd->fp));; - val = (val<<8) | (c[5]=hgetc(fd->fp));; + if (hread(fd->fp, &c[1], 5) < 5) + return 
-1; + val = (val<<8) | c[1]; + val = (val<<8) | c[2]; + val = (val<<8) | c[3]; + val = (val<<8) | c[4]; + val = (val<<8) | c[5]; *val_p = val & ((1LL<<(2+5*8))-1); *crc = crc32(*crc, c, 6); return 6; } else if (val < 0xfe) { - val = (val<<8) | (c[1]=hgetc(fd->fp));; - val = (val<<8) | (c[2]=hgetc(fd->fp));; - val = (val<<8) | (c[3]=hgetc(fd->fp));; - val = (val<<8) | (c[4]=hgetc(fd->fp));; - val = (val<<8) | (c[5]=hgetc(fd->fp));; - val = (val<<8) | (c[6]=hgetc(fd->fp));; + if (hread(fd->fp, &c[1], 6) < 6) + return -1; + val = (val<<8) | c[1]; + val = (val<<8) | c[2]; + val = (val<<8) | c[3]; + val = (val<<8) | c[4]; + val = (val<<8) | c[5]; + val = (val<<8) | c[6]; *val_p = val & ((1LL<<(1+6*8))-1); *crc = crc32(*crc, c, 7); return 7; } else if (val < 0xff) { - val = (val<<8) | (c[1]=hgetc(fd->fp));; - val = (val<<8) | (c[2]=hgetc(fd->fp));; - val = (val<<8) | (c[3]=hgetc(fd->fp));; - val = (val<<8) | (c[4]=hgetc(fd->fp));; - val = (val<<8) | (c[5]=hgetc(fd->fp));; - val = (val<<8) | (c[6]=hgetc(fd->fp));; - val = (val<<8) | (c[7]=hgetc(fd->fp));; - *val_p = val & ((1LL<<(7*8))-1); + uint64_t uval = val; + if (hread(fd->fp, &c[1], 7) < 7) + return -1; + uval = (uval<<8) | c[1]; + uval = (uval<<8) | c[2]; + uval = (uval<<8) | c[3]; + uval = (uval<<8) | c[4]; + uval = (uval<<8) | c[5]; + uval = (uval<<8) | c[6]; + uval = (uval<<8) | c[7]; + *val_p = uval & ((1ULL<<(7*8))-1); *crc = crc32(*crc, c, 8); return 8; } else { - val = (val<<8) | (c[1]=hgetc(fd->fp));; - val = (val<<8) | (c[2]=hgetc(fd->fp));; - val = (val<<8) | (c[3]=hgetc(fd->fp));; - val = (val<<8) | (c[4]=hgetc(fd->fp));; - val = (val<<8) | (c[5]=hgetc(fd->fp));; - val = (val<<8) | (c[6]=hgetc(fd->fp));; - val = (val<<8) | (c[7]=hgetc(fd->fp));; - val = (val<<8) | (c[8]=hgetc(fd->fp));; + uint64_t uval; + if (hread(fd->fp, &c[1], 8) < 8) + return -1; + uval = c[1]; + uval = (uval<<8) | c[2]; + uval = (uval<<8) | c[3]; + uval = (uval<<8) | c[4]; + uval = (uval<<8) | c[5]; + uval = (uval<<8) | c[6]; + uval 
= (uval<<8) | c[7]; + uval = (uval<<8) | c[8]; *crc = crc32(*crc, c, 9); - *val_p = val; + // Avoid implementation-defined behaviour on negative values + *val_p = c[1] < 0x80 ? (int64_t) uval : -((int64_t) (0xffffffffffffffffULL - uval)) - 1; } return 9; @@ -469,13 +491,28 @@ int ltf8_decode_crc(cram_fd *fd, int64_t *val_p, uint32_t *crc) { * * Returns the number of bytes written */ -int itf8_put_blk(cram_block *blk, int val) { +int itf8_put_blk(cram_block *blk, int32_t val) { char buf[5]; int sz; sz = itf8_put(buf, val); BLOCK_APPEND(blk, buf, sz); return sz; + + block_err: + return -1; +} + +int ltf8_put_blk(cram_block *blk, int64_t val) { + char buf[9]; + int sz; + + sz = ltf8_put(buf, val); + BLOCK_APPEND(blk, buf, sz); + return sz; + + block_err: + return -1; } /* @@ -484,7 +521,7 @@ int itf8_put_blk(cram_block *blk, int val) { * Returns the number of bytes read on success * -1 on failure */ -int int32_decode(cram_fd *fd, int32_t *val) { +static int int32_decode(cram_fd *fd, int32_t *val) { int32_t i; if (4 != hread(fd->fp, &i, 4)) return -1; @@ -499,7 +536,7 @@ int int32_decode(cram_fd *fd, int32_t *val) { * Returns the number of bytes written on success * -1 on failure */ -int int32_encode(cram_fd *fd, int32_t val) { +static int int32_encode(cram_fd *fd, int32_t val) { val = le_int4(val); if (4 != hwrite(fd->fp, &val, 4)) return -1; @@ -512,11 +549,13 @@ int int32_get_blk(cram_block *b, int32_t *val) { if (b->uncomp_size - BLOCK_SIZE(b) < 4) return -1; - *val = - b->data[b->byte ] | - (b->data[b->byte+1] << 8) | - (b->data[b->byte+2] << 16) | - (b->data[b->byte+3] << 24); + uint32_t v = + ((uint32_t) b->data[b->byte ]) | + (((uint32_t) b->data[b->byte+1]) << 8) | + (((uint32_t) b->data[b->byte+2]) << 16) | + (((uint32_t) b->data[b->byte+3]) << 24); + // Avoid implementation-defined behaviour on negative values + *val = v < 0x80000000U ? 
(int32_t) v : -((int32_t) (0xffffffffU - v)) - 1; BLOCK_SIZE(b) += 4; return 4; } @@ -524,13 +563,17 @@ int int32_get_blk(cram_block *b, int32_t *val) { /* As int32_decoded/encode, but from/to blocks instead of cram_fd */ int int32_put_blk(cram_block *b, int32_t val) { unsigned char cp[4]; - cp[0] = ( val & 0xff); - cp[1] = ((val>>8) & 0xff); - cp[2] = ((val>>16) & 0xff); - cp[3] = ((val>>24) & 0xff); + uint32_t v = val; + cp[0] = ( v & 0xff); + cp[1] = ((v>>8) & 0xff); + cp[2] = ((v>>16) & 0xff); + cp[3] = ((v>>24) & 0xff); BLOCK_APPEND(b, cp, 4); - return b->data ? 0 : -1; + return 0; + + block_err: + return -1; } /* ---------------------------------------------------------------------- @@ -580,8 +623,8 @@ char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) { if (err != Z_OK) { hts_log_error("Call to zlib inflate failed: %s", s.msg); - if (data) - free(data); + free(data); + inflateEnd(&s); return NULL; } @@ -590,6 +633,7 @@ char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) { data = realloc((data_tmp = data), data_alloc += alloc_inc); if (!data) { free(data_tmp); + inflateEnd(&s); return NULL; } s.avail_out += alloc_inc; @@ -693,7 +737,7 @@ static char *lzma_mem_deflate(char *data, size_t size, size_t *cdata_size, static char *lzma_mem_inflate(char *cdata, size_t csize, size_t *size) { lzma_stream strm = LZMA_STREAM_INIT; size_t out_size = 0, out_pos = 0; - char *out = NULL; + char *out = NULL, *new_out; int r; /* Initiate the decoder */ @@ -707,7 +751,10 @@ static char *lzma_mem_inflate(char *cdata, size_t csize, size_t *size) { for (;strm.avail_in;) { if (strm.avail_in > out_size - out_pos) { out_size += strm.avail_in * 4 + 32768; - out = realloc(out, out_size); + new_out = realloc(out, out_size); + if (!new_out) + goto fail; + out = new_out; } strm.avail_out = out_size - out_pos; strm.next_out = (uint8_t *)&out[out_pos]; @@ -715,7 +762,7 @@ static char *lzma_mem_inflate(char *cdata, size_t csize, size_t *size) { r = lzma_code(&strm, 
LZMA_RUN); if (LZMA_OK != r && LZMA_STREAM_END != r) { hts_log_error("LZMA decode failure (error %d)", r); - return NULL; + goto fail; } out_pos = strm.total_out; @@ -731,12 +778,19 @@ static char *lzma_mem_inflate(char *cdata, size_t csize, size_t *size) { return NULL; } - out = realloc(out, strm.total_out); + new_out = realloc(out, strm.total_out > 0 ? strm.total_out : 1); + if (new_out) + out = new_out; *size = strm.total_out; lzma_end(&strm); return out; + + fail: + lzma_end(&strm); + free(out); + return NULL; } #endif @@ -811,7 +865,10 @@ cram_block *cram_read_block(cram_fd *fd) { return NULL; } } else { - if (b->comp_size < 0) { free(b); return NULL; } + if (b->comp_size < 0 || b->uncomp_size < 0) { + free(b); + return NULL; + } b->alloc = b->comp_size; if (!(b->data = malloc(b->comp_size))) { free(b); return NULL; } if (b->comp_size != hread(fd->fp, b->data, b->comp_size)) { @@ -823,6 +880,7 @@ cram_block *cram_read_block(cram_fd *fd) { if (CRAM_MAJOR_VERS(fd->version) >= 3) { if (-1 == int32_decode(fd, (int32_t *)&b->crc32)) { + free(b->data); free(b); return NULL; } @@ -939,6 +997,7 @@ int cram_uncompress_block(cram_block *b) { b->method = RAW; return 0; } + assert(b->uncomp_size >= 0); // cram_read_block should ensure this switch (b->method) { case RAW: @@ -948,7 +1007,7 @@ int cram_uncompress_block(cram_block *b) { uncomp = zlib_mem_inflate((char *)b->data, b->comp_size, &uncomp_size); if (!uncomp) return -1; - if ((int)uncomp_size != b->uncomp_size) { + if (uncomp_size != b->uncomp_size) { free(uncomp); return -1; } @@ -987,8 +1046,10 @@ int cram_uncompress_block(cram_block *b) { uncomp = lzma_mem_inflate((char *)b->data, b->comp_size, &uncomp_size); if (!uncomp) return -1; - if ((int)uncomp_size != b->uncomp_size) + if (uncomp_size != b->uncomp_size) { + free(uncomp); return -1; + } free(b->data); b->data = (unsigned char *)uncomp; b->alloc = uncomp_size; @@ -1004,8 +1065,12 @@ int cram_uncompress_block(cram_block *b) { case RANS: { unsigned int usize 
= b->uncomp_size, usize2; uncomp = (char *)rans_uncompress(b->data, b->comp_size, &usize2); - if (!uncomp || usize != usize2) + if (!uncomp) return -1; + if (usize != usize2) { + free(uncomp); + return -1; + } free(b->data); b->data = (unsigned char *)uncomp; b->alloc = usize2; @@ -1620,7 +1685,7 @@ static BGZF *bgzf_open_ref(char *fn, char *mode, int is_md5) { * Returns the refs_t struct on success (maybe newly allocated); * NULL on failure */ -static refs_t *refs_load_fai(refs_t *r_orig, char *fn, int is_err) { +static refs_t *refs_load_fai(refs_t *r_orig, const char *fn, int is_err) { struct stat sb; FILE *fp = NULL; char fai_fn[PATH_MAX]; @@ -1765,14 +1830,14 @@ static refs_t *refs_load_fai(refs_t *r_orig, char *fn, int is_err) { static void sanitise_SQ_lines(cram_fd *fd) { int i; - if (!fd->header) + if (!fd->header || !fd->header->hrecs) return; if (!fd->refs || !fd->refs->h_meta) return; - for (i = 0; i < fd->header->nref; i++) { - char *name = fd->header->ref[i].name; + for (i = 0; i < fd->header->hrecs->nref; i++) { + const char *name = fd->header->hrecs->ref[i].name; khint_t k = kh_get(refs, fd->refs->h_meta, name); ref_entry *r; @@ -1784,17 +1849,17 @@ static void sanitise_SQ_lines(cram_fd *fd) { if (!(r = (ref_entry *)kh_val(fd->refs->h_meta, k))) continue; - if (r->length && r->length != fd->header->ref[i].len) { - assert(strcmp(r->name, fd->header->ref[i].name) == 0); + if (r->length && r->length != fd->header->hrecs->ref[i].len) { + assert(strcmp(r->name, fd->header->hrecs->ref[i].name) == 0); // Should we also check MD5sums here to ensure the correct // reference was given? - hts_log_warning("Header @SQ length mismatch for ref %s, %d vs %d", - r->name, fd->header->ref[i].len, (int)r->length); + hts_log_warning("Header @SQ length mismatch for ref %s, %"PRIhts_pos" vs %d", + r->name, fd->header->hrecs->ref[i].len, (int)r->length); // Fixing the parsed @SQ header will make MD:Z: strings work // and also stop it producing N for the sequence. 
- fd->header->ref[i].len = r->length; + fd->header->hrecs->ref[i].len = r->length; } } } @@ -1806,8 +1871,9 @@ static void sanitise_SQ_lines(cram_fd *fd) { * Returns 0 on success * -1 on failure */ -int refs2id(refs_t *r, SAM_hdr *h) { +int refs2id(refs_t *r, sam_hdr_t *hdr) { int i; + sam_hrecs_t *h = hdr->hrecs; if (r->ref_id) free(r->ref_id); @@ -1836,29 +1902,43 @@ int refs2id(refs_t *r, SAM_hdr *h) { * Returns 0 on success * -1 on failure */ -static int refs_from_header(refs_t *r, cram_fd *fd, SAM_hdr *h) { - int i, j; +static int refs_from_header(cram_fd *fd) { + if (!fd) + return -1; + refs_t *r = fd->refs; if (!r) return -1; - if (!h || h->nref == 0) + sam_hdr_t *h = fd->header; + if (!h) + return 0; + + if (!h->hrecs) { + if (-1 == sam_hdr_fill_hrecs(h)) + return -1; + } + + if (h->hrecs->nref == 0) return 0; //fprintf(stderr, "refs_from_header for %p mode %c\n", fd, fd->mode); /* Existing refs are fine, as long as they're compatible with the hdr. */ - if (!(r->ref_id = realloc(r->ref_id, (r->nref + h->nref) * sizeof(*r->ref_id)))) + ref_entry **new_ref_id = realloc(r->ref_id, (r->nref + h->hrecs->nref) * sizeof(*r->ref_id)); + if (!new_ref_id) return -1; + r->ref_id = new_ref_id; + int i, j; /* Copy info from h->ref[i] over to r */ - for (i = 0, j = r->nref; i < h->nref; i++) { - SAM_hdr_type *ty; - SAM_hdr_tag *tag; + for (i = 0, j = r->nref; i < h->hrecs->nref; i++) { + sam_hrec_type_t *ty; + sam_hrec_tag_t *tag; khint_t k; int n; - k = kh_get(refs, r->h_meta, h->ref[i].name); + k = kh_get(refs, r->h_meta, h->hrecs->ref[i].name); if (k != kh_end(r->h_meta)) // Ref already known about continue; @@ -1866,15 +1946,16 @@ static int refs_from_header(refs_t *r, cram_fd *fd, SAM_hdr *h) { if (!(r->ref_id[j] = calloc(1, sizeof(ref_entry)))) return -1; - if (!h->ref[i].name) + if (!h->hrecs->ref[i].name) return -1; - r->ref_id[j]->name = string_dup(r->pool, h->ref[i].name); + r->ref_id[j]->name = string_dup(r->pool, h->hrecs->ref[i].name); + if 
(!r->ref_id[j]->name) return -1; r->ref_id[j]->length = 0; // marker for not yet loaded /* Initialise likely filename if known */ - if ((ty = sam_hdr_find(h, "SQ", "SN", h->ref[i].name))) { - if ((tag = sam_hdr_find_key(h, ty, "M5", NULL))) { + if ((ty = sam_hrecs_find_type_id(h->hrecs, "SQ", "SN", h->hrecs->ref[i].name))) { + if ((tag = sam_hrecs_find_key(ty, "M5", NULL))) { r->ref_id[j]->fn = string_dup(r->pool, tag->str+3); //fprintf(stderr, "Tagging @SQ %s / %s\n", r->ref_id[h]->name, r->ref_id[h]->fn); } @@ -1896,14 +1977,34 @@ static int refs_from_header(refs_t *r, cram_fd *fd, SAM_hdr *h) { * Attaches a header to a cram_fd. * * This should be used when creating a new cram_fd for writing where - * we have an SAM_hdr already constructed (eg from a file we've read + * we have a header already constructed (eg from a file we've read * in). */ -int cram_set_header(cram_fd *fd, SAM_hdr *hdr) { - if (fd->header) - sam_hdr_free(fd->header); - fd->header = hdr; - return refs_from_header(fd->refs, fd, hdr); +int cram_set_header2(cram_fd *fd, const sam_hdr_t *hdr) { + if (!fd || !hdr ) + return -1; + + if (fd->header != hdr) { + if (fd->header) + sam_hdr_destroy(fd->header); + fd->header = sam_hdr_dup(hdr); + if (!fd->header) + return -1; + } + return refs_from_header(fd); +} + +int cram_set_header(cram_fd *fd, sam_hdr_t *hdr) { + return cram_set_header2(fd, hdr); +} + +/* + * Returns whether the path refers to a directory. + */ +static int is_directory(char *fn) { + struct stat buf; + if ( stat(fn,&buf) ) return 0; + return S_ISDIR(buf.st_mode); } /* @@ -1911,7 +2012,7 @@ int cram_set_header(cram_fd *fd, SAM_hdr *hdr) { * in directory with the filename and %[0-9]+s with portions of the filename * Any remaining parts of filename are added to the end with /%s. 
*/ -int expand_cache_path(char *path, char *dir, char *fn) { +static int expand_cache_path(char *path, char *dir, const char *fn) { char *cp, *start = path; size_t len; size_t sz = PATH_MAX; @@ -1974,7 +2075,7 @@ int expand_cache_path(char *path, char *dir, char *fn) { /* * Make the directory containing path and any prefix directories. */ -void mkdir_prefix(char *path, int mode) { +static void mkdir_prefix(char *path, int mode) { char *cp = strrchr(path, '/'); if (!cp) return; @@ -2043,8 +2144,8 @@ static unsigned get_int_threadid() { */ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { char *ref_path = getenv("REF_PATH"); - SAM_hdr_type *ty; - SAM_hdr_tag *tag; + sam_hrec_type_t *ty; + sam_hrec_tag_t *tag; char path[PATH_MAX], path_tmp[PATH_MAX + 64]; char cache[PATH_MAX], cache_root[PATH_MAX]; char *local_cache = getenv("REF_CACHE"); @@ -2074,10 +2175,10 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { if (!r->name) return -1; - if (!(ty = sam_hdr_find(fd->header, "SQ", "SN", r->name))) + if (!(ty = sam_hrecs_find_type_id(fd->header->hrecs, "SQ", "SN", r->name))) return -1; - if (!(tag = sam_hdr_find_key(fd->header, ty, "M5", NULL))) + if (!(tag = sam_hrecs_find_key(ty, "M5", NULL))) goto no_M5; hts_log_info("Querying ref %s", tag->str+3); @@ -2092,9 +2193,9 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { char *path2; /* Search local files in REF_PATH; we can open them and return as above */ if (!local_path && (path2 = find_path(tag->str+3, ref_path))) { - strncpy(path, path2, PATH_MAX); + int len = snprintf(path, PATH_MAX, "%s", path2); free(path2); - if (is_file(path)) // incase it's too long + if (len > 0 && len < PATH_MAX) // incase it's too long local_path = 1; } #endif @@ -2104,7 +2205,9 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { struct stat sb; BGZF *fp; - if (0 == stat(path, &sb) && (fp = bgzf_open(path, "r"))) { + if (0 == stat(path, &sb) + && S_ISREG(sb.st_mode) + && (fp 
= bgzf_open(path, "r"))) { r->length = sb.st_size; r->offset = r->line_length = r->bases_per_line = 0; @@ -2139,11 +2242,11 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { r->is_md5 = 1; } else { refs_t *refs; - char *fn; + const char *fn; no_M5: /* Failed to find in search path or M5 cache, see if @SQ UR: tag? */ - if (!(tag = sam_hdr_find_key(fd->header, ty, "UR", NULL))) + if (!(tag = sam_hrecs_find_key(ty, "UR", NULL))) return -1; fn = (strncmp(tag->str+3, "file:", 5) == 0) @@ -2403,6 +2506,9 @@ ref_entry *cram_ref_load(refs_t *r, int id, int is_md5) { } } + if (!r->fn) + return NULL; + /* Open file if it's not already the current open reference */ if (strcmp(r->fn, e->fn) || r->fp == NULL) { if (r->fp) @@ -2604,7 +2710,7 @@ char *cram_get_ref(cram_fd *fd, int id, int start, int end) { */ /* Unmapped ref ID */ - if (id < 0) { + if (id < 0 || !fd->refs->fn) { if (fd->ref_free) { free(fd->ref_free); fd->ref_free = NULL; @@ -2647,7 +2753,7 @@ char *cram_get_ref(cram_fd *fd, int id, int start, int end) { pthread_mutex_unlock(&fd->refs->lock); pthread_mutex_unlock(&fd->ref_lock); - return seq + ostart - start; + return seq ? 
seq + ostart - start : NULL; } /* @@ -2673,7 +2779,7 @@ int cram_load_reference(cram_fd *fd, char *fn) { refs_free(fd->refs); if (!(fd->refs = refs_create())) return -1; - if (-1 == refs_from_header(fd->refs, fd, fd->header)) + if (-1 == refs_from_header(fd)) return -1; } @@ -2798,10 +2904,12 @@ void cram_free_container(cram_container *c) { continue; cram_tag_map *tm = (cram_tag_map *)kh_val(c->tags_used, k); - cram_codec *c = tm->codec; + if (tm) { + cram_codec *c = tm->codec; - if (c) c->free(c); - free(tm); + if (c) c->free(c); + free(tm); + } } kh_destroy(m_tagmap, c->tags_used); @@ -2849,8 +2957,26 @@ cram_container *cram_read_container(cram_fd *fd) { crc = crc32(0L, (unsigned char *)&len, 4); } if ((s = itf8_decode_crc(fd, &c2.ref_seq_id, &crc)) == -1) return NULL; else rd+=s; - if ((s = itf8_decode_crc(fd, &c2.ref_seq_start, &crc))== -1) return NULL; else rd+=s; - if ((s = itf8_decode_crc(fd, &c2.ref_seq_span, &crc)) == -1) return NULL; else rd+=s; +/* + * LARGE_POS used in this code is purely a debugging mechanism for testing + * whether the htslib API can cope with 64-bit quantities. These are + * possible in SAM, but not *yet* in BAM or CRAM. + * + * DO NOT ENABLE LARGE_POS for anything other than debugging / testing. + * + * At some point it is expected these ifdefs will become a version check + * instead. 
+ */ +#ifdef LARGE_POS + if ((s = ltf8_decode_crc(fd, &c2.ref_seq_start, &crc))== -1) return NULL; else rd+=s; + if ((s = ltf8_decode_crc(fd, &c2.ref_seq_span, &crc)) == -1) return NULL; else rd+=s; +#else + int32_t i32; + if ((s = itf8_decode_crc(fd, &i32, &crc))== -1) return NULL; else rd+=s; + c2.ref_seq_start = i32; + if ((s = itf8_decode_crc(fd, &i32, &crc)) == -1) return NULL; else rd+=s; + c2.ref_seq_span = i32; +#endif if ((s = itf8_decode_crc(fd, &c2.num_records, &crc)) == -1) return NULL; else rd+=s; if (CRAM_MAJOR_VERS(fd->version) == 1) { @@ -2887,8 +3013,7 @@ cram_container *cram_read_container(cram_fd *fd) { *c = c2; - if (!(c->landmark = malloc(c->num_landmarks * sizeof(int32_t))) && - c->num_landmarks) { + if (c->num_landmarks && !(c->landmark = malloc(c->num_landmarks * sizeof(int32_t)))) { fd->err = errno; cram_free_container(c); return NULL; @@ -2903,10 +3028,12 @@ cram_container *cram_read_container(cram_fd *fd) { } if (CRAM_MAJOR_VERS(fd->version) >= 3) { - if (-1 == int32_decode(fd, (int32_t *)&c->crc32)) + if (-1 == int32_decode(fd, (int32_t *)&c->crc32)) { + cram_free_container(c); return NULL; - else + } else { rd+=4; + } if (crc != c->crc32) { hts_log_error("Container header CRC32 failure"); @@ -2974,8 +3101,13 @@ int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size) cp += itf8_put((char*)cp, 0); } else { cp += itf8_put((char*)cp, c->ref_seq_id); +#ifdef LARGE_POS + cp += ltf8_put((char*)cp, c->ref_seq_start); + cp += ltf8_put((char*)cp, c->ref_seq_span); +#else cp += itf8_put((char*)cp, c->ref_seq_start); cp += itf8_put((char*)cp, c->ref_seq_span); +#endif } cp += itf8_put((char*)cp, c->num_records); if (CRAM_MAJOR_VERS(fd->version) == 2) { @@ -3033,8 +3165,13 @@ int cram_write_container(cram_fd *fd, cram_container *c) { cp += itf8_put((char*)cp, 0); } else { cp += itf8_put((char*)cp, c->ref_seq_id); +#ifdef LARGE_POS + cp += ltf8_put((char*)cp, c->ref_seq_start); + cp += ltf8_put((char*)cp, c->ref_seq_span); 
+#else cp += itf8_put((char*)cp, c->ref_seq_start); cp += itf8_put((char*)cp, c->ref_seq_span); +#endif } cp += itf8_put((char*)cp, c->num_records); if (CRAM_MAJOR_VERS(fd->version) == 2) { @@ -3080,17 +3217,23 @@ static int cram_flush_container2(cram_fd *fd, cram_container *c) { //fprintf(stderr, "Writing container %d, sum %u\n", c->record_counter, sum); + off_t c_offset = htell(fd->fp); // File offset of container + /* Write the container struct itself */ if (0 != cram_write_container(fd, c)) return -1; + off_t hdr_size = htell(fd->fp) - c_offset; + /* And the compression header */ if (0 != cram_write_block(fd, c->comp_hdr_block)) return -1; /* Followed by the slice blocks */ + off_t file_offset = htell(fd->fp); for (i = 0; i < c->curr_slice; i++) { cram_slice *s = c->slices[i]; + off_t spos = file_offset - c_offset - hdr_size; if (0 != cram_write_block(fd, s->hdr_block)) return -1; @@ -3099,9 +3242,17 @@ static int cram_flush_container2(cram_fd *fd, cram_container *c) { if (0 != cram_write_block(fd, s->block[j])) return -1; } + + file_offset = htell(fd->fp); + off_t sz = file_offset - c_offset - hdr_size - spos; + + if (fd->idxfp) { + if (cram_index_slice(fd, c, s, fd->idxfp, c_offset, spos, sz) < 0) + return -1; + } } - return hflush(fd->fp) == 0 ? 0 : -1; + return 0; } /* @@ -3192,9 +3343,6 @@ static int cram_flush_result(cram_fd *fd) { lc = c; } - if (fd->mode == 'w') - ret |= hflush(fd->fp) == 0 ? 
0 : -1; - hts_tpool_delete_result(r, 1); } if (lc) { @@ -3348,6 +3496,8 @@ void cram_free_slice(cram_slice *s) { if (s->hdr) { for (i = 0; i < s->hdr->num_blocks; i++) { + if (i > 0 && s->block[i] == s->block[0]) + continue; cram_free_block(s->block[i]); } } @@ -3590,6 +3740,7 @@ cram_file_def *cram_read_file_def(cram_fd *fd) { } fd->first_container += 26; + fd->curr_position = fd->first_container; fd->last_slice = 0; return def; @@ -3621,10 +3772,10 @@ void cram_free_file_def(cram_file_def *def) { * Returns SAM hdr ptr on success * NULL on failure */ -SAM_hdr *cram_read_SAM_hdr(cram_fd *fd) { +sam_hdr_t *cram_read_SAM_hdr(cram_fd *fd) { int32_t header_len; char *header; - SAM_hdr *hdr; + sam_hdr_t *hdr; /* 1.1 onwards stores the header in the first block of a container */ if (CRAM_MAJOR_VERS(fd->version) == 1) { @@ -3636,8 +3787,10 @@ SAM_hdr *cram_read_SAM_hdr(cram_fd *fd) { if (header_len < 0 || NULL == (header = malloc((size_t) header_len+1))) return NULL; - if (header_len != hread(fd->fp, header, header_len)) + if (header_len != hread(fd->fp, header, header_len)) { + free(header); return NULL; + } header[header_len] = '\0'; fd->first_container += 4 + header_len; @@ -3651,6 +3804,7 @@ SAM_hdr *cram_read_SAM_hdr(cram_fd *fd) { return NULL; fd->first_container += c->length + c->offset; + fd->curr_position = fd->first_container; if (c->num_blocks < 1) { cram_free_container(c); @@ -3693,6 +3847,7 @@ SAM_hdr *cram_read_SAM_hdr(cram_fd *fd) { for (i = 1; i < c->num_blocks; i++) { if (!(b = cram_read_block(fd))) { cram_free_container(c); + free(header); return NULL; } len += b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + @@ -3707,11 +3862,14 @@ SAM_hdr *cram_read_SAM_hdr(cram_fd *fd) { char *pads = malloc(c->length - len); if (!pads) { cram_free_container(c); + free(header); return NULL; } if (c->length - len != hread(fd->fp, pads, c->length - len)) { cram_free_container(c); + free(header); + free(pads); return NULL; } free(pads); @@ -3721,10 +3879,23 @@ 
SAM_hdr *cram_read_SAM_hdr(cram_fd *fd) { } /* Parse */ - hdr = sam_hdr_parse_(header, header_len); - free(header); + hdr = sam_hdr_init(); + if (!hdr) { + free(header); + return NULL; + } + + if (-1 == sam_hdr_add_lines(hdr, header, header_len)) { + free(header); + sam_hdr_destroy(hdr); + return NULL; + } + + hdr->l_text = header_len; + hdr->text = header; return hdr; + } /* @@ -3737,7 +3908,7 @@ static void full_path(char *out, char *in) { // Windows paths (in_l > 3 && toupper_c(*in) >= 'A' && toupper_c(*in) <= 'Z' && in[1] == ':' && (in[2] == '/' || in[2] == '\\'))) { - strncpy(out, in, PATH_MAX); + strncpy(out, in, PATH_MAX-1); out[PATH_MAX-1] = 0; } else { int len; @@ -3745,12 +3916,12 @@ static void full_path(char *out, char *in) { // unable to get dir or out+in is too long if (!getcwd(out, PATH_MAX) || (len = strlen(out))+1+strlen(in) >= PATH_MAX) { - strncpy(out, in, PATH_MAX); + strncpy(out, in, PATH_MAX-1); out[PATH_MAX-1] = 0; return; } - sprintf(out+len, "/%.*s", PATH_MAX - len, in); + sprintf(out+len, "/%.*s", PATH_MAX - 2 - len, in); // FIXME: cope with `pwd`/../../../foo.fa ? } @@ -3761,8 +3932,8 @@ static void full_path(char *out, char *in) { * Returns 0 on success * -1 on failure */ -int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr) { - int header_len; +int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) { + size_t header_len; int blank_block = (CRAM_MAJOR_VERS(fd->version) >= 3); /* Write CRAM MAGIC if not yet written. 
*/ @@ -3775,8 +3946,8 @@ int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr) { /* 1.0 requires an UNKNOWN read-group */ if (CRAM_MAJOR_VERS(fd->version) == 1) { - if (!sam_hdr_find_rg(hdr, "UNKNOWN")) - if (sam_hdr_add(hdr, "RG", + if (!sam_hrecs_find_rg(hdr->hrecs, "UNKNOWN")) + if (sam_hdr_add_line(hdr, "RG", "ID", "UNKNOWN", "SM", "UNKNOWN", NULL)) return -1; } @@ -3784,14 +3955,14 @@ int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr) { /* Fix M5 strings */ if (fd->refs && !fd->no_ref) { int i; - for (i = 0; i < hdr->nref; i++) { - SAM_hdr_type *ty; + for (i = 0; i < hdr->hrecs->nref; i++) { + sam_hrec_type_t *ty; char *ref; - if (!(ty = sam_hdr_find(hdr, "SQ", "SN", hdr->ref[i].name))) + if (!(ty = sam_hrecs_find_type_id(hdr->hrecs, "SQ", "SN", hdr->hrecs->ref[i].name))) return -1; - if (!sam_hdr_find_key(hdr, ty, "M5", NULL)) { + if (!sam_hrecs_find_key(ty, "M5", NULL)) { char unsigned buf[16]; char buf2[33]; int rlen; @@ -3814,24 +3985,25 @@ int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr) { cram_ref_decr(fd->refs, i); hts_md5_hex(buf2, buf); - if (sam_hdr_update(hdr, ty, "M5", buf2, NULL)) + if (sam_hdr_update_line(hdr, "SQ", "SN", hdr->hrecs->ref[i].name, "M5", buf2, NULL)) return -1; } if (fd->ref_fn) { char ref_fn[PATH_MAX]; full_path(ref_fn, fd->ref_fn); - if (sam_hdr_update(hdr, ty, "UR", ref_fn, NULL)) + if (sam_hdr_update_line(hdr, "SQ", "SN", hdr->hrecs->ref[i].name, "UR", ref_fn, NULL)) return -1; } } } - if (sam_hdr_rebuild(hdr)) - return -1; - /* Length */ header_len = sam_hdr_length(hdr); + if (header_len > INT32_MAX) { + hts_log_error("Header is too long for CRAM format"); + return -1; + } if (CRAM_MAJOR_VERS(fd->version) == 1) { if (-1 == int32_encode(fd, header_len)) return -1; @@ -3853,14 +4025,16 @@ int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr) { return -1; } - int32_put_blk(b, header_len); + if (int32_put_blk(b, header_len) < 0) + return -1; if (header_len) BLOCK_APPEND(b, sam_hdr_str(hdr), header_len); BLOCK_UPLEN(b); // Compress header 
block if V3.0 and above if (CRAM_MAJOR_VERS(fd->version) >= 3) - cram_compress_block(fd, b, NULL, -1, -1); + if (cram_compress_block(fd, b, NULL, -1, -1) < 0) + return -1; if (blank_block) { c->length = b->comp_size + 2 + 4*is_cram_3 + @@ -3938,7 +4112,7 @@ int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr) { cram_free_container(c); } - if (-1 == refs_from_header(fd->refs, fd, fd->header)) + if (-1 == refs_from_header(fd)) return -1; if (-1 == refs2id(fd->refs, fd->header)) return -1; @@ -3949,6 +4123,9 @@ int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr) { RP("=== Finishing saving header ===\n"); return 0; + + block_err: + return -1; } /* ---------------------------------------------------------------------- @@ -4093,6 +4270,7 @@ cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) { fd->fp = fp; fd->mode = *mode; fd->first_container = 0; + fd->curr_position = 0; if (fd->mode == 'r') { /* Reader */ @@ -4158,10 +4336,12 @@ cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) { fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3); fd->use_lzma = 0; fd->multi_seq = -1; + fd->multi_seq_user = -1; fd->unsorted = 0; fd->shared_ref = 0; fd->store_md = 0; fd->store_nm = 0; + fd->last_RI_count = 0; fd->index = NULL; fd->own_pool = 0; @@ -4171,8 +4351,11 @@ cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) { fd->ooc = 0; fd->required_fields = INT_MAX; - for (i = 0; i < DS_END; i++) + for (i = 0; i < DS_END; i++) { fd->m[i] = cram_new_metrics(); + if (!fd->m[i]) + goto err; + } if (!(fd->tags_used = kh_init(m_metrics))) goto err; @@ -4184,7 +4367,7 @@ cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) { fd->bl = NULL; /* Initialise dummy refs from the @SQ headers */ - if (-1 == refs_from_header(fd->refs, fd, fd->header)) + if (-1 == refs_from_header(fd)) goto err; return fd; @@ -4332,7 +4515,7 @@ int cram_close(cram_fd *fd) { cram_free_file_def(fd->file_def); if (fd->header) - sam_hdr_free(fd->header); 
+ sam_hdr_destroy(fd->header); free(fd->prefix); @@ -4368,6 +4551,10 @@ int cram_close(cram_fd *fd) { if (fd->own_pool && fd->pool) hts_tpool_destroy(fd->pool); + if (fd->idxfp) + if (bgzf_close(fd->idxfp) < 0) + return -1; + free(fd); return 0; } @@ -4519,7 +4706,7 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { } case CRAM_OPT_MULTI_SEQ_PER_SLICE: - fd->multi_seq = va_arg(args, int); + fd->multi_seq_user = fd->multi_seq = va_arg(args, int); break; case CRAM_OPT_NTHREADS: { diff --git a/cram/cram_io.h b/cram/cram_io.h index 428116f16..b1eccd63f 100644 --- a/cram/cram_io.h +++ b/cram/cram_io.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2014 Genome Research Ltd. +Copyright (c) 2012-2019 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -39,8 +39,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * maps, bitwise I/O, etc. */ -#ifndef _CRAM_IO_H_ -#define _CRAM_IO_H_ +#ifndef CRAM_IO_H +#define CRAM_IO_H #include #include @@ -56,7 +56,7 @@ extern "C" { */ /*! INTERNAL: Converts two characters into an integer for use in switch{} */ -#define CRAM_KEY(a,b) (((a)<<8)|((b))) +#define CRAM_KEY(a,b) ((((unsigned char) a)<<8)|(((unsigned char) b))) /*! Reads an integer in ITF-8 encoding from 'fd' and stores it in * *val. @@ -377,7 +377,8 @@ static inline int safe_ltf8_get(const char *cp, const char *endp, * @return * Returns the number of bytes written */ -int itf8_put_blk(cram_block *blk, int val); +int itf8_put_blk(cram_block *blk, int32_t val); +int ltf8_put_blk(cram_block *blk, int64_t val); /*! Pulls a literal 32-bit value from a block. 
* @@ -505,48 +506,72 @@ static inline cram_block *cram_get_block_by_id(cram_slice *slice, int id) { /* Returns the address one past the end of the block */ #define BLOCK_END(b) (&(b)->data[(b)->byte]) +/* Make block exactly 'l' bytes long */ +static inline int block_resize_exact(cram_block *b, size_t len) { + unsigned char *tmp = realloc(b->data, len); + if (!tmp) + return -1; + b->alloc = len; + b->data = tmp; + return 0; +} + /* Request block to be at least 'l' bytes long */ -#define BLOCK_RESIZE(b,l) \ - do { \ - while((b)->alloc <= (l)) { \ - (b)->alloc = (b)->alloc ? (b)->alloc*1.5 : 1024; \ - (b)->data = realloc((b)->data, (b)->alloc); \ - } \ - } while(0) +static inline int block_resize(cram_block *b, size_t len) { + if (b->alloc > len) + return 0; + + size_t alloc = b->alloc; + while (alloc <= len) + alloc = alloc ? alloc*1.5 : 1024; + + return block_resize_exact(b, alloc); +} -/* Make block exactly 'l' bytes long */ -#define BLOCK_RESIZE_EXACT(b,l) \ - do { \ - (b)->alloc = (l); \ - (b)->data = realloc((b)->data, (b)->alloc); \ - } while(0) /* Ensure the block can hold at least another 'l' bytes */ -#define BLOCK_GROW(b,l) BLOCK_RESIZE((b), BLOCK_SIZE((b)) + (l)) +static inline int block_grow(cram_block *b, size_t len) { + return block_resize(b, BLOCK_SIZE(b) + len); +} -/* Append string 's' of length 'l' */ -#define BLOCK_APPEND(b,s,l) \ - do { \ - BLOCK_GROW((b),(l)); \ - memcpy(BLOCK_END((b)), (s), (l)); \ - BLOCK_SIZE((b)) += (l); \ - } while (0) +/* Append string 's' of length 'l'. 
*/ +static inline int block_append(cram_block *b, const void *s, size_t len) { + if (block_grow(b, len) < 0) + return -1; + + memcpy(BLOCK_END(b), s, len); + BLOCK_SIZE(b) += len; + + return 0; +} /* Append as single character 'c' */ -#define BLOCK_APPEND_CHAR(b,c) \ - do { \ - BLOCK_GROW((b),1); \ - (b)->data[(b)->byte++] = (c); \ - } while (0) +static inline int block_append_char(cram_block *b, char c) { + if (block_grow(b, 1) < 0) + return -1; + + b->data[b->byte++] = c; + return 0; +} /* Append a single unsigned integer */ -#define BLOCK_APPEND_UINT(b,i) \ - do { \ - unsigned char *cp; \ - BLOCK_GROW((b),11); \ - cp = &(b)->data[(b)->byte]; \ - (b)->byte += append_uint32(cp, (i)) - cp; \ - } while (0) +static inline unsigned char *append_uint32(unsigned char *cp, uint32_t i); +static inline int block_append_uint(cram_block *b, unsigned int i) { + if (block_grow(b, 11) < 0) + return -1; + + unsigned char *cp = &b->data[b->byte]; + b->byte += append_uint32(cp, i) - cp; + return 0; +} + +// Versions of above with built in goto block_err calls. +#define BLOCK_RESIZE_EXACT(b,l) if (block_resize_exact((b),(l))<0) goto block_err +#define BLOCK_RESIZE(b,l) if (block_resize((b),(l)) <0) goto block_err +#define BLOCK_GROW(b,l) if (block_grow((b),(l)) <0) goto block_err +#define BLOCK_APPEND(b,s,l) if (block_append((b),(s),(l)) <0) goto block_err +#define BLOCK_APPEND_CHAR(b,c) if (block_append_char((b),(c)) <0) goto block_err +#define BLOCK_APPEND_UINT(b,i) if (block_append_uint((b),(i)) <0) goto block_err static inline unsigned char *append_uint32(unsigned char *cp, uint32_t i) { uint32_t j; @@ -634,7 +659,7 @@ static inline unsigned char *append_uint64(unsigned char *cp, uint64_t i) { */ int cram_load_reference(cram_fd *fd, char *fn); -/*! Generates a lookup table in refs based on the SQ headers in SAM_hdr. +/*! Generates a lookup table in refs based on the SQ headers in sam_hdr_t. * * Indexes references by the order they appear in a BAM file. 
This may not * necessarily be the same order they appear in the fasta reference file. @@ -643,7 +668,7 @@ int cram_load_reference(cram_fd *fd, char *fn); * Returns 0 on success; * -1 on failure */ -int refs2id(refs_t *r, SAM_hdr *bfd); +int refs2id(refs_t *r, sam_hdr_t *hdr); void refs_free(refs_t *r); @@ -800,7 +825,7 @@ void cram_free_file_def(cram_file_def *def); * Returns SAM hdr ptr on success; * NULL on failure */ -SAM_hdr *cram_read_SAM_hdr(cram_fd *fd); +sam_hdr_t *cram_read_SAM_hdr(cram_fd *fd); /*! Writes a CRAM SAM header. * @@ -808,7 +833,7 @@ SAM_hdr *cram_read_SAM_hdr(cram_fd *fd); * Returns 0 on success; * -1 on failure */ -int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr); +int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr); /**@}*/ @@ -850,8 +875,6 @@ int cram_close(cram_fd *fd); */ int cram_seek(cram_fd *fd, off_t offset, int whence); -int64_t cram_tell(cram_fd *fd); - /* * Flushes a CRAM file. * Useful for when writing to stdout without wishing to close the stream. @@ -896,14 +919,14 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args); * Attaches a header to a cram_fd. * * This should be used when creating a new cram_fd for writing where - * we have an SAM_hdr already constructed (eg from a file we've read + * we have an sam_hdr_t already constructed (eg from a file we've read * in). * * @return * Returns 0 on success; * -1 on failure */ -int cram_set_header(cram_fd *fd, SAM_hdr *hdr); +int cram_set_header2(cram_fd *fd, const sam_hdr_t *hdr); /*! * Returns the hFILE connected to a cram_fd. @@ -916,4 +939,4 @@ static inline struct hFILE *cram_hfile(cram_fd *fd) { } #endif -#endif /* _CRAM_IO_H_ */ +#endif /* CRAM_IO_H */ diff --git a/cram/cram_samtools.c b/cram/cram_samtools.c index c8f23e108..5bddbd559 100644 --- a/cram/cram_samtools.c +++ b/cram/cram_samtools.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2010-2013 Genome Research Ltd. +Copyright (c) 2010-2013, 2017-2019 Genome Research Ltd. 
Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -28,6 +28,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include @@ -36,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cram/cram.h" #include "htslib/sam.h" +#include "../sam_internal.h" /*--------------------------------------------------------------------------- * Samtools compatibility portion @@ -44,13 +46,13 @@ int bam_construct_seq(bam_seq_t **bp, size_t extra_len, const char *qname, size_t qname_len, int flag, int rname, // Ref ID - int pos, - int end, // aligned start/end coords + int64_t pos, + int64_t end, // aligned start/end coords int mapq, uint32_t ncigar, const uint32_t *cigar, int mrnm, // Mate Ref ID - int mpos, - int isize, + int64_t mpos, + int64_t isize, int len, const char *seq, const char *qual) { @@ -79,16 +81,9 @@ int bam_construct_seq(bam_seq_t **bp, size_t extra_len, //b->l_aux = extra_len; // we fill this out later qname_nuls = 4 - qname_len%4; - if (qname_len + qname_nuls > 255) // Check for core.l_qname overflow - return -1; bam_len = qname_len + qname_nuls + ncigar*4 + (len+1)/2 + len + extra_len; - if (b->m_data < bam_len) { - b->m_data = bam_len; - kroundup32(b->m_data); - b->data = (uint8_t*)realloc(b->data, b->m_data); - if (!b->data) - return -1; - } + if (realloc_bam_data(b, bam_len) < 0) + return -1; b->l_data = bam_len; b->core.tid = rname; @@ -126,29 +121,3 @@ int bam_construct_seq(bam_seq_t **bp, size_t extra_len, return bam_len; } - -bam_hdr_t *cram_header_to_bam(SAM_hdr *h) { - int i; - bam_hdr_t *header = bam_hdr_init(); - - header->l_text = ks_len(&h->text); - header->text = malloc(header->l_text+1); - memcpy(header->text, ks_str(&h->text), header->l_text); - header->text[header->l_text] = 0; - - 
header->n_targets = h->nref; - header->target_name = (char **)calloc(header->n_targets, - sizeof(char *)); - header->target_len = (uint32_t *)calloc(header->n_targets, 4); - - for (i = 0; i < h->nref; i++) { - header->target_name[i] = strdup(h->ref[i].name); - header->target_len[i] = h->ref[i].len; - } - - return header; -} - -SAM_hdr *bam_header_to_cram(bam_hdr_t *h) { - return sam_hdr_parse_(h->text, h->l_text); -} diff --git a/cram/cram_samtools.h b/cram/cram_samtools.h index feefb4a8e..49c50b5f3 100644 --- a/cram/cram_samtools.h +++ b/cram/cram_samtools.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2010-2013 Genome Research Ltd. +Copyright (c) 2010-2013, 2018 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -28,8 +28,8 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef _CRAM_SAMTOOLS_H_ -#define _CRAM_SAMTOOLS_H_ +#ifndef CRAM_SAMTOOLS_H +#define CRAM_SAMTOOLS_H /* Samtools compatible API */ #define bam_blk_size(b) ((b)->l_data) @@ -52,8 +52,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define bam_cigar(b) bam_get_cigar((b)) #define bam_aux(b) bam_get_aux((b)) -#define bam_dup(b) bam_copy1(bam_init1(), (b)) - #define bam_free(b) bam_destroy1((b)) #define bam_reg2bin(beg,end) hts_reg2bin((beg),(end),14,5) @@ -74,26 +72,21 @@ enum cigar_op { typedef bam1_t bam_seq_t; -#include "cram/sam_header.h" - #ifdef __cplusplus extern "C" { #endif -bam_hdr_t *cram_header_to_bam(SAM_hdr *h); -SAM_hdr *bam_header_to_cram(bam_hdr_t *h); - int bam_construct_seq(bam_seq_t **bp, size_t extra_len, const char *qname, size_t qname_len, int flag, int rname, // Ref ID - int pos, - int end, // aligned start/end coords + int64_t pos, + int64_t end, // aligned start/end coords int mapq, uint32_t ncigar, const uint32_t *cigar, int mrnm, // Mate Ref ID - int mpos, - int isize, + int64_t mpos, + int64_t isize, int len, const char *seq, const char *qual); @@ -102,4 +95,4 @@ int bam_construct_seq(bam_seq_t **bp, size_t extra_len, } #endif -#endif /* _CRAM_SAMTOOLS_H_ */ +#endif /* CRAM_SAMTOOLS_H */ diff --git a/cram/cram_stats.c b/cram/cram_stats.c index cad14bfc1..1f26bcd25 100644 --- a/cram/cram_stats.c +++ b/cram/cram_stats.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2013 Genome Research Ltd. +Copyright (c) 2012-2014, 2016, 2018 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -28,6 +28,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include @@ -39,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include +#include #include "cram/cram.h" #include "cram/os.h" @@ -47,7 +49,7 @@ cram_stats *cram_stats_create(void) { return calloc(1, sizeof(cram_stats)); } -void cram_stats_add(cram_stats *st, int32_t val) { +int cram_stats_add(cram_stats *st, int64_t val) { st->nsamp++; //assert(val >= 0); @@ -60,6 +62,8 @@ void cram_stats_add(cram_stats *st, int32_t val) { if (!st->h) { st->h = kh_init(m_i2i); + if (!st->h) + return -1; } k = kh_put(m_i2i, st->h, val, &r); @@ -68,11 +72,12 @@ void cram_stats_add(cram_stats *st, int32_t val) { else if (r != -1) kh_val(st->h, k) = 1; else - ; // FIXME: handle error + return -1; } + return 0; } -void cram_stats_del(cram_stats *st, int32_t val) { +void cram_stats_del(cram_stats *st, int64_t val) { st->nsamp--; //assert(val >= 0); @@ -87,11 +92,11 @@ void cram_stats_del(cram_stats *st, int32_t val) { if (--kh_val(st->h, k) == 0) kh_del(m_i2i, st->h, k); } else { - hts_log_warning("Failed to remove val %d from cram_stats", val); + hts_log_warning("Failed to remove val %"PRId64" from cram_stats", val); st->nsamp++; } } else { - hts_log_warning("Failed to remove val %d from cram_stats", val); + hts_log_warning("Failed to remove val %"PRId64" from cram_stats", val); st->nsamp++; } } diff --git a/cram/cram_stats.h b/cram/cram_stats.h index 4fa0cc141..5f8cfec7b 100644 --- a/cram/cram_stats.h +++ b/cram/cram_stats.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2013 Genome Research Ltd. +Copyright (c) 2012-2013, 2018 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -28,16 +28,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef _CRAM_STATS_H_ -#define _CRAM_STATS_H_ +#ifndef CRAM_STATS_H +#define CRAM_STATS_H #ifdef __cplusplus extern "C" { #endif cram_stats *cram_stats_create(void); -void cram_stats_add(cram_stats *st, int32_t val); -void cram_stats_del(cram_stats *st, int32_t val); +int cram_stats_add(cram_stats *st, int64_t val); +void cram_stats_del(cram_stats *st, int64_t val); void cram_stats_dump(cram_stats *st); void cram_stats_free(cram_stats *st); diff --git a/cram/cram_structs.h b/cram/cram_structs.h index ec60ebb88..c58b64db2 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2013 Genome Research Ltd. +Copyright (c) 2012-2016, 2018-2019 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -28,8 +28,8 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef _CRAM_STRUCTS_H_ -#define _CRAM_STRUCTS_H_ +#ifndef HTSLIB_CRAM_STRUCTS_H +#define HTSLIB_CRAM_STRUCTS_H /* * Defines in-memory structs for the basic file-format objects in the @@ -51,6 +51,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include "htslib/thread_pool.h" +#include "htslib/cram.h" #include "cram/string_alloc.h" #include "cram/mFILE.h" #include "htslib/khash.h" @@ -60,7 +61,7 @@ extern "C" { #endif // Generic hash-map integer -> integer -KHASH_MAP_INIT_INT(m_i2i, int) +KHASH_MAP_INIT_INT64(m_i2i, int) // Generic hash-set integer -> (existance) KHASH_SET_INIT_INT(s_i2i) @@ -179,18 +180,19 @@ enum cram_DS_ID { }; /* "File Definition Structure" */ -typedef struct cram_file_def { +struct cram_file_def { char magic[4]; uint8_t major_version; uint8_t minor_version; char file_id[20]; // Filename or SHA1 checksum -} cram_file_def; +}; #define CRAM_MAJOR_VERS(v) ((v) >> 8) #define CRAM_MINOR_VERS(v) ((v) & 0xff) struct cram_slice; +/* Now in htslib/cram.h enum cram_block_method { BM_ERROR = -1, RAW = 0, @@ -202,7 +204,9 @@ enum cram_block_method { RANS1 = 10, // Not externalised; stored as RANS (generic) GZIP_RLE = 11, // NB: not externalised in CRAM }; +*/ +/* Now in htslib/cram.h enum cram_content_type { CT_ERROR = -1, FILE_HEADER = 0, @@ -212,9 +216,10 @@ enum cram_content_type { EXTERNAL = 4, CORE = 5, }; +*/ /* Compression metrics */ -typedef struct { +struct cram_metrics { // number of trials and time to next trial int trial; int next_trial; @@ -246,14 +251,14 @@ typedef struct { double rans1_extra; double bzip2_extra; double lzma_extra; -} cram_metrics; +}; // Hash aux key (XX:i) to cram_metrics KHASH_MAP_INIT_INT(m_metrics, cram_metrics*) /* Block */ -typedef struct cram_block { +struct cram_block { enum cram_block_method method, orig_method; enum cram_content_type content_type; int32_t content_id; @@ -270,7 +275,7 @@ typedef struct cram_block { // To aid compression cram_metrics *m; // used to track aux block compression only -} cram_block; +}; struct cram_codec; /* defined in cram_codecs.h */ struct cram_map; @@ -279,19 +284,15 @@ struct cram_map; #define CRAM_MAP(a,b) (((a)*3+(b))&(CRAM_MAP_HASH-1)) /* Compression header block */ -typedef struct cram_block_compression_hdr { 
+struct cram_block_compression_hdr { int32_t ref_seq_id; - int32_t ref_seq_start; - int32_t ref_seq_span; + int64_t ref_seq_start; + int64_t ref_seq_span; int32_t num_records; int32_t num_landmarks; int32_t *landmark; /* Flags from preservation map */ - int mapped_qs_included; - int unmapped_qs_included; - int unmapped_placed; - int qs_included; int read_names_included; int AP_delta; // indexed by ref-base and subst. code @@ -313,7 +314,7 @@ typedef struct cram_block_compression_hdr { char *uncomp; // A single block of uncompressed data size_t uncomp_size, uncomp_alloc; -} cram_block_compression_hdr; +}; typedef struct cram_map { int key; /* 0xe0 + 3 bytes */ @@ -334,11 +335,11 @@ typedef struct cram_tag_map { KHASH_MAP_INIT_INT(m_tagmap, cram_tag_map*) /* Mapped or unmapped slice header block */ -typedef struct cram_block_slice_hdr { +struct cram_block_slice_hdr { enum cram_content_type content_type; int32_t ref_seq_id; /* if content_type == MAPPED_SLICE */ - int32_t ref_seq_start; /* if content_type == MAPPED_SLICE */ - int32_t ref_seq_span; /* if content_type == MAPPED_SLICE */ + int64_t ref_seq_start; /* if content_type == MAPPED_SLICE */ + int64_t ref_seq_span; /* if content_type == MAPPED_SLICE */ int32_t num_records; int64_t record_counter; int32_t num_blocks; @@ -346,7 +347,7 @@ typedef struct cram_block_slice_hdr { int32_t *block_content_ids; int32_t ref_base_id; /* if content_type == MAPPED_SLICE */ unsigned char md5[16]; -} cram_block_slice_hdr; +}; struct ref_entry; @@ -359,11 +360,11 @@ struct ref_entry; * * OR... are landmarks the start/end points of slices? 
*/ -typedef struct cram_container { +struct cram_container { int32_t length; int32_t ref_seq_id; - int32_t ref_seq_start; - int32_t ref_seq_span; + int64_t ref_seq_start; + int64_t ref_seq_span; int64_t record_counter; int64_t num_bases; int32_t num_records; @@ -385,10 +386,10 @@ typedef struct cram_container { int max_c_rec, curr_c_rec; // current and max recs per container int slice_rec; // rec no. for start of this slice int curr_ref; // current ref ID. -2 for no previous - int last_pos; // last record position + int64_t last_pos; // last record position struct cram_slice **slices, *slice; int pos_sorted; // boolean, 1=>position sorted data - int max_apos; // maximum position, used if pos_sorted==0 + int64_t max_apos; // maximum position, used if pos_sorted==0 int last_slice; // number of reads in last slice (0 for 1st) int multi_seq; // true if packing multi seqs per cont/slice int unsorted; // true is AP_delta is 0. @@ -410,7 +411,7 @@ typedef struct cram_container { uint32_t crc32; // CRC32 uint64_t s_num_bases; // number of bases in this slice -} cram_container; +}; /* * A single cram record @@ -422,14 +423,14 @@ typedef struct cram_record { int32_t flags; // BF int32_t cram_flags; // CF int32_t len; // RL - int32_t apos; // AP + int64_t apos; // AP int32_t rg; // RG int32_t name; // RN; idx to s->names_blk int32_t name_len; int32_t mate_line; // index to another cram_record int32_t mate_ref_id; - int32_t mate_pos; // NP - int32_t tlen; // TS + int64_t mate_pos; // NP + int64_t tlen; // TS // Auxiliary data int32_t ntags; // TC @@ -446,7 +447,7 @@ typedef struct cram_record { int32_t qual; // idx to s->qual_blk int32_t cigar; // idx to s->cigar int32_t ncigar; - int32_t aend; // alignment end + int64_t aend; // alignment end int32_t mqual; // MQ int32_t feature; // idx to s->feature @@ -536,14 +537,14 @@ typedef union cram_feature { * is the logical unit for decoding a number of * sequences. 
*/ -typedef struct cram_slice { +struct cram_slice { cram_block_slice_hdr *hdr; cram_block *hdr_block; cram_block **block; cram_block **block_by_id; /* State used during encoding/decoding */ - int last_apos, max_apos; + int64_t last_apos, max_apos; /* Array of decoded cram records */ cram_record *crecs; @@ -593,7 +594,7 @@ typedef struct cram_slice { int max_rec, curr_rec; // current and max recs per slice int slice_num; // To be copied into c->curr_slice in decode -} cram_slice; +}; /*----------------------------------------------------------------------------- * Consider moving reference handling to cram_refs.[ch] @@ -615,7 +616,7 @@ typedef struct ref_entry { KHASH_MAP_INIT_STR(refs, ref_entry*) // References structure. -typedef struct { +struct refs_t { string_alloc_t *pool; // String pool for holding filenames and SN vals khash_t(refs) *h_meta; // ref_entry*, index by name @@ -630,7 +631,7 @@ typedef struct { pthread_mutex_t lock; // Mutex for multi-threaded updating ref_entry *last; // Last queried sequence int last_id; // Used in cram_ref_decr_locked to delay free -} refs_t; +}; /*----------------------------------------------------------------------------- * CRAM index @@ -661,8 +662,8 @@ typedef struct cram_index { typedef struct { int refid; - int start; - int end; + int64_t start; + int64_t end; } cram_range; /*----------------------------------------------------------------------------- @@ -674,12 +675,12 @@ typedef struct spare_bams { struct spare_bams *next; } spare_bams; -typedef struct cram_fd { +struct cram_fd { struct hFILE *fp; int mode; // 'r' or 'w' int version; cram_file_def *file_def; - SAM_hdr *header; + sam_hdr_t *header; char *prefix; int64_t record_counter; @@ -738,9 +739,12 @@ typedef struct cram_fd { int index_sz; cram_index *index; // array, sizeof index_sz off_t first_container; + off_t curr_position; int eof; int last_slice; // number of recs encoded in last slice - int multi_seq; + int last_RI_count; // number of references encoded 
in last container + int multi_seq; // -1 is auto, 0 is one ref per container, 1 is multi... + int multi_seq_user; // Original user setting (CRAM_OPT_MULTI_SEQ_PER_SLICE) int unsorted; int empty_container; // Marker for EOF block @@ -759,7 +763,9 @@ typedef struct cram_fd { int lossy_read_names; // boolean int tlen_approx; // max TLEN calculation offset. int tlen_zero; // If true, permit tlen 0 (=> tlen calculated) -} cram_fd; + + BGZF *idxfp; // File pointer for on-the-fly index creation +}; // Translation of required fields to cram data series enum cram_fields { @@ -850,4 +856,4 @@ enum cram_fields { } #endif -#endif /* _CRAM_STRUCTS_H_ */ +#endif /* HTSLIB_CRAM_STRUCTS_H */ diff --git a/cram/files.c b/cram/files.c deleted file mode 100644 index f5d489522..000000000 --- a/cram/files.c +++ /dev/null @@ -1,74 +0,0 @@ -/* -Copyright (c) 1994, 1996-1997, 2000, 2003 MEDICAL RESEARCH COUNCIL -All rights reserved - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - 1 Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - - 2 Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. - - 3 Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF -MOLECULAR BIOLOGY nor the names of its contributors may be used to endorse or -promote products derived from this software without specific prior written -permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include - -#include "cram/misc.h" - -#include -#include -/* Alliant's Concentrix is hugely deficient */ -/* Define things we require in this program */ -/* Methinks S_IFMT and S_IFDIR aren't defined in POSIX */ -#ifndef S_ISDIR -#define S_ISDIR(m) (((m)&S_IFMT) == S_IFDIR) -#endif /*!S_ISDIR*/ -#ifndef S_ISREG -#define S_ISREG(m) (((m)&S_IFMT) == S_IFREG) -#endif /*!S_ISREG*/ - -int is_directory(char * fn) -{ - struct stat buf; - if ( stat(fn,&buf) ) return 0; - return S_ISDIR(buf.st_mode); -} - -int is_file(char * fn) -{ - struct stat buf; - if ( stat(fn,&buf) ) return 0; - return S_ISREG(buf.st_mode); -} - -int file_exists(char * fn) -{ - struct stat buf; - return ( stat(fn,&buf) == 0); -} - -int file_size(char * fn) -{ - struct stat buf; - if ( stat(fn,&buf) != 0) return 0; - return buf.st_size; -} - diff --git a/cram/mFILE.c b/cram/mFILE.c index d3e9a1fc7..461d89826 100644 --- a/cram/mFILE.c +++ b/cram/mFILE.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2005-2006, 2008-2009, 2013 Genome Research Ltd. +Copyright (c) 2005-2006, 2008-2009, 2013, 2015, 2017-2019 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -28,6 +28,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include @@ -86,6 +87,8 @@ static char *mfload(FILE *fp, const char *fn, size_t *size, int binary) { if (fn && -1 != stat(fn, &sb)) { data = malloc(allocated = sb.st_size); + if (!data) + return NULL; bufsize = sb.st_size; } else { fn = NULL; @@ -95,7 +98,13 @@ static char *mfload(FILE *fp, const char *fn, size_t *size, int binary) { size_t len; if (used + bufsize > allocated) { allocated += bufsize; - data = realloc(data, allocated); + char *datan = realloc(data, allocated); + if (datan) { + data = datan; + } else { + free(data); + return NULL; + } } len = fread(data + used, 1, allocated - used, fp); if (len > 0) @@ -125,7 +134,7 @@ int mfmmap(mFILE *mf, FILE *fp, const char *fn) { mf->data = mmap(NULL, mf->size, PROT_READ, MAP_SHARED, fileno(fp), 0); - if (!mf->data) + if (!mf->data || mf->data == (void *)-1) return -1; mf->alloced = 0; @@ -298,6 +307,10 @@ mFILE *mfreopen(const char *path, const char *mode_str, FILE *fp) { #endif if (!mf->data) { mf->data = mfload(fp, path, &mf->size, b); + if (!mf->data) { + free(mf); + return NULL; + } mf->alloced = mf->size; if (!a) fseek(fp, 0, SEEK_SET); diff --git a/cram/mFILE.h b/cram/mFILE.h index b0463c765..ca7062c15 100644 --- a/cram/mFILE.h +++ b/cram/mFILE.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2005-2006, 2008-2009 Genome Research Ltd. +Copyright (c) 2005-2006, 2008-2009, 2013, 2018 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -28,8 +28,8 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef _MFILE_H_ -#define _MFILE_H_ +#ifndef CRAM_MFILE_H +#define CRAM_MFILE_H #include @@ -90,4 +90,4 @@ void mfascii(mFILE *mf); } #endif -#endif /* _MFILE_H_ */ +#endif /* CRAM_MFILE_H */ diff --git a/cram/misc.h b/cram/misc.h index 8428608e4..312dc7d22 100644 --- a/cram/misc.h +++ b/cram/misc.h @@ -30,7 +30,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* -Copyright (c) 2003-2013 Genome Research Ltd. +Copyright (c) 2003-2013, 2018-2019 Genome Research Ltd. Author: James Bonfield @@ -60,46 +60,13 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef _misc_h -#define _misc_h - -#include "cram/os.h" - -#include -#include /* varargs needed for v*printf() prototypes */ -#include +#ifndef CRAM_MISC_H +#define CRAM_MISC_H #ifdef __cplusplus extern "C" { #endif -/* - * This informs gcc that crash() doesn't return, so it doesn't need to - * concern itself that code paths going via crash could mean some variables - * being undefined and then issuing uninitialised variable warnings. - * This particularly affected convert. - */ -#ifdef __GNUC__ -# define __NORETURN__ __attribute__ ((__noreturn__)) -#else -# define __NORETURN__ -#endif - -/* - * Used for printf style argument checking. We can request a function such - * as vTcl_SetResult does argument checking, avoiding bugs with using - * %d and passing in a 64-bit record. - */ -#ifdef __GNUC__ -# define __PRINTF_FORMAT__(a,b) __attribute__ ((format (printf, a, b))) -#else -# define __PRINTF_FORMAT__(a,b) -#endif - -extern int is_directory(char * fn); -extern int is_file(char * fn); -extern int file_size(char * fn); - #define MIN(A,B) ( ( (A) < (B) ) ? (A) : (B) ) #define MAX(A,B) ( ( (A) > (B) ) ? 
(A) : (B) ) @@ -107,4 +74,4 @@ extern int file_size(char * fn); } #endif -#endif /*_misc_h*/ +#endif /* CRAM_MISC_H */ diff --git a/cram/open_trace_file.c b/cram/open_trace_file.c index 87566d7eb..71af42972 100644 --- a/cram/open_trace_file.c +++ b/cram/open_trace_file.c @@ -32,7 +32,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* -Copyright (c) 2008, 2009, 2013, 2014 Genome Research Ltd. +Copyright (c) 2008, 2009, 2013, 2014-2015, 2018-2019 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -61,6 +61,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include @@ -80,6 +81,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cram/misc.h" #include "htslib/hfile.h" #include "htslib/hts_log.h" +#include "htslib/hts.h" + +/* + * Returns whether the path refers to a regular file. + */ +static int is_file(char *fn) { + struct stat buf; + if ( stat(fn,&buf) ) return 0; + return S_ISREG(buf.st_mode); +} /* * Tokenises the search path splitting on colons (unix) or semicolons @@ -93,15 +104,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * The returned data has been malloced. It is up to the caller to free this * memory. 
*/ -char *tokenise_search_path(char *searchpath) { +char *tokenise_search_path(const char *searchpath) { char *newsearch; unsigned int i, j; size_t len; -#ifdef _WIN32 - char path_sep = ';'; -#else - char path_sep = ':'; -#endif + char path_sep = HTS_PATH_SEPARATOR_CHAR; if (!searchpath) searchpath=""; @@ -168,45 +175,48 @@ char *tokenise_search_path(char *searchpath) { return newsearch; } -mFILE *find_file_url(char *file, char *url) { - char buf[8192], *cp; +static char *expand_path(const char *file, char *dirname, int max_s_digits); + +mFILE *find_file_url(const char *file, char *url) { + char *path = NULL, buf[8192]; mFILE *mf = NULL; - int maxlen = 8190 - strlen(file), len; - hFILE *hf; - - /* Expand %s for the trace name */ - for (cp = buf; *url && cp - buf < maxlen; url++) { - if (*url == '%' && *(url+1) == 's') { - url++; - cp += strlen(strcpy(cp, file)); - } else { - *cp++ = *url; - } - } - *cp++ = 0; + ssize_t len; + hFILE *hf = NULL; - if (!(hf = hopen(buf, "r"))) { - if (errno != ENOENT) - hts_log_warning("Failed to open reference \"%s\": %s", buf, strerror(errno)); + /* Expand %s for the trace name. 
Only one digit is allowed between + The % and s to avoid ambiguity with percent-encoded URLs */ + + path = expand_path(file, url, 1); + if (!path) return NULL; + + if (!(hf = hopen(path, "r"))) { + if (errno != ENOENT) + hts_log_warning("Failed to open reference \"%s\": %s", path, strerror(errno)); + goto fail; } if (NULL == (mf = mfcreate(NULL, 0))) - return NULL; - while ((len = hread(hf, buf, 8192)) > 0) { + goto fail; + while ((len = hread(hf, buf, sizeof(buf))) > 0) { if (mfwrite(buf, len, 1, mf) <= 0) { hclose_abruptly(hf); - mfdestroy(mf); - return NULL; + goto fail; } } if (hclose(hf) < 0 || len < 0) { - mfdestroy(mf); - return NULL; + hts_log_warning("Failed to read reference \"%s\": %s", path, strerror(errno)); + goto fail; } + free(path); mrewind(mf); return mf; + + fail: + mfdestroy(mf); + free(path); + return NULL; } /* @@ -215,14 +225,16 @@ mFILE *find_file_url(char *file, char *url) { * * Returns expanded pathname or NULL for malloc failure. */ -static char *expand_path(char *file, char *dirname) { +static char *expand_path(const char *file, char *dirname, int max_s_digits) { size_t len = strlen(dirname); size_t lenf = strlen(file); char *cp, *path; path = malloc(len+lenf+2); // worst expansion DIR/FILE - if (!path) + if (!path) { + hts_log_error("Out of memory"); return NULL; + } if (dirname[len-1] == '/') len--; @@ -237,7 +249,7 @@ static char *expand_path(char *file, char *dirname) { while ((cp = strchr(dirname, '%'))) { char *endp; long l = strtol(cp+1, &endp, 10); - if (*endp != 's') { + if (*endp != 's' || endp - cp - 1 > max_s_digits) { strncpy(path_end, dirname, (endp+1)-dirname); path_end += (endp+1)-dirname; dirname = endp+1; @@ -279,11 +291,13 @@ static char *expand_path(char *file, char *dirname) { * Returns mFILE pointer if found * NULL if not */ -static mFILE *find_file_dir(char *file, char *dirname) { +static mFILE *find_file_dir(const char *file, char *dirname) { char *path; mFILE *mf = NULL; - path = expand_path(file, dirname); + 
path = expand_path(file, dirname, INT_MAX); + if (!path) + return NULL; if (is_file(path)) mf = mfopen(path, "rbm"); @@ -312,7 +326,7 @@ static mFILE *find_file_dir(char *file, char *dirname) { * Returns a mFILE pointer when found. * NULL otherwise. */ -mFILE *open_path_mfile(char *file, char *path, char *relative_to) { +mFILE *open_path_mfile(const char *file, char *path, char *relative_to) { char *newsearch; char *ele; mFILE *fp; @@ -387,7 +401,7 @@ mFILE *open_path_mfile(char *file, char *path, char *relative_to) { * Returns the expanded pathname if found. * NULL if not */ -char *find_path(char *file, char *path) { +char *find_path(const char *file, const char *path) { char *newsearch; char *ele; char *outpath = NULL; @@ -407,7 +421,7 @@ char *find_path(char *file, char *path) { !strncmp(ele2, "ftp:", 4)) { continue; } else { - outpath = expand_path(file, ele2); + outpath = expand_path(file, ele2, INT_MAX); if (is_file(outpath)) { free(newsearch); return outpath; diff --git a/cram/open_trace_file.h b/cram/open_trace_file.h index 7e1f29fd9..98b37d505 100644 --- a/cram/open_trace_file.h +++ b/cram/open_trace_file.h @@ -32,7 +32,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* -Copyright (c) 2008, 2009, 2013 Genome Research Ltd. +Copyright (c) 2008, 2009, 2013, 2018 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -61,8 +61,8 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef _OPEN_TRACE_FILE_H_ -#define _OPEN_TRACE_FILE_H_ +#ifndef OPEN_TRACE_FILE_H +#define OPEN_TRACE_FILE_H #include "cram/mFILE.h" @@ -82,7 +82,7 @@ extern "C" { * The returned data has been malloced. It is up to the caller to free this * memory. */ -char *tokenise_search_path(char *searchpath); +char *tokenise_search_path(const char *searchpath); /* * Opens a trace file named 'file'. 
This is initially looked for as a @@ -99,13 +99,13 @@ char *tokenise_search_path(char *searchpath); * Returns a mFILE pointer when found. * NULL otherwise. */ -mFILE *open_path_mfile(char *file, char *path, char *relative_to); +mFILE *open_path_mfile(const char *file, char *path, char *relative_to); /* * Returns a mFILE containing the entire contents of the url; * NULL on failure. */ -mFILE *find_file_url(char *file, char *url); +mFILE *find_file_url(const char *file, char *url); /* @@ -116,10 +116,10 @@ mFILE *find_file_url(char *file, char *url); * Returns the expanded pathname if found. * NULL if not */ -char *find_path(char *file, char *path); +char *find_path(const char *file, const char *path); #ifdef __cplusplus } #endif -#endif /* _OPEN_TRACE_FILE_H_ */ +#endif /* OPEN_TRACE_FILE_H */ diff --git a/cram/os.h b/cram/os.h index e62f6ff6e..91d9617f3 100644 --- a/cram/os.h +++ b/cram/os.h @@ -30,7 +30,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* -Copyright (c) 2004, 2006, 2009-2011, 2013, 2017 Genome Research Ltd. +Copyright (c) 2004, 2006, 2009-2011, 2013, 2017-2018 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -72,11 +72,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* */ -#ifndef _OS_H_ -#define _OS_H_ +#ifndef CRAM_OS_H +#define CRAM_OS_H #include -#include +#include #include "htslib/hts_endian.h" #ifdef __cplusplus @@ -159,22 +159,6 @@ static inline uint16_t le_int2(uint16_t x) { } #endif -/*----------------------------------------------------------------------------- - * definitions, incase they're not present - */ - -#ifndef PRId64 -#define __PRI64__ "l" -#define PRId64 __PRI64__ "d" -#define PRId32 "d" -#define PRId16 "d" -#define PRId8 "d" -#define PRIu64 __PRI64__ "u" -#define PRIu32 "u" -#define PRIu16 "u" -#define PRIu8 "u" -#endif - /*----------------------------------------------------------------------------- * Operating system specifics. * These ought to be done by autoconf, but are legacy code. @@ -217,4 +201,4 @@ static inline uint16_t le_int2(uint16_t x) { } #endif -#endif /*_OS_H_*/ +#endif /* CRAM_OS_H */ diff --git a/cram/pooled_alloc.c b/cram/pooled_alloc.c index c792c3f42..cf2ccb40c 100644 --- a/cram/pooled_alloc.c +++ b/cram/pooled_alloc.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2009 Genome Research Ltd. +Copyright (c) 2009, 2013, 2015, 2018-2019 Genome Research Ltd. Author: Rob Davies Redistribution and use in source and binary forms, with or without @@ -28,6 +28,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include @@ -37,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "cram/pooled_alloc.h" #include "cram/misc.h" +//#define DISABLE_POOLED_ALLOC //#define TEST_MAIN #define PSIZE 1024*1024 @@ -79,6 +81,18 @@ pool_alloc_t *pool_create(size_t dsize) { return p; } +void pool_destroy(pool_alloc_t *p) { + size_t i; + + for (i = 0; i < p->npools; i++) { + free(p->pools[i].pool); + } + free(p->pools); + free(p); +} + +#ifndef DISABLE_POOLED_ALLOC + static pool_t *new_pool(pool_alloc_t *p) { size_t n = p->psize / p->dsize; pool_t *pool; @@ -98,16 +112,6 @@ static pool_t *new_pool(pool_alloc_t *p) { return pool; } -void pool_destroy(pool_alloc_t *p) { - size_t i; - - for (i = 0; i < p->npools; i++) { - free(p->pools[i].pool); - } - free(p->pools); - free(p); -} - void *pool_alloc(pool_alloc_t *p) { pool_t *pool; void *ret; @@ -142,6 +146,18 @@ void pool_free(pool_alloc_t *p, void *ptr) { p->free = ptr; } +#else + +void *pool_alloc(pool_alloc_t *p) { + return malloc(p->dsize); +} + +void pool_free(pool_alloc_t *p, void *ptr) { + free(ptr); +} + +#endif + #ifdef TEST_MAIN typedef struct { int x, y, z; diff --git a/cram/pooled_alloc.h b/cram/pooled_alloc.h index a21e690d9..bb49d11dd 100644 --- a/cram/pooled_alloc.h +++ b/cram/pooled_alloc.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2009 Genome Research Ltd. +Copyright (c) 2009, 2013, 2018 Genome Research Ltd. Author: Rob Davies Redistribution and use in source and binary forms, with or without @@ -28,8 +28,8 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef _POOLED_ALLOC_H_ -#define _POOLED_ALLOC_H_ +#ifndef POOLED_ALLOC_H +#define POOLED_ALLOC_H #include @@ -63,4 +63,4 @@ void pool_free(pool_alloc_t *p, void *ptr); } #endif -#endif /*_POOLED_ALLOC_H_*/ +#endif /* POOLED_ALLOC_H */ diff --git a/cram/rANS_byte.h b/cram/rANS_byte.h index b47bfa4ad..f8bcae248 100644 --- a/cram/rANS_byte.h +++ b/cram/rANS_byte.h @@ -120,10 +120,10 @@ static inline void RansDecInit(RansState* r, uint8_t** pptr) uint32_t x; uint8_t* ptr = *pptr; - x = ptr[0] << 0; - x |= ptr[1] << 8; - x |= ptr[2] << 16; - x |= ptr[3] << 24; + x = ((uint32_t) ptr[0]) << 0; + x |= ((uint32_t) ptr[1]) << 8; + x |= ((uint32_t) ptr[2]) << 16; + x |= ((uint32_t) ptr[3]) << 24; ptr += 4; *pptr = ptr; diff --git a/cram/rANS_static.c b/cram/rANS_static.c index 091d99fe0..f0d0cb25b 100644 --- a/cram/rANS_static.c +++ b/cram/rANS_static.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014 Genome Research Ltd. + * Copyright (c) 2014-2019 Genome Research Ltd. * Author(s): James Bonfield * * Redistribution and use in source and binary forms, with or without @@ -35,6 +35,7 @@ * Author: James Bonfield, Wellcome Trust Sanger Institute. 
2014 */ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include @@ -44,6 +45,7 @@ #include #include #include +#include #include "cram/rANS_static.h" #include "cram/rANS_byte.h" @@ -220,11 +222,16 @@ unsigned char *rans_uncompress_O0(unsigned char *in, unsigned int in_size, if (*in++ != 0) // Order-0 check return NULL; - in_sz = ((in[0])<<0) | ((in[1])<<8) | ((in[2])<<16) | ((in[3])<<24); - out_sz = ((in[4])<<0) | ((in[5])<<8) | ((in[6])<<16) | ((in[7])<<24); + in_sz = ((((uint32_t) in[0])<<0) | (((uint32_t) in[1])<<8) | + (((uint32_t) in[2])<<16) | (((uint32_t) in[3])<<24)); + out_sz = ((((uint32_t) in[4])<<0) | (((uint32_t) in[5])<<8) | + (((uint32_t) in[6])<<16) | (((uint32_t) in[7])<<24)); if (in_sz != in_size-9) return NULL; + if (out_sz >= INT_MAX) + return NULL; // protect against some overflow cases + // Precompute reverse lookup of frequency. rle = x = 0; j = *cp++; @@ -582,11 +589,16 @@ unsigned char *rans_uncompress_O1(unsigned char *in, unsigned int in_size, if (*in++ != 1) // Order-1 check return NULL; - in_sz = ((in[0])<<0) | ((in[1])<<8) | ((in[2])<<16) | ((in[3])<<24); - out_sz = ((in[4])<<0) | ((in[5])<<8) | ((in[6])<<16) | ((in[7])<<24); + in_sz = ((((uint32_t) in[0])<<0) | (((uint32_t) in[1])<<8) | + (((uint32_t) in[2])<<16) | (((uint32_t) in[3])<<24)); + out_sz = ((((uint32_t) in[4])<<0) | (((uint32_t) in[5])<<8) | + (((uint32_t) in[6])<<16) | (((uint32_t) in[7])<<24)); if (in_sz != in_size-9) return NULL; + if (out_sz >= INT_MAX) + return NULL; // protect against some overflow cases + // calloc may add 2% overhead to CRAM decode, but on linux with glibc it's // often the same thing due to using mmap. D = calloc(256, sizeof(*D)); diff --git a/cram/sam_header.c b/cram/sam_header.c deleted file mode 100644 index c4a40cb33..000000000 --- a/cram/sam_header.c +++ /dev/null @@ -1,1269 +0,0 @@ -/* -Copyright (c) 2013 Genome Research Ltd. 
-Author: James Bonfield - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. - - 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger -Institute nor the names of its contributors may be used to endorse or promote -products derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*/ - -#include - -#include -#include - -#include "htslib/hts_log.h" -#include "cram/sam_header.h" -#include "cram/string_alloc.h" - -static void sam_hdr_error(char *msg, char *line, int len, int lno) { - int j; - - for (j = 0; j < len && line[j] != '\n'; j++) - ; - hts_log_error("%s at line %d: \"%.*s\"", msg, lno, j, line); -} - -void sam_hdr_dump(SAM_hdr *hdr) { - khint_t k; - int i; - - printf("===DUMP===\n"); - for (k = kh_begin(hdr->h); k != kh_end(hdr->h); k++) { - SAM_hdr_type *t1, *t2; - char c[2]; - - if (!kh_exist(hdr->h, k)) - continue; - - t1 = t2 = kh_val(hdr->h, k); - c[0] = kh_key(hdr->h, k)>>8; - c[1] = kh_key(hdr->h, k)&0xff; - printf("Type %.2s, count %d\n", c, t1->prev->order+1); - - do { - SAM_hdr_tag *tag; - printf(">>>%d ", t1->order); - for (tag = t1->tag; tag; tag=tag->next) { - printf("\"%.2s\":\"%.*s\"\t", - tag->str, tag->len-3, tag->str+3); - } - putchar('\n'); - t1 = t1->next; - } while (t1 != t2); - } - - /* Dump out PG chains */ - printf("\n@PG chains:\n"); - for (i = 0; i < hdr->npg_end; i++) { - int j; - printf(" %d:", i); - for (j = hdr->pg_end[i]; j != -1; j = hdr->pg[j].prev_id) { - printf("%s%d(%.*s)", - j == hdr->pg_end[i] ? " " : "->", - j, hdr->pg[j].name_len, hdr->pg[j].name); - } - printf("\n"); - } - - puts("===END DUMP==="); -} - -/* Updates the hash tables in the SAM_hdr structure. - * - * Returns 0 on success; - * -1 on failure - */ -static int sam_hdr_update_hashes(SAM_hdr *sh, - int type, - SAM_hdr_type *h_type) { - /* Add to reference hash? 
*/ - if ((type>>8) == 'S' && (type&0xff) == 'Q') { - SAM_hdr_tag *tag; - SAM_SQ *new_ref; - int nref = sh->nref; - - new_ref = realloc(sh->ref, (sh->nref+1)*sizeof(*sh->ref)); - if (!new_ref) - return -1; - sh->ref = new_ref; - - tag = h_type->tag; - sh->ref[nref].name = NULL; - sh->ref[nref].len = 0; - sh->ref[nref].ty = h_type; - sh->ref[nref].tag = tag; - - while (tag) { - if (tag->str[0] == 'S' && tag->str[1] == 'N') { - if (!(sh->ref[nref].name = malloc(tag->len))) - return -1; - strncpy(sh->ref[nref].name, tag->str+3, tag->len-3); - sh->ref[nref].name[tag->len-3] = 0; - } else if (tag->str[0] == 'L' && tag->str[1] == 'N') { - sh->ref[nref].len = atoi(tag->str+3); - } - tag = tag->next; - } - - if (sh->ref[nref].name) { - khint_t k; - int r; - k = kh_put(m_s2i, sh->ref_hash, sh->ref[nref].name, &r); - if (-1 == r) return -1; - kh_val(sh->ref_hash, k) = nref; - } else { - return -1; // SN should be present, according to spec. - } - - sh->nref++; - } - - /* Add to read-group hash? */ - if ((type>>8) == 'R' && (type&0xff) == 'G') { - SAM_hdr_tag *tag; - SAM_RG *new_rg; - int nrg = sh->nrg; - - new_rg = realloc(sh->rg, (sh->nrg+1)*sizeof(*sh->rg)); - if (!new_rg) - return -1; - sh->rg = new_rg; - - tag = h_type->tag; - sh->rg[nrg].name = NULL; - sh->rg[nrg].name_len = 0; - sh->rg[nrg].ty = h_type; - sh->rg[nrg].tag = tag; - sh->rg[nrg].id = nrg; - - while (tag) { - if (tag->str[0] == 'I' && tag->str[1] == 'D') { - if (!(sh->rg[nrg].name = malloc(tag->len))) - return -1; - strncpy(sh->rg[nrg].name, tag->str+3, tag->len-3); - sh->rg[nrg].name[tag->len-3] = 0; - sh->rg[nrg].name_len = strlen(sh->rg[nrg].name); - } - tag = tag->next; - } - - if (sh->rg[nrg].name) { - khint_t k; - int r; - k = kh_put(m_s2i, sh->rg_hash, sh->rg[nrg].name, &r); - if (-1 == r) return -1; - kh_val(sh->rg_hash, k) = nrg; - } else { - return -1; // ID should be present, according to spec. - } - - sh->nrg++; - } - - /* Add to program hash? 
*/ - if ((type>>8) == 'P' && (type&0xff) == 'G') { - SAM_hdr_tag *tag; - SAM_PG *new_pg; - int npg = sh->npg; - - new_pg = realloc(sh->pg, (sh->npg+1)*sizeof(*sh->pg)); - if (!new_pg) - return -1; - sh->pg = new_pg; - - tag = h_type->tag; - sh->pg[npg].name = NULL; - sh->pg[npg].name_len = 0; - sh->pg[npg].ty = h_type; - sh->pg[npg].tag = tag; - sh->pg[npg].id = npg; - sh->pg[npg].prev_id = -1; - - while (tag) { - if (tag->str[0] == 'I' && tag->str[1] == 'D') { - if (!(sh->pg[npg].name = malloc(tag->len))) - return -1; - strncpy(sh->pg[npg].name, tag->str+3, tag->len-3); - sh->pg[npg].name[tag->len-3] = 0; - sh->pg[npg].name_len = strlen(sh->pg[npg].name); - } else if (tag->str[0] == 'P' && tag->str[1] == 'P') { - // Resolve later if needed - khint_t k; - char tmp = tag->str[tag->len]; tag->str[tag->len] = 0; - k = kh_get(m_s2i, sh->pg_hash, tag->str+3); - tag->str[tag->len] = tmp; - - if (k != kh_end(sh->pg_hash)) { - int p_id = kh_val(sh->pg_hash, k); - sh->pg[npg].prev_id = sh->pg[p_id].id; - - /* Unmark previous entry as a PG termination */ - if (sh->npg_end > 0 && - sh->pg_end[sh->npg_end-1] == p_id) { - sh->npg_end--; - } else { - int i; - for (i = 0; i < sh->npg_end; i++) { - if (sh->pg_end[i] == p_id) { - memmove(&sh->pg_end[i], &sh->pg_end[i+1], - (sh->npg_end-i-1)*sizeof(*sh->pg_end)); - sh->npg_end--; - } - } - } - } else { - sh->pg[npg].prev_id = -1; - } - } - tag = tag->next; - } - - if (sh->pg[npg].name) { - khint_t k; - int r; - k = kh_put(m_s2i, sh->pg_hash, sh->pg[npg].name, &r); - if (-1 == r) return -1; - kh_val(sh->pg_hash, k) = npg; - } else { - return -1; // ID should be present, according to spec. - } - - /* Add to npg_end[] array. Remove later if we find a PP line */ - if (sh->npg_end >= sh->npg_end_alloc) { - int *new_pg_end; - int new_alloc = sh->npg_end_alloc ? 
sh->npg_end_alloc*2 : 4; - - new_pg_end = realloc(sh->pg_end, new_alloc * sizeof(int)); - if (!new_pg_end) - return -1; - sh->npg_end_alloc = new_alloc; - sh->pg_end = new_pg_end; - } - sh->pg_end[sh->npg_end++] = npg; - - sh->npg++; - } - - return 0; -} - -/* - * Appends a formatted line to an existing SAM header. - * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with - * optional new-line. If it contains more than 1 line then multiple lines - * will be added in order. - * - * Input text is of maximum length len or as terminated earlier by a NUL. - * Len may be 0 if unknown, in which case lines must be NUL-terminated. - * - * Returns 0 on success - * -1 on failure - */ -int sam_hdr_add_lines(SAM_hdr *sh, const char *lines, int len) { - int i, lno, text_offset; - char *hdr; - - if (!len) - len = strlen(lines); - - text_offset = ks_len(&sh->text); - if (EOF == kputsn(lines, len, &sh->text)) - return -1; - hdr = ks_str(&sh->text) + text_offset; - - for (i = 0, lno = 1; i < len && hdr[i] != '\0'; i++, lno++) { - khint32_t type; - khint_t k; - - int l_start = i, new; - SAM_hdr_type *h_type; - SAM_hdr_tag *h_tag, *last; - - if (hdr[i] != '@') { - int j; - for (j = i; j < len && hdr[j] != '\0' && hdr[j] != '\n'; j++) - ; - sam_hdr_error("Header line does not start with '@'", - &hdr[l_start], len - l_start, lno); - return -1; - } - - type = (hdr[i+1]<<8) | hdr[i+2]; - if (hdr[i+1] < 'A' || hdr[i+1] > 'z' || - hdr[i+2] < 'A' || hdr[i+2] > 'z') { - sam_hdr_error("Header line does not have a two character key", - &hdr[l_start], len - l_start, lno); - return -1; - } - - i += 3; - if (hdr[i] == '\n') - continue; - - // Add the header line type - if (!(h_type = pool_alloc(sh->type_pool))) - return -1; - if (-1 == (k = kh_put(sam_hdr, sh->h, type, &new))) - return -1; - - // Form the ring, either with self or other lines of this type - if (!new) { - SAM_hdr_type *t = kh_val(sh->h, k), *p; - p = t->prev; - - assert(p->next == t); - p->next = h_type; - h_type->prev = 
p; - - t->prev = h_type; - h_type->next = t; - h_type->order = p->order+1; - } else { - kh_val(sh->h, k) = h_type; - h_type->prev = h_type->next = h_type; - h_type->order = 0; - } - - // Parse the tags on this line - last = NULL; - if ((type>>8) == 'C' && (type&0xff) == 'O') { - int j; - if (hdr[i] != '\t') { - sam_hdr_error("Missing tab", - &hdr[l_start], len - l_start, lno); - return -1; - } - - for (j = ++i; j < len && hdr[j] != '\0' && hdr[j] != '\n'; j++) - ; - - if (!(h_type->tag = h_tag = pool_alloc(sh->tag_pool))) - return -1; - h_tag->str = string_ndup(sh->str_pool, &hdr[i], j-i); - h_tag->len = j-i; - h_tag->next = NULL; - if (!h_tag->str) - return -1; - - i = j; - - } else { - do { - int j; - if (hdr[i] != '\t') { - sam_hdr_error("Missing tab", - &hdr[l_start], len - l_start, lno); - return -1; - } - - for (j = ++i; j < len && hdr[j] != '\0' && hdr[j] != '\n' && hdr[j] != '\t'; j++) - ; - - if (!(h_tag = pool_alloc(sh->tag_pool))) - return -1; - h_tag->str = string_ndup(sh->str_pool, &hdr[i], j-i); - h_tag->len = j-i; - h_tag->next = NULL; - if (!h_tag->str) - return -1; - - if (h_tag->len < 3 || h_tag->str[2] != ':') { - sam_hdr_error("Malformed key:value pair", - &hdr[l_start], len - l_start, lno); - return -1; - } - - if (last) - last->next = h_tag; - else - h_type->tag = h_tag; - - last = h_tag; - i = j; - } while (i < len && hdr[i] != '\0' && hdr[i] != '\n'); - } - - /* Update RG/SQ hashes */ - if (-1 == sam_hdr_update_hashes(sh, type, h_type)) - return -1; - } - - return 0; -} - -/* - * Adds a single line to a SAM header. - * Specify type and one or more key,value pairs, ending with the NULL key. - * Eg. sam_hdr_add(h, "SQ", "ID", "foo", "LN", "100", NULL). - * - * Returns index for specific entry on success (eg 2nd SQ, 4th RG) - * -1 on failure - */ -int sam_hdr_add(SAM_hdr *sh, const char *type, ...) { - va_list args; - va_start(args, type); - return sam_hdr_vadd(sh, type, args, NULL); -} - -/* - * sam_hdr_add with a va_list interface. 
- * - * Note: this function invokes va_arg at least once, making the value - * of ap indeterminate after the return. The caller should call - * va_start/va_end before/after calling this function or use va_copy. - */ -int sam_hdr_vadd(SAM_hdr *sh, const char *type, va_list ap, ...) { - va_list args; - SAM_hdr_type *h_type; - SAM_hdr_tag *h_tag, *last; - int new; - khint32_t type_i = (type[0]<<8) | type[1], k; - - if (EOF == kputc_('@', &sh->text)) - return -1; - if (EOF == kputsn(type, 2, &sh->text)) - return -1; - - if (!(h_type = pool_alloc(sh->type_pool))) - return -1; - if (-1 == (k = kh_put(sam_hdr, sh->h, type_i, &new))) - return -1; - - // Form the ring, either with self or other lines of this type - if (!new) { - SAM_hdr_type *t = kh_val(sh->h, k), *p; - p = t->prev; - - assert(p->next == t); - p->next = h_type; - h_type->prev = p; - - t->prev = h_type; - h_type->next = t; - h_type->order = p->order + 1; - } else { - kh_val(sh->h, k) = h_type; - h_type->prev = h_type->next = h_type; - h_type->order = 0; - } - - last = NULL; - - // Any ... 
varargs - va_start(args, ap); - for (;;) { - char *k, *v; - int idx; - - if (!(k = (char *)va_arg(args, char *))) - break; - v = va_arg(args, char *); - - if (EOF == kputc_('\t', &sh->text)) - return -1; - - if (!(h_tag = pool_alloc(sh->tag_pool))) - return -1; - idx = ks_len(&sh->text); - - if (EOF == kputs(k, &sh->text)) - return -1; - if (EOF == kputc_(':', &sh->text)) - return -1; - if (EOF == kputs(v, &sh->text)) - return -1; - - h_tag->len = ks_len(&sh->text) - idx; - h_tag->str = string_ndup(sh->str_pool, - ks_str(&sh->text) + idx, - h_tag->len); - h_tag->next = NULL; - if (!h_tag->str) - return -1; - - if (last) - last->next = h_tag; - else - h_type->tag = h_tag; - - last = h_tag; - } - va_end(args); - - // Plus the specified va_list params - for (;;) { - char *k, *v; - int idx; - - if (!(k = (char *)va_arg(ap, char *))) - break; - v = va_arg(ap, char *); - - if (EOF == kputc_('\t', &sh->text)) - return -1; - - if (!(h_tag = pool_alloc(sh->tag_pool))) - return -1; - idx = ks_len(&sh->text); - - if (EOF == kputs(k, &sh->text)) - return -1; - if (EOF == kputc_(':', &sh->text)) - return -1; - if (EOF == kputs(v, &sh->text)) - return -1; - - h_tag->len = ks_len(&sh->text) - idx; - h_tag->str = string_ndup(sh->str_pool, - ks_str(&sh->text) + idx, - h_tag->len); - h_tag->next = NULL; - if (!h_tag->str) - return -1; - - if (last) - last->next = h_tag; - else - h_type->tag = h_tag; - - last = h_tag; - } - va_end(ap); - - if (EOF == kputc('\n', &sh->text)) - return -1; - - int itype = (type[0]<<8) | type[1]; - if (-1 == sam_hdr_update_hashes(sh, itype, h_type)) - return -1; - - return h_type->order; -} - -/* - * Returns the first header item matching 'type'. If ID is non-NULL it checks - * for the tag ID: and compares against the specified ID. 
- * - * Returns NULL if no type/ID is found - */ -SAM_hdr_type *sam_hdr_find(SAM_hdr *hdr, char *type, - char *ID_key, char *ID_value) { - SAM_hdr_type *t1, *t2; - int itype = (type[0]<<8)|(type[1]); - khint_t k; - - /* Special case for types we have prebuilt hashes on */ - if (ID_key) { - if (type[0] == 'S' && type[1] == 'Q' && - ID_key[0] == 'S' && ID_key[1] == 'N') { - k = kh_get(m_s2i, hdr->ref_hash, ID_value); - return k != kh_end(hdr->ref_hash) - ? hdr->ref[kh_val(hdr->ref_hash, k)].ty - : NULL; - } - - if (type[0] == 'R' && type[1] == 'G' && - ID_key[0] == 'I' && ID_key[1] == 'D') { - k = kh_get(m_s2i, hdr->rg_hash, ID_value); - return k != kh_end(hdr->rg_hash) - ? hdr->rg[kh_val(hdr->rg_hash, k)].ty - : NULL; - } - - if (type[0] == 'P' && type[1] == 'G' && - ID_key[0] == 'I' && ID_key[1] == 'D') { - k = kh_get(m_s2i, hdr->pg_hash, ID_value); - return k != kh_end(hdr->pg_hash) - ? hdr->pg[kh_val(hdr->pg_hash, k)].ty - : NULL; - } - } - - k = kh_get(sam_hdr, hdr->h, itype); - if (k == kh_end(hdr->h)) - return NULL; - - if (!ID_key) - return kh_val(hdr->h, k); - - t1 = t2 = kh_val(hdr->h, k); - do { - SAM_hdr_tag *tag; - for (tag = t1->tag; tag; tag = tag->next) { - if (tag->str[0] == ID_key[0] && tag->str[1] == ID_key[1]) { - char *cp1 = tag->str+3; - char *cp2 = ID_value; - while (*cp1 && *cp1 == *cp2) - cp1++, cp2++; - if (*cp2 || *cp1) - continue; - return t1; - } - } - t1 = t1->next; - } while (t1 != t2); - - return NULL; -} - -/* - * As per SAM_hdr_type, but returns a complete line of formatted text - * for a specific head type/ID combination. If ID is NULL then it returns - * the first line of the specified type. - * - * The returned string is malloced and should be freed by the calling - * function with free(). - * - * Returns NULL if no type/ID is found. 
- */ -char *sam_hdr_find_line(SAM_hdr *hdr, char *type, - char *ID_key, char *ID_value) { - SAM_hdr_type *ty = sam_hdr_find(hdr, type, ID_key, ID_value); - kstring_t ks = KS_INITIALIZER; - SAM_hdr_tag *tag; - int r = 0; - - if (!ty) - return NULL; - - // Paste together the line from the hashed copy - r |= (kputc_('@', &ks) == EOF); - r |= (kputs(type, &ks) == EOF); - for (tag = ty->tag; tag; tag = tag->next) { - r |= (kputc_('\t', &ks) == EOF); - r |= (kputsn(tag->str, tag->len, &ks) == EOF); - } - - if (r) { - KS_FREE(&ks); - return NULL; - } - - return ks_str(&ks); -} - - -/* - * Looks for a specific key in a single sam header line. - * If prev is non-NULL it also fills this out with the previous tag, to - * permit use in key removal. *prev is set to NULL when the tag is the first - * key in the list. When a tag isn't found, prev (if non NULL) will be the last - * tag in the existing list. - * - * Returns the tag pointer on success - * NULL on failure - */ -SAM_hdr_tag *sam_hdr_find_key(SAM_hdr *sh, - SAM_hdr_type *type, - char *key, - SAM_hdr_tag **prev) { - SAM_hdr_tag *tag, *p = NULL; - - for (tag = type->tag; tag; p = tag, tag = tag->next) { - if (tag->str[0] == key[0] && tag->str[1] == key[1]) { - if (prev) - *prev = p; - return tag; - } - } - - if (prev) - *prev = p; - - return NULL; -} - - -/* - * Adds or updates tag key,value pairs in a header line. - * Eg for adding M5 tags to @SQ lines or updating sort order for the - * @HD line (although use the sam_hdr_sort_order() function for - * HD manipulation, which is a wrapper around this funuction). - * - * Specify multiple key,value pairs ending in NULL. - * - * Returns 0 on success - * -1 on failure - */ -int sam_hdr_update(SAM_hdr *hdr, SAM_hdr_type *type, ...) 
{ - va_list ap; - - va_start(ap, type); - - for (;;) { - char *k, *v; - int idx; - SAM_hdr_tag *tag, *prev; - - if (!(k = (char *)va_arg(ap, char *))) - break; - v = va_arg(ap, char *); - - tag = sam_hdr_find_key(hdr, type, k, &prev); - if (!tag) { - if (!(tag = pool_alloc(hdr->tag_pool))) - return -1; - if (prev) - prev->next = tag; - else - type->tag = tag; - - tag->next = NULL; - } - - idx = ks_len(&hdr->text); - if (ksprintf(&hdr->text, "%2.2s:%s", k, v) < 0) - return -1; - tag->len = ks_len(&hdr->text) - idx; - tag->str = string_ndup(hdr->str_pool, - ks_str(&hdr->text) + idx, - tag->len); - if (!tag->str) - return -1; - } - - va_end(ap); - - return 0; -} - -#define K(a) (((a)[0]<<8)|((a)[1])) - -/* - * Returns the sort order: - */ -enum sam_sort_order sam_hdr_sort_order(SAM_hdr *hdr) { - return hdr->sort_order; -} - -static enum sam_sort_order sam_hdr_parse_sort_order(SAM_hdr *hdr) { - khint_t k; - enum sam_sort_order so; - - so = ORDER_UNKNOWN; - k = kh_get(sam_hdr, hdr->h, K("HD")); - if (k != kh_end(hdr->h)) { - SAM_hdr_type *ty = kh_val(hdr->h, k); - SAM_hdr_tag *tag; - for (tag = ty->tag; tag; tag = tag->next) { - if (tag->str[0] == 'S' && tag->str[1] == 'O') { - if (strcmp(tag->str+3, "unsorted") == 0) - so = ORDER_UNSORTED; - else if (strcmp(tag->str+3, "queryname") == 0) - so = ORDER_NAME; - else if (strcmp(tag->str+3, "coordinate") == 0) - so = ORDER_COORD; - else if (strcmp(tag->str+3, "unknown") != 0) - hts_log_error("Unknown sort order field: %s", tag->str+3); - } - } - } - - return so; -} - - -/* - * Reconstructs the kstring from the header hash table. 
- * Returns 0 on success - * -1 on failure - */ -int sam_hdr_rebuild(SAM_hdr *hdr) { - /* Order: HD then others */ - kstring_t ks = KS_INITIALIZER; - khint_t k; - - - k = kh_get(sam_hdr, hdr->h, K("HD")); - if (k != kh_end(hdr->h)) { - SAM_hdr_type *ty = kh_val(hdr->h, k); - SAM_hdr_tag *tag; - if (EOF == kputs("@HD", &ks)) - return -1; - for (tag = ty->tag; tag; tag = tag->next) { - if (EOF == kputc_('\t', &ks)) - return -1; - if (EOF == kputsn_(tag->str, tag->len, &ks)) - return -1; - } - if (EOF == kputc('\n', &ks)) - return -1; - } - - for (k = kh_begin(hdr->h); k != kh_end(hdr->h); k++) { - SAM_hdr_type *t1, *t2; - - if (!kh_exist(hdr->h, k)) - continue; - - if (kh_key(hdr->h, k) == K("HD")) - continue; - - t1 = t2 = kh_val(hdr->h, k); - do { - SAM_hdr_tag *tag; - char c[2]; - - if (EOF == kputc_('@', &ks)) - return -1; - c[0] = kh_key(hdr->h, k)>>8; - c[1] = kh_key(hdr->h, k)&0xff; - if (EOF == kputsn_(c, 2, &ks)) - return -1; - for (tag = t1->tag; tag; tag=tag->next) { - if (EOF == kputc_('\t', &ks)) - return -1; - if (EOF == kputsn_(tag->str, tag->len, &ks)) - return -1; - } - if (EOF == kputc('\n', &ks)) - return -1; - t1 = t1->next; - } while (t1 != t2); - } - - if (ks_str(&hdr->text)) - KS_FREE(&hdr->text); - - hdr->text = ks; - - return 0; -} - - -/* - * Creates an empty SAM header, ready to be populated. 
- * - * Returns a SAM_hdr struct on success (free with sam_hdr_free()) - * NULL on failure - */ -SAM_hdr *sam_hdr_new() { - SAM_hdr *sh = calloc(1, sizeof(*sh)); - - if (!sh) - return NULL; - - sh->h = kh_init(sam_hdr); - if (!sh->h) - goto err; - - sh->ID_cnt = 1; - sh->ref_count = 1; - - sh->nref = 0; - sh->ref = NULL; - if (!(sh->ref_hash = kh_init(m_s2i))) - goto err; - - sh->nrg = 0; - sh->rg = NULL; - if (!(sh->rg_hash = kh_init(m_s2i))) - goto err; - - sh->npg = 0; - sh->pg = NULL; - sh->npg_end = sh->npg_end_alloc = 0; - sh->pg_end = NULL; - if (!(sh->pg_hash = kh_init(m_s2i))) - goto err; - - KS_INIT(&sh->text); - - if (!(sh->tag_pool = pool_create(sizeof(SAM_hdr_tag)))) - goto err; - - if (!(sh->type_pool = pool_create(sizeof(SAM_hdr_type)))) - goto err; - - if (!(sh->str_pool = string_pool_create(8192))) - goto err; - - return sh; - - err: - if (sh->h) - kh_destroy(sam_hdr, sh->h); - - if (sh->tag_pool) - pool_destroy(sh->tag_pool); - - if (sh->type_pool) - pool_destroy(sh->type_pool); - - if (sh->str_pool) - string_pool_destroy(sh->str_pool); - - free(sh); - - return NULL; -} - - -/* - * Tokenises a SAM header into a hash table. - * Also extracts a few bits on specific data types, such as @RG lines. 
- * - * Returns a SAM_hdr struct on success (free with sam_hdr_free()) - * NULL on failure - */ -SAM_hdr *sam_hdr_parse_(const char *hdr, int len) { - /* Make an empty SAM_hdr */ - SAM_hdr *sh; - - sh = sam_hdr_new(); - if (NULL == sh) return NULL; - - if (NULL == hdr) return sh; // empty header is permitted - - /* Parse the header, line by line */ - if (-1 == sam_hdr_add_lines(sh, hdr, len)) { - sam_hdr_free(sh); - return NULL; - } - - /* Obtain sort order */ - sh->sort_order = sam_hdr_parse_sort_order(sh); - - //sam_hdr_dump(sh); - //sam_hdr_add(sh, "RG", "ID", "foo", "SM", "bar", NULL); - //sam_hdr_rebuild(sh); - //printf(">>%s<<", ks_str(sh->text)); - - //parse_references(sh); - //parse_read_groups(sh); - - sam_hdr_link_pg(sh); - //sam_hdr_dump(sh); - - return sh; -} - -/* - * Produces a duplicate copy of hdr and returns it. - * Returns NULL on failure - */ -SAM_hdr *sam_hdr_dup(SAM_hdr *hdr) { - if (-1 == sam_hdr_rebuild(hdr)) - return NULL; - - return sam_hdr_parse_(sam_hdr_str(hdr), sam_hdr_length(hdr)); -} - -/*! Increments a reference count on hdr. - * - * This permits multiple files to share the same header, all calling - * sam_hdr_free when done, without causing errors for other open files. - */ -void sam_hdr_incr_ref(SAM_hdr *hdr) { - hdr->ref_count++; -} - -/*! Increments a reference count on hdr. - * - * This permits multiple files to share the same header, all calling - * sam_hdr_free when done, without causing errors for other open files. - * - * If the reference count hits zero then the header is automatically - * freed. This makes it a synonym for sam_hdr_free(). - */ -void sam_hdr_decr_ref(SAM_hdr *hdr) { - sam_hdr_free(hdr); -} - -/*! Deallocates all storage used by a SAM_hdr struct. - * - * This also decrements the header reference count. If after decrementing - * it is still non-zero then the header is assumed to be in use by another - * caller and the free is not done. - * - * This is a synonym for sam_hdr_dec_ref(). 
- */ -void sam_hdr_free(SAM_hdr *hdr) { - if (!hdr) - return; - - if (--hdr->ref_count > 0) - return; - - if (ks_str(&hdr->text)) - KS_FREE(&hdr->text); - - if (hdr->h) - kh_destroy(sam_hdr, hdr->h); - - if (hdr->ref_hash) - kh_destroy(m_s2i, hdr->ref_hash); - - if (hdr->ref) { - int i; - for (i = 0; i < hdr->nref; i++) - if (hdr->ref[i].name) - free(hdr->ref[i].name); - free(hdr->ref); - } - - if (hdr->rg_hash) - kh_destroy(m_s2i, hdr->rg_hash); - - if (hdr->rg) { - int i; - for (i = 0; i < hdr->nrg; i++) - if (hdr->rg[i].name) - free(hdr->rg[i].name); - free(hdr->rg); - } - - if (hdr->pg_hash) - kh_destroy(m_s2i, hdr->pg_hash); - - if (hdr->pg) { - int i; - for (i = 0; i < hdr->npg; i++) - if (hdr->pg[i].name) - free(hdr->pg[i].name); - free(hdr->pg); - } - - if (hdr->pg_end) - free(hdr->pg_end); - - if (hdr->type_pool) - pool_destroy(hdr->type_pool); - - if (hdr->tag_pool) - pool_destroy(hdr->tag_pool); - - if (hdr->str_pool) - string_pool_destroy(hdr->str_pool); - - free(hdr); -} - -int sam_hdr_length(SAM_hdr *hdr) { - return ks_len(&hdr->text); -} - -char *sam_hdr_str(SAM_hdr *hdr) { - return ks_str(&hdr->text); -} - -/* - * Looks up a reference sequence by name and returns the numerical ID. - * Returns -1 if unknown reference. - */ -int sam_hdr_name2ref(SAM_hdr *hdr, const char *ref) { - khint_t k = kh_get(m_s2i, hdr->ref_hash, ref); - return k == kh_end(hdr->ref_hash) ? -1 : kh_val(hdr->ref_hash, k); -} - -/* - * Looks up a read-group by name and returns a pointer to the start of the - * associated tag list. - * - * Returns NULL on failure - */ -SAM_RG *sam_hdr_find_rg(SAM_hdr *hdr, const char *rg) { - khint_t k = kh_get(m_s2i, hdr->rg_hash, rg); - return k == kh_end(hdr->rg_hash) - ? NULL - : &hdr->rg[kh_val(hdr->rg_hash, k)]; -} - - -/* - * Fixes any PP links in @PG headers. - * If the entries are in order then this doesn't need doing, but incase - * our header is out of order this goes through the sh->pg[] array - * setting the prev_id field. 
- * - * Note we can have multiple complete chains. This code should identify the - * tails of these chains as these are the entries we have to link to in - * subsequent PP records. - * - * Returns 0 on sucess - * -1 on failure (indicating broken PG/PP records) - */ -int sam_hdr_link_pg(SAM_hdr *hdr) { - int i, j, ret = 0; - - hdr->npg_end_alloc = hdr->npg; - hdr->pg_end = realloc(hdr->pg_end, hdr->npg * sizeof(*hdr->pg_end)); - if (!hdr->pg_end) - return -1; - - for (i = 0; i < hdr->npg; i++) - hdr->pg_end[i] = i; - - for (i = 0; i < hdr->npg; i++) { - khint_t k; - SAM_hdr_tag *tag; - char tmp; - - for (tag = hdr->pg[i].tag; tag; tag = tag->next) { - if (tag->str[0] == 'P' && tag->str[1] == 'P') - break; - } - if (!tag) { - /* Chain start points */ - continue; - } - - tmp = tag->str[tag->len]; tag->str[tag->len] = 0; - k = kh_get(m_s2i, hdr->pg_hash, tag->str+3); - tag->str[tag->len] = tmp; - - if (k == kh_end(hdr->pg_hash)) { - ret = -1; - continue; - } - - hdr->pg[i].prev_id = hdr->pg[kh_val(hdr->pg_hash, k)].id; - hdr->pg_end[kh_val(hdr->pg_hash, k)] = -1; - } - - for (i = j = 0; i < hdr->npg; i++) { - if (hdr->pg_end[i] != -1) - hdr->pg_end[j++] = hdr->pg_end[i]; - } - hdr->npg_end = j; - - return ret; -} - -/* - * Returns a unique ID from a base name. - * - * The value returned is valid until the next call to - * this function. - */ -const char *sam_hdr_PG_ID(SAM_hdr *sh, const char *name) { - khint_t k = kh_get(m_s2i, sh->pg_hash, name); - if (k == kh_end(sh->pg_hash)) - return name; - - do { - sprintf(sh->ID_buf, "%.1000s.%d", name, sh->ID_cnt++); - k = kh_get(m_s2i, sh->pg_hash, sh->ID_buf); - } while (k != kh_end(sh->pg_hash)); - - return sh->ID_buf; -} - -/* - * Add an @PG line. - * - * If we wish complete control over this use sam_hdr_add() directly. This - * function uses that, but attempts to do a lot of tedious house work for - * you too. - * - * - It will generate a suitable ID if the supplied one clashes. 
- * - It will generate multiple @PG records if we have multiple PG chains. - * - * Call it as per sam_hdr_add() with a series of key,value pairs ending - * in NULL. - * - * Returns 0 on success - * -1 on failure - */ -int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...) { - va_list args; - - if (sh->npg_end) { - /* Copy ends array to avoid us looping while modifying it */ - int *end = malloc(sh->npg_end * sizeof(int)); - int i, nends = sh->npg_end; - - if (!end) - return -1; - - memcpy(end, sh->pg_end, nends * sizeof(*end)); - - for (i = 0; i < nends; i++) { - va_start(args, name); - if (-1 == sam_hdr_vadd(sh, "PG", args, - "ID", sam_hdr_PG_ID(sh, name), - "PN", name, - "PP", sh->pg[end[i]].name, - NULL)) { - free(end); - return -1; - } - va_end(args); - } - - free(end); - } else { - va_start(args, name); - if (-1 == sam_hdr_vadd(sh, "PG", args, - "ID", sam_hdr_PG_ID(sh, name), - "PN", name, - NULL)) - return -1; - va_end(args); - } - - //sam_hdr_dump(sh); - - return 0; -} - -/* - * A function to help with construction of CL tags in @PG records. - * Takes an argc, argv pair and returns a single space-separated string. - * This string should be deallocated by the calling function. - * - * Returns malloced char * on success - * NULL on failure - */ -char *stringify_argv(int argc, char *argv[]) { - char *str, *cp; - size_t nbytes = 1; - int i, j; - - /* Allocate */ - for (i = 0; i < argc; i++) { - if (i > 0) nbytes += 1; - nbytes += strlen(argv[i]); - } - if (!(str = malloc(nbytes))) - return NULL; - - /* Copy */ - cp = str; - for (i = 0; i < argc; i++) { - if (i > 0) *cp++ = ' '; - j = 0; - while (argv[i][j]) { - if (argv[i][j] == '\t') - *cp++ = ' '; - else - *cp++ = argv[i][j]; - j++; - } - } - *cp++ = 0; - - return str; -} diff --git a/cram/sam_header.h b/cram/sam_header.h deleted file mode 100644 index d6c0e30f9..000000000 --- a/cram/sam_header.h +++ /dev/null @@ -1,459 +0,0 @@ -/* -Copyright (c) 2013-2014 Genome Research Ltd. 
-Author: James Bonfield - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. - - 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger -Institute nor the names of its contributors may be used to endorse or promote -products derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -/*! \file - * SAM header parsing. - * - * These functions can be shared between SAM, BAM and CRAM file - * formats as all three internally use the same string encoding for - * header fields. - */ - -/* - * TODO. - * - * - Sort order (parse to struct, enum type, updating funcs) - * - Removal of lines. 
- * - Updating of lines - */ - -#ifndef _SAM_HDR_H_ -#define _SAM_HDR_H_ - -#include - -#include "cram/string_alloc.h" -#include "cram/pooled_alloc.h" - -#include "htslib/khash.h" -#include "htslib/kstring.h" - -#ifdef __cplusplus -extern "C" { -#endif - -// For structure assignment. Eg kstring_t s = KS_INITIALIZER; -#define KS_INITIALIZER {0,0,0} - -// For initialisation elsewhere. Eg KS_INIT(x->str); -#define KS_INIT(ks) ((ks)->l = 0, (ks)->m = 0, (ks)->s = NULL) - -// Frees the string subfield only. Assumes 's' itself is static. -#define KS_FREE(ks) do { if ((ks)->s) free((ks)->s); } while(0) - -/* - * Proposed new SAM header parsing - -1 @SQ ID:foo LN:100 -2 @SQ ID:bar LN:200 -3 @SQ ID:ram LN:300 UR:xyz -4 @RG ID:r ... -5 @RG ID:s ... - -Hash table for 2-char @keys without dup entries. -If dup lines, we form a circular linked list. Ie hash keys = {RG, SQ}. - -HASH("SQ")--\ - | - (3) <-> 1 <-> 2 <-> 3 <-> (1) - -HASH("RG")--\ - | - (5) <-> 4 <-> 5 <-> (4) - -Items stored in the hash values also form their own linked lists: -Ie SQ->ID(foo)->LN(100) - SQ->ID(bar)->LN(200) - SQ->ID(ram)->LN(300)->UR(xyz) - RG->ID(r) - */ - -/*! A single key:value pair on a header line - * - * These form a linked list and hold strings. The strings are - * allocated from a string_alloc_t pool referenced in the master - * SAM_hdr structure. Do not attempt to free, malloc or manipulate - * these strings directly. - */ -typedef struct SAM_hdr_tag_s { - struct SAM_hdr_tag_s *next; - char *str; - int len; -} SAM_hdr_tag; - -/*! The parsed version of the SAM header string. - * - * Each header type (SQ, RG, HD, etc) points to its own SAM_hdr_type - * struct via the main hash table h in the SAM_hdr struct. - * - * These in turn consist of circular bi-directional linked lists (ie - * rings) to hold the multiple instances of the same header type - * code. 
For example if we have 5 \@SQ lines the primary hash table - * will key on \@SQ pointing to the first SAM_hdr_type and that in turn - * will be part of a ring of 5 elements. - * - * For each SAM_hdr_type structure we also point to a SAM_hdr_tag - * structure which holds the tokenised attributes; the tab separated - * key:value pairs per line. - */ -typedef struct SAM_hdr_item_s { - struct SAM_hdr_item_s *next; // cirular - struct SAM_hdr_item_s *prev; - SAM_hdr_tag *tag; // first tag - int order; // 0 upwards -} SAM_hdr_type; - -/*! Parsed \@SQ lines */ -typedef struct { - char *name; - uint32_t len; - SAM_hdr_type *ty; - SAM_hdr_tag *tag; -} SAM_SQ; - -/*! Parsed \@RG lines */ -typedef struct { - char *name; - SAM_hdr_type *ty; - SAM_hdr_tag *tag; - int name_len; - int id; // numerical ID -} SAM_RG; - -/*! Parsed \@PG lines */ -typedef struct { - char *name; - SAM_hdr_type *ty; - SAM_hdr_tag *tag; - int name_len; - int id; // numerical ID - int prev_id; // -1 if none -} SAM_PG; - -/*! Sort order parsed from @HD line */ -enum sam_sort_order { - ORDER_UNKNOWN =-1, - ORDER_UNSORTED = 0, - ORDER_NAME = 1, - ORDER_COORD = 2, - //ORDER_COLLATE = 3 // maybe one day! -}; - -KHASH_MAP_INIT_INT(sam_hdr, SAM_hdr_type*) -KHASH_MAP_INIT_STR(m_s2i, int) - -/*! Primary structure for header manipulation - * - * The initial header text is held in the text kstring_t, but is also - * parsed out into SQ, RG and PG arrays. These have a hash table - * associated with each to allow lookup by ID or SN fields instead of - * their numeric array indices. Additionally PG has an array to hold - * the linked list start points (the last in a PP chain). - * - * Use the appropriate sam_hdr_* functions to edit the header, and - * call sam_hdr_rebuild() any time the textual form needs to be - * updated again. 
- */ -typedef struct { - kstring_t text; //!< concatenated text, indexed by SAM_hdr_tag - khash_t(sam_hdr) *h; - string_alloc_t *str_pool; //!< Pool of SAM_hdr_tag->str strings - pool_alloc_t *type_pool;//!< Pool of SAM_hdr_type structs - pool_alloc_t *tag_pool; //!< Pool of SAM_hdr_tag structs - - // @SQ lines / references - int nref; //!< Number of \@SQ lines - SAM_SQ *ref; //!< Array of parsed \@SQ lines - khash_t(m_s2i) *ref_hash; //!< Maps SQ SN field to sq[] index - - // @RG lines / read-groups - int nrg; //!< Number of \@RG lines - SAM_RG *rg; //!< Array of parsed \@RG lines - khash_t(m_s2i) *rg_hash; //!< Maps RG ID field to rg[] index - - // @PG lines / programs - int npg; //!< Number of \@PG lines - int npg_end; //!< Number of terminating \@PG lines - int npg_end_alloc; //!< Size of pg_end field - SAM_PG *pg; //!< Array of parsed \@PG lines - khash_t(m_s2i) *pg_hash; //!< Maps PG ID field to pg[] index - int *pg_end; //!< \@PG chain termination IDs - - // @HD data - enum sam_sort_order sort_order; //!< @HD SO: field - - // @cond internal - char ID_buf[1024]; // temporary buffer - int ID_cnt; - int ref_count; // number of uses of this SAM_hdr - // @endcond -} SAM_hdr; - -/*! Creates an empty SAM header, ready to be populated. - * - * @return - * Returns a SAM_hdr struct on success (free with sam_hdr_free()) - * NULL on failure - */ -SAM_hdr *sam_hdr_new(void); - -/*! Tokenises a SAM header into a hash table. - * - * Also extracts a few bits on specific data types, such as @RG lines. - * - * @return - * Returns a SAM_hdr struct on success (free with sam_hdr_free()); - * NULL on failure - */ -SAM_hdr *sam_hdr_parse_(const char *hdr, int len); - - -/*! Produces a duplicate copy of hdr and returns it. - * @return - * Returns NULL on failure - */ -SAM_hdr *sam_hdr_dup(SAM_hdr *hdr); - - -/*! Increments a reference count on hdr. 
- * - * This permits multiple files to share the same header, all calling - * sam_hdr_free when done, without causing errors for other open files. - */ -void sam_hdr_incr_ref(SAM_hdr *hdr); - - -/*! Increments a reference count on hdr. - * - * This permits multiple files to share the same header, all calling - * sam_hdr_free when done, without causing errors for other open files. - * - * If the reference count hits zero then the header is automatically - * freed. This makes it a synonym for sam_hdr_free(). - */ -void sam_hdr_decr_ref(SAM_hdr *hdr); - - -/*! Deallocates all storage used by a SAM_hdr struct. - * - * This also decrements the header reference count. If after decrementing - * it is still non-zero then the header is assumed to be in use by another - * caller and the free is not done. - * - * This is a synonym for sam_hdr_dec_ref(). - */ -void sam_hdr_free(SAM_hdr *hdr); - -/*! Returns the current length of the SAM_hdr in text form. - * - * Call sam_hdr_rebuild() first if editing has taken place. - */ -int sam_hdr_length(SAM_hdr *hdr); - -/*! Returns the string form of the SAM_hdr. - * - * Call sam_hdr_rebuild() first if editing has taken place. - */ -char *sam_hdr_str(SAM_hdr *hdr); - -/*! Appends a formatted line to an existing SAM header. - * - * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with - * optional new-line. If it contains more than 1 line then multiple lines - * will be added in order. - * - * Input text is of maximum length len or as terminated earlier by a NUL. - * Len may be 0 if unknown, in which case lines must be NUL-terminated. - * - * @return - * Returns 0 on success; - * -1 on failure - */ -int sam_hdr_add_lines(SAM_hdr *sh, const char *lines, int len); - -/*! Adds a single line to a SAM header. - * - * Specify type and one or more key,value pairs, ending with the NULL key. - * Eg. sam_hdr_add(h, "SQ", "ID", "foo", "LN", "100", NULL). 
- * - * @return - * Returns 0 on success; - * -1 on failure - */ -int sam_hdr_add(SAM_hdr *sh, const char *type, ...); - -/*! Adds a single line to a SAM header. - * - * This is much like sam_hdr_add() but with the additional va_list - * argument. This is followed by specifying type and one or more - * key,value pairs, ending with the NULL key. - * - * Eg. sam_hdr_vadd(h, "SQ", args, "ID", "foo", "LN", "100", NULL). - * - * The purpose of the additional va_list parameter is to permit other - * varargs functions to call this while including their own additional - * parameters; an example is in sam_hdr_add_PG(). - * - * Note: this function invokes va_arg at least once, making the value - * of ap indeterminate after the return. The caller should call - * va_start/va_end before/after calling this function or use va_copy. - * - * @return - * Returns 0 on success; - * -1 on failure - */ -int sam_hdr_vadd(SAM_hdr *sh, const char *type, va_list ap, ...); - -/*! - * @return - * Returns the first header item matching 'type'. If ID is non-NULL it checks - * for the tag ID: and compares against the specified ID. - * - * Returns NULL if no type/ID is found - */ -SAM_hdr_type *sam_hdr_find(SAM_hdr *hdr, char *type, - char *ID_key, char *ID_value); - -/*! - * - * As per SAM_hdr_type, but returns a complete line of formatted text - * for a specific head type/ID combination. If ID is NULL then it returns - * the first line of the specified type. - * - * The returned string is malloced and should be freed by the calling - * function with free(). - * - * @return - * Returns NULL if no type/ID is found. - */ -char *sam_hdr_find_line(SAM_hdr *hdr, char *type, - char *ID_key, char *ID_value); - -/*! Looks for a specific key in a single sam header line. - * - * If prev is non-NULL it also fills this out with the previous tag, to - * permit use in key removal. *prev is set to NULL when the tag is the first - * key in the list. 
When a tag isn't found, prev (if non NULL) will be the last - * tag in the existing list. - * - * @return - * Returns the tag pointer on success; - * NULL on failure - */ -SAM_hdr_tag *sam_hdr_find_key(SAM_hdr *sh, - SAM_hdr_type *type, - char *key, - SAM_hdr_tag **prev); - -/*! Adds or updates tag key,value pairs in a header line. - * - * Eg for adding M5 tags to @SQ lines or updating sort order for the - * @HD line (although use the sam_hdr_sort_order() function for - * HD manipulation, which is a wrapper around this funuction). - * - * Specify multiple key,value pairs ending in NULL. - * - * @return - * Returns 0 on success; - * -1 on failure - */ -int sam_hdr_update(SAM_hdr *hdr, SAM_hdr_type *type, ...); - -/*! Returns the sort order from the @HD SO: field */ -enum sam_sort_order sam_hdr_sort_order(SAM_hdr *hdr); - -/*! Reconstructs the kstring from the header hash table. - * @return - * Returns 0 on success; - * -1 on failure - */ -int sam_hdr_rebuild(SAM_hdr *hdr); - -/*! Looks up a reference sequence by name and returns the numerical ID. - * @return - * Returns -1 if unknown reference. - */ -int sam_hdr_name2ref(SAM_hdr *hdr, const char *ref); - -/*! Looks up a read-group by name and returns a pointer to the start of the - * associated tag list. - * - * @return - * Returns NULL on failure - */ -SAM_RG *sam_hdr_find_rg(SAM_hdr *hdr, const char *rg); - -/*! Fixes any PP links in @PG headers. - * - * If the entries are in order then this doesn't need doing, but incase - * our header is out of order this goes through the sh->pg[] array - * setting the prev_id field. - * - * @return - * Returns 0 on sucess; - * -1 on failure (indicating broken PG/PP records) - */ -int sam_hdr_link_pg(SAM_hdr *hdr); - - -/*! Add an @PG line. - * - * If we wish complete control over this use sam_hdr_add() directly. This - * function uses that, but attempts to do a lot of tedious house work for - * you too. - * - * - It will generate a suitable ID if the supplied one clashes. 
- * - It will generate multiple @PG records if we have multiple PG chains. - * - * Call it as per sam_hdr_add() with a series of key,value pairs ending - * in NULL. - * - * @return - * Returns 0 on success; - * -1 on failure - */ -int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...); - -/*! - * A function to help with construction of CL tags in @PG records. - * Takes an argc, argv pair and returns a single space-separated string. - * This string should be deallocated by the calling function. - * - * @return - * Returns malloced char * on success; - * NULL on failure - */ -char *stringify_argv(int argc, char *argv[]); - -#ifdef __cplusplus -} -#endif - -#endif /* _SAM_HDR_H_ */ diff --git a/cram/string_alloc.c b/cram/string_alloc.c index d64c26190..8de4d0ae3 100644 --- a/cram/string_alloc.c +++ b/cram/string_alloc.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2010 Genome Research Ltd. +Copyright (c) 2010, 2013, 2018-2019 Genome Research Ltd. Author: Andrew Whitwham Redistribution and use in source and binary forms, with or without @@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Andrew Whitwham, September 2010. 
*/ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include @@ -61,6 +62,7 @@ string_alloc_t *string_pool_create(size_t max_length) { if (max_length < MIN_STR_SIZE) max_length = MIN_STR_SIZE; a_str->nstrings = 0; + a_str->max_strings = 0; a_str->max_length = max_length; a_str->strings = NULL; @@ -73,14 +75,19 @@ string_alloc_t *string_pool_create(size_t max_length) { static string_t *new_string_pool(string_alloc_t *a_str) { string_t *str; - str = realloc(a_str->strings, (a_str->nstrings + 1) * sizeof(*a_str->strings)); + if (a_str->nstrings == a_str->max_strings) { + size_t new_max = (a_str->max_strings | (a_str->max_strings >> 2)) + 1; + str = realloc(a_str->strings, new_max * sizeof(*a_str->strings)); - if (NULL == str) return NULL; + if (NULL == str) return NULL; + + a_str->strings = str; + a_str->max_strings = new_max; + } - a_str->strings = str; str = &a_str->strings[a_str->nstrings]; - str->str = malloc(a_str->max_length);; + str->str = malloc(a_str->max_length); if (NULL == str->str) return NULL; @@ -139,16 +146,16 @@ char *string_alloc(string_alloc_t *a_str, size_t length) { /* equivalent to strdup */ -char *string_dup(string_alloc_t *a_str, char *instr) { +char *string_dup(string_alloc_t *a_str, const char *instr) { return string_ndup(a_str, instr, strlen(instr)); } -char *string_ndup(string_alloc_t *a_str, char *instr, size_t len) { +char *string_ndup(string_alloc_t *a_str, const char *instr, size_t len) { char *str = string_alloc(a_str, len + 1); if (NULL == str) return NULL; - strncpy(str, instr, len); + memcpy(str, instr, len); str[len] = 0; return str; diff --git a/cram/string_alloc.h b/cram/string_alloc.h index 65c01b4a5..42ebb0a58 100644 --- a/cram/string_alloc.h +++ b/cram/string_alloc.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2010 Genome Research Ltd. +Copyright (c) 2010, 2013, 2018 Genome Research Ltd. 
Author: Andrew Whitwham Redistribution and use in source and binary forms, with or without @@ -28,8 +28,8 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef _STRING_ALLOC_H_ -#define _STRING_ALLOC_H_ +#ifndef STRING_ALLOC_H +#define STRING_ALLOC_H #include @@ -52,14 +52,15 @@ typedef struct { typedef struct { size_t max_length; size_t nstrings; + size_t max_strings; string_t *strings; } string_alloc_t; string_alloc_t *string_pool_create(size_t max_length); void string_pool_destroy(string_alloc_t *a_str); char *string_alloc(string_alloc_t *a_str, size_t length); -char *string_dup(string_alloc_t *a_str, char *instr); -char *string_ndup(string_alloc_t *a_str, char *instr, size_t len); +char *string_dup(string_alloc_t *a_str, const char *instr); +char *string_ndup(string_alloc_t *a_str, const char *instr, size_t len); #ifdef __cplusplus } diff --git a/errmod.c b/errmod.c index acc89227f..df708e1fd 100644 --- a/errmod.c +++ b/errmod.c @@ -1,7 +1,7 @@ /* errmod.c -- revised MAQ error model. Copyright (C) 2010 Broad Institute. - Copyright (C) 2012, 2013, 2016 Genome Research Ltd. + Copyright (C) 2012, 2013, 2016-2017, 2019 Genome Research Ltd. Author: Heng Li @@ -23,6 +23,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include @@ -30,7 +31,7 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/ksort.h" #include "htslib/hts_os.h" // for drand48 -KSORT_INIT_GENERIC(uint16_t) +KSORT_INIT_STATIC_GENERIC(uint16_t) struct errmod_t { double depcorr; @@ -53,6 +54,7 @@ static double* logbinomial_table( const int n_size ) /* this calcs p(k) = {log(n!) - log(k!) - log((n-k)!) 
*/ int k, n; double *logbinom = (double*)calloc(n_size * n_size, sizeof(double)); + if (!logbinom) return NULL; for (n = 1; n < n_size; ++n) { double lfn = lfact(n); for (k = 1; k <= n; ++k) @@ -61,7 +63,7 @@ static double* logbinomial_table( const int n_size ) return logbinom; } -static void cal_coef(errmod_t *em, double depcorr, double eta) +static int cal_coef(errmod_t *em, double depcorr, double eta) { int k, n, q; double sum, sum1; @@ -69,14 +71,17 @@ static void cal_coef(errmod_t *em, double depcorr, double eta) // initialize ->fk em->fk = (double*)calloc(256, sizeof(double)); + if (!em->fk) return -1; em->fk[0] = 1.0; for (n = 1; n < 256; ++n) em->fk[n] = pow(1. - depcorr, n) * (1.0 - eta) + eta; // initialize ->beta em->beta = (double*)calloc(256 * 256 * 64, sizeof(double)); + if (!em->beta) return -1; lC = logbinomial_table( 256 ); + if (!lC) return -1; for (q = 1; q < 64; ++q) { double e = pow(10.0, -q/10.0); @@ -95,10 +100,15 @@ static void cal_coef(errmod_t *em, double depcorr, double eta) // initialize ->lhet em->lhet = (double*)calloc(256 * 256, sizeof(double)); + if (!em->lhet) { + free(lC); + return -1; + } for (n = 0; n < 256; ++n) for (k = 0; k < 256; ++k) em->lhet[n<<8|k] = lC[n<<8|k] - M_LN2 * n; free(lC); + return 0; } /** @@ -108,6 +118,7 @@ errmod_t *errmod_init(double depcorr) { errmod_t *em; em = (errmod_t*)calloc(1, sizeof(errmod_t)); + if (!em) return NULL; em->depcorr = depcorr; cal_coef(em, depcorr, 0.03); return em; diff --git a/faidx.c b/faidx.c index f5b58acce..83b4fc0d1 100644 --- a/faidx.c +++ b/faidx.c @@ -1,6 +1,6 @@ /* faidx.c -- FASTA and FASTQ random access. - Copyright (C) 2008, 2009, 2013-2018 Genome Research Ltd. + Copyright (C) 2008, 2009, 2013-2019 Genome Research Ltd. Portions copyright (C) 2011 Broad Institute. Author: Heng Li @@ -23,6 +23,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include @@ -43,6 +44,7 @@ DEALINGS IN THE SOFTWARE. */ #include "hts_internal.h" typedef struct { + int id; // faidx_t->name[id] is for this struct. uint32_t line_len, line_blen; uint64_t len; uint64_t seq_offset; @@ -62,6 +64,13 @@ struct __faidx_t { #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #endif +static int fai_name2id(void *v, const char *ref) +{ + faidx_t *fai = (faidx_t *)v; + khint_t k = kh_get(s, fai->hash, ref); + return k == kh_end(fai->hash) ? -1 : kh_val(fai->hash, k).id; +} + static inline int fai_insert_index(faidx_t *idx, const char *name, uint64_t len, uint32_t line_len, uint32_t line_blen, uint64_t seq_offset, uint64_t qual_offset) { if (!name) { @@ -75,7 +84,7 @@ static inline int fai_insert_index(faidx_t *idx, const char *name, uint64_t len, faidx1_t *v = &kh_value(idx->hash, k); if (! absent) { - hts_log_warning("Ignoring duplicate sequence \"%s\" at byte offset %"PRIu64"", name, seq_offset); + hts_log_warning("Ignoring duplicate sequence \"%s\" at byte offset %" PRIu64, name, seq_offset); free(name_key); return 0; } @@ -89,6 +98,7 @@ static inline int fai_insert_index(faidx_t *idx, const char *name, uint64_t len, } idx->name = tmp; } + v->id = idx->n; idx->name[idx->n++] = name_key; v->len = len; v->line_len = line_len; @@ -159,7 +169,6 @@ static faidx_t *fai_build_core(BGZF *bgzf) { char s[4] = { '"', c, '"', '\0' }; hts_log_error("Format error, unexpected %s at line %d", isprint(c) ? 
s : "character", line_num); goto fail; - break; } } break; @@ -252,10 +261,9 @@ static faidx_t *fai_build_core(BGZF *bgzf) { case SEQ_END: if (c == '+') { state = IN_QUAL; - if (c != '\n') while ((c = bgzf_getc(bgzf)) >= 0 && c != '\n'); + while ((c = bgzf_getc(bgzf)) >= 0 && c != '\n'); qual_offset = bgzf_utell(bgzf); line_num++; - continue; } else { hts_log_error("Format error, expecting '+', got '%c' at line %d", c, line_num); goto fail; @@ -377,14 +385,14 @@ static faidx_t *fai_read(hFILE *fp, const char *fname, int format) } if (format == FAI_FASTA) { - n = sscanf(p, "%"SCNu64"%"SCNu64"%"SCNu32"%"SCNu32"", &len, &seq_offset, &line_blen, &line_len); + n = sscanf(p, "%"SCNu64"%"SCNu64"%"SCNu32"%"SCNu32, &len, &seq_offset, &line_blen, &line_len); if (n != 4) { hts_log_error("Could not understand FASTA index %s line %zd", fname, lnum); goto fail; } } else { - n = sscanf(p, "%"SCNu64"%"SCNu64"%"SCNu32"%"SCNu32"%"SCNu64"", &len, &seq_offset, &line_blen, &line_len, &qual_offset); + n = sscanf(p, "%"SCNu64"%"SCNu64"%"SCNu32"%"SCNu32"%"SCNu64, &len, &seq_offset, &line_blen, &line_len, &qual_offset); if (n != 5) { if (n == 4) { @@ -686,14 +694,22 @@ faidx_t *fai_load_format(const char *fn, enum fai_format_options format) { static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val, - uint64_t offset, long beg, long end, int *len) { + uint64_t offset, hts_pos_t beg, hts_pos_t end, hts_pos_t *len) { char *s; size_t l; int c = 0; - int ret = bgzf_useek(fai->bgzf, - offset - + beg / val->line_blen * val->line_len - + beg % val->line_blen, SEEK_SET); + int ret; + + if ((uint64_t) end - (uint64_t) beg >= SIZE_MAX - 2) { + hts_log_error("Range %"PRId64"..%"PRId64" too big", beg, end); + *len = -1; + return NULL; + } + + ret = bgzf_useek(fai->bgzf, + offset + + beg / val->line_blen * val->line_len + + beg % val->line_blen, SEEK_SET); if (ret < 0) { *len = -1; @@ -723,85 +739,30 @@ static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val, return s; } - -static 
int fai_get_val(const faidx_t *fai, const char *str, int *len, faidx1_t *val, long *fbeg, long *fend) { - char *s, *ep; - size_t i, l, k, name_end; +static int fai_get_val(const faidx_t *fai, const char *str, + hts_pos_t *len, faidx1_t *val, hts_pos_t *fbeg, hts_pos_t *fend) { khiter_t iter; khash_t(s) *h; - long beg, end; + int id; + hts_pos_t beg, end; - beg = end = -1; - h = fai->hash; - name_end = l = strlen(str); - s = (char*)malloc(l+1); - if (!s) { - *len = -1; + if (!fai_parse_region(fai, str, &id, &beg, &end, 0)) { + hts_log_warning("Reference %s not found in FASTA file, returning empty sequence", str); + *len = -2; return 1; } - // remove space - for (i = k = 0; i < l; ++i) - if (!isspace_c(str[i])) s[k++] = str[i]; - s[k] = 0; - name_end = l = k; - // determine the sequence name - for (i = l; i > 0; --i) if (s[i - 1] == ':') break; // look for colon from the end - if (i > 0) name_end = i - 1; - if (name_end < l) { // check if this is really the end - int n_hyphen = 0; - for (i = name_end + 1; i < l; ++i) { - if (s[i] == '-') ++n_hyphen; - else if (!isdigit_c(s[i]) && s[i] != ',') break; - } - if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name - s[name_end] = 0; - iter = kh_get(s, h, s); - if (iter == kh_end(h)) { // cannot find the sequence name - iter = kh_get(s, h, str); // try str as the name - if (iter != kh_end(h)) { - s[name_end] = ':'; - name_end = l; - } - } - } else iter = kh_get(s, h, str); - if(iter == kh_end(h)) { - hts_log_warning("Reference %s not found in file, returning empty sequence", str); - free(s); - *len = -2; - return 1; + h = fai->hash; + iter = kh_get(s, h, faidx_iseq(fai, id)); + if (iter >= kh_end(h)) { + // should have already been caught above + abort(); } *val = kh_value(h, iter); - // parse the interval - if (name_end < l) { - int save_errno = errno; - errno = 0; - for (i = k = name_end + 1; i < l; ++i) - if (s[i] != ',') s[k++] = s[i]; - s[k] = 0; - if (s[name_end + 1] == '-') 
{ - beg = 0; - i = name_end + 2; - } else { - beg = strtol(s + name_end + 1, &ep, 10); - for (i = ep - s; i < k;) if (s[i++] == '-') break; - } - end = i < k? strtol(s + i, &ep, 10) : val->len; - if (beg > 0) --beg; - // Check for out of range numbers. Only going to be a problem on - // 32-bit platforms with >2Gb sequence length. - if (errno == ERANGE && (uint64_t) val->len > LONG_MAX) { - hts_log_error("Positions in range %s are too large for this platform", s); - free(s); - *len = -3; - return 1; - } - errno = save_errno; - } else beg = 0, end = val->len; + if (beg >= val->len) beg = val->len; if (end >= val->len) end = val->len; if (beg > end) beg = end; - free(s); *fbeg = beg; *fend = end; @@ -810,10 +771,10 @@ static int fai_get_val(const faidx_t *fai, const char *str, int *len, faidx1_t * } -char *fai_fetch(const faidx_t *fai, const char *str, int *len) +char *fai_fetch64(const faidx_t *fai, const char *str, hts_pos_t *len) { faidx1_t val; - long beg, end; + int64_t beg, end; if (fai_get_val(fai, str, len, &val, &beg, &end)) { return NULL; @@ -823,10 +784,17 @@ char *fai_fetch(const faidx_t *fai, const char *str, int *len) return fai_retrieve(fai, &val, val.seq_offset, beg, end, len); } +char *fai_fetch(const faidx_t *fai, const char *str, int *len) +{ + hts_pos_t len64; + char *ret = fai_fetch64(fai, str, &len64); + *len = len64; // trunc + return ret; +} -char *fai_fetchqual(const faidx_t *fai, const char *str, int *len) { +char *fai_fetchqual64(const faidx_t *fai, const char *str, hts_pos_t *len) { faidx1_t val; - long beg, end; + int64_t beg, end; if (fai_get_val(fai, str, len, &val, &beg, &end)) { return NULL; @@ -836,6 +804,12 @@ char *fai_fetchqual(const faidx_t *fai, const char *str, int *len) { return fai_retrieve(fai, &val, val.qual_offset, beg, end, len); } +char *fai_fetchqual(const faidx_t *fai, const char *str, int *len) { + hts_pos_t len64; + char *ret = fai_fetchqual64(fai, str, &len64); + *len = len64; // trunc + return ret; +} int 
faidx_fetch_nseq(const faidx_t *fai) { @@ -859,8 +833,7 @@ int faidx_seq_len(const faidx_t *fai, const char *seq) return kh_val(fai->hash, k).len; } - -static int faidx_adjust_position(const faidx_t *fai, faidx1_t *val, const char *c_name, int *p_beg_i, int *p_end_i, int *len) { +static int faidx_adjust_position(const faidx_t *fai, faidx1_t *val, const char *c_name, hts_pos_t *p_beg_i, hts_pos_t *p_end_i, hts_pos_t *len) { khiter_t iter; // Adjust position @@ -868,7 +841,7 @@ static int faidx_adjust_position(const faidx_t *fai, faidx1_t *val, const char * if (iter == kh_end(fai->hash)) { *len = -2; - hts_log_error("The sequence \"%s\" not found", c_name); + hts_log_error("The sequence \"%s\" was not found", c_name); return 1; } @@ -890,8 +863,7 @@ static int faidx_adjust_position(const faidx_t *fai, faidx1_t *val, const char * return 0; } - -char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len) +char *faidx_fetch_seq64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len) { faidx1_t val; @@ -901,11 +873,18 @@ char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p } // Now retrieve the sequence - return fai_retrieve(fai, &val, val.seq_offset, p_beg_i, (long) p_end_i + 1, len); + return fai_retrieve(fai, &val, val.seq_offset, p_beg_i, p_end_i + 1, len); } +char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len) +{ + hts_pos_t len64; + char *ret = faidx_fetch_seq64(fai, c_name, p_beg_i, p_end_i, &len64); + *len = len64; // trunc + return ret; +} -char *faidx_fetch_qual(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len) +char *faidx_fetch_qual64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len) { faidx1_t val; @@ -915,9 +894,16 @@ char *faidx_fetch_qual(const faidx_t *fai, const char *c_name, int p_beg_i, int } // Now retrieve the sequence - return 
fai_retrieve(fai, &val, val.qual_offset, p_beg_i, (long) p_end_i + 1, len); + return fai_retrieve(fai, &val, val.qual_offset, p_beg_i, p_end_i + 1, len); } +char *faidx_fetch_qual(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len) +{ + hts_pos_t len64; + char *ret = faidx_fetch_qual64(fai, c_name, p_beg_i, p_end_i, &len64); + *len = len64; // trunc + return ret; +} int faidx_has_seq(const faidx_t *fai, const char *seq) { @@ -926,3 +912,13 @@ int faidx_has_seq(const faidx_t *fai, const char *seq) return 1; } +const char *fai_parse_region(const faidx_t *fai, const char *s, + int *tid, hts_pos_t *beg, hts_pos_t *end, + int flags) +{ + return hts_parse_region(s, tid, beg, end, (hts_name2id_f)fai_name2id, (void *)fai, flags); +} + +void fai_set_cache_size(faidx_t *fai, int cache_size) { + bgzf_set_cache_size(fai->bgzf, cache_size); +} diff --git a/header.c b/header.c new file mode 100644 index 000000000..c7fe84b30 --- /dev/null +++ b/header.c @@ -0,0 +1,2711 @@ +/* +Copyright (c) 2018-2019 Genome Research Ltd. +Authors: James Bonfield , Valeriu Ohan + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger +Institute nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h +#include + +#include +#include +#include +#include "textutils_internal.h" +#include "header.h" + +// Hash table for removing multiple lines from the header +KHASH_SET_INIT_STR(rm) +// Used for long refs in SAM files +KHASH_DECLARE(s2i, kh_cstr_t, int64_t) + +typedef khash_t(rm) rmhash_t; + +static int sam_hdr_link_pg(sam_hdr_t *bh); + +static int sam_hrecs_vupdate(sam_hrecs_t *hrecs, sam_hrec_type_t *type, va_list ap); +static int sam_hrecs_update(sam_hrecs_t *hrecs, sam_hrec_type_t *type, ...); + + +#define MAX_ERROR_QUOTE 320 // Prevent over-long error messages +static void sam_hrecs_error(const char *msg, const char *line, size_t len, size_t lno) { + int j; + + if (len > MAX_ERROR_QUOTE) + len = MAX_ERROR_QUOTE; + for (j = 0; j < len && line[j] != '\n'; j++) + ; + hts_log_error("%s at line %zd: \"%.*s\"", msg, lno, j, line); +} + +/* ==== Static methods ==== */ + +static int sam_hrecs_init_type_order(sam_hrecs_t *hrecs, char *type_list) { + if (!hrecs) + return -1; + + if (!type_list) { + hrecs->type_count = 5; + hrecs->type_order = calloc(hrecs->type_count, 3); + if (!hrecs->type_order) + return -1; + 
memcpy(hrecs->type_order[0], "HD", 2); + memcpy(hrecs->type_order[1], "SQ", 2); + memcpy(hrecs->type_order[2], "RG", 2); + memcpy(hrecs->type_order[3], "PG", 2); + memcpy(hrecs->type_order[4], "CO", 2); + } + + return 0; +} + +static int sam_hrecs_add_ref_altnames(sam_hrecs_t *hrecs, int nref, const char *list) { + const char *token; + ks_tokaux_t aux; + + if (!list) + return 0; + + for (token = kstrtok(list, ",", &aux); token; token = kstrtok(NULL, NULL, &aux)) { + if (aux.p == token) + continue; + + char *name = string_ndup(hrecs->str_pool, token, aux.p - token); + if (!name) + return -1; + int r; + khint_t k = kh_put(m_s2i, hrecs->ref_hash, name, &r); + if (r < 0) return -1; + + if (r > 0) + kh_val(hrecs->ref_hash, k) = nref; + else if (kh_val(hrecs->ref_hash, k) != nref) + hts_log_warning("Duplicate entry AN:\"%s\" in sam header", name); + } + + return 0; +} + +static void sam_hrecs_remove_ref_altnames(sam_hrecs_t *hrecs, int expected, const char *list) { + const char *token, *sn; + ks_tokaux_t aux; + kstring_t str = KS_INITIALIZE; + + if (expected < 0 || expected >= hrecs->nref) + return; + sn = hrecs->ref[expected].name; + + for (token = kstrtok(list, ",", &aux); token; token = kstrtok(NULL, NULL, &aux)) { + kputsn(token, aux.p - token, ks_clear(&str)); + khint_t k = kh_get(m_s2i, hrecs->ref_hash, str.s); + if (k != kh_end(hrecs->ref_hash) + && kh_val(hrecs->ref_hash, k) == expected + && strcmp(sn, str.s) != 0) + kh_del(m_s2i, hrecs->ref_hash, k); + } + + free(str.s); +} + +/* Updates the hash tables in the sam_hrecs_t structure. + * + * Returns 0 on success; + * -1 on failure + */ +static int sam_hrecs_update_hashes(sam_hrecs_t *hrecs, + int type, + sam_hrec_type_t *h_type) { + /* Add to reference hash? 
*/ + if ((type>>8) == 'S' && (type&0xff) == 'Q') { + sam_hrec_tag_t *tag = h_type->tag; + int nref = hrecs->nref; + const char *name = NULL; + const char *altnames = NULL; + hts_pos_t len = -1; + int r; + khint_t k; + + while (tag) { + if (tag->str[0] == 'S' && tag->str[1] == 'N') { + assert(tag->len >= 3); + name = tag->str+3; + } else if (tag->str[0] == 'L' && tag->str[1] == 'N') { + assert(tag->len >= 3); + len = strtoll(tag->str+3, NULL, 10); + } else if (tag->str[0] == 'A' && tag->str[1] == 'N') { + assert(tag->len >= 3); + altnames = tag->str+3; + } + tag = tag->next; + } + + if (!name) { + hts_log_error("Header includes @SQ line with no SN: tag"); + return -1; // SN should be present, according to spec. + } + + if (len == -1) { + hts_log_error("Header includes @SQ line \"%s\" with no LN: tag", + name); + return -1; // LN should be present, according to spec. + } + + // Seen already? + k = kh_get(m_s2i, hrecs->ref_hash, name); + if (k < kh_end(hrecs->ref_hash)) { + nref = kh_val(hrecs->ref_hash, k); + + // Check for hash entry added by sam_hrecs_refs_from_targets_array() + if (hrecs->ref[nref].ty == NULL) { + // Attach header line to existing stub entry. + hrecs->ref[nref].ty = h_type; + // Check lengths match; correct if not. 
+ if (len != hrecs->ref[nref].len) { + char tmp[32]; + snprintf(tmp, sizeof(tmp), "%" PRIhts_pos, + hrecs->ref[nref].len); + if (sam_hrecs_update(hrecs, h_type, "LN", tmp, NULL) < 0) + return -1; + } + if (sam_hrecs_add_ref_altnames(hrecs, nref, altnames) < 0) + return -1; + + if (hrecs->refs_changed < 0 || hrecs->refs_changed > nref) + hrecs->refs_changed = nref; + return 0; + } + + // Check to see if an existing entry is being updated + if (hrecs->ref[nref].ty == h_type) { + hrecs->ref[nref].len = len; + hrecs->ref[nref].name = name; + if (sam_hrecs_add_ref_altnames(hrecs, nref, altnames) < 0) + return -1; + + if (hrecs->refs_changed < 0 || hrecs->refs_changed > nref) + hrecs->refs_changed = nref; + return 0; + } + + // If here, the name is a duplicate. + // Check to see if it matches the SN: tag from the earlier record. + if (strcmp(hrecs->ref[nref].name, name) == 0) { + hts_log_error("Duplicate entry \"%s\" in sam header", + name); + return -1; + } + + // Clash with an already-seen altname + // As SN: should be preferred to AN: add this as a new + // record and update the hash entry to point to it. + hts_log_warning("Ref name SN:\"%s\" is a duplicate of an existing AN key", name); + nref = hrecs->nref; + } + + if (nref == hrecs->ref_sz) { + size_t new_sz = hrecs->ref_sz >= 4 ? hrecs->ref_sz + (hrecs->ref_sz / 4) : 32; + sam_hrec_sq_t *new_ref = realloc(hrecs->ref, sizeof(*hrecs->ref) * new_sz); + if (!new_ref) + return -1; + hrecs->ref = new_ref; + hrecs->ref_sz = new_sz; + } + + hrecs->ref[nref].name = name; + hrecs->ref[nref].len = len; + hrecs->ref[nref].ty = h_type; + + k = kh_put(m_s2i, hrecs->ref_hash, hrecs->ref[nref].name, &r); + if (-1 == r) return -1; + kh_val(hrecs->ref_hash, k) = nref; + + if (sam_hrecs_add_ref_altnames(hrecs, nref, altnames) < 0) + return -1; + + if (hrecs->refs_changed < 0 || hrecs->refs_changed > hrecs->nref) + hrecs->refs_changed = hrecs->nref; + hrecs->nref++; + } + + /* Add to read-group hash? 
*/ + if ((type>>8) == 'R' && (type&0xff) == 'G') { + sam_hrec_tag_t *tag = sam_hrecs_find_key(h_type, "ID", NULL); + int nrg = hrecs->nrg, r; + khint_t k; + + if (!tag) { + hts_log_error("Header includes @RG line with no ID: tag"); + return -1; // ID should be present, according to spec. + } + assert(tag->str && tag->len >= 3); + + // Seen already? + k = kh_get(m_s2i, hrecs->rg_hash, tag->str + 3); + if (k < kh_end(hrecs->rg_hash)) { + nrg = kh_val(hrecs->rg_hash, k); + assert(hrecs->rg[nrg].ty != NULL); + if (hrecs->rg[nrg].ty != h_type) { + hts_log_warning("Duplicate entry \"%s\" in sam header", + tag->str + 3); + } else { + hrecs->rg[nrg].name = tag->str + 3; + hrecs->rg[nrg].name_len = tag->len - 3; + } + return 0; + } + + if (nrg == hrecs->rg_sz) { + size_t new_sz = hrecs->rg_sz >= 4 ? hrecs->rg_sz + hrecs->rg_sz / 4 : 4; + sam_hrec_rg_t *new_rg = realloc(hrecs->rg, sizeof(*hrecs->rg) * new_sz); + if (!new_rg) + return -1; + hrecs->rg = new_rg; + hrecs->rg_sz = new_sz; + } + + hrecs->rg[nrg].name = tag->str + 3; + hrecs->rg[nrg].name_len = tag->len - 3; + hrecs->rg[nrg].ty = h_type; + hrecs->rg[nrg].id = nrg; + + k = kh_put(m_s2i, hrecs->rg_hash, hrecs->rg[nrg].name, &r); + if (-1 == r) return -1; + kh_val(hrecs->rg_hash, k) = nrg; + + hrecs->nrg++; + } + + /* Add to program hash? */ + if ((type>>8) == 'P' && (type&0xff) == 'G') { + sam_hrec_tag_t *tag; + sam_hrec_pg_t *new_pg; + int npg = hrecs->npg; + + if (npg == hrecs->pg_sz) { + size_t new_sz = hrecs->pg_sz >= 4 ? 
hrecs->pg_sz + hrecs->pg_sz / 4 : 4; + new_pg = realloc(hrecs->pg, sizeof(*hrecs->pg) * new_sz); + if (!new_pg) + return -1; + hrecs->pg = new_pg; + hrecs->pg_sz = new_sz; + } + + tag = h_type->tag; + hrecs->pg[npg].name = NULL; + hrecs->pg[npg].name_len = 0; + hrecs->pg[npg].ty = h_type; + hrecs->pg[npg].id = npg; + hrecs->pg[npg].prev_id = -1; + + while (tag) { + if (tag->str[0] == 'I' && tag->str[1] == 'D') { + assert(tag->len >= 3); + hrecs->pg[npg].name = tag->str + 3; + hrecs->pg[npg].name_len = tag->len - 3; + } else if (tag->str[0] == 'P' && tag->str[1] == 'P') { + // Resolve later if needed + khint_t k; + k = kh_get(m_s2i, hrecs->pg_hash, tag->str+3); + + if (k != kh_end(hrecs->pg_hash)) { + int p_id = kh_val(hrecs->pg_hash, k); + hrecs->pg[npg].prev_id = hrecs->pg[p_id].id; + + /* Unmark previous entry as a PG termination */ + if (hrecs->npg_end > 0 && + hrecs->pg_end[hrecs->npg_end-1] == p_id) { + hrecs->npg_end--; + } else { + int i; + for (i = 0; i < hrecs->npg_end; i++) { + if (hrecs->pg_end[i] == p_id) { + memmove(&hrecs->pg_end[i], &hrecs->pg_end[i+1], + (hrecs->npg_end-i-1)*sizeof(*hrecs->pg_end)); + hrecs->npg_end--; + } + } + } + } else { + hrecs->pg[npg].prev_id = -1; + } + } + tag = tag->next; + } + + if (hrecs->pg[npg].name) { + khint_t k; + int r; + k = kh_put(m_s2i, hrecs->pg_hash, hrecs->pg[npg].name, &r); + if (-1 == r) return -1; + kh_val(hrecs->pg_hash, k) = npg; + } else { + return -1; // ID should be present, according to spec. + } + + /* Add to npg_end[] array. Remove later if we find a PP line */ + if (hrecs->npg_end >= hrecs->npg_end_alloc) { + int *new_pg_end; + int new_alloc = hrecs->npg_end_alloc ? 
hrecs->npg_end_alloc*2 : 4;
+
+            new_pg_end = realloc(hrecs->pg_end, new_alloc * sizeof(int));
+            if (!new_pg_end)
+                return -1;
+            hrecs->npg_end_alloc = new_alloc;
+            hrecs->pg_end = new_pg_end;
+        }
+        hrecs->pg_end[hrecs->npg_end++] = npg;
+
+        hrecs->npg++;
+    }
+
+    return 0;
+}
+
+/*
+ * Remove the lookup entries belonging to an @SQ or @RG header line.
+ *
+ * For @SQ the SN: name is looked up in ref_hash, the ref[] array is closed
+ * up over the removed slot, any AN: alternative names are removed, and
+ * refs_changed is lowered to the removed index.  For @RG the same is done
+ * with the ID: name, rg_hash and rg[].  In both cases every remaining hash
+ * value above the removed index is decremented so that it still matches
+ * the shifted array.  Other line types are ignored.
+ *
+ * Returns 0 on success (including when nothing was found to remove);
+ *        -1 if hrecs or h_type is NULL
+ */
+static int sam_hrecs_remove_hash_entry(sam_hrecs_t *hrecs, int type, sam_hrec_type_t *h_type) {
+    if (!hrecs || !h_type)
+        return -1;
+
+    sam_hrec_tag_t *tag;
+    const char *key = NULL;
+    khint_t k;
+
+    /* Remove name and any alternative names from reference hash */
+    if ((type>>8) == 'S' && (type&0xff) == 'Q') {
+        const char *altnames = NULL;
+
+        tag = h_type->tag;
+
+        // Collect the SN: (primary name) and AN: (alternative names) values
+        while (tag) {
+            if (tag->str[0] == 'S' && tag->str[1] == 'N') {
+                assert(tag->len >= 3);
+                key = tag->str + 3;
+            } else if (tag->str[0] == 'A' && tag->str[1] == 'N') {
+                assert(tag->len >= 3);
+                altnames = tag->str + 3;
+            }
+            tag = tag->next;
+        }
+
+        if (key) {
+            k = kh_get(m_s2i, hrecs->ref_hash, key);
+            if (k != kh_end(hrecs->ref_hash)) {
+                int idx = kh_val(hrecs->ref_hash, k);
+                // Close the gap in the ref array
+                if (idx + 1 < hrecs->nref)
+                    memmove(&hrecs->ref[idx], &hrecs->ref[idx+1],
+                            sizeof(sam_hrec_sq_t)*(hrecs->nref - idx - 1));
+                // Hash values have not been renumbered yet, so idx still
+                // identifies the entries that belonged to the removed line
+                if (altnames)
+                    sam_hrecs_remove_ref_altnames(hrecs, idx, altnames);
+                kh_del(m_s2i, hrecs->ref_hash, k);
+                hrecs->nref--;
+                // Record the lowest reference index that has changed
+                if (hrecs->refs_changed < 0 || hrecs->refs_changed > idx)
+                    hrecs->refs_changed = idx;
+                // Renumber hash values for the entries that moved down
+                for (k = 0; k < kh_end(hrecs->ref_hash); k++) {
+                    if (kh_exist(hrecs->ref_hash, k)
+                        && kh_value(hrecs->ref_hash, k) > idx) {
+                        kh_value(hrecs->ref_hash, k)--;
+                    }
+                }
+            }
+        }
+    }
+
+    /* Remove from read-group hash */
+    if ((type>>8) == 'R' && (type&0xff) == 'G') {
+        tag = h_type->tag;
+
+        while (tag) {
+            if (tag->str[0] == 'I' && tag->str[1] == 'D') {
+                assert(tag->len >= 3);
+                key = tag->str + 3;
+                k = kh_get(m_s2i, hrecs->rg_hash, key);
+                if (k != kh_end(hrecs->rg_hash)) {
+                    int idx = kh_val(hrecs->rg_hash, k);
+                    // Close the gap in the rg array
+                    if (idx + 1 < hrecs->nrg)
+                        memmove(&hrecs->rg[idx], &hrecs->rg[idx+1],
+                                sizeof(sam_hrec_rg_t)*(hrecs->nrg - idx - 1));
+                    kh_del(m_s2i, hrecs->rg_hash, k);
+                    hrecs->nrg--;
+                    // Renumber hash values for the entries that moved down
+                    for (k = 0; k < kh_end(hrecs->rg_hash); k++) {
+                        if (kh_exist(hrecs->rg_hash, k)
+                            && kh_value(hrecs->rg_hash, k) > idx) {
+                            kh_value(hrecs->rg_hash, k)--;
+                        }
+                    }
+                }
+                // Only the first ID: tag is considered
+                break;
+            }
+            tag = tag->next;
+        }
+    }
+
+    return 0;
+}
+
+/** Add a header record to the global line ordering
+ *
+ * If @p after is not NULL, the new record will be inserted after this one,
+ * otherwise it will go at the end.
+ *
+ * An exception is an HD record, which will always be put first unless
+ * one is already present.
+ *
+ * The global ordering is a circular doubly-linked list threaded through
+ * global_next / global_prev, with hrecs->first_line pointing at its head.
+ */
+static void sam_hrecs_global_list_add(sam_hrecs_t *hrecs,
+                                      sam_hrec_type_t *h_type,
+                                      sam_hrec_type_t *after) {
+    const khint32_t hd_type = 'H' << 8 | 'D';
+    int update_first_line = 0;
+
+    // First line seen
+    if (!hrecs->first_line) {
+        hrecs->first_line = h_type->global_next = h_type->global_prev = h_type;
+        return;
+    }
+
+    // @HD goes at the top (unless there's one already)
+    // Inserting after the last entry of the ring and then moving first_line
+    // to the new record makes it the first line.
+    if (h_type->type == hd_type && hrecs->first_line->type != hd_type) {
+        after = hrecs->first_line->global_prev;
+        update_first_line = 1;
+    }
+
+    // If no instructions given, put it at the end
+    if (!after)
+        after = hrecs->first_line->global_prev;
+
+    h_type->global_prev = after;
+    h_type->global_next = after->global_next;
+    h_type->global_prev->global_next = h_type;
+    h_type->global_next->global_prev = h_type;
+
+    if (update_first_line)
+        hrecs->first_line = h_type;
+}
+
+/*! Add header record with a va_list interface.
+ *
+ * Adds a single record to a SAM header.
+ *
+ * This takes a header record type, a va_list argument and one or more
+ * key,value pairs, ending with the NULL key.
+ *
+ * Eg. sam_hrecs_vadd(h, "SQ", args, "SN", "foo", "LN", "100", NULL).
+ *
+ * The purpose of the additional va_list parameter is to permit other
+ * varargs functions to call this while including their own additional
+ * parameters; an example is in sam_hdr_add_pg().
+ *
+ * Note: this function invokes va_arg at least once, making the value
+ * of ap indeterminate after the return.  The caller should call
+ * va_start/va_end before/after calling this function or use va_copy.
+ *
+ * Tags are taken first from the trailing "..." arguments and then from ap.
+ * For every type except "CO" the arguments are key,value pairs and a NULL
+ * key or a NULL value ends the list; for "CO" each argument is a complete
+ * comment string and no value is read.  Pairs from the "..." list that
+ * have an empty value are skipped.
+ *
+ * If an @HD line already exists, adding another "HD" record updates the
+ * existing line with the ap arguments instead of creating a second one.
+ *
+ * @return
+ * Returns >= 0 on success;
+ *        -1 on failure
+ */
+static int sam_hrecs_vadd(sam_hrecs_t *hrecs, const char *type, va_list ap, ...) {
+    va_list args;
+    sam_hrec_type_t *h_type;
+    sam_hrec_tag_t *h_tag, *last=NULL;
+    int new;
+    khint32_t type_i = (type[0]<<8) | type[1], k;
+
+    if (!strncmp(type, "HD", 2) && (h_type = sam_hrecs_find_type_id(hrecs, "HD", NULL, NULL)))
+        return sam_hrecs_vupdate(hrecs, h_type, ap);
+
+    if (!(h_type = pool_alloc(hrecs->type_pool)))
+        return -1;
+    k = kh_put(sam_hrecs_t, hrecs->h, type_i, &new);
+    if (new < 0)
+        return -1;
+
+    h_type->type = type_i;
+
+    // Form the ring, either with self or other lines of this type
+    if (!new) {
+        sam_hrec_type_t *t = kh_val(hrecs->h, k), *p;
+        p = t->prev;
+
+        assert(p->next == t);
+        p->next = h_type;
+        h_type->prev = p;
+
+        t->prev = h_type;
+        h_type->next = t;
+    } else {
+        kh_val(hrecs->h, k) = h_type;
+        h_type->prev = h_type->next = h_type;
+    }
+    h_type->tag = NULL;
+
+    // Add to global line ordering after any existing line of the same type,
+    // or at the end if no line of this type exists yet.
+    sam_hrecs_global_list_add(hrecs, h_type, !new ? h_type->prev : NULL);
+
+    // Check linked-list invariants
+    assert(h_type->prev->next == h_type);
+    assert(h_type->next->prev == h_type);
+    assert(h_type->global_prev->global_next == h_type);
+    assert(h_type->global_next->global_prev == h_type);
+
+    // Any ... varargs
+    va_start(args, ap);
+    for (;;) {
+        char *key, *val = NULL, *str;
+
+        if (!(key = (char *)va_arg(args, char *)))
+            break;
+        if (strncmp(type, "CO", 2) && !(val = (char *)va_arg(args, char *)))
+            break;
+        // val is only read for non-CO lines; it stays NULL for comments,
+        // so it must not be dereferenced unconditionally.
+        if (val && *val == '\0')
+            continue;
+
+        if (!(h_tag = pool_alloc(hrecs->tag_pool))) {
+            va_end(args);
+            return -1;
+        }
+
+        if (strncmp(type, "CO", 2)) {
+            h_tag->len = 3 + strlen(val);
+            str = string_alloc(hrecs->str_pool, h_tag->len+1);
+            if (!str || snprintf(str, h_tag->len+1, "%2.2s:%s", key, val) < 0) {
+                va_end(args);
+                return -1;
+            }
+            h_tag->str = str;
+        } else {
+            h_tag->len = strlen(key);
+            h_tag->str = string_ndup(hrecs->str_pool, key, h_tag->len);
+            if (!h_tag->str) {
+                va_end(args);
+                return -1;
+            }
+        }
+
+        h_tag->next = NULL;
+        if (last)
+            last->next = h_tag;
+        else
+            h_type->tag = h_tag;
+
+        last = h_tag;
+    }
+    va_end(args);
+
+    // Plus the specified va_list params
+    for (;;) {
+        char *key, *val = NULL, *str;
+
+        if (!(key = (char *)va_arg(ap, char *)))
+            break;
+        if (strncmp(type, "CO", 2) && !(val = (char *)va_arg(ap, char *)))
+            break;
+
+        if (!(h_tag = pool_alloc(hrecs->tag_pool)))
+            return -1;
+
+        if (strncmp(type, "CO", 2)) {
+            h_tag->len = 3 + strlen(val);
+            str = string_alloc(hrecs->str_pool, h_tag->len+1);
+            if (!str || snprintf(str, h_tag->len+1, "%2.2s:%s", key, val) < 0)
+                return -1;
+            h_tag->str = str;
+        } else {
+            h_tag->len = strlen(key);
+            h_tag->str = string_ndup(hrecs->str_pool, key, h_tag->len);
+            if (!h_tag->str)
+                return -1;
+        }
+
+        h_tag->next = NULL;
+        if (last)
+            last->next = h_tag;
+        else
+            h_type->tag = h_tag;
+
+        last = h_tag;
+    }
+
+    // Register the new line in the SQ / RG / PG lookup tables
+    int itype = (type[0]<<8) | type[1];
+    if (-1 == sam_hrecs_update_hashes(hrecs, itype, h_type))
+        return -1;
+
+    if (!strncmp(type, "PG", 2))
+        hrecs->pgs_changed = 1;
+
+    hrecs->dirty = 1;
+
+    return 0;
+}
+
+// As sam_hrecs_vadd(), but without the extra va_list parameter
+static int sam_hrecs_add(sam_hrecs_t *hrecs, const char *type, ...)
{
+    va_list args;
+    int res;
+    va_start(args, type);
+    res = sam_hrecs_vadd(hrecs, type, args, NULL);
+    va_end(args);
+    return res;
+}
+
+/*
+ * Function for deallocating a list of tags
+ *
+ * Returns every tag in the list to hrecs->tag_pool.  Does nothing if
+ * either argument is NULL.
+ */
+
+static void sam_hrecs_free_tags(sam_hrecs_t *hrecs, sam_hrec_tag_t *tag) {
+    if (!hrecs || !tag)
+        return;
+    if (tag->next)
+        sam_hrecs_free_tags(hrecs, tag->next);
+
+    pool_free(hrecs->tag_pool, tag);
+}
+
+/*
+ * Unlink a header line from the global line ordering and from the ring of
+ * lines of its type, drop its @SQ / @RG lookup entries, and release its
+ * tags and record.  Marks the header as dirty.
+ *
+ * Returns 0 on success;
+ *        -1 if an argument is NULL or no line of this type is registered
+ */
+static int sam_hrecs_remove_line(sam_hrecs_t *hrecs, const char *type_name, sam_hrec_type_t *type_found) {
+    if (!hrecs || !type_name || !type_found)
+        return -1;
+
+    int itype = (type_name[0]<<8) | type_name[1];
+    khint_t k = kh_get(sam_hrecs_t, hrecs->h, itype);
+    if (k == kh_end(hrecs->h))
+        return -1;
+
+    // Remove from global list (remembering it could be the only line)
+    if (hrecs->first_line == type_found) {
+        hrecs->first_line = (type_found->global_next != type_found
+                             ? type_found->global_next : NULL);
+    }
+    type_found->global_next->global_prev = type_found->global_prev;
+    type_found->global_prev->global_next = type_found->global_next;
+
+    /* single element in the list */
+    if (type_found->prev == type_found || type_found->next == type_found) {
+        kh_del(sam_hrecs_t, hrecs->h, k);
+    } else {
+        type_found->prev->next = type_found->next;
+        type_found->next->prev = type_found->prev;
+        if (kh_val(hrecs->h, k) == type_found) { //first element
+            kh_val(hrecs->h, k) = type_found->next;
+        }
+    }
+
+    if (!strncmp(type_name, "SQ", 2) || !strncmp(type_name, "RG", 2))
+        sam_hrecs_remove_hash_entry(hrecs, itype, type_found);
+
+    sam_hrecs_free_tags(hrecs, type_found->tag);
+    pool_free(hrecs->type_pool, type_found);
+
+    hrecs->dirty = 1;
+
+    return 0;
+}
+
+// Paste together a line from the parsed data structures
+// Appends "@XX" followed by each tag preceded by a tab; no newline is added.
+// Returns 0 on success, 1 if any append to ks failed (never negative).
+static int build_header_line(const sam_hrec_type_t *ty, kstring_t *ks) {
+    sam_hrec_tag_t *tag;
+    int r = 0;
+    char c[2]= { ty->type >> 8, ty->type & 0xff };
+
+    r |= (kputc_('@', ks) == EOF);
+    r |= (kputsn(c, 2, ks) == EOF);
+    for (tag = ty->tag; tag; tag = tag->next) {
+        r |= (kputc_('\t', ks) == EOF);
+        r |= (kputsn(tag->str, tag->len, ks) == EOF);
+    }
+
+    return r;
+}
+
+// Append every header line, in global order and newline terminated, to ks.
+// Returns 0 on success, -1 on failure.
+static int sam_hrecs_rebuild_lines(const sam_hrecs_t *hrecs, kstring_t *ks) {
+    const sam_hrec_type_t *t1, *t2;
+
+    if (!hrecs->first_line)
+        return kputsn("", 0, ks) >= 0 ? 0 : -1;
+
+    // Walk the circular list once, stopping when we are back at the start
+    t1 = t2 = hrecs->first_line;
+    do {
+        if (build_header_line(t1, ks) != 0)
+            return -1;
+        if (kputc('\n', ks) < 0)
+            return -1;
+
+        t1 = t1->global_next;
+    } while (t1 != t2);
+
+    return 0;
+}
+
+/*
+ * Parse header text into records and add them to hrecs.
+ *
+ * hdr holds one or more '@'-prefixed lines.  len is the maximum length to
+ * read; parsing also stops at a NUL byte.  If len is 0 then strlen(hdr)
+ * is used.
+ *
+ * Returns 0 on success;
+ *        -1 on failure (malformed text or out of memory)
+ */
+static int sam_hrecs_parse_lines(sam_hrecs_t *hrecs, const char *hdr, size_t len) {
+    size_t i, lno;
+
+    if (!hrecs || len > SSIZE_MAX)
+        return -1;
+
+    if (!len)
+        len = strlen(hdr);
+
+    if (len < 3) {
+        if (len == 0 || *hdr == '\0') return 0;
+        sam_hrecs_error("Header line too short", hdr, len, 1);
+        return -1;
+    }
+
+    for (i = 0, lno = 1; i < len - 3 && hdr[i] != '\0'; i++, lno++) {
+        khint32_t type;
+        khint_t k;
+
+        int l_start = i, new;
+        sam_hrec_type_t *h_type;
+        sam_hrec_tag_t *h_tag, *last;
+
+        if (hdr[i] != '@') {
+            sam_hrecs_error("Header line does not start with '@'",
+                          &hdr[l_start], len - l_start, lno);
+            return -1;
+        }
+
+        type = (((uint8_t) hdr[i+1])<<8) | (uint8_t) hdr[i+2];
+        if (!isalpha_c(hdr[i+1]) || !isalpha_c(hdr[i+2])) {
+            sam_hrecs_error("Header line does not have a two character key",
+                          &hdr[l_start], len - l_start, lno);
+            return -1;
+        }
+
+        // A line consisting of only "@XX" carries no tags and is skipped
+        i += 3;
+        if (i == len || hdr[i] == '\n')
+            continue;
+
+        // Add the header line type
+        if (!(h_type = pool_alloc(hrecs->type_pool)))
+            return -1;
+        k = kh_put(sam_hrecs_t, hrecs->h, type, &new);
+        if (new < 0)
+            return -1;
+
+        h_type->type = type;
+        // The record is linked into the lists below before its tags are
+        // parsed, so make sure the tag list is well defined if parsing
+        // fails part-way (as sam_hrecs_vadd() does).
+        h_type->tag = NULL;
+
+        // Add to end of global list
+        sam_hrecs_global_list_add(hrecs, h_type, NULL);
+
+        // Form the ring, either with self or other lines of this type
+        if (!new) {
+            sam_hrec_type_t *t = kh_val(hrecs->h, k), *p;
+            p = t->prev;
+
+            assert(p->next == t);
+            p->next = h_type;
+            h_type->prev = p;
+
+            t->prev = h_type;
+            h_type->next = t;
+        } else {
+            kh_val(hrecs->h, k) = h_type;
+            h_type->prev = h_type->next = h_type;
+        }
+
+        // Parse the tags on this line
+        last = NULL;
+        if ((type>>8) == 'C' && (type&0xff) == 'O') {
+            // @CO: everything after the tab is stored as a single tag
+            size_t j;
+
+            if (i == len || hdr[i] != '\t') {
+                sam_hrecs_error("Missing tab",
+                              &hdr[l_start], len - l_start, lno);
+                return -1;
+            }
+
+            for (j = ++i; j < len && hdr[j] != '\0' && hdr[j] != '\n'; j++)
+                ;
+
+            if (!(h_type->tag = h_tag = pool_alloc(hrecs->tag_pool)))
+                return -1;
+            h_tag->str = string_ndup(hrecs->str_pool, &hdr[i], j-i);
+            h_tag->len = j-i;
+            h_tag->next = NULL;
+            if (!h_tag->str)
+                return -1;
+
+            i = j;
+
+        } else {
+            // Other types: a sequence of tab-separated "XX:value" tags
+            do {
+                size_t j;
+
+                if (i == len || hdr[i] != '\t') {
+                    sam_hrecs_error("Missing tab",
+                                  &hdr[l_start], len - l_start, lno);
+                    return -1;
+                }
+
+                for (j = ++i; j < len && hdr[j] != '\0' && hdr[j] != '\n' && hdr[j] != '\t'; j++)
+                    ;
+
+                if (j - i < 3 || hdr[i + 2] != ':') {
+                    sam_hrecs_error("Malformed key:value pair",
+                                   &hdr[l_start], len - l_start, lno);
+                    return -1;
+                }
+
+                if (!(h_tag = pool_alloc(hrecs->tag_pool)))
+                    return -1;
+                h_tag->str = string_ndup(hrecs->str_pool, &hdr[i], j-i);
+                h_tag->len = j-i;
+                h_tag->next = NULL;
+                if (!h_tag->str)
+                    return -1;
+
+                if (last)
+                    last->next = h_tag;
+                else
+                    h_type->tag = h_tag;
+
+                last = h_tag;
+                i = j;
+            } while (i < len && hdr[i] != '\0' && hdr[i] != '\n');
+        }
+
+        /* Update RG/SQ hashes */
+        if (-1 == sam_hrecs_update_hashes(hrecs, type, h_type))
+            return -1;
+    }
+
+    return 0;
+}
+
+/*!
Update sam_hdr_t target_name and target_len arrays
+ *
+ *  Entries from index refs_changed onwards are synchronised with
+ *  hrecs->ref; a negative refs_changed means there is nothing to do.
+ *  Lengths that do not fit in 32 bits are stored as UINT32_MAX in
+ *  target_len.
+ *
+ *  @return 0 on success; -1 on failure
+ */
+int sam_hdr_update_target_arrays(sam_hdr_t *bh, const sam_hrecs_t *hrecs,
+                                 int refs_changed) {
+    if (!bh || !hrecs)
+        return -1;
+
+    if (refs_changed < 0)
+        return 0;
+
+    // Grow arrays if necessary
+    if (bh->n_targets < hrecs->nref) {
+        char **new_names = realloc(bh->target_name,
+                                   hrecs->nref * sizeof(*new_names));
+        if (!new_names)
+            return -1;
+        bh->target_name = new_names;
+        uint32_t *new_lens = realloc(bh->target_len,
+                                     hrecs->nref * sizeof(*new_lens));
+        if (!new_lens)
+            return -1;
+        bh->target_len = new_lens;
+    }
+
+    // Update names and lengths where changed
+    // hrecs->refs_changed is the first ref that has been updated, so ones
+    // before that can be skipped.
+    int i;
+    khint_t k;
+    khash_t(s2i) *long_refs = (khash_t(s2i) *) bh->sdict;
+    for (i = refs_changed; i < hrecs->nref; i++) {
+        if (i >= bh->n_targets
+            || strcmp(bh->target_name[i], hrecs->ref[i].name) != 0) {
+            if (i < bh->n_targets)
+                free(bh->target_name[i]);
+            bh->target_name[i] = strdup(hrecs->ref[i].name);
+            if (!bh->target_name[i])
+                return -1;
+        }
+        if (hrecs->ref[i].len < UINT32_MAX) {
+            bh->target_len[i] = hrecs->ref[i].len;
+
+            if (!long_refs)
+                continue;
+
+            // Check if we have an old length, if so remove it.
+            k = kh_get(s2i, long_refs, bh->target_name[i]);
+            if (k < kh_end(long_refs))
+                kh_del(s2i, long_refs, k);
+        } else {
+            bh->target_len[i] = UINT32_MAX;
+            if (bh->hrecs != hrecs) {
+                // Called from sam_hdr_dup; need to add sdict entries
+                if (!long_refs) {
+                    if (!(bh->sdict = long_refs = kh_init(s2i)))
+                        return -1;
+                }
+
+                // Add / update length
+                int absent;
+                k = kh_put(s2i, long_refs, bh->target_name[i], &absent);
+                if (absent < 0)
+                    return -1;
+                kh_val(long_refs, k) = hrecs->ref[i].len;
+            }
+        }
+    }
+
+    // Free up any names that have been removed
+    for (; i < bh->n_targets; i++) {
+        if (long_refs) {
+            k = kh_get(s2i, long_refs, bh->target_name[i]);
+            if (k < kh_end(long_refs))
+                kh_del(s2i, long_refs, k);
+        }
+        free(bh->target_name[i]);
+    }
+
+    bh->n_targets = hrecs->nref;
+    return 0;
+}
+
+// Bring bh's target arrays up to date with bh->hrecs if any references
+// have changed, then clear the refs_changed marker.
+// Returns 0 on success, -1 on failure.
+static int rebuild_target_arrays(sam_hdr_t *bh) {
+    if (!bh || !bh->hrecs)
+        return -1;
+
+    sam_hrecs_t *hrecs = bh->hrecs;
+    if (hrecs->refs_changed < 0)
+        return 0;
+
+    if (sam_hdr_update_target_arrays(bh, hrecs, hrecs->refs_changed) != 0)
+        return -1;
+
+    hrecs->refs_changed = -1;
+    return 0;
+}
+
+/// Populate hrecs refs array from header target_name, target_len arrays
+/**
+ * @return 0 on success; -1 on failure
+ *
+ * Pre-fills the refs hash from the target arrays.  For BAM files this
+ * will ensure that they are in the correct order as the target arrays
+ * are the canonical source for converting target ids to names and lengths.
+ *
+ * The added entries do not link to a header line. sam_hrecs_update_hashes()
+ * will add the links later for lines found in the text header.
+ *
+ * This should be called before the text header is parsed.
+ */
+static int sam_hrecs_refs_from_targets_array(sam_hrecs_t *hrecs,
+                                             const sam_hdr_t *bh) {
+    int32_t tid = 0;
+
+    if (!hrecs || !bh)
+        return -1;
+
+    // This should always be called before parsing the text header
+    // so the ref array should start off empty, and we don't have to try
+    // to reconcile any existing data.
+    if (hrecs->nref > 0) {
+        hts_log_error("Called with non-empty ref array");
+        return -1;
+    }
+
+    if (hrecs->ref_sz < bh->n_targets) {
+        sam_hrec_sq_t *new_ref = realloc(hrecs->ref,
+                                         bh->n_targets * sizeof(*new_ref));
+        if (!new_ref)
+            return -1;
+
+        hrecs->ref = new_ref;
+        hrecs->ref_sz = bh->n_targets;
+    }
+
+    for (tid = 0; tid < bh->n_targets; tid++) {
+        khint_t k;
+        int r;
+        hrecs->ref[tid].name = string_dup(hrecs->str_pool, bh->target_name[tid]);
+        if (!hrecs->ref[tid].name) goto fail;
+        if (bh->target_len[tid] < UINT32_MAX || !bh->sdict) {
+            hrecs->ref[tid].len  = bh->target_len[tid];
+        } else {
+            // target_len is saturated; the real length may be in sdict
+            khash_t(s2i) *long_refs = (khash_t(s2i) *) bh->sdict;
+            k = kh_get(s2i, long_refs, hrecs->ref[tid].name);
+            if (k < kh_end(long_refs)) {
+                hrecs->ref[tid].len = kh_val(long_refs, k);
+            } else {
+                hrecs->ref[tid].len = UINT32_MAX;
+            }
+        }
+        // No header line is attached yet
+        hrecs->ref[tid].ty   = NULL;
+        k = kh_put(m_s2i, hrecs->ref_hash, hrecs->ref[tid].name, &r);
+        if (r < 0) goto fail;
+        if (r == 0) {
+            hts_log_error("Duplicate entry \"%s\" in target list",
+                            hrecs->ref[tid].name);
+            return -1;
+        } else {
+            kh_val(hrecs->ref_hash, k) = tid;
+        }
+    }
+    hrecs->nref = bh->n_targets;
+    return 0;
+
+ fail: {
+        int32_t i;
+        hts_log_error("%s", strerror(errno));
+        // Undo the hash insertions made for entries 0 .. tid-1.  Each one
+        // must be looked up by its own name: entry tid is the one that
+        // failed, and its name may be NULL.
+        for (i = 0; i < tid; i++) {
+            khint_t k;
+            if (!hrecs->ref[i].name) continue;
+            k = kh_get(m_s2i, hrecs->ref_hash, hrecs->ref[i].name);
+            if (k < kh_end(hrecs->ref_hash)) kh_del(m_s2i, hrecs->ref_hash, k);
+        }
+        hrecs->nref = 0;
+        return -1;
+    }
+}
+
+/*
+ * Add SQ header records for any references in the hrecs->ref array that
+ * were added by sam_hrecs_refs_from_targets_array() but have not
+ * been linked to an @SQ line by sam_hrecs_update_hashes() yet.
+ *
+ * This may be needed either because:
+ *
+ *   - A bam file was read that had entries in its refs list with no
+ *     corresponding @SQ line.
+ *
+ *   - A program constructed a sam_hdr_t which has target_name and target_len
+ *     array entries with no corresponding @SQ line in text.
+ */
+static int add_stub_ref_sq_lines(sam_hrecs_t *hrecs) {
+    int tid;
+    char len[32];
+
+    for (tid = 0; tid < hrecs->nref; tid++) {
+        if (hrecs->ref[tid].ty == NULL) {
+            snprintf(len, sizeof(len), "%"PRIhts_pos, hrecs->ref[tid].len);
+            if (sam_hrecs_add(hrecs, "SQ",
+                              "SN", hrecs->ref[tid].name,
+                              "LN", len, NULL) != 0)
+                return -1;
+
+            // Check that the stub has actually been filled
+            if(hrecs->ref[tid].ty == NULL) {
+                hts_log_error("Reference stub with tid=%d, name=\"%s\", len=%"PRIhts_pos" could not be filled",
+                        tid, hrecs->ref[tid].name, hrecs->ref[tid].len);
+                return -1;
+            }
+        }
+    }
+    return 0;
+}
+
+/*
+ * Build the parsed header records (bh->hrecs) for a header.
+ *
+ * The reference list is seeded from the target arrays (when present), the
+ * header text is then parsed, and finally @SQ lines are added for any
+ * targets that the text did not describe.
+ *
+ * Returns 0 on success;
+ *        -1 on failure, including when bh is NULL
+ */
+int sam_hdr_fill_hrecs(sam_hdr_t *bh) {
+    sam_hrecs_t *hrecs;
+
+    // Reject a NULL header, as the other public functions do
+    if (!bh)
+        return -1;
+
+    hrecs = sam_hrecs_new();
+    if (!hrecs)
+        return -1;
+
+    if (bh->target_name && bh->target_len && bh->n_targets > 0) {
+        if (sam_hrecs_refs_from_targets_array(hrecs, bh) != 0) {
+            sam_hrecs_free(hrecs);
+            return -1;
+        }
+    }
+
+    // Parse existing header text
+    if (bh->text && bh->l_text > 0) {
+        if (sam_hrecs_parse_lines(hrecs, bh->text, bh->l_text) != 0) {
+            sam_hrecs_free(hrecs);
+            return -1;
+        }
+    }
+
+    if (add_stub_ref_sq_lines(hrecs) < 0) {
+        sam_hrecs_free(hrecs);
+        return -1;
+    }
+
+    // From here on the records are owned by the header
+    bh->hrecs = hrecs;
+
+    if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
+        return -1;
+
+    return 0;
+}
+
+/** Remove outdated header text
+
+    @param bh     BAM header
+
+    This is called when API functions have changed the header so that the
+    text version is no longer valid.
+ */
+static void redact_header_text(sam_hdr_t *bh) {
+    assert(bh->hrecs && bh->hrecs->dirty);
+    bh->l_text = 0;
+    free(bh->text);
+    bh->text = NULL;
+}
+
+/** Find nth header record of a given type
+
+    @param type   Header type (SQ, RG etc.)
+    @param idx    0-based index
+
+    @return sam_hrec_type_t pointer to the record on success
+            NULL if no record exists with the given type and index
+ */
+
+static sam_hrec_type_t *sam_hrecs_find_type_pos(sam_hrecs_t *hrecs,
+                                                const char *type, int idx) {
+    sam_hrec_type_t *first, *itr;
+
+    if (idx < 0)
+        return NULL;
+
+    // SQ, RG and PG lines have index arrays, so can be looked up directly
+    if (type[0] == 'S' && type[1] == 'Q')
+        return idx < hrecs->nref ? hrecs->ref[idx].ty : NULL;
+
+    if (type[0] == 'R' && type[1] == 'G')
+        return idx < hrecs->nrg ? hrecs->rg[idx].ty : NULL;
+
+    if (type[0] == 'P' && type[1] == 'G')
+        return idx < hrecs->npg ? hrecs->pg[idx].ty : NULL;
+
+    // Other types: step idx places around the ring of lines of this type
+    first = itr = sam_hrecs_find_type_id(hrecs, type, NULL, NULL);
+    if (!first)
+        return NULL;
+
+    while (idx > 0) {
+        itr = itr->next;
+        if (itr == first)
+            break;   // wrapped round: fewer than idx+1 lines of this type
+        --idx;
+    }
+
+    return idx == 0 ? itr : NULL;
+}
+
+/* ==== Public methods ==== */
+
+/*
+ * Returns the length of the header text, rebuilding it first if needed;
+ *         SIZE_MAX if bh is NULL or the rebuild fails
+ */
+size_t sam_hdr_length(sam_hdr_t *bh) {
+    if (!bh || -1 == sam_hdr_rebuild(bh))
+        return SIZE_MAX;
+
+    return bh->l_text;
+}
+
+/*
+ * Returns the header text, rebuilding it first if needed;
+ *         NULL if bh is NULL or the rebuild fails
+ */
+const char *sam_hdr_str(sam_hdr_t *bh) {
+    if (!bh || -1 == sam_hdr_rebuild(bh))
+        return NULL;
+
+    return bh->text;
+}
+
+/*
+ * Returns the number of references, taken from the parsed records when
+ * they exist and from n_targets otherwise; -1 if bh is NULL
+ */
+int sam_hdr_nref(const sam_hdr_t *bh) {
+    if (!bh)
+        return -1;
+
+    return bh->hrecs ? bh->hrecs->nref : bh->n_targets;
+}
+
+/*
+ * Reconstructs the text representation from the header hash table.
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int sam_hdr_rebuild(sam_hdr_t *bh) {
+    sam_hrecs_t *hrecs;
+    if (!bh)
+        return -1;
+
+    // Without parsed records the existing text (if any) is all there is
+    if (!(hrecs = bh->hrecs))
+        return bh->text ? 0 : -1;
+
+    if (hrecs->refs_changed >= 0) {
+        if (rebuild_target_arrays(bh) < 0) {
+            hts_log_error("Header target array rebuild has failed");
+            return -1;
+        }
+    }
+
+    /* If header text wasn't changed or header is empty, don't rebuild it. */
+    if (!hrecs->dirty)
+        return 0;
+
+    if (hrecs->pgs_changed)
+        sam_hdr_link_pg(bh);
+
+    kstring_t ks = KS_INITIALIZE;
+    if (sam_hrecs_rebuild_text(hrecs, &ks) != 0) {
+        ks_free(&ks);
+        hts_log_error("Header text rebuild has failed");
+        return -1;
+    }
+
+    hrecs->dirty = 0;
+
+    /* Sync */
+    free(bh->text);
+    bh->l_text = ks_len(&ks);
+    bh->text = ks_release(&ks);
+
+    return 0;
+}
+
+/*
+ * Appends a formatted line to an existing SAM header.
+ * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
+ * optional new-line. If it contains more than 1 line then multiple lines
+ * will be added in order.
+ *
+ * Input text is of maximum length len or as terminated earlier by a NUL.
+ * len may be 0 if unknown, in which case lines must be NUL-terminated.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int sam_hdr_add_lines(sam_hdr_t *bh, const char *lines, size_t len) {
+    sam_hrecs_t *hrecs;
+
+    if (!bh || !lines)
+        return -1;
+
+    // Nothing to add
+    if (len == 0 && *lines == '\0')
+        return 0;
+
+    // Parse the existing header on first use
+    if (!(hrecs = bh->hrecs)) {
+        if (sam_hdr_fill_hrecs(bh) != 0)
+            return -1;
+        hrecs = bh->hrecs;
+    }
+
+    if (sam_hrecs_parse_lines(hrecs, lines, len) != 0)
+        return -1;
+
+    if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
+        return -1;
+
+    // The stored text no longer matches the records, so discard it
+    hrecs->dirty = 1;
+    redact_header_text(bh);
+
+    return 0;
+}
+
+/*
+ * Adds a single line to a SAM header.
+ * Specify type and one or more key,value pairs, ending with the NULL key.
+ * Eg. sam_hdr_add_line(h, "SQ", "SN", "foo", "LN", "100", NULL).
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int sam_hdr_add_line(sam_hdr_t *bh, const char *type, ...)
{
+    va_list args;
+    sam_hrecs_t *hrecs;
+
+    if (!bh || !type)
+        return -1;
+
+    // Parse the existing header on first use
+    if (!(hrecs = bh->hrecs)) {
+        if (sam_hdr_fill_hrecs(bh) != 0)
+            return -1;
+        hrecs = bh->hrecs;
+    }
+
+    va_start(args, type);
+    int ret = sam_hrecs_vadd(hrecs, type, args, NULL);
+    va_end(args);
+
+    if (ret == 0) {
+        if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
+            return -1;
+
+        // The stored text no longer matches the records, so discard it
+        if (hrecs->dirty)
+            redact_header_text(bh);
+    }
+
+    return ret;
+}
+
+/*
+ * Returns a complete line of formatted text for a specific head type/ID
+ * combination. If ID_key is NULL then it returns the first line of the specified
+ * type.
+ *
+ * The line is written to ks, replacing its previous content.
+ *
+ * Returns 0 on success
+ *        -1 if no matching line was found
+ *        -2 on other failures
+ */
+int sam_hdr_find_line_id(sam_hdr_t *bh, const char *type,
+                      const char *ID_key, const char *ID_val, kstring_t *ks) {
+    sam_hrecs_t *hrecs;
+    if (!bh || !type)
+        return -2;
+
+    if (!(hrecs = bh->hrecs)) {
+        if (sam_hdr_fill_hrecs(bh) != 0)
+            return -2;
+        hrecs = bh->hrecs;
+    }
+
+    sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_val);
+    if (!ty)
+        return -1;
+
+    ks->l = 0;
+    // build_header_line() reports failure with a non-zero (positive) value
+    if (build_header_line(ty, ks) != 0) {
+        return -2;
+    }
+
+    return 0;
+}
+
+/*
+ * Returns a complete line of formatted text for the line at position pos
+ * among the lines of the given type.
+ *
+ * The line is written to ks, replacing its previous content.
+ *
+ * Returns 0 on success
+ *        -1 if no matching line was found
+ *        -2 on other failures
+ */
+int sam_hdr_find_line_pos(sam_hdr_t *bh, const char *type,
+                          int pos, kstring_t *ks) {
+    sam_hrecs_t *hrecs;
+    if (!bh || !type)
+        return -2;
+
+    if (!(hrecs = bh->hrecs)) {
+        if (sam_hdr_fill_hrecs(bh) != 0)
+            return -2;
+        hrecs = bh->hrecs;
+    }
+
+    sam_hrec_type_t *ty = sam_hrecs_find_type_pos(hrecs, type, pos);
+    if (!ty)
+        return -1;
+
+    ks->l = 0;
+    // build_header_line() reports failure with a non-zero (positive) value
+    if (build_header_line(ty, ks) != 0) {
+        return -2;
+    }
+
+    return 0;
+}
+
+/*
+ * Remove a line from the header by specifying a tag:value that uniquely
+ * identifies a line, i.e. the @SQ line containing "SN:ref1".
+ * @SQ line is uniquely identified by SN tag.
+ * @RG line is uniquely identified by ID tag.
+ * @PG line is uniquely identified by ID tag.
+ *
+ * Returns 0 on success and -1 on error
+ */
+
+int sam_hdr_remove_line_id(sam_hdr_t *bh, const char *type, const char *ID_key, const char *ID_value) {
+    sam_hrecs_t *hrecs;
+    if (!bh || !type)
+        return -1;
+
+    // Parse the existing header on first use
+    if (!(hrecs = bh->hrecs)) {
+        if (sam_hdr_fill_hrecs(bh) != 0)
+            return -1;
+        hrecs = bh->hrecs;
+    }
+
+    if (!strncmp(type, "PG", 2)) {
+        hts_log_warning("Removing PG lines is not supported!");
+        return -1;
+    }
+
+    // A line that does not exist counts as already removed
+    sam_hrec_type_t *type_found = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
+    if (!type_found)
+        return 0;
+
+    int ret = sam_hrecs_remove_line(hrecs, type, type_found);
+    if (ret == 0) {
+        if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
+            return -1;
+
+        // The stored text no longer matches the records, so discard it
+        if (hrecs->dirty)
+            redact_header_text(bh);
+    }
+
+    return ret;
+}
+
+/*
+ * Remove a line from the header by specifying the position in the type
+ * group.  Positions are zero-based, so position 2 is the 3rd @SQ line.
+ *
+ * Returns 0 on success and -1 on error
+ */
+
+int sam_hdr_remove_line_pos(sam_hdr_t *bh, const char *type, int position) {
+    sam_hrecs_t *hrecs;
+    // Position 0 is valid: it is the first line of the given type
+    if (!bh || !type || position < 0)
+        return -1;
+
+    // Parse the existing header on first use
+    if (!(hrecs = bh->hrecs)) {
+        if (sam_hdr_fill_hrecs(bh) != 0)
+            return -1;
+        hrecs = bh->hrecs;
+    }
+
+    if (!strncmp(type, "PG", 2)) {
+        hts_log_warning("Removing PG lines is not supported!");
+        return -1;
+    }
+
+    sam_hrec_type_t *type_found = sam_hrecs_find_type_pos(hrecs, type,
+                                                          position);
+    if (!type_found)
+        return -1;
+
+    int ret = sam_hrecs_remove_line(hrecs, type, type_found);
+    if (ret == 0) {
+        if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
+            return -1;
+
+        // The stored text no longer matches the records, so discard it
+        if (hrecs->dirty)
+            redact_header_text(bh);
+    }
+
+    return ret;
+}
+
+/*
+ * Check if sam_hdr_update_line() is being used to change the name of
+ * a record, and if the new name is going to clash with an existing one.
+ *
+ * If ap includes repeated keys, we go with the last one as sam_hrecs_vupdate()
+ * will go through them all and leave the final one in place.
+ *
+ * Only @SQ (named by SN), @RG and @PG (named by ID) records are checked.
+ * For those types id_tag_out receives the naming tag (including its NUL)
+ * and *hash_out the matching name hash.  *old_name is set when the record
+ * has a naming tag; *new_name is set when ap supplies a different name.
+ *
+ * Returns 0 if the name does not change
+ *         1 if the name changes but does not clash
+ *        -1 if the name changes and the new one is already in use
+ */
+static int check_for_name_update(sam_hrecs_t *hrecs, sam_hrec_type_t *rec,
+                                 va_list ap, const char **old_name,
+                                 const char **new_name,
+                                 char id_tag_out[3],
+                                 khash_t(m_s2i) **hash_out) {
+    char *key, *val;
+    const char *id_tag;
+    sam_hrec_tag_t *tag, *prev;
+    khash_t(m_s2i) *hash;
+    khint_t k;
+    int ret = 0;
+
+    // Work out which tag names this record, and which hash indexes it
+    if (rec->type == TYPEKEY("SQ")) {
+        id_tag = "SN"; hash = hrecs->ref_hash;
+    } else if (rec->type == TYPEKEY("RG")) {
+        id_tag = "ID"; hash = hrecs->rg_hash;
+    } else if (rec->type == TYPEKEY("PG")) {
+        id_tag = "ID"; hash = hrecs->pg_hash;
+    } else {
+        // Other record types are not indexed by name
+        return 0;
+    }
+
+    memcpy(id_tag_out, id_tag, 3);
+    *hash_out = hash;
+
+    tag = sam_hrecs_find_key(rec, id_tag, &prev);
+    if (!tag)
+        return 0;
+    assert(tag->len >= 3);
+    *old_name = tag->str + 3;
+
+    // Scan all key,value pairs; the last occurrence of id_tag decides
+    while ((key = va_arg(ap, char *)) != NULL) {
+        val = va_arg(ap, char *);
+        if (!val) val = "";
+        if (strcmp(key, id_tag) != 0) continue;
+        if (strcmp(val, tag->str + 3) == 0) { ret = 0; continue; }
+        k = kh_get(m_s2i, hash, val);
+        ret = k < kh_end(hash) ? -1 : 1;
+        *new_name = val;
+    }
+    return ret;
+}
+
+int sam_hdr_update_line(sam_hdr_t *bh, const char *type,
+        const char *ID_key, const char *ID_value, ...)
{
+    sam_hrecs_t *hrecs;
+    // type is dereferenced below, so reject NULL as the sibling functions do
+    if (!bh || !type)
+        return -1;
+
+    // Parse the existing header on first use
+    if (!(hrecs = bh->hrecs)) {
+        if (sam_hdr_fill_hrecs(bh) != 0)
+            return -1;
+        hrecs = bh->hrecs;
+    }
+
+    int ret, rename;
+    sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
+    if (!ty)
+        return -1;
+
+    // First pass over the arguments: detect a rename and any name clash
+    va_list args;
+    const char *old_name = "?", *new_name = "?";
+    char id_tag[3];
+    khash_t(m_s2i) *hash = NULL;
+    va_start(args, ID_value);
+    rename = check_for_name_update(hrecs, ty, args,
+                                   &old_name, &new_name, id_tag, &hash);
+    va_end(args);
+    if (rename < 0) {
+        hts_log_error("Cannot rename @%s \"%s\" to \"%s\" : already exists",
+                      type, old_name, new_name);
+        return -1;
+    }
+    if (rename > 0 && TYPEKEY(type) == TYPEKEY("PG")) {
+        // This is just too complicated
+        hts_log_error("Renaming @PG records is not supported");
+        return -1;
+    }
+    // Second pass: apply the changes to the record's tags
+    va_start(args, ID_value);
+    ret = sam_hrecs_vupdate(hrecs, ty, args);
+    va_end(args);
+
+    if (ret)
+        return ret;
+
+    // TODO Account for @SQ-AN altnames
+
+    if (rename) {
+        // Adjust the hash table to point to the new name
+        // sam_hrecs_update_hashes() should sort out everything else
+        khint_t k = kh_get(m_s2i, hash, old_name);
+        sam_hrec_tag_t *new_tag = sam_hrecs_find_key(ty, id_tag, NULL);
+        int r, pos;
+        assert(k < kh_end(hash));        // Or we wouldn't have found it earlier
+        assert(new_tag && new_tag->str); // id_tag should exist
+        assert(new_tag->len > 3);
+        pos = kh_val(hash, k);
+        kh_del(m_s2i, hash, k);
+        k = kh_put(m_s2i, hash, new_tag->str + 3, &r);
+        if (r < 1) {
+            hts_log_error("Failed to rename item in hash table");
+            return -1;
+        }
+        kh_val(hash, k) = pos;
+    }
+
+    ret = sam_hrecs_update_hashes(hrecs, TYPEKEY(type), ty);
+
+    if (!ret && hrecs->refs_changed >= 0)
+        ret = rebuild_target_arrays(bh);
+
+    if (!ret && hrecs->dirty)
+        redact_header_text(bh);
+
+    return ret;
+}
+
+/*
+ * Remove all lines of the given type except the one identified by
+ * ID_key / ID_value.  If ID_key is NULL, or no line matches, every line of
+ * the type is removed.  @PG and @CO lines cannot be removed this way.
+ *
+ * Returns 0 on success (including when there was nothing to remove);
+ *        -1 on failure
+ */
+int sam_hdr_remove_except(sam_hdr_t *bh, const char *type, const char *ID_key, const char *ID_value) {
+    sam_hrecs_t *hrecs;
+    if (!bh || !type)
+        return -1;
+
+    if (!(hrecs = bh->hrecs)) {
+        if (sam_hdr_fill_hrecs(bh) != 0)
+            return -1;
+        hrecs = bh->hrecs;
+    }
+
+    sam_hrec_type_t *step;
+    // ret starts at 1 and is cleared by the first successful removal
+    // (sam_hrecs_remove_line() returns 0 on success).
+    int ret = 1, remove_all = (ID_key == NULL);
+
+    if (!strncmp(type, "PG", 2) || !strncmp(type, "CO", 2)) {
+        hts_log_warning("Removing PG or CO lines is not supported!");
+        return -1;
+    }
+
+    sam_hrec_type_t *type_found = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
+    if (!type_found) { // remove all line of this type
+        int itype = (type[0]<<8)|(type[1]);
+        khint_t k = kh_get(sam_hrecs_t, hrecs->h, itype);
+        if (k == kh_end(hrecs->h))
+            return 0;
+        type_found =  kh_val(hrecs->h, k);
+        if (!type_found)
+            return 0;
+        remove_all = 1;
+    }
+
+    // Remove every other line in the ring, leaving type_found until last
+    step = type_found->next;
+    while (step != type_found) {
+        sam_hrec_type_t *to_remove = step;
+        step = step->next;
+        ret &= sam_hrecs_remove_line(hrecs, type, to_remove);
+    }
+
+    if (remove_all)
+        ret &= sam_hrecs_remove_line(hrecs, type, type_found);
+
+    if (!ret) {
+        // Something was removed.  Keep the target arrays in step with the
+        // reference list, as the other removal functions do.
+        if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
+            return -1;
+
+        if (hrecs->dirty)
+            redact_header_text(bh);
+    }
+
+    return 0;
+}
+
+int sam_hdr_remove_lines(sam_hdr_t *bh, const char *type, const char *id, void *h) {
+    sam_hrecs_t *hrecs;
+    rmhash_t *rh = (rmhash_t *)h;
+
+    if (!bh || !type)
+        return -1;
+    if (!rh) // remove all lines
+        return sam_hdr_remove_except(bh, type, NULL, NULL);
+    if (!id)
+        return -1;
+
+    if (!(hrecs = bh->hrecs)) {
+        if (sam_hdr_fill_hrecs(bh) != 0)
+            return -1;
+        hrecs = bh->hrecs;
+    }
+
+    int itype = (type[0]<<8)|(type[1]);
+    khint_t k = kh_get(sam_hrecs_t, hrecs->h, itype);
+    if (k == kh_end(hrecs->h)) // nothing to remove from
+        return 0;
+
+    sam_hrec_type_t *head = kh_val(hrecs->h, k);
+    if (!head) {
+        hts_log_error("Header inconsistency");
+        return -1;
+    }
+
+    int ret = 0;
+    sam_hrec_type_t *step = head->next;
+    while (step != head) {
+        sam_hrec_tag_t *tag = sam_hrecs_find_key(step, id, NULL);
+        if (tag && tag->str && tag->len >= 3) {
+           k = kh_get(rm, rh, tag->str+3);
+           if (k == kh_end(rh)) { // value is not in the hash table, so remove
+               sam_hrec_type_t *to_remove = step;
+               step =
step->next; + ret |= sam_hrecs_remove_line(hrecs, type, to_remove); + } else { + step = step->next; + } + } else { // tag is not on the line, so skip to next line + step = step->next; + } + } + + // process the first line + sam_hrec_tag_t * tag = sam_hrecs_find_key(head, id, NULL); + if (tag && tag->str && tag->len >= 3) { + k = kh_get(rm, rh, tag->str+3); + if (k == kh_end(rh)) { // value is not in the hash table, so remove + sam_hrec_type_t *to_remove = head; + head = head->next; + ret |= sam_hrecs_remove_line(hrecs, type, to_remove); + } + } + + if (!ret && hrecs->dirty) + redact_header_text(bh); + + return ret; +} + +int sam_hdr_count_lines(sam_hdr_t *bh, const char *type) { + int count; + sam_hrec_type_t *first_ty, *itr_ty; + + if (!bh || !type) + return -1; + + if (!bh->hrecs) { + if (sam_hdr_fill_hrecs(bh) != 0) + return -1; + } + + // Deal with types that have counts + switch (type[0]) { + case 'S': + if (type[1] == 'Q') + return bh->hrecs->nref; + break; + case 'R': + if (type[1] == 'G') + return bh->hrecs->nrg; + break; + case 'P': + if (type[1] == 'G') + return bh->hrecs->npg; + break; + default: + break; + } + + first_ty = sam_hrecs_find_type_id(bh->hrecs, type, NULL, NULL); + if (!first_ty) + return 0; + + count = 1; + for (itr_ty = first_ty->next; + itr_ty && itr_ty != first_ty; itr_ty = itr_ty->next) { + count++; + } + + return count; +} + +int sam_hdr_line_index(sam_hdr_t *bh, + const char *type, + const char *key) { + sam_hrecs_t *hrecs; + if (!bh || !type || !key) + return -2; + + if (!(hrecs = bh->hrecs)) { + if (sam_hdr_fill_hrecs(bh) != 0) + return -2; + hrecs = bh->hrecs; + } + + khint_t k; + int idx = -1; + switch (type[0]) { + case 'S': + if (type[1] == 'Q') { + k = kh_get(m_s2i, hrecs->ref_hash, key); + if (k != kh_end(hrecs->ref_hash)) + idx = kh_val(hrecs->ref_hash, k); + } else { + hts_log_warning("Type '%s' not supported. 
Only @SQ, @RG and @PG lines are indexed", type); + } + break; + case 'R': + if (type[1] == 'G') { + k = kh_get(m_s2i, hrecs->rg_hash, key); + if (k != kh_end(hrecs->rg_hash)) + idx = kh_val(hrecs->rg_hash, k); + } else { + hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type); + } + break; + case 'P': + if (type[1] == 'G') { + k = kh_get(m_s2i, hrecs->pg_hash, key); + if (k != kh_end(hrecs->pg_hash)) + idx = kh_val(hrecs->pg_hash, k); + } else { + hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type); + } + break; + default: + hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type); + } + + return idx; +} + +const char *sam_hdr_line_name(sam_hdr_t *bh, + const char *type, + int pos) { + sam_hrecs_t *hrecs; + if (!bh || !type || pos < 0) + return NULL; + + if (!(hrecs = bh->hrecs)) { + if (sam_hdr_fill_hrecs(bh) != 0) + return NULL; + hrecs = bh->hrecs; + } + + switch (type[0]) { + case 'S': + if (type[1] == 'Q') { + if (pos < hrecs->nref) + return hrecs->ref[pos].name; + } else { + hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type); + } + break; + case 'R': + if (type[1] == 'G') { + if (pos < hrecs->nrg) + return hrecs->rg[pos].name; + } else { + hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type); + } + break; + case 'P': + if (type[1] == 'G') { + if (pos < hrecs->npg) + return hrecs->pg[pos].name; + } else { + hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type); + } + break; + default: + hts_log_warning("Type '%s' not supported. 
Only @SQ, @RG and @PG lines are indexed", type); + } + + return NULL; +} + +/* ==== Key:val level methods ==== */ + +int sam_hdr_find_tag_id(sam_hdr_t *bh, + const char *type, + const char *ID_key, + const char *ID_value, + const char *key, + kstring_t *ks) { + sam_hrecs_t *hrecs; + if (!bh || !type || !key) + return -2; + + if (!(hrecs = bh->hrecs)) { + if (sam_hdr_fill_hrecs(bh) != 0) + return -2; + hrecs = bh->hrecs; + } + + sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value); + if (!ty) + return -1; + + sam_hrec_tag_t *tag = sam_hrecs_find_key(ty, key, NULL); + if (!tag || !tag->str || tag->len < 4) + return -1; + + ks->l = 0; + if (kputsn(tag->str+3, tag->len-3, ks) == EOF) { + return -2; + } + + return 0; +} + +int sam_hdr_find_tag_pos(sam_hdr_t *bh, + const char *type, + int pos, + const char *key, + kstring_t *ks) { + sam_hrecs_t *hrecs; + if (!bh || !type || !key) + return -2; + + if (!(hrecs = bh->hrecs)) { + if (sam_hdr_fill_hrecs(bh) != 0) + return -2; + hrecs = bh->hrecs; + } + + sam_hrec_type_t *ty = sam_hrecs_find_type_pos(hrecs, type, pos); + if (!ty) + return -1; + + sam_hrec_tag_t *tag = sam_hrecs_find_key(ty, key, NULL); + if (!tag || !tag->str || tag->len < 4) + return -1; + + ks->l = 0; + if (kputsn(tag->str+3, tag->len-3, ks) == EOF) { + return -2; + } + + return 0; +} + +int sam_hdr_remove_tag_id(sam_hdr_t *bh, + const char *type, + const char *ID_key, + const char *ID_value, + const char *key) { + sam_hrecs_t *hrecs; + if (!bh || !type || !key) + return -1; + + if (!(hrecs = bh->hrecs)) { + if (sam_hdr_fill_hrecs(bh) != 0) + return -1; + hrecs = bh->hrecs; + } + + sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value); + if (!ty) + return -1; + + int ret = sam_hrecs_remove_key(hrecs, ty, key); + if (!ret && hrecs->dirty) + redact_header_text(bh); + + return ret; +} + +/* + * Reconstructs a kstring from the header hash table. 
+ * Returns 0 on success + * -1 on failure + */ +int sam_hrecs_rebuild_text(const sam_hrecs_t *hrecs, kstring_t *ks) { + ks->l = 0; + + if (!hrecs->h || !hrecs->h->size) { + return kputsn("", 0, ks) >= 0 ? 0 : -1; + } + if (sam_hrecs_rebuild_lines(hrecs, ks) != 0) + return -1; + + return 0; +} + +/* + * Looks up a reference sequence by name and returns the numerical ID. + * Returns -1 if unknown reference; -2 if header could not be parsed. + */ +int sam_hdr_name2tid(sam_hdr_t *bh, const char *ref) { + sam_hrecs_t *hrecs; + khint_t k; + + if (!bh) + return -1; + + if (!(hrecs = bh->hrecs)) { + if (sam_hdr_fill_hrecs(bh) != 0) + return -2; + hrecs = bh->hrecs; + } + + if (!hrecs->ref_hash) + return -1; + + k = kh_get(m_s2i, hrecs->ref_hash, ref); + return k == kh_end(hrecs->ref_hash) ? -1 : kh_val(hrecs->ref_hash, k); +} + +const char *sam_hdr_tid2name(const sam_hdr_t *h, int tid) { + sam_hrecs_t *hrecs; + + if (!h || tid < 0) + return NULL; + + if ((hrecs = h->hrecs) != NULL && tid < hrecs->nref) { + return hrecs->ref[tid].name; + } else { + if (tid < h->n_targets) + return h->target_name[tid]; + } + + return NULL; +} + +hts_pos_t sam_hdr_tid2len(const sam_hdr_t *h, int tid) { + sam_hrecs_t *hrecs; + + if (!h || tid < 0) + return 0; + + if ((hrecs = h->hrecs) != NULL && tid < hrecs->nref) { + return hrecs->ref[tid].len; + } else { + if (tid < h->n_targets) { + if (h->target_len[tid] < UINT32_MAX || !h->sdict) { + return h->target_len[tid]; + } else { + khash_t(s2i) *long_refs = (khash_t(s2i) *) h->sdict; + khint_t k = kh_get(s2i, long_refs, h->target_name[tid]); + if (k < kh_end(long_refs)) { + return kh_val(long_refs, k); + } else { + return UINT32_MAX; + } + } + } + } + + return 0; +} + +/* + * Fixes any PP links in @PG headers. + * If the entries are in order then this doesn't need doing, but incase + * our header is out of order this goes through the hrecs->pg[] array + * setting the prev_id field. + * + * Note we can have multiple complete chains. 
This code should identify the + * tails of these chains as these are the entries we have to link to in + * subsequent PP records. + * + * Returns 0 on success + * -1 on failure (indicating broken PG/PP records) + */ +static int sam_hdr_link_pg(sam_hdr_t *bh) { + sam_hrecs_t *hrecs; + int i, j, ret = 0, *new_pg_end; + + if (!bh) + return -1; + + if (!(hrecs = bh->hrecs)) { + if (sam_hdr_fill_hrecs(bh) != 0) + return -1; + hrecs = bh->hrecs; + } + + if (!hrecs->pgs_changed) + return 0; + + hrecs->npg_end_alloc = hrecs->npg; + new_pg_end = realloc(hrecs->pg_end, hrecs->npg * sizeof(*new_pg_end)); + if (!new_pg_end) + return -1; + hrecs->pg_end = new_pg_end; + + for (i = 0; i < hrecs->npg; i++) + hrecs->pg_end[i] = i; + + for (i = 0; i < hrecs->npg; i++) { + khint_t k; + sam_hrec_tag_t *tag; + + assert(hrecs->pg[i].ty != NULL); + for (tag = hrecs->pg[i].ty->tag; tag; tag = tag->next) { + if (tag->str[0] == 'P' && tag->str[1] == 'P') + break; + } + if (!tag) { + /* Chain start points */ + continue; + } + + k = kh_get(m_s2i, hrecs->pg_hash, tag->str+3); + + if (k == kh_end(hrecs->pg_hash)) { + ret = -1; + continue; + } + + hrecs->pg[i].prev_id = hrecs->pg[kh_val(hrecs->pg_hash, k)].id; + hrecs->pg_end[kh_val(hrecs->pg_hash, k)] = -1; + } + + for (i = j = 0; i < hrecs->npg; i++) { + if (hrecs->pg_end[i] != -1) + hrecs->pg_end[j++] = hrecs->pg_end[i]; + } + hrecs->npg_end = j; + hrecs->pgs_changed = 0; + + /* mark as dirty or empty for rebuild */ + hrecs->dirty = 1; + redact_header_text(bh); + + return ret; +} + +/* + * Returns a unique ID from a base name. + * + * The value returned is valid until the next call to + * this function. 
+ */ +const char *sam_hdr_pg_id(sam_hdr_t *bh, const char *name) { + sam_hrecs_t *hrecs; + size_t name_len; + const size_t name_extra = 17; + if (!bh || !name) + return NULL; + + if (!(hrecs = bh->hrecs)) { + if (sam_hdr_fill_hrecs(bh) != 0) + return NULL; + hrecs = bh->hrecs; + } + + khint_t k = kh_get(m_s2i, hrecs->pg_hash, name); + if (k == kh_end(hrecs->pg_hash)) + return name; + + name_len = strlen(name); + if (name_len > 1000) name_len = 1000; + if (hrecs->ID_buf_sz < name_len + name_extra) { + char *new_ID_buf = realloc(hrecs->ID_buf, name_len + name_extra); + if (new_ID_buf == NULL) + return NULL; + hrecs->ID_buf = new_ID_buf; + hrecs->ID_buf_sz = name_len + name_extra; + } + + do { + snprintf(hrecs->ID_buf, hrecs->ID_buf_sz, "%.1000s.%d", name, hrecs->ID_cnt++); + k = kh_get(m_s2i, hrecs->pg_hash, hrecs->ID_buf); + } while (k != kh_end(hrecs->pg_hash)); + + return hrecs->ID_buf; +} + +/* + * Add an @PG line. + * + * If we wish complete control over this use sam_hdr_add_line() directly. This + * function uses that, but attempts to do a lot of tedious house work for + * you too. + * + * - It will generate a suitable ID if the supplied one clashes. + * - It will generate multiple @PG records if we have multiple PG chains. + * + * Call it as per sam_hdr_add_line() with a series of key,value pairs ending + * in NULL. + * + * Returns 0 on success + * -1 on failure + */ +int sam_hdr_add_pg(sam_hdr_t *bh, const char *name, ...) 
{ + sam_hrecs_t *hrecs; + const char *specified_id = NULL, *specified_pn = NULL, *specified_pp = NULL; + const char *key, *val; + if (!bh) + return -1; + + if (!(hrecs = bh->hrecs)) { + if (sam_hdr_fill_hrecs(bh) != 0) + return -1; + hrecs = bh->hrecs; + } + + va_list args; + // Check for ID / PN / PP tags in varargs list + va_start(args, name); + while ((key = va_arg(args, const char *)) != NULL) { + val = va_arg(args, const char *); + if (!val) break; + if (strcmp(key, "PN") == 0 && *val != '\0') + specified_pn = val; + else if (strcmp(key, "PP") == 0 && *val != '\0') + specified_pp = val; + else if (strcmp(key, "ID") == 0 && *val != '\0') + specified_id = val; + } + va_end(args); + + if (specified_id && hrecs->pg_hash) { + khint_t k = kh_get(m_s2i, hrecs->pg_hash, specified_id); + if (k != kh_end(hrecs->pg_hash)) { + hts_log_error("Header @PG ID:%s already present", specified_id); + return -1; + } + } + + if (specified_pp && hrecs->pg_hash) { + khint_t k = kh_get(m_s2i, hrecs->pg_hash, specified_pp); + if (k == kh_end(hrecs->pg_hash)) { + hts_log_error("Header @PG ID:%s referred to by PP tag not present", + specified_pp); + return -1; + } + } + + if (!specified_pp && hrecs->npg_end) { + /* Copy ends array to avoid us looping while modifying it */ + int *end = malloc(hrecs->npg_end * sizeof(int)); + int i, nends = hrecs->npg_end; + + if (!end) + return -1; + + memcpy(end, hrecs->pg_end, nends * sizeof(*end)); + + for (i = 0; i < nends; i++) { + const char *id = !specified_id ? sam_hdr_pg_id(bh, name) : ""; + if (!id) { + free(end); + return -1; + } + va_start(args, name); + if (-1 == sam_hrecs_vadd(hrecs, "PG", args, + "ID", id, + "PN", !specified_pn ? name : "", + "PP", hrecs->pg[end[i]].name, + NULL)) { + free(end); + return -1; + } + va_end(args); + } + + free(end); + } else { + const char *id = !specified_id ? 
sam_hdr_pg_id(bh, name) : ""; + if (!id) + return -1; + va_start(args, name); + if (-1 == sam_hrecs_vadd(hrecs, "PG", args, + "ID", id, + "PN", !specified_pn ? name : "", + NULL)) + return -1; + va_end(args); + } + + hrecs->dirty = 1; + redact_header_text(bh); + + return 0; +} + +/*! Increments a reference count on bh. + * + * This permits multiple files to share the same header, all calling + * sam_hdr_destroy when done, without causing errors for other open files. + */ +void sam_hdr_incr_ref(sam_hdr_t *bh) { + if (!bh) + return; + bh->ref_count++; +} + +/* ==== Internal methods ==== */ + +/* + * Creates an empty SAM header. Allocates space for the SAM header + * structures (hash tables) ready to be populated. + * + * Returns a sam_hrecs_t struct on success (free with sam_hrecs_free()) + * NULL on failure + */ +sam_hrecs_t *sam_hrecs_new() { + sam_hrecs_t *hrecs = calloc(1, sizeof(*hrecs)); + + if (!hrecs) + return NULL; + + hrecs->h = kh_init(sam_hrecs_t); + if (!hrecs->h) + goto err; + + hrecs->ID_cnt = 1; + + hrecs->nref = 0; + hrecs->ref_sz = 0; + hrecs->ref = NULL; + if (!(hrecs->ref_hash = kh_init(m_s2i))) + goto err; + hrecs->refs_changed = -1; + + hrecs->nrg = 0; + hrecs->rg_sz = 0; + hrecs->rg = NULL; + if (!(hrecs->rg_hash = kh_init(m_s2i))) + goto err; + + hrecs->npg = 0; + hrecs->pg_sz = 0; + hrecs->pg = NULL; + hrecs->npg_end = hrecs->npg_end_alloc = 0; + hrecs->pg_end = NULL; + if (!(hrecs->pg_hash = kh_init(m_s2i))) + goto err; + + if (!(hrecs->tag_pool = pool_create(sizeof(sam_hrec_tag_t)))) + goto err; + + if (!(hrecs->type_pool = pool_create(sizeof(sam_hrec_type_t)))) + goto err; + + if (!(hrecs->str_pool = string_pool_create(65536))) + goto err; + + if (sam_hrecs_init_type_order(hrecs, NULL)) + goto err; + + return hrecs; + +err: + if (hrecs->h) + kh_destroy(sam_hrecs_t, hrecs->h); + + if (hrecs->tag_pool) + pool_destroy(hrecs->tag_pool); + + if (hrecs->type_pool) + pool_destroy(hrecs->type_pool); + + if (hrecs->str_pool) + 
string_pool_destroy(hrecs->str_pool); + + free(hrecs); + + return NULL; +} +#if 0 +/* + * Produces a duplicate copy of source and returns it. + * Returns NULL on failure + */ +sam_hrecs_t *sam_hrecs_dup(sam_hrecs_t *source) { + return NULL; +} +#endif +/*! Deallocates all storage used by a sam_hrecs_t struct. + * + * This also decrements the header reference count. If after decrementing + * it is still non-zero then the header is assumed to be in use by another + * caller and the free is not done. + * + */ +void sam_hrecs_free(sam_hrecs_t *hrecs) { + if (!hrecs) + return; + + if (hrecs->h) + kh_destroy(sam_hrecs_t, hrecs->h); + + if (hrecs->ref_hash) + kh_destroy(m_s2i, hrecs->ref_hash); + + if (hrecs->ref) + free(hrecs->ref); + + if (hrecs->rg_hash) + kh_destroy(m_s2i, hrecs->rg_hash); + + if (hrecs->rg) + free(hrecs->rg); + + if (hrecs->pg_hash) + kh_destroy(m_s2i, hrecs->pg_hash); + + if (hrecs->pg) + free(hrecs->pg); + + if (hrecs->pg_end) + free(hrecs->pg_end); + + if (hrecs->type_pool) + pool_destroy(hrecs->type_pool); + + if (hrecs->tag_pool) + pool_destroy(hrecs->tag_pool); + + if (hrecs->str_pool) + string_pool_destroy(hrecs->str_pool); + + if (hrecs->type_order) + free(hrecs->type_order); + + if (hrecs->ID_buf) + free(hrecs->ID_buf); + + free(hrecs); +} + +/* + * Internal method already used by the CRAM code + * Returns the first header item matching 'type'. If ID is non-NULL it checks + * for the tag ID: and compares against the specified ID. 
+ * + * Returns NULL if no type/ID is found + */ +sam_hrec_type_t *sam_hrecs_find_type_id(sam_hrecs_t *hrecs, const char *type, + const char *ID_key, const char *ID_value) { + if (!hrecs || !type) + return NULL; + sam_hrec_type_t *t1, *t2; + int itype = (type[0]<<8)|(type[1]); + khint_t k; + + /* Special case for types we have prebuilt hashes on */ + if (ID_key) { + if (!ID_value) + return NULL; + + if (type[0] == 'S' && type[1] == 'Q' && + ID_key[0] == 'S' && ID_key[1] == 'N') { + k = kh_get(m_s2i, hrecs->ref_hash, ID_value); + return k != kh_end(hrecs->ref_hash) + ? hrecs->ref[kh_val(hrecs->ref_hash, k)].ty + : NULL; + } + + if (type[0] == 'R' && type[1] == 'G' && + ID_key[0] == 'I' && ID_key[1] == 'D') { + k = kh_get(m_s2i, hrecs->rg_hash, ID_value); + return k != kh_end(hrecs->rg_hash) + ? hrecs->rg[kh_val(hrecs->rg_hash, k)].ty + : NULL; + } + + if (type[0] == 'P' && type[1] == 'G' && + ID_key[0] == 'I' && ID_key[1] == 'D') { + k = kh_get(m_s2i, hrecs->pg_hash, ID_value); + return k != kh_end(hrecs->pg_hash) + ? hrecs->pg[kh_val(hrecs->pg_hash, k)].ty + : NULL; + } + } + + k = kh_get(sam_hrecs_t, hrecs->h, itype); + if (k == kh_end(hrecs->h)) + return NULL; + + if (!ID_key) + return kh_val(hrecs->h, k); + + t1 = t2 = kh_val(hrecs->h, k); + do { + sam_hrec_tag_t *tag; + for (tag = t1->tag; tag; tag = tag->next) { + if (tag->str[0] == ID_key[0] && tag->str[1] == ID_key[1]) { + const char *cp1 = tag->str+3; + const char *cp2 = ID_value; + while (*cp1 && *cp1 == *cp2) + cp1++, cp2++; + if (*cp2 || *cp1) + continue; + return t1; + } + } + t1 = t1->next; + } while (t1 != t2); + + return NULL; +} + +/* + * Adds or updates tag key,value pairs in a header line. + * Eg for adding M5 tags to @SQ lines or updating sort order for the + * @HD line. + * + * va_list contains multiple key,value pairs ending in NULL. 
+ * + * Returns 0 on success + * -1 on failure + */ +int sam_hrecs_vupdate(sam_hrecs_t *hrecs, sam_hrec_type_t *type, va_list ap) { + if (!hrecs) + return -1; + + for (;;) { + char *k, *v, *str; + sam_hrec_tag_t *tag, *prev = NULL; + + if (!(k = (char *)va_arg(ap, char *))) + break; + if (!(v = va_arg(ap, char *))) + v = ""; + + tag = sam_hrecs_find_key(type, k, &prev); + if (!tag) { + if (!(tag = pool_alloc(hrecs->tag_pool))) + return -1; + if (prev) + prev->next = tag; + else + type->tag = tag; + + tag->next = NULL; + } + + tag->len = 3 + strlen(v); + str = string_alloc(hrecs->str_pool, tag->len+1); + if (!str) + return -1; + + if (snprintf(str, tag->len+1, "%2.2s:%s", k, v) < 0) + return -1; + + tag->str = str; + } + + hrecs->dirty = 1; //mark text as dirty and force a rebuild + + return 0; +} + +/* + * Adds or updates tag key,value pairs in a header line. + * Eg for adding M5 tags to @SQ lines or updating sort order for the + * @HD line. + * + * Specify multiple key,value pairs ending in NULL. + * + * Returns 0 on success + * -1 on failure + */ +static int sam_hrecs_update(sam_hrecs_t *hrecs, sam_hrec_type_t *type, ...) { + va_list args; + int res; + va_start(args, type); + res = sam_hrecs_vupdate(hrecs, type, args); + va_end(args); + return res; +} + +/* + * Looks for a specific key in a single sam header line identified by *type. + * If prev is non-NULL it also fills this out with the previous tag, to + * permit use in key removal. *prev is set to NULL when the tag is the first + * key in the list. When a tag isn't found, prev (if non NULL) will be the last + * tag in the existing list. 
+ * + * Returns the tag pointer on success + * NULL on failure + */ +sam_hrec_tag_t *sam_hrecs_find_key(sam_hrec_type_t *type, + const char *key, + sam_hrec_tag_t **prev) { + sam_hrec_tag_t *tag, *p = NULL; + if (!type) + return NULL; + + for (tag = type->tag; tag; p = tag, tag = tag->next) { + if (tag->str[0] == key[0] && tag->str[1] == key[1]) { + if (prev) + *prev = p; + return tag; + } + } + + if (prev) + *prev = p; + + return NULL; +} + +int sam_hrecs_remove_key(sam_hrecs_t *hrecs, + sam_hrec_type_t *type, + const char *key) { + sam_hrec_tag_t *tag, *prev; + if (!hrecs) + return -1; + tag = sam_hrecs_find_key(type, key, &prev); + if (!tag) + return 0; // Not there anyway + + if (type->type == TYPEKEY("SQ") && tag->str[0] == 'A' && tag->str[1] == 'N') { + assert(tag->len >= 3); + sam_hrec_tag_t *sn_tag = sam_hrecs_find_key(type, "SN", NULL); + if (sn_tag) { + assert(sn_tag->len >= 3); + khint_t k = kh_get(m_s2i, hrecs->ref_hash, sn_tag->str + 3); + if (k != kh_end(hrecs->ref_hash)) + sam_hrecs_remove_ref_altnames(hrecs, kh_val(hrecs->ref_hash, k), tag->str + 3); + } + } + + if (!prev) { //first tag + type->tag = tag->next; + } else { + prev->next = tag->next; + } + pool_free(hrecs->tag_pool, tag); + hrecs->dirty = 1; //mark text as dirty and force a rebuild + + return 1; +} + +/* + * Looks up a read-group by name and returns a pointer to the start of the + * associated tag list. + * + * Returns NULL on failure + */ +sam_hrec_rg_t *sam_hrecs_find_rg(sam_hrecs_t *hrecs, const char *rg) { + khint_t k = kh_get(m_s2i, hrecs->rg_hash, rg); + return k == kh_end(hrecs->rg_hash) + ? 
NULL + : &hrecs->rg[kh_val(hrecs->rg_hash, k)]; +} + +#if DEBUG_HEADER +void sam_hrecs_dump(sam_hrecs_t *hrecs) { + khint_t k; + int i; + + printf("===DUMP===\n"); + for (k = kh_begin(hrecs->h); k != kh_end(hrecs->h); k++) { + sam_hrec_type_t *t1, *t2; + char c[2]; + int idx = 0; + + if (!kh_exist(hrecs->h, k)) + continue; + + t1 = t2 = kh_val(hrecs->h, k); + c[0] = kh_key(hrecs->h, k)>>8; + c[1] = kh_key(hrecs->h, k)&0xff; + printf("Type %.2s\n", c); + + do { + sam_hrec_tag_t *tag; + printf(">>>%d ", idx++); + for (tag = t1->tag; tag; tag=tag->next) { + if (strncmp(c, "CO", 2)) + printf("\"%.2s\":\"%.*s\"\t", tag->str, tag->len-3, tag->str+3); + else + printf("%s", tag->str); + } + putchar('\n'); + t1 = t1->next; + } while (t1 != t2); + } + + /* Dump out PG chains */ + printf("\n@PG chains:\n"); + for (i = 0; i < hrecs->npg_end; i++) { + int j; + printf(" %d:", i); + for (j = hrecs->pg_end[i]; j != -1; j = hrecs->pg[j].prev_id) { + printf("%s%d(%.*s)", + j == hrecs->pg_end[i] ? " " : "->", + j, hrecs->pg[j].name_len, hrecs->pg[j].name); + } + printf("\n"); + } + + puts("===END DUMP==="); +} +#endif + +/* + * Returns the sort order: + */ +enum sam_sort_order sam_hrecs_sort_order(sam_hrecs_t *hrecs) { + khint_t k; + enum sam_sort_order so; + + so = ORDER_UNKNOWN; + k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY("HD")); + if (k != kh_end(hrecs->h)) { + sam_hrec_type_t *ty = kh_val(hrecs->h, k); + sam_hrec_tag_t *tag; + for (tag = ty->tag; tag; tag = tag->next) { + if (tag->str[0] == 'S' && tag->str[1] == 'O') { + if (strcmp(tag->str+3, "unsorted") == 0) + so = ORDER_UNSORTED; + else if (strcmp(tag->str+3, "queryname") == 0) + so = ORDER_NAME; + else if (strcmp(tag->str+3, "coordinate") == 0) + so = ORDER_COORD; + else if (strcmp(tag->str+3, "unknown") != 0) + hts_log_error("Unknown sort order field: %s", tag->str+3); + } + } + } + + return so; +} + +enum sam_group_order sam_hrecs_group_order(sam_hrecs_t *hrecs) { + khint_t k; + enum sam_group_order go; + + go = 
ORDER_NONE; + k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY("HD")); + if (k != kh_end(hrecs->h)) { + sam_hrec_type_t *ty = kh_val(hrecs->h, k); + sam_hrec_tag_t *tag; + for (tag = ty->tag; tag; tag = tag->next) { + if (tag->str[0] == 'G' && tag->str[1] == 'O') { + if (strcmp(tag->str+3, "query") == 0) + go = ORDER_QUERY; + else if (strcmp(tag->str+3, "reference") == 0) + go = ORDER_REFERENCE; + } + } + } + + return go; +} diff --git a/header.h b/header.h new file mode 100644 index 000000000..5d787c22e --- /dev/null +++ b/header.h @@ -0,0 +1,314 @@ +/* +Copyright (c) 2013-2019 Genome Research Ltd. +Authors: James Bonfield , Valeriu Ohan + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger +Institute nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/*! \file + * SAM header parsing. + * + * These functions can be shared between SAM, BAM and CRAM file + * formats as all three internally use the same string encoding for + * header fields. + */ + + +#ifndef HEADER_H_ +#define HEADER_H_ + +#include + +#include "cram/string_alloc.h" +#include "cram/pooled_alloc.h" + +#include "htslib/khash.h" +#include "htslib/kstring.h" +#include "htslib/sam.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define TYPEKEY(a) (((a)[0]<<8)|((a)[1])) + +/* + * Proposed new SAM header parsing + +1 @SQ ID:foo LN:100 +2 @SQ ID:bar LN:200 +3 @SQ ID:ram LN:300 UR:xyz +4 @RG ID:r ... +5 @RG ID:s ... + +Hash table for 2-char @keys without dup entries. +If dup lines, we form a circular linked list. Ie hash keys = {RG, SQ}. + +HASH("SQ")--\ + | + (3) <-> 1 <-> 2 <-> 3 <-> (1) + +HASH("RG")--\ + | + (5) <-> 4 <-> 5 <-> (4) + +Items stored in the hash values also form their own linked lists: +Ie SQ->ID(foo)->LN(100) + SQ->ID(bar)->LN(200) + SQ->ID(ram)->LN(300)->UR(xyz) + RG->ID(r) + */ + +/*! A single key:value pair on a header line + * + * These form a linked list and hold strings. The strings are + * allocated from a string_alloc_t pool referenced in the master + * sam_hrecs_t structure. Do not attempt to free, malloc or manipulate + * these strings directly. + */ +typedef struct sam_hrec_tag_s { + struct sam_hrec_tag_s *next; + const char *str; + int len; +} sam_hrec_tag_t; + +/*! 
The parsed version of the SAM header string. + * + * Each header type (SQ, RG, HD, etc) points to its own sam_hdr_type + * struct via the main hash table h in the sam_hrecs_t struct. + * + * These in turn consist of circular bi-directional linked lists (ie + * rings) to hold the multiple instances of the same header type + * code. For example if we have 5 \@SQ lines the primary hash table + * will key on \@SQ pointing to the first sam_hdr_type and that in turn + * will be part of a ring of 5 elements. + * + * For each sam_hdr_type structure we also point to a sam_hdr_tag + * structure which holds the tokenised attributes; the tab separated + * key:value pairs per line. + */ +typedef struct sam_hrec_type_s { + struct sam_hrec_type_s *next; // circular list of this type + struct sam_hrec_type_s *prev; // circular list of this type + struct sam_hrec_type_s *global_next; // circular list of all lines + struct sam_hrec_type_s *global_prev; // circular list of all lines + sam_hrec_tag_t *tag; // first tag + khint32_t type; // Two-letter type code as an int +} sam_hrec_type_t; + +/*! Parsed \@SQ lines */ +typedef struct { + const char *name; + hts_pos_t len; + sam_hrec_type_t *ty; +} sam_hrec_sq_t; + +/*! Parsed \@RG lines */ +typedef struct { + const char *name; + sam_hrec_type_t *ty; + int name_len; + int id; // numerical ID +} sam_hrec_rg_t; + +/*! Parsed \@PG lines */ +typedef struct { + const char *name; + sam_hrec_type_t *ty; + int name_len; + int id; // numerical ID + int prev_id; // -1 if none +} sam_hrec_pg_t; + + +/*! Sort order parsed from @HD line */ +enum sam_sort_order { + ORDER_UNKNOWN =-1, + ORDER_UNSORTED = 0, + ORDER_NAME = 1, + ORDER_COORD = 2 + //ORDER_COLLATE = 3 // maybe one day! +}; + +enum sam_group_order { + ORDER_NONE =-1, + ORDER_QUERY = 0, + ORDER_REFERENCE = 1 +}; + +KHASH_MAP_INIT_INT(sam_hrecs_t, sam_hrec_type_t*) +KHASH_MAP_INIT_STR(m_s2i, int) + +/*! 
Primary structure for header manipulation + * + * The initial header text is held in the text kstring_t, but is also + * parsed out into SQ, RG and PG arrays. These have a hash table + * associated with each to allow lookup by ID or SN fields instead of + * their numeric array indices. Additionally PG has an array to hold + * the linked list start points (the last in a PP chain). + * + * Use the appropriate sam_hdr_* functions to edit the header, and + * call sam_hdr_rebuild() any time the textual form needs to be + * updated again. + */ +struct sam_hrecs_t { + khash_t(sam_hrecs_t) *h; + sam_hrec_type_t *first_line; //!< First line (usually @HD) + string_alloc_t *str_pool; //!< Pool of sam_hdr_tag->str strings + pool_alloc_t *type_pool;//!< Pool of sam_hdr_type structs + pool_alloc_t *tag_pool; //!< Pool of sam_hdr_tag structs + + // @SQ lines / references + int nref; //!< Number of \@SQ lines + int ref_sz; //!< Number of entries available in ref[] + sam_hrec_sq_t *ref; //!< Array of parsed \@SQ lines + khash_t(m_s2i) *ref_hash; //!< Maps SQ SN field to ref[] index + + // @RG lines / read-groups + int nrg; //!< Number of \@RG lines + int rg_sz; //!< number of entries available in rg[] + sam_hrec_rg_t *rg; //!< Array of parsed \@RG lines + khash_t(m_s2i) *rg_hash; //!< Maps RG ID field to rg[] index + + // @PG lines / programs + int npg; //!< Number of \@PG lines + int pg_sz; //!< Number of entries available in pg[] + int npg_end; //!< Number of terminating \@PG lines + int npg_end_alloc; //!< Size of pg_end field + sam_hrec_pg_t *pg; //!< Array of parsed \@PG lines + khash_t(m_s2i) *pg_hash; //!< Maps PG ID field to pg[] index + int *pg_end; //!< \@PG chain termination IDs + + // @cond internal + char *ID_buf; // temporary buffer for sam_hdr_pg_id + uint32_t ID_buf_sz; + int ID_cnt; + // @endcond + + int dirty; // marks the header as modified, so it can be rebuilt + int refs_changed; // Index of first changed ref (-1 if unchanged) + int pgs_changed; // New PG line 
added + int type_count; + char (*type_order)[3]; +}; + +/*! + * Method for parsing the header text and populating the + * internal hash tables. After calling this method, the + * parsed representation becomes the single source of truth. + * + * @param bh Header structure, previously initialised by a + * sam_hdr_init call + * @return 0 on success, -1 on failure + */ +int sam_hdr_fill_hrecs(sam_hdr_t *bh); + +/*! + * Reconstructs the text representation of the header from + * the hash table data after a change has been performed on + * the header. + * + * @return 0 on success, -1 on failure + */ +int sam_hdr_rebuild(sam_hdr_t *bh); + +/*! Creates an empty SAM header, ready to be populated. + * + * @return + * Returns a sam_hrecs_t struct on success (free with sam_hrecs_free()) + * NULL on failure + */ +sam_hrecs_t *sam_hrecs_new(void); + +/*! Produces a duplicate copy of hrecs and returns it. + * @return + * Returns NULL on failure + */ +sam_hrecs_t *sam_hrecs_dup(sam_hrecs_t *hrecs); + +/*! Update sam_hdr_t target_name and target_len arrays + * + * sam_hdr_t and sam_hrecs_t are specified separately so that sam_hdr_dup + * can use it to construct target arrays from the source header. + * + * @return 0 on success; -1 on failure + */ +int sam_hdr_update_target_arrays(sam_hdr_t *bh, const sam_hrecs_t *hrecs, + int refs_changed); + +/*! Reconstructs a kstring from the header hash table. + * + * @return + * Returns 0 on success + * -1 on failure + */ +int sam_hrecs_rebuild_text(const sam_hrecs_t *hrecs, kstring_t *ks); + +/*! Deallocates all storage used by a sam_hrecs_t struct. + * + * This also decrements the header reference count. If after decrementing + * it is still non-zero then the header is assumed to be in use by another + * caller and the free is not done. + */ +void sam_hrecs_free(sam_hrecs_t *hrecs); + +/*! + * @return + * Returns the first header item matching 'type'. If ID is non-NULL it checks + * for the tag ID: and compares against the specified ID. 
+ * + * Returns NULL if no type/ID is found + */ +sam_hrec_type_t *sam_hrecs_find_type_id(sam_hrecs_t *hrecs, const char *type, + const char *ID_key, const char *ID_value); + +sam_hrec_tag_t *sam_hrecs_find_key(sam_hrec_type_t *type, + const char *key, + sam_hrec_tag_t **prev); + +int sam_hrecs_remove_key(sam_hrecs_t *hrecs, + sam_hrec_type_t *type, + const char *key); + +/*! Looks up a read-group by name and returns a pointer to the start of the + * associated tag list. + * + * @return + * Returns NULL on failure + */ +sam_hrec_rg_t *sam_hrecs_find_rg(sam_hrecs_t *hrecs, const char *rg); + +/*! Returns the sort order from the @HD SO: field */ +enum sam_sort_order sam_hrecs_sort_order(sam_hrecs_t *hrecs); + +/*! Returns the group order from the @HD GO: field */ +enum sam_group_order sam_hrecs_group_order(sam_hrecs_t *hrecs); + +#ifdef __cplusplus +} +#endif + +#endif /* HEADER_H_ */ diff --git a/hfile.c b/hfile.c index 645d7a3b8..992837604 100644 --- a/hfile.c +++ b/hfile.c @@ -1,6 +1,6 @@ /* hfile.c -- buffered low-level input/output streams. - Copyright (C) 2013-2016 Genome Research Ltd. + Copyright (C) 2013-2019 Genome Research Ltd. Author: John Marshall @@ -22,6 +22,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include @@ -33,8 +34,16 @@ DEALINGS IN THE SOFTWARE.
*/ #include +#ifdef ENABLE_PLUGINS +#if defined(_WIN32) || defined(__CYGWIN__) || defined(__MSYS__) +#define USING_WINDOWS_PLUGIN_DLLS +#include +#endif +#endif + #include "htslib/hfile.h" #include "hfile_internal.h" +#include "htslib/kstring.h" #ifndef ENOTSUP #define ENOTSUP EINVAL @@ -92,7 +101,7 @@ for seek(): abcdefghijkLMNOPQRSTUVWXYZ------ ^buffer ^begin ^end ^limit */ - +HTSLIB_EXPORT hFILE *hfile_init(size_t struct_size, const char *mode, size_t capacity) { hFILE *fp = (hFILE *) malloc(struct_size); @@ -140,6 +149,7 @@ hFILE *hfile_init_fixed(size_t struct_size, const char *mode, static const struct hFILE_backend mem_backend; +HTSLIB_EXPORT void hfile_destroy(hFILE *fp) { int save = errno; @@ -189,6 +199,7 @@ static ssize_t refill_buffer(hFILE *fp) * Returns 0 on success; * -1 on failure. */ +HTSLIB_EXPORT int hfile_set_blksize(hFILE *fp, size_t bufsiz) { char *buffer; ptrdiff_t curr_used; @@ -211,6 +222,7 @@ int hfile_set_blksize(hFILE *fp, size_t bufsiz) { } /* Called only from hgetc(), when our buffer is empty. */ +HTSLIB_EXPORT int hgetc2(hFILE *fp) { return (refill_buffer(fp) > 0)? (unsigned char) *(fp->begin++) : EOF; @@ -292,6 +304,7 @@ ssize_t hpeek(hFILE *fp, void *buffer, size_t nbytes) /* Called only from hread(); when called, our buffer is empty and nread bytes have already been placed in the destination buffer. */ +HTSLIB_EXPORT ssize_t hread2(hFILE *fp, void *destv, size_t nbytes, size_t nread) { const size_t capacity = fp->limit - fp->buffer; @@ -361,6 +374,7 @@ int hflush(hFILE *fp) } /* Called only from hputc(), when our buffer is already full. */ +HTSLIB_EXPORT int hputc2(int c, hFILE *fp) { if (flush_buffer(fp) < 0) return EOF; @@ -372,6 +386,7 @@ int hputc2(int c, hFILE *fp) full and ncopied bytes from the source have already been copied to our buffer; or completely empty, ncopied is zero and totalbytes is greater than the buffer size. 
*/ +HTSLIB_EXPORT ssize_t hwrite2(hFILE *fp, const void *srcv, size_t totalbytes, size_t ncopied) { const char *src = (const char *) srcv; @@ -399,6 +414,7 @@ ssize_t hwrite2(hFILE *fp, const void *srcv, size_t totalbytes, size_t ncopied) } /* Called only from hputs(), when our buffer is already full. */ +HTSLIB_EXPORT int hputs2(const char *text, size_t totalbytes, size_t ncopied, hFILE *fp) { return (hwrite2(fp, text, totalbytes, ncopied) >= 0)? 0 : EOF; @@ -686,9 +702,9 @@ static hFILE *hopen_fd_fileuri(const char *url, const char *mode) else if (strncmp(url, "file:///", 8) == 0) url += 7; else { errno = EPROTONOSUPPORT; return NULL; } -#ifdef _WIN32 +#if defined(_WIN32) || defined(__MSYS__) // For cases like C:/foo - if (url[0] == '/' && url[2] == ':' && url[3] == '/') url++; + if (url[0] == '/' && url[1] && url[2] == ':' && url[3] == '/') url++; #endif return hopen_fd(url, mode); @@ -703,6 +719,7 @@ static hFILE *hopen_fd_stdinout(const char *mode) return hdopen(fd, mode); } +HTSLIB_EXPORT int hfile_oflags(const char *mode) { int rdwr = 0, flags = 0; @@ -808,7 +825,7 @@ static hFILE *hopen_mem(const char *url, const char *mode) return hf; } -hFILE *hopenv_mem(const char *filename, const char *mode, va_list args) +static hFILE *hopenv_mem(const char *filename, const char *mode, va_list args) { char* buffer = va_arg(args, char*); size_t sz = va_arg(args, size_t); @@ -896,10 +913,54 @@ static inline int priority(const struct hFILE_scheme_handler *handler) return handler->priority % 1000; } +#ifdef USING_WINDOWS_PLUGIN_DLLS +/* + * Work-around for Windows plug-in dlls where the plug-in could be + * using a different HTSlib library to the executable (for example + * because the latter was built against a static libhts.a). When this + * happens, the plug-in can call the wrong copy of hfile_add_scheme_handler().
+ * If this is detected, it calls this function which attempts to fix the + * problem by redirecting to the hfile_add_scheme_handler() in the main + * executable. + */ +static int try_exe_add_scheme_handler(const char *scheme, + const struct hFILE_scheme_handler *handler) +{ + static void (*add_scheme_handler)(const char *scheme, + const struct hFILE_scheme_handler *handler); + if (!add_scheme_handler) { + // dlopen the main executable and resolve hfile_add_scheme_handler + void *exe_handle = dlopen(NULL, RTLD_LAZY); + if (!exe_handle) return -1; + *(void **) (&add_scheme_handler) = dlsym(exe_handle, "hfile_add_scheme_handler"); + dlclose(exe_handle); + } + // Check that the symbol was obtained and isn't the one in this copy + // of the library (to avoid infinite recursion) + if (!add_scheme_handler || add_scheme_handler == hfile_add_scheme_handler) + return -1; + add_scheme_handler(scheme, handler); + return 0; +} +#else +static int try_exe_add_scheme_handler(const char *scheme, + const struct hFILE_scheme_handler *handler) +{ + return -1; +} +#endif + +HTSLIB_EXPORT void hfile_add_scheme_handler(const char *scheme, const struct hFILE_scheme_handler *handler) { int absent; + if (!schemes) { + if (try_exe_add_scheme_handler(scheme, handler) != 0) { + hts_log_warning("Couldn't register scheme handler for %s", scheme); + } + return; + } khint_t k = kh_put(scheme_string, schemes, scheme, &absent); if (absent || priority(handler) > priority(kh_value(schemes, k))) { kh_value(schemes, k) = handler; @@ -971,6 +1032,7 @@ static void load_hfile_plugins() #endif #ifdef ENABLE_S3 init_add_plugin(NULL, hfile_plugin_init_s3, "s3"); + init_add_plugin(NULL, hfile_plugin_init_s3_write, "s3w"); #endif #endif @@ -1041,7 +1103,10 @@ hFILE *hopen(const char *fname, const char *mode, ...) 
else return hopen_fd(fname, mode); } +HTSLIB_EXPORT int hfile_always_local (const char *fname) { return 0; } + +HTSLIB_EXPORT int hfile_always_remote(const char *fname) { return 1; } int hisremote(const char *fname) @@ -1049,3 +1114,40 @@ int hisremote(const char *fname) const struct hFILE_scheme_handler *handler = find_scheme_handler(fname); return handler? handler->isremote(fname) : 0; } + +// Remove an extension, if any, from the basename part of [start,limit). +// Note: Doesn't notice percent-encoded '.' and '/' characters. Don't do that. +static const char *strip_extension(const char *start, const char *limit) +{ + const char *s = limit; + while (s > start) { + --s; + if (*s == '.') return s; + else if (*s == '/') break; + } + return limit; +} + +char *haddextension(struct kstring_t *buffer, const char *filename, + int replace, const char *new_extension) +{ + const char *trailing, *end; + + if (find_scheme_handler(filename)) { + // URL, so alter extensions before any trailing query or fragment parts + // Allow # symbols in s3 URLs + trailing = filename + ((strncmp(filename, "s3://", 5) && strncmp(filename, "s3+http://", 10) && strncmp(filename, "s3+https://", 11)) ? strcspn(filename, "?#") : strcspn(filename, "?")); + } + else { + // Local path, so alter extensions at the end of the filename + trailing = strchr(filename, '\0'); + } + + end = replace? strip_extension(filename, trailing) : trailing; + + buffer->l = 0; + if (kputsn(filename, end - filename, buffer) >= 0 && + kputs(new_extension, buffer) >= 0 && + kputs(trailing, buffer) >= 0) return buffer->s; + else return NULL; +} diff --git a/hfile_gcs.c b/hfile_gcs.c index cdf2076b9..e6f72ae4c 100644 --- a/hfile_gcs.c +++ b/hfile_gcs.c @@ -22,6 +22,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include @@ -124,7 +125,7 @@ int PLUGIN_GLOBAL(hfile_plugin_init,_gcs)(struct hFILE_plugin *self) #ifdef ENABLE_PLUGINS // Embed version string for examination via strings(1) or what(1) - static const char id[] = "@(#)hfile_gcs plugin (htslib)\t" HTS_VERSION; + static const char id[] = "@(#)hfile_gcs plugin (htslib)\t" HTS_VERSION_TEXT; if (hts_verbose >= 9) fprintf(stderr, "[M::hfile_gcs.init] version %s\n", strchr(id, '\t')+1); #endif diff --git a/hfile_internal.h b/hfile_internal.h index c243f8e77..f8be4c910 100644 --- a/hfile_internal.h +++ b/hfile_internal.h @@ -1,6 +1,6 @@ /* hfile_internal.h -- internal parts of low-level input/output streams. - Copyright (C) 2013-2016 Genome Research Ltd. + Copyright (C) 2013-2016, 2019 Genome Research Ltd. Author: John Marshall @@ -27,6 +27,7 @@ DEALINGS IN THE SOFTWARE. */ #include +#include "htslib/hts_defs.h" #include "htslib/hfile.h" #include "textutils_internal.h" @@ -157,6 +158,7 @@ struct hFILE_plugin { #define PLUGIN_GLOBAL(identifier,suffix) identifier /* Plugins must define an entry point with this signature. */ +HTSLIB_EXPORT extern int hfile_plugin_init(struct hFILE_plugin *self); #else @@ -169,6 +171,7 @@ extern int hfile_plugin_init(struct hFILE_plugin *self); extern int hfile_plugin_init_gcs(struct hFILE_plugin *self); extern int hfile_plugin_init_libcurl(struct hFILE_plugin *self); extern int hfile_plugin_init_s3(struct hFILE_plugin *self); +extern int hfile_plugin_init_s3_write(struct hFILE_plugin *self); #endif /* This one is never built as a separate plugin. */ @@ -179,6 +182,18 @@ extern int hfile_plugin_init_net(struct hFILE_plugin *self); // although we may consider exposing it in the API later. typedef int (* hts_httphdr_callback) (void *cb_data, char ***hdrs); +/** Callback for handling 3xx redirect responses from http connections. 
+ + @param data is passed to the callback + @param response http response code (e.g. 301) + @param headers http response headers + @param new_url the callback should write the url to switch to in here + + Currently used by s3 to handle switching region endpoints. +*/ +typedef int (*redirect_callback) (void *data, long response, + kstring_t *headers, kstring_t *new_url); + #ifdef __cplusplus } #endif diff --git a/hfile_libcurl.c b/hfile_libcurl.c index 9076fe763..235b4c196 100644 --- a/hfile_libcurl.c +++ b/hfile_libcurl.c @@ -1,6 +1,6 @@ /* hfile_libcurl.c -- libcurl backend for low-level file streams. - Copyright (C) 2015-2017 Genome Research Ltd. + Copyright (C) 2015-2017, 2019 Genome Research Ltd. Author: John Marshall @@ -22,11 +22,13 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include #include #include +#include #include #include #ifndef _WIN32 @@ -76,13 +78,18 @@ typedef struct { hdrlist fixed; // List of headers supplied at hopen() hdrlist extra; // List of headers from callback hts_httphdr_callback callback; // Callback to get more headers - void *callback_data; // Data to pass to callback + void *callback_data; // Data to pass to httphdr callback auth_token *auth; // Authentication token int auth_hdr_num; // Location of auth_token in hdrlist extra // If -1, Authorization header is in fixed // -2, it came from the callback // -3, "auth_token_enabled", "false" // passed to hopen() + redirect_callback redirect; // Callback to handle 3xx redirects + void *redirect_data; // Data to pass to redirect_callback + long *http_response_ptr; // Location to store http response code. 
+ int fail_on_error; // Open fails on >=400 response code + // (default true) } http_headers; typedef struct { @@ -106,6 +113,7 @@ typedef struct { unsigned tried_seek : 1; // At least one seek has been attempted int nrunning; http_headers headers; + off_t delayed_seek; // Location to seek to before reading off_t last_offset; // Location we're seeking from } hFILE_libcurl; @@ -692,8 +700,19 @@ static int wait_perform(hFILE_libcurl *fp) timeout = 10000; // as recommended by curl_multi_timeout(3) } } - if (maxfd < 0 && timeout > 100) - timeout = 100; // as recommended by curl_multi_fdset(3) + if (maxfd < 0) { + if (timeout > 100) + timeout = 100; // as recommended by curl_multi_fdset(3) +#ifdef _WIN32 + /* Windows ignores the first argument of select, so calling select + * with maxfd=-1 does not give the expected result of sleeping for + * timeout milliseconds in the conditional block below. + * So sleep here and skip the next block. + */ + Sleep(timeout); + timeout = 0; +#endif + } if (timeout > 0) { struct timeval tval; @@ -719,7 +738,10 @@ static size_t recv_callback(char *ptr, size_t size, size_t nmemb, void *fpv) hFILE_libcurl *fp = (hFILE_libcurl *) fpv; size_t n = size * nmemb; - if (n > fp->buffer.len) { fp->paused = 1; return CURL_WRITEFUNC_PAUSE; } + if (n > fp->buffer.len) { + fp->paused = 1; + return CURL_WRITEFUNC_PAUSE; + } else if (n == 0) return 0; memcpy(fp->buffer.ptr.rd, ptr, n); @@ -728,6 +750,21 @@ static size_t recv_callback(char *ptr, size_t size, size_t nmemb, void *fpv) return n; } + +static size_t header_callback(void *contents, size_t size, size_t nmemb, + void *userp) +{ + size_t realsize = size * nmemb; + kstring_t *resp = (kstring_t *)userp; + + if (kputsn((const char *)contents, realsize, resp) == EOF) { + return 0; + } + + return realsize; +} + + static ssize_t libcurl_read(hFILE *fpv, void *bufferv,
size_t nbytes) err = curl_easy_pause(fp->easy, CURLPAUSE_CONT); if (err != CURLE_OK) { errno = easy_errno(fp->easy, err); return -1; } - while (! fp->paused && ! fp->finished) + while (! fp->paused && ! fp->finished) { if (wait_perform(fp) < 0) return -1; + } got = fp->buffer.ptr.rd - buffer; @@ -1086,6 +1124,8 @@ libcurl_open(const char *url, const char *modes, http_headers *headers) CURLcode err; CURLMcode errm; int save, is_recursive; + kstring_t in_header = {0, 0, NULL}; + long response; is_recursive = strchr(modes, 'R') != NULL; @@ -1104,6 +1144,7 @@ libcurl_open(const char *url, const char *modes, http_headers *headers) fp->headers = *headers; } else { memset(&fp->headers, 0, sizeof(fp->headers)); + fp->headers.fail_on_error = 1; } fp->file_size = -1; @@ -1162,28 +1203,83 @@ libcurl_open(const char *url, const char *modes, http_headers *headers) goto error; if ((list = get_header_list(fp)) != NULL) err |= curl_easy_setopt(fp->easy, CURLOPT_HTTPHEADER, list); - err |= curl_easy_setopt(fp->easy, CURLOPT_FOLLOWLOCATION, 1L); - if (hts_verbose <= 8) + + if (hts_verbose <= 8 && fp->headers.fail_on_error) err |= curl_easy_setopt(fp->easy, CURLOPT_FAILONERROR, 1L); if (hts_verbose >= 8) err |= curl_easy_setopt(fp->easy, CURLOPT_VERBOSE, 1L); + if (fp->headers.redirect) { + err |= curl_easy_setopt(fp->easy, CURLOPT_HEADERFUNCTION, header_callback); + err |= curl_easy_setopt(fp->easy, CURLOPT_HEADERDATA, (void *)&in_header); + } else { + err |= curl_easy_setopt(fp->easy, CURLOPT_FOLLOWLOCATION, 1L); + } + if (err != 0) { errno = ENOSYS; goto error; } errm = curl_multi_add_handle(fp->multi, fp->easy); if (errm != CURLM_OK) { errno = multi_errno(errm); goto error; } fp->nrunning++; - while (! fp->paused && ! fp->finished) + while (! fp->paused && ! 
fp->finished) { if (wait_perform(fp) < 0) goto error_remove; + } + + curl_easy_getinfo(fp->easy, CURLINFO_RESPONSE_CODE, &response); + if (fp->headers.http_response_ptr) { + *fp->headers.http_response_ptr = response; + } if (fp->finished && fp->final_result != CURLE_OK) { errno = easy_errno(fp->easy, fp->final_result); goto error_remove; } + if (fp->headers.redirect) { + if (response >= 300 && response < 400) { // redirection + kstring_t new_url = {0, 0, NULL}; + + if (fp->headers.redirect(fp->headers.redirect_data, response, + &in_header, &new_url)) { + errno = ENOSYS; + goto error; + } + + err |= curl_easy_setopt(fp->easy, CURLOPT_URL, new_url.s); + err |= curl_easy_setopt(fp->easy, CURLOPT_HEADERFUNCTION, NULL); + err |= curl_easy_setopt(fp->easy, CURLOPT_HEADERDATA, NULL); + free(ks_release(&in_header)); + + if (err != 0) { errno = ENOSYS; goto error; } + free(ks_release(&new_url)); + + if (restart_from_position(fp, 0) < 0) { + goto error_remove; + } + + if (fp->headers.http_response_ptr) { + curl_easy_getinfo(fp->easy, CURLINFO_RESPONSE_CODE, + fp->headers.http_response_ptr); + } + + if (fp->finished && fp->final_result != CURLE_OK) { + errno = easy_errno(fp->easy, fp->final_result); + goto error_remove; + } + } else { + // we no longer need to look at the headers + err |= curl_easy_setopt(fp->easy, CURLOPT_HEADERFUNCTION, NULL); + err |= curl_easy_setopt(fp->easy, CURLOPT_HEADERDATA, NULL); + free(ks_release(&in_header)); + + if (err != 0) { errno = ENOSYS; goto error; } + } + } + if (mode == 'r') { double dval; + if (curl_easy_getinfo(fp->easy, CURLINFO_CONTENT_LENGTH_DOWNLOAD, &dval) == CURLE_OK && dval >= 0.0) fp->file_size = (off_t) (dval + 0.1); @@ -1199,6 +1295,7 @@ libcurl_open(const char *url, const char *modes, http_headers *headers) errno = save; error: + if (fp->headers.redirect) free(in_header.s); save = errno; if (fp->easy) curl_easy_cleanup(fp->easy); if (fp->multi) curl_multi_cleanup(fp->multi); @@ -1265,6 +1362,18 @@ static int 
parse_va_list(http_headers *headers, va_list args) if (strcmp(flag, "false") == 0) headers->auth_hdr_num = -3; } + else if (strcmp(argtype, "redirect_callback") == 0) { + headers->redirect = va_arg(args, const redirect_callback); + } + else if (strcmp(argtype, "redirect_callback_data") == 0) { + headers->redirect_data = va_arg(args, void *); + } + else if (strcmp(argtype, "http_response_ptr") == 0) { + headers->http_response_ptr = va_arg(args, long *); + } + else if (strcmp(argtype, "fail_on_error") == 0) { + headers->fail_on_error = va_arg(args, int); + } else { errno = EINVAL; return -1; } return 0; @@ -1317,7 +1426,8 @@ static int parse_va_list(http_headers *headers, va_list args) static hFILE *vhopen_libcurl(const char *url, const char *modes, va_list args) { hFILE *fp = NULL; - http_headers headers = { { NULL, 0, 0 }, { NULL, 0, 0 }, NULL, NULL }; + http_headers headers = { .fail_on_error = 1 }; + if (parse_va_list(&headers, args) == 0) { fp = libcurl_open(url, modes, &headers); } @@ -1337,7 +1447,8 @@ int PLUGIN_GLOBAL(hfile_plugin_init,_libcurl)(struct hFILE_plugin *self) #ifdef ENABLE_PLUGINS // Embed version string for examination via strings(1) or what(1) - static const char id[] = "@(#)hfile_libcurl plugin (htslib)\t" HTS_VERSION; + static const char id[] = + "@(#)hfile_libcurl plugin (htslib)\t" HTS_VERSION_TEXT; const char *version = strchr(id, '\t')+1; #else const char *version = hts_version(); diff --git a/hfile_s3.c b/hfile_s3.c index 98afa3deb..3f094d3c0 100644 --- a/hfile_s3.c +++ b/hfile_s3.c @@ -1,6 +1,6 @@ /* hfile_s3.c -- Amazon S3 backend for low-level file streams. - Copyright (C) 2015-2017 Genome Research Ltd. + Copyright (C) 2015-2017, 2019 Genome Research Ltd. Author: John Marshall @@ -22,6 +22,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include @@ -30,6 +31,8 @@ DEALINGS IN THE SOFTWARE. */ #include #include +#include + #include "hfile_internal.h" #ifdef ENABLE_PLUGINS #include "version.h" @@ -37,16 +40,24 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/hts.h" // for hts_version() and hts_verbose #include "htslib/kstring.h" -typedef struct { +typedef struct s3_auth_data { kstring_t id; kstring_t token; kstring_t secret; + kstring_t region; + kstring_t canonical_query_string; + kstring_t user_query_string; + kstring_t host; char *bucket; kstring_t auth_hdr; time_t auth_time; char date[40]; + char date_long[17]; + char date_short[9]; + kstring_t date_html; char mode; - char *headers[3]; + char *headers[4]; + int refcount; } s3_auth_data; #define AUTH_LIFETIME 60 @@ -56,6 +67,8 @@ typedef struct { #include #define DIGEST_BUFSIZ CC_SHA1_DIGEST_LENGTH +#define SHA256_DIGEST_BUFSIZE CC_SHA256_DIGEST_LENGTH +#define HASH_LENGTH_SHA256 (SHA256_DIGEST_BUFSIZE * 2) + 1 static size_t s3_sign(unsigned char *digest, kstring_t *key, kstring_t *message) @@ -64,11 +77,26 @@ s3_sign(unsigned char *digest, kstring_t *key, kstring_t *message) return CC_SHA1_DIGEST_LENGTH; } + +static void s3_sha256(const unsigned char *in, size_t length, unsigned char *out) { + CC_SHA256(in, length, out); +} + + +static void s3_sign_sha256(const void *key, int key_len, const unsigned char *d, int n, unsigned char *md, unsigned int *md_len) { + CCHmac(kCCHmacAlgSHA256, key, key_len, d, n, md); + *md_len = CC_SHA256_DIGEST_LENGTH; +} + + #elif defined HAVE_HMAC #include +#include #define DIGEST_BUFSIZ EVP_MAX_MD_SIZE +#define SHA256_DIGEST_BUFSIZE SHA256_DIGEST_LENGTH +#define HASH_LENGTH_SHA256 (SHA256_DIGEST_BUFSIZE * 2) + 1 static size_t s3_sign(unsigned char *digest, kstring_t *key, kstring_t *message) @@ -79,6 +107,16 @@ s3_sign(unsigned char *digest, kstring_t *key, kstring_t *message) return len; } + +static void s3_sha256(const unsigned 
char *in, size_t length, unsigned char *out) { + SHA256(in, length, out); +} + + +static void s3_sign_sha256(const void *key, int key_len, const unsigned char *d, int n, unsigned char *md, unsigned int *md_len) { + HMAC(EVP_sha256(), key, key_len, d, n, md, md_len); +} + #else #error No HMAC() routine found by configure #endif @@ -122,7 +160,7 @@ static void base64_kput(const unsigned char *data, size_t len, kstring_t *str) kputsn("==", pad, str); } -static int is_dns_compliant(const char *s0, const char *slim) +static int is_dns_compliant(const char *s0, const char *slim, int is_https) { int has_nondigit = 0, len = 0; const char *s; @@ -137,6 +175,7 @@ static int is_dns_compliant(const char *s0, const char *slim) else if (isdigit_c(*s)) ; else if (*s == '.') { + if (is_https) return 0; if (s == s0 || ! isalnum_c(s[-1])) return 0; if (s+1 == slim || ! isalnum_c(s[1])) return 0; } @@ -244,11 +283,20 @@ static int copy_auth_headers(s3_auth_data *ad, char ***hdrs) { } static void free_auth_data(s3_auth_data *ad) { + if (ad->refcount > 0) { + --ad->refcount; + return; + } free(ad->id.s); free(ad->token.s); free(ad->secret.s); + free(ad->region.s); + free(ad->canonical_query_string.s); + free(ad->user_query_string.s); + free(ad->host.s); free(ad->bucket); free(ad->auth_hdr.s); + free(ad->date_html.s); free(ad); } @@ -283,7 +331,7 @@ static int auth_header_callback(void *ctx, char ***hdrs) { return copy_auth_headers(ad, hdrs); } - if (ksprintf(&message, "%s\n\n\n%s\n%s%s%s/%s", + if (ksprintf(&message, "%s\n\n\n%s\n%s%s%s%s", ad->mode == 'r' ? "GET" : "PUT", ad->date + 6, ad->token.l ? "x-amz-security-token:" : "", ad->token.l ? 
ad->token.s : "", @@ -307,17 +355,154 @@ static int auth_header_callback(void *ctx, char ***hdrs) { return -1; } -static hFILE * s3_rewrite(const char *s3url, const char *mode, va_list *argsp) -{ - const char *bucket, *path; - char *header_list[4], **header = header_list; - kstring_t url = { 0, 0, NULL }; - kstring_t profile = { 0, 0, NULL }; - kstring_t host_base = { 0, 0, NULL }; - kstring_t token_hdr = { 0, 0, NULL }; +/* Like escape_path() but for query strings: '=' and '&' are left untouched */ +static char *escape_query(const char *qs) { + size_t i, j = 0, length; + char *escaped; + + length = strlen(qs); + + if ((escaped = malloc(length * 3 + 1)) == NULL) { + return NULL; + } + + for (i = 0; i < length; i++) { + int c = qs[i]; + + if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || + c == '_' || c == '-' || c == '~' || c == '.' || c == '/' || c == '=' || c == '&') { + escaped[j++] = c; + } else { + sprintf(escaped + j, "%%%02X", c); + j += 3; + } + } + + if (i != length) { + // in the case of a '?' copy the rest of the qs across unchanged + strcpy(escaped + j, qs + i); + } else { + escaped[j] = '\0'; + } + + return escaped; +} + + +static char *escape_path(const char *path) { + size_t i, j = 0, length; + char *escaped; + + length = strlen(path); + + if ((escaped = malloc(length * 3 + 1)) == NULL) { + return NULL; + } + + for (i = 0; i < length; i++) { + int c = path[i]; + + if (c == '?') break; // don't escape ? or beyond + + if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || + c == '_' || c == '-' || c == '~' || c == '.' || c == '/') { + escaped[j++] = c; + } else { + sprintf(escaped + j, "%%%02X", c); + j += 3; + } + } + if (i != length) { + // in the case of a '?'
copy the rest of the path across unchanged + strcpy(escaped + j, path + i); + } else { + escaped[j] = '\0'; + } + + return escaped; +} + + +static int is_escaped(const char *str) { + const char *c = str; + int escaped = 0; + int needs_escape = 0; + + while (*c != '\0') { + if (*c == '%' && c[1] != '\0' && c[2] != '\0') { + if (isxdigit_c(c[1]) && isxdigit_c(c[2])) { + escaped = 1; + c += 3; + continue; + } else { + // only escaped if all % signs are escaped + escaped = 0; + } + } + if (!((*c >= '0' && *c <= '9') || (*c >= 'A' && *c <= 'Z') + || (*c >= 'a' && *c <= 'z') || + *c == '_' || *c == '-' || *c == '~' || *c == '.' || *c == '/')) { + needs_escape = 1; + } + c++; + } + + return escaped || !needs_escape; +} + +static int redirect_endpoint_callback(void *auth, long response, + kstring_t *header, kstring_t *url) { + s3_auth_data *ad = (s3_auth_data *)auth; + char *new_region; + char *end; + int ret = -1; + + // get the new region from the reply header + if ((new_region = strstr(header->s, "x-amz-bucket-region: "))) { + + new_region += strlen("x-amz-bucket-region: "); + end = new_region; + + while (isalnum_c(*end) || ispunct_c(*end)) end++; + + *end = 0; + + if (strstr(ad->host.s, "amazonaws.com")) { + ad->region.l = 0; + kputs(new_region, &ad->region); + + ad->host.l = 0; + ksprintf(&ad->host, "s3.%s.amazonaws.com", new_region); + + if (ad->region.l && ad->host.l) { + url->l = 0; + kputs(ad->host.s, url); + kputsn(ad->bucket, strlen(ad->bucket), url); + if (ad->user_query_string.l) { + kputc('?', url); + kputsn(ad->user_query_string.s, ad->user_query_string.l, url); + } + ret = 0; + } + } + } + + return ret; +} + +static s3_auth_data * setup_auth_data(const char *s3url, const char *mode, + int sigver, kstring_t *url) +{ s3_auth_data *ad = calloc(1, sizeof(*ad)); + const char *bucket, *path; + char *escaped = NULL; + kstring_t profile = { 0, 0, NULL }; + size_t url_path_pos; + ptrdiff_t bucket_len; + int is_https = 1, dns_compliant; + char *query_start; if (!ad) 
return NULL; @@ -327,15 +512,21 @@ static hFILE * s3_rewrite(const char *s3url, const char *mode, va_list *argsp) if (s3url[2] == '+') { bucket = strchr(s3url, ':') + 1; - kputsn(&s3url[3], bucket - &s3url[3], &url); + if (bucket == NULL) { + free(ad); + return NULL; + } + kputsn(&s3url[3], bucket - &s3url[3], url); + is_https = strncmp(url->s, "https:", 6) == 0; } else { - kputs("https:", &url); + kputs("https:", url); bucket = &s3url[3]; } - while (*bucket == '/') kputc(*bucket++, &url); + while (*bucket == '/') kputc(*bucket++, url); path = bucket + strcspn(bucket, "/?#@"); + if (*path == '@') { const char *colon = strpbrk(bucket, ":@"); if (*colon != ':') { @@ -358,6 +549,8 @@ static hFILE * s3_rewrite(const char *s3url, const char *mode, va_list *argsp) if ((v = getenv("AWS_ACCESS_KEY_ID")) != NULL) kputs(v, &ad->id); if ((v = getenv("AWS_SECRET_ACCESS_KEY")) != NULL) kputs(v, &ad->secret); if ((v = getenv("AWS_SESSION_TOKEN")) != NULL) kputs(v, &ad->token); + if ((v = getenv("AWS_DEFAULT_REGION")) != NULL) kputs(v, &ad->region); + if ((v = getenv("HTS_S3_HOST")) != NULL) kputs(v, &ad->host); if ((v = getenv("AWS_DEFAULT_PROFILE")) != NULL) kputs(v, &profile); else if ((v = getenv("AWS_PROFILE")) != NULL) kputs(v, &profile); @@ -369,29 +562,117 @@ static hFILE * s3_rewrite(const char *s3url, const char *mode, va_list *argsp) parse_ini(v? v : "~/.aws/credentials", profile.s, "aws_access_key_id", &ad->id, "aws_secret_access_key", &ad->secret, - "aws_session_token", &ad->token, NULL); + "aws_session_token", &ad->token, + "region", &ad->region, NULL); } - if (ad->id.l == 0) - parse_ini("~/.s3cfg", profile.s, "access_key", &ad->id, + + if (ad->id.l == 0) { + const char *v = getenv("HTS_S3_S3CFG"); + parse_ini(v? 
v : "~/.s3cfg", profile.s, "access_key", &ad->id, "secret_key", &ad->secret, "access_token", &ad->token, - "host_base", &host_base, NULL); + "host_base", &ad->host, + "bucket_location", &ad->region, NULL); + } + if (ad->id.l == 0) parse_simple("~/.awssecret", &ad->id, &ad->secret); - if (host_base.l == 0) - kputs("s3.amazonaws.com", &host_base); + dns_compliant = is_dns_compliant(bucket, path, is_https); + + if (ad->host.l == 0) + kputs("s3.amazonaws.com", &ad->host); + + if (!dns_compliant && ad->region.l > 0 + && strcmp(ad->host.s, "s3.amazonaws.com") == 0) { + // Can avoid a redirection by including the region in the host name + // (assuming the right one has been specified) + ad->host.l = 0; + ksprintf(&ad->host, "s3.%s.amazonaws.com", ad->region.s); + } + + if (ad->region.l == 0) + kputs("us-east-1", &ad->region); + + if (!is_escaped(path)) { + escaped = escape_path(path); + if (escaped == NULL) { + goto error; + } + } + + bucket_len = path - bucket; + // Use virtual hosted-style access if possible, otherwise path-style. - if (is_dns_compliant(bucket, path)) { - kputsn(bucket, path - bucket, &url); - kputc('.', &url); - kputs(host_base.s, &url); + if (dns_compliant) { + size_t url_host_pos = url->l; + // Append "bucket.host" to url + kputsn_(bucket, bucket_len, url); + kputc('.', url); + kputsn(ad->host.s, ad->host.l, url); + url_path_pos = url->l; + + if (sigver == 4) { + // Copy back to ad->host to use when making the signature + ad->host.l = 0; + kputsn(url->s + url_host_pos, url->l - url_host_pos, &ad->host); + } } else { - kputs(host_base.s, &url); - kputc('/', &url); - kputsn(bucket, path - bucket, &url); + // Append "host/bucket" to url + kputsn(ad->host.s, ad->host.l, url); + url_path_pos = url->l; + kputc('/', url); + kputsn(bucket, bucket_len, url); + } + + kputs(escaped == NULL ? 
path : escaped, url); + + if (sigver == 4 || !dns_compliant) { + ad->bucket = malloc(url->l - url_path_pos + 1); + if (ad->bucket == NULL) { + goto error; + } + memcpy(ad->bucket, url->s + url_path_pos, url->l - url_path_pos + 1); } - kputs(path, &url); + else { + ad->bucket = malloc(url->l - url_path_pos + bucket_len + 2); + if (ad->bucket == NULL) { + goto error; + } + ad->bucket[0] = '/'; + memcpy(ad->bucket + 1, bucket, bucket_len); + memcpy(ad->bucket + bucket_len + 1, + url->s + url_path_pos, url->l - url_path_pos + 1); + } + + // write any query strings to its own place to use later + if ((query_start = strchr(ad->bucket, '?'))) { + kputs(query_start + 1, &ad->user_query_string); + *query_start = 0; + } + + free(profile.s); + free(escaped); + + return ad; + + error: + free(profile.s); + free(escaped); + free_auth_data(ad); + return NULL; +} + +static hFILE * s3_rewrite(const char *s3url, const char *mode, va_list *argsp) +{ + char *header_list[4], **header = header_list; + + kstring_t url = { 0, 0, NULL }; + kstring_t token_hdr = { 0, 0, NULL }; + s3_auth_data *ad = setup_auth_data(s3url, mode, 2, &url); + + if (!ad) + return NULL; if (ad->token.l > 0) { kputs("X-Amz-Security-Token: ", &token_hdr); @@ -399,48 +680,558 @@ static hFILE * s3_rewrite(const char *s3url, const char *mode, va_list *argsp) *header++ = token_hdr.s; } - ad->bucket = strdup(bucket); - if (!ad->bucket) - goto fail; - *header = NULL; hFILE *fp = hopen(url.s, mode, "va_list", argsp, "httphdr:v", header_list, "httphdr_callback", auth_header_callback, - "httphdr_callback_data", ad, NULL); + "httphdr_callback_data", ad, + "redirect_callback", redirect_endpoint_callback, + "redirect_callback_data", ad, + NULL); if (!fp) goto fail; free(url.s); - free(profile.s); - free(host_base.s); free(token_hdr.s); return fp; fail: free(url.s); - free(profile.s); - free(host_base.s); free(token_hdr.s); free_auth_data(ad); return NULL; } +/*************************************************************** + 
+AWS S3 sig version 4 writing code + +****************************************************************/ + +static void hash_string(char *in, size_t length, char *out) { + unsigned char hashed[SHA256_DIGEST_BUFSIZE]; + int i, j; + + s3_sha256((const unsigned char *)in, length, hashed); + + for (i = 0, j = 0; i < SHA256_DIGEST_BUFSIZE; i++, j+= 2) { + sprintf(out + j, "%02x", hashed[i]); + } +} + +static void ksinit(kstring_t *s) { + s->l = 0; + s->m = 0; + s->s = NULL; +} + + +static void ksfree(kstring_t *s) { + free(s->s); + ksinit(s); +} + + +static int make_signature(s3_auth_data *ad, kstring_t *string_to_sign, char *signature_string) { + unsigned char date_key[SHA256_DIGEST_BUFSIZE]; + unsigned char date_region_key[SHA256_DIGEST_BUFSIZE]; + unsigned char date_region_service_key[SHA256_DIGEST_BUFSIZE]; + unsigned char signing_key[SHA256_DIGEST_BUFSIZE]; + unsigned char signature[SHA256_DIGEST_BUFSIZE]; + + const unsigned char service[] = "s3"; + const unsigned char request[] = "aws4_request"; + + kstring_t secret_access_key = {0, 0, NULL}; + unsigned int len; + unsigned int i, j; + + ksprintf(&secret_access_key, "AWS4%s", ad->secret.s); + + if (secret_access_key.l == 0) { + return -1; + } + + s3_sign_sha256(secret_access_key.s, secret_access_key.l, (const unsigned char *)ad->date_short, strlen(ad->date_short), date_key, &len); + s3_sign_sha256(date_key, len, (const unsigned char *)ad->region.s, ad->region.l, date_region_key, &len); + s3_sign_sha256(date_region_key, len, service, 2, date_region_service_key, &len); + s3_sign_sha256(date_region_service_key, len, request, 12, signing_key, &len); + s3_sign_sha256(signing_key, len, (const unsigned char *)string_to_sign->s, string_to_sign->l, signature, &len); + + for (i = 0, j = 0; i < len; i++, j+= 2) { + sprintf(signature_string + j, "%02x", signature[i]); + } + + ksfree(&secret_access_key); + + return 0; +} + + +static int make_authorisation(s3_auth_data *ad, char *http_request, char *content, kstring_t *auth) { + 
kstring_t signed_headers = {0, 0, NULL}; + kstring_t canonical_headers = {0, 0, NULL}; + kstring_t canonical_request = {0, 0, NULL}; + kstring_t scope = {0, 0, NULL}; + kstring_t string_to_sign = {0, 0, NULL}; + char cr_hash[HASH_LENGTH_SHA256]; + char signature_string[HASH_LENGTH_SHA256]; + int ret = -1; + + + if (!ad->token.l) { + kputs("host;x-amz-content-sha256;x-amz-date", &signed_headers); + } else { + kputs("host;x-amz-content-sha256;x-amz-date;x-amz-security-token", &signed_headers); + } + + if (signed_headers.l == 0) { + return -1; + } + + + if (!ad->token.l) { + ksprintf(&canonical_headers, "host:%s\nx-amz-content-sha256:%s\nx-amz-date:%s\n", + ad->host.s, content, ad->date_long); + } else { + ksprintf(&canonical_headers, "host:%s\nx-amz-content-sha256:%s\nx-amz-date:%s\nx-amz-security-token:%s\n", + ad->host.s, content, ad->date_long, ad->token.s); + } + + if (canonical_headers.l == 0) { + goto cleanup; + } + + // bucket == canonical_uri + ksprintf(&canonical_request, "%s\n%s\n%s\n%s\n%s\n%s", + http_request, ad->bucket, ad->canonical_query_string.s, + canonical_headers.s, signed_headers.s, content); + + if (canonical_request.l == 0) { + goto cleanup; + } + + hash_string(canonical_request.s, canonical_request.l, cr_hash); + + ksprintf(&scope, "%s/%s/s3/aws4_request", ad->date_short, ad->region.s); + + if (scope.l == 0) { + goto cleanup; + } + + ksprintf(&string_to_sign, "AWS4-HMAC-SHA256\n%s\n%s\n%s", ad->date_long, scope.s, cr_hash); + + if (string_to_sign.l == 0) { + goto cleanup; + } + + if (make_signature(ad, &string_to_sign, signature_string)) { + goto cleanup; + } + + ksprintf(auth, "Authorization: AWS4-HMAC-SHA256 Credential=%s/%s/%s/s3/aws4_request,SignedHeaders=%s,Signature=%s", + ad->id.s, ad->date_short, ad->region.s, signed_headers.s, signature_string); + + if (auth->l == 0) { + goto cleanup; + } + + ret = 0; + + cleanup: + ksfree(&signed_headers); + ksfree(&canonical_headers); + ksfree(&canonical_request); + ksfree(&scope); + 
ksfree(&string_to_sign); + + return ret; +} + + +static int update_time(s3_auth_data *ad) { + int ret = -1; + time_t now = time(NULL); +#ifdef HAVE_GMTIME_R + struct tm tm_buffer; + struct tm *tm = gmtime_r(&now, &tm_buffer); +#else + struct tm *tm = gmtime(&now); +#endif + + if (now - ad->auth_time > AUTH_LIFETIME) { + // update timestamp + ad->auth_time = now; + + if (strftime(ad->date_long, 17, "%Y%m%dT%H%M%SZ", tm) != 16) { + return -1; + } + + if (strftime(ad->date_short, 9, "%Y%m%d", tm) != 8) { + return -1;; + } + + ad->date_html.l = 0; + ksprintf(&ad->date_html, "x-amz-date: %s", ad->date_long); + } + + if (ad->date_html.l) ret = 0; + + return ret; +} + + +static int query_cmp(const void *p1, const void *p2) { + char **q1 = (char **)p1; + char **q2 = (char **)p2; + + return strcmp(*q1, *q2); +} + + +/* Query strings must be in alphabetical order for authorisation */ + +static int order_query_string(kstring_t *qs) { + int *query_offset; + int num_queries, i; + char **queries; + kstring_t ordered = {0, 0, NULL}; + char *escaped; + + if ((query_offset = ksplit(qs, '&', &num_queries)) == NULL) { + return -1; + } + + if ((queries = malloc(num_queries * sizeof(char*))) == NULL) { + return -1; + } + + for (i = 0; i < num_queries; i++) { + queries[i] = qs->s + query_offset[i]; + } + + qsort(queries, num_queries, sizeof(char *), query_cmp); + + for (i = 0; i < num_queries; i++) { + if (i) { + kputs("&", &ordered); + } + + kputs(queries[i], &ordered); + } + + if ((escaped = escape_query(ordered.s)) == NULL) { + return -1; + } + + qs->l = 0; + kputs(escaped, qs); + + free(ordered.s); + free(queries); + free(query_offset); + free(escaped); + + return 0; +} + + +static int write_authorisation_callback(void *auth, char *request, kstring_t *content, char *cqs, + kstring_t *hash, kstring_t *auth_str, kstring_t *date, + kstring_t *token, int uqs) { + s3_auth_data *ad = (s3_auth_data *)auth; + char content_hash[HASH_LENGTH_SHA256]; + + if (request == NULL) { + // signal to 
free auth data + free_auth_data(ad); + return 0; + } + + if (update_time(ad)) { + return -1; + } + + if (content) { + hash_string(content->s, content->l, content_hash); + } else { + // empty hash + hash_string("", 0, content_hash); + } + + ad->canonical_query_string.l = 0; + kputs(cqs, &ad->canonical_query_string); + + if (ad->canonical_query_string.l == 0) { + return -1; + } + + /* add a user provided query string, normally only useful on upload initiation */ + if (uqs) { + kputs("&", &ad->canonical_query_string); + kputs(ad->user_query_string.s, &ad->canonical_query_string); + + if (order_query_string(&ad->canonical_query_string)) { + return -1; + } + } + + if (make_authorisation(ad, request, content_hash, auth_str)) { + return -1; + } + + kputs(ad->date_html.s, date); + kputsn(content_hash, HASH_LENGTH_SHA256, hash); + + if (date->l == 0 || hash->l == 0) { + return -1; + } + + if (ad->token.l) { + ksprintf(token, "x-amz-security-token: %s", ad->token.s); + } + + return 0; +} + + +static int v4_auth_header_callback(void *ctx, char ***hdrs) { + s3_auth_data *ad = (s3_auth_data *) ctx; + char content_hash[HASH_LENGTH_SHA256]; + kstring_t content = {0, 0, NULL}; + kstring_t authorisation = {0, 0, NULL}; + char *date_html = NULL; + + if (!hdrs) { // Closing connection + free_auth_data(ad); + return 0; + } + + if (update_time(ad)) { + return -1; + } + + hash_string("", 0, content_hash); // empty hash + + ad->canonical_query_string.l = 0; + + if (ad->user_query_string.l > 0) { + kputs(ad->user_query_string.s, &ad->canonical_query_string); + + if (order_query_string(&ad->canonical_query_string)) { + return -1; + } + } else { + kputs("", &ad->canonical_query_string); + } + + if (make_authorisation(ad, "GET", content_hash, &authorisation)) { + return -1; + } + + ksprintf(&content, "x-amz-content-sha256: %s", content_hash); + date_html = strdup(ad->date_html.s); + + if (content.l == 0 || date_html == NULL) { + ksfree(&authorisation); + ksfree(&content); + free(date_html); 
+ return -1; + } + + *hdrs = &ad->headers[0]; + ad->headers[0] = ks_release(&authorisation); + ad->headers[1] = date_html; + ad->headers[2] = ks_release(&content); + ad->headers[3] = NULL; + + return 0; +} + +static int handle_400_response(hFILE *fp, s3_auth_data *ad) { + // v4 signatures in virtual hosted mode return 400 Bad Request if the + // wrong region is used to make the signature. The response is an xml + // document which includes the name of the correct region. This can + // be extracted and used to generate a corrected signature. + // As the xml is fairly simple, go with something "good enough" instead + // of trying to parse it properly. + + char buffer[1024], *region, *reg_end; + ssize_t bytes; + + bytes = hread(fp, buffer, sizeof(buffer) - 1); + if (bytes < 0) { + return -1; + } + buffer[bytes] = '\0'; + region = strstr(buffer, ""); + if (region == NULL) { + return -1; + } + region += 8; + while (isspace((unsigned char) *region)) ++region; + reg_end = strchr(region, '<'); + if (reg_end == NULL || strncmp(reg_end + 1, "/Region>", 8) != 0) { + return -1; + } + while (reg_end > region && isspace((unsigned char) reg_end[-1])) --reg_end; + ad->region.l = 0; + kputsn(region, reg_end - region, &ad->region); + if (ad->region.l == 0) { + return -1; + } + + return 0; +} + +static int set_region(void *adv, kstring_t *region) { + s3_auth_data *ad = (s3_auth_data *) adv; + + ad->region.l = 0; + return kputsn(region->s, region->l, &ad->region) < 0; +} + +static int http_status_errno(int status) +{ + if (status >= 500) + switch (status) { + case 501: return ENOSYS; + case 503: return EBUSY; + case 504: return ETIMEDOUT; + default: return EIO; + } + else if (status >= 400) + switch (status) { + case 401: return EPERM; + case 403: return EACCES; + case 404: return ENOENT; + case 405: return EROFS; + case 407: return EPERM; + case 408: return ETIMEDOUT; + case 410: return ENOENT; + default: return EINVAL; + } + else return 0; +} + +static hFILE *s3_open_v4(const char 
*s3url, const char *mode, va_list *argsp) { + kstring_t url = { 0, 0, NULL }; + kstring_t token_hdr = { 0, 0, NULL }; + + char *header_list[4], **header = header_list; + s3_auth_data *ad = setup_auth_data(s3url, mode, 4, &url); + hFILE *fp = NULL; + + if (ad == NULL) { + return NULL; + } + + if (ad->mode == 'r') { + long http_response = 0; + + if (ad->token.l > 0) { + kputs("x-amz-security-token: ", &token_hdr); + kputs(ad->token.s, &token_hdr); + *header++ = token_hdr.s; + } + + *header = NULL; + fp = hopen(url.s, mode, "va_list", argsp, "httphdr:v", header_list, + "httphdr_callback", v4_auth_header_callback, + "httphdr_callback_data", ad, + "redirect_callback", redirect_endpoint_callback, + "redirect_callback_data", ad, + "http_response_ptr", &http_response, + "fail_on_error", 0, + NULL); + + if (fp == NULL) goto error; + + if (http_response == 400) { + ad->refcount = 1; + if (handle_400_response(fp, ad) != 0) { + goto error; + } + hclose_abruptly(fp); + fp = hopen(url.s, mode, "va_list", argsp, "httphdr:v", header_list, + "httphdr_callback", v4_auth_header_callback, + "httphdr_callback_data", ad, + "redirect_callback", redirect_endpoint_callback, + "redirect_callback_data", ad, + NULL); + } else if (http_response > 400) { + ad->refcount = 1; + errno = http_status_errno(http_response); + goto error; + } + + if (fp == NULL) goto error; + } else { + kstring_t final_url = {0, 0, NULL}; + + // add the scheme marker + ksprintf(&final_url, "s3w+%s", url.s); + + if(final_url.l == 0) goto error; + + fp = hopen(final_url.s, mode, "va_list", argsp, + "s3_auth_callback", write_authorisation_callback, + "s3_auth_callback_data", ad, + "redirect_callback", redirect_endpoint_callback, + "set_region_callback", set_region, + NULL); + free(final_url.s); + + if (fp == NULL) goto error; + } + + free(url.s); + free(token_hdr.s); + + return fp; + + error: + + if (fp) hclose_abruptly(fp); + free(url.s); + free(token_hdr.s); + free_auth_data(ad); + + return NULL; +} + + static hFILE 
*s3_open(const char *url, const char *mode) { + hFILE *fp; + kstring_t mode_colon = { 0, 0, NULL }; kputs(mode, &mode_colon); kputc(':', &mode_colon); - hFILE *fp = s3_rewrite(url, mode_colon.s, NULL); + + if (getenv("HTS_S3_V2") == NULL) { // Force the v2 signature code + fp = s3_open_v4(url, mode_colon.s, NULL); + } else { + fp = s3_rewrite(url, mode_colon.s, NULL); + } + free(mode_colon.s); + return fp; } static hFILE *s3_vopen(const char *url, const char *mode_colon, va_list args0) { + hFILE *fp; // Need to use va_copy() as we can only take the address of an actual // va_list object, not that of a parameter whose type may have decayed. va_list args; va_copy(args, args0); - hFILE *fp = s3_rewrite(url, mode_colon, &args); + + if (getenv("HTS_S3_V2") == NULL) { // Force the v2 signature code + fp = s3_open_v4(url, mode_colon, &args); + } else { + fp = s3_rewrite(url, mode_colon, &args); + } + va_end(args); return fp; } @@ -453,7 +1244,7 @@ int PLUGIN_GLOBAL(hfile_plugin_init,_s3)(struct hFILE_plugin *self) #ifdef ENABLE_PLUGINS // Embed version string for examination via strings(1) or what(1) - static const char id[] = "@(#)hfile_s3 plugin (htslib)\t" HTS_VERSION; + static const char id[] = "@(#)hfile_s3 plugin (htslib)\t" HTS_VERSION_TEXT; if (hts_verbose >= 9) fprintf(stderr, "[M::hfile_s3.init] version %s\n", strchr(id, '\t')+1); #endif diff --git a/hfile_s3_write.c b/hfile_s3_write.c new file mode 100644 index 000000000..900862225 --- /dev/null +++ b/hfile_s3_write.c @@ -0,0 +1,896 @@ +/* + hfile_s3_write.c - Code to handle mulitpart uploading to S3. + + Copyright (C) 2019 Genome Research Ltd. 
+ + Author: Andrew Whitwham + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + + +S3 Multipart Upload +------------------- + +There are several steps in the Multipart upload. + + +1) Initiate Upload +------------------ + +Initiate the upload and get an upload ID. This ID is used in all other steps. + + +2) Upload Part +-------------- + +Upload a part of the data. 5MB minimum part size (except for the last part). +Each part is numbered and a successful upload returns an ETag header value that +needs to be used for the completion step. + +This step is repeated until all data is uploaded. + + +3) Completion +------------- + +Complete the upload by sending all the part numbers along with their associated +ETag values. + + +Optional - Abort +---------------- + +If something goes wrong this instructs the server to delete all the partial +uploads and abandon the upload process. 
+ + +Andrew Whitwham, January 2019 +*/ + +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h +#include + +#include +#include +#include +#ifdef __MSYS__ +#include +#endif +#include +#include + +#include "hfile_internal.h" +#ifdef ENABLE_PLUGINS +#include "version.h" +#endif +#include "htslib/hts.h" +#include "htslib/kstring.h" +#include "htslib/khash.h" + +#include + +#define MINIMUM_S3_WRITE_SIZE 5242880 +#define S3_MOVED_PERMANENTLY 301 +#define S3_BAD_REQUEST 400 + +// Lets the part memory size grow to about 1Gb giving a 2.5Tb max file size. +// Max. parts allowed by AWS is 10000, so use ceil(10000.0/9.0) +#define EXPAND_ON 1112 + +static struct { + kstring_t useragent; + CURLSH *share; + pthread_mutex_t share_lock; +} curl = { { 0, 0, NULL }, NULL, PTHREAD_MUTEX_INITIALIZER }; + +static void share_lock(CURL *handle, curl_lock_data data, + curl_lock_access access, void *userptr) { + pthread_mutex_lock(&curl.share_lock); +} + +static void share_unlock(CURL *handle, curl_lock_data data, void *userptr) { + pthread_mutex_unlock(&curl.share_lock); +} + +typedef int (*s3_auth_callback) (void *auth_data, char *, kstring_t*, char*, kstring_t*, kstring_t*, kstring_t*, kstring_t*, int); + +typedef int (*set_region_callback) (void *auth_data, kstring_t *region); + +typedef struct { + s3_auth_callback callback; + redirect_callback redirect_callback; + set_region_callback set_region_callback; + void *callback_data; +} s3_authorisation; + +typedef struct { + hFILE base; + CURL *curl; + CURLcode ret; + s3_authorisation *au; + kstring_t buffer; + kstring_t url; + kstring_t upload_id; + kstring_t completion_message; + int part_no; + int aborted; + size_t index; + long verbose; + int part_size; + int expand; +} hFILE_s3_write; + + +static void ksinit(kstring_t *s) { + s->l = 0; + s->m = 0; + s->s = NULL; +} + + +static void ksfree(kstring_t *s) { + free(s->s); + ksinit(s); +} + + +static size_t response_callback(void *contents, size_t size, size_t nmemb, 
void *userp) { + size_t realsize = size * nmemb; + kstring_t *resp = (kstring_t *)userp; + + if (kputsn((const char *)contents, realsize, resp) == EOF) { + return 0; + } + + return realsize; +} + + +static int get_entry(char *in, char *start_tag, char *end_tag, kstring_t *out) { + char *start; + char *end; + + if (!in) { + return EOF; + } + + start = strstr(in, start_tag); + if (!start) return EOF; + + start += strlen(start_tag); + end = strstr(start, end_tag); + + if (!end) return EOF; + + return kputsn(start, end - start, out); +} + + +static void cleanup_local(hFILE_s3_write *fp) { + ksfree(&fp->buffer); + ksfree(&fp->url); + ksfree(&fp->upload_id); + ksfree(&fp->completion_message); + curl_easy_cleanup(fp->curl); + free(fp->au); + +} + + +static void cleanup(hFILE_s3_write *fp) { + // free up authorisation data + fp->au->callback(fp->au->callback_data, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 0); + cleanup_local(fp); +} + + +static struct curl_slist *set_html_headers(hFILE_s3_write *fp, kstring_t *auth, kstring_t *date, kstring_t *content, kstring_t *token) { + struct curl_slist *headers = NULL; + + headers = curl_slist_append(headers, "Content-Type:"); // get rid of this + headers = curl_slist_append(headers, "Expect:"); // and this + headers = curl_slist_append(headers, auth->s); + headers = curl_slist_append(headers, date->s); + headers = curl_slist_append(headers, content->s); + + if (token->l) { + headers = curl_slist_append(headers, token->s); + } + + curl_easy_setopt(fp->curl, CURLOPT_HTTPHEADER, headers); + + return headers; +} + + +/* + The partially uploaded file will hang around unless the delete command is sent. 
+*/ +static int abort_upload(hFILE_s3_write *fp) { + kstring_t content_hash = {0, 0, NULL}; + kstring_t authorisation = {0, 0, NULL}; + kstring_t url = {0, 0, NULL}; + kstring_t content = {0, 0, NULL}; + kstring_t canonical_query_string = {0, 0, NULL}; + kstring_t date = {0, 0, NULL}; + kstring_t token = {0, 0, NULL}; + int ret = -1; + struct curl_slist *headers = NULL; + char http_request[] = "DELETE"; + + if (ksprintf(&canonical_query_string, "uploadId=%s", fp->upload_id.s) < 0) { + goto out; + } + + if (fp->au->callback(fp->au->callback_data, http_request, NULL, + canonical_query_string.s, &content_hash, + &authorisation, &date, &token, 0) != 0) { + goto out; + } + + if (ksprintf(&url, "%s?%s", fp->url.s, canonical_query_string.s) < 0) { + goto out; + } + + if (ksprintf(&content, "x-amz-content-sha256: %s", content_hash.s) < 0) { + goto out; + } + + curl_easy_reset(fp->curl); + curl_easy_setopt(fp->curl, CURLOPT_CUSTOMREQUEST, http_request); + curl_easy_setopt(fp->curl, CURLOPT_USERAGENT, curl.useragent.s); + curl_easy_setopt(fp->curl, CURLOPT_URL, url.s); + + curl_easy_setopt(fp->curl, CURLOPT_VERBOSE, fp->verbose); + + headers = set_html_headers(fp, &authorisation, &date, &content, &token); + fp->ret = curl_easy_perform(fp->curl); + + if (fp->ret == CURLE_OK) { + ret = 0; + } + + out: + ksfree(&authorisation); + ksfree(&content); + ksfree(&content_hash); + ksfree(&url); + ksfree(&date); + ksfree(&canonical_query_string); + ksfree(&token); + curl_slist_free_all(headers); + + fp->aborted = 1; + cleanup(fp); + + return ret; +} + + +static int complete_upload(hFILE_s3_write *fp, kstring_t *resp) { + kstring_t content_hash = {0, 0, NULL}; + kstring_t authorisation = {0, 0, NULL}; + kstring_t url = {0, 0, NULL}; + kstring_t content = {0, 0, NULL}; + kstring_t canonical_query_string = {0, 0, NULL}; + kstring_t date = {0, 0, NULL}; + kstring_t token = {0, 0, NULL}; + int ret = -1; + struct curl_slist *headers = NULL; + char http_request[] = "POST"; + + if 
(ksprintf(&canonical_query_string, "uploadId=%s", fp->upload_id.s) < 0) { + return -1; + } + + // finish off the completion reply + if (kputs("\n", &fp->completion_message) < 0) { + goto out; + } + + if (fp->au->callback(fp->au->callback_data, http_request, + &fp->completion_message, canonical_query_string.s, + &content_hash, &authorisation, &date, &token, 0) != 0) { + goto out; + } + + if (ksprintf(&url, "%s?%s", fp->url.s, canonical_query_string.s) < 0) { + goto out; + } + + if (ksprintf(&content, "x-amz-content-sha256: %s", content_hash.s) < 0) { + goto out; + } + + curl_easy_reset(fp->curl); + curl_easy_setopt(fp->curl, CURLOPT_POST, 1L); + curl_easy_setopt(fp->curl, CURLOPT_POSTFIELDS, fp->completion_message.s); + curl_easy_setopt(fp->curl, CURLOPT_POSTFIELDSIZE, fp->completion_message.l); + curl_easy_setopt(fp->curl, CURLOPT_WRITEFUNCTION, response_callback); + curl_easy_setopt(fp->curl, CURLOPT_WRITEDATA, (void *)resp); + curl_easy_setopt(fp->curl, CURLOPT_URL, url.s); + curl_easy_setopt(fp->curl, CURLOPT_USERAGENT, curl.useragent.s); + + curl_easy_setopt(fp->curl, CURLOPT_VERBOSE, fp->verbose); + + headers = set_html_headers(fp, &authorisation, &date, &content, &token); + fp->ret = curl_easy_perform(fp->curl); + + if (fp->ret == CURLE_OK) { + ret = 0; + } + + out: + ksfree(&authorisation); + ksfree(&content); + ksfree(&content_hash); + ksfree(&url); + ksfree(&date); + ksfree(&token); + ksfree(&canonical_query_string); + curl_slist_free_all(headers); + + return ret; +} + + +static size_t upload_callback(void *ptr, size_t size, size_t nmemb, void *stream) { + size_t realsize = size * nmemb; + hFILE_s3_write *fp = (hFILE_s3_write *)stream; + size_t read_length; + + if (realsize > (fp->buffer.l - fp->index)) { + read_length = fp->buffer.l - fp->index; + } else { + read_length = realsize; + } + + memcpy(ptr, fp->buffer.s + fp->index, read_length); + fp->index += read_length; + + return read_length; +} + + +static int upload_part(hFILE_s3_write *fp, kstring_t 
*resp) { + kstring_t content_hash = {0, 0, NULL}; + kstring_t authorisation = {0, 0, NULL}; + kstring_t url = {0, 0, NULL}; + kstring_t content = {0, 0, NULL}; + kstring_t canonical_query_string = {0, 0, NULL}; + kstring_t date = {0, 0, NULL}; + kstring_t token = {0, 0, NULL}; + int ret = -1; + struct curl_slist *headers = NULL; + char http_request[] = "PUT"; + + if (ksprintf(&canonical_query_string, "partNumber=%d&uploadId=%s", fp->part_no, fp->upload_id.s) < 0) { + return -1; + } + + if (fp->au->callback(fp->au->callback_data, http_request, &fp->buffer, + canonical_query_string.s, &content_hash, + &authorisation, &date, &token, 0) != 0) { + goto out; + } + + if (ksprintf(&url, "%s?%s", fp->url.s, canonical_query_string.s) < 0) { + goto out; + } + + fp->index = 0; + if (ksprintf(&content, "x-amz-content-sha256: %s", content_hash.s) < 0) { + goto out; + } + + curl_easy_reset(fp->curl); + + curl_easy_setopt(fp->curl, CURLOPT_UPLOAD, 1L); + curl_easy_setopt(fp->curl, CURLOPT_READFUNCTION, upload_callback); + curl_easy_setopt(fp->curl, CURLOPT_READDATA, fp); + curl_easy_setopt(fp->curl, CURLOPT_INFILESIZE_LARGE, (curl_off_t)fp->buffer.l); + curl_easy_setopt(fp->curl, CURLOPT_HEADERFUNCTION, response_callback); + curl_easy_setopt(fp->curl, CURLOPT_HEADERDATA, (void *)resp); + curl_easy_setopt(fp->curl, CURLOPT_URL, url.s); + curl_easy_setopt(fp->curl, CURLOPT_USERAGENT, curl.useragent.s); + + curl_easy_setopt(fp->curl, CURLOPT_VERBOSE, fp->verbose); + + headers = set_html_headers(fp, &authorisation, &date, &content, &token); + fp->ret = curl_easy_perform(fp->curl); + + if (fp->ret == CURLE_OK) { + ret = 0; + } + + out: + ksfree(&authorisation); + ksfree(&content); + ksfree(&content_hash); + ksfree(&url); + ksfree(&date); + ksfree(&token); + ksfree(&canonical_query_string); + curl_slist_free_all(headers); + + return ret; +} + + +static ssize_t s3_write(hFILE *fpv, const void *bufferv, size_t nbytes) { + hFILE_s3_write *fp = (hFILE_s3_write *)fpv; + const char *buffer = 
(const char *)bufferv; + + if (kputsn(buffer, nbytes, &fp->buffer) == EOF) { + return -1; + } + + if (fp->buffer.l > fp->part_size) { + // time to write out our data + kstring_t response = {0, 0, NULL}; + int ret; + + ret = upload_part(fp, &response); + + if (!ret) { + long response_code; + kstring_t etag = {0, 0, NULL}; + + curl_easy_getinfo(fp->curl, CURLINFO_RESPONSE_CODE, &response_code); + + if (response_code > 200) { + ret = -1; + } else { + if (get_entry(response.s, "ETag: \"", "\"", &etag) == EOF) { + ret = -1; + } else { + ksprintf(&fp->completion_message, "\t\n\t\t%d\n\t\t%s\n\t\n", + fp->part_no, etag.s); + + ksfree(&etag); + } + } + } + + ksfree(&response); + + if (ret) { + abort_upload(fp); + return -1; + } + + fp->part_no++; + fp->buffer.l = 0; + + if (fp->expand && (fp->part_no % EXPAND_ON == 0)) { + fp->part_size *= 2; + } + } + + return nbytes; +} + + +static int s3_close(hFILE *fpv) { + hFILE_s3_write *fp = (hFILE_s3_write *)fpv; + kstring_t response = {0, 0, NULL}; + int ret = 0; + + if (!fp->aborted) { + + if (fp->buffer.l) { + // write the last part + + ret = upload_part(fp, &response); + + if (!ret) { + long response_code; + kstring_t etag = {0, 0, NULL}; + + curl_easy_getinfo(fp->curl, CURLINFO_RESPONSE_CODE, &response_code); + + if (response_code > 200) { + ret = -1; + } else { + if (get_entry(response.s, "ETag: \"", "\"", &etag) == EOF) { + ret = -1; + } else { + ksprintf(&fp->completion_message, "\t\n\t\t%d\n\t\t%s\n\t\n", + fp->part_no, etag.s); + + ksfree(&etag); + } + } + } + + ksfree(&response); + + if (ret) { + abort_upload(fp); + return -1; + } + + fp->part_no++; + } + + if (fp->part_no > 1) { + ret = complete_upload(fp, &response); + + if (!ret) { + if (strstr(response.s, "CompleteMultipartUploadResult") == NULL) { + ret = -1; + } + } + } else { + ret = -1; + } + + if (ret) { + abort_upload(fp); + } else { + cleanup(fp); + } + } + + ksfree(&response); + + return ret; +} + + +static int redirect_endpoint(hFILE_s3_write *fp, kstring_t 
*head) { + int ret = -1; + + if (fp->au->redirect_callback) { + ret = fp->au->redirect_callback(fp->au->callback_data, 301, head, &fp->url); + } + + return ret; +} + +static int handle_bad_request(hFILE_s3_write *fp, kstring_t *resp) { + kstring_t region = {0, 0, NULL}; + int ret = -1; + + if (fp->au->set_region_callback) { + if (get_entry(resp->s, "", "", ®ion) == EOF) { + return -1; + } + + ret = fp->au->set_region_callback(fp->au->callback_data, ®ion); + + ksfree(®ion); + } + + return ret; +} + +static int initialise_upload(hFILE_s3_write *fp, kstring_t *head, kstring_t *resp, int user_query) { + kstring_t content_hash = {0, 0, NULL}; + kstring_t authorisation = {0, 0, NULL}; + kstring_t url = {0, 0, NULL}; + kstring_t content = {0, 0, NULL}; + kstring_t date = {0, 0, NULL}; + kstring_t token = {0, 0, NULL}; + int ret = -1; + struct curl_slist *headers = NULL; + char http_request[] = "POST"; + char delimeter = '?'; + + if (user_query) { + delimeter = '&'; + } + + if (fp->au->callback(fp->au->callback_data, http_request, NULL, "uploads=", + &content_hash, &authorisation, &date, &token, user_query) != 0) { + goto out; + } + + if (ksprintf(&url, "%s%cuploads", fp->url.s, delimeter) < 0) { + goto out; + } + + if (ksprintf(&content, "x-amz-content-sha256: %s", content_hash.s) < 0) { + goto out; + } + + curl_easy_setopt(fp->curl, CURLOPT_URL, url.s); + curl_easy_setopt(fp->curl, CURLOPT_POST, 1L); + curl_easy_setopt(fp->curl, CURLOPT_POSTFIELDS, ""); // send no data + curl_easy_setopt(fp->curl, CURLOPT_WRITEFUNCTION, response_callback); + curl_easy_setopt(fp->curl, CURLOPT_WRITEDATA, (void *)resp); + curl_easy_setopt(fp->curl, CURLOPT_HEADERFUNCTION, response_callback); + curl_easy_setopt(fp->curl, CURLOPT_HEADERDATA, (void *)head); + curl_easy_setopt(fp->curl, CURLOPT_USERAGENT, curl.useragent.s); + + curl_easy_setopt(fp->curl, CURLOPT_VERBOSE, fp->verbose); + + headers = set_html_headers(fp, &authorisation, &date, &content, &token); + fp->ret = 
curl_easy_perform(fp->curl); + + if (fp->ret == CURLE_OK) { + ret = 0; + } + + out: + ksfree(&authorisation); + ksfree(&content); + ksfree(&content_hash); + ksfree(&url); + ksfree(&date); + ksfree(&token); + curl_slist_free_all(headers); + + return ret; +} + + +static int get_upload_id(hFILE_s3_write *fp, kstring_t *resp) { + int ret = 0; + + ksinit(&fp->upload_id); + + if (get_entry(resp->s, "", "", &fp->upload_id) == EOF) { + ret = -1; + } + + return ret; +} + + +static const struct hFILE_backend s3_write_backend = { + NULL, s3_write, NULL, NULL, s3_close +}; + + +static hFILE *s3_write_open(const char *url, s3_authorisation *auth) { + hFILE_s3_write *fp; + kstring_t response = {0, 0, NULL}; + kstring_t header = {0, 0, NULL}; + int ret, has_user_query = 0; + char *query_start; + const char *env; + + + if (!auth || !auth->callback || !auth->callback_data) { + return NULL; + } + + fp = (hFILE_s3_write *)hfile_init(sizeof(hFILE_s3_write), "w", 0); + + if (fp == NULL) { + return NULL; + } + + if ((fp->curl = curl_easy_init()) == NULL) { + errno = ENOMEM; + goto error; + } + + if ((fp->au = calloc(1, sizeof(s3_authorisation))) == NULL) { + goto error; + } + + memcpy(fp->au, auth, sizeof(s3_authorisation)); + + ksinit(&fp->buffer); + ksinit(&fp->url); + ksinit(&fp->completion_message); + fp->aborted = 0; + + fp->part_size = MINIMUM_S3_WRITE_SIZE; + fp->expand = 1; + + if ((env = getenv("HTS_S3_PART_SIZE")) != NULL) { + int part_size = atoi(env) * 1024 * 1024; + + if (part_size > fp->part_size) + fp->part_size = part_size; + + fp->expand = 0; + } + + if (hts_verbose >= 8) { + fp->verbose = 1L; + } else { + fp->verbose = 0L; + } + + kputs(url + 4, &fp->url); + + if ((query_start = strchr(fp->url.s, '?'))) { + has_user_query = 1;; + } + + ret = initialise_upload(fp, &header, &response, has_user_query); + + if (ret == 0) { + long response_code; + + curl_easy_getinfo(fp->curl, CURLINFO_RESPONSE_CODE, &response_code); + + if (response_code == S3_MOVED_PERMANENTLY) { + if 
(redirect_endpoint(fp, &header) == 0) { + ksfree(&response); + ksfree(&header); + + ret = initialise_upload(fp, &header, &response, has_user_query); + } + } else if (response_code == S3_BAD_REQUEST) { + if (handle_bad_request(fp, &response) == 0) { + ksfree(&response); + ksfree(&header); + + ret = initialise_upload(fp, &header, &response, has_user_query); + } + } + + ksfree(&header); // no longer needed + } + + if (ret) goto error; + + if (get_upload_id(fp, &response)) goto error; + + // start the completion message (a formatted list of parts) + ksinit(&fp->completion_message); + + if (kputs("\n", &fp->completion_message) == EOF) { + goto error; + } + + fp->part_no = 1; + + // user query string no longer a useful part of the URL + if (query_start) + *query_start = '\0'; + + fp->base.backend = &s3_write_backend; + ksfree(&response); + + return &fp->base; + +error: + ksfree(&response); + cleanup_local(fp); + hfile_destroy((hFILE *)fp); + return NULL; +} + + +static hFILE *hopen_s3_write(const char *url, const char *mode) { + if (hts_verbose >= 1) { + fprintf(stderr, "[E::%s] s3w:// URLs should not be used directly; use s3:// instead.\n", __func__); + } + return NULL; +} + + +static int parse_va_list(s3_authorisation *auth, va_list args) { + const char *argtype; + + while ((argtype = va_arg(args, const char *)) != NULL) { + if (strcmp(argtype, "s3_auth_callback") == 0) { + auth->callback = va_arg(args, s3_auth_callback); + } else if (strcmp(argtype, "s3_auth_callback_data") == 0) { + auth->callback_data = va_arg(args, void *); + } else if (strcmp(argtype, "redirect_callback") == 0) { + auth->redirect_callback = va_arg(args, redirect_callback); + } else if (strcmp(argtype, "set_region_callback") == 0) { + auth->set_region_callback = va_arg(args, set_region_callback); + } else if (strcmp(argtype, "va_list") == 0) { + va_list *args2 = va_arg(args, va_list *); + + if (args2) { + if (parse_va_list(auth, *args2) < 0) return -1; + } + } else { + errno = EINVAL; + return -1; 
+ } + } + + return 0; +} + + +static hFILE *vhopen_s3_write(const char *url, const char *mode, va_list args) { + hFILE *fp = NULL; + s3_authorisation auth = {NULL, NULL, NULL}; + + if (parse_va_list(&auth, args) == 0) { + fp = s3_write_open(url, &auth); + } + + return fp; +} + + +static void s3_write_exit() { + if (curl_share_cleanup(curl.share) == CURLSHE_OK) + curl.share = NULL; + + free(curl.useragent.s); + curl.useragent.l = curl.useragent.m = 0; curl.useragent.s = NULL; + curl_global_cleanup(); +} + + +int PLUGIN_GLOBAL(hfile_plugin_init,_s3_write)(struct hFILE_plugin *self) { + + static const struct hFILE_scheme_handler handler = + { hopen_s3_write, hfile_always_remote, "S3 Multipart Upload", + 2000 + 50, vhopen_s3_write + }; + +#ifdef ENABLE_PLUGINS + // Embed version string for examination via strings(1) or what(1) + static const char id[] = + "@(#)hfile_s3_write plugin (htslib)\t" HTS_VERSION_TEXT; + const char *version = strchr(id, '\t') + 1; + + if (hts_verbose >= 9) + fprintf(stderr, "[M::hfile_s3_write.init] version %s\n", + version); +#else + const char *version = hts_version(); +#endif + + const curl_version_info_data *info; + CURLcode err; + CURLSHcode errsh; + + err = curl_global_init(CURL_GLOBAL_ALL); + + if (err != CURLE_OK) { + // look at putting in an errno here + return -1; + } + + curl.share = curl_share_init(); + + if (curl.share == NULL) { + curl_global_cleanup(); + errno = EIO; + return -1; + } + + errsh = curl_share_setopt(curl.share, CURLSHOPT_LOCKFUNC, share_lock); + errsh |= curl_share_setopt(curl.share, CURLSHOPT_UNLOCKFUNC, share_unlock); + errsh |= curl_share_setopt(curl.share, CURLSHOPT_SHARE, CURL_LOCK_DATA_DNS); + + if (errsh != 0) { + curl_share_cleanup(curl.share); + curl_global_cleanup(); + errno = EIO; + return -1; + } + + info = curl_version_info(CURLVERSION_NOW); + ksprintf(&curl.useragent, "htslib/%s libcurl/%s", version, info->version); + + self->name = "S3 Multipart Upload"; + self->destroy = s3_write_exit; + + 
hfile_add_scheme_handler("s3w", &handler); + hfile_add_scheme_handler("s3w+http", &handler); + hfile_add_scheme_handler("s3w+https", &handler); + + return 0; +} diff --git a/hts.c b/hts.c index 1a36f08e9..cb64f5bf7 100644 --- a/hts.c +++ b/hts.c @@ -1,6 +1,6 @@ /* hts.c -- format-neutral I/O, indexing, and iterator API functions. - Copyright (C) 2008, 2009, 2012-2017 Genome Research Ltd. + Copyright (C) 2008, 2009, 2012-2019 Genome Research Ltd. Copyright (C) 2012, 2013 Broad Institute. Author: Heng Li @@ -23,6 +23,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include @@ -30,7 +31,9 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include +#include #include #include #include @@ -44,21 +47,29 @@ DEALINGS IN THE SOFTWARE. */ #include "version.h" #include "hts_internal.h" #include "hfile_internal.h" +#include "sam_internal.h" #include "htslib/hts_os.h" // drand48 #include "htslib/khash.h" #include "htslib/kseq.h" #include "htslib/ksort.h" +#include "htslib/tbx.h" + +#ifndef EFTYPE +#define EFTYPE ENOEXEC +#endif KHASH_INIT2(s2i,, kh_cstr_t, int64_t, 1, kh_str_hash_func, kh_str_hash_equal) +HTSLIB_EXPORT int hts_verbose = HTS_LOG_WARNING; const char *hts_version() { - return HTS_VERSION; + return HTS_VERSION_TEXT; } +HTSLIB_EXPORT const unsigned char seq_nt16_table[256] = { 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, @@ -79,8 +90,10 @@ const unsigned char seq_nt16_table[256] = { 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15 }; +HTSLIB_EXPORT const char seq_nt16_str[] = "=ACMGRSVTWYHKDBN"; +HTSLIB_EXPORT const int seq_nt16_int[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; /********************** @@ -93,6 +106,7 @@ static enum htsFormatCategory format_category(enum 
htsExactFormat fmt) case bam: case sam: case cram: + case fastq_format: return sequence_data; case vcf: @@ -102,6 +116,8 @@ static enum htsFormatCategory format_category(enum htsExactFormat fmt) case bai: case crai: case csi: + case fai_format: + case fqi_format: case gzi: case tbi: return index_file; @@ -109,12 +125,14 @@ static enum htsFormatCategory format_category(enum htsExactFormat fmt) case bed: return region_list; + case fasta_format: case htsget: return unknown_category; case unknown_format: case binary_format: case text_format: + case empty_format: case format_maximum: break; } @@ -122,14 +140,11 @@ static enum htsFormatCategory format_category(enum htsExactFormat fmt) return unknown_category; } -// Decompress up to ten or so bytes by peeking at the file, which must be +// Decompress several hundred bytes by peeking at the file, which must be // positioned at the start of a GZIP block. static size_t decompress_peek(hFILE *fp, unsigned char *dest, size_t destsize) { - // Typically at most a couple of hundred bytes of input are required - // to get a few bytes of output from inflate(), so hopefully this buffer - // size suffices in general. - unsigned char buffer[512]; + unsigned char buffer[2048]; z_stream zs; ssize_t npeek = hpeek(fp, buffer, sizeof buffer); @@ -195,29 +210,151 @@ cmp_nonblank(const char *key, const unsigned char *u, const unsigned char *ulim) return 0; } +static int is_text_only(const unsigned char *u, const unsigned char *ulim) +{ + for (; u < ulim; u++) + if (! 
(*u >= ' ' || *u == '\t' || *u == '\r' || *u == '\n')) + return 0; + + return 1; +} + +static int +secondline_is_bases(const unsigned char *u, const unsigned char *ulim) +{ + // Skip to second line, returning false if there isn't one + u = memchr(u, '\n', ulim - u); + if (u == NULL || ++u == ulim) return 0; + + // Scan over all base-encoding letters (including 'N' but not SEQ's '=') + while (u < ulim && (seq_nt16_table[*u] != 15 || toupper(*u) == 'N')) { + if (*u == '=') return 0; + u++; + } + + return (u == ulim || *u == '\r' || *u == '\n')? 1 : 0; +} + +// Parse tab-delimited text, filling in a string of column types and returning +// the number of columns spotted (within [u,ulim), and up to column_len) or -1 +// if non-printable characters were seen. Column types: +// i: integer, s: strand sign, C: CIGAR, O: SAM optional field, Z: anything +static int +parse_tabbed_text(char *columns, int column_len, + const unsigned char *u, const unsigned char *ulim, + int *complete) +{ + const char *str = (const char *) u; + const char *slim = (const char *) ulim; + const char *s; + int ncolumns = 0; + + enum { digit = 1, leading_sign = 2, cigar_operator = 4, other = 8 }; + unsigned seen = 0; + *complete = 0; + + for (s = str; s < slim; s++) + if (*s >= ' ') { + if (isdigit_c(*s)) + seen |= digit; + else if ((*s == '+' || *s == '-') && s == str) + seen |= leading_sign; + else if (strchr(BAM_CIGAR_STR, *s) && s > str && isdigit_c(s[-1])) + seen |= cigar_operator; + else + seen |= other; + } + else if (*s == '\t' || *s == '\r' || *s == '\n') { + size_t len = s - str; + char type; + + if (seen == digit || seen == (leading_sign|digit)) type = 'i'; + else if (seen == (digit|cigar_operator)) type = 'C'; + else if (len == 1) + switch (str[0]) { + case '*': type = 'C'; break; + case '+': case '-': case '.': type = 's'; break; + default: type = 'Z'; break; + } + else if (len >= 5 && str[2] == ':' && str[4] == ':') type = 'O'; + else type = 'Z'; + + columns[ncolumns++] = type; + if (*s 
!= '\t' || ncolumns >= column_len - 1) { + *complete = 1; // finished the line or more columns than needed + break; + } + + str = s + 1; + seen = 0; + } + else return -1; + + columns[ncolumns] = '\0'; + return ncolumns; +} + +// Match COLUMNS as a prefix against PATTERN (so COLUMNS may run out first). +// Returns len(COLUMNS) (modulo '+'), or 0 if there is a mismatched entry. +static int colmatch(const char *columns, const char *pattern) +{ + int i; + for (i = 0; columns[i] != '\0'; i++) { + if (pattern[i] == '+') return i; + if (! (columns[i] == pattern[i] || pattern[i] == 'Z')) return 0; + } + + return i; +} + int hts_detect_format(hFILE *hfile, htsFormat *fmt) { - unsigned char s[32]; + char columns[24]; + unsigned char s[1024]; + int complete = 0; ssize_t len = hpeek(hfile, s, 18); if (len < 0) return -1; + fmt->category = unknown_category; + fmt->format = unknown_format; + fmt->version.major = fmt->version.minor = -1; + fmt->compression = no_compression; + fmt->compression_level = -1; + fmt->specific = NULL; + if (len >= 2 && s[0] == 0x1f && s[1] == 0x8b) { // The stream is either gzip-compressed or BGZF-compressed. - // Determine which, and decompress the first few bytes. + // Determine which, and decompress the first few records or lines. fmt->compression = (len >= 18 && (s[3] & 4) && memcmp(&s[12], "BC\2\0", 4) == 0)? bgzf : gzip; + if (len >= 9 && s[2] == 8) + fmt->compression_level = (s[8] == 2)? 9 : (s[8] == 4)? 1 : -1; + len = decompress_peek(hfile, s, sizeof s); } + else if (len >= 10 && memcmp(s, "BZh", 3) == 0 && + (memcmp(&s[4], "\x31\x41\x59\x26\x53\x59", 6) == 0 || + memcmp(&s[4], "\x17\x72\x45\x38\x50\x90", 6) == 0)) { + fmt->compression = bzip2_compression; + fmt->compression_level = s[3] - '0'; + // Decompressing via libbz2 produces no output until it has a whole + // block (of size 100Kb x level), which is too large for peeking. + // So unfortunately we can recognise bzip2 but not the contents, + // except that \x1772... 
magic indicates the stream is empty. + if (s[4] == '\x31') return 0; + else len = 0; + } else { - fmt->compression = no_compression; len = hpeek(hfile, s, sizeof s); } if (len < 0) return -1; - fmt->compression_level = -1; - fmt->specific = NULL; + if (len == 0) { + fmt->format = empty_format; + return 0; + } - if (len >= 6 && memcmp(s,"CRAM",4) == 0 && s[4]>=1 && s[4]<=3 && s[5]<=1) { + if (len >= 6 && memcmp(s,"CRAM",4) == 0 && s[4]>=1 && s[4]<=7 && s[5]<=7) { fmt->category = sequence_data; fmt->format = cram; fmt->version.major = s[4], fmt->version.minor = s[5]; @@ -260,7 +397,6 @@ int hts_detect_format(hFILE *hfile, htsFormat *fmt) else if (memcmp(s, "TBI\1", 4) == 0) { fmt->category = index_file; fmt->format = tbi; - fmt->version.major = -1, fmt->version.minor = -1; return 0; } } @@ -269,13 +405,12 @@ int hts_detect_format(hFILE *hfile, htsFormat *fmt) fmt->format = vcf; if (len >= 21 && s[16] == 'v') parse_version(fmt, &s[17], &s[len]); - else - fmt->version.major = fmt->version.minor = -1; return 0; } else if (len >= 4 && s[0] == '@' && (memcmp(s, "@HD\t", 4) == 0 || memcmp(s, "@SQ\t", 4) == 0 || - memcmp(s, "@RG\t", 4) == 0 || memcmp(s, "@PG\t", 4) == 0)) { + memcmp(s, "@RG\t", 4) == 0 || memcmp(s, "@PG\t", 4) == 0 || + memcmp(s, "@CO\t", 4) == 0)) { fmt->category = sequence_data; fmt->format = sam; // @HD-VN is not guaranteed to be the first tag, but then @HD is @@ -289,26 +424,55 @@ int hts_detect_format(hFILE *hfile, htsFormat *fmt) else if (cmp_nonblank("{\"htsget\":", s, &s[len]) == 0) { fmt->category = unknown_category; fmt->format = htsget; - fmt->version.major = fmt->version.minor = -1; return 0; } - else { - // Various possibilities for tab-delimited text: - // .crai (gzipped tab-delimited six columns: seqid 5*number) - // .bed ([3..12] tab-delimited columns) - // .bedpe (>= 10 tab-delimited columns) - // .sam (tab-delimited >= 11 columns: seqid number seqid...) 
- // FIXME For now, assume it's SAM + else if (len >= 1 && s[0] == '>' && secondline_is_bases(s, &s[len])) { + fmt->format = fasta_format; + return 0; + } + else if (len >= 1 && s[0] == '@' && secondline_is_bases(s, &s[len])) { fmt->category = sequence_data; - fmt->format = sam; - fmt->version.major = 1, fmt->version.minor = -1; + fmt->format = fastq_format; return 0; } + else if (parse_tabbed_text(columns, sizeof columns, s, + &s[len], &complete) > 0) { + // A complete SAM line is at least 11 columns. On unmapped long reads may + // be missing two. (On mapped long reads we must have an @ header so long + // CIGAR is irrelevant.) + if (colmatch(columns, "ZiZiiCZiiZZOOOOOOOOOOOOOOOOOOOO+") + >= 9 + 2*complete) { + fmt->category = sequence_data; + fmt->format = sam; + fmt->version.major = 1, fmt->version.minor = -1; + return 0; + } + else if (fmt->compression == gzip && colmatch(columns, "iiiiii") == 6) { + fmt->category = index_file; + fmt->format = crai; + return 0; + } + else if (colmatch(columns, "Ziiiii") == 6) { + fmt->category = index_file; + fmt->format = fqi_format; + return 0; + } + else if (colmatch(columns, "Ziiii") == 5) { + fmt->category = index_file; + fmt->format = fai_format; + return 0; + } + else if (colmatch(columns, "Zii+") >= 3) { + fmt->category = region_list; + fmt->format = bed; + return 0; + } + } - fmt->category = unknown_category; - fmt->format = unknown_format; - fmt->version.major = fmt->version.minor = -1; - fmt->compression = no_compression; + // Arbitrary text files can be read using hts_getline(). + if (is_text_only(s, &s[len])) fmt->format = text_format; + + // Nothing recognised: leave unset fmt-> fields as unknown. 
return 0; } @@ -320,6 +484,8 @@ char *hts_format_description(const htsFormat *format) case sam: kputs("SAM", &str); break; case bam: kputs("BAM", &str); break; case cram: kputs("CRAM", &str); break; + case fasta_format: kputs("FASTA", &str); break; + case fastq_format: kputs("FASTQ", &str); break; case vcf: kputs("VCF", &str); break; case bcf: if (format->version.major == 1) kputs("Legacy BCF", &str); @@ -328,8 +494,13 @@ char *hts_format_description(const htsFormat *format) case bai: kputs("BAI", &str); break; case crai: kputs("CRAI", &str); break; case csi: kputs("CSI", &str); break; + case fai_format: kputs("FASTA-IDX", &str); break; + case fqi_format: kputs("FASTQ-IDX", &str); break; + case gzi: kputs("GZI", &str); break; case tbi: kputs("Tabix", &str); break; + case bed: kputs("BED", &str); break; case htsget: kputs("htsget", &str); break; + case empty_format: kputs("empty", &str); break; default: kputs("unknown", &str); break; } @@ -343,6 +514,7 @@ char *hts_format_description(const htsFormat *format) } switch (format->compression) { + case bzip2_compression: kputs(" bzip2-compressed", &str); break; case custom: kputs(" compressed", &str); break; case gzip: kputs(" gzip-compressed", &str); break; case bgzf: @@ -372,14 +544,22 @@ char *hts_format_description(const htsFormat *format) if (format->compression == no_compression) switch (format->format) { + case text_format: case sam: case crai: case vcf: case bed: + case fai_format: + case fqi_format: + case fasta_format: + case fastq_format: case htsget: kputs(" text", &str); break; + case empty_format: + break; + default: kputs(" data", &str); break; @@ -392,13 +572,14 @@ char *hts_format_description(const htsFormat *format) htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt) { - char smode[102], *cp, *cp2, *mode_c; + char smode[101], *cp, *cp2, *mode_c; htsFile *fp = NULL; - hFILE *hfile; + hFILE *hfile = NULL; char fmt_code = '\0'; + const char format_to_mode[] = 
"\0g\0\0b\0c\0\0b\0g\0\0"; - strncpy(smode, mode, 100); - smode[100]=0; + strncpy(smode, mode, 99); + smode[99]=0; if ((cp = strchr(smode, ','))) *cp = '\0'; @@ -414,11 +595,27 @@ htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt) mode_c = cp2; *cp2++ = fmt_code; *cp2++ = 0; - *cp2++ = 0; // Set or reset the format code if opts->format is used - if (fmt && fmt->format != unknown_format) - *mode_c = "\0g\0\0b\0c\0\0b\0g\0\0"[fmt->format]; + if (fmt && fmt->format > unknown_format + && fmt->format < sizeof(format_to_mode)) { + *mode_c = format_to_mode[fmt->format]; + } + + // If we really asked for a compressed text format then mode_c above will + // point to nul. We set to 'z' to enable bgzf. + if (strchr(mode, 'w') && fmt && fmt->compression == bgzf) { + if (fmt->format == sam || fmt->format == vcf || fmt->format == text_format) + *mode_c = 'z'; + } + + char *rmme = NULL, *fnidx = strstr(fn, HTS_IDX_DELIM); + if ( fnidx ) { + rmme = strdup(fn); + if ( !rmme ) goto error; + rmme[fnidx-fn] = 0; + fn = rmme; + } hfile = hopen(fn, smode); if (hfile == NULL) goto error; @@ -426,14 +623,27 @@ htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt) fp = hts_hopen(hfile, fn, smode); if (fp == NULL) goto error; + // Compensate for the loss of exactness in htsExactFormat. + // hts_hopen returns generics such as binary or text, but we + // have been given something explicit here so use that instead. + if (fp->is_write && fmt && + (fmt->format == bam || fmt->format == sam || + fmt->format == vcf || fmt->format == bcf || + fmt->format == bed || fmt->format == fasta_format || + fmt->format == fastq_format)) + fp->format.format = fmt->format; + if (fmt && fmt->specific) if (hts_opt_apply(fp, fmt->specific) != 0) goto error; + if ( rmme ) free(rmme); return fp; error: - hts_log_error("Failed to open file %s", fn); + hts_log_error("Failed to open file \"%s\"%s%s", fn, + errno ? " : " : "", errno ? 
strerror(errno) : ""); + if ( rmme ) free(rmme); if (hfile) hclose_abruptly(hfile); @@ -639,6 +849,9 @@ int hts_opt_apply(htsFile *fp, hts_opt *opts) { for (; opts; opts = (last=opts)->next) { switch (opts->opt) { case CRAM_OPT_REFERENCE: + if (!(fp->fn_aux = strdup(opts->val.s))) + return -1; + // fall through case CRAM_OPT_VERSION: case CRAM_OPT_PREFIX: if (hts_set_opt(fp, opts->opt, opts->val.s) != 0) @@ -727,6 +940,11 @@ int hts_parse_format(htsFormat *format, const char *str) { format->format = sam; format->compression = no_compression;; format->compression_level = 0; + } else if (strcmp(fmt, "sam.gz") == 0) { + format->category = sequence_data; + format->format = sam; + format->compression = bgzf; + format->compression_level = -1; } else if (strcmp(fmt, "bam") == 0) { format->category = sequence_data; format->format = bam; @@ -865,7 +1083,11 @@ htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode) fp->is_cram = 1; break; + case empty_format: case text_format: + case bed: + case fasta_format: + case fastq_format: case sam: case vcf: if (fp->format.compression != no_compression) { @@ -878,7 +1100,7 @@ htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode) break; default: - errno = ENOEXEC; + errno = EFTYPE; goto error; } @@ -930,13 +1152,19 @@ int hts_close(htsFile *fp) ret = cram_close(fp->fp.cram); break; + case empty_format: case text_format: + case bed: + case fasta_format: + case fastq_format: case sam: case vcf: + ret = sam_state_destroy(fp); + if (fp->format.compression != no_compression) - ret = bgzf_close(fp->fp.bgzf); + ret |= bgzf_close(fp->fp.bgzf); else - ret = hclose(fp->fp.hfile); + ret |= hclose(fp->fp.hfile); break; default: @@ -945,6 +1173,8 @@ int hts_close(htsFile *fp) } save = errno; + sam_hdr_destroy(fp->bam_header); + hts_idx_destroy(fp->idx); free(fp->fn); free(fp->fn_aux); free(fp->line.s); @@ -971,20 +1201,28 @@ const char *hts_format_file_extension(const htsFormat *format) { case vcf: return "vcf"; case bcf: 
return "bcf"; case csi: return "csi"; + case fai_format: return "fai"; + case fqi_format: return "fqi"; case gzi: return "gzi"; case tbi: return "tbi"; case bed: return "bed"; + case fasta_format: return "fa"; + case fastq_format: return "fq"; default: return "?"; } } static hFILE *hts_hfile(htsFile *fp) { switch (fp->format.format) { - case binary_format: // fall through; still valid if bcf? + case binary_format:// fall through + case bcf: // fall through case bam: return bgzf_hfile(fp->fp.bgzf); case cram: return cram_hfile(fp->fp.cram); case text_format: return fp->fp.hfile; - case sam: return fp->fp.hfile; + case vcf: // fall through + case sam: return fp->format.compression != no_compression + ? bgzf_hfile(fp->fp.bgzf) + : fp->fp.hfile; default: return NULL; } } @@ -1059,7 +1297,9 @@ BGZF *hts_get_bgzfp(htsFile *fp); int hts_set_threads(htsFile *fp, int n) { - if (fp->format.compression == bgzf) { + if (fp->format.format == sam) { + return sam_set_threads(fp, n); + } else if (fp->format.compression == bgzf) { return bgzf_mt(hts_get_bgzfp(fp), n, 256/*unused*/); } else if (fp->format.format == cram) { return hts_set_opt(fp, CRAM_OPT_NTHREADS, n); @@ -1068,7 +1308,9 @@ int hts_set_threads(htsFile *fp, int n) } int hts_set_thread_pool(htsFile *fp, htsThreadPool *p) { - if (fp->format.compression == bgzf) { + if (fp->format.format == sam || fp->format.format == text_format) { + return sam_set_thread_pool(fp, p); + } else if (fp->format.compression == bgzf) { return bgzf_thread_pool(hts_get_bgzfp(fp), p->pool, p->qsize); } else if (fp->format.format == cram) { return hts_set_opt(fp, CRAM_OPT_THREAD_POOL, p); @@ -1107,14 +1349,14 @@ BGZF *hts_get_bgzfp(htsFile *fp) else return NULL; } -int hts_useek(htsFile *fp, long uoffset, int where) +int hts_useek(htsFile *fp, off_t uoffset, int where) { if (fp->is_bgzf) return bgzf_useek(fp->fp.bgzf, uoffset, where); else return (hseek(fp->fp.hfile, uoffset, SEEK_SET) >= 0)? 
0 : -1; } -long hts_utell(htsFile *fp) +off_t hts_utell(htsFile *fp) { if (fp->is_bgzf) return bgzf_utell(fp->fp.bgzf); @@ -1133,7 +1375,7 @@ int hts_getline(htsFile *fp, int delimiter, kstring_t *str) switch (fp->format.compression) { case no_compression: str->l = 0; - ret = kgetline(str, (kgets_func *) hgets, fp->fp.hfile); + ret = kgetline2(str, (kgets_func2 *) hgetln, fp->fp.hfile); if (ret >= 0) ret = str->l; else if (herrno(fp->fp.hfile)) ret = -2, errno = herrno(fp->fp.hfile); else ret = -1; @@ -1279,8 +1521,8 @@ int hts_check_EOF(htsFile *fp) #define pair64_lt(a,b) ((a).u < (b).u) -KSORT_INIT(_off, hts_pair64_t, pair64_lt) -KSORT_INIT(_off_max, hts_pair64_max_t, pair64_lt) +KSORT_INIT_STATIC(_off, hts_pair64_t, pair64_lt) +KSORT_INIT_STATIC(_off_max, hts_pair64_max_t, pair64_lt) typedef struct { int32_t m, n; @@ -1292,7 +1534,7 @@ KHASH_MAP_INIT_INT(bin, bins_t) typedef khash_t(bin) bidx_t; typedef struct { - int32_t n, m; + hts_pos_t n, m; uint64_t *offset; } lidx_t; @@ -1304,9 +1546,11 @@ struct __hts_idx_t { bidx_t **bidx; lidx_t *lidx; uint8_t *meta; // MUST have a terminating NUL on the end + int tbi_n, last_tbi_tid; struct { uint32_t last_bin, save_bin; - int last_coor, last_tid, save_tid, finished; + hts_pos_t last_coor; + int last_tid, save_tid, finished; uint64_t last_off, save_off; uint64_t off_beg, off_end; uint64_t n_mapped, n_unmapped; @@ -1352,7 +1596,8 @@ static inline int insert_to_b(bidx_t *b, int bin, uint64_t beg, uint64_t end) static inline int insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t offset, int min_shift) { - int i, beg, end; + int i; + hts_pos_t beg, end; beg = _beg >> min_shift; end = (_end - 1) >> min_shift; if (l->m < end + 1) { @@ -1393,6 +1638,8 @@ hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_l idx->lidx = (lidx_t*) calloc(n, sizeof(lidx_t)); if (idx->lidx == NULL) { free(idx->bidx); free(idx); return NULL; } } + idx->tbi_n = -1; + idx->last_tbi_tid = -1; return idx; } @@ 
-1433,12 +1680,12 @@ static void update_loff(hts_idx_t *idx, int i, int free_lidx) } } -static void compress_binning(hts_idx_t *idx, int i) +static int compress_binning(hts_idx_t *idx, int i) { bidx_t *bidx = idx->bidx[i]; khint_t k; int l, m; - if (bidx == 0) return; + if (bidx == 0) return 0; // merge a bin to its parent if the bin is too small for (l = idx->n_lvls; l > 0; --l) { unsigned start = hts_bin_first(l); @@ -1453,9 +1700,14 @@ static void compress_binning(hts_idx_t *idx, int i) if (kp == kh_end(bidx)) continue; q = &kh_val(bidx, kp); if (q->n + p->n > q->m) { - q->m = q->n + p->n; - kroundup32(q->m); - q->list = (hts_pair64_t*)realloc(q->list, q->m * sizeof(hts_pair64_t)); + uint32_t new_m = q->n + p->n; + hts_pair64_t *new_list; + kroundup32(new_m); + if (new_m > INT32_MAX) return -1; // Limited by index format + new_list = realloc(q->list, new_m * sizeof(*new_list)); + if (!new_list) return -1; + q->m = new_m; + q->list = new_list; } memcpy(q->list + q->n, p->list, p->n * sizeof(hts_pair64_t)); q->n += p->n; @@ -1478,32 +1730,63 @@ static void compress_binning(hts_idx_t *idx, int i) } p->n = m + 1; } + return 0; } -void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset) +int hts_idx_finish(hts_idx_t *idx, uint64_t final_offset) { - int i; - if (idx == NULL || idx->z.finished) return; // do not run this function on an empty index or multiple times + int i, ret = 0; + if (idx == NULL || idx->z.finished) return 0; // do not run this function on an empty index or multiple times if (idx->z.save_tid >= 0) { - insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin, idx->z.save_off, final_offset); - insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.off_beg, final_offset); - insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.n_mapped, idx->z.n_unmapped); + ret |= insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin, idx->z.save_off, final_offset); + ret |= insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.off_beg, 
final_offset); + ret |= insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.n_mapped, idx->z.n_unmapped); } for (i = 0; i < idx->n; ++i) { update_loff(idx, i, (idx->fmt == HTS_FMT_CSI)); - compress_binning(idx, i); + ret |= compress_binning(idx, i); } idx->z.finished = 1; + + return ret; } -int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped) +int hts_idx_check_range(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end) { - int bin; int64_t maxpos = (int64_t) 1 << (idx->min_shift + idx->n_lvls * 3); - if (tid<0) beg = -1, end = 0; - if (tid >= 0 && (beg > maxpos || end > maxpos)) { - goto pos_too_big; + if (tid < 0 || (beg <= maxpos && end <= maxpos)) + return 0; + int64_t max = end > beg ? end : beg, s = 1 << 14; + int n_lvls = 0; + while (max > s) { + n_lvls++; + s <<= 3; + } + + if (idx->fmt == HTS_FMT_CSI) { + hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos" cannot be stored in a csi index " + "with min_shift = %d, n_lvls = %d. Try using " + "min_shift = 14, n_lvls >= %d", + beg, end, + idx->min_shift, idx->n_lvls, + n_lvls); + } else { + hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos" cannot be stored in a %s index. " + "Try using a csi index with min_shift = 14, " + "n_lvls >= %d", + beg, end, idx_format_name(idx->fmt), + n_lvls); } + errno = ERANGE; + return -1; +} + +int hts_idx_push(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t offset, int is_mapped) +{ + int bin; + if (tid<0) beg = -1, end = 0; + if (hts_idx_check_range(idx, tid, beg, end) < 0) + return -1; if (tid >= idx->m) { // enlarge the index uint32_t new_m = idx->m * 2 > tid + 1 ? 
idx->m * 2 : tid + 1; bidx_t **new_bidx; @@ -1537,7 +1820,12 @@ int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int idx->z.last_tid = tid; idx->z.last_bin = 0xffffffffu; } else if (tid >= 0 && idx->z.last_coor > beg) { // test if positions are out of order - hts_log_error("Unsorted positions on sequence #%d: %d followed by %d", tid+1, idx->z.last_coor+1, beg+1); + hts_log_error("Unsorted positions on sequence #%d: %"PRIhts_pos" followed by %"PRIhts_pos, tid+1, idx->z.last_coor+1, beg+1); + return -1; + } + else if (end < beg) { + // Malformed ranges are errors. (Empty ranges (beg==end) are unusual but acceptable.) + hts_log_error("Invalid record on sequence #%d: end %"PRId64" < begin %"PRId64, tid+1, end, beg+1); return -1; } if ( tid>=0 ) @@ -1577,32 +1865,51 @@ int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int idx->z.last_off = offset; idx->z.last_coor = beg; return 0; +} - pos_too_big: { - int64_t max = end > beg ? end : beg, s = 1 << 14; - int n_lvls = 0; - while (max > s) { - n_lvls++; - s <<= 3; - } - - if (idx->fmt == HTS_FMT_CSI) { - hts_log_error("Region %d..%d cannot be stored in a csi index " - "with min_shift = %d, n_lvls = %d. Try using " - "min_shift = 14, n_lvls >= %d", - beg, end, - idx->min_shift, idx->n_lvls, - n_lvls); - } else { - hts_log_error("Region %d..%d cannot be stored in a %s index. " - "Try using a csi index with min_shift = 14, " - "n_lvls >= %d", - beg, end, idx_format_name(idx->fmt), - n_lvls); - } - errno = ERANGE; +// Needed for TBI only. Ensure 'tid' with 'name' is in the index meta data. +// idx->meta needs to have been initialsed first with an appropriate Tabix +// configuration via hts_idx_set_meta. +// +// NB number of references (first 4 bytes of tabix header) aren't in +// idx->meta, but held in idx->n instead. +int hts_idx_tbi_name(hts_idx_t *idx, int tid, const char *name) { + // Horrid - we have to map incoming tid to a tbi alternative tid. 
+ // This is because TBI counts tids by "covered" refs while everything + // else counts by Nth SQ/contig record in header. + if (tid == idx->last_tbi_tid || tid < 0 || !name) + return idx->tbi_n; + + uint32_t len = strlen(name)+1; + uint8_t *tmp = (uint8_t *)realloc(idx->meta, idx->l_meta + len); + if (!tmp) return -1; - } + + // Append name + idx->meta = tmp; + strcpy((char *)idx->meta + idx->l_meta, name); + idx->l_meta += len; + + // Update seq length + u32_to_le(le_to_u32(idx->meta+24)+len, idx->meta+24); + + idx->last_tbi_tid = tid; + return ++idx->tbi_n; +} + +// When doing samtools index we have a read_bam / hts_idx_push(bgzf_tell()) +// loop. idx->z.last_off is the previous bzgf_tell location, so we know +// the location the current bam record started at as well as where it ends. +// +// When building an index on the fly via a write_bam / hts_idx_push loop, +// this isn't quite identical as we may amend the virtual coord returned +// by bgzf_tell to the start of a new block if the next bam struct doesn't +// fit. It's essentially the same thing, but for bit-identical indices +// we need to amend the idx->z.last_off when we know we're starting a new +// block. +void hts_idx_amend_last(hts_idx_t *idx, uint64_t offset) +{ + idx->z.last_off = offset; } void hts_idx_destroy(hts_idx_t *idx) @@ -1632,6 +1939,10 @@ void hts_idx_destroy(hts_idx_t *idx) free(idx); } +int hts_idx_fmt(hts_idx_t *idx) { + return idx->fmt; +} + // The optimizer eliminates these ed_is_big() calls; still it would be good to // TODO Determine endianness at configure- or compile-time @@ -1668,7 +1979,18 @@ static int hts_idx_save_core(const hts_idx_t *idx, BGZF *fp, int fmt) #define check(ret) if ((ret) < 0) return -1 - check(idx_write_int32(fp, idx->n)); + // VCF TBI/CSI only writes IDs for non-empty bins (ie covered references) + // + // NOTE: CSI meta is undefined in spec, so this code has an assumption + // that we're only using it for Tabix data. 
+ int nids = idx->n; + if (idx->meta && idx->l_meta >= 4 && le_to_u32(idx->meta) == TBX_VCF) { + for (i = nids = 0; i < idx->n; ++i) { + if (idx->bidx[i]) + nids++; + } + } + check(idx_write_int32(fp, nids)); if (fmt == HTS_FMT_TBI && idx->l_meta) check(bgzf_write(fp, idx->meta, idx->l_meta)); @@ -1676,8 +1998,10 @@ static int hts_idx_save_core(const hts_idx_t *idx, BGZF *fp, int fmt) khint_t k; bidx_t *bidx = idx->bidx[i]; lidx_t *lidx = &idx->lidx[i]; + // write binning index - check(idx_write_int32(fp, bidx? kh_size(bidx) : 0)); + if (nids == idx->n || bidx) + check(idx_write_int32(fp, bidx? kh_size(bidx) : 0)); if (bidx) for (k = kh_begin(bidx); k != kh_end(bidx); ++k) if (kh_exist(bidx, k)) { @@ -1687,6 +2011,7 @@ static int hts_idx_save_core(const hts_idx_t *idx, BGZF *fp, int fmt) //int j;for(j=0;jn;++j)fprintf(stderr,"%d,%llx,%d,%llx:%llx\n",kh_key(bidx,k),kh_val(bidx, k).loff,j,p->list[j].u,p->list[j].v); check(idx_write_int32(fp, p->n)); for (j = 0; j < p->n; ++j) { + //fprintf(stderr, "\t%ld\t%ld\n", p->list[j].u, p->list[j].v); check(idx_write_uint64(fp, p->list[j].u)); check(idx_write_uint64(fp, p->list[j].v)); } @@ -1759,7 +2084,7 @@ int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int return -1; } -static int hts_idx_load_core(hts_idx_t *idx, BGZF *fp, int fmt) +static int idx_read_core(hts_idx_t *idx, BGZF *fp, int fmt) { int32_t i, n, is_be; is_be = ed_is_big(); @@ -1773,12 +2098,14 @@ static int hts_idx_load_core(hts_idx_t *idx, BGZF *fp, int fmt) h = idx->bidx[i] = kh_init(bin); if (bgzf_read(fp, &n, 4) != 4) return -1; if (is_be) ed_swap_4p(&n); + if (n < 0) return -3; for (j = 0; j < n; ++j) { khint_t k; if (bgzf_read(fp, &key, 4) != 4) return -1; if (is_be) ed_swap_4p(&key); k = kh_put(bin, h, key, &absent); - if (absent <= 0) return -3; // Duplicate bin number + if (absent < 0) return -2; // No memory + if (absent == 0) return -3; // Duplicate bin number p = &kh_val(h, k); if (fmt == HTS_FMT_CSI) { if 
(bgzf_read(fp, &p->loff, 8) != 8) return -1; @@ -1786,16 +2113,20 @@ static int hts_idx_load_core(hts_idx_t *idx, BGZF *fp, int fmt) } else p->loff = 0; if (bgzf_read(fp, &p->n, 4) != 4) return -1; if (is_be) ed_swap_4p(&p->n); + if (p->n < 0) return -3; + if ((size_t) p->n > SIZE_MAX / sizeof(hts_pair64_t)) return -2; p->m = p->n; p->list = (hts_pair64_t*)malloc(p->m * sizeof(hts_pair64_t)); if (p->list == NULL) return -2; - if (bgzf_read(fp, p->list, p->n<<4) != p->n<<4) return -1; + if (bgzf_read(fp, p->list, ((size_t) p->n)<<4) != ((size_t) p->n)<<4) return -1; if (is_be) swap_bins(p); } if (fmt != HTS_FMT_CSI) { // load linear index int j; if (bgzf_read(fp, &l->n, 4) != 4) return -1; if (is_be) ed_swap_4p(&l->n); + if (l->n < 0) return -3; + if ((size_t) l->n > SIZE_MAX / sizeof(uint64_t)) return -2; l->m = l->n; l->offset = (uint64_t*)malloc(l->n * sizeof(uint64_t)); if (l->offset == NULL) return -2; @@ -1811,7 +2142,7 @@ static int hts_idx_load_core(hts_idx_t *idx, BGZF *fp, int fmt) return 0; } -static hts_idx_t *hts_idx_load_local(const char *fn) +static hts_idx_t *idx_read(const char *fn) { uint8_t magic[4]; int i, is_be; @@ -1835,11 +2166,12 @@ static hts_idx_t *hts_idx_load_local(const char *fn) } if (bgzf_read(fp, &n, 4) != 4) goto fail; if (is_be) ed_swap_4p(&n); + if (n > INT32_MAX) goto fail; if ((idx = hts_idx_init(n, HTS_FMT_CSI, 0, x[0], x[1])) == NULL) goto fail; idx->l_meta = x[2]; idx->meta = meta; meta = NULL; - if (hts_idx_load_core(idx, fp, HTS_FMT_CSI) < 0) goto fail; + if (idx_read_core(idx, fp, HTS_FMT_CSI) < 0) goto fail; } else if (memcmp(magic, "TBI\1", 4) == 0) { uint8_t x[8 * 4]; @@ -1847,6 +2179,7 @@ static hts_idx_t *hts_idx_load_local(const char *fn) // Read file header if (bgzf_read(fp, x, sizeof(x)) != sizeof(x)) goto fail; n = le_to_u32(&x[0]); // location of n_ref + if (n > INT32_MAX) goto fail; if ((idx = hts_idx_init(n, HTS_FMT_TBI, 0, 14, 5)) == NULL) goto fail; n = le_to_u32(&x[7*4]); // location of l_nm if (n > 
UINT32_MAX - 29) goto fail; // Prevent possible overflow @@ -1859,14 +2192,15 @@ static hts_idx_t *hts_idx_load_local(const char *fn) if (bgzf_read(fp, idx->meta + 28, n) != n) goto fail; // Prevent possible strlen past the end in tbx_index_load2 idx->meta[idx->l_meta] = '\0'; - if (hts_idx_load_core(idx, fp, HTS_FMT_TBI) < 0) goto fail; + if (idx_read_core(idx, fp, HTS_FMT_TBI) < 0) goto fail; } else if (memcmp(magic, "BAI\1", 4) == 0) { uint32_t n; if (bgzf_read(fp, &n, 4) != 4) goto fail; if (is_be) ed_swap_4p(&n); - idx = hts_idx_init(n, HTS_FMT_BAI, 0, 14, 5); - if (hts_idx_load_core(idx, fp, HTS_FMT_BAI) < 0) goto fail; + if (n > INT32_MAX) goto fail; + if ((idx = hts_idx_init(n, HTS_FMT_BAI, 0, 14, 5)) == NULL) goto fail; + if (idx_read_core(idx, fp, HTS_FMT_BAI) < 0) goto fail; } else { errno = EINVAL; goto fail; } @@ -1894,7 +2228,7 @@ int hts_idx_set_meta(hts_idx_t *idx, uint32_t l_meta, uint8_t *meta, if (!new_meta) return -1; memcpy(new_meta, meta, l); // Prevent possible strlen past the end in tbx_index_load2 - meta[l + 1] = '\0'; + new_meta[l] = '\0'; } if (idx->meta) free(idx->meta); idx->l_meta = l_meta; @@ -1956,13 +2290,15 @@ uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx) *** Iterator *** ****************/ +// Note: even with 32-bit hts_pos_t, end needs to be 64-bit here due to 1LL<= end) return 0; if (end >= 1LL<>s); e = t + (end>>s); n = e - b + 1; if (itr->bins.n + n > itr->bins.m) { itr->bins.m = itr->bins.n + n; @@ -1974,10 +2310,11 @@ static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shi return itr->bins.n; } -static inline int reg2intervals(hts_itr_multi_t *iter, const hts_idx_t *idx, int tid, int64_t beg, int64_t end, uint64_t min_off, uint64_t max_off, int min_shift, int n_lvls) +static inline int reg2intervals(hts_itr_t *iter, const hts_idx_t *idx, int tid, int64_t beg, int64_t end, uint64_t min_off, uint64_t max_off, int min_shift, int n_lvls) { int l, t, s; - int b, e, i, j; + int i, j; + hts_pos_t b, 
e; hts_pair64_max_t *off; bidx_t *bidx; khint_t k; @@ -2083,14 +2420,23 @@ uint64_t hts_itr_off(const hts_idx_t* idx, int tid) { return off0; } -hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec) +hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec) { int i, n_off, l, bin; - hts_pair64_t *off; + hts_pair64_max_t *off; khint_t k; bidx_t *bidx; uint64_t min_off, max_off; - hts_itr_t *iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t)); + hts_itr_t *iter; + + // It's possible to call this function with NULL idx iff + // tid is one of the special values HTS_IDX_REST or HTS_IDX_NONE + if (!idx && !(tid == HTS_IDX_REST || tid == HTS_IDX_NONE)) { + errno = EINVAL; + return NULL; + } + + iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t)); if (iter) { if (tid < 0) { uint64_t off = hts_itr_off(idx, tid); @@ -2106,8 +2452,14 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_re } } else { if (beg < 0) beg = 0; - if (end < beg) return 0; - if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) return 0; + if (end < beg) { + free(iter); + return NULL; + } + if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) { + free(iter); + return NULL; + } iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1; iter->readrec = readrec; @@ -2152,14 +2504,17 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_re iter->finished = 1; return iter; } - off = (hts_pair64_t*)calloc(n_off, sizeof(hts_pair64_t)); + off = (hts_pair64_max_t*)calloc(n_off, sizeof(hts_pair64_max_t)); for (i = n_off = 0; i < iter->bins.n; ++i) { if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) { int j; bins_t *p = &kh_value(bidx, k); for (j = 0; j < p->n; ++j) - if (p->list[j].v > min_off && p->list[j].u < max_off) - off[n_off++] = p->list[j]; + if (p->list[j].v > min_off && p->list[j].u < max_off) { + off[n_off].u = p->list[j].u; + 
off[n_off].v = p->list[j].v; + n_off++; + } } } @@ -2168,7 +2523,7 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_re iter->finished = 1; return iter; } - ks_introsort(_off, n_off, off); + ks_introsort(_off_max, n_off, off); // resolve completely contained adjacent blocks for (i = 1, l = 0; i < n_off; ++i) if (off[l].v < off[i].v) off[++l] = off[i]; @@ -2185,141 +2540,146 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_re iter->n_off = n_off; iter->off = off; } } + return iter; } -hts_itr_multi_t *hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_multi_t *iter) +int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter) { int i, j, l, n_off = 0, bin; hts_pair64_max_t *off = NULL; khint_t k; bidx_t *bidx; uint64_t min_off, max_off, t_off = (uint64_t)-1; - int tid, beg, end; + int tid; + hts_pos_t beg, end; hts_reglist_t *curr_reg; - if (iter) { - iter->i = -1; - for (i=0; in_reg; i++) { - - curr_reg = &iter->reg_list[i]; - tid = curr_reg->tid; - - if (tid < 0) { - t_off = hts_itr_off(idx, tid); - if (t_off != (uint64_t)-1) { - switch (tid) { - case HTS_IDX_NONE: - iter->finished = 1; - case HTS_IDX_START: - case HTS_IDX_REST: - iter->curr_off = t_off; - iter->n_reg = 0; - iter->reg_list = NULL; - iter->read_rest = 1; - return iter; - case HTS_IDX_NOCOOR: - iter->nocoor = 1; - iter->nocoor_off = t_off; - } + if (!idx || !iter || !iter->multi) + return -1; + + iter->i = -1; + for (i=0; in_reg; i++) { + + curr_reg = &iter->reg_list[i]; + tid = curr_reg->tid; + + if (tid < 0) { + t_off = hts_itr_off(idx, tid); + if (t_off != (uint64_t)-1) { + switch (tid) { + case HTS_IDX_NONE: + iter->finished = 1; + case HTS_IDX_START: + case HTS_IDX_REST: + iter->curr_off = t_off; + iter->n_reg = 0; + iter->reg_list = NULL; + iter->read_rest = 1; + return 0; + case HTS_IDX_NOCOOR: + iter->nocoor = 1; + iter->nocoor_off = t_off; } - } else { - if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL || !kh_size(bidx)) - 
continue; + } + } else { + if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL || !kh_size(bidx)) + continue; - for(j=0; jcount; j++) { - hts_pair32_t *curr_intv = &curr_reg->intervals[j]; - if (curr_intv->end < curr_intv->beg) - continue; + for(j=0; jcount; j++) { + hts_pair32_t *curr_intv = &curr_reg->intervals[j]; + if (curr_intv->end < curr_intv->beg) + continue; - beg = curr_intv->beg; - end = curr_intv->end; + beg = curr_intv->beg; + end = curr_intv->end; - /* Compute 'min_off' by searching the lowest level bin containing 'beg'. + /* Compute 'min_off' by searching the lowest level bin containing 'beg'. If the computed bin is not in the index, try the next bin to the left, belonging to the same parent. If it is the first sibling bin, try the parent bin. */ - bin = hts_bin_first(idx->n_lvls) + (beg>>idx->min_shift); - do { - int first; - k = kh_get(bin, bidx, bin); - if (k != kh_end(bidx)) break; - first = (hts_bin_parent(bin)<<3) + 1; - if (bin > first) --bin; - else bin = hts_bin_parent(bin); - } while (bin); - if (bin == 0) - k = kh_get(bin, bidx, bin); - min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0; - - // compute max_off: a virtual offset from a bin to the right of end - bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1; - if (bin >= idx->n_bins) bin = 0; - while (1) { + bin = hts_bin_first(idx->n_lvls) + (beg>>idx->min_shift); + do { + int first; + k = kh_get(bin, bidx, bin); + if (k != kh_end(bidx)) break; + first = (hts_bin_parent(bin)<<3) + 1; + if (bin > first) --bin; + else bin = hts_bin_parent(bin); + } while (bin); + if (bin == 0) + k = kh_get(bin, bidx, bin); + min_off = k != kh_end(bidx)? 
kh_val(bidx, k).loff : 0; + + // compute max_off: a virtual offset from a bin to the right of end + bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1; + if (bin >= idx->n_bins) bin = 0; + while (1) { // search for an extant bin by moving right, but moving up to the // parent whenever we get to a first child (which also covers falling // off the RHS, which wraps around and immediately goes up to bin 0) - while (bin % 8 == 1) bin = hts_bin_parent(bin); - if (bin == 0) { max_off = (uint64_t)-1; break; } - k = kh_get(bin, bidx, bin); - if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) { - max_off = kh_val(bidx, k).list[0].u; - break; - } - bin++; + while (bin % 8 == 1) bin = hts_bin_parent(bin); + if (bin == 0) { max_off = (uint64_t)-1; break; } + k = kh_get(bin, bidx, bin); + if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) { + max_off = kh_val(bidx, k).list[0].u; + break; } - - //convert coordinates to file offsets - reg2intervals(iter, idx, tid, beg, end, min_off, max_off, idx->min_shift, idx->n_lvls); + bin++; } + + //convert coordinates to file offsets + reg2intervals(iter, idx, tid, beg, end, min_off, max_off, idx->min_shift, idx->n_lvls); } } + } - off = iter->off; - n_off = iter->n_off; + off = iter->off; + n_off = iter->n_off; - if (n_off) { - ks_introsort(_off_max, n_off, off); - // resolve completely contained adjacent blocks - for (i = 1, l = 0; i < n_off; ++i) { - if (off[l].v < off[i].v) { - off[++l] = off[i]; - } else { - off[l].max = (off[i].max > off[l].max ? off[i].max : off[l].max); - } - } - n_off = l + 1; - // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing - for (i = 1; i < n_off; ++i) - if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u; - // merge adjacent blocks - for (i = 1, l = 0; i < n_off; ++i) { - if (off[l].v>>16 == off[i].u>>16) { - off[l].v = off[i].v; - off[l].max = (off[i].max > off[l].max ? 
off[i].max : off[l].max); - } else off[++l] = off[i]; + if (n_off) { + ks_introsort(_off_max, n_off, off); + // resolve completely contained adjacent blocks + for (i = 1, l = 0; i < n_off; ++i) { + if (off[l].v < off[i].v) { + off[++l] = off[i]; + } else { + off[l].max = (off[i].max > off[l].max ? off[i].max : off[l].max); } - n_off = l + 1; - iter->n_off = n_off; iter->off = off; } - - if(!n_off && !iter->nocoor) - iter->finished = 1; + n_off = l + 1; + // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing + for (i = 1; i < n_off; ++i) + if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u; + // merge adjacent blocks + for (i = 1, l = 0; i < n_off; ++i) { + if (off[l].v>>16 == off[i].u>>16) { + off[l].v = off[i].v; + off[l].max = (off[i].max > off[l].max ? off[i].max : off[l].max); + } else off[++l] = off[i]; + } + n_off = l + 1; + iter->n_off = n_off; iter->off = off; } - return iter; + + if(!n_off && !iter->nocoor) + iter->finished = 1; + + return 0; } -hts_itr_multi_t *hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_multi_t *iter) +int hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_t *iter) { const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; - int tid, beg, end, i, j, l, n_off = 0; + int tid, i, j, l, n_off = 0; + hts_pos_t beg, end; hts_reglist_t *curr_reg; hts_pair32_t *curr_intv; hts_pair64_max_t *off = NULL; cram_index *e = NULL; - if (!cidx || !iter) - return NULL; + if (!cidx || !iter || !iter->multi) + return -1; iter->is_cram = 1; iter->read_rest = 0; @@ -2336,7 +2696,7 @@ hts_itr_multi_t *hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_multi_t *iter) if (tid >= 0) { off = (hts_pair64_max_t*)realloc(off, (n_off + curr_reg->count) * sizeof(hts_pair64_max_t)); if (!off) - return NULL; + return -1; for (j=0; j < curr_reg->count; j++) { curr_intv = &curr_reg->intervals[j]; @@ -2347,7 +2707,7 @@ hts_itr_multi_t *hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_multi_t *iter) end = curr_intv->end; /* First, fetch 
the container overlapping 'beg' and assign its file offset to u, then - * find the container overlapping 'end' and assing the relative end of the slice to v. + * find the container overlapping 'end' and assign the relative end of the slice to v. * The cram_ptell function will adjust with the container offset, which is not stored * in the index. */ @@ -2366,10 +2726,10 @@ hts_itr_multi_t *hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_multi_t *iter) off[n_off].max = (uint64_t)tid<<32 | end; n_off++; } else { - hts_log_warning("Could not set offset end for region %d(%s):%d-%d. Skipping", tid, curr_reg->reg, beg, end); + hts_log_warning("Could not set offset end for region %d:%"PRIhts_pos"-%"PRIhts_pos". Skipping", tid, beg, end); } } else { - hts_log_warning("No index entry for region %d:%d-%d", tid, beg, end); + hts_log_warning("No index entry for region %d:%"PRIhts_pos"-%"PRIhts_pos"", tid, beg, end); } } } else { @@ -2435,33 +2795,18 @@ hts_itr_multi_t *hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_multi_t *iter) if(!n_off && !iter->nocoor) iter->finished = 1; - return iter; + return 0; } void hts_itr_destroy(hts_itr_t *iter) { - if (iter) { free(iter->off); free(iter->bins.a); free(iter); } -} - -void hts_reglist_free(hts_reglist_t *reglist, int count) { - - int i; - if(reglist) { - for (i=0;ireg_list && iter->n_reg) + if (iter->multi) hts_reglist_free(iter->reg_list, iter->n_reg); + else + free(iter->bins.a); - if (iter->off && iter->n_off) + if (iter->off) free(iter->off); free(iter); } @@ -2502,6 +2847,12 @@ long long hts_parse_decimal(const char *str, char **strend, int flags) if (esign == '-') e = -e; } + switch (*s) { + case 'k': case 'K': e += 3; s++; break; + case 'm': case 'M': e += 6; s++; break; + case 'g': case 'G': e += 9; s++; break; + } + e -= decimals; while (e > 0) n *= 10, e--; while (e < 0) lost += n % 10, n /= 10, e++; @@ -2513,25 +2864,239 @@ long long hts_parse_decimal(const char *str, char **strend, int flags) if (strend) { *strend = 
(char *)s; } else if (*s) { - hts_log_warning("Ignoring unknown characters after %.*s[%s]", (int)(s - str), str, s); + if ((flags & HTS_PARSE_THOUSANDS_SEP) || (!(flags & HTS_PARSE_THOUSANDS_SEP) && *s != ',')) + hts_log_warning("Ignoring unknown characters after %.*s[%s]", (int)(s - str), str, s); } return (sign == '+')? n : -n; } -const char *hts_parse_reg(const char *s, int *beg, int *end) +static void *hts_memrchr(const void *s, int c, size_t n) { + size_t i; + unsigned char *u = (unsigned char *)s; + for (i = n; i > 0; i--) { + if (u[i-1] == c) + return u+i-1; + } + + return NULL; +} + +/* + * A variant of hts_parse_reg which is reference-id aware. It uses + * the iterator name2id callbacks to validate the region tokenisation works. + * + * This is necessary due to GRCh38 HLA additions which have reference names + * like "HLA-DRB1*12:17". + * + * All parameters are mandatory. + * + * To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200" + * are reference names, we may quote using curly braces. + * Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example. + * + * Flags are used to control how parsing works, and can be one of the below. + * + * HTS_PARSE_LIST: + * If present, the region is assmed to be a comma separated list and + * position parsing will not contain commas (this implicitly + * clears HTS_PARSE_THOUSANDS_SEP in the call to hts_parse_decimal). + * On success the return pointer will be the start of the next region, ie + * the character after the comma. (If *ret != '\0' then the caller can + * assume another region is present in the list.) + * + * If not set then positions may contain commas. In this case the return + * value should point to the end of the string, or NULL on failure. + * + * HTS_PARSE_ONE_COORD: + * If present, X:100 is treated as the single base pair region X:100-100. + * In this case X:-100 is shorthand for X:1-100 and X:100- is X:100-. + * (This is the standard bcftools region convention.) 
+ * + * When not set X:100 is considered to be X:100- where is + * the end of chromosome X (set to INT_MAX here). X:100- and X:-100 are + * invalid. + * (This is the standard samtools region convention.) + * + * Note the supplied string expects 1 based inclusive coordinates, but the + * returned coordinates start from 0 and are half open, so pos0 is valid + * for use in e.g. "for (pos0 = beg; pos0 < end; pos0++) {...}" + * + * On success a pointer to the byte after the end of the entire region + * specifier is returned (plus any trailing comma), and tid, + * beg & end will be set. + * On failure NULL is returned. + */ +const char *hts_parse_region(const char *s, int *tid, hts_pos_t *beg, + hts_pos_t *end, hts_name2id_f getid, void *hdr, + int flags) +{ + if (!s || !tid || !beg || !end || !getid) + return NULL; + + size_t s_len = strlen(s); + kstring_t ks = { 0, 0, NULL }; + + const char *colon = NULL, *comma = NULL; + int quoted = 0; + + if (flags & HTS_PARSE_LIST) + flags &= ~HTS_PARSE_THOUSANDS_SEP; + else + flags |= HTS_PARSE_THOUSANDS_SEP; + + const char *s_end = s + s_len; + + // Braced quoting of references is permitted to resolve ambiguities. + if (*s == '{') { + const char *close = memchr(s, '}', s_len); + if (!close) { + hts_log_error("Mismatching braces in \"%s\"", s); + *tid = -1; + return NULL; + } + s++; + s_len--; + if (close[1] == ':') + colon = close+1; + quoted = 1; // number of trailing characters to trim + + // Truncate to this item only, if appropriate. + if (flags & HTS_PARSE_LIST) { + comma = strchr(close, ','); + if (comma) { + s_len = comma-s; + s_end = comma+1; + } + } + } else { + // Truncate to this item only, if appropriate. + if (flags & HTS_PARSE_LIST) { + comma = strchr(s, ','); + if (comma) { + s_len = comma-s; + s_end = comma+1; + } + } + + colon = hts_memrchr(s, ':', s_len); + } + + // No colon is simplest case; just check and return. 
+ if (colon == NULL) { + *beg = 0; *end = HTS_POS_MAX; + kputsn(s, s_len-quoted, &ks); // convert to nul terminated string + if (!ks.s) { + *tid = -2; + return NULL; + } + + *tid = getid(hdr, ks.s); + free(ks.s); + + return *tid >= 0 ? s_end : NULL; + } + + // Has a colon, but check whole name first. + if (!quoted) { + *beg = 0; *end = HTS_POS_MAX; + kputsn(s, s_len, &ks); // convert to nul terminated string + if (!ks.s) { + *tid = -2; + return NULL; + } + if ((*tid = getid(hdr, ks.s)) >= 0) { + // Entire name matches, but also check this isn't + // ambiguous. eg we have ref chr1 and ref chr1:100-200 + // both present. + ks.l = 0; + kputsn(s, colon-s, &ks); // convert to nul terminated string + if (!ks.s) { + *tid = -2; + return NULL; + } + if (getid(hdr, ks.s) >= 0) { + free(ks.s); + *tid = -1; + hts_log_error("Range is ambiguous. " + "Use {%s} or {%.*s}%s instead", + s, (int)(colon-s), s, colon); + return NULL; + } + free(ks.s); + + return s_end; + } + if (*tid < -1) // Failed to parse header + return NULL; + } + + // Quoted, or unquoted and whole string isn't a name. + // Check the pre-colon part is valid. + ks.l = 0; + kputsn(s, colon-s-quoted, &ks); // convert to nul terminated string + if (!ks.s) { + *tid = -2; + return NULL; + } + *tid = getid(hdr, ks.s); + free(ks.s); + if (*tid < 0) + return NULL; + + // Finally parse the post-colon coordinates + char *hyphen; + *beg = hts_parse_decimal(colon+1, &hyphen, flags) - 1; + if (*beg < 0) { + if (isdigit_c(*hyphen) || *hyphen == '\0' || *hyphen == ',') { + // interpret chr:-100 as chr:1-100 + *end = *beg==-1 ? HTS_POS_MAX : -(*beg+1); + *beg = 0; + return s_end; + } else if (*hyphen == '-') { + *beg = 0; + } else { + hts_log_error("Unexpected string \"%s\" after region", hyphen); + return NULL; + } + } + + if (*hyphen == '\0' || ((flags & HTS_PARSE_LIST) && *hyphen == ',')) { + *end = flags & HTS_PARSE_ONE_COORD ? 
*beg+1 : HTS_POS_MAX; + } else if (*hyphen == '-') { + *end = hts_parse_decimal(hyphen+1, &hyphen, flags); + if (*hyphen != '\0' && *hyphen != ',') { + hts_log_error("Unexpected string \"%s\" after region", hyphen); + return NULL; + } + } else { + hts_log_error("Unexpected string \"%s\" after region", hyphen); + return NULL; + } + + if (*end == 0) + *end = HTS_POS_MAX; // interpret chr:100- as chr:100- + + if (*beg >= *end) return NULL; + + return s_end; +} + +// Next release we should mark this as deprecated? +// Use hts_parse_region above instead. +const char *hts_parse_reg64(const char *s, hts_pos_t *beg, hts_pos_t *end) { char *hyphen; const char *colon = strrchr(s, ':'); if (colon == NULL) { - *beg = 0; *end = INT_MAX; + *beg = 0; *end = HTS_POS_MAX; return s + strlen(s); } *beg = hts_parse_decimal(colon+1, &hyphen, HTS_PARSE_THOUSANDS_SEP) - 1; if (*beg < 0) *beg = 0; - if (*hyphen == '\0') *end = INT_MAX; + if (*hyphen == '\0') *end = HTS_POS_MAX; else if (*hyphen == '-') *end = hts_parse_decimal(hyphen+1, NULL, HTS_PARSE_THOUSANDS_SEP); else return NULL; @@ -2539,46 +3104,51 @@ const char *hts_parse_reg(const char *s, int *beg, int *end) return colon; } +const char *hts_parse_reg(const char *s, int *beg, int *end) +{ + hts_pos_t beg64 = 0, end64 = 0; + const char *colon = hts_parse_reg64(s, &beg64, &end64); + if (beg64 > INT_MAX) { + hts_log_error("Position %"PRId64" too large", beg64); + return NULL; + } + if (end64 > INT_MAX) { + if (end64 == HTS_POS_MAX) { + end64 = INT_MAX; + } else { + hts_log_error("Position %"PRId64" too large", end64); + return NULL; + } + } + *beg = beg64; + *end = end64; + return colon; +} + hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec) { - int tid, beg, end; - const char *q; + int tid; + hts_pos_t beg, end; if (strcmp(reg, ".") == 0) return itr_query(idx, HTS_IDX_START, 0, 0, readrec); else if (strcmp(reg, "*") == 0) 
return itr_query(idx, HTS_IDX_NOCOOR, 0, 0, readrec); - q = hts_parse_reg(reg, &beg, &end); - if (q) { - char tmp_a[1024], *tmp = tmp_a; - if (q - reg + 1 > 1024) - if (!(tmp = malloc(q - reg + 1))) - return NULL; - strncpy(tmp, reg, q - reg); - tmp[q - reg] = 0; - tid = getid(hdr, tmp); - if (tmp != tmp_a) - free(tmp); - } - else { - // not parsable as a region, but possibly a sequence named "foo:a" - tid = getid(hdr, reg); - beg = 0; end = INT_MAX; - } + if (!hts_parse_region(reg, &tid, &beg, &end, getid, hdr, HTS_PARSE_THOUSANDS_SEP)) + return NULL; - if (tid < 0) return NULL; return itr_query(idx, tid, beg, end, readrec); } -hts_itr_multi_t *hts_itr_regions(const hts_idx_t *idx, hts_reglist_t *reglist, int count, hts_name2id_f getid, void *hdr, hts_itr_multi_query_func *itr_specific, hts_readrec_func *readrec, hts_seek_func *seek, hts_tell_func *tell) { +hts_itr_t *hts_itr_regions(const hts_idx_t *idx, hts_reglist_t *reglist, int count, hts_name2id_f getid, void *hdr, hts_itr_multi_query_func *itr_specific, hts_readrec_func *readrec, hts_seek_func *seek, hts_tell_func *tell) { int i; if (!reglist) return NULL; - hts_itr_multi_t *itr = (hts_itr_multi_t*)calloc(1, sizeof(hts_itr_multi_t)); + hts_itr_t *itr = (hts_itr_t*)calloc(1, sizeof(hts_itr_t)); if (itr) { itr->n_reg = count; itr->readrec = readrec; @@ -2587,33 +3157,49 @@ hts_itr_multi_t *hts_itr_regions(const hts_idx_t *idx, hts_reglist_t *reglist, i itr->reg_list = reglist; itr->finished = 0; itr->nocoor = 0; + itr->multi = 1; for (i = 0; i < itr->n_reg; i++) { - if (!strcmp(itr->reg_list[i].reg, ".")) { - itr->reg_list[i].tid = HTS_IDX_START; - continue; - } + if (itr->reg_list[i].reg) { + if (!strcmp(itr->reg_list[i].reg, ".")) { + itr->reg_list[i].tid = HTS_IDX_START; + continue; + } - if (!strcmp(itr->reg_list[i].reg, "*")) { - itr->reg_list[i].tid = HTS_IDX_NOCOOR; - continue; - } + if (!strcmp(itr->reg_list[i].reg, "*")) { + itr->reg_list[i].tid = HTS_IDX_NOCOOR; + continue; + } - 
itr->reg_list[i].tid = getid(hdr, reglist[i].reg); - if (itr->reg_list[i].tid < 0) - hts_log_warning("Region '%s' specifies an unknown reference name. Continue anyway", reglist[i].reg); + itr->reg_list[i].tid = getid(hdr, reglist[i].reg); + if (itr->reg_list[i].tid < 0) { + if (itr->reg_list[i].tid < -1) { + hts_log_error("Failed to parse header"); + hts_itr_destroy(itr); + return NULL; + } else { + hts_log_warning("Region '%s' specifies an unknown reference name. Continue anyway", reglist[i].reg); + } + } + } } qsort(itr->reg_list, itr->n_reg, sizeof(hts_reglist_t), compare_regions); - itr_specific(idx, itr); + if (itr_specific(idx, itr) != 0) { + hts_log_error("Failed to create the multi-region iterator!"); + hts_itr_destroy(itr); + itr = NULL; + } } + return itr; } int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data) { - int ret, tid, beg, end; + int ret, tid; + hts_pos_t beg, end; if (iter == NULL || iter->finished) return -1; if (iter->read_rest) { if (iter->curr_off) { // seek to the start @@ -2654,10 +3240,11 @@ int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data) return ret; } -int hts_itr_multi_next(htsFile *fd, hts_itr_multi_t *iter, void *r) +int hts_itr_multi_next(htsFile *fd, hts_itr_t *iter, void *r) { void *fp; - int ret, tid, beg, end, i, cr, ci; + int ret, tid, i, cr, ci; + hts_pos_t beg, end; hts_reglist_t *found_reg; if (iter == NULL || iter->finished) return -1; @@ -2671,15 +3258,15 @@ int hts_itr_multi_next(htsFile *fd, hts_itr_multi_t *iter, void *r) if (iter->read_rest) { if (iter->curr_off) { // seek to the start if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) { + hts_log_error("Seek at offset %" PRIu64 " failed.", iter->curr_off); return -1; } iter->curr_off = 0; // only seek once } ret = iter->readrec(fp, fd, r, &tid, &beg, &end); - if (ret < 0) { + if (ret < 0) iter->finished = 1; - } iter->curr_tid = tid; iter->curr_beg = beg; @@ -2693,24 +3280,39 @@ int hts_itr_multi_next(htsFile *fd, hts_itr_multi_t *iter, 
void *r) for (;;) { if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk if (iter->i == iter->n_off - 1) { // no more chunks, except NOCOORs - if (iter->nocoor) { - iter->read_rest = 1; - iter->curr_off = iter->nocoor_off; - - return hts_itr_multi_next(fd, iter, r); - } else { - ret = -1; break; - } - } + if (iter->nocoor) { + if (iter->seek(fp, iter->nocoor_off, SEEK_SET) < 0) { + hts_log_error("Seek at offset %" PRIu64 " failed.", iter->nocoor_off); + return -1; + } - if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek - if (iter->seek(fp, iter->off[iter->i+1].u, SEEK_SET) < 0) { + //The first slice covering the unmapped reads might contain a few mapped reads, so scroll + //forward until finding the first unmapped read. + do { + ret = iter->readrec(fp, fd, r, &tid, &beg, &end); + } while (tid >= 0 && ret >=0); + + if (ret < 0) + iter->finished = 1; + else + iter->read_rest = 1; + + iter->curr_off = 0; // don't seek any more + iter->curr_tid = tid; + iter->curr_beg = beg; + iter->curr_end = end; + + return ret; + } else { + ret = -1; break; + } + } else if (iter->i < iter->n_off - 1) { + iter->curr_off = iter->off[++iter->i].u; + if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) { + hts_log_error("Seek at offset %" PRIu64 " failed.", iter->curr_off); return -1; } - - iter->curr_off = iter->tell(fp); } - ++iter->i; } ret = iter->readrec(fp, fd, r, &tid, &beg, &end); @@ -2718,6 +3320,7 @@ int hts_itr_multi_next(htsFile *fd, hts_itr_multi_t *iter, void *r) break; iter->curr_off = iter->tell(fp); + if (tid != iter->curr_tid) { hts_reglist_t key; key.tid = tid; @@ -2759,64 +3362,107 @@ int hts_itr_multi_next(htsFile *fd, hts_itr_multi_t *iter, void *r) /********************** *** Retrieve index *** **********************/ +// Local_fn and local_len will return a sub-region of 'fn'. 
+// Eg http://elsewhere/dir/foo.bam.bai?a=b may return +// foo.bam.bai via local_fn and local_len. +// // Returns -1 if index couldn't be opened. // -2 on other errors -static int test_and_fetch(const char *fn, const char **local_fn) +static int idx_test_and_fetch(const char *fn, const char **local_fn, int *local_len, int download) { hFILE *remote_hfp; FILE *local_fp = NULL; uint8_t *buf = NULL; int save_errno; + htsFormat fmt; + kstring_t s = KS_INITIALIZE; if (hisremote(fn)) { const int buf_size = 1 * 1024 * 1024; int l; - const char *p; - for (p = fn + strlen(fn) - 1; p >= fn; --p) - if (*p == '/') break; - ++p; // p now points to the local file name + const char *p, *e; + // Ignore ?# params: eg any file.fmt?param=val, except for S3 URLs + e = fn + ((strncmp(fn, "s3://", 5) && strncmp(fn, "s3+http://", 10) && strncmp(fn, "s3+https://", 11)) ? strcspn(fn, "?#") : strcspn(fn, "?")); + // Find the previous slash from there. + p = e; + while (p > fn && *p != '/') p--; + if (*p == '/') p++; + // Attempt to open local file first - if ((local_fp = fopen((char*)p, "rb")) != 0) + kputsn(p, e-p, &s); + if ((local_fp = fopen(s.s, "rb")) != 0) { fclose(local_fp); + free(s.s); *local_fn = p; return 0; } - // Attempt to open remote file. Stay quiet on failure, it is OK to fail when trying first .csi then .tbi index. - if ((remote_hfp = hopen(fn, "r")) == 0) return -1; - if ((local_fp = fopen(p, "w")) == 0) { - hts_log_error("Failed to create file %s in the working directory", p); + + // Attempt to open remote file. Stay quiet on failure, it is OK to fail when trying first .csi then .bai or .tbi index. 
+ if ((remote_hfp = hopen(fn, "r")) == 0) { + hts_log_info("Failed to open index file '%s'", fn); + free(s.s); + return -1; + } + if (hts_detect_format(remote_hfp, &fmt)) { + hts_log_error("Failed to detect format of index file '%s'", fn); goto fail; } - hts_log_info("Downloading file %s to local directory", fn); - buf = (uint8_t*)calloc(buf_size, 1); - if (!buf) { - hts_log_error("%s", strerror(errno)); + if (fmt.category != index_file || (fmt.format != bai && fmt.format != csi && fmt.format != tbi + && fmt.format != crai)) { + hts_log_error("Format of index file '%s' is not supported", fn); goto fail; } - while ((l = hread(remote_hfp, buf, buf_size)) > 0) { - if (fwrite(buf, 1, l, local_fp) != l) { - hts_log_error("Failed to write data to %s : %s", - fn, strerror(errno)); + + if (download) { + if ((local_fp = fopen(s.s, "w")) == 0) { + hts_log_error("Failed to create file %s in the working directory", p); goto fail; } + hts_log_info("Downloading file %s to local directory", fn); + buf = (uint8_t*)calloc(buf_size, 1); + if (!buf) { + hts_log_error("%s", strerror(errno)); + goto fail; + } + while ((l = hread(remote_hfp, buf, buf_size)) > 0) { + if (fwrite(buf, 1, l, local_fp) != l) { + hts_log_error("Failed to write data to %s : %s", + fn, strerror(errno)); + free(buf); + goto fail; + } + } + free(buf); + if (l < 0) { + hts_log_error("Error reading \"%s\"", fn); + goto fail; + } + if (fclose(local_fp) < 0) { + hts_log_error("Error closing %s : %s", fn, strerror(errno)); + local_fp = NULL; + goto fail; + } + + *local_fn = p; + *local_len = e-p; + } else { + *local_fn = fn; + *local_len = e-fn; } - free(buf); - if (fclose(local_fp) < 0) { - hts_log_error("Error closing %s : %s", fn, strerror(errno)); - local_fp = NULL; - goto fail; - } + if (hclose(remote_hfp) != 0) { hts_log_error("Failed to close remote file %s", fn); } - *local_fn = p; + + free(s.s); return 0; } else { hFILE *local_hfp; if ((local_hfp = hopen(fn, "r")) == 0) return -1; hclose_abruptly(local_hfp); 
*local_fn = fn; + *local_len = strlen(fn); return 0; } @@ -2825,62 +3471,261 @@ static int test_and_fetch(const char *fn, const char **local_fn) hclose_abruptly(remote_hfp); if (local_fp) fclose(local_fp); free(buf); + free(s.s); errno = save_errno; return -2; } -char *hts_idx_getfn(const char *fn, const char *ext) -{ - int i, l_fn, l_ext, ret; +/* + * Check the existence of a local index file using part of the alignment file name. + * The order is alignment.bam.csi, alignment.csi, alignment.bam.bai, alignment.bai + * @param fn - pointer to the file name + * @param fnidx - pointer to the index file name placeholder + * @return 1 for success, 0 for failure + */ +int hts_idx_check_local(const char *fn, int fmt, char **fnidx) { + int i, l_fn, l_ext; + const char *fn_tmp = NULL; + char *fnidx_tmp; + char *csi_ext = ".csi"; + char *bai_ext = ".bai"; + char *tbi_ext = ".tbi"; + char *crai_ext = ".crai"; + + if (!fn) + return 0; + + if (hisremote(fn)) { + for (i = strlen(fn) - 1; i >= 0; --i) + if (fn[i] == '/') { + fn_tmp = (char *)&fn[i+1]; + break; + } + } else { + // Borrowed from hopen_fd_fileuri() + if (strncmp(fn, "file://localhost/", 17) == 0) fn_tmp = fn + 16; + else if (strncmp(fn, "file:///", 8) == 0) fn_tmp = fn + 7; + else fn_tmp = fn; +#if defined(_WIN32) || defined(__MSYS__) + // For cases like C:/foo + if (fn_tmp[0] == '/' && fn_tmp[1] && fn_tmp[2] == ':' && fn_tmp[3] == '/') + fn_tmp++; +#endif + } + + if (!fn_tmp) return 0; + hts_log_info("Using alignment file '%s'", fn_tmp); + l_fn = strlen(fn_tmp); l_ext = 5; + fnidx_tmp = (char*)calloc(l_fn + l_ext + 1, 1); + if (!fnidx_tmp) return 0; + + struct stat sbuf; + + // Try alignment.bam.csi first + strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, csi_ext); + if(stat(fnidx_tmp, &sbuf) == 0) { + *fnidx = fnidx_tmp; + return 1; + } else { // Then try alignment.csi + for (i = l_fn - 1; i > 0; --i) + if (fnidx_tmp[i] == '.') { + strcpy(fnidx_tmp + i, csi_ext); + if(stat(fnidx_tmp, &sbuf) == 0) { + *fnidx = 
fnidx_tmp; + return 1; + } + break; + } + } + if (fmt == HTS_FMT_BAI) { + // Next, try alignment.bam.bai + strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, bai_ext); + if(stat(fnidx_tmp, &sbuf) == 0) { + *fnidx = fnidx_tmp; + return 1; + } else { // And finally, try alignment.bai + for (i = l_fn - 1; i > 0; --i) + if (fnidx_tmp[i] == '.') { + strcpy(fnidx_tmp + i, bai_ext); + if(stat(fnidx_tmp, &sbuf) == 0) { + *fnidx = fnidx_tmp; + return 1; + } + break; + } + } + } else if (fmt == HTS_FMT_TBI) { // Or .tbi + strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, tbi_ext); + if(stat(fnidx_tmp, &sbuf) == 0) { + *fnidx = fnidx_tmp; + return 1; + } else { + for (i = l_fn - 1; i > 0; --i) + if (fnidx_tmp[i] == '.') { + strcpy(fnidx_tmp + i, tbi_ext); + if(stat(fnidx_tmp, &sbuf) == 0) { + *fnidx = fnidx_tmp; + return 1; + } + break; + } + } + } else if (fmt == HTS_FMT_CRAI) { // Or .crai + strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, crai_ext); + if(stat(fnidx_tmp, &sbuf) == 0) { + *fnidx = fnidx_tmp; + return 1; + } else { + for (i = l_fn - 1; i > 0; --i) + if (fnidx_tmp[i] == '.') { + strcpy(fnidx_tmp + i, crai_ext); + if(stat(fnidx_tmp, &sbuf) == 0) { + *fnidx = fnidx_tmp; + return 1; + } + break; + } + } + } + + free(fnidx_tmp); + return 0; +} + +static char *idx_filename(const char *fn, const char *ext, int download) { + int ret, local_len; char *fnidx; const char *local_fn = NULL; - l_fn = strlen(fn); l_ext = strlen(ext); - fnidx = (char*)calloc(l_fn + l_ext + 1, 1); - if (!fnidx) return NULL; + kstring_t buffer = KS_INITIALIZE; + // First try : append `ext` to `fn` - strcpy(fnidx, fn); strcpy(fnidx + l_fn, ext); - if ((ret = test_and_fetch(fnidx, &local_fn)) == -1) { + if (!(fnidx = haddextension(&buffer, fn, 0, ext))) { + free(buffer.s); + return NULL; + } + if ((ret = idx_test_and_fetch(fnidx, &local_fn, &local_len, download)) == -1) { // Second try : replace suffix of `fn` with `ext` - for (i = l_fn - 1; i > 0; --i) - if (fnidx[i] == '.' 
|| fnidx[i] == '/') break; - if (fnidx[i] == '.') { - strcpy(fnidx + i, ext); - ret = test_and_fetch(fnidx, &local_fn); + if (!(fnidx = haddextension(&buffer, fn, 1, ext))) { + free(buffer.s); + return NULL; } + ret = idx_test_and_fetch(fnidx, &local_fn, &local_len, download); } + if (ret < 0) { - free(fnidx); + free(buffer.s); return NULL; } - l_fn = strlen(local_fn); - memmove(fnidx, local_fn, l_fn + 1); + + memmove(fnidx, local_fn, local_len); + fnidx[local_len] = 0; return fnidx; } -hts_idx_t *hts_idx_load(const char *fn, int fmt) +char *hts_idx_getfn(const char *fn, const char *ext) { - char *fnidx; + return idx_filename(fn, ext, 1); +} + +static hts_idx_t *idx_find_and_load(const char *fn, int fmt, int flags) +{ + char *fnidx = strstr(fn, HTS_IDX_DELIM); hts_idx_t *idx; - fnidx = hts_idx_getfn(fn, ".csi"); - if (! fnidx) fnidx = hts_idx_getfn(fn, fmt == HTS_FMT_BAI? ".bai" : ".tbi"); - if (fnidx == 0) return 0; - idx = hts_idx_load2(fn, fnidx); + if ( fnidx ) { + char *fn2 = strdup(fn); + if (!fn2) { + hts_log_error("%s", strerror(errno)); + return NULL; + } + fn2[fnidx - fn] = '\0'; + fnidx += strlen(HTS_IDX_DELIM); + idx = hts_idx_load2(fn2, fnidx); + free(fn2); + return idx; + } + + if (hts_idx_check_local(fn, fmt, &fnidx) == 0 && hisremote(fn)) { + if (flags & HTS_IDX_SAVE_REMOTE) { + fnidx = hts_idx_getfn(fn, ".csi"); + if (!fnidx) { + switch (fmt) { + case HTS_FMT_BAI: fnidx = hts_idx_getfn(fn, ".bai"); break; + case HTS_FMT_TBI: fnidx = hts_idx_getfn(fn, ".tbi"); break; + default: break; + } + } + } else { + fnidx = idx_filename(fn, ".csi", 0); + if (!fnidx) { + switch (fmt) { + case HTS_FMT_BAI: fnidx = idx_filename(fn, ".bai", 0); break; + case HTS_FMT_TBI: fnidx = idx_filename(fn, ".tbi", 0); break; + default: break; + } + } + } + } + if (!fnidx) { + if (!(flags & HTS_IDX_SILENT_FAIL)) + hts_log_error("Could not retrieve index file for '%s'", fn); + return 0; + } + + if (flags & HTS_IDX_SAVE_REMOTE) + idx = hts_idx_load3(fn, fnidx, fmt, flags); + 
else + idx = idx_read(fnidx); free(fnidx); return idx; } +hts_idx_t *hts_idx_load(const char *fn, int fmt) { + return idx_find_and_load(fn, fmt, 1); +} + hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx) { + return hts_idx_load3(fn, fnidx, 0, 0); +} + +hts_idx_t *hts_idx_load3(const char *fn, const char *fnidx, int fmt, int flags) +{ + const char *local_fn = NULL; + char *local_fnidx = NULL; + int local_len; + if (!fnidx) + return idx_find_and_load(fn, fmt, flags); + // Check that the index file is up to date, the main file might have changed struct stat stat_idx,stat_main; - if ( !stat(fn, &stat_main) && !stat(fnidx, &stat_idx) ) + int remote_fn = hisremote(fn), remote_fnidx = hisremote(fnidx); + if ( !remote_fn && !remote_fnidx + && !stat(fn, &stat_main) && !stat(fnidx, &stat_idx) ) { if ( stat_idx.st_mtime < stat_main.st_mtime ) hts_log_warning("The index file is older than the data file: %s", fnidx); } - return hts_idx_load_local(fnidx); + if (remote_fnidx && (flags & HTS_IDX_SAVE_REMOTE)) + { + int ret = idx_test_and_fetch(fnidx, &local_fn, &local_len, 1); + if (ret == 0) { + local_fnidx = strdup(local_fn); + if (local_fnidx) { + local_fnidx[local_len] = '\0'; + fnidx = local_fnidx; + } + } + } + + hts_idx_t *idx = idx_read(fnidx); + if (!idx && !(flags & HTS_IDX_SILENT_FAIL)) + hts_log_error("Could not load local index file '%s'", fnidx); + + free(local_fnidx); + + return idx; } @@ -2890,6 +3735,7 @@ hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx) **********************/ /* For use with hts_expand macros *only* */ +HTSLIB_EXPORT size_t hts_realloc_or_die(size_t n, size_t m, size_t m_sz, size_t size, int clear, void **ptr, const char *func) { /* If new_m and size are both below this limit, multiplying them @@ -2931,6 +3777,84 @@ size_t hts_realloc_or_die(size_t n, size_t m, size_t m_sz, size_t size, exit(1); } +/* + * Companion to hts_resize() macro that does the actual allocation. 
+ * + * Somewhat complicated as hts_resize() needs to write the new allocated + * size back into *size_in_out, and the value pointed to may either be + * int32_t, uint32_t or size_t depending on which array is being resized. + * This is solved by making `size_in_out` a void pointer, getting the macro + * to pass in the size of the item pointed to (in `size_sz`) and then using + * an appropriate cast (based on the value of size_sz). The function + * ensures that the maximum size will be storable in a signed type of + * the given size so storing to an int32_t should work correctly. + * + * Assumes that sizeof(uint32_t) and sizeof(int32_t) is 4, + * sizeof(uint64_t) and sizeof(int64_t) is 8 and sizeof(size_t) is + * either 4 or 8. It also assumes casting from unsigned to signed will + * work as long as the top bit isn't set. + */ + +int hts_resize_array_(size_t item_size, size_t num, size_t size_sz, + void *size_in_out, void **ptr_in_out, int flags, + const char *func) { + /* If new_size and item_size are both below this limit, multiplying them + together can't overflow */ + const size_t safe = (size_t) 1 << (sizeof(size_t) * 4); + void *new_ptr; + size_t bytes, new_size; + + new_size = num; + kroundup_size_t(new_size); + bytes = item_size * new_size; + + /* Check for overflow. Both ensure that alloc will fit in alloc_in_out (we + make the pessimistic assumption that *alloc_in_out is signed), and that + bytes has not wrapped around. 
*/ + + if ((new_size > (((size_t) 1 << (size_sz * 8 - 1)) - 1)) + || (((item_size > safe) || (new_size > safe)) + && bytes / new_size != item_size)) { + hts_log(HTS_LOG_ERROR, func, "Memory allocation too large"); + errno = ENOMEM; + return -1; + } + + new_ptr = realloc(*ptr_in_out, bytes); + if (new_ptr == NULL) { + int save_errno = errno; + hts_log(HTS_LOG_ERROR, func, "%s", strerror(errno)); + errno = save_errno; + return -1; + } + + if (flags & HTS_RESIZE_CLEAR) { + size_t old_size; + switch (size_sz) { + case 4: old_size = *((uint32_t *) size_in_out); break; + case 8: old_size = *((uint64_t *) size_in_out); break; + default: abort(); + } + if (new_size > old_size) { + memset((char *) new_ptr + old_size * item_size, 0, + (new_size - old_size) * item_size); + } + } + + switch (size_sz) { + case 4: *((uint32_t *) size_in_out) = new_size; break; + case 8: *((uint64_t *) size_in_out) = new_size; break; + default: abort(); + } + + *ptr_in_out = new_ptr; + return 0; +} + +void hts_free(void *ptr) { + free(ptr); +} + void hts_set_log_level(enum htsLogLevel level) { hts_verbose = level; diff --git a/hts_internal.h b/hts_internal.h index dec151288..a0fea744f 100644 --- a/hts_internal.h +++ b/hts_internal.h @@ -1,6 +1,6 @@ /* hts_internal.h -- internal functions; not part of the public API. - Copyright (C) 2015-2016 Genome Research Ltd. + Copyright (C) 2015-2016, 2018-2019 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -46,8 +46,26 @@ struct hts_json_token { struct cram_fd; +/* + * Check the existence of a local index file using part of the alignment file name. 
+ * The order is alignment.bam.csi, alignment.csi, alignment.bam.bai, alignment.bai + * @param fn - pointer to the file name + * @param fnidx - pointer to the index file name placeholder + * @return 1 for success, 0 for failure + */ +int hts_idx_check_local(const char *fn, int fmt, char **fnidx); + +// Retrieve the name of the index file and also download it, if it is remote char *hts_idx_getfn(const char *fn, const char *ext); +// Used for on-the-fly indexing. See the comments in hts.c. +void hts_idx_amend_last(hts_idx_t *idx, uint64_t offset); + +int hts_idx_fmt(hts_idx_t *idx); + +// Check that index is capable of storing items in range beg..end +int hts_idx_check_range(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end); + // The CRAM implementation stores the loaded index within the cram_fd rather // than separately as is done elsewhere in htslib. So if p is a pointer to // an hts_idx_t with p->fmt == HTS_FMT_CRAI, then it actually points to an @@ -78,6 +96,15 @@ void *load_plugin(void **pluginp, const char *filename, const char *symbol); void *plugin_sym(void *plugin, const char *name, const char **errmsg); void close_plugin(void *plugin); +/* + * Buffers up arguments to hts_idx_push for later use, once we've written all bar + * this block. This is necessary when multiple blocks are in flight (threading). + * + * Returns 0 on success, + * -1 on failure + */ +int bgzf_idx_push(BGZF *fp, hts_idx_t *hidx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t offset, int is_mapped); + #ifdef __cplusplus } #endif diff --git a/hts_os.c b/hts_os.c index 0e6dc8eb1..66de1c29e 100644 --- a/hts_os.c +++ b/hts_os.c @@ -23,16 +23,25 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include +#include "htslib/hts_defs.h" // Windows (maybe more) lack a drand48 implementation. 
#ifndef HAVE_DRAND48 #include "os/rand.c" #else #include +HTSLIB_EXPORT void hts_srand48(long seed) { srand48(seed); } + +HTSLIB_EXPORT double hts_erand48(unsigned short xseed[3]) { return erand48(xseed); } + +HTSLIB_EXPORT double hts_drand48(void) { return drand48(); } + +HTSLIB_EXPORT double hts_lrand48(void) { return lrand48(); } #endif diff --git a/htsfile.1 b/htsfile.1 index e270116db..7a9fdf831 100644 --- a/htsfile.1 +++ b/htsfile.1 @@ -1,8 +1,8 @@ -.TH htsfile 1 "18 July 2018" "htslib-1.9" "Bioinformatics tools" +.TH htsfile 1 "6 December 2019" "htslib-1.10" "Bioinformatics tools" .SH NAME htsfile \- identify high-throughput sequencing data files .\" -.\" Copyright (C) 2015, 2017 Genome Research Ltd. +.\" Copyright (C) 2015, 2017-2018 Genome Research Ltd. .\" .\" Author: John Marshall .\" @@ -28,6 +28,10 @@ htsfile \- identify high-throughput sequencing data files .B htsfile .RB [ -chHv ] .IR FILE ... +.br +.B htsfile --copy +.RB [ -v ] +.I FILE DESTFILE .SH DESCRIPTION The \fBhtsfile\fR utility attempts to identify what kind of high-throughput sequencing data files the specified files are, and provides minimal viewing @@ -52,6 +56,11 @@ only headers or only data records, but has no other filtering capabilities. Use \fBsamtools\fR or \fBbcftools\fR if you need more extensive viewing or filtering capabilities. .P +Alternatively, when \fB--copy\fR is used, \fBhtsfile\fR takes exactly two +arguments and performs a byte-for-byte copy from \fIFILE\fR to \fIDESTFILE\fR. +This is similar to \fBcp\fR(1), but HTSlib's remote file access facilities +are available for both source and destination. +.P The following options are accepted: .TP 4n .BR -c ", " --view @@ -63,6 +72,11 @@ When \fB--verbose\fR is also given, the raw contents of such files are displayed, with non-printable characters shown via C-style "\\x" hexadecimal escape sequences. 
.TP +.BR -C ", " --copy +Instead of identifying or displaying the specified files, copy the source +\fIFILE\fR to the destination \fIDESTFILE\fR. +Only \fB--verbose\fR may be used in conjunction with \fB--copy\fR. +.TP .BR -h ", " --header-only Display data file headers only. Implies \fB--view\fR. diff --git a/htsfile.c b/htsfile.c index bc54ab6dc..53c5a3b86 100644 --- a/htsfile.c +++ b/htsfile.c @@ -1,6 +1,6 @@ /* htsfile.c -- file identifier and minimal viewer. - Copyright (C) 2014-2018 Genome Research Ltd. + Copyright (C) 2014-2019 Genome Research Ltd. Author: John Marshall @@ -38,7 +38,11 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/sam.h" #include "htslib/vcf.h" -enum { identify, view_headers, view_all } mode = identify; +#ifndef EFTYPE +#define EFTYPE ENOEXEC +#endif + +enum { identify, view_headers, view_all, copy } mode = identify; int show_headers = 1; int verbose = 0; int status = EXIT_SUCCESS; /* Exit status from main */ @@ -68,7 +72,7 @@ static htsFile *dup_stdout(const char *mode) static void view_sam(samFile *in, const char *filename) { bam1_t *b = NULL; - bam_hdr_t *hdr = NULL; + sam_hdr_t *hdr = NULL; samFile *out = NULL; hdr = sam_hdr_read(in); @@ -104,7 +108,7 @@ static void view_sam(samFile *in, const char *filename) } clean: - bam_hdr_destroy(hdr); + sam_hdr_destroy(hdr); bam_destroy1(b); if (out) hts_close(out); } @@ -170,12 +174,58 @@ static void view_raw(hFILE *fp, const char *filename) } } +static void copy_raw(const char *srcfilename, const char *destfilename) +{ + hFILE *src = hopen(srcfilename, "r"); + if (src == NULL) { + error("can't open \"%s\"", srcfilename); + return; + } + + size_t bufsize = 1048576; + char *buffer = malloc(bufsize); + if (buffer == NULL) { + error("can't allocate copy buffer"); + hclose_abruptly(src); + return; + } + + hFILE *dest = hopen(destfilename, "w"); + if (dest == NULL) { + error("can't create \"%s\"", destfilename); + hclose_abruptly(src); + free(buffer); + return; + } + + ssize_t n; + while ((n = 
hread(src, buffer, bufsize)) > 0) + if (hwrite(dest, buffer, n) != n) { + error("writing to \"%s\" failed", destfilename); + hclose_abruptly(dest); + dest = NULL; + break; + } + + if (n < 0) { + error("reading from \"%s\" failed", srcfilename); + hclose_abruptly(src); + src = NULL; + } + + if (dest && hclose(dest) < 0) error("closing \"%s\" failed", destfilename); + if (src && hclose(src) < 0) error("closing \"%s\" failed", srcfilename); + free(buffer); +} + static void usage(FILE *fp, int status) { fprintf(fp, "Usage: htsfile [-chHv] FILE...\n" +" htsfile --copy [-v] FILE DESTFILE\n" "Options:\n" " -c, --view Write textual form of FILEs to standard output\n" +" -C, --copy Copy the exact contents of FILE to DESTFILE\n" " -h, --header-only Display only headers in view mode, not records\n" " -H, --no-header Suppress header display in view mode\n" " -v, --verbose Increase verbosity of warnings and diagnostics\n"); @@ -185,11 +235,12 @@ static void usage(FILE *fp, int status) int main(int argc, char **argv) { static const struct option options[] = { + { "copy", no_argument, NULL, 'C' }, { "header-only", no_argument, NULL, 'h' }, { "no-header", no_argument, NULL, 'H' }, { "view", no_argument, NULL, 'c' }, { "verbose", no_argument, NULL, 'v' }, - { "help", no_argument, NULL, '?' 
}, + { "help", no_argument, NULL, 2 }, { "version", no_argument, NULL, 1 }, { NULL, 0, NULL, 0 } }; @@ -197,25 +248,32 @@ int main(int argc, char **argv) int c, i; status = EXIT_SUCCESS; - while ((c = getopt_long(argc, argv, "chHv?", options, NULL)) >= 0) + while ((c = getopt_long(argc, argv, "cChHv", options, NULL)) >= 0) switch (c) { case 'c': mode = view_all; break; + case 'C': mode = copy; break; case 'h': mode = view_headers; show_headers = 1; break; case 'H': show_headers = 0; break; case 'v': hts_verbose++; verbose++; break; case 1: printf( "htsfile (htslib) %s\n" -"Copyright (C) 2018 Genome Research Ltd.\n", +"Copyright (C) 2019 Genome Research Ltd.\n", hts_version()); exit(EXIT_SUCCESS); break; - case '?': usage(stdout, EXIT_SUCCESS); break; + case 2: usage(stdout, EXIT_SUCCESS); break; default: usage(stderr, EXIT_FAILURE); break; } if (optind == argc) usage(stderr, EXIT_FAILURE); + if (mode == copy) { + if (optind + 2 != argc) usage(stderr, EXIT_FAILURE); + copy_raw(argv[optind], argv[optind + 1]); + return status; + } + for (i = optind; i < argc; i++) { hFILE *fp = hopen(argv[i], "r"); if (fp == NULL) { @@ -258,7 +316,7 @@ int main(int argc, char **argv) if (hts_close(hts) < 0) error("closing \"%s\" failed", argv[i]); fp = NULL; } - else if (errno == ENOEXEC && verbose) + else if ((errno == EFTYPE || errno == ENOEXEC) && verbose) view_raw(fp, argv[i]); else error("can't view \"%s\"", argv[i]); diff --git a/htslib-s3-plugin.7 b/htslib-s3-plugin.7 new file mode 100644 index 000000000..3a1223c22 --- /dev/null +++ b/htslib-s3-plugin.7 @@ -0,0 +1,122 @@ +.TH htslib-s3-plugin 7 "6 December 2019" "htslib-1.10" "Bioinformatics tools" +.SH NAME +s3 plugin \- htslib AWS S3 plugin +.\" +.\" Copyright (C) 2019 Genome Research Ltd. 
+.\" +.\" Author: Andrew Whitwham +.\" +.\" Permission is hereby granted, free of charge, to any person obtaining a +.\" copy of this software and associated documentation files (the "Software"), +.\" to deal in the Software without restriction, including without limitation +.\" the rights to use, copy, modify, merge, publish, distribute, sublicense, +.\" and/or sell copies of the Software, and to permit persons to whom the +.\" Software is furnished to do so, subject to the following conditions: +.\" +.\" The above copyright notice and this permission notice shall be included in +.\" all copies or substantial portions of the Software. +.\" +.\" THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +.\" IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +.\" FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +.\" THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +.\" LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +.\" FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +.\" DEALINGS IN THE SOFTWARE. +.\" +.SH DESCRIPTION +The S3 plugin allows htslib file functions to communicate with servers that use +the AWS S3 protocol. Files are identified by their bucket and object key in a +URL format e.g. + +.B s3://mybucket/path/to/file + +With \fIpath/to/file\fR being the object key. + +Necessary security information can be provided as part of the URL, in +environment variables or from configuration files. + +The full URL format is: + +.B s3[+SCHEME]://[ID[:SECRET[:TOKEN]]@]BUCKET/PATH + +The elements are: +.TP +.I SCHEME +The protocol used. Defaults to \fIhttps\fR. +.TP +.I ID +The user AWS access key. +.TP +.I SECRET +The secret key for use with the access key. +.TP +.I TOKEN +Token used for temporary security credentials. +.TP +.I BUCKET +AWS S3 bucket. +.TP +.I PATH +Path to the object under the bucket.
+.LP + +The environment variables below will be used if the user ID is not set. +.TP +.B AWS_ACCESS_KEY_ID +The user AWS access key. +.TP +.B AWS_SECRET_ACCESS_KEY +The secret key for use with the access key. +.TP +.B AWS_DEFAULT_REGION +The region to use. Defaults to +.IR us-east-1 . +.TP +.B AWS_SESSION_TOKEN +Token used for temporary security credentials. +.TP +.B AWS_DEFAULT_PROFILE +The profile to use in \fIcredentials\fR, \fIconfig\fR or \fIs3cfg\fR files. +Defaults to +.IR default . +.TP +.B AWS_PROFILE +Same as above. +.TP +.B AWS_SHARED_CREDENTIALS_FILE +Location of the credentials file. Defaults to +.IR ~/.aws/credentials . +.TP +.B HTS_S3_S3CFG +Location of the s3cfg file. Defaults to +.IR ~/.s3cfg . +.TP +.B HTS_S3_HOST +Sets the host. Defaults to +.IR s3.amazonaws.com . +.TP +.B HTS_S3_V2 +If set, use signature v2 rather than the default v4. This will limit the plugin to +reading only. +.TP +.B HTS_S3_PART_SIZE +Sets the upload part size in Mb, the minimum being 5Mb. +By default the part size starts at 5Mb and expands at regular intervals to +accommodate bigger files (up to 2.5 Tbytes with the current rate). +Using this setting disables the automatic part size expansion. +.LP +In the absence of an ID from the previous two methods the credential/config +files will be used. The default file locations are either +\fI~/.aws/credentials\fR or \fI~/.s3cfg\fR (in that order). +.SH NOTES +In most cases this plugin transforms the given URL into a virtual host-style +format e.g. \fIhttps://bucket.host/path/to/file\fR. A path-style format is used +where the URL is not DNS compliant or the bucket name contains a dot e.g. +\fIhttps://host/bu.cket/path/to/file\fR. + +.SH "SEE ALSO" +.BR htsfile (1) +.BR samtools (1) +.PP +htslib website: diff --git a/htslib.mk b/htslib.mk index eb8f38a77..b750869c8 100644 --- a/htslib.mk +++ b/htslib.mk @@ -1,6 +1,6 @@ # Makefile rules useful for third-party code using htslib's public API. # -# Copyright (C) 2013-2016 Genome Research Ltd.
+# Copyright (C) 2013-2017, 2019 Genome Research Ltd. # # Author: John Marshall # @@ -82,12 +82,15 @@ HTSLIB_ALL = \ $(HTSDIR)/config.h \ $(HTSDIR)/errmod.c \ $(HTSDIR)/faidx.c \ + $(HTSDIR)/header.c \ + $(HTSDIR)/header.h \ $(HTSDIR)/hfile_internal.h \ $(HTSDIR)/hfile.c \ $(HTSDIR)/hfile_gcs.c \ $(HTSDIR)/hfile_libcurl.c \ $(HTSDIR)/hfile_net.c \ $(HTSDIR)/hfile_s3.c \ + $(HTSDIR)/hfile_s3_write.c \ $(HTSDIR)/hts.c \ $(HTSDIR)/hts_internal.h \ $(HTSDIR)/hts_os.c \ @@ -100,7 +103,9 @@ HTSLIB_ALL = \ $(HTSDIR)/probaln.c \ $(HTSDIR)/realn.c \ $(HTSDIR)/regidx.c \ + $(HTSDIR)/region.c \ $(HTSDIR)/sam.c \ + $(HTSDIR)/sam_internal.h \ $(HTSDIR)/synced_bcf_reader.c \ $(HTSDIR)/tbx.c \ $(HTSDIR)/textutils.c \ @@ -127,7 +132,6 @@ HTSLIB_ALL = \ $(HTSDIR)/cram/cram_stats.c \ $(HTSDIR)/cram/cram_stats.h \ $(HTSDIR)/cram/cram_structs.h \ - $(HTSDIR)/cram/files.c \ $(HTSDIR)/cram/mFILE.c \ $(HTSDIR)/cram/mFILE.h \ $(HTSDIR)/cram/misc.h \ @@ -139,8 +143,6 @@ HTSLIB_ALL = \ $(HTSDIR)/cram/rANS_byte.h \ $(HTSDIR)/cram/rANS_static.c \ $(HTSDIR)/cram/rANS_static.h \ - $(HTSDIR)/cram/sam_header.c \ - $(HTSDIR)/cram/sam_header.h \ $(HTSDIR)/cram/string_alloc.c \ $(HTSDIR)/cram/string_alloc.h \ $(HTSDIR)/os/lzma_stub.h \ @@ -149,19 +151,27 @@ HTSLIB_ALL = \ $(HTSDIR)/config.h: +cd $(HTSDIR) && $(MAKE) config.h -$(HTSDIR)/libhts.a: $(HTSLIB_ALL) +$(HTSDIR)/hts-object-files : $(HTSLIB_ALL) + +cd $(HTSDIR) && $(MAKE) hts-object-files + +$(HTSDIR)/libhts.a: $(HTSDIR)/hts-object-files +cd $(HTSDIR) && $(MAKE) lib-static -$(HTSDIR)/libhts.so $(HTSDIR)/libhts.dylib: $(HTSLIB_ALL) +$(HTSDIR)/libhts.so: $(HTSLIB_ALL) + +cd $(HTSDIR) && $(MAKE) lib-shared + +$(HTSDIR)/libhts.dylib $(HTSDIR)/libhts.dll.a $(HTSDIR)/hts.dll.a: $(HTSDIR)/hts-object-files +cd $(HTSDIR) && $(MAKE) lib-shared -$(HTSDIR)/bgzip: $(HTSDIR)/bgzip.c $(HTSLIB_PUBLIC_HEADERS) +$(HTSDIR)/bgzip: $(HTSDIR)/bgzip.c $(HTSLIB_PUBLIC_HEADERS) $(HTSDIR)/libhts.a +cd $(HTSDIR) && $(MAKE) bgzip -$(HTSDIR)/htsfile: $(HTSDIR)/htsfile.c 
$(HTSLIB_PUBLIC_HEADERS) +$(HTSDIR)/htsfile: $(HTSDIR)/htsfile.c $(HTSLIB_PUBLIC_HEADERS) $(HTSDIR)/libhts.a + +cd $(HTSDIR) && $(MAKE) htsfile -$(HTSDIR)/tabix: $(HTSDIR)/tabix.c $(HTSLIB_PUBLIC_HEADERS) +$(HTSDIR)/tabix: $(HTSDIR)/tabix.c $(HTSLIB_PUBLIC_HEADERS) $(HTSDIR)/libhts.a + +cd $(HTSDIR) && $(MAKE) tabix $(HTSDIR)/htslib_static.mk: $(HTSDIR)/htslib.pc.tmp diff --git a/htslib/bgzf.h b/htslib/bgzf.h index 8affb61b5..daa3e2a61 100644 --- a/htslib/bgzf.h +++ b/htslib/bgzf.h @@ -3,7 +3,7 @@ /* Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology 2011, 2012 Attractive Chaos - Copyright (C) 2009, 2013, 2014,2017 Genome Research Ltd + Copyright (C) 2009, 2013, 2014, 2017, 2018-2019 Genome Research Ltd Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -32,7 +32,6 @@ #include #include -#include #include #include "hts_defs.h" @@ -53,9 +52,11 @@ extern "C" { struct hFILE; struct hts_tpool; +struct kstring_t; struct bgzf_mtaux_t; typedef struct __bgzidx_t bgzidx_t; typedef struct bgzf_cache_t bgzf_cache_t; +struct z_stream_s; struct BGZF { // Reserved bits should be written as 0; read as "don't care" @@ -71,19 +72,12 @@ struct BGZF { struct bgzf_mtaux_t *mt; // only used for multi-threading bgzidx_t *idx; // BGZF index int idx_build_otf; // build index on the fly, set by bgzf_index_build_init() - z_stream *gz_stream;// for gzip-compressed files + struct z_stream_s *gz_stream; // for gzip-compressed files + int64_t seeked; // virtual offset of last seek }; #ifndef HTS_BGZF_TYPEDEF typedef struct BGZF BGZF; #define HTS_BGZF_TYPEDEF -#endif - -#ifndef KSTRING_T -#define KSTRING_T kstring_t -typedef struct __kstring_t { - size_t l, m; - char *s; -} kstring_t; #endif /****************** @@ -106,6 +100,7 @@ typedef struct __kstring_t { * outputs uncompressed data wrapped in the zlib format. 
* @return BGZF file handler; 0 on error */ + HTSLIB_EXPORT BGZF* bgzf_dopen(int fd, const char *mode); #define bgzf_fdopen(fd, mode) bgzf_dopen((fd), (mode)) // for backward compatibility @@ -113,11 +108,13 @@ typedef struct __kstring_t { /** * Open the specified file for reading or writing. */ + HTSLIB_EXPORT BGZF* bgzf_open(const char* path, const char *mode); /** * Open an existing hFILE stream for reading or writing. */ + HTSLIB_EXPORT BGZF* bgzf_hopen(struct hFILE *fp, const char *mode); /** @@ -126,6 +123,7 @@ typedef struct __kstring_t { * @param fp BGZF file handler * @return 0 on success and -1 on error */ + HTSLIB_EXPORT int bgzf_close(BGZF *fp); /** @@ -136,6 +134,7 @@ typedef struct __kstring_t { * @param length size of data to read * @return number of bytes actually read; 0 on end-of-file and -1 on error */ + HTSLIB_EXPORT ssize_t bgzf_read(BGZF *fp, void *data, size_t length) HTS_RESULT_USED; /** @@ -147,6 +146,7 @@ typedef struct __kstring_t { * @param length size of data to write * @return number of bytes written (i.e., _length_); negative on error */ + HTSLIB_EXPORT ssize_t bgzf_write(BGZF *fp, const void *data, size_t length) HTS_RESULT_USED; /** @@ -159,8 +159,19 @@ typedef struct __kstring_t { * @param length size of data to write * @return number of bytes written (i.e., _length_); negative on error */ + HTSLIB_EXPORT ssize_t bgzf_block_write(BGZF *fp, const void *data, size_t length); + /** + * Returns the next byte in the file without consuming it. + * @param fp BGZF file handler + * @return -1 on EOF, + * -2 on error, + * otherwise the unsigned byte value. + */ + HTSLIB_EXPORT + int bgzf_peek(BGZF *fp); + /** * Read up to _length_ bytes directly from the underlying stream without * decompressing. 
Bypasses BGZF blocking, so must be used with care in @@ -171,6 +182,7 @@ typedef struct __kstring_t { * @param length number of raw bytes to read * @return number of bytes actually read; 0 on end-of-file and -1 on error */ + HTSLIB_EXPORT ssize_t bgzf_raw_read(BGZF *fp, void *data, size_t length) HTS_RESULT_USED; /** @@ -183,6 +195,7 @@ typedef struct __kstring_t { * @param length number of raw bytes to write * @return number of bytes actually written; -1 on error */ + HTSLIB_EXPORT ssize_t bgzf_raw_write(BGZF *fp, const void *data, size_t length) HTS_RESULT_USED; /** @@ -191,6 +204,7 @@ typedef struct __kstring_t { * @param fp BGZF file handle * @return 0 on success and -1 on error */ + HTSLIB_EXPORT int bgzf_flush(BGZF *fp) HTS_RESULT_USED; /** @@ -208,7 +222,11 @@ typedef struct __kstring_t { * @param pos virtual file offset returned by bgzf_tell() * @param whence must be SEEK_SET * @return 0 on success and -1 on error + * + * @note It is not permitted to seek on files open for writing, + * or files compressed with gzip (as opposed to bgzip). 
*/ + HTSLIB_EXPORT int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence) HTS_RESULT_USED; /** @@ -220,6 +238,7 @@ typedef struct __kstring_t { * 0 if the EOF marker is absent; * -1 (with errno set) on error */ + HTSLIB_EXPORT int bgzf_check_EOF(BGZF *fp); /** Return the file's compression format @@ -232,6 +251,7 @@ typedef struct __kstring_t { * - 2 / `bgzf` if the file is BGZF-compressed * @since 1.4 */ + HTSLIB_EXPORT int bgzf_compression(BGZF *fp); /** @@ -240,6 +260,7 @@ typedef struct __kstring_t { * @param fn file name * @return 1 if _fn_ is BGZF; 0 if not or on I/O error */ + HTSLIB_EXPORT int bgzf_is_bgzf(const char *fn) HTS_DEPRECATED("Use bgzf_compression() or hts_detect_format() instead"); /********************* @@ -252,12 +273,14 @@ typedef struct __kstring_t { * @param fp BGZF file handler * @param size size of cache in bytes; 0 to disable caching (default) */ + HTSLIB_EXPORT void bgzf_set_cache_size(BGZF *fp, int size); /** * Flush the file if the remaining buffer size is smaller than _size_ * @return 0 if flushing succeeded or was not needed; negative on error */ + HTSLIB_EXPORT int bgzf_flush_try(BGZF *fp, ssize_t size) HTS_RESULT_USED; /** @@ -265,6 +288,7 @@ typedef struct __kstring_t { * @param fp BGZF file handler * @return byte read; -1 on end-of-file or error */ + HTSLIB_EXPORT int bgzf_getc(BGZF *fp); /** @@ -275,11 +299,13 @@ typedef struct __kstring_t { * @param str string to write to; must be initialized * @return length of the string; -1 on end-of-file; <= -2 on error */ - int bgzf_getline(BGZF *fp, int delim, kstring_t *str); + HTSLIB_EXPORT + int bgzf_getline(BGZF *fp, int delim, struct kstring_t *str); /** * Read the next BGZF block. 
*/ + HTSLIB_EXPORT int bgzf_read_block(BGZF *fp) HTS_RESULT_USED; /** @@ -290,6 +316,7 @@ typedef struct __kstring_t { * @param fp BGZF file handler; must be opened for writing * @param pool The thread pool (see hts_create_threads) */ + HTSLIB_EXPORT int bgzf_thread_pool(BGZF *fp, struct hts_tpool *pool, int qsize); /** @@ -300,6 +327,7 @@ typedef struct __kstring_t { * @param n_threads #threads used for writing * @param n_sub_blks #blocks processed by each thread; a value 64-256 is recommended */ + HTSLIB_EXPORT int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks); /** @@ -313,6 +341,7 @@ typedef struct __kstring_t { * @param level compression level * @return 0 on success and negative on error */ + HTSLIB_EXPORT int bgzf_compress(void *dst, size_t *dlen, const void *src, size_t slen, int level); /******************* @@ -324,11 +353,15 @@ typedef struct __kstring_t { * * @param fp BGZF file handler; must be opened for reading * @param uoffset file offset in the uncompressed data - * @param where SEEK_SET supported atm + * @param where must be SEEK_SET * * Returns 0 on success and -1 on error. + * + * @note It is not permitted to seek on files open for writing, + * or files compressed with gzip (as opposed to bgzip). */ - int bgzf_useek(BGZF *fp, long uoffset, int where) HTS_RESULT_USED; + HTSLIB_EXPORT + int bgzf_useek(BGZF *fp, off_t uoffset, int where) HTS_RESULT_USED; /** * Position in uncompressed BGZF @@ -337,7 +370,8 @@ typedef struct __kstring_t { * * Returns the current offset on success and -1 on error. */ - long bgzf_utell(BGZF *fp); + HTSLIB_EXPORT + off_t bgzf_utell(BGZF *fp); /** * Tell BGZF to build index while compressing. @@ -345,7 +379,13 @@ typedef struct __kstring_t { * @param fp BGZF file handler; can be opened for reading or writing. * * Returns 0 on success and -1 on error. 
+ * + * @note This function must be called before any data has been read or + * written, and in particular before calling bgzf_mt() on the same + * file handle (as threads may start reading data before the index + * has been set up). */ + HTSLIB_EXPORT int bgzf_index_build_init(BGZF *fp); /// Load BGZF index @@ -355,6 +395,7 @@ typedef struct __kstring_t { * @param suffix suffix to add to bname (can be NULL) * @return 0 on success and -1 on error. */ + HTSLIB_EXPORT int bgzf_index_load(BGZF *fp, const char *bname, const char *suffix) HTS_RESULT_USED; @@ -373,6 +414,7 @@ typedef struct __kstring_t { * is only used for printing error messages; if NULL the word "index" is * used instead. */ + HTSLIB_EXPORT int bgzf_index_load_hfile(BGZF *fp, struct hFILE *idx, const char *name) HTS_RESULT_USED; @@ -383,6 +425,7 @@ typedef struct __kstring_t { * @param suffix suffix to add to bname (can be NULL) * @return 0 on success and -1 on error. */ + HTSLIB_EXPORT int bgzf_index_dump(BGZF *fp, const char *bname, const char *suffix) HTS_RESULT_USED; @@ -400,6 +443,7 @@ typedef struct __kstring_t { * used instead. */ + HTSLIB_EXPORT int bgzf_index_dump_hfile(BGZF *fp, struct hFILE *idx, const char *name) HTS_RESULT_USED; diff --git a/htslib/cram.h b/htslib/cram.h index f00c529b2..952dd22b0 100644 --- a/htslib/cram.h +++ b/htslib/cram.h @@ -1,7 +1,7 @@ /// @file htslib/cram.h /// CRAM format-specific API functions. /* - Copyright (C) 2015, 2016 Genome Research Ltd. + Copyright (C) 2015, 2016, 2018-2019 Genome Research Ltd. Author: James Bonfield @@ -39,15 +39,16 @@ DEALINGS IN THE SOFTWARE. */ #include #include +#include "hts_defs.h" #include "hts.h" +#include "sam.h" #ifdef __cplusplus extern "C" { #endif -#ifndef _CRAM_STRUCTS_H_ enum cram_block_method { - ERROR = -1, + BM_ERROR = -1, RAW = 0, GZIP = 1, BZIP2 = 2, @@ -69,7 +70,6 @@ enum cram_content_type { }; // Opaque data types, see cram_structs for the fully fledged versions. 
-typedef struct SAM_hdr SAM_hdr; typedef struct cram_file_def cram_file_def; typedef struct cram_fd cram_fd; typedef struct cram_container cram_container; @@ -81,7 +81,6 @@ typedef struct cram_block_compression_hdr cram_block_compression_hdr; typedef struct refs_t refs_t; struct hFILE; -#endif // Accessor functions @@ -89,16 +88,26 @@ struct hFILE; *----------------------------------------------------------------------------- * cram_fd */ -SAM_hdr *cram_fd_get_header(cram_fd *fd); -void cram_fd_set_header(cram_fd *fd, SAM_hdr *hdr); +HTSLIB_EXPORT +sam_hdr_t *cram_fd_get_header(cram_fd *fd); + +HTSLIB_EXPORT +void cram_fd_set_header(cram_fd *fd, sam_hdr_t *hdr); +HTSLIB_EXPORT int cram_fd_get_version(cram_fd *fd); + +HTSLIB_EXPORT void cram_fd_set_version(cram_fd *fd, int vers); +HTSLIB_EXPORT int cram_major_vers(cram_fd *fd); +HTSLIB_EXPORT int cram_minor_vers(cram_fd *fd); +HTSLIB_EXPORT struct hFILE *cram_fd_get_fp(cram_fd *fd); +HTSLIB_EXPORT void cram_fd_set_fp(cram_fd *fd, struct hFILE *fp); @@ -106,15 +115,22 @@ void cram_fd_set_fp(cram_fd *fd, struct hFILE *fp); *----------------------------------------------------------------------------- * cram_container */ +HTSLIB_EXPORT int32_t cram_container_get_length(cram_container *c); +HTSLIB_EXPORT void cram_container_set_length(cram_container *c, int32_t length); +HTSLIB_EXPORT int32_t cram_container_get_num_blocks(cram_container *c); +HTSLIB_EXPORT void cram_container_set_num_blocks(cram_container *c, int32_t num_blocks); +HTSLIB_EXPORT int32_t *cram_container_get_landmarks(cram_container *c, int32_t *num_landmarks); +HTSLIB_EXPORT void cram_container_set_landmarks(cram_container *c, int32_t num_landmarks, int32_t *landmarks); /* Returns true if the container is empty (EOF marker) */ +HTSLIB_EXPORT int cram_container_is_empty(cram_fd *fd); @@ -122,31 +138,47 @@ int cram_container_is_empty(cram_fd *fd); *----------------------------------------------------------------------------- * cram_block */ +HTSLIB_EXPORT 
int32_t cram_block_get_content_id(cram_block *b); +HTSLIB_EXPORT int32_t cram_block_get_comp_size(cram_block *b); +HTSLIB_EXPORT int32_t cram_block_get_uncomp_size(cram_block *b); +HTSLIB_EXPORT int32_t cram_block_get_crc32(cram_block *b); +HTSLIB_EXPORT void * cram_block_get_data(cram_block *b); +HTSLIB_EXPORT enum cram_content_type cram_block_get_content_type(cram_block *b); +HTSLIB_EXPORT void cram_block_set_content_id(cram_block *b, int32_t id); +HTSLIB_EXPORT void cram_block_set_comp_size(cram_block *b, int32_t size); +HTSLIB_EXPORT void cram_block_set_uncomp_size(cram_block *b, int32_t size); +HTSLIB_EXPORT void cram_block_set_crc32(cram_block *b, int32_t crc); +HTSLIB_EXPORT void cram_block_set_data(cram_block *b, void *data); -int cram_block_append(cram_block *b, void *data, int size); +HTSLIB_EXPORT +int cram_block_append(cram_block *b, const void *data, int size); +HTSLIB_EXPORT void cram_block_update_size(cram_block *b); // Offset is known as "size" internally, but it can be confusing. +HTSLIB_EXPORT size_t cram_block_get_offset(cram_block *b); +HTSLIB_EXPORT void cram_block_set_offset(cram_block *b, size_t offset); /* * Computes the size of a cram block, including the block * header itself. */ +HTSLIB_EXPORT uint32_t cram_block_size(cram_block *b); /* @@ -179,6 +211,7 @@ uint32_t cram_block_size(cram_block *b); * -1 if unable to edit; * -2 on other errors (eg I/O). */ +HTSLIB_EXPORT int cram_transcode_rg(cram_fd *in, cram_fd *out, cram_container *c, int nrg, int *in_rg, int *out_rg); @@ -192,24 +225,9 @@ int cram_transcode_rg(cram_fd *in, cram_fd *out, * Returns 0 on success * -1 on failure */ +HTSLIB_EXPORT int cram_copy_slice(cram_fd *in, cram_fd *out, int32_t num_slice); -/* - *----------------------------------------------------------------------------- - * SAM_hdr - */ - -/*! Tokenises a SAM header into a hash table. - * - * Also extracts a few bits on specific data types, such as @RG lines. 
- * - * @return - * Returns a SAM_hdr struct on success (free with sam_hdr_free()); - * NULL on failure - */ -SAM_hdr *sam_hdr_parse_(const char *hdr, int len); - - /* *----------------------------------------------------------------------------- * cram_io basics @@ -229,7 +247,11 @@ SAM_hdr *sam_hdr_parse_(const char *hdr, int len); * @return * Returns block pointer on success; * NULL on failure + * + * The cram_block struct returned by a successful call should be freed + * via cram_free_block() when it is no longer needed. */ +HTSLIB_EXPORT cram_block *cram_new_block(enum cram_content_type content_type, int content_id); @@ -238,7 +260,11 @@ cram_block *cram_new_block(enum cram_content_type content_type, * @return * Returns cram_block pointer on success; * NULL on failure + * + * The cram_block struct returned by a successful call should be freed + * via cram_free_block() when it is no longer needed. */ +HTSLIB_EXPORT cram_block *cram_read_block(cram_fd *fd); /*! Writes a CRAM block. @@ -247,10 +273,12 @@ cram_block *cram_read_block(cram_fd *fd); * Returns 0 on success; * -1 on failure */ +HTSLIB_EXPORT int cram_write_block(cram_fd *fd, cram_block *b); /*! Frees a CRAM block, deallocating internal data too. */ +HTSLIB_EXPORT void cram_free_block(cram_block *b); /*! Uncompresses a CRAM block, if compressed. @@ -259,6 +287,7 @@ void cram_free_block(cram_block *b); * Returns 0 on success; * -1 on failure */ +HTSLIB_EXPORT int cram_uncompress_block(cram_block *b); /*! Compresses a block. 
@@ -274,6 +303,7 @@ int cram_uncompress_block(cram_block *b); * Returns 0 on success; * -1 on failure */ +HTSLIB_EXPORT int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, int method, int level); @@ -288,8 +318,13 @@ int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, * @return * Returns cram_container ptr on success; * NULL on failure + * + * The cram_container struct returned by a successful call should be freed + * via cram_free_container() when it is no longer needed. */ +HTSLIB_EXPORT cram_container *cram_new_container(int nrec, int nslice); +HTSLIB_EXPORT void cram_free_container(cram_container *c); /*! Reads a container header. @@ -297,7 +332,11 @@ void cram_free_container(cram_container *c); * @return * Returns cram_container on success; * NULL on failure or no container left (fd->err == 0). + * + * The cram_container struct returned by a successful call should be freed + * via cram_free_container() when it is no longer needed. */ +HTSLIB_EXPORT cram_container *cram_read_container(cram_fd *fd); /*! Writes a container structure. @@ -306,6 +345,7 @@ cram_container *cram_read_container(cram_fd *fd); * Returns 0 on success; * -1 on failure */ +HTSLIB_EXPORT int cram_write_container(cram_fd *fd, cram_container *h); /* @@ -316,8 +356,10 @@ int cram_write_container(cram_fd *fd, cram_container *h); * Returns 0 on success; * -1 on failure */ +HTSLIB_EXPORT int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size); +HTSLIB_EXPORT int cram_container_size(cram_container *c); /**@}*/ @@ -333,6 +375,7 @@ int cram_container_size(cram_container *c); * Returns file handle on success; * NULL on failure. */ +HTSLIB_EXPORT cram_fd *cram_open(const char *filename, const char *mode); /*! Opens an existing stream for reading or writing. @@ -341,6 +384,7 @@ cram_fd *cram_open(const char *filename, const char *mode); * Returns file handle on success; * NULL on failure. 
*/ +HTSLIB_EXPORT cram_fd *cram_dopen(struct hFILE *fp, const char *filename, const char *mode); /*! Closes a CRAM file. @@ -349,6 +393,7 @@ cram_fd *cram_dopen(struct hFILE *fp, const char *filename, const char *mode); * Returns 0 on success; * -1 on failure */ +HTSLIB_EXPORT int cram_close(cram_fd *fd); /* @@ -357,6 +402,7 @@ int cram_close(cram_fd *fd); * Returns 0 on success * -1 on failure */ +HTSLIB_EXPORT int cram_seek(cram_fd *fd, off_t offset, int whence); /* @@ -366,6 +412,7 @@ int cram_seek(cram_fd *fd, off_t offset, int whence); * Returns 0 on success * -1 on failure */ +HTSLIB_EXPORT int cram_flush(cram_fd *fd); /*! Checks for end of file on a cram_fd stream. @@ -375,6 +422,7 @@ int cram_flush(cram_fd *fd); * 1 if we hit an expected EOF (end of range or EOF block) * 2 for other EOF (end of stream without EOF block) */ +HTSLIB_EXPORT int cram_eof(cram_fd *fd); /*! Sets options on the cram_fd. @@ -386,6 +434,7 @@ int cram_eof(cram_fd *fd); * Returns 0 on success; * -1 on failure */ +HTSLIB_EXPORT int cram_set_option(cram_fd *fd, enum hts_fmt_option opt, ...); /*! Sets options on the cram_fd. @@ -397,6 +446,7 @@ int cram_set_option(cram_fd *fd, enum hts_fmt_option opt, ...); * Returns 0 on success; * -1 on failure */ +HTSLIB_EXPORT int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args); /*! @@ -410,7 +460,8 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args); * Returns 0 on success; * -1 on failure */ -int cram_set_header(cram_fd *fd, SAM_hdr *hdr); +HTSLIB_EXPORT +int cram_set_header(cram_fd *fd, sam_hdr_t *hdr); /*! 
Check if this file has a proper EOF block * @@ -422,78 +473,60 @@ int cram_set_header(cram_fd *fd, SAM_hdr *hdr); * -1 if an error occured whilst reading the file or we could not seek back to where we were * */ +HTSLIB_EXPORT int cram_check_EOF(cram_fd *fd); /* As int32_decoded/encode, but from/to blocks instead of cram_fd */ +HTSLIB_EXPORT int int32_put_blk(cram_block *b, int32_t val); /**@}*/ -/**@{ -------------------------------------------------------------------*/ -/*! Deallocates all storage used by a SAM_hdr struct. - * - * This also decrements the header reference count. If after decrementing - * it is still non-zero then the header is assumed to be in use by another - * caller and the free is not done. - * - * This is a synonym for sam_hdr_dec_ref(). +/**@{ ------------------------------------------------------------------- + * Old typedef and function names for compatibility with existing code. + * Header functionality is now provided by sam.h's sam_hdr_t functions. */ -void sam_hdr_free(SAM_hdr *hdr); -/*! Returns the current length of the SAM_hdr in text form. - * - * Call sam_hdr_rebuild() first if editing has taken place. - */ -int sam_hdr_length(SAM_hdr *hdr); +typedef sam_hdr_t SAM_hdr; -/*! Returns the string form of the SAM_hdr. +/*! Tokenises a SAM header into a hash table. * - * Call sam_hdr_rebuild() first if editing has taken place. + * Also extracts a few bits on specific data types, such as @RG lines. + * + * @return + * Returns a SAM_hdr struct on success (free with sam_hdr_free()); + * NULL on failure */ -char *sam_hdr_str(SAM_hdr *hdr); +static inline SAM_hdr *sam_hdr_parse_(const char *hdr, size_t len) { return sam_hdr_parse(len, hdr); } -/*! Appends a formatted line to an existing SAM header. - * - * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with - * optional new-line. If it contains more than 1 line then multiple lines - * will be added in order. 
- * - * Len is the length of the text data, or 0 if unknown (in which case - * it should be null terminated). +/*! Deallocates all storage used by a SAM_hdr struct. * - * @return - * Returns 0 on success; - * -1 on failure + * This also decrements the header reference count. If after decrementing + * it is still non-zero then the header is assumed to be in use by another + * caller and the free is not done. */ +static inline void sam_hdr_free(SAM_hdr *hdr) { sam_hdr_destroy(hdr); } + +/* sam_hdr_length() and sam_hdr_str() are now provided by sam.h. */ /*! Add an @PG line. * - * If we wish complete control over this use sam_hdr_add() directly. This + * If we wish complete control over this use sam_hdr_add_line() directly. This * function uses that, but attempts to do a lot of tedious house work for * you too. * * - It will generate a suitable ID if the supplied one clashes. * - It will generate multiple @PG records if we have multiple PG chains. * - * Call it as per sam_hdr_add() with a series of key,value pairs ending + * Call it as per sam_hdr_add_line() with a series of key,value pairs ending * in NULL. * * @return * Returns 0 on success; * -1 on failure */ -int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...); - -/*! - * A function to help with construction of CL tags in @PG records. - * Takes an argc, argv pair and returns a single space-separated string. - * This string should be deallocated by the calling function. - * - * @return - * Returns malloced char * on success; - * NULL on failure - */ -char *stringify_argv(int argc, char *argv[]); +#define sam_hdr_add_PG sam_hdr_add_pg +/**@{ -------------------------------------------------------------------*/ /*! * Returns the refs_t structure used by a cram file handle. @@ -504,6 +537,7 @@ char *stringify_argv(int argc, char *argv[]); * @return * Returns NULL if none exists or the file handle is not a CRAM file. 
*/ +HTSLIB_EXPORT refs_t *cram_get_refs(htsFile *fd); /**@}*/ diff --git a/htslib/faidx.h b/htslib/faidx.h index e83143a9c..5afd1b778 100644 --- a/htslib/faidx.h +++ b/htslib/faidx.h @@ -1,7 +1,7 @@ /// @file htslib/faidx.h /// FASTA random access. /* - Copyright (C) 2008, 2009, 2013, 2014, 2016, 2017-2018 Genome Research Ltd. + Copyright (C) 2008, 2009, 2013, 2014, 2016, 2017-2019 Genome Research Ltd. Author: Heng Li @@ -29,7 +29,9 @@ #ifndef HTSLIB_FAIDX_H #define HTSLIB_FAIDX_H +#include #include "hts_defs.h" +#include "hts.h" #ifdef __cplusplus extern "C" { @@ -85,6 +87,7 @@ If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name. If fngzi is NULL, ".gzi" will be appended to fn for the GZI file. The GZI file will only be built if fn is bgzip-compressed. */ +HTSLIB_EXPORT int fai_build3(const char *fn, const char *fnfai, const char *fngzi) HTS_RESULT_USED; /// Build index for a FASTA or FASTQ or bgzip-compressed FASTA or FASTQ file. @@ -94,9 +97,11 @@ int fai_build3(const char *fn, const char *fnfai, const char *fngzi) HTS_RESULT_ File "fn.fai" will be generated. This function is equivalent to fai_build3(fn, NULL, NULL); */ +HTSLIB_EXPORT int fai_build(const char *fn) HTS_RESULT_USED; /// Destroy a faidx_t struct +HTSLIB_EXPORT void fai_destroy(faidx_t *fai); enum fai_load_options { @@ -116,7 +121,11 @@ The bgzip index is only needed if fn is compressed. If (flags & FAI_CREATE) is true, the index files will be built using fai_build3() if they are not already present. + +The struct returned by a successful call should be freed via fai_destroy() +when it is no longer needed. */ +HTSLIB_EXPORT faidx_t *fai_load3(const char *fn, const char *fnfai, const char *fngzi, int flags); @@ -126,6 +135,7 @@ faidx_t *fai_load3(const char *fn, const char *fnfai, const char *fngzi, This function is equivalent to fai_load3(fn, NULL, NULL, FAI_CREATE|FAI_CACHE); */ +HTSLIB_EXPORT faidx_t *fai_load(const char *fn); /// Load FASTA or FASTQ indexes. 
@@ -142,7 +152,11 @@ The bgzip index is only needed if fn is compressed. If (flags & FAI_CREATE) is true, the index files will be built using fai_build3() if they are not already present. + +The struct returned by a successful call should be freed via fai_destroy() +when it is no longer needed. */ +HTSLIB_EXPORT faidx_t *fai_load3_format(const char *fn, const char *fnfai, const char *fngzi, int flags, enum fai_format_options format); @@ -153,6 +167,7 @@ faidx_t *fai_load3_format(const char *fn, const char *fnfai, const char *fngzi, This function is equivalent to fai_load3_format(fn, NULL, NULL, FAI_CREATE|FAI_CACHE, format); */ +HTSLIB_EXPORT faidx_t *fai_load_format(const char *fn, enum fai_format_options format); /// Fetch the sequence in a region @@ -163,8 +178,15 @@ faidx_t *fai_load_format(const char *fn, enum fai_format_options format); The returned sequence is allocated by `malloc()` family and should be destroyed by end users by calling `free()` on it. + +To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200" +are reference names, quote using curly braces. +Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example. */ +HTSLIB_EXPORT char *fai_fetch(const faidx_t *fai, const char *reg, int *len); +HTSLIB_EXPORT +char *fai_fetch64(const faidx_t *fai, const char *reg, hts_pos_t *len); /// Fetch the quality string for a region for FASTQ files /** @param fai Pointer to the faidx_t struct @@ -172,15 +194,21 @@ char *fai_fetch(const faidx_t *fai, const char *reg, int *len); @param len Length of the region; -2 if seq not present, -1 general error @return Pointer to the quality string; null on failure -The returned quality string is allocated by `malloc()` family and should be destroyed -by end users by calling `free()` on it. +The returned quality string is allocated by `malloc()` family and should be +destroyed by end users by calling `free()` on it. + +Region names can be quoted with curly braces, as for fai_fetch(). 
*/ +HTSLIB_EXPORT char *fai_fetchqual(const faidx_t *fai, const char *reg, int *len); +HTSLIB_EXPORT +char *fai_fetchqual64(const faidx_t *fai, const char *reg, hts_pos_t *len); /// Fetch the number of sequences /** @param fai Pointer to the faidx_t struct @return The number of sequences */ +HTSLIB_EXPORT int faidx_fetch_nseq(const faidx_t *fai) HTS_DEPRECATED("Please use faidx_nseq instead"); /// Fetch the sequence in a region @@ -194,8 +222,23 @@ int faidx_fetch_nseq(const faidx_t *fai) HTS_DEPRECATED("Please use faidx_nseq i The returned sequence is allocated by `malloc()` family and should be destroyed by end users by calling `free()` on it. */ +HTSLIB_EXPORT char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len); +/// Fetch the sequence in a region +/** @param fai Pointer to the faidx_t struct + @param c_name Region name + @param p_beg_i Beginning position number (zero-based) + @param p_end_i End position number (zero-based) + @param len Length of the region; -2 if c_name not present, -1 general error + @return Pointer to the sequence; null on failure + +The returned sequence is allocated by `malloc()` family and should be destroyed +by end users by calling `free()` on it. +*/ +HTSLIB_EXPORT +char *faidx_fetch_seq64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len); + /// Fetch the quality string in a region for FASTQ files /** @param fai Pointer to the faidx_t struct @param c_name Region name @@ -207,24 +250,68 @@ char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p The returned sequence is allocated by `malloc()` family and should be destroyed by end users by calling `free()` on it. 
+HTSLIB_EXPORT char *faidx_fetch_qual(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len); +/// Fetch the quality string in a region for FASTQ files +/** @param fai Pointer to the faidx_t struct + @param c_name Region name + @param p_beg_i Beginning position number (zero-based) + @param p_end_i End position number (zero-based) + @param len Length of the region; -2 if c_name not present, -1 general error + @return Pointer to the sequence; null on failure + +The returned sequence is allocated by `malloc()` family and should be destroyed +by end users by calling `free()` on it. +*/ +HTSLIB_EXPORT +char *faidx_fetch_qual64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len); + /// Query if sequence is present /** @param fai Pointer to the faidx_t struct @param seq Sequence name @return 1 if present or 0 if absent */ +HTSLIB_EXPORT int faidx_has_seq(const faidx_t *fai, const char *seq); /// Return number of sequences in fai index +HTSLIB_EXPORT int faidx_nseq(const faidx_t *fai); /// Return name of i-th sequence +HTSLIB_EXPORT const char *faidx_iseq(const faidx_t *fai, int i); /// Return sequence length, -1 if not present +HTSLIB_EXPORT int faidx_seq_len(const faidx_t *fai, const char *seq); +/// Parses a region string. +/** @param fai Pointer to the faidx_t struct + @param s Region string + @param tid Returns which i-th sequence is described in the region. + @param beg Returns the start of the region (0 based) + @param end Returns the one past last of the region (0 based) + @param flags Parsing method, see HTS_PARSE_* in hts.h. + @return pointer to end of parsed s if success, NULL if not. + + To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200" + are reference names, quote using curly braces. + Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example.
+*/ +HTSLIB_EXPORT +const char *fai_parse_region(const faidx_t *fai, const char *s, + int *tid, hts_pos_t *beg, hts_pos_t *end, + int flags); + +/// Sets the cache size of the underlying BGZF compressed file +/** @param fai Pointer to the faidx_t struct + * @param cache_size Selected cache size in bytes + */ +HTSLIB_EXPORT +void fai_set_cache_size(faidx_t *fai, int cache_size); + #ifdef __cplusplus } #endif diff --git a/htslib/hfile.h b/htslib/hfile.h index e82f809a0..fb2d9cd77 100644 --- a/htslib/hfile.h +++ b/htslib/hfile.h @@ -1,7 +1,7 @@ /// @file htslib/hfile.h /// Buffered low-level input/output streams. /* - Copyright (C) 2013-2016 Genome Research Ltd. + Copyright (C) 2013-2019 Genome Research Ltd. Author: John Marshall @@ -37,6 +37,8 @@ extern "C" { #endif struct hFILE_backend; +struct kstring_t; + /// Low-level input/output stream handle /** The fields of this structure are declared here solely for the benefit of the hFILE-related inline functions. They may change in future releases. @@ -61,6 +63,7 @@ The usual `fopen(3)` _mode_ letters are supported: one of `+` (update), `e` (close on `exec(2)`), `x` (create exclusively), `:` (indicates scheme-specific variable arguments follow). */ +HTSLIB_EXPORT hFILE *hopen(const char *filename, const char *mode, ...) HTS_RESULT_USED; /// Associate a stream with an existing open file descriptor @@ -72,6 +75,7 @@ between text and binary mode. For socket descriptors (on Windows), _mode_ should contain `s`. */ +HTSLIB_EXPORT hFILE *hdopen(int fd, const char *mode) HTS_RESULT_USED; /// Report whether the file name or URL denotes remote storage @@ -80,16 +84,34 @@ hFILE *hdopen(int fd, const char *mode) HTS_RESULT_USED; "Remote" means involving e.g. explicit network access, with the implication that callers may wish to cache such files' contents locally. 
*/ +HTSLIB_EXPORT int hisremote(const char *filename) HTS_RESULT_USED; +/// Append an extension or replace an existing extension +/** @param buffer The kstring to be used to store the modified filename + @param filename The filename to be (copied and) adjusted + @param replace If non-zero, one extension (if any) is removed first + @param extension The extension to be added (e.g. ".csi") + @return The modified filename (i.e., `buffer->s`), or NULL on error. + @since 1.10 + +If _filename_ is an URL, alters extensions at the end of the `hier-part`, +leaving any trailing `?query` or `#fragment` unchanged. +*/ +HTSLIB_EXPORT +char *haddextension(struct kstring_t *buffer, const char *filename, + int replace, const char *extension) HTS_RESULT_USED; + /// Flush (for output streams) and close the stream /** @return 0 if successful, or `EOF` (with _errno_ set) if an error occurred. */ +HTSLIB_EXPORT int hclose(hFILE *fp) HTS_RESULT_USED; /// Close the stream, without flushing or propagating errors /** For use while cleaning up after an error only. Preserves _errno_. */ +HTSLIB_EXPORT void hclose_abruptly(hFILE *fp); /// Return the stream's error indicator @@ -113,6 +135,7 @@ static inline void hclearerr(hFILE *fp) /** @return The resulting offset within the stream (as per `lseek(2)`), or negative if an error occurred. */ +HTSLIB_EXPORT off_t hseek(hFILE *fp, off_t offset, int whence) HTS_RESULT_USED; /// Report the current stream offset @@ -144,6 +167,7 @@ Bytes will be read into the buffer up to and including a delimiter, until EOF is reached, or _size-1_ bytes have been written, whichever comes first. The string will then be terminated with a NUL byte (`\0`). */ +HTSLIB_EXPORT ssize_t hgetdelim(char *buffer, size_t size, int delim, hFILE *fp) HTS_RESULT_USED; @@ -172,6 +196,7 @@ hgetln(char *buffer, size_t size, hFILE *fp) This function can be used as a replacement for `fgets(3)`, or together with kstring's `kgetline()` to read arbitrarily-long lines into a _kstring_t_. 
*/ +HTSLIB_EXPORT char *hgets(char *buffer, int size, hFILE *fp) HTS_RESULT_USED; /// Peek at characters to be read without removing them from buffers @@ -185,6 +210,7 @@ char *hgets(char *buffer, int size, hFILE *fp) HTS_RESULT_USED; The characters peeked at remain in the stream's internal buffer, and will be returned by later hread() etc calls. */ +HTSLIB_EXPORT ssize_t hpeek(hFILE *fp, void *buffer, size_t nbytes) HTS_RESULT_USED; /// Read a block of characters from the file @@ -266,6 +292,7 @@ hwrite(hFILE *fp, const void *buffer, size_t nbytes) This includes low-level flushing such as via `fdatasync(2)`. */ +HTSLIB_EXPORT int hflush(hFILE *fp) HTS_RESULT_USED; /// For hfile_mem: get the internal buffer and it's size from a hfile @@ -274,6 +301,7 @@ int hflush(hFILE *fp) HTS_RESULT_USED; The buffer returned should not be freed as this will happen when the hFILE is closed. */ +HTSLIB_EXPORT char *hfile_mem_get_buffer(hFILE *file, size_t *length); /// For hfile_mem: get the internal buffer and it's size from a hfile. @@ -284,6 +312,7 @@ buffer is granted to the caller, who now has responsibility for freeing it. From this point onwards, the hFILE should not be used for any purpose other than closing. */ +HTSLIB_EXPORT char *hfile_mem_steal_buffer(hFILE *file, size_t *length); #ifdef __cplusplus diff --git a/htslib/hts.h b/htslib/hts.h index 5c766f390..13050ad52 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -1,7 +1,7 @@ /// @file htslib/hts.h /// Format-neutral I/O, indexing, and iterator API functions. /* - Copyright (C) 2012-2016 Genome Research Ltd. + Copyright (C) 2012-2019 Genome Research Ltd. Copyright (C) 2010, 2012 Broad Institute. Portions copyright (C) 2003-2006, 2008-2010 by Heng Li @@ -30,6 +30,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include +#include #include "hts_defs.h" #include "hts_log.h" @@ -38,6 +39,15 @@ DEALINGS IN THE SOFTWARE. 
*/ extern "C" { #endif +// Separator used to split HTS_PATH (for plugins); REF_PATH (cram references) +#if defined(_WIN32) || defined(__MSYS__) +#define HTS_PATH_SEPARATOR_CHAR ';' +#define HTS_PATH_SEPARATOR_STR ";" +#else +#define HTS_PATH_SEPARATOR_CHAR ':' +#define HTS_PATH_SEPARATOR_STR ":" +#endif + #ifndef HTS_BGZF_TYPEDEF typedef struct BGZF BGZF; #define HTS_BGZF_TYPEDEF @@ -45,10 +55,11 @@ typedef struct BGZF BGZF; struct cram_fd; struct hFILE; struct hts_tpool; +struct sam_hdr_t; #ifndef KSTRING_T #define KSTRING_T kstring_t -typedef struct __kstring_t { +typedef struct kstring_t { size_t l, m; char *s; } kstring_t; @@ -60,7 +71,7 @@ typedef struct __kstring_t { /** * @hideinitializer - * Macro to expand a dynamic array of a given type + * Deprecated macro to expand a dynamic array of a given type * * @param type_t The type of the array elements * @param[in] n Requested number of elements of type type_t @@ -68,9 +79,12 @@ typedef struct __kstring_t { * @param[in,out] ptr Pointer to the array * * @discussion + * Do not use this macro. Use hts_resize() instead as allows allocation + * failures to be handled more gracefully. + * * The array *ptr will be expanded if necessary so that it can hold @p n * or more elements. If the array is expanded then the new size will be - * written to @p m and the value in @ptr may change. + * written to @p m and the value in @p ptr may change. * * It must be possible to take the address of @p ptr and @p m must be usable * as an lvalue. @@ -99,6 +113,9 @@ typedef struct __kstring_t { * @param[in,out] ptr Pointer to the array * * @discussion + * Do not use this macro. Use hts_resize() instead as allows allocation + * failures to be handled more gracefully. + * * As for hts_expand(), except the bytes that make up the array elements * between the old and new values of @p m are set to zero using memset(). 
* @@ -118,6 +135,51 @@ typedef struct __kstring_t { } \ } while (0) +// For internal use (by hts_resize()) only +HTSLIB_EXPORT +int hts_resize_array_(size_t, size_t, size_t, void *, void **, int, + const char *); + +#define HTS_RESIZE_CLEAR 1 + +/** + * @hideinitializer + * Macro to expand a dynamic array of a given type + * + * @param type_t The type of the array elements + * @param[in] num Requested number of elements of type type_t + * @param[in,out] size_ptr Pointer to where the size (in elements) of the + array is stored. + * @param[in,out] ptr Location of the pointer to the array + * @param[in] flags Option flags + * + * @return 0 for success, or negative if an error occurred. + * + * @discussion + * The array *ptr will be expanded if necessary so that it can hold @p num + * or more elements. If the array is expanded then the new size will be + * written to @p *size_ptr and the value in @p *ptr may change. + * + * If ( @p flags & HTS_RESIZE_CLEAR ) is set, any newly allocated memory will + * be cleared. + */ + +#define hts_resize(type_t, num, size_ptr, ptr, flags) \ + ((num) > (*(size_ptr)) \ + ? hts_resize_array_(sizeof(type_t), (num), \ + sizeof(*(size_ptr)), (size_ptr), \ + (void **)(ptr), (flags), __func__) \ + : 0) + +/** + * Wrapper function for free(). Enables memory deallocation across DLL + * boundary. Should be used by all applications, which are compiled + * with a different standard library than htslib and call htslib + * methods that return dynamically allocated data. 
+ */ +HTSLIB_EXPORT +void hts_free(void *ptr); + /************ * File I/O * ************/ @@ -140,11 +202,13 @@ enum htsExactFormat { sam, bam, bai, cram, crai, vcf, bcf, csi, gzi, tbi, bed, htsget, json HTS_DEPRECATED_ENUM("Use htsExactFormat 'htsget' instead") = htsget, + empty_format, // File is empty (or empty after decompression) + fasta_format, fastq_format, fai_format, fqi_format, format_maximum = 32767 }; enum htsCompression { - no_compression, gzip, bgzf, custom, + no_compression, gzip, bgzf, custom, bzip2_compression, compression_maximum = 32767 }; @@ -157,6 +221,9 @@ typedef struct htsFormat { void *specific; // format specific options; see struct hts_opt. } htsFormat; +struct __hts_idx_t; +typedef struct __hts_idx_t hts_idx_t; + // Maintainers note htsFile cannot be an opaque structure because some of its // fields are part of libhts.so's ABI (hence these fields must not be moved): // - fp is used in the public sam_itr_next()/etc macros @@ -174,7 +241,11 @@ typedef struct { struct cram_fd *cram; struct hFILE *hfile; } fp; + void *state; // format specific state information htsFormat format; + hts_idx_t *idx; + const char *fnidx; + struct sam_hdr_t *bam_header; } htsFile; // A combined thread pool and queue allocation size. @@ -256,6 +327,12 @@ typedef struct hts_opt { #define HTS_FILE_OPTS_INIT {{0},0} +/* + * Explicit index file name delimiter, see below + */ +#define HTS_IDX_DELIM "##idx##" + + /********************** * Exported functions * **********************/ @@ -266,6 +343,7 @@ typedef struct hts_opt { * Returns 0 on success; * -1 on failure. */ +HTSLIB_EXPORT int hts_opt_add(hts_opt **opts, const char *c_arg); /* @@ -274,11 +352,13 @@ int hts_opt_add(hts_opt **opts, const char *c_arg); * Returns 0 on success * -1 on failure */ +HTSLIB_EXPORT int hts_opt_apply(htsFile *fp, hts_opt *opts); /* * Frees an hts_opt list. 
*/ +HTSLIB_EXPORT void hts_opt_free(hts_opt *opts); /* @@ -289,6 +369,7 @@ void hts_opt_free(hts_opt *opts); * Returns 0 on success * -1 on failure. */ +HTSLIB_EXPORT int hts_parse_format(htsFormat *opt, const char *str); /* @@ -302,6 +383,7 @@ int hts_parse_format(htsFormat *opt, const char *str); * Returns 0 on success * -1 on failure. */ +HTSLIB_EXPORT int hts_parse_opt_list(htsFormat *opt, const char *str); /*! @abstract Table for converting a nucleotide character to 4-bit encoding. @@ -326,14 +408,28 @@ extern const int seq_nt16_int[]; @return For released versions, a string like "N.N[.N]"; or git describe output if using a library built within a Git repository. */ +HTSLIB_EXPORT const char *hts_version(void); +/*! + @abstract Compile-time HTSlib version number, for use in #if checks + @return For released versions X.Y[.Z], an integer of the form XYYYZZ; + useful for preprocessor conditionals such as + #if HTS_VERSION >= 101000 // Check for v1.10 or later +*/ +// Maintainers: Bump this in the final stage of preparing a new release. +// Immediately after release, bump ZZ to 90 to distinguish in-development +// Git repository builds from the release; you may wish to increment this +// further when significant features are merged. +#define HTS_VERSION 101000 + /*! @abstract Determine format by peeking at the start of a file @param fp File opened for reading, positioned at the beginning @param fmt Format structure that will be filled out on return @return 0 for success, or negative if an error occurred. */ +HTSLIB_EXPORT int hts_detect_format(struct hFILE *fp, htsFormat *fmt); /*! @@ -341,11 +437,15 @@ int hts_detect_format(struct hFILE *fp, htsFormat *fmt); @param fmt Format structure holding type, version, compression, etc. @return Description string, to be freed by the caller after use. */ +HTSLIB_EXPORT char *hts_format_description(const htsFormat *format); /*! 
- @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file - @param fn The file name or "-" for stdin/stdout + @abstract Open a sequence data (SAM/BAM/CRAM) or variant data (VCF/BCF) + or possibly-compressed textual line-orientated file + @param fn The file name or "-" for stdin/stdout. For indexed files + with a non-standard naming, the file name can include the + name of the index file delimited with HTS_IDX_DELIM @param mode Mode matching / [rwa][bceguxz0-9]* / @discussion With 'r' opens for reading; any further format mode letters are ignored @@ -370,6 +470,7 @@ char *hts_format_description(const htsFormat *format); [rw]z .. compressed VCF [rw] .. uncompressed VCF */ +HTSLIB_EXPORT htsFile *hts_open(const char *fn, const char *mode); /*! @@ -386,6 +487,7 @@ htsFile *hts_open(const char *fn, const char *mode); like pointers to the reference or information on compression levels, block sizes, etc. */ +HTSLIB_EXPORT htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt); /*! @@ -393,6 +495,7 @@ htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt) @param fn The already-open file handle @param mode Open mode, as per hts_open() */ +HTSLIB_EXPORT htsFile *hts_hopen(struct hFILE *fp, const char *fn, const char *mode); /*! @@ -400,6 +503,7 @@ htsFile *hts_hopen(struct hFILE *fp, const char *fn, const char *mode); @param fp The file handle to be closed @return 0 for success, or negative if an error occurred. */ +HTSLIB_EXPORT int hts_close(htsFile *fp); /*! @@ -407,6 +511,7 @@ int hts_close(htsFile *fp); @param fp The file handle @return Read-only pointer to the file's htsFormat. */ +HTSLIB_EXPORT const htsFormat *hts_get_format(htsFile *fp); /*! @@ -414,6 +519,7 @@ const htsFormat *hts_get_format(htsFile *fp); @ param format Format structure containing the file type. @ return A string ("sam", "bam", etc) or "?" for unknown formats. */ +HTSLIB_EXPORT const char *hts_format_file_extension(const htsFormat *format); /*! 
@@ -423,9 +529,12 @@ const char *hts_format_file_extension(const htsFormat *format); @param ... Optional arguments, dependent on the option used. @return 0 for success, or negative if an error occurred. */ +HTSLIB_EXPORT int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...); +HTSLIB_EXPORT int hts_getline(htsFile *fp, int delimiter, kstring_t *str); +HTSLIB_EXPORT char **hts_readlines(const char *fn, int *_n); /*! @abstract Parse comma-separated list or read list from a file @@ -435,6 +544,7 @@ char **hts_readlines(const char *fn, int *_n); @return NULL on failure or pointer to newly allocated array of strings */ +HTSLIB_EXPORT char **hts_readlist(const char *fn, int is_file, int *_n); /*! @@ -445,6 +555,7 @@ char **hts_readlist(const char *fn, int is_file, int *_n); @notes This function creates non-shared threads for use solely by fp. The hts_set_thread_pool function is the recommended alternative. */ +HTSLIB_EXPORT int hts_set_threads(htsFile *fp, int n); /*! @@ -453,6 +564,7 @@ int hts_set_threads(htsFile *fp, int n); @param p A pool of worker threads, previously allocated by hts_create_threads(). @return 0 for success, or negative if an error occurred. */ +HTSLIB_EXPORT int hts_set_thread_pool(htsFile *fp, htsThreadPool *p); /*! @@ -461,6 +573,7 @@ int hts_set_thread_pool(htsFile *fp, htsThreadPool *p); @param fp The file handle @param n The size of cache, in bytes */ +HTSLIB_EXPORT void hts_set_cache_size(htsFile *fp, int n); /*! @@ -470,6 +583,7 @@ void hts_set_cache_size(htsFile *fp, int n); Called before *_hdr_read(), this provides the name of a .fai file used to provide a reference list if the htsFile contains no @SQ headers. 
*/ +HTSLIB_EXPORT int hts_set_fai_filename(htsFile *fp, const char *fn_aux); @@ -483,6 +597,7 @@ int hts_set_fai_filename(htsFile *fp, const char *fn_aux); @discussion Check if the BGZF end-of-file (EOF) marker is present */ +HTSLIB_EXPORT int hts_check_EOF(htsFile *fp); /************ @@ -509,12 +624,26 @@ When REST or NONE is used, idx is also ignored and may be NULL. #define HTS_FMT_TBI 2 #define HTS_FMT_CRAI 3 -struct __hts_idx_t; -typedef struct __hts_idx_t hts_idx_t; +// Almost INT64_MAX, but when cast into a 32-bit int it's +// also INT_MAX instead of -1. This avoids bugs with old code +// using the new hts_pos_t data type. +#define HTS_POS_MAX ((((int64_t)INT_MAX)<<32)|INT_MAX) +#define HTS_POS_MIN INT64_MIN +#define PRIhts_pos PRId64 +typedef int64_t hts_pos_t; + +// For comparison with previous release: +// +// #define HTS_POS_MAX INT_MAX +// #define HTS_POS_MIN INT_MIN +// #define PRIhts_pos PRId32 +// typedef int32_t hts_pos_t; typedef struct { - uint32_t beg, end; -} hts_pair32_t; + hts_pos_t beg, end; +} hts_pair_pos_t; + +typedef hts_pair_pos_t hts_pair32_t; // For backwards compatibility typedef struct { uint64_t u, v; @@ -527,23 +656,28 @@ typedef struct { typedef struct { const char *reg; + hts_pair_pos_t *intervals; int tid; - hts_pair32_t *intervals; uint32_t count; - uint32_t min_beg, max_end; + hts_pos_t min_beg, max_end; } hts_reglist_t; -typedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, int *beg, int *end); +typedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, hts_pos_t *beg, hts_pos_t *end); typedef int hts_seek_func(void *fp, int64_t offset, int where); typedef int64_t hts_tell_func(void *fp); typedef struct { - uint32_t read_rest:1, finished:1, is_cram:1, dummy:29; - int tid, beg, end, n_off, i; - int curr_tid, curr_beg, curr_end; - uint64_t curr_off; - hts_pair64_t *off; + uint32_t read_rest:1, finished:1, is_cram:1, nocoor:1, multi:1, dummy:27; + int tid, n_off, i, n_reg; + hts_pos_t beg, end; + 
hts_reglist_t *reg_list; + int curr_tid, curr_reg, curr_intv; + hts_pos_t curr_beg, curr_end; + uint64_t curr_off, nocoor_off; + hts_pair64_max_t *off; hts_readrec_func *readrec; + hts_seek_func *seek; + hts_tell_func *tell; struct { int n, m; int *a; @@ -555,26 +689,74 @@ typedef struct { uint64_t min_off, max_off; } aux_key_t; -typedef struct { - uint32_t read_rest:1, finished:1, is_cram:1, nocoor:1, dummy:28; - hts_reglist_t *reg_list; - int n_reg, i; - int curr_tid, curr_intv, curr_beg, curr_end, curr_reg; - hts_pair64_max_t *off; - int n_off; - uint64_t curr_off, nocoor_off; - hts_readrec_func *readrec; - hts_seek_func *seek; - hts_tell_func *tell; -} hts_itr_multi_t; +typedef hts_itr_t hts_itr_multi_t; #define hts_bin_first(l) (((1<<(((l)<<1) + (l))) - 1) / 7) #define hts_bin_parent(l) (((l) - 1) >> 3) - hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls); - void hts_idx_destroy(hts_idx_t *idx); - int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped); - void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset); +/////////////////////////////////////////////////////////// +// Low-level API for building indexes. + +/// Create a BAI/CSI/TBI type index structure +/** @param n Initial number of targets + @param fmt Format, one of HTS_FMT_CSI, HTS_FMT_BAI or HTS_FMT_TBI + @param offset0 Initial file offset + @param min_shift Number of bits for the minimal interval + @param n_lvls Number of levels in the binning index + @return An initialised hts_idx_t struct on success; NULL on failure + +The struct returned by a successful call should be freed via hts_idx_destroy() +when it is no longer needed. 
+*/ +HTSLIB_EXPORT +hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls); + +/// Free a BAI/CSI/TBI type index +/** @param idx Index structure to free + */ +HTSLIB_EXPORT +void hts_idx_destroy(hts_idx_t *idx); + +/// Push an index entry +/** @param idx Index + @param tid Target id + @param beg Range start (zero-based) + @param end Range end (zero-based, half-open) + @param offset File offset + @param is_mapped Range corresponds to a mapped read + @return 0 on success; -1 on failure + +The @p is_mapped parameter is used to update the n_mapped / n_unmapped counts +stored in the meta-data bin. + */ +HTSLIB_EXPORT +int hts_idx_push(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t offset, int is_mapped); + +/// Finish building an index +/** @param idx Index + @param final_offset Last file offset + @return 0 on success; non-zero on failure. +*/ +HTSLIB_EXPORT +int hts_idx_finish(hts_idx_t *idx, uint64_t final_offset); + +/// Returns index format +/** @param idx Index + @return One of HTS_FMT_CSI, HTS_FMT_BAI or HTS_FMT_TBI +*/ +HTSLIB_EXPORT +int hts_idx_fmt(hts_idx_t *idx); + +/// Add name to TBI index meta-data +/** @param idx Index + @param tid Target identifier + @param name Target name + @return Index number of name in names list on success; -1 on failure. +*/ +HTSLIB_EXPORT +int hts_idx_tbi_name(hts_idx_t *idx, int tid, const char *name); + +// Index loading and saving /// Save an index to a file /** @param idx Index to be written @@ -582,6 +764,7 @@ typedef struct { @param fmt One of the HTS_FMT_* index formats @return 0 if successful, or negative if an error occurred. */ +HTSLIB_EXPORT int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt) HTS_RESULT_USED; /// Save an index to a specific file @@ -591,23 +774,85 @@ int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt) HTS_RESULT_USED; @param fmt One of the HTS_FMT_* index formats @return 0 if successful, or negative if an error occurred. 
*/ +HTSLIB_EXPORT int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt) HTS_RESULT_USED; /// Load an index file /** @param fn BAM/BCF/etc filename, to which .bai/.csi/etc will be added or - the extension substituted, to search for an existing index file + the extension substituted, to search for an existing index file. + In case of a non-standard naming, the file name can include the + name of the index file delimited with HTS_IDX_DELIM. @param fmt One of the HTS_FMT_* index formats @return The index, or NULL if an error occurred. + +If @p fn contains the string "##idx##" (HTS_IDX_DELIM), the part before +the delimiter will be used as the name of the data file and the part after +it will be used as the name of the index. + +Otherwise, this function tries to work out the index name as follows: + + It will try appending ".csi" to @p fn + It will try substituting an existing suffix (e.g. .bam, .vcf) with ".csi" + Then, if @p fmt is HTS_FMT_BAI: + It will try appending ".bai" to @p fn + It will try substituting the existing suffix (e.g. .bam) with ".bai" + else if @p fmt is HTS_FMT_TBI: + It will try appending ".tbi" to @p fn + It will try substituting the existing suffix (e.g. .vcf) with ".tbi" + +If the index file is remote (served over a protocol like https), first a check +is made to see if a locally cached copy is available. This is done for all +of the possible names listed above. If a cached copy is not available then +the index will be downloaded and stored in the current working directory, +with the same name as the remote index. + + Equivalent to hts_idx_load3(fn, NULL, fmt, HTS_IDX_SAVE_REMOTE); */ +HTSLIB_EXPORT hts_idx_t *hts_idx_load(const char *fn, int fmt); /// Load a specific index file /** @param fn Input BAM/BCF/etc filename @param fnidx The input index filename @return The index, or NULL if an error occurred. + + Equivalent to hts_idx_load3(fn, fnidx, 0, 0); + + This function will not attempt to save index files locally.
*/ +HTSLIB_EXPORT hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx); +/// Load a specific index file +/** @param fn Input BAM/BCF/etc filename + @param fnidx The input index filename + @param fmt One of the HTS_FMT_* index formats + @param flags Flags to alter behaviour (see description) + @return The index, or NULL if an error occurred. + + If @p fnidx is NULL, the index name will be derived from @p fn in the + same way as hts_idx_load(). + + If @p fnidx is not NULL, @p fmt is ignored. + + The @p flags parameter can be set to a combination of the following + values: + + HTS_IDX_SAVE_REMOTE Save a local copy of any remote indexes + HTS_IDX_SILENT_FAIL Fail silently if the index is not present + + The index struct returned by a successful call should be freed + via hts_idx_destroy() when it is no longer needed. +*/ +HTSLIB_EXPORT +hts_idx_t *hts_idx_load3(const char *fn, const char *fnidx, int fmt, int flags); + +/// Flags for hts_idx_load3() ( and also sam_idx_load3(), tbx_idx_load3() ) +#define HTS_IDX_SAVE_REMOTE 1 +#define HTS_IDX_SILENT_FAIL 2 + +/////////////////////////////////////////////////////////// +// Functions for accessing meta-data stored in indexes /// Get extra index meta-data /** @param idx The index @@ -620,6 +865,7 @@ hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx); the results themselves, including knowing what sort of data to expect; byte swapping etc. */ +HTSLIB_EXPORT uint8_t *hts_idx_get_meta(hts_idx_t *idx, uint32_t *l_meta); /// Set extra index meta-data @@ -634,13 +880,42 @@ uint8_t *hts_idx_get_meta(hts_idx_t *idx, uint32_t *l_meta); If is_copy != 0, a copy of the input data is taken. If not, ownership of the data pointed to by *meta passes to the index. 
*/ +HTSLIB_EXPORT int hts_idx_set_meta(hts_idx_t *idx, uint32_t l_meta, uint8_t *meta, int is_copy); - int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped); - uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx); +/// Get number of mapped and unmapped reads from an index +/** @param idx Index + @param tid Target ID + @param[out] mapped Location to store number of mapped reads + @param[out] unmapped Location to store number of unmapped reads + @return 0 on success; -1 on failure (data not available) + + BAI and CSI indexes store information on the number of reads for each + target that were mapped or unmapped (unmapped reads will generally have + a paired read that is mapped to the target). This function returns this + information if it is available. + + @note Cram CRAI indexes do not include this information. +*/ +HTSLIB_EXPORT +int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped); + +/// Return the number of unplaced reads from an index +/** @param idx Index + @return Unplaced reads count + Unplaced reads are not linked to any reference (e.g. RNAME is '*' in SAM + files). +*/ +HTSLIB_EXPORT +uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx); + +/////////////////////////////////////////////////////////// +// Region parsing #define HTS_PARSE_THOUSANDS_SEP 1 ///< Ignore ',' separators within numbers +#define HTS_PARSE_ONE_COORD 2 ///< chr:pos means chr:pos-pos and not chr:pos-end +#define HTS_PARSE_LIST 4 ///< Expect a comma separated list of regions. (Disables HTS_PARSE_THOUSANDS_SEP) /// Parse a numeric string /** The number may be expressed in scientific notation, and optionally may @@ -654,39 +929,235 @@ int hts_idx_set_meta(hts_idx_t *idx, uint32_t l_meta, uint8_t *meta, int is_copy When @a strend is NULL, a warning will be printed (if hts_verbose is HTS_LOG_WARNING or more) if there are any trailing characters after the number.
*/ +HTSLIB_EXPORT long long hts_parse_decimal(const char *str, char **strend, int flags); +typedef int (*hts_name2id_f)(void*, const char*); +typedef const char *(*hts_id2name_f)(void*, int); + /// Parse a "CHR:START-END"-style region string /** @param str String to be parsed @param beg Set on return to the 0-based start of the region @param end Set on return to the 1-based end of the region @return Pointer to the colon or '\0' after the reference sequence name, or NULL if @a str could not be parsed. + + NOTE: For compatibility with hts_parse_reg only. + Please use hts_parse_region instead. */ +HTSLIB_EXPORT +const char *hts_parse_reg64(const char *str, hts_pos_t *beg, hts_pos_t *end); + +/// Parse a "CHR:START-END"-style region string +/** @param str String to be parsed + @param beg Set on return to the 0-based start of the region + @param end Set on return to the 1-based end of the region + @return Pointer to the colon or '\0' after the reference sequence name, + or NULL if @a str could not be parsed. +*/ +HTSLIB_EXPORT const char *hts_parse_reg(const char *str, int *beg, int *end); - hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec); - void hts_itr_destroy(hts_itr_t *iter); +/// Parse a "CHR:START-END"-style region string +/** @param str String to be parsed + @param tid Set on return (if not NULL) to be reference index (-1 if invalid) + @param beg Set on return to the 0-based start of the region + @param end Set on return to the 1-based end of the region + @param getid Function pointer. Called if not NULL to set tid. + @param hdr Caller data passed to getid. + @param flags Bitwise HTS_PARSE_* flags listed above. + @return Pointer to the byte after the end of the entire region + specifier (including any trailing comma) on success, + or NULL if @a str could not be parsed. + + A variant of hts_parse_reg which is reference-id aware. It uses + the iterator name2id callbacks to validate the region tokenisation works. 
+ + This is necessary due to GRCh38 HLA additions which have reference names + like "HLA-DRB1*12:17". + + To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200" + are reference names, quote using curly braces. + Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example. + + Flags are used to control how parsing works, and can be one of the below. + + HTS_PARSE_THOUSANDS_SEP: + Ignore commas in numbers. For example with this flag 1,234,567 + is interpreted as 1234567. + + HTS_PARSE_LIST: + If present, the region is assumed to be a comma separated list and + position parsing will not contain commas (this implicitly + clears HTS_PARSE_THOUSANDS_SEP in the call to hts_parse_decimal). + On success the return pointer will be the start of the next region, ie + the character after the comma. (If *ret != '\0' then the caller can + assume another region is present in the list.) + + If not set then positions may contain commas. In this case the return + value should point to the end of the string, or NULL on failure. + + HTS_PARSE_ONE_COORD: + If present, X:100 is treated as the single base pair region X:100-100. + In this case X:-100 is shorthand for X:1-100 and X:100- is X:100-<end>. + (This is the standard bcftools region convention.) + + When not set X:100 is considered to be X:100-<end> where <end> is + the end of chromosome X (set to INT_MAX here). X:100- and X:-100 are + invalid. + (This is the standard samtools region convention.) + + Note the supplied string expects 1 based inclusive coordinates, but the + returned coordinates start from 0 and are half open, so pos0 is valid + for use in e.g.
"for (pos0 = beg; pos0 < end; pos0++) {...}" + + If NULL is returned, the value in tid may give additional information + about the error: + + -2 Failed to parse @p hdr; or out of memory + -1 The reference in @p str has mismatched braces, or does not + exist in @p hdr + >= 0 The specified range in @p str could not be parsed +*/ +HTSLIB_EXPORT +const char *hts_parse_region(const char *s, int *tid, hts_pos_t *beg, + hts_pos_t *end, hts_name2id_f getid, void *hdr, + int flags); - typedef int (*hts_name2id_f)(void*, const char*); - typedef const char *(*hts_id2name_f)(void*, int); - typedef hts_itr_t *hts_itr_query_func(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec); - hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec); - int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data) HTS_RESULT_USED; - const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr); // free only the array, not the values +/////////////////////////////////////////////////////////// +// Generic iterators +// +// These functions provide the low-level infrastructure for iterators. +// Wrappers around these are used to make iterators for specific file types. +// See: +// htslib/sam.h for SAM/BAM/CRAM iterators +// htslib/vcf.h for VCF/BCF iterators +// htslib/tbx.h for files indexed by tabix + +/// Create a single-region iterator +/** @param idx Index + @param tid Target ID + @param beg Start of region + @param end End of region + @param readrec Callback to read a record from the input file + @return An iterator on success; NULL on failure + + The iterator struct returned by a successful call should be freed + via hts_itr_destroy() when it is no longer needed.
+ */ +HTSLIB_EXPORT +hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec); + +/// Free an iterator +/** @param iter Iterator to free + */ +HTSLIB_EXPORT +void hts_itr_destroy(hts_itr_t *iter); + +typedef hts_itr_t *hts_itr_query_func(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec); + +/// Create a single-region iterator from a text region specification +/** @param idx Index + @param reg Region specifier + @param getid Callback function to return the target ID for a name + @param hdr Input file header + @param itr_query Callback function returning an iterator for a numeric tid, + start and end position + @param readrec Callback to read a record from the input file + @return An iterator on success; NULL on error + + The iterator struct returned by a successful call should be freed + via hts_itr_destroy() when it is no longer needed. + */ +HTSLIB_EXPORT +hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec); + +/// Return the next record from an iterator +/** @param fp Input file handle + @param iter Iterator + @param r Pointer to record placeholder + @param data Data passed to the readrec callback + @return >= 0 on success, -1 when there is no more data, < -1 on error + */ +HTSLIB_EXPORT +int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data) HTS_RESULT_USED; + +/// Return a list of target names from an index +/** @param idx Index + @param[out] n Location to store the number of targets + @param getid Callback function to get the name for a target ID + @param hdr Header from indexed file + @return An array of pointers to the names on success; NULL on failure + + @note The names are pointers into the header data structure. When cleaning + up, only the array should be freed, not the names. 
+ */ +HTSLIB_EXPORT +const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr); // free only the array, not the values /********************************** * Iterator with multiple regions * **********************************/ -typedef hts_itr_multi_t *hts_itr_multi_query_func(const hts_idx_t *idx, hts_itr_multi_t *itr); -hts_itr_multi_t *hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_multi_t *iter); -hts_itr_multi_t *hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_multi_t *iter); -hts_itr_multi_t *hts_itr_regions(const hts_idx_t *idx, hts_reglist_t *reglist, int count, hts_name2id_f getid, void *hdr, hts_itr_multi_query_func *itr_specific, hts_readrec_func *readrec, hts_seek_func *seek, hts_tell_func *tell); -int hts_itr_multi_next(htsFile *fd, hts_itr_multi_t *iter, void *r); +typedef int hts_itr_multi_query_func(const hts_idx_t *idx, hts_itr_t *itr); +HTSLIB_EXPORT +int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter); +HTSLIB_EXPORT +int hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_t *iter); + +/// Create a multi-region iterator from a region list +/** @param idx Index + @param reglist Region list + @param count Number of items in region list + @param getid Callback to convert names to target IDs + @param hdr Indexed file header (passed to getid) + @param itr_specific Filetype-specific callback function + @param readrec Callback to read an input file record + @param seek Callback to seek in the input file + @param tell Callback to return current input file location + @return An iterator on success; NULL on failure + + The iterator struct returned by a successful call should be freed + via hts_itr_destroy() when it is no longer needed. 
+ */ +HTSLIB_EXPORT +hts_itr_t *hts_itr_regions(const hts_idx_t *idx, hts_reglist_t *reglist, int count, hts_name2id_f getid, void *hdr, hts_itr_multi_query_func *itr_specific, hts_readrec_func *readrec, hts_seek_func *seek, hts_tell_func *tell); + +/// Return the next record from an iterator +/** @param fd Input file handle + @param iter Iterator + @param r Pointer to record placeholder + @return >= 0 on success, -1 when there is no more data, < -1 on error + */ +HTSLIB_EXPORT +int hts_itr_multi_next(htsFile *fd, hts_itr_t *iter, void *r); + +/// Create a region list from a char array +/** @param argv Char array of target:interval elements, e.g. chr1:2500-3600, chr1:5100, chr2 + @param argc Number of items in the array + @param r_count Pointer to the number of items in the resulting region list + @param hdr Header for the sam/bam/cram file + @param getid Callback to convert target names to target ids. + @return A region list on success, NULL on failure + + The hts_reglist_t struct returned by a successful call should be freed + via hts_reglist_free() when it is no longer needed.
+ */ +HTSLIB_EXPORT +hts_reglist_t *hts_reglist_create(char **argv, int argc, int *r_count, void *hdr, hts_name2id_f getid); + +/// Free a region list +/** @param reglist Region list + @param count Number of items in the list + */ +HTSLIB_EXPORT void hts_reglist_free(hts_reglist_t *reglist, int count); -void hts_itr_multi_destroy(hts_itr_multi_t *iter); + +/// Free a multi-region iterator +/** @param iter Iterator to free + */ +#define hts_itr_multi_destroy(iter) hts_itr_destroy(iter) /** @@ -701,6 +1172,7 @@ void hts_itr_multi_destroy(hts_itr_multi_t *iter); #define FT_BCF (1<<2) #define FT_BCF_GZ (FT_GZ|FT_BCF) #define FT_STDIN (1<<3) + HTSLIB_EXPORT int hts_file_type(const char *fname); @@ -711,7 +1183,9 @@ void hts_itr_multi_destroy(hts_itr_multi_t *iter); struct errmod_t; typedef struct errmod_t errmod_t; +HTSLIB_EXPORT errmod_t *errmod_init(double depcorr); +HTSLIB_EXPORT void errmod_destroy(errmod_t *em); /* @@ -720,6 +1194,7 @@ void errmod_destroy(errmod_t *em); bases[i]: qual:6, strand:1, base:4 q[i*m+j]: phred-scaled likelihood of (i,j) */ +HTSLIB_EXPORT int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q); @@ -757,6 +1232,7 @@ On failure, errno will be set to EINVAL if the values of l_ref or l_query were invalid; or ENOMEM if a memory allocation failed. */ +HTSLIB_EXPORT int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_query, const uint8_t *iqual, const probaln_par_t *c, int *state, uint8_t *q); @@ -781,29 +1257,34 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu * * @return hts_md5_context pointer on success, NULL otherwise. */ + HTSLIB_EXPORT hts_md5_context *hts_md5_init(void); /*! @abstract Updates the context with the MD5 of the data. */ + HTSLIB_EXPORT void hts_md5_update(hts_md5_context *ctx, const void *data, unsigned long size); /*! 
@abstract Computes the final 128-bit MD5 hash from the given context */ + HTSLIB_EXPORT void hts_md5_final(unsigned char *digest, hts_md5_context *ctx); /*! @abstract Resets an md5_context to the initial state, as returned * by hts_md5_init(). */ + HTSLIB_EXPORT void hts_md5_reset(hts_md5_context *ctx); /*! @abstract Converts a 128-bit MD5 hash into a 33-byte nul-termninated * hex string. */ + HTSLIB_EXPORT void hts_md5_hex(char *hex, const unsigned char *digest); /*! @abstract Deallocates any memory allocated by hts_md5_init. */ + HTSLIB_EXPORT void hts_md5_destroy(hts_md5_context *ctx); - -static inline int hts_reg2bin(int64_t beg, int64_t end, int min_shift, int n_lvls) +static inline int hts_reg2bin(hts_pos_t beg, hts_pos_t end, int min_shift, int n_lvls) { int l, s = min_shift, t = ((1<<((n_lvls<<1) + n_lvls)) - 1) / 7; for (--end, l = n_lvls; l > 0; --l, s += 3, t -= 1<<((l<<1)+l)) diff --git a/htslib/hts_defs.h b/htslib/hts_defs.h index 3bf4a4630..e041930a5 100644 --- a/htslib/hts_defs.h +++ b/htslib/hts_defs.h @@ -1,6 +1,6 @@ /* hts_defs.h -- Miscellaneous definitions. - Copyright (C) 2013-2015,2017 Genome Research Ltd. + Copyright (C) 2013-2015,2017, 2019 Genome Research Ltd. Author: John Marshall @@ -25,6 +25,8 @@ DEALINGS IN THE SOFTWARE. */ #ifndef HTSLIB_HTS_DEFS_H #define HTSLIB_HTS_DEFS_H +#include // For __MINGW_PRINTF_FORMAT macro + #ifdef __clang__ #ifdef __has_attribute #define HTS_COMPILER_HAS(attribute) __has_attribute(attribute) @@ -93,4 +95,20 @@ DEALINGS IN THE SOFTWARE. 
*/ #define HTS_FORMAT(type, idx, first) #endif +#if defined(_WIN32) || defined(__CYGWIN__) +#define HTS_DLL_EXPORT __declspec(dllexport) +#elif HTS_COMPILER_HAS(__visibility__) || HTS_GCC_AT_LEAST(4,0) +#define HTS_DLL_EXPORT __attribute__((__visibility__("default"))) +#elif defined(__SUNPRO_C) && __SUNPRO_C >= 0x550 +#define HTS_DLL_EXPORT __global +#else +#define HTS_DLL_EXPORT +#endif + +#if !(defined(_WIN32) || defined(__CYGWIN__)) || defined(HTS_BUILDING_LIBRARY) +#define HTSLIB_EXPORT HTS_DLL_EXPORT +#else +#define HTSLIB_EXPORT +#endif + #endif diff --git a/htslib/hts_log.h b/htslib/hts_log.h index 98e7a5163..b2336a4df 100644 --- a/htslib/hts_log.h +++ b/htslib/hts_log.h @@ -46,10 +46,12 @@ enum htsLogLevel { }; /// Sets the selected log level. +HTSLIB_EXPORT void hts_set_log_level(enum htsLogLevel level); /// Gets the selected log level. -enum htsLogLevel hts_get_log_level(); +HTSLIB_EXPORT +enum htsLogLevel hts_get_log_level(void); /// Selected log level. /*! @@ -68,6 +70,7 @@ extern int hts_verbose; * \param context Context where the event occurred. Typically set to "__func__". * \param format Format string with placeholders, like printf. */ +HTSLIB_EXPORT void hts_log(enum htsLogLevel severity, const char *context, const char *format, ...) HTS_FORMAT(HTS_PRINTF_FMT, 3, 4); diff --git a/htslib/hts_os.h b/htslib/hts_os.h index 3a671d4a2..f4c3b32ef 100644 --- a/htslib/hts_os.h +++ b/htslib/hts_os.h @@ -1,7 +1,7 @@ /// @file hts_os.h /// Operating System specific tweaks, for compatibility with POSIX. /* - Copyright (C) 2017 Genome Research Ltd. + Copyright (C) 2017, 2019 Genome Research Ltd. Author: James Bonfield @@ -26,10 +26,23 @@ DEALINGS IN THE SOFTWARE. 
*/ #ifndef HTSLIB_HTS_OS_H #define HTSLIB_HTS_OS_H -extern void hts_srand48(long seed); -extern double hts_erand48(unsigned short xseed[3]); -extern double hts_drand48(void); -extern long hts_lrand48(void); +#include "hts_defs.h" + +#ifdef __cplusplus +extern "C" { +#endif + +HTSLIB_EXPORT +void hts_srand48(long seed); + +HTSLIB_EXPORT +double hts_erand48(unsigned short xseed[3]); + +HTSLIB_EXPORT +double hts_drand48(void); + +HTSLIB_EXPORT +long hts_lrand48(void); #if defined(_WIN32) && !defined(__CYGWIN__) // Windows usually lacks *rand48(), but cygwin provides them. @@ -44,6 +57,9 @@ extern long hts_lrand48(void); extern int is_cygpty(int fd); #endif +#ifdef __cplusplus +} +#endif #if defined(__MINGW32__) #include diff --git a/htslib/kbitset.h b/htslib/kbitset.h index 22fb34d75..4b7e4948a 100644 --- a/htslib/kbitset.h +++ b/htslib/kbitset.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (C) 2015 Genome Research Ltd. + Copyright (C) 2015, 2018 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -37,7 +37,7 @@ if (kbs_exists(bset, 68)) printf("68 present\n"); - kbitset_iter itr; + kbitset_iter_t itr; int i; kbs_start(&itr); while ((i = kbs_next(bset, &itr)) >= 0) @@ -61,10 +61,18 @@ #define KBS_MASK(i) (1UL << ((i) % KBS_ELTBITS)) typedef struct kbitset_t { - size_t n; + size_t n, n_max; unsigned long b[1]; } kbitset_t; +// (For internal use only.) Returns a mask (like 00011111) showing +// which bits are in use in the last slot (for the given ni) set. +static inline unsigned long kbs_last_mask(size_t ni) +{ + unsigned long mask = KBS_MASK(ni) - 1; + return mask? mask : ~0UL; +} + // Initialise a bit set capable of holding ni integers, 0 <= i < ni. // The set returned is empty if fill == 0, or all of [0,ni) otherwise. 
static inline kbitset_t *kbs_init2(size_t ni, int fill) @@ -73,9 +81,11 @@ static inline kbitset_t *kbs_init2(size_t ni, int fill) kbitset_t *bs = (kbitset_t *) malloc(sizeof(kbitset_t) + n * sizeof(unsigned long)); if (bs == NULL) return NULL; - bs->n = n; + bs->n = bs->n_max = n; memset(bs->b, fill? ~0 : 0, n * sizeof (unsigned long)); - bs->b[n] = ~0UL; + // b[n] is always non-zero (a fact used by kbs_next()). + bs->b[n] = kbs_last_mask(ni); + if (fill) bs->b[n-1] &= bs->b[n]; return bs; } @@ -85,6 +95,38 @@ static inline kbitset_t *kbs_init(size_t ni) return kbs_init2(ni, 0); } +// Resize an existing bit set to be capable of holding ni_new integers. +// Elements in [ni_old,ni_new) are added to the set if fill != 0. Returns 0 on success, -1 if reallocation fails. +static inline int kbs_resize2(kbitset_t **bsp, size_t ni_new, int fill) +{ + kbitset_t *bs = *bsp; + size_t n = bs? bs->n : 0; + size_t n_new = (ni_new + KBS_ELTBITS-1) / KBS_ELTBITS; + if (bs == NULL || n_new > bs->n_max) { + bs = (kbitset_t *) + realloc(bs, sizeof(kbitset_t) + n_new * sizeof(unsigned long)); + if (bs == NULL) return -1; + + bs->n_max = n_new; + *bsp = bs; + } + + bs->n = n_new; + if (n_new >= n) + memset(&bs->b[n], fill? ~0 : 0, (n_new - n) * sizeof (unsigned long)); + bs->b[n_new] = kbs_last_mask(ni_new); + // Need to clear excess bits when fill!=0 or n_new<n; always is simpler. + bs->b[n_new-1] &= bs->b[n_new]; + return 0; +} + +// Resize an existing bit set to be capable of holding ni_new integers. +// Returns negative on error. +static inline int kbs_resize(kbitset_t **bsp, size_t ni_new) +{ + return kbs_resize2(bsp, ni_new, 0); +} + // Destroy a bit set. static inline void kbs_destroy(kbitset_t *bs) { @@ -101,6 +143,7 @@ static inline void kbs_clear(kbitset_t *bs) static inline void kbs_insert_all(kbitset_t *bs) { memset(bs->b, ~0, bs->n * sizeof (unsigned long)); + bs->b[bs->n-1] &= bs->b[bs->n]; } // Insert an element into the bit set.
diff --git a/htslib/kfunc.h b/htslib/kfunc.h index 162c90d84..34704b1f7 100644 --- a/htslib/kfunc.h +++ b/htslib/kfunc.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (C) 2010, 2013 Genome Research Ltd. + Copyright (C) 2010, 2013-2014 Genome Research Ltd. Copyright (C) 2011 Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining @@ -27,6 +27,8 @@ #ifndef HTSLIB_KFUNC_H #define HTSLIB_KFUNC_H +#include "hts_defs.h" + #ifdef __cplusplus extern "C" { #endif @@ -35,12 +37,14 @@ extern "C" { * \log{\Gamma(z)} * AS245, 2nd algorithm, http://lib.stat.cmu.edu/apstat/245 */ +HTSLIB_EXPORT double kf_lgamma(double z); /* complementary error function * \frac{2}{\sqrt{\pi}} \int_x^{\infty} e^{-t^2} dt * AS66, 2nd algorithm, http://lib.stat.cmu.edu/apstat/66 */ +HTSLIB_EXPORT double kf_erfc(double x); /* The following computes regularized incomplete gamma functions. @@ -56,7 +60,9 @@ double kf_erfc(double x); * kf_gammaq(s,z)*tgamma(s). */ +HTSLIB_EXPORT double kf_gammap(double s, double z); +HTSLIB_EXPORT double kf_gammaq(double s, double z); /* Regularized incomplete beta function. The method is taken from @@ -66,6 +72,7 @@ double kf_gammaq(double s, double z); * * http://www.danielsoper.com/statcalc/calc36.aspx */ +HTSLIB_EXPORT double kf_betai(double a, double b, double x); /* @@ -74,6 +81,7 @@ double kf_betai(double a, double b, double x); * -----------+---- * n_1 n_2 | n */ +HTSLIB_EXPORT double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two); #ifdef __cplusplus diff --git a/htslib/khash.h b/htslib/khash.h index b2179d4ac..08bc16985 100644 --- a/htslib/khash.h +++ b/htslib/khash.h @@ -1,6 +1,7 @@ /* The MIT License Copyright (c) 2008, 2009, 2011 by Attractive Chaos + Copyright (C) 2014-2015, 2018 Genome Research Ltd. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -128,6 +129,7 @@ int main() { #include #include #include +#include /* compiler specific configuration */ @@ -400,7 +402,7 @@ static kh_inline khint_t __ac_X31_hash_string(const char *s) } /*! @function @abstract Another interface to const char* hash function - @param key Pointer to a null terminated string [const char*] + @param key Pointer to a nul terminated string [const char*] @return The hash value [khint_t] */ #define kh_str_hash_func(key) __ac_X31_hash_string(key) @@ -409,6 +411,30 @@ static kh_inline khint_t __ac_X31_hash_string(const char *s) */ #define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) +/*! @function + @abstract Kstring hash function + @param ks The kstring to hash (passed by value); all ks.l bytes are hashed + @return The hash value + */ +static kh_inline khint_t __ac_X31_hash_kstring(const kstring_t ks) +{ + khint_t h = 0; + size_t i; + for (i = 0; i < ks.l; i++) + h = (h << 5) - h + (khint_t)ks.s[i]; + return h; +} +/*! @function + @abstract Interface to kstring hash function. + @param key The kstring key [kstring_t]; permits hashing on non-nul terminated strings. + @return The hash value [khint_t] + */ +#define kh_kstr_hash_func(key) __ac_X31_hash_kstring(key) +/*! @function + @abstract kstring comparison function; compares all l bytes so it agrees with the hash on embedded NULs + */ +#define kh_kstr_hash_equal(a, b) ((a).l == (b).l && memcmp((a).s, (b).s, (a).l) == 0) + static kh_inline khint_t __ac_Wang_hash(khint_t key) { key += ~(key << 15); @@ -624,4 +650,19 @@ typedef const char *kh_cstr_t; #define KHASH_MAP_INIT_STR(name, khval_t) \ KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) +/*! @function + @abstract Instantiate a hash set containing kstring_t keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_KSTR(name) \ + KHASH_INIT(name, kstring_t, char, 0, kh_kstr_hash_func, kh_kstr_hash_equal) + +/*!
@function + @abstract Instantiate a hash map containing kstring_t keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_KSTR(name, khval_t) \ + KHASH_INIT(name, kstring_t, khval_t, 1, kh_kstr_hash_func, kh_kstr_hash_equal) + #endif /* __AC_KHASH_H */ diff --git a/htslib/khash_str2int.h b/htslib/khash_str2int.h index 4bbc10088..3af4a42a5 100644 --- a/htslib/khash_str2int.h +++ b/htslib/khash_str2int.h @@ -1,6 +1,6 @@ /* khash_str2int.h -- C-string to integer hash table. - Copyright (C) 2013 Genome Research Ltd. + Copyright (C) 2013-2014 Genome Research Ltd. Author: Petr Danecek diff --git a/htslib/klist.h b/htslib/klist.h index adc3db1e9..398f205db 100644 --- a/htslib/klist.h +++ b/htslib/klist.h @@ -1,6 +1,7 @@ /* The MIT License Copyright (c) 2008-2009, by Attractive Chaos + Copyright (C) 2013, 2015 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the diff --git a/htslib/knetfile.h b/htslib/knetfile.h index 863359670..87fba4adc 100644 --- a/htslib/knetfile.h +++ b/htslib/knetfile.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2008 by Genome Research Ltd (GRL). + Copyright (c) 2008, 2012, 2014 Genome Research Ltd (GRL). 2010 by Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining @@ -31,6 +31,8 @@ #include #include +#include "hts_defs.h" + #ifndef _WIN32 #define netread(fd, ptr, len) read(fd, ptr, len) #define netwrite(fd, ptr, len) write(fd, ptr, len) @@ -75,24 +77,29 @@ extern "C" { void knet_win32_destroy(); #endif + HTSLIB_EXPORT knetFile *knet_open(const char *fn, const char *mode); /* This only works with local files. */ + HTSLIB_EXPORT knetFile *knet_dopen(int fd, const char *mode); /* If ->is_ready==0, this routine updates ->fd; otherwise, it simply reads from ->fd. 
*/ + HTSLIB_EXPORT ssize_t knet_read(knetFile *fp, void *buf, size_t len); /* This routine only sets ->offset and ->is_ready=0. It does not communicate with the FTP server. */ + HTSLIB_EXPORT off_t knet_seek(knetFile *fp, off_t off, int whence); + HTSLIB_EXPORT int knet_close(knetFile *fp); #ifdef __cplusplus diff --git a/htslib/kseq.h b/htslib/kseq.h index a2349470f..91e401eff 100644 --- a/htslib/kseq.h +++ b/htslib/kseq.h @@ -1,6 +1,7 @@ /* The MIT License Copyright (c) 2008, 2009, 2011 Attractive Chaos + Copyright (C) 2013, 2018 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -89,7 +90,7 @@ #ifndef KSTRING_T #define KSTRING_T kstring_t -typedef struct __kstring_t { +typedef struct kstring_t { size_t l, m; char *s; } kstring_t; diff --git a/htslib/ksort.h b/htslib/ksort.h index f7476481f..755010951 100644 --- a/htslib/ksort.h +++ b/htslib/ksort.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2008 Genome Research Ltd (GRL). + Copyright (c) 2008, 2012-2013, 2017-2019 Genome Research Ltd (GRL). 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -65,6 +65,14 @@ #include #include +#ifndef klib_unused +#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) +#define klib_unused __attribute__ ((__unused__)) +#else +#define klib_unused +#endif +#endif /* klib_unused */ + #ifdef __cplusplus extern "C" { #endif @@ -82,9 +90,12 @@ typedef struct { #define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; } -#define KSORT_INIT(name, type_t, __sort_lt) KSORT_INIT_(_ ## name, type_t, __sort_lt) -#define KSORT_INIT_(name, type_t, __sort_lt) \ - void ks_mergesort##name(size_t n, type_t array[], type_t temp[]) \ +#define KSORT_INIT(name, type_t, __sort_lt) KSORT_INIT_(_ ## name, , type_t, __sort_lt) +#define KSORT_INIT_STATIC(name, type_t, __sort_lt) KSORT_INIT_(_ ## name, static klib_unused, type_t, __sort_lt) +#define KSORT_INIT2(name, SCOPE, type_t, __sort_lt) KSORT_INIT_(_ ## name, SCOPE, type_t, __sort_lt) + +#define KSORT_INIT_(name, SCOPE, type_t, __sort_lt) \ + SCOPE int ks_mergesort##name(size_t n, type_t array[], type_t temp[]) \ { \ type_t *a2[2], *a, *b; \ int curr, shift; \ @@ -131,8 +142,9 @@ typedef struct { for (; p < eb; ++i) *p++ = *i; \ } \ if (temp == 0) free(a2[1]); \ + return 0; \ } \ - void ks_heapadjust##name(size_t i, size_t n, type_t l[]) \ + SCOPE void ks_heapadjust##name(size_t i, size_t n, type_t l[]) \ { \ size_t k = i; \ type_t tmp = l[i]; \ @@ -143,13 +155,13 @@ typedef struct { } \ l[i] = tmp; \ } \ - void ks_heapmake##name(size_t lsize, type_t l[]) \ + SCOPE void ks_heapmake##name(size_t lsize, type_t l[]) \ { \ size_t i; \ for (i = (lsize >> 1) - 1; i != (size_t)(-1); --i) \ ks_heapadjust##name(i, lsize, l); \ } \ - void ks_heapsort##name(size_t lsize, type_t l[]) \ + SCOPE void ks_heapsort##name(size_t lsize, type_t l[]) \ { \ size_t i; \ for (i = lsize - 1; i > 0; --i) { \ @@ -165,7 +177,7 @@ typedef 
struct { swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \ } \ } \ - void ks_combsort##name(size_t n, type_t a[]) \ + SCOPE void ks_combsort##name(size_t n, type_t a[]) \ { \ const double shrink_factor = 1.2473309501039786540366528676643; \ int do_swap; \ @@ -187,17 +199,17 @@ typedef struct { } while (do_swap || gap > 2); \ if (gap != 1) __ks_insertsort##name(a, a + n); \ } \ - void ks_introsort##name(size_t n, type_t a[]) \ + SCOPE int ks_introsort##name(size_t n, type_t a[]) \ { \ int d; \ ks_isort_stack_t *top, *stack; \ type_t rp, swap_tmp; \ type_t *s, *t, *i, *j, *k; \ \ - if (n < 1) return; \ + if (n < 1) return 0; \ else if (n == 2) { \ if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \ - return; \ + return 0; \ } \ for (d = 2; 1ul<left; t = (type_t*)top->right; d = top->depth; } \ } \ } \ + return 0; \ } \ /* This function is adapted from: http://ndevilla.free.fr/median/ */ \ /* 0 <= kk < n */ \ - type_t ks_ksmall##name(size_t n, type_t arr[], size_t kk) \ + SCOPE type_t ks_ksmall##name(size_t n, type_t arr[], size_t kk) \ { \ type_t *low, *high, *k, *ll, *hh, *mid; \ low = arr; high = arr + n - 1; k = arr + kk; \ @@ -267,7 +280,7 @@ typedef struct { if (hh >= k) high = hh - 1; \ } \ } \ - void ks_shuffle##name(size_t n, type_t a[]) \ + SCOPE void ks_shuffle##name(size_t n, type_t a[]) \ { \ int i, j; \ for (i = n; i > 1; --i) { \ @@ -291,9 +304,15 @@ typedef struct { typedef const char *ksstr_t; -#define KSORT_INIT_GENERIC(type_t) KSORT_INIT_(_ ## type_t, type_t, ks_lt_generic) +#define KSORT_INIT_GENERIC(type_t) KSORT_INIT_(_ ## type_t, , type_t, ks_lt_generic) #define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str) +#define KSORT_INIT_STATIC_GENERIC(type_t) KSORT_INIT_(_ ## type_t, static klib_unused, type_t, ks_lt_generic) +#define KSORT_INIT_STATIC_STR KSORT_INIT_STATIC(str, ksstr_t, ks_lt_str) + +#define KSORT_INIT2_GENERIC(type_t, SCOPE) KSORT_INIT_(_ ## type_t, SCOPE, type_t, ks_lt_generic) +#define KSORT_INIT2_STR 
KSORT_INIT2(str, SCOPE, ksstr_t, ks_lt_str) + #ifdef __cplusplus } #endif diff --git a/htslib/kstring.h b/htslib/kstring.h index d80b6f82e..02da3ed72 100644 --- a/htslib/kstring.h +++ b/htslib/kstring.h @@ -1,6 +1,7 @@ /* The MIT License Copyright (C) 2011 by Attractive Chaos + Copyright (C) 2013-2014, 2016, 2018-2019 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -32,6 +33,9 @@ #include #include #include +#include + +#include "hts_defs.h" #ifndef kroundup32 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) @@ -49,23 +53,32 @@ #endif #if defined __GNUC__ && (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ > 4)) +#ifdef __MINGW_PRINTF_FORMAT +#define KS_ATTR_PRINTF(fmt, arg) __attribute__((__format__ (__MINGW_PRINTF_FORMAT, fmt, arg))) +#else #define KS_ATTR_PRINTF(fmt, arg) __attribute__((__format__ (__printf__, fmt, arg))) +#endif // __MINGW_PRINTF_FORMAT #else #define KS_ATTR_PRINTF(fmt, arg) #endif +#ifndef HAVE___BUILTIN_CLZ +#if defined __GNUC__ && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) +#define HAVE___BUILTIN_CLZ 1 +#endif +#endif /* kstring_t is a simple non-opaque type whose fields are likely to be * used directly by user code (but see also ks_str() and ks_len() below). 
* A kstring_t object is initialised by either of - * kstring_t str = { 0, 0, NULL }; - * kstring_t str; ...; str.l = str.m = 0; str.s = NULL; + * kstring_t str = KS_INITIALIZE; + * kstring_t str; ...; ks_initialize(&str); * and either ownership of the underlying buffer should be given away before * the object disappears (see ks_release() below) or the kstring_t should be - * destroyed with free(str.s); */ + * destroyed with ks_free(&str) or free(str.s) */ #ifndef KSTRING_T #define KSTRING_T kstring_t -typedef struct __kstring_t { +typedef struct kstring_t { size_t l, m; char *s; } kstring_t; @@ -81,18 +94,32 @@ typedef struct { extern "C" { #endif + HTSLIB_EXPORT int kvsprintf(kstring_t *s, const char *fmt, va_list ap) KS_ATTR_PRINTF(2,0); + + HTSLIB_EXPORT int ksprintf(kstring_t *s, const char *fmt, ...) KS_ATTR_PRINTF(2,3); + + HTSLIB_EXPORT int kputd(double d, kstring_t *s); // custom %g only handler + + HTSLIB_EXPORT int ksplit_core(char *s, int delimiter, int *_max, int **_offsets); + + HTSLIB_EXPORT char *kstrstr(const char *str, const char *pat, int **_prep); + + HTSLIB_EXPORT char *kstrnstr(const char *str, const char *pat, int n, int **_prep); + + HTSLIB_EXPORT void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep); /* kstrtok() is similar to strtok_r() except that str is not * modified and both str and sep can be NULL. For efficiency, it is * actually recommended to set both to NULL in the subsequent calls * if sep is not changed. */ + HTSLIB_EXPORT char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux); /* kgetline() uses the supplied fgets()-like function to read a "\n"- @@ -100,36 +127,92 @@ extern "C" { * kstring without its terminator and 0 is returned; EOF is returned at * EOF or on error (determined by querying fp, as per fgets()). 
*/ typedef char *kgets_func(char *, int, void *); + HTSLIB_EXPORT int kgetline(kstring_t *s, kgets_func *fgets, void *fp); + // This matches the signature of hgetln(), apart from the last pointer + typedef ssize_t kgets_func2(char *, size_t, void *); + HTSLIB_EXPORT + int kgetline2(kstring_t *s, kgets_func2 *fgets, void *fp); + #ifdef __cplusplus } #endif +/// kstring initializer for structure assignment +#define KS_INITIALIZE { 0, 0, NULL } + +/// kstring initializer for pointers +/** + @note Not to be used if the buffer has been allocated. Use ks_release() + or ks_clear() instead. +*/ + +static inline void ks_initialize(kstring_t *s) +{ + s->l = s->m = 0; + s->s = NULL; +} + +/// Resize a kstring to a given capacity static inline int ks_resize(kstring_t *s, size_t size) { if (s->m < size) { char *tmp; kroundup_size_t(size); tmp = (char*)realloc(s->s, size); - if (!tmp) - return -1; + if (!tmp && size) + return -1; s->s = tmp; s->m = size; } return 0; } +/// Increase kstring capacity by a given number of bytes +static inline int ks_expand(kstring_t *s, size_t expansion) +{ + size_t new_size = s->l + expansion; + + if (new_size < s->l) // Overflow check + return -1; + return ks_resize(s, new_size); +} + +/// Returns the kstring buffer static inline char *ks_str(kstring_t *s) { return s->s; } +/// Returns the kstring buffer, or an empty string if l == 0 +/** + * Unlike ks_str(), this function will never return NULL. If the kstring is + * empty it will return a read-only empty string. As the returned value + * may be read-only, the caller should not attempt to modify it. + */ +static inline const char *ks_c_str(kstring_t *s) +{ + return s->l && s->s ? 
s->s : ""; +} + static inline size_t ks_len(kstring_t *s) { return s->l; } +/// Reset kstring length to zero +/** + @return The kstring itself + + Example use: kputsn(string, len, ks_clear(s)) +*/ +static inline kstring_t *ks_clear(kstring_t *s) +{ + s->l = 0; + return s; +} + // Give ownership of the underlying buffer away to something else (making // that something else responsible for freeing it), leaving the kstring_t // empty and ready to be used again, or ready to go out of scope without @@ -142,9 +225,19 @@ static inline char *ks_release(kstring_t *s) return ss; } +/// Safely free the underlying buffer in a kstring. +static inline void ks_free(kstring_t *s) +{ + if (s) { + free(s->s); + ks_initialize(s); + } +} + static inline int kputsn(const char *p, size_t l, kstring_t *s) { - if (l > SIZE_MAX - 2 - s->l || ks_resize(s, s->l + l + 2) < 0) + size_t new_sz = s->l + l + 2; + if (new_sz <= s->l || ks_resize(s, new_sz) < 0) return EOF; memcpy(s->s + s->l, p, l); s->l += l; @@ -163,7 +256,7 @@ static inline int kputc(int c, kstring_t *s) return EOF; s->s[s->l++] = c; s->s[s->l] = 0; - return c; + return (unsigned char)c; } static inline int kputc_(int c, kstring_t *s) @@ -176,47 +269,117 @@ static inline int kputc_(int c, kstring_t *s) static inline int kputsn_(const void *p, size_t l, kstring_t *s) { - if (l > SIZE_MAX - s->l || ks_resize(s, s->l + l) < 0) + size_t new_sz = s->l + l; + if (new_sz < s->l || ks_resize(s, new_sz ? 
new_sz : 1) < 0) return EOF; memcpy(s->s + s->l, p, l); s->l += l; return l; } -static inline int kputw(int c, kstring_t *s) +static inline int kputuw(unsigned x, kstring_t *s) { - char buf[16]; - int i, l = 0; - unsigned int x = c; - if (c < 0) x = -x; - do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0); - if (c < 0) buf[l++] = '-'; - if (ks_resize(s, s->l + l + 2) < 0) - return EOF; - for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; - s->s[s->l] = 0; - return 0; +#if HAVE___BUILTIN_CLZ && UINT_MAX == 4294967295U + static const unsigned int kputuw_num_digits[32] = { + 10, 10, 10, 9, 9, 9, 8, 8, + 8, 7, 7, 7, 7, 6, 6, 6, + 5, 5, 5, 4, 4, 4, 4, 3, + 3, 3, 2, 2, 2, 1, 1, 1 + }; + static const unsigned int kputuw_thresholds[32] = { + 0, 0, 1000000000U, 0, 0, 100000000U, 0, 0, + 10000000, 0, 0, 0, 1000000, 0, 0, 100000, + 0, 0, 10000, 0, 0, 0, 1000, 0, + 0, 100, 0, 0, 10, 0, 0, 0 + }; +#else + uint64_t m; +#endif + static const char kputuw_dig2r[] = + "00010203040506070809" + "10111213141516171819" + "20212223242526272829" + "30313233343536373839" + "40414243444546474849" + "50515253545556575859" + "60616263646566676869" + "70717273747576777879" + "80818283848586878889" + "90919293949596979899"; + unsigned int l, j; + char *cp; + + // Trivial case - also prevents __builtin_clz(0), which is undefined + if (x < 10) { + if (ks_resize(s, s->l + 2) < 0) + return EOF; + s->s[s->l++] = '0'+x; + s->s[s->l] = 0; + return 0; + } + + // Find out how many digits are to be printed. +#if HAVE___BUILTIN_CLZ && UINT_MAX == 4294967295U + /* + * Table method - should be quick if clz can be done in hardware. + * Find the most significant bit of the value to print and look + * up in a table to find out how many decimal digits are needed. + * This number needs to be adjusted by 1 for cases where the decimal + * length could vary for a given number of bits (for example, + * a four bit number could be between 8 and 15). 
+ */ + + l = __builtin_clz(x); + l = kputuw_num_digits[l] - (x < kputuw_thresholds[l]); +#else + // Fallback for when clz is not available + m = 1; + l = 0; + do { + l++; + m *= 10; + } while (x >= m); +#endif + + if (ks_resize(s, s->l + l + 2) < 0) + return EOF; + + // Add digits two at a time + j = l; + cp = s->s + s->l; + while (x >= 10) { + const char *d = &kputuw_dig2r[2*(x%100)]; + x /= 100; + memcpy(&cp[j-=2], d, 2); + } + + // Last one (if necessary). We know that x < 10 by now. + if (j == 1) + cp[0] = x + '0'; + + s->l += l; + s->s[s->l] = 0; + return 0; } -static inline int kputuw(unsigned c, kstring_t *s) +static inline int kputw(int c, kstring_t *s) { - char buf[16]; - int l, i; - unsigned x; - if (c == 0) return kputc('0', s); - for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0'; - if (ks_resize(s, s->l + l + 2) < 0) - return EOF; - for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; - s->s[s->l] = 0; - return 0; + unsigned int x = c; + if (c < 0) { + x = -x; + if (ks_resize(s, s->l + 3) < 0) + return EOF; + s->s[s->l++] = '-'; + } + + return kputuw(x, s); } -static inline int kputl(long c, kstring_t *s) +static inline int kputll(long long c, kstring_t *s) { char buf[32]; int i, l = 0; - unsigned long x = c; + unsigned long long x = c; if (c < 0) x = -x; do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0); if (c < 0) buf[l++] = '-'; @@ -227,6 +390,10 @@ static inline int kputl(long c, kstring_t *s) return 0; } +static inline int kputl(long c, kstring_t *s) { + return kputll(c, s); +} + /* * Returns 's' split by delimiter, with *n being the number of components; * NULL on failue. diff --git a/htslib/regidx.h b/htslib/regidx.h index f2e0e00da..03876d2fa 100644 --- a/htslib/regidx.h +++ b/htslib/regidx.h @@ -1,7 +1,7 @@ /// @file htslib/regidx.h /// Region indexing. /* - Copyright (C) 2014 Genome Research Ltd. + Copyright (C) 2014-2019 Genome Research Ltd. Author: Petr Danecek @@ -25,8 +25,7 @@ */ /* - Regions indexing with an optional payload. 
Inspired by samtools/bedidx.c. - This code is intended as future replacement of bcf_sr_regions_t. + Region indexing with an optional payload. Example of usage: @@ -36,18 +35,30 @@ // and for working example see test/test-regidx.c. regidx_t *idx = regidx_init(in_fname,parse_custom,free_custom,sizeof(char*),NULL); - // Query overlap with chr:from-to - regitr_t itr; - if ( regidx_overlap(idx, chr,from,to, &itr) ) printf("There is an overlap!\n"); + // Query overlap with chr:beg-end (beg,end are 1-based coordinates) + regitr_t *itr = regitr_init(idx); + if ( regidx_overlap(idx, chr,beg-1,end-1, itr) ) printf("There is an overlap!\n"); - while ( REGITR_OVERLAP(itr,from,to) ) + while ( regitr_overlap(itr) ) { - printf("[%d,%d] overlaps with [%d,%d], payload=%s\n", from,to, - REGITR_START(itr), REGITR_END(itr), REGITR_PAYLOAD(itr,char*)); - itr.i++; + printf("[%"PRIhts_pos",%"PRIhts_pos"] overlaps with [%"PRIhts_pos",%"PRIhts_pos"], payload=%s\n", + beg, end, itr->beg+1, itr->end+1, regitr_payload(itr,char*)); } - regidx_destroy(regs); + regidx_destroy(idx); + regitr_destroy(itr); + + + Another example, loop over all regions: + + regidx_t *idx = regidx_init(in_fname,NULL,NULL,0,NULL); + regitr_t *itr = regitr_init(idx); + + while ( regitr_loop(itr) ) + printf("chr=%s beg=%"PRIhts_pos" end=%"PRIhts_pos"\n", itr->seq, itr->beg+1, itr->end+1); + + regidx_destroy(idx); + regitr_destroy(itr); */ #ifndef HTSLIB_REGIDX_H @@ -55,50 +66,72 @@ #include #include +#include "hts.h" #ifdef __cplusplus extern "C" { #endif -typedef struct _regidx_t regidx_t; -typedef struct -{ - uint32_t start, end; -} -reg_t; +// maximum regidx position (0-based). Used to represent the end point of +// regions which do not explicitly set one. regidx_push() also limits +// positions passed to it to be no bigger than this. + +// Limit is set to ensure some internal values used by regidx keep within 32 +// bits and to stop the index from getting too big.
+ +#define REGIDX_MAX (1ULL << 35) + +typedef struct regidx_t regidx_t; typedef struct { - int i, n; - reg_t *reg; + hts_pos_t beg,end; void *payload; + char *seq; + void *itr; } regitr_t; -#define REGITR_START(itr) (itr).reg[(itr).i].start -#define REGITR_END(itr) (itr).reg[(itr).i].end -#define REGITR_PAYLOAD(itr,type_t) ((type_t*)(itr).payload)[(itr).i] -#define REGITR_OVERLAP(itr,from,to) (itr.i < itr.n && REGITR_START(itr)<=to && REGITR_END(itr)>=from ) +#define regitr_payload(itr,type_t) (*((type_t*)(itr)->payload)) + +// Old API for backwards compatibility +#define REGITR_START(itr) (itr).beg +#define REGITR_END(itr) (itr).end +#define REGITR_PAYLOAD(itr,type_t) ((type_t*)(itr).payload) +#define REGITR_OVERLAP(itr,from,to) regidx_overlap((itr)); /* * regidx_parse_f - Function to parse one input line, such as regidx_parse_bed * or regidx_parse_tab below. The function is expected to set `chr_from` and * `chr_to` to point to first and last character of chromosome name and set - * coordinates `reg->start` and `reg->end` (0-based, inclusive). If - * regidx_init() was called with non-zero payload_size, the `payload` points - * to a memory location of the payload_size and `usr` is data passed to - * regidx_init(). Any memory allocated by the function will be freed by - * regidx_free_f on regidx_destroy(). + * coordinates `beg` and `end` (0-based, inclusive). If regidx_init() was + * called with non-zero payload_size, the `payload` points to a memory + * location of the payload_size and `usr` is the data passed to regidx_init(). + * Any memory allocated by the function will be freed by regidx_free_f called + * by regidx_destroy(). * * Return value: 0 on success, -1 to skip a record, -2 on fatal error. 
*/ -typedef int (*regidx_parse_f)(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr); +typedef int (*regidx_parse_f)(const char *line, char **chr_beg, char **chr_end, hts_pos_t *beg, hts_pos_t *end, void *payload, void *usr); typedef void (*regidx_free_f)(void *payload); -int regidx_parse_bed(const char*,char**,char**,reg_t*,void*,void*); // CHROM,FROM,TO (0-based,right-open) -int regidx_parse_tab(const char*,char**,char**,reg_t*,void*,void*); // CHROM,POS (1-based, inclusive) +/* + * A note about the parsers: + * - leading spaces are ignored + * - lines starting with "#" are ignored + */ +HTSLIB_EXPORT +int regidx_parse_bed(const char*,char**,char**,hts_pos_t*,hts_pos_t*,void*,void*); // CHROM or whitespace-separated CHROM,FROM,TO (0-based,right-open) +HTSLIB_EXPORT +int regidx_parse_tab(const char*,char**,char**,hts_pos_t*,hts_pos_t*,void*,void*); // CHROM or whitespace-separated CHROM,POS (1-based, inclusive) +HTSLIB_EXPORT +int regidx_parse_reg(const char*,char**,char**,hts_pos_t*,hts_pos_t*,void*,void*); // CHROM, CHROM:POS, CHROM:FROM-TO, CHROM:FROM- (1-based, inclusive) +HTSLIB_EXPORT +int regidx_parse_vcf(const char*,char**,char**,hts_pos_t*,hts_pos_t*,void*,void*); /* * regidx_init() - creates new index + * regidx_init_string() - creates new index, from a string rather than from a file + * * @param fname: input file name or NULL if regions will be added one-by-one via regidx_insert() * @param parsef: regidx_parse_bed, regidx_parse_tab or see description of regidx_parse_f. If NULL, * the format will be autodected, currently either regidx_parse_tab (the default) or @@ -109,46 +142,101 @@ int regidx_parse_tab(const char*,char**,char**,reg_t*,void*,void*); // CHROM,P * @param usr: optional user data passed to regidx_parse_f * * Returns index on success or NULL on error. + * + * The regidx_t index struct returned by a successful call should be freed + * via regidx_destroy() when it is no longer needed.
*/ +HTSLIB_EXPORT regidx_t *regidx_init(const char *fname, regidx_parse_f parsef, regidx_free_f freef, size_t payload_size, void *usr); +HTSLIB_EXPORT +regidx_t *regidx_init_string(const char *string, regidx_parse_f parsef, regidx_free_f freef, size_t payload_size, void *usr); /* * regidx_destroy() - free memory allocated by regidx_init */ +HTSLIB_EXPORT void regidx_destroy(regidx_t *idx); /* * regidx_overlap() - check overlap of the location chr:from-to with regions - * @param start,end: 0-based start, end coordinate (inclusive) - * @param itr: pointer to iterator, can be NULL if not needed + * @param beg,end: 0-based start, end coordinate (inclusive) + * @param itr: pointer to iterator, can be NULL if regidx_loop not needed * * Returns 0 if there is no overlap or 1 if overlap is found. The overlapping * regions can be iterated as shown in the example above. */ -int regidx_overlap(regidx_t *idx, const char *chr, uint32_t start, uint32_t end, regitr_t *itr); +HTSLIB_EXPORT +int regidx_overlap(regidx_t *idx, const char *chr, hts_pos_t beg, hts_pos_t end, regitr_t *itr); /* * regidx_insert() - add a new region. - * - * After last region has been added, call regidx_insert(idx,NULL) to - * build the index. + * regidx_insert_list() - add new regions from a list + * regidx_push() - low level insertion of a new region * * Returns 0 on success or -1 on error. 
*/ +HTSLIB_EXPORT int regidx_insert(regidx_t *idx, char *line); +HTSLIB_EXPORT +int regidx_insert_list(regidx_t *idx, char *line, char delim); +HTSLIB_EXPORT +int regidx_push(regidx_t *idx, char *chr_beg, char *chr_end, hts_pos_t beg, hts_pos_t end, void *payload); /* * regidx_seq_names() - return list of all sequence names */ +HTSLIB_EXPORT char **regidx_seq_names(regidx_t *idx, int *n); /* * regidx_seq_nregs() - number of regions * regidx_nregs() - total number of regions */ +HTSLIB_EXPORT int regidx_seq_nregs(regidx_t *idx, const char *seq); + +HTSLIB_EXPORT int regidx_nregs(regidx_t *idx); +/* + * regitr_init() - initialize an iterator. The idx parameter is required only + * with regitr_loop. If only regitr_overlap is called, NULL + * can be given. + * + * The regitr_t struct returned by a successful regitr_init() + * call should be freed via regitr_destroy() when it is no + * longer needed. + * + * regitr_reset() - initialize an iterator for a repeated regitr_loop cycle. + * Not required with regitr_overlap. + */ +HTSLIB_EXPORT +regitr_t *regitr_init(regidx_t *idx); +HTSLIB_EXPORT +void regitr_destroy(regitr_t *itr); +HTSLIB_EXPORT +void regitr_reset(regidx_t *idx, regitr_t *itr); + +/* + * regitr_overlap() - next overlapping region + * Returns 0 when done or 1 when itr is set to next region + */ +HTSLIB_EXPORT +int regitr_overlap(regitr_t *itr); + +/* + * regitr_loop() - loop over all regions + * Returns 0 when done or 1 when itr is set to next region + */ +HTSLIB_EXPORT +int regitr_loop(regitr_t *itr); + +/* + * regitr_copy() - create a copy of an iterator for a repeated iteration with regitr_loop + */ +HTSLIB_EXPORT +void regitr_copy(regitr_t *dst, regitr_t *src); + #ifdef __cplusplus } #endif diff --git a/htslib/sam.h b/htslib/sam.h index 1d9081403..f6a503895 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -1,7 +1,7 @@ /// @file htslib/sam.h /// High-level SAM/BAM/CRAM sequence file operations. /* - Copyright (C) 2008, 2009, 2013-2017 Genome Research Ltd. 
+ Copyright (C) 2008, 2009, 2013-2019 Genome Research Ltd. Copyright (C) 2010, 2012, 2013 Broad Institute. Author: Heng Li @@ -37,29 +37,52 @@ extern "C" { /// Highest SAM format version supported by this library #define SAM_FORMAT_VERSION "1.6" -/********************** - *** SAM/BAM header *** - **********************/ +/*************************** + *** SAM/BAM/CRAM header *** + ***************************/ + +/*! @typedef + * @abstract Header extension structure, grouping a collection + * of hash tables that contain the parsed header data. + */ + +typedef struct sam_hrecs_t sam_hrecs_t; /*! @typedef @abstract Structure for the alignment header. @field n_targets number of reference sequences - @field l_text length of the plain text in the header + @field l_text length of the plain text in the header (may be zero if + the header has been edited) @field target_len lengths of the reference sequences @field target_name names of the reference sequences - @field text plain text + @field text plain text (may be NULL if the header has been edited) @field sdict header dictionary + @field hrecs pointer to the extended header struct (internal use only) + @field ref_count reference count + + @note The text and l_text fields are included for backwards compatibility. + These fields may be set to NULL and zero respectively as a side-effect + of calling some header API functions. New code that needs to access the + header text should use the sam_hdr_str() and sam_hdr_length() functions + instead of these fields. */ -typedef struct { +typedef struct sam_hdr_t { int32_t n_targets, ignore_sam_err; - uint32_t l_text; + size_t l_text; uint32_t *target_len; - int8_t *cigar_tab; + const int8_t *cigar_tab HTS_DEPRECATED("Use bam_cigar_table[] instead"); char **target_name; char *text; void *sdict; -} bam_hdr_t; + sam_hrecs_t *hrecs; + uint32_t ref_count; +} sam_hdr_t; + +/*! @typedef + * @abstract Old name for compatibility with existing code. 
+ */ +typedef sam_hdr_t bam_hdr_t; /**************************** *** CIGAR related macros *** @@ -81,6 +104,12 @@ typedef struct { #define BAM_CIGAR_MASK 0xf #define BAM_CIGAR_TYPE 0x3C1A7 +/*! @abstract Table for converting a CIGAR operator character to BAM_CMATCH etc. +Result is operator code or -1. Be sure to cast the index if it is a plain char: + int op = bam_cigar_table[(unsigned char) ch]; +*/ +extern const int8_t bam_cigar_table[256]; + #define bam_cigar_op(c) ((c)&BAM_CIGAR_MASK) #define bam_cigar_oplen(c) ((c)>>BAM_CIGAR_SHIFT) // Note that BAM_CIGAR_STR is padded to length 16 bytes below so that @@ -140,61 +169,83 @@ typedef struct { *** Alignment records *** *************************/ +/* + * Assumptions made here. While pos can be 64-bit, no sequence + * itself is that long, but due to ref skip CIGAR fields it + * may span more than that. (CIGAR itself is 28-bit len + 4 bit + * type, but in theory we can combine multiples together.) + * + * Mate position and insert size also need to be 64-bit, but + * we won't accept more than 32-bit for tid. + * + * The bam_core_t structure is the *in memory* layout and not + * the same as the on-disk format. 64-bit changes here permit + * SAM to work with very long chromosomes and permit BAM and CRAM + * to seamlessly update in the future without further API/ABI + * revisions. + */ + /*! @typedef @abstract Structure for core alignment information. 
- @field tid chromosome ID, defined by bam_hdr_t @field pos 0-based leftmost coordinate + @field tid chromosome ID, defined by sam_hdr_t @field bin bin calculated by bam_reg2bin() @field qual mapping quality - @field l_qname length of the query name - @field flag bitwise flag @field l_extranul length of extra NULs between qname & cigar (for alignment) + @field flag bitwise flag + @field l_qname length of the query name @field n_cigar number of CIGAR operations @field l_qseq length of the query sequence (read) - @field mtid chromosome ID of next read in template, defined by bam_hdr_t + @field mtid chromosome ID of next read in template, defined by sam_hdr_t @field mpos 0-based leftmost coordinate of next read in template + @field isize observed template length ("insert size") */ typedef struct { + hts_pos_t pos; int32_t tid; - int32_t pos; - uint16_t bin; + uint16_t bin; // NB: invalid on 64-bit pos uint8_t qual; - uint8_t l_qname; - uint16_t flag; - uint8_t unused1; uint8_t l_extranul; + uint16_t flag; + uint16_t l_qname; uint32_t n_cigar; int32_t l_qseq; int32_t mtid; - int32_t mpos; - int32_t isize; + hts_pos_t mpos; + hts_pos_t isize; } bam1_core_t; /*! @typedef @abstract Structure for one alignment. @field core core information about the alignment + @field id + @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux @field l_data current length of bam1_t::data @field m_data maximum length of bam1_t::data - @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux + @field mempolicy memory handling policy, see bam_set_mempolicy() @discussion Notes: - 1. qname is terminated by one to four NULs, so that the following - cigar data is 32-bit aligned; core.l_qname includes these trailing NULs, - while core.l_extranul counts the excess NULs (so 0 <= l_extranul <= 3). - 2. l_qseq is calculated from the total length of an alignment block - on reading or from CIGAR. - 3. 
cigar data is encoded 4 bytes per CIGAR operation. - 4. seq is nybble-encoded according to bam_nt16_table. + 1. The data blob should be accessed using bam_get_qname, bam_get_cigar, + bam_get_seq, bam_get_qual and bam_get_aux macros. These return pointers + to the start of each type of data. + 2. qname is terminated by one to four NULs, so that the following + cigar data is 32-bit aligned; core.l_qname includes these trailing NULs, + while core.l_extranul counts the excess NULs (so 0 <= l_extranul <= 3). + 3. Cigar data is encoded 4 bytes per CIGAR operation. + See the bam_cigar_* macros for manipulation. + 4. seq is nibble-encoded according to bam_nt16_table. + See the bam_seqi macro for retrieving individual bases. + 5. Per base qualities are stored in the Phred scale with no +33 offset. + Ie as per the BAM specification and not the SAM ASCII printable method. */ typedef struct { bam1_core_t core; + uint64_t id; + uint8_t *data; int l_data; uint32_t m_data; - uint8_t *data; -#ifndef BAM_NO_ID - uint64_t id; -#endif + uint32_t mempolicy:2, :30 /* Reserved */; } bam1_t; /*! @function @@ -266,28 +317,724 @@ typedef struct { *** Exported functions *** **************************/ - /*************** - *** BAM I/O *** - ***************/ +/*************** + *** BAM I/O *** + ***************/ + +/* Header */ + +/// Generates a new unpopulated header structure. +/*! + * + * @return A valid pointer to new header on success, NULL on failure + * + * The sam_hdr_t struct returned by a successful call should be freed + * via sam_hdr_destroy() when it is no longer needed. + */ +HTSLIB_EXPORT +sam_hdr_t *sam_hdr_init(void); + +/// Read the header from a BAM compressed file. +/*! + * @param fp File pointer + * @return A valid pointer to new header on success, NULL on failure + * + * This function only works with BAM files. It is usually better to use + * sam_hdr_read(), which works on SAM, BAM and CRAM files.
+ * + * The sam_hdr_t struct returned by a successful call should be freed + * via sam_hdr_destroy() when it is no longer needed. + */ +HTSLIB_EXPORT +sam_hdr_t *bam_hdr_read(BGZF *fp); + +/// Writes the header to a BAM file. +/*! + * @param fp File pointer + * @param h Header pointer + * @return 0 on success, -1 on failure + * + * This function only works with BAM files. Use sam_hdr_write() to + * write in any of the SAM, BAM or CRAM formats. + */ +HTSLIB_EXPORT +int bam_hdr_write(BGZF *fp, const sam_hdr_t *h) HTS_RESULT_USED; + +/*! + * Frees the resources associated with a header. + */ +HTSLIB_EXPORT +void sam_hdr_destroy(sam_hdr_t *h); + +/// Duplicate a header structure. +/*! + * @return A valid pointer to new header on success, NULL on failure + * + * The sam_hdr_t struct returned by a successful call should be freed + * via sam_hdr_destroy() when it is no longer needed. + */ +HTSLIB_EXPORT +sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0); + +/*! + * @abstract Old names for compatibility with existing code. + */ +static inline sam_hdr_t *bam_hdr_init(void) { return sam_hdr_init(); } +static inline void bam_hdr_destroy(sam_hdr_t *h) { sam_hdr_destroy(h); } +static inline sam_hdr_t *bam_hdr_dup(const sam_hdr_t *h0) { return sam_hdr_dup(h0); } + +typedef htsFile samFile; + +/// Create a header from existing text. +/*! + * @param l_text Length of text + * @param text Header text + * @return A populated sam_hdr_t structure on success; NULL on failure. + * @note The text field of the returned header will be NULL, and the l_text + * field will be zero. + * + * The sam_hdr_t struct returned by a successful call should be freed + * via sam_hdr_destroy() when it is no longer needed. + */ +HTSLIB_EXPORT +sam_hdr_t *sam_hdr_parse(size_t l_text, const char *text); + +/// Read a header from a SAM, BAM or CRAM file. +/*! + * @param fp Pointer to a SAM, BAM or CRAM file handle + * @return A populated sam_hdr_t struct on success; NULL on failure. 
+ * + * The sam_hdr_t struct returned by a successful call should be freed + * via sam_hdr_destroy() when it is no longer needed. + */ +HTSLIB_EXPORT +sam_hdr_t *sam_hdr_read(samFile *fp); + +/// Write a header to a SAM, BAM or CRAM file. +/*! + * @param fp SAM, BAM or CRAM file header + * @param h Header structure to write + * @return 0 on success; -1 on failure + */ +HTSLIB_EXPORT +int sam_hdr_write(samFile *fp, const sam_hdr_t *h) HTS_RESULT_USED; - bam_hdr_t *bam_hdr_init(void); - bam_hdr_t *bam_hdr_read(BGZF *fp); - int bam_hdr_write(BGZF *fp, const bam_hdr_t *h) HTS_RESULT_USED; - void bam_hdr_destroy(bam_hdr_t *h); - int bam_name2id(bam_hdr_t *h, const char *ref); - bam_hdr_t* bam_hdr_dup(const bam_hdr_t *h0); +/// Returns the current length of the header text. +/*! + * @return >= 0 on success, SIZE_MAX on failure + */ +HTSLIB_EXPORT +size_t sam_hdr_length(sam_hdr_t *h); + +/// Returns the text representation of the header. +/*! + * @return valid char pointer on success, NULL on failure + * + * The returned string is part of the header structure. It will remain + * valid until a call to a header API function causes the string to be + * invalidated, or the header is destroyed. + * + * The caller should not attempt to free or realloc this pointer. + */ +HTSLIB_EXPORT +const char *sam_hdr_str(sam_hdr_t *h); + +/// Returns the number of references in the header. +/*! + * @return >= 0 on success, -1 on failure + */ +HTSLIB_EXPORT +int sam_hdr_nref(const sam_hdr_t *h); + +/* ==== Line level methods ==== */ + +/// Add formatted lines to an existing header. +/*! + * @param lines Full SAM header record, eg "@SQ\tSN:foo\tLN:100", with + * optional new-line. If it contains more than 1 line then + * multiple lines will be added in order + * @param len The maximum length of lines (if an early NUL is not + * encountered). 
len may be 0 if unknown, in which case + * lines must be NUL-terminated + * @return 0 on success, -1 on failure + * + * The lines will be appended to the end of the existing header + * (apart from HD, which always comes first). + */ +HTSLIB_EXPORT +int sam_hdr_add_lines(sam_hdr_t *h, const char *lines, size_t len); + +/// Adds a single line to an existing header. +/*! + * Specify type and one or more key,value pairs, ending with the NULL key. + * Eg. sam_hdr_add_line(h, "SQ", "ID", "foo", "LN", "100", NULL). + * + * @param type Type of the added line. Eg. "SQ" + * @return 0 on success, -1 on failure + * + * The new line will be added immediately after any others of the same + * type, or at the end of the existing header if no lines of the + * given type currently exist. The exception is HD lines, which always + * come first. If an HD line already exists, it will be replaced. + */ +HTSLIB_EXPORT +int sam_hdr_add_line(sam_hdr_t *h, const char *type, ...); + +/// Returns a complete line of formatted text for a given type and ID. +/*! + * @param type Type of the searched line. Eg. "SQ" + * @param ID_key Tag key defining the line. Eg. "SN" + * @param ID_value Tag value associated with the key above. Eg. "ref1" + * @param ks kstring to hold the result + * @return 0 on success; + * -1 if no matching line is found + * -2 on other failures + * + * Puts a complete line of formatted text for a specific header type/ID + * combination into @p ks. If ID_key is NULL then it returns the first line of + * the specified type. + * + * Any existing content in @p ks will be overwritten. + */ +HTSLIB_EXPORT +int sam_hdr_find_line_id(sam_hdr_t *h, const char *type, + const char *ID_key, const char *ID_val, kstring_t *ks); + +/// Returns a complete line of formatted text for a given type and index. +/*! + * @param type Type of the searched line. Eg. 
"SQ" + * @param position Index in lines of this type (zero-based) + * @param ks kstring to hold the result + * @return 0 on success; + * -1 if no matching line is found + * -2 on other failures + * + * Puts a complete line of formatted text for a specific line into @p ks. + * The header line is selected using the @p type and @p position parameters. + * + * Any existing content in @p ks will be overwritten. + */ +HTSLIB_EXPORT +int sam_hdr_find_line_pos(sam_hdr_t *h, const char *type, + int pos, kstring_t *ks); + +/// Remove a line with given type / id from a header +/*! + * @param type Type of the searched line. Eg. "SQ" + * @param ID_key Tag key defining the line. Eg. "SN" + * @param ID_value Tag value associated with the key above. Eg. "ref1" + * @return 0 on success, -1 on error + * + * Remove a line from the header by specifying a tag:value that uniquely + * identifies the line, i.e. the @SQ line containing "SN:ref1". + * + * \@SQ line is uniquely identified by the SN tag. + * \@RG line is uniquely identified by the ID tag. + * \@PG line is uniquely identified by the ID tag. + * Eg. sam_hdr_remove_line_id(h, "SQ", "SN", "ref1") + * + * If no key:value pair is specified, the type MUST be followed by a NULL argument and + * the first line of the type will be removed, if any. + * Eg. sam_hdr_remove_line_id(h, "SQ", NULL, NULL) + * + * @note Removing \@PG lines is currently unsupported. + */ +HTSLIB_EXPORT +int sam_hdr_remove_line_id(sam_hdr_t *h, const char *type, const char *ID_key, const char *ID_value); + +/// Remove nth line of a given type from a header +/*! + * @param type Type of the searched line. Eg. "SQ" + * @param position Index in lines of this type (zero-based). E.g. 3 + * @return 0 on success, -1 on error + * + * Remove a line from the header by specifying the position in the type + * group, i.e. 3rd @SQ line. 
+ */ +HTSLIB_EXPORT +int sam_hdr_remove_line_pos(sam_hdr_t *h, const char *type, int position); + +/// Add or update tag key,value pairs in a header line. +/*! + * @param type Type of the searched line. Eg. "SQ" + * @param ID_key Tag key defining the line. Eg. "SN" + * @param ID_value Tag value associated with the key above. Eg. "ref1" + * @return 0 on success, -1 on error + * + * Adds or updates tag key,value pairs in a header line. + * Eg. for adding M5 tags to @SQ lines or updating sort order for the + * @HD line. + * + * Specify multiple key,value pairs ending in NULL. Eg. + * sam_hdr_update_line(h, "RG", "ID", "rg1", "DS", "description", "PG", "samtools", NULL) + * + * Attempting to update the record name (i.e. @SQ SN or @RG ID) will + * work as long as the new name is not already in use, however doing this + * on a file opened for reading may produce unexpected results. + * + * Renaming an @RG record in this way will only change the header. Alignment + * records written later will not be updated automatically even if they + * reference the old read group name. + * + * Attempting to change an @PG ID tag is not permitted. + */ +HTSLIB_EXPORT +int sam_hdr_update_line(sam_hdr_t *h, const char *type, + const char *ID_key, const char *ID_value, ...); + +/// Remove all lines of a given type from a header, except the one matching an ID +/*! + * @param type Type of the searched line. Eg. "SQ" + * @param ID_key Tag key defining the line. Eg. "SN" + * @param ID_value Tag value associated with the key above. Eg. "ref1" + * @return 0 on success, -1 on failure + * + * Remove all lines of type from the header, except the one + * specified by tag:value, i.e. the @SQ line containing "SN:ref1". + * + * If no line matches the key:value ID, all lines of the given type are removed. + * To remove all lines of a given type, use NULL for both ID_key and ID_value. 
+ */ +HTSLIB_EXPORT +int sam_hdr_remove_except(sam_hdr_t *h, const char *type, const char *ID_key, const char *ID_value); + +/// Remove header lines of a given type, except those in a given ID set +/*! + * @param type Type of the searched line. Eg. "RG" + * @param id Tag key defining the line. Eg. "ID" + * @param rh Hash set initialised by the caller with the values to be kept. + * See description for how to create this. If @p rh is NULL, all + * lines of this type will be removed. + * @return 0 on success, -1 on failure + * + * Remove all lines of type @p type from the header, except the ones + * specified in the hash set @p rh. If @p rh is NULL, all lines of + * this type will be removed. + * Declaration of @p rh is done using KHASH_SET_INIT_STR macro. Eg. + * @code{.c} + * #include "htslib/khash.h" + * KHASH_SET_INIT_STR(keep) + * typedef khash_t(keep) *keephash_t; + * + * void your_method() { + * samFile *sf = sam_open("alignment.bam", "r"); + * sam_hdr_t *h = sam_hdr_read(sf); + * keephash_t rh = kh_init(keep); + * int ret = 0; + * kh_put(keep, rh, strdup("chr2"), &ret); + * kh_put(keep, rh, strdup("chr3"), &ret); + * if (sam_hdr_remove_lines(h, "SQ", "SN", rh) == -1) + * fprintf(stderr, "Error removing lines\n"); + * khint_t k; + * for (k = 0; k < kh_end(rh); ++k) + * if (kh_exist(rh, k)) free((char*)kh_key(rh, k)); + * kh_destroy(keep, rh); + * sam_hdr_destroy(h); + * sam_close(sf); + * } + * @endcode + * + */ +HTSLIB_EXPORT +int sam_hdr_remove_lines(sam_hdr_t *h, const char *type, const char *id, void *rh); + +/// Count the number of lines for a given header type +/*! + * @param h BAM header + * @param type Header type to count. Eg. "RG" + * @return Number of lines of this type on success; -1 on failure + */ +HTSLIB_EXPORT +int sam_hdr_count_lines(sam_hdr_t *h, const char *type); + +/// Index of the line for the types that have dedicated look-up tables (SQ, RG, PG) +/*! + * @param h BAM header + * @param type Type of the searched line. Eg. 
"RG" + * @param key The value of the identifying key. Eg. "rg1" + * @return 0-based index on success; -1 if line does not exist; -2 on failure + */ +HTSLIB_EXPORT +int sam_hdr_line_index(sam_hdr_t *bh, const char *type, const char *key); + +/// Id key of the line for the types that have dedicated look-up tables (SQ, RG, PG) +/*! + * @param h BAM header + * @param type Type of the searched line. Eg. "RG" + * @param pos Zero-based index inside the type group. Eg. 2 (for the third RG line) + * @return Valid key string on success; NULL on failure + */ +HTSLIB_EXPORT +const char *sam_hdr_line_name(sam_hdr_t *bh, const char *type, int pos); + +/* ==== Key:val level methods ==== */ + +/// Return the value associated with a key for a header line identified by ID_key:ID_val +/*! + * @param type Type of the line to which the tag belongs. Eg. "SQ" + * @param ID_key Tag key defining the line. Eg. "SN". Can be NULL, if looking for the first line. + * @param ID_value Tag value associated with the key above. Eg. "ref1". Can be NULL, if ID_key is NULL. + * @param key Key of the searched tag. Eg. "LN" + * @param ks kstring where the value will be written + * @return 0 on success + * -1 if the requested tag does not exist + * -2 on other errors + * + * Looks for a specific key in a single SAM header line and writes the + * associated value into @p ks. The header line is selected using the ID_key + * and ID_value parameters. Any pre-existing content in @p ks will be + * overwritten. + */ +HTSLIB_EXPORT +int sam_hdr_find_tag_id(sam_hdr_t *h, const char *type, const char *ID_key, const char *ID_value, const char *key, kstring_t *ks); + +/// Return the value associated with a key for a header line identified by position +/*! + * @param type Type of the line to which the tag belongs. Eg. "SQ" + * @param position Index in lines of this type (zero-based). E.g. 3 + * @param key Key of the searched tag. Eg. 
"LN" + * @param ks kstring where the value will be written + * @return 0 on success + * -1 if the requested tag does not exist + * -2 on other errors + * + * Looks for a specific key in a single SAM header line and writes the + * associated value into @p ks. The header line is selected using the @p type + * and @p position parameters. Any pre-existing content in @p ks will be + * overwritten. + */ +HTSLIB_EXPORT +int sam_hdr_find_tag_pos(sam_hdr_t *h, const char *type, int pos, const char *key, kstring_t *ks); + +/// Remove the key from the line identified by type, ID_key and ID_value. +/*! + * @param type Type of the line to which the tag belongs. Eg. "SQ" + * @param ID_key Tag key defining the line. Eg. "SN" + * @param ID_value Tag value associated with the key above. Eg. "ref1" + * @param key Key of the targeted tag. Eg. "M5" + * @return 1 if the key was removed; 0 if it was not present; -1 on error + */ +HTSLIB_EXPORT +int sam_hdr_remove_tag_id(sam_hdr_t *h, const char *type, const char *ID_key, const char *ID_value, const char *key); + +/// Get the target id for a given reference sequence name +/*! + * @param ref Reference name + * @return Positive value on success, + * -1 if unknown reference, + * -2 if the header could not be parsed + * + * Looks up a reference sequence by name in the reference hash table + * and returns the numerical target id. + */ +HTSLIB_EXPORT +int sam_hdr_name2tid(sam_hdr_t *h, const char *ref); - bam1_t *bam_init1(void); - void bam_destroy1(bam1_t *b); - int bam_read1(BGZF *fp, bam1_t *b) HTS_RESULT_USED; - int bam_write1(BGZF *fp, const bam1_t *b) HTS_RESULT_USED; - bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc); - bam1_t *bam_dup1(const bam1_t *bsrc); +/// Get the reference sequence name from a target index +/*! + * @param tid Target index + * @return Valid reference name on success, NULL on failure + * + * Fetch the reference sequence name from the target name array, + * using the numerical target id. 
+ */ +HTSLIB_EXPORT +const char *sam_hdr_tid2name(const sam_hdr_t *h, int tid); - int bam_cigar2qlen(int n_cigar, const uint32_t *cigar); - int bam_cigar2rlen(int n_cigar, const uint32_t *cigar); +/// Get the reference sequence length from a target index +/*! + * @param tid Target index + * @return Strictly positive value on success, 0 on failure + * + * Fetch the reference sequence length from the target length array, + * using the numerical target id. + */ +HTSLIB_EXPORT +hts_pos_t sam_hdr_tid2len(const sam_hdr_t *h, int tid); + +/// Alias of sam_hdr_name2tid(), for backwards compatibility. +/*! + * @param ref Reference name + * @return Positive value on success, + * -1 if unknown reference, + * -2 if the header could not be parsed + */ +static inline int bam_name2id(sam_hdr_t *h, const char *ref) { return sam_hdr_name2tid(h, ref); } - /*! +/// Generate a unique \@PG ID: value +/*! + * @param name Name of the program. Eg. samtools + * @return Valid ID on success, NULL on failure + * + * Returns a unique ID from a base name. The string returned will remain + * valid until the next call to this function, or the header is destroyed. + * The caller should not attempt to free() or realloc() it. + */ +HTSLIB_EXPORT +const char *sam_hdr_pg_id(sam_hdr_t *h, const char *name); + +/// Add an \@PG line. +/*! + * @param name Name of the program. Eg. samtools + * @return 0 on success, -1 on failure + * + * If we wish complete control over this use sam_hdr_add_line() directly. This + * function uses that, but attempts to do a lot of tedious house work for + * you too. + * + * - It will generate a suitable ID if the supplied one clashes. + * - It will generate multiple \@PG records if we have multiple PG chains. + * + * Call it as per sam_hdr_add_line() with a series of key,value pairs ending + * in NULL. + */ +HTSLIB_EXPORT +int sam_hdr_add_pg(sam_hdr_t *h, const char *name, ...); + +/*! + * A function to help with construction of CL tags in @PG records. 
+ * Takes an argc, argv pair and returns a single space-separated string. + * This string should be deallocated by the calling function. + * + * @return + * Returns malloced char * on success; + * NULL on failure + */ +HTSLIB_EXPORT +char *stringify_argv(int argc, char *argv[]); + +/// Increments the reference count on a header +/*! + * This permits multiple files to share the same header, all calling + * sam_hdr_destroy when done, without causing errors for other open files. + */ +HTSLIB_EXPORT +void sam_hdr_incr_ref(sam_hdr_t *h); + +/* + * Macros for changing the \@HD line. They eliminate the need to use NULL method arguments. + */ + +/// Returns the SAM formatted text of the \@HD header line +#define sam_hdr_find_hd(h, ks) sam_hdr_find_line_id((h), "HD", NULL, NULL, (ks)) +/// Returns the value associated with a given \@HD line tag +#define sam_hdr_find_tag_hd(h, key, ks) sam_hdr_find_tag_id((h), "HD", NULL, NULL, (key), (ks)) +/// Adds or updates tags on the header \@HD line +#define sam_hdr_update_hd(h, ...) sam_hdr_update_line((h), "HD", NULL, NULL, __VA_ARGS__, NULL) +/// Removes the \@HD line tag with the given key +#define sam_hdr_remove_tag_hd(h, key) sam_hdr_remove_tag_id((h), "HD", NULL, NULL, (key)) + +/* Alignment */ + +/// Create a new bam1_t alignment structure +/** + @return An empty bam1_t structure on success, NULL on failure + + The bam1_t struct returned by a successful call should be freed + via bam_destroy1() when it is no longer needed. + */ +HTSLIB_EXPORT +bam1_t *bam_init1(void); + +/// Destroy a bam1_t structure +/** + @param b structure to destroy + + Does nothing if @p b is NULL. If not, all memory associated with @p b + will be freed, along with the structure itself. @p b should not be + accessed after calling this function. 
+ */ +HTSLIB_EXPORT +void bam_destroy1(bam1_t *b); + +#define BAM_USER_OWNS_STRUCT 1 +#define BAM_USER_OWNS_DATA 2 + +/// Set alignment record memory policy +/** + @param b Alignment record + @param policy Desired policy + + Allows the way HTSlib reallocates and frees bam1_t data to be + changed. @p policy can be set to the bitwise-or of the following + values: + + \li \c BAM_USER_OWNS_STRUCT + If this is set then bam_destroy1() will not try to free the bam1_t struct. + + \li \c BAM_USER_OWNS_DATA + If this is set, bam_destroy1() will not free the bam1_t::data pointer. + Also, functions which need to expand bam1_t::data memory will change + behaviour. Instead of calling realloc() on the pointer, they will + allocate a new data buffer and copy any existing content in to it. + The existing memory will \b not be freed. bam1_t::data will be + set to point to the new memory and the BAM_USER_OWNS_DATA flag will be + cleared. + + BAM_USER_OWNS_STRUCT allows bam_destroy1() to be called on bam1_t + structures that are members of an array. + + BAM_USER_OWNS_DATA can be used by applications that want more control + over where the variable-length parts of the bam record will be stored. + By preventing calls to free() and realloc(), it allows bam1_t::data + to hold pointers to memory that cannot be passed to those functions. + + Example: Read a block of alignment records, storing the variable-length + data in a single buffer and the records in an array. Stop when either + the array or the buffer is full.
+ + \code{.c} + #define MAX_RECS 1000 + #define REC_LENGTH 400 // Average length estimate, to get buffer size + size_t bufsz = MAX_RECS * REC_LENGTH, nrecs = 0, buff_used = 0; + bam1_t *recs = calloc(MAX_RECS, sizeof(bam1_t)); + uint8_t *buffer = malloc(bufsz); + int res = 0, result = EXIT_FAILURE; + uint32_t new_m_data; + + if (!recs || !buffer) goto cleanup; + for (nrecs = 0; nrecs < MAX_RECS; nrecs++) { + bam_set_mempolicy(&recs[nrecs], BAM_USER_OWNS_STRUCT|BAM_USER_OWNS_DATA); + + // Set data pointer to unused part of buffer + recs[nrecs].data = &buffer[buff_used]; + + // Set m_data to size of unused part of buffer. On 64-bit platforms it + // will be necessary to limit this to UINT32_MAX due to the size of + // bam1_t::m_data (not done here as our buffer is only 400K). + recs[nrecs].m_data = bufsz - buff_used; + + // Read the record + res = sam_read1(file_handle, header, &recs[nrecs]); + if (res < 0) break; // EOF (-1) or error (< -1) + + // Check if the record data didn't fit - if not, stop reading + if ((bam_get_mempolicy(&recs[nrecs]) & BAM_USER_OWNS_DATA) == 0) { + nrecs++; // Include last record in count + break; + } + + // Adjust m_data to the space actually used. If space is available, + // round up to eight bytes so the next record aligns nicely. + new_m_data = ((uint32_t) recs[nrecs].l_data + 7) & (~7U); + if (new_m_data < recs[nrecs].m_data) recs[nrecs].m_data = new_m_data; + + buff_used += recs[nrecs].m_data; + } + if (res < -1) goto cleanup; + result = EXIT_SUCCESS; + + // ... use data ...
+ + cleanup: + for (size_t i = 0; i < nrecs; i++) + bam_destroy1(&recs[i]); + free(buffer); + free(recs); + + \endcode +*/ +static inline void bam_set_mempolicy(bam1_t *b, uint32_t policy) { + b->mempolicy = policy; +} + +/// Get alignment record memory policy +/** @param b Alignment record + + See bam_set_mempolicy() + */ +static inline uint32_t bam_get_mempolicy(bam1_t *b) { + return b->mempolicy; +} + +/// Read a BAM format alignment record +/** + @param fp BGZF file being read + @param b Destination for the alignment data + @return number of bytes read on success + -1 at end of file + < -1 on failure + + This function can only read BAM format files. Most code should use + sam_read1() instead, which can be used with BAM, SAM and CRAM formats. +*/ +HTSLIB_EXPORT +int bam_read1(BGZF *fp, bam1_t *b) HTS_RESULT_USED; + +/// Write a BAM format alignment record +/** + @param fp BGZF file being written + @param b Alignment record to write + @return number of bytes written on success + -1 on error + + This function can only write BAM format files. Most code should use + sam_write1() instead, which can be used with BAM, SAM and CRAM formats. +*/ +HTSLIB_EXPORT +int bam_write1(BGZF *fp, const bam1_t *b) HTS_RESULT_USED; + +/// Copy alignment record data +/** + @param bdst Destination alignment record + @param bsrc Source alignment record + @return bdst on success; NULL on failure + */ +HTSLIB_EXPORT +bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc) HTS_RESULT_USED; + +/// Create a duplicate alignment record +/** + @param bsrc Source alignment record + @return Pointer to a new alignment record on success; NULL on failure + + The bam1_t struct returned by a successful call should be freed + via bam_destroy1() when it is no longer needed.
+ */ +HTSLIB_EXPORT +bam1_t *bam_dup1(const bam1_t *bsrc); + +/// Calculate query length from CIGAR data +/** + @param n_cigar Number of items in @p cigar + @param cigar CIGAR data + @return Query length + + CIGAR data is stored as in the BAM format, i.e. (op_len << 4) | op + where op_len is the length in bases and op is a value between 0 and 8 + representing one of the operations "MIDNSHP=X" (M = 0; X = 8) + + This function returns the sum of the lengths of the M, I, S, = and X + operations in @p cigar (these are the operations that "consume" query + bases). All other operations (including invalid ones) are ignored. + + @note This return type of this function is hts_pos_t so that it can + correctly return the length of CIGAR sequences including many long + operations without overflow. However, other restrictions (notably the sizes + of bam1_core_t::l_qseq and bam1_t::data) limit the maximum query sequence + length supported by HTSlib to fewer than INT_MAX bases. + */ +HTSLIB_EXPORT +hts_pos_t bam_cigar2qlen(int n_cigar, const uint32_t *cigar); + +/// Calculate reference length from CIGAR data +/** + @param n_cigar Number of items in @p cigar + @param cigar CIGAR data + @return Reference length + + CIGAR data is stored as in the BAM format, i.e. (op_len << 4) | op + where op_len is the length in bases and op is a value between 0 and 8 + representing one of the operations "MIDNSHP=X" (M = 0; X = 8) + + This function returns the sum of the lengths of the M, D, N, = and X + operations in @p cigar (these are the operations that "consume" reference + bases). All other operations (including invalid ones) are ignored. + */ +HTSLIB_EXPORT +hts_pos_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar); + +/*! @abstract Calculate the rightmost base position of an alignment on the reference genome. @@ -297,33 +1044,69 @@ typedef struct { @discussion For a mapped read, this is just b->core.pos + bam_cigar2rlen. 
For an unmapped read (either according to its flags or if it has no cigar string), we return b->core.pos + 1 by convention. - */ - int32_t bam_endpos(const bam1_t *b); + */ +HTSLIB_EXPORT +hts_pos_t bam_endpos(const bam1_t *b); - int bam_str2flag(const char *str); /** returns negative value on error */ - char *bam_flag2str(int flag); /** The string must be freed by the user */ +HTSLIB_EXPORT +int bam_str2flag(const char *str); /** returns negative value on error */ - /************************* - *** BAM/CRAM indexing *** - *************************/ +HTSLIB_EXPORT +char *bam_flag2str(int flag); /** The string must be freed by the user */ - // These BAM iterator functions work only on BAM files. To work with either - // BAM or CRAM files use the sam_index_load() & sam_itr_*() functions. - #define bam_itr_destroy(iter) hts_itr_destroy(iter) - #define bam_itr_queryi(idx, tid, beg, end) sam_itr_queryi(idx, tid, beg, end) - #define bam_itr_querys(idx, hdr, region) sam_itr_querys(idx, hdr, region) - #define bam_itr_next(htsfp, itr, r) hts_itr_next((htsfp)->fp.bgzf, (itr), (r), 0) +/*! @function + @abstract Set the name of the query + @param b pointer to an alignment + @return 0 on success, -1 on failure + */ +HTSLIB_EXPORT +int bam_set_qname(bam1_t *b, const char *qname); + +/************************* + *** BAM/CRAM indexing *** + *************************/ + +// These BAM iterator functions work only on BAM files. To work with either +// BAM or CRAM files use the sam_index_load() & sam_itr_*() functions. +#define bam_itr_destroy(iter) hts_itr_destroy(iter) +#define bam_itr_queryi(idx, tid, beg, end) sam_itr_queryi(idx, tid, beg, end) +#define bam_itr_querys(idx, hdr, region) sam_itr_querys(idx, hdr, region) +#define bam_itr_next(htsfp, itr, r) hts_itr_next((htsfp)->fp.bgzf, (itr), (r), 0) // Load/build .csi or .bai BAM index file. Does not work with CRAM. // It is recommended to use the sam_index_* functions below instead. 
#define bam_index_load(fn) hts_idx_load((fn), HTS_FMT_BAI) #define bam_index_build(fn, min_shift) (sam_index_build((fn), (min_shift))) +/// Initialise fp->idx for the current format type for SAM, BAM and CRAM types. +/** @param fp File handle for the data file being written. + @param h Bam header structure (needed for BAI and CSI). + @param min_shift 0 for BAI, or larger for CSI (CSI defaults to 14). + @param fnidx Filename to write index to. This pointer must remain valid + until after sam_idx_save is called. + @return 0 on success, <0 on failure. + + @note This must be called after the header has been written, but before + any other data. +*/ +HTSLIB_EXPORT +int sam_idx_init(htsFile *fp, sam_hdr_t *h, int min_shift, const char *fnidx); + +/// Writes the index initialised with sam_idx_init to disk. +/** @param fp File handle for the data file being written. + @return 0 on success, <0 on failure. +*/ +HTSLIB_EXPORT +int sam_idx_save(htsFile *fp) HTS_RESULT_USED; + /// Load a BAM (.csi or .bai) or CRAM (.crai) index file /** @param fp File handle of the data file whose index is being opened @param fn BAM/CRAM/etc filename to search alongside for the index file @return The index, or NULL if an error occurred.
+ +Equivalent to sam_index_load3(fp, fn, fnidx, HTS_IDX_SAVE_REMOTE); */ +HTSLIB_EXPORT hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx); +/// Load or stream a BAM (.csi or .bai) or CRAM (.crai) index file +/** @param fp File handle of the data file whose index is being opened + @param fn BAM/CRAM/etc data file filename + @param fnidx Index filename, or NULL to search alongside @a fn + @param flags Flags to alter behaviour (see description) + @return The index, or NULL if an error occurred. + +The @p flags parameter can be set to a combination of the following values: + + HTS_IDX_SAVE_REMOTE Save a local copy of any remote indexes + HTS_IDX_SILENT_FAIL Fail silently if the index is not present + +Note that HTS_IDX_SAVE_REMOTE has no effect for remote CRAM indexes. They +are always downloaded and never cached locally. + +The index struct returned by a successful call should be freed +via hts_idx_destroy() when it is no longer needed. +*/ +HTSLIB_EXPORT +hts_idx_t *sam_index_load3(htsFile *fp, const char *fn, const char *fnidx, int flags); + /// Generate and save an index file /** @param fn Input BAM/etc filename, to which .csi/etc will be added @param min_shift Positive to generate CSI, or 0 to generate BAI @@ -341,6 +1148,7 @@ hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx); -2: opening fn failed; -3: format not indexable; -4: failed to create and/or save the index) */ +HTSLIB_EXPORT int sam_index_build(const char *fn, int min_shift) HTS_RESULT_USED; /// Generate and save an index to a specific file @@ -350,16 +1158,146 @@ int sam_index_build(const char *fn, int min_shift) HTS_RESULT_USED; @return 0 if successful, or negative if an error occurred (see sam_index_build for error codes) */ +HTSLIB_EXPORT int sam_index_build2(const char *fn, const char *fnidx, int min_shift) HTS_RESULT_USED; + +/// Generate and save an index to a specific file +/** @param fn Input BAM/CRAM/etc filename + @param fnidx Output filename, 
or NULL to add .bai/.csi/etc to @a fn + @param min_shift Positive to generate CSI, or 0 to generate BAI + @param nthreads Number of threads to use when building the index + @return 0 if successful, or negative if an error occurred (see + sam_index_build for error codes) +*/ +HTSLIB_EXPORT int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthreads) HTS_RESULT_USED; - #define sam_itr_destroy(iter) hts_itr_destroy(iter) - hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end); - hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region); - hts_itr_multi_t *sam_itr_regions(const hts_idx_t *idx, bam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount); +/// Free a SAM iterator +/// @param iter Iterator to free +#define sam_itr_destroy(iter) hts_itr_destroy(iter) + +/// Create a BAM/CRAM iterator +/** @param idx Index + @param tid Target id + @param beg Start position in target + @param end End position in target + @return An iterator on success; NULL on failure + +The following special values (defined in htslib/hts.h)can be used for @p tid. +When using one of these values, @p beg and @p end are ignored. - #define sam_itr_next(htsfp, itr, r) hts_itr_next((htsfp)->fp.bgzf, (itr), (r), (htsfp)) - #define sam_itr_multi_next(htsfp, itr, r) hts_itr_multi_next((htsfp), (itr), (r)) + HTS_IDX_NOCOOR iterates over unmapped reads sorted at the end of the file + HTS_IDX_START iterates over the entire file + HTS_IDX_REST iterates from the current position to the end of the file + HTS_IDX_NONE always returns "no more alignment records" + +When using HTS_IDX_REST or HTS_IDX_NONE, NULL can be passed in to @p idx. 
+ */ +HTSLIB_EXPORT +hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end); + +/// Create a SAM/BAM/CRAM iterator +/** @param idx Index + @param hdr Header + @param region Region specification + @return An iterator on success; NULL on failure + +Regions are parsed by hts_parse_reg(), and take one of the following forms: + +region | Outputs +--------------- | ------------- +REF | All reads with RNAME REF +REF: | All reads with RNAME REF +REF:START | Reads with RNAME REF overlapping START to end of REF +REF:-END | Reads with RNAME REF overlapping start of REF to END +REF:START-END | Reads with RNAME REF overlapping START to END +. | All reads from the start of the file +* | Unmapped reads at the end of the file (RNAME '*' in SAM) + +The form `REF:` should be used when the reference name itself contains a colon. + +Note that SAM files must be bgzf-compressed for iterators to work. + */ +HTSLIB_EXPORT +hts_itr_t *sam_itr_querys(const hts_idx_t *idx, sam_hdr_t *hdr, const char *region); + +/// Create a multi-region iterator +/** @param idx Index + @param hdr Header + @param reglist Array of regions to iterate over + @param regcount Number of items in reglist + +Each @p reglist entry should have the reference name in the `reg` field, an +array of regions for that reference in `intervals` and the number of items +in `intervals` should be stored in `count`. No other fields need to be filled +in. + +The iterator will return all reads overlapping the given regions. If a read +overlaps more than one region, it will only be returned once. 
+ */ +HTSLIB_EXPORT +hts_itr_t *sam_itr_regions(const hts_idx_t *idx, sam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount); + +/// Create a multi-region iterator +/** @param idx Index + @param hdr Header + @param regarray Array of ref:interval region specifiers + @param regcount Number of items in regarray + +Each @p regarray entry is parsed by hts_parse_reg(), and takes one of the +following forms: + +region | Outputs +--------------- | ------------- +REF | All reads with RNAME REF +REF: | All reads with RNAME REF +REF:START | Reads with RNAME REF overlapping START to end of REF +REF:-END | Reads with RNAME REF overlapping start of REF to END +REF:START-END | Reads with RNAME REF overlapping START to END +. | All reads from the start of the file +* | Unmapped reads at the end of the file (RNAME '*' in SAM) + +The form `REF:` should be used when the reference name itself contains a colon. + +The iterator will return all reads overlapping the given regions. If a read +overlaps more than one region, it will only be returned once. + */ +HTSLIB_EXPORT +hts_itr_t *sam_itr_regarray(const hts_idx_t *idx, sam_hdr_t *hdr, char **regarray, unsigned int regcount); + +/// Get the next read from a SAM/BAM/CRAM iterator +/** @param htsfp Htsfile pointer for the input file + @param itr Iterator + @param r Pointer to a bam1_t struct + @return >= 0 on success; -1 when there is no more data; < -1 on error + */ +static inline int sam_itr_next(htsFile *htsfp, hts_itr_t *itr, bam1_t *r) { + if (!htsfp->is_bgzf && !htsfp->is_cram) { + hts_log_error("%s not BGZF compressed", htsfp->fn ? htsfp->fn : "File"); + return -2; + } + if (!itr) { + hts_log_error("Null iterator"); + return -2; + } + + if (itr->multi) + return hts_itr_multi_next(htsfp, itr, r); + else + return hts_itr_next(htsfp->is_bgzf ? 
htsfp->fp.bgzf : NULL, itr, r, htsfp); +} + +/// Get the next read from a BAM/CRAM multi-iterator +/** @param htsfp Htsfile pointer for the input file + @param itr Iterator + @param r Pointer to a bam1_t struct + @return >= 0 on success; -1 when there is no more data; < -1 on error + */ +#define sam_itr_multi_next(htsfp, itr, r) sam_itr_next(htsfp, itr, r) + +HTSLIB_EXPORT +const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid, + hts_pos_t *beg, hts_pos_t *end, int flags); /*************** *** SAM I/O *** @@ -369,29 +1307,41 @@ int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthre #define sam_open_format(fn, mode, fmt) (hts_open_format((fn), (mode), (fmt))) #define sam_close(fp) hts_close(fp) + HTSLIB_EXPORT int sam_open_mode(char *mode, const char *fn, const char *format); // A version of sam_open_mode that can handle ,key=value options. // The format string is allocated and returned, to be freed by the caller. // Prefix should be "r" or "w", + HTSLIB_EXPORT char *sam_open_mode_opts(const char *fn, const char *mode, const char *format); - typedef htsFile samFile; - bam_hdr_t *sam_hdr_parse(int l_text, const char *text); - bam_hdr_t *sam_hdr_read(samFile *fp); - int sam_hdr_write(samFile *fp, const bam_hdr_t *h) HTS_RESULT_USED; - int sam_hdr_change_HD(bam_hdr_t *h, const char *key, const char *val); + HTSLIB_EXPORT + int sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val); - int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b) HTS_RESULT_USED; - int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str) HTS_RESULT_USED; + HTSLIB_EXPORT + int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b) HTS_RESULT_USED; + HTSLIB_EXPORT + int sam_format1(const sam_hdr_t *h, const bam1_t *b, kstring_t *str) HTS_RESULT_USED; - /*! 
- * @return >= 0 on successfully reading a new record, -1 on end of stream, < -1 on error - **/ - int sam_read1(samFile *fp, bam_hdr_t *h, bam1_t *b) HTS_RESULT_USED; - int sam_write1(samFile *fp, const bam_hdr_t *h, const bam1_t *b) HTS_RESULT_USED; +/// sam_read1 - Read a record from a file +/** @param fp Pointer to the source file + * @param h Pointer to the header previously read (fully or partially) + * @param b Pointer to the record placeholder + * @return >= 0 on successfully reading a new record, -1 on end of stream, < -1 on error + */ + HTSLIB_EXPORT + int sam_read1(samFile *fp, sam_hdr_t *h, bam1_t *b) HTS_RESULT_USED; +/// sam_write1 - Write a record to a file +/** @param fp Pointer to the destination file + * @param h Pointer to the header structure previously read + * @param b Pointer to the record to be written + * @return >= 0 on successfully writing the record, -1 on error + */ + HTSLIB_EXPORT + int sam_write1(samFile *fp, const sam_hdr_t *h, const bam1_t *b) HTS_RESULT_USED; /************************************* *** Manipulating auxiliary fields *** @@ -406,6 +1356,7 @@ int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthre invalid type, or the last record is incomplete) then errno is set to EINVAL and NULL is returned. */ +HTSLIB_EXPORT uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]); /// Get an integer aux value @@ -414,6 +1365,7 @@ uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]); If the tag is not an integer type, errno is set to EINVAL. This function will not return the value of floating-point tags. */ +HTSLIB_EXPORT int64_t bam_aux2i(const uint8_t *s); /// Get an integer aux value @@ -422,6 +1374,7 @@ int64_t bam_aux2i(const uint8_t *s); If the tag is not an numeric type, errno is set to EINVAL. The value of integer flags will be returned cast to a double. 
*/ +HTSLIB_EXPORT double bam_aux2f(const uint8_t *s); /// Get a character aux value @@ -429,6 +1382,7 @@ double bam_aux2f(const uint8_t *s); @return The value, or 0 if the tag was not a character ('A') type If the tag is not a character type, errno is set to EINVAL. */ +HTSLIB_EXPORT char bam_aux2A(const uint8_t *s); /// Get a string aux value @@ -436,6 +1390,7 @@ char bam_aux2A(const uint8_t *s); @return Pointer to the string, or NULL if the tag was not a string type If the tag is not a string type ('Z' or 'H'), errno is set to EINVAL. */ +HTSLIB_EXPORT char *bam_aux2Z(const uint8_t *s); /// Get the length of an array-type ('B') tag @@ -443,6 +1398,7 @@ char *bam_aux2Z(const uint8_t *s); @return The length of the array, or 0 if the tag is not an array type. If the tag is not an array type, errno is set to EINVAL. */ +HTSLIB_EXPORT uint32_t bam_auxB_len(const uint8_t *s); /// Get an integer value from an array-type tag @@ -453,6 +1409,7 @@ uint32_t bam_auxB_len(const uint8_t *s); is greater than or equal to the value returned by bam_auxB_len(s), errno is set to ERANGE. In both cases, 0 will be returned. */ +HTSLIB_EXPORT int64_t bam_auxB2i(const uint8_t *s, uint32_t idx); /// Get a floating-point value from an array-type tag @@ -464,6 +1421,7 @@ int64_t bam_auxB2i(const uint8_t *s, uint32_t idx); idx is greater than or equal to the value returned by bam_auxB_len(s), errno is set to ERANGE. In both cases, 0.0 will be returned. */ +HTSLIB_EXPORT double bam_auxB2f(const uint8_t *s, uint32_t idx); /// Append tag data to a bam record @@ -477,6 +1435,7 @@ If there is not enough space to store the additional tag, errno is set to ENOMEM. If the type is invalid, errno may be set to EINVAL. errno is also set to EINVAL if the bam record's aux data is corrupt. 
*/ +HTSLIB_EXPORT int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data); /// Delete tag data from a bam record @@ -486,6 +1445,7 @@ int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8 If the bam record's aux data is corrupt, errno is set to EINVAL and this function returns -1; */ +HTSLIB_EXPORT int bam_aux_del(bam1_t *b, uint8_t *s); /// Update or add a string-type tag @@ -506,6 +1466,7 @@ int bam_aux_del(bam1_t *b, uint8_t *s); reallocate the data buffer failed or the resulting buffer would be longer than the maximum size allowed in a bam record (2Gbytes). */ +HTSLIB_EXPORT int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data); /// Update or add an integer tag @@ -529,6 +1490,7 @@ int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data); reallocate the data buffer failed or the resulting buffer would be longer than the maximum size allowed in a bam record (2Gbytes). */ +HTSLIB_EXPORT int bam_aux_update_int(bam1_t *b, const char tag[2], int64_t val); /// Update or add a floating-point tag @@ -548,6 +1510,7 @@ int bam_aux_update_int(bam1_t *b, const char tag[2], int64_t val); reallocate the data buffer failed or the resulting buffer would be longer than the maximum size allowed in a bam record (2Gbytes). */ +HTSLIB_EXPORT int bam_aux_update_float(bam1_t *b, const char tag[2], float val); /// Update or add an array tag @@ -586,6 +1549,7 @@ int bam_aux_update_float(bam1_t *b, const char tag[2], float val); reallocate the data buffer failed or the resulting buffer would be longer than the maximum size allowed in a bam record (2Gbytes). 
*/ +HTSLIB_EXPORT int bam_aux_update_array(bam1_t *b, const char tag[2], uint8_t type, uint32_t items, void *data); @@ -617,10 +1581,11 @@ typedef union { @field indel indel length; 0 for no indel, positive for ins and negative for del @field level the level of the read in the "viewer" mode @field is_del 1 iff the base on the padded read is a deletion - @field is_head ??? - @field is_tail ??? - @field is_refskip ??? - @field aux ??? + @field is_head 1 iff this is the first base in the query sequence + @field is_tail 1 iff this is the last base in the query sequence + @field is_refskip 1 iff the base on the padded read is part of CIGAR N op + @field aux (used by bcf_call_gap_prep()) + @field cigar_ind index of the CIGAR operator that has just been processed @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The difference between the two functions is that the former does not @@ -632,8 +1597,9 @@ typedef struct { bam1_t *b; int32_t qpos; int indel, level; - uint32_t is_del:1, is_head:1, is_tail:1, is_refskip:1, aux:28; + uint32_t is_del:1, is_head:1, is_tail:1, is_refskip:1, /* reserved */ :1, aux:27; bam_pileup_cd cd; // generic per-struct data, owned by caller. + int cigar_ind; } bam_pileup1_t; typedef int (*bam_plp_auto_f)(void *data, bam1_t *b); @@ -649,13 +1615,35 @@ typedef struct __bam_mplp_t *bam_mplp_t; * @func: see mplp_func in bam_plcmd.c in samtools for an example. Expected return * status: 0 on success, -1 on end, < -1 on non-recoverable errors * @data: user data to pass to @func + * + * The struct returned by a successful call should be freed + * via bam_plp_destroy() when it is no longer needed. 
*/ + HTSLIB_EXPORT bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data); + + HTSLIB_EXPORT void bam_plp_destroy(bam_plp_t iter); + + HTSLIB_EXPORT int bam_plp_push(bam_plp_t iter, const bam1_t *b); + + HTSLIB_EXPORT const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp); + + HTSLIB_EXPORT const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp); + + HTSLIB_EXPORT + const bam_pileup1_t *bam_plp64_next(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp); + + HTSLIB_EXPORT + const bam_pileup1_t *bam_plp64_auto(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp); + + HTSLIB_EXPORT void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt); + + HTSLIB_EXPORT void bam_plp_reset(bam_plp_t iter); /** @@ -666,27 +1654,71 @@ typedef struct __bam_mplp_t *bam_mplp_t; * a pointer to a locally allocated bam_pileup_cd union. This union * will also be present in each bam_pileup1_t created. */ + HTSLIB_EXPORT void bam_plp_constructor(bam_plp_t plp, int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)); + HTSLIB_EXPORT void bam_plp_destructor(bam_plp_t plp, int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)); + /// Get pileup padded insertion sequence + /** + * @param p pileup data + * @param ins the kstring where the insertion sequence will be written + * @param del_len location for deletion length + * @return the length of insertion string on success; -1 on failure. + * + * Fills out the kstring with the padded insertion sequence for the current + * location in 'p'. If this is not an insertion site, the string is blank. + * + * If del_len is not NULL, the location pointed to is set to the length of + * any deletion immediately following the insertion, or zero if none. 
+ */ + HTSLIB_EXPORT + int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) HTS_RESULT_USED; + + /// Create a new bam_mplp_t structure + /** The struct returned by a successful call should be freed + * via bam_mplp_destroy() when it is no longer needed. + */ + HTSLIB_EXPORT bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data); + + /// Set up mpileup overlap detection /** - * bam_mplp_init_overlaps() - if called, mpileup will detect overlapping + * @param iter mpileup iterator + * @return 0 on success; a negative value on error + * + * If called, mpileup will detect overlapping * read pairs and for each base pair set the base quality of the * lower-quality base to zero, thus effectively discarding it from * calling. If the two bases are identical, the quality of the other base * is increased to the sum of their qualities (capped at 200), otherwise * it is multiplied by 0.8. */ - void bam_mplp_init_overlaps(bam_mplp_t iter); + HTSLIB_EXPORT + int bam_mplp_init_overlaps(bam_mplp_t iter); + + HTSLIB_EXPORT void bam_mplp_destroy(bam_mplp_t iter); + + HTSLIB_EXPORT void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt); + + HTSLIB_EXPORT int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp); + + HTSLIB_EXPORT + int bam_mplp64_auto(bam_mplp_t iter, int *_tid, hts_pos_t *_pos, int *n_plp, const bam_pileup1_t **plp); + + HTSLIB_EXPORT void bam_mplp_reset(bam_mplp_t iter); + + HTSLIB_EXPORT void bam_mplp_constructor(bam_mplp_t iter, int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)); + + HTSLIB_EXPORT void bam_mplp_destructor(bam_mplp_t iter, int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)); @@ -697,7 +1729,8 @@ typedef struct __bam_mplp_t *bam_mplp_t; * BAQ calculation and realignment * ***********************************/ -int sam_cap_mapq(bam1_t *b, const char *ref, int ref_len, int thres); +HTSLIB_EXPORT +int sam_cap_mapq(bam1_t *b, const char *ref, hts_pos_t ref_len, int 
thres); /// Calculate BAQ scores /** @param b BAM record @@ -739,7 +1772,8 @@ Depending on what previous processing happened, this may or may not be the correct thing to do. It would be wise to avoid this situation if possible. */ -int sam_prob_realn(bam1_t *b, const char *ref, int ref_len, int flag); +HTSLIB_EXPORT +int sam_prob_realn(bam1_t *b, const char *ref, hts_pos_t ref_len, int flag); #ifdef __cplusplus } diff --git a/htslib/synced_bcf_reader.h b/htslib/synced_bcf_reader.h index b13c361e5..475f78f3b 100644 --- a/htslib/synced_bcf_reader.h +++ b/htslib/synced_bcf_reader.h @@ -1,7 +1,7 @@ /// @file htslib/synced_bcf_reader.h /// Stream through multiple VCF files. /* - Copyright (C) 2012-2014 Genome Research Ltd. + Copyright (C) 2012-2017, 2019 Genome Research Ltd. Author: Petr Danecek @@ -125,8 +125,9 @@ typedef struct _bcf_sr_regions_t char **seq_names; // sequence names int nseqs; // number of sequences (chromosomes) in the file int iseq; // current position: chr name, index to snames - int start, end; // current position: start, end of the region (0-based) - int prev_seq, prev_start; + hts_pos_t start, end; // current position: start, end of the region (0-based) + int prev_seq; + hts_pos_t prev_start, prev_end; } bcf_sr_regions_t; @@ -184,14 +185,22 @@ typedef struct } bcf_srs_t; -/** Init bcf_srs_t struct */ +/** Allocate and initialize a bcf_srs_t struct. + * + * The bcf_srs_t struct returned by a successful call should be freed + * via bcf_sr_destroy() when it is no longer needed. + */ +HTSLIB_EXPORT bcf_srs_t *bcf_sr_init(void); -/** Destroy bcf_srs_t struct */ +/** Destroy a bcf_srs_t struct */ +HTSLIB_EXPORT void bcf_sr_destroy(bcf_srs_t *readers); +HTSLIB_EXPORT char *bcf_sr_strerror(int errnum); +HTSLIB_EXPORT int bcf_sr_set_opt(bcf_srs_t *readers, bcf_sr_opt_t opt, ...); @@ -201,9 +210,11 @@ int bcf_sr_set_opt(bcf_srs_t *readers, bcf_sr_opt_t opt, ...); * * Returns 0 if the call succeeded, or <0 on error. 
*/ +HTSLIB_EXPORT int bcf_sr_set_threads(bcf_srs_t *files, int n_threads); /** Deallocates thread memory, if owned by us. */ +HTSLIB_EXPORT void bcf_sr_destroy_threads(bcf_srs_t *files); /** @@ -216,7 +227,10 @@ void bcf_sr_destroy_threads(bcf_srs_t *files); * See also the bcf_srs_t data structure for parameters controlling * the reader's logic. */ +HTSLIB_EXPORT int bcf_sr_add_reader(bcf_srs_t *readers, const char *fname); + +HTSLIB_EXPORT void bcf_sr_remove_reader(bcf_srs_t *files, int i); /** @@ -227,7 +241,9 @@ void bcf_sr_remove_reader(bcf_srs_t *files, int i); * (bcf_sr_t.buffer[0]) set at this position. Use the bcf_sr_has_line macro to * determine which of the readers are set. */ +HTSLIB_EXPORT int bcf_sr_next_line(bcf_srs_t *readers); + #define bcf_sr_has_line(readers, i) (readers)->has_line[i] #define bcf_sr_get_line(_readers, i) ((_readers)->has_line[i] ? ((_readers)->readers[i].buffer[0]) : NULL) #define bcf_sr_swap_line(_readers, i, lieu) { bcf1_t *tmp = lieu; lieu = (_readers)->readers[i].buffer[0]; (_readers)->readers[i].buffer[0] = tmp; } @@ -241,7 +257,8 @@ int bcf_sr_next_line(bcf_srs_t *readers); * @seq: sequence name; NULL to seek to start * @pos: 0-based coordinate */ -int bcf_sr_seek(bcf_srs_t *readers, const char *seq, int pos); +HTSLIB_EXPORT +int bcf_sr_seek(bcf_srs_t *readers, const char *seq, hts_pos_t pos); /** * bcf_sr_set_samples() - sets active samples @@ -254,6 +271,7 @@ int bcf_sr_seek(bcf_srs_t *readers, const char *seq, int pos); * * Returns 1 if the call succeeded, or 0 on error. */ +HTSLIB_EXPORT int bcf_sr_set_samples(bcf_srs_t *readers, const char *samples, int is_file); /** @@ -282,7 +300,10 @@ int bcf_sr_set_samples(bcf_srs_t *readers, const char *samples, int is_file); * Targets (but not regions) can be prefixed with "^" to request logical complement, * for example "^X,Y,MT" indicates that sequences X, Y and MT should be skipped. 
*/ +HTSLIB_EXPORT int bcf_sr_set_targets(bcf_srs_t *readers, const char *targets, int is_file, int alleles); + +HTSLIB_EXPORT int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions, int is_file); @@ -306,8 +327,14 @@ int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions, int is_file); * supply 'from' in place of 'to'. When 'to' is negative, first * abs(to) will be attempted and if that fails, 'from' will be used * instead. + * + * The bcf_sr_regions_t struct returned by a successful call should be freed + * via bcf_sr_regions_destroy() when it is no longer needed. */ +HTSLIB_EXPORT bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int chr, int from, int to); + +HTSLIB_EXPORT void bcf_sr_regions_destroy(bcf_sr_regions_t *regions); /* @@ -316,6 +343,7 @@ void bcf_sr_regions_destroy(bcf_sr_regions_t *regions); * Returns 0 on success or -1 on failure. Sets reg->seq appropriately and * reg->start,reg->end to -1. */ +HTSLIB_EXPORT int bcf_sr_regions_seek(bcf_sr_regions_t *regions, const char *chr); /* @@ -325,6 +353,7 @@ int bcf_sr_regions_seek(bcf_sr_regions_t *regions, const char *chr); * NULL,-1,-1 when no region is available. The coordinates are 0-based, * inclusive. */ +HTSLIB_EXPORT int bcf_sr_regions_next(bcf_sr_regions_t *reg); /* @@ -336,13 +365,16 @@ int bcf_sr_regions_next(bcf_sr_regions_t *reg); * regions and more regions exist; -2 if not in the regions and there are no more * regions left. */ -int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, int start, int end); +HTSLIB_EXPORT +int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_pos_t start, hts_pos_t end); /* * bcf_sr_regions_flush() - calls repeatedly regs->missed_reg_handler() until * all remaining records are processed. + * Returns 0 on success, <0 on error. 
*/ -void bcf_sr_regions_flush(bcf_sr_regions_t *regs); +HTSLIB_EXPORT +int bcf_sr_regions_flush(bcf_sr_regions_t *regs); #ifdef __cplusplus } diff --git a/htslib/tbx.h b/htslib/tbx.h index 0a1953891..a00463ea9 100644 --- a/htslib/tbx.h +++ b/htslib/tbx.h @@ -1,7 +1,7 @@ /// @file htslib/tbx.h /// Tabix API functions. /* - Copyright (C) 2009, 2012-2015 Genome Research Ltd. + Copyright (C) 2009, 2012-2015, 2019 Genome Research Ltd. Copyright (C) 2010, 2012 Broad Institute. Author: Heng Li @@ -60,19 +60,79 @@ extern const tbx_conf_t tbx_conf_gff, tbx_conf_bed, tbx_conf_psltbl, tbx_conf_sa #define tbx_itr_next(htsfp, tbx, itr, r) hts_itr_next(hts_get_bgzfp(htsfp), (itr), (r), (tbx)) #define tbx_bgzf_itr_next(bgzfp, tbx, itr, r) hts_itr_next((bgzfp), (itr), (r), (tbx)) + HTSLIB_EXPORT int tbx_name2id(tbx_t *tbx, const char *ss); /* Internal helper function used by tbx_itr_next() */ + HTSLIB_EXPORT BGZF *hts_get_bgzfp(htsFile *fp); - int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, int *beg, int *end); + HTSLIB_EXPORT + int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, hts_pos_t *beg, hts_pos_t *end); + +/// Build an index of the lines in a BGZF-compressed file +/** The index struct returned by a successful call should be freed + via tbx_destroy() when it is no longer needed. 
+*/ + HTSLIB_EXPORT tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf); +/* + * All tbx_index_build* methods return: 0 (success), -1 (general failure) or -2 (compression not BGZF) + */ + HTSLIB_EXPORT int tbx_index_build(const char *fn, int min_shift, const tbx_conf_t *conf); + + HTSLIB_EXPORT int tbx_index_build2(const char *fn, const char *fnidx, int min_shift, const tbx_conf_t *conf); + + HTSLIB_EXPORT int tbx_index_build3(const char *fn, const char *fnidx, int min_shift, int n_threads, const tbx_conf_t *conf); + + +/// Load or stream a .tbi or .csi index +/** @param fn Name of the data file corresponding to the index + + Equivalent to tbx_index_load3(fn, NULL, HTS_IDX_SAVE_REMOTE); +*/ + HTSLIB_EXPORT tbx_t *tbx_index_load(const char *fn); + +/// Load or stream a .tbi or .csi index +/** @param fn Name of the data file corresponding to the index + @param fnidx Name of the indexed file + @return The index, or NULL if an error occurred + + If @p fnidx is NULL, the index name will be derived from @p fn. + + Equivalent to tbx_index_load3(fn, fnidx, HTS_IDX_SAVE_REMOTE); +*/ + HTSLIB_EXPORT tbx_t *tbx_index_load2(const char *fn, const char *fnidx); + +/// Load or stream a .tbi or .csi index +/** @param fn Name of the data file corresponding to the index + @param fnidx Name of the indexed file + @param flags Flags to alter behaviour (see description) + @return The index, or NULL if an error occurred + + If @p fnidx is NULL, the index name will be derived from @p fn. + + The @p flags parameter can be set to a combination of the following + values: + + HTS_IDX_SAVE_REMOTE Save a local copy of any remote indexes + HTS_IDX_SILENT_FAIL Fail silently if the index is not present + + The index struct returned by a successful call should be freed + via tbx_destroy() when it is no longer needed. 
+*/ + HTSLIB_EXPORT + tbx_t *tbx_index_load3(const char *fn, const char *fnidx, int flags); + + HTSLIB_EXPORT const char **tbx_seqnames(tbx_t *tbx, int *n); // free the array but not the values + + HTSLIB_EXPORT void tbx_destroy(tbx_t *tbx); #ifdef __cplusplus diff --git a/htslib/thread_pool.h b/htslib/thread_pool.h index 70fd7391a..715bcc51d 100644 --- a/htslib/thread_pool.h +++ b/htslib/thread_pool.h @@ -1,7 +1,7 @@ /// @file htslib/thread_pool.h /// Thread pool for multi-threading applications. /* - Copyright (c) 2013-2017 Genome Research Ltd. + Copyright (c) 2013-2017, 2019 Genome Research Ltd. Author: James Bonfield @@ -47,6 +47,8 @@ DEALINGS IN THE SOFTWARE. */ #ifndef HTSLIB_THREAD_POOL_H #define HTSLIB_THREAD_POOL_H +#include "hts_defs.h" + #ifdef __cplusplus extern "C" { #endif @@ -98,37 +100,108 @@ typedef struct hts_tpool_result hts_tpool_result; * * Returns pool pointer on success; * NULL on failure + * + * The hts_tpool struct returned by a successful call should be freed + * via hts_tpool_destroy() when it is no longer needed. */ +HTSLIB_EXPORT hts_tpool *hts_tpool_init(int n); /* * Returns the number of requested threads for a pool. */ +HTSLIB_EXPORT int hts_tpool_size(hts_tpool *p); -/* - * Adds an item to the work pool. - * - * FIXME: permit q to be NULL, indicating a global/default pool held by - * the thread pool itself? This pool would be for jobs that have no - * output, so fire and forget only with.. - * - * Returns 0 on success +/// Add an item to the work pool. +/** + * @param p Thread pool + * @param q Process queue + * @param func Function run by the thread pool + * @param arg Data for use by func() + * @return 0 on success * -1 on failure */ - // FIXME: should this drop the hts_tpool*p argument? It's just q->p +HTSLIB_EXPORT int hts_tpool_dispatch(hts_tpool *p, hts_tpool_process *q, void *(*func)(void *arg), void *arg); + +/// Add an item to the work pool, with nonblocking option. 
+/** + * @param p Thread pool + * @param q Process queue + * @param func Function run by the thread pool + * @param arg Data for use by func() + * @param nonblock Non-blocking flag (see description) + * @return 0 on success + * -1 on failure + * + * The @p nonblock parameter can take one of the following values: + * 0 => block if input queue is full + * +1 => don't block if input queue is full, but do not add task + * -1 => add task regardless of whether queue is full (over-size) + * + * If @p nonblock is +1 and the queue is full, -1 will be returned and + * `errno` is set to `EAGAIN`. + */ +HTSLIB_EXPORT int hts_tpool_dispatch2(hts_tpool *p, hts_tpool_process *q, void *(*func)(void *arg), void *arg, int nonblock); +/// Add an item to the work pool, with nonblocking and cleanup callbacks. +/** + * @param p Thread pool + * @param q Process queue + * @param exec_func Function run by the thread pool + * @param arg Data for use by func() + * @param job_cleanup Callback to clean up when discarding jobs + * @param result_cleanup Callback to clean up when discarding result data + * @param nonblock Non-blocking flag (see description) + * @return 0 on success + * -1 on failure + * + * The @p nonblock parameter can take one of the following values: + * 0 => block if input queue is full + * +1 => don't block if input queue is full, but do not add task + * -1 => add task regardless of whether queue is full (over-size) + * + * If @p nonblock is +1 and the queue is full, -1 will be returned and + * `errno` is set to `EAGAIN`. + * + * The job_cleanup() and result_cleanup() callbacks are used when discarding + * data from a queue, for example when calling hts_tpool_process_reset() + * or hts_tpool_process_destroy(). + * + * If not NULL, job_cleanup() will be called for each pending job with the + * value of @p arg that was set for that job. This can be used to free + * any data associated with @p arg, and also @p arg itself. 
+ * Similarly, result_cleanup() can be used to free any results left by + * jobs that had started before hts_tpool_process_reset() was called. + * The argument passed to result_cleanup() is the pointer that would + * have been returned by calling hts_tpool_result_data() on the result + * when pulled from the queue. + * + * job_cleanup() and result_cleanup() are only called when discarding jobs. + * For jobs that are processed normally, it is the responsibility of + * exec_func() and / or consumers of any results to do any cleaning up + * necessary. + */ +HTSLIB_EXPORT +int hts_tpool_dispatch3(hts_tpool *p, hts_tpool_process *q, + void *(*exec_func)(void *arg), void *arg, + void (*job_cleanup)(void *arg), + void (*result_cleanup)(void *data), + int nonblock); + /* * Wakes up a single thread stuck in dispatch and make it return with * errno EAGAIN. */ +HTSLIB_EXPORT void hts_tpool_wake_dispatch(hts_tpool_process *q); /* @@ -142,6 +215,7 @@ void hts_tpool_wake_dispatch(hts_tpool_process *q); * Returns 0 on success; * -1 on failure */ +HTSLIB_EXPORT int hts_tpool_process_flush(hts_tpool_process *q); /* @@ -156,9 +230,11 @@ int hts_tpool_process_flush(hts_tpool_process *q); * Returns 0 on success; * -1 on failure */ +HTSLIB_EXPORT int hts_tpool_process_reset(hts_tpool_process *q, int free_results); /* Returns the process queue size */ +HTSLIB_EXPORT int hts_tpool_process_qsize(hts_tpool_process *q); @@ -166,12 +242,14 @@ int hts_tpool_process_qsize(hts_tpool_process *q); * Destroys a thread pool. The threads are joined into the main * thread so they will finish their current work load. */ +HTSLIB_EXPORT void hts_tpool_destroy(hts_tpool *p); /* * Destroys a thread pool without waiting on jobs to complete. * Use hts_tpool_kill(p) to quickly exit after a fatal error. */ +HTSLIB_EXPORT void hts_tpool_kill(hts_tpool *p); /* @@ -184,6 +262,7 @@ void hts_tpool_kill(hts_tpool *p); * Returns hts_tpool_result pointer if a result is ready. * NULL if not. 
*/ +HTSLIB_EXPORT hts_tpool_result *hts_tpool_next_result(hts_tpool_process *q); /* @@ -196,18 +275,21 @@ hts_tpool_result *hts_tpool_next_result(hts_tpool_process *q); * Returns hts_tpool_result pointer if a result is ready. * NULL on error or during shutdown. */ +HTSLIB_EXPORT hts_tpool_result *hts_tpool_next_result_wait(hts_tpool_process *q); /* * Frees a result 'r' and if free_data is true also frees * the internal r->data result too. */ +HTSLIB_EXPORT void hts_tpool_delete_result(hts_tpool_result *r, int free_data); /* * Returns the data portion of a hts_tpool_result, corresponding * to the actual "result" itself. */ +HTSLIB_EXPORT void *hts_tpool_result_data(hts_tpool_result *r); /* @@ -219,40 +301,38 @@ void *hts_tpool_result_data(hts_tpool_result *r); * * Results hts_tpool_process pointer on success; * NULL on failure + * + * The hts_tpool_process struct returned by a successful call should be freed + * via hts_tpool_process_destroy() when it is no longer needed. */ +HTSLIB_EXPORT hts_tpool_process *hts_tpool_process_init(hts_tpool *p, int qsize, int in_only); /* Deallocates memory for a thread process-queue. * Must be called before the thread pool is destroyed. */ +HTSLIB_EXPORT void hts_tpool_process_destroy(hts_tpool_process *q); -/* - * Flushes the thread pool, but doesn't exit. This simply drains the - * process-queue and ensures all worker threads have finished their current - * task if associated with this process. - * - * Returns 0 on success; - * -1 on failure - */ -int hts_tpool_process_flush(hts_tpool_process *q); - /* * Returns true if there are no items in the process results queue and * also none still pending. */ +HTSLIB_EXPORT int hts_tpool_process_empty(hts_tpool_process *q); /* * Returns the number of completed jobs in the process results queue. */ +HTSLIB_EXPORT int hts_tpool_process_len(hts_tpool_process *q); /* * Returns the number of completed jobs in the process results queue plus the * number running and queued up to run. 
*/ +HTSLIB_EXPORT int hts_tpool_process_sz(hts_tpool_process *q); /* @@ -261,6 +341,7 @@ int hts_tpool_process_sz(hts_tpool_process *q); * This sets the shutdown flag and wakes any threads waiting on process * condition variables. */ +HTSLIB_EXPORT void hts_tpool_process_shutdown(hts_tpool_process *q); /* @@ -271,7 +352,10 @@ void hts_tpool_process_shutdown(hts_tpool_process *q); * to temporarily detach if we wish to stop running jobs on a specific * process while permitting other process to continue. */ +HTSLIB_EXPORT void hts_tpool_process_attach(hts_tpool *p, hts_tpool_process *q); + +HTSLIB_EXPORT void hts_tpool_process_detach(hts_tpool *p, hts_tpool_process *q); /* @@ -280,7 +364,10 @@ void hts_tpool_process_detach(hts_tpool *p, hts_tpool_process *q); * threads, eg "main" and a "reader", this permits each end to * decrement its use of the process-queue independently. */ +HTSLIB_EXPORT void hts_tpool_process_ref_incr(hts_tpool_process *q); + +HTSLIB_EXPORT void hts_tpool_process_ref_decr(hts_tpool_process *q); #ifdef __cplusplus diff --git a/htslib/vcf.h b/htslib/vcf.h index 7f65d54d4..caedfebf4 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -2,7 +2,7 @@ /// High-level VCF/BCF variant calling file operations. /* Copyright (C) 2012, 2013 Broad Institute. - Copyright (C) 2012-2014 Genome Research Ltd. + Copyright (C) 2012-2019 Genome Research Ltd. Author: Heng Li @@ -36,6 +36,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include "hts.h" #include "kstring.h" #include "hts_defs.h" @@ -60,6 +61,7 @@ extern "C" { #define BCF_HT_INT 1 #define BCF_HT_REAL 2 #define BCF_HT_STR 3 +#define BCF_HT_LONG (BCF_HT_INT | 0x100) // BCF_HT_INT, but for int64_t values; VCF only! 
#define BCF_VL_FIXED 0 // variable length #define BCF_VL_VAR 1 @@ -93,7 +95,7 @@ typedef struct { } bcf_hrec_t; typedef struct { - uint32_t info[3]; // stores Number:20, var:4, Type:4, ColType:4 in info[0..2] + uint64_t info[3]; // stores Number:20, var:4, Type:4, ColType:4 in info[0..2] // for BCF_HL_FLT,INFO,FMT and contig length in info[0] for BCF_HL_CTG bcf_hrec_t *hrec[3]; int id; @@ -129,15 +131,17 @@ extern uint8_t bcf_type_shift[]; #define BCF_BT_INT8 1 #define BCF_BT_INT16 2 #define BCF_BT_INT32 3 +#define BCF_BT_INT64 4 // Unofficial, for internal use only. #define BCF_BT_FLOAT 5 #define BCF_BT_CHAR 7 -#define VCF_REF 0 -#define VCF_SNP 1 -#define VCF_MNP 2 -#define VCF_INDEL 4 -#define VCF_OTHER 8 -#define VCF_BND 16 // breakend +#define VCF_REF 0 +#define VCF_SNP 1 +#define VCF_MNP 2 +#define VCF_INDEL 4 +#define VCF_OTHER 8 +#define VCF_BND 16 // breakend +#define VCF_OVERLAP 32 // overlapping deletion, ALT=* typedef struct { int type, n; // variant type and the number of bases affected, negative for deletions @@ -153,9 +157,9 @@ typedef struct { typedef struct { int key; // key: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$key].key - int type, len; // type: one of BCF_BT_* types; len: vector length, 1 for scalars + int type; // type: one of BCF_BT_* types union { - int32_t i; // integer value + int64_t i; // integer value float f; // float value } v1; // only set if $len==1; for easier access uint8_t *vptr; // pointer to data array in bcf1_t->shared.s, excluding the size+type and tag id bytes @@ -163,6 +167,7 @@ typedef struct { uint32_t vptr_off:31, // vptr offset, i.e., the size of the INFO key plus size+type bytes vptr_free:1; // indicates that vptr-vptr_off must be freed; set only when modified and the new // data block is bigger than the original + int len; // vector length, 1 for scalars } bcf_info_t; @@ -206,9 +211,9 @@ typedef struct { line must be formatted in vcf_format. 
*/ typedef struct { + hts_pos_t pos; // POS + hts_pos_t rlen; // length of REF int32_t rid; // CHROM - int32_t pos; // POS - int32_t rlen; // length of REF float qual; // QUAL uint32_t n_info:16, n_allele:16; uint32_t n_fmt:8, n_sample:24; @@ -254,22 +259,34 @@ typedef struct { * * When opened for writing, the mandatory fileFormat and * FILTER=PASS lines are added automatically. + * + * The bcf_hdr_t struct returned by a successful call should be freed + * via bcf_hdr_destroy() when it is no longer needed. */ + HTSLIB_EXPORT bcf_hdr_t *bcf_hdr_init(const char *mode); /** Destroy a BCF header struct */ + HTSLIB_EXPORT void bcf_hdr_destroy(bcf_hdr_t *h); - /** Initialize a bcf1_t object; equivalent to calloc(1, sizeof(bcf1_t)) */ + /** Allocate and initialize a bcf1_t object. + * + * The bcf1_t struct returned by a successful call should be freed + * via bcf_destroy() when it is no longer needed. + */ + HTSLIB_EXPORT bcf1_t *bcf_init(void); /** Deallocate a bcf1_t object */ + HTSLIB_EXPORT void bcf_destroy(bcf1_t *v); /** * Same as bcf_destroy() but frees only the memory allocated by bcf1_t, * not the bcf1_t object itself. */ + HTSLIB_EXPORT void bcf_empty(bcf1_t *v); /** @@ -277,6 +294,7 @@ typedef struct { * internal use, the user should rarely need to call this function * directly. */ + HTSLIB_EXPORT void bcf_clear(bcf1_t *v); @@ -287,17 +305,25 @@ typedef struct { #define bcf_close(fp) hts_close(fp) #define vcf_close(fp) hts_close(fp) - /** Reads VCF or BCF header */ - bcf_hdr_t *bcf_hdr_read(htsFile *fp); + /// Read a VCF or BCF header + /** @param fp The file to read the header from + @return Pointer to a populated header structure on success; + NULL on failure + + The bcf_hdr_t struct returned by a successful call should be freed + via bcf_hdr_destroy() when it is no longer needed. 
+ */ + HTSLIB_EXPORT + bcf_hdr_t *bcf_hdr_read(htsFile *fp) HTS_RESULT_USED; /** * bcf_hdr_set_samples() - for more efficient VCF parsing when only one/few samples are needed - * @samples: samples to include or exclude from file or as a comma-separated string. + * @param samples samples to include or exclude from file or as a comma-separated string. * LIST|FILE .. select samples in list/file * ^LIST|FILE .. exclude samples from list/file * - .. include all samples * NULL .. exclude all samples - * @is_file: @samples is a file (1) or a comma-separated list (0) + * @param is_file @p samples is a file (1) or a comma-separated list (0) * * The bottleneck of VCF reading is parsing of genotype fields. If the * reader knows in advance that only subset of samples is needed (possibly @@ -312,31 +338,43 @@ typedef struct { * contains samples not present in the VCF header. In such a case, the * return value is the index of the offending sample. */ - int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file); - int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec); + HTSLIB_EXPORT + int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file) HTS_RESULT_USED; + HTSLIB_EXPORT + int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec); - /** Writes VCF or BCF header */ - int bcf_hdr_write(htsFile *fp, bcf_hdr_t *h); + /// Write a VCF or BCF header + /** @param fp Output file + @param h The header to write + @return 0 on success; -1 on failure + */ + HTSLIB_EXPORT + int bcf_hdr_write(htsFile *fp, bcf_hdr_t *h) HTS_RESULT_USED; /** * Parse VCF line contained in kstring and populate the bcf1_t struct * The line must not end with \n or \r characters. */ + HTSLIB_EXPORT int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v); /** The opposite of vcf_parse. 
It should rarely be called directly, see vcf_write */ + HTSLIB_EXPORT int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s); - /** - * bcf_read() - read next VCF or BCF record - * - * Returns -1 on critical errors, 0 otherwise. On errors which are not - * critical for reading, such as missing header definitions, v->errcode is - * set to one of BCF_ERR* code and must be checked before calling - * vcf_write(). + /// Read next VCF or BCF record + /** @param fp The file to read the record from + @param h The header for the vcf/bcf file + @param v The bcf1_t structure to populate + @return 0 on success; -1 on end of file; < -1 on critical error + +On errors which are not critical for reading, such as missing header +definitions in vcf files, zero will be returned but v->errcode will have been +set to one of BCF_ERR* codes and must be checked before calling bcf_write(). */ - int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v); + HTSLIB_EXPORT + int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) HTS_RESULT_USED; /** * bcf_unpack() - unpack/decode a BCF record (fills the bcf1_t::d field) @@ -352,6 +390,7 @@ typedef struct { #define BCF_UN_FMT 8 // unpack format and each sample #define BCF_UN_IND BCF_UN_FMT // a synonymo of BCF_UN_FMT #define BCF_UN_ALL (BCF_UN_SHR|BCF_UN_FMT) // everything + HTSLIB_EXPORT int bcf_unpack(bcf1_t *b, int which); /* @@ -360,35 +399,101 @@ typedef struct { * Note that bcf_unpack() must be called on the returned copy as if it was * obtained from bcf_read(). Also note that bcf_dup() calls bcf_sync1(src) * internally to reflect any changes made by bcf_update_* functions. + * + * The bcf1_t struct returned by a successful call should be freed + * via bcf_destroy() when it is no longer needed. */ + HTSLIB_EXPORT bcf1_t *bcf_dup(bcf1_t *src); + + HTSLIB_EXPORT bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src); - /** - * bcf_write() - write one VCF or BCF record. The type is determined at the open() call. 
+ /// Write one VCF or BCF record. The type is determined at the open() call. + /** @param fp The file to write to + @param h The header for the vcf/bcf file + @param v The bcf1_t structure to write + @return 0 on success; -1 on error */ - int bcf_write(htsFile *fp, bcf_hdr_t *h, bcf1_t *v); + HTSLIB_EXPORT + int bcf_write(htsFile *fp, bcf_hdr_t *h, bcf1_t *v) HTS_RESULT_USED; /** * The following functions work only with VCFs and should rarely be called * directly. Usually one wants to use their bcf_* alternatives, which work * transparently with both VCFs and BCFs. */ - bcf_hdr_t *vcf_hdr_read(htsFile *fp); - int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h); - int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v); - int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v); + /// Read a VCF format header + /** @param fp The file to read the header from + @return Pointer to a populated header structure on success; + NULL on failure + + Use bcf_hdr_read() instead. + + The bcf_hdr_t struct returned by a successful call should be freed + via bcf_hdr_destroy() when it is no longer needed. 
+ */ + HTSLIB_EXPORT + bcf_hdr_t *vcf_hdr_read(htsFile *fp) HTS_RESULT_USED; + + /// Write a VCF format header + /** @param fp Output file + @param h The header to write + @return 0 on success; -1 on failure + + Use bcf_hdr_write() instead + */ + HTSLIB_EXPORT + int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h) HTS_RESULT_USED; + + /// Read a record from a VCF file + /** @param fp The file to read the record from + @param h The header for the vcf file + @param v The bcf1_t structure to populate + @return 0 on success; -1 on end of file; < -1 on error + + Use bcf_read() instead + */ + HTSLIB_EXPORT + int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) HTS_RESULT_USED; + + /// Write a record to a VCF file + /** @param fp The file to write to + @param h The header for the vcf file + @param v The bcf1_t structure to write + @return 0 on success; -1 on error + + Use bcf_write() instead + */ + HTSLIB_EXPORT + int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) HTS_RESULT_USED; /** Helper function for the bcf_itr_next() macro; internal use, ignore it */ - int bcf_readrec(BGZF *fp, void *null, void *v, int *tid, int *beg, int *end); + HTSLIB_EXPORT + int bcf_readrec(BGZF *fp, void *null, void *v, int *tid, hts_pos_t *beg, hts_pos_t *end); + /// Write a line to a VCF file + /** @param line Line to write + @param fp File to write it to + @return 0 on success; -1 on failure + @note No checks are done on the line being added, apart from + ensuring that it ends with a newline. This function + should therefore be used with care. 
+ */ + HTSLIB_EXPORT + int vcf_write_line(htsFile *fp, kstring_t *line); /************************************************************************** * Header querying and manipulation routines **************************************************************************/ - /** Create a new header using the supplied template */ + /** Create a new header using the supplied template + * + * The bcf_hdr_t struct returned by a successful call should be freed + * via bcf_hdr_destroy() when it is no longer needed. + */ + HTSLIB_EXPORT bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr); /** @@ -397,6 +502,7 @@ typedef struct { * 1 .. conflicting definitions of tag length * // todo */ + HTSLIB_EXPORT int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src) HTS_DEPRECATED("Please use bcf_hdr_merge instead"); /** @@ -413,15 +519,24 @@ typedef struct { * combining multiple BCF headers. The current bcf_hdr_combine() * does not have this problem, but became slow when used for many files. */ + HTSLIB_EXPORT bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src); /** * bcf_hdr_add_sample() - add a new sample. * @param sample: sample name to be added + * + * Note: + * After all samples have been added, the internal header structure must be updated + * by calling bcf_hdr_sync(). This is normally done automatically by the first bcf_hdr_write() + * or bcf_write() call. Otherwise, the caller must force the update by calling bcf_hdr_sync() + * explicitly. */ + HTSLIB_EXPORT int bcf_hdr_add_sample(bcf_hdr_t *hdr, const char *sample); /** Read VCF header from a file and update the header */ + HTSLIB_EXPORT int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname); /// Appends formatted header text to _str_. 
@@ -429,6 +544,7 @@ typedef struct { * @return 0 if successful, or negative if an error occurred * @since 1.4 */ + HTSLIB_EXPORT int bcf_hdr_format(const bcf_hdr_t *hdr, int is_bcf, kstring_t *str); /** Returns formatted header (newly allocated string) and its length, @@ -436,22 +552,36 @@ typedef struct { * fields are discarded. * @deprecated Use bcf_hdr_format() instead as it can handle huge headers. */ + HTSLIB_EXPORT char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len) HTS_DEPRECATED("use bcf_hdr_format() instead"); /** Append new VCF header line, returns 0 on success */ + HTSLIB_EXPORT int bcf_hdr_append(bcf_hdr_t *h, const char *line); + + HTSLIB_EXPORT int bcf_hdr_printf(bcf_hdr_t *h, const char *format, ...); /** VCF version, e.g. VCFv4.2 */ + HTSLIB_EXPORT const char *bcf_hdr_get_version(const bcf_hdr_t *hdr); - void bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version); + + /// Set version in bcf header + /** + @param hdr BCF header struct + @param version Version to set, e.g. "VCFv4.3" + @return 0 on success; < 0 on error + */ + HTSLIB_EXPORT + int bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version); /** * bcf_hdr_remove() - remove VCF header tag * @param type: one of BCF_HL_* * @param key: tag name or NULL to remove all tags of the given type */ + HTSLIB_EXPORT void bcf_hdr_remove(bcf_hdr_t *h, int type, const char *key); /** @@ -464,10 +594,14 @@ typedef struct { * by comparing n and bcf_hdr_nsamples(out_hdr). * This function can be used to reorder samples. * See also bcf_subset() which subsets individual records. + * The bcf_hdr_t struct returned by a successful call should be freed + * via bcf_hdr_destroy() when it is no longer needed. */ + HTSLIB_EXPORT bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap); /** Creates a list of sequence names. 
It is up to the caller to free the list (but not the sequence names) */ + HTSLIB_EXPORT const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *nseqs); /** Get number of samples */ @@ -475,11 +609,34 @@ typedef struct { /** The following functions are for internal use and should rarely be called directly */ + HTSLIB_EXPORT int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt); - int bcf_hdr_sync(bcf_hdr_t *h); + + /// Synchronize internal header structures + /** @param h Header + @return 0 on success, -1 on failure + + This function updates the id, sample and contig arrays in the + bcf_hdr_t structure so that they point to the same locations as + the id, sample and contig dictionaries. + */ + HTSLIB_EXPORT + int bcf_hdr_sync(bcf_hdr_t *h) HTS_RESULT_USED; + + HTSLIB_EXPORT bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len); - void bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str); + /// Convert a bcf header record to string form + /** + * @param hrec Header record + * @param str Destination kstring + * @return 0 on success; < 0 on error + */ + HTSLIB_EXPORT + int bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str); + + HTSLIB_EXPORT int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec); + /** * bcf_hdr_get_hrec() - get header line info * @param type: one of the BCF_HL_* types: FLT,INFO,FMT,CTG,STR,GEN @@ -488,12 +645,55 @@ typedef struct { * @param value: the value which pairs with key. Can be be NULL for BCF_HL_GEN * @param str_class: the class of BCF_HL_STR line (e.g. "ALT" or "SAMPLE"), otherwise NULL */ + HTSLIB_EXPORT bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class); + + /// Duplicate a header record + /** @param hrec Header record to copy + @return A new header record on success; NULL on failure + + The bcf_hrec_t struct returned by a successful call should be freed + via bcf_hrec_destroy() when it is no longer needed. 
+ */ + HTSLIB_EXPORT bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec); - void bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, int len); - void bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, int len, int is_quoted); + + /// Add a new header record key + /** @param hrec Header record + @param str Key name + @param len Length of @p str + @return 0 on success; -1 on failure + */ + HTSLIB_EXPORT + int bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, size_t len) HTS_RESULT_USED; + + /// Set a header record value + /** @param hrec Header record + @param i Index of value + @param str Value to set + @param len Length of @p str + @param is_quoted Value should be quoted + @return 0 on success; -1 on failure + */ + HTSLIB_EXPORT + int bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, size_t len, int is_quoted) HTS_RESULT_USED; + + HTSLIB_EXPORT int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key); - void hrec_add_idx(bcf_hrec_t *hrec, int idx); + + + /// Add an IDX header record + /** @param hrec Header record + @param idx IDX value to add + @return 0 on success; -1 on failure + */ + HTSLIB_EXPORT + int hrec_add_idx(bcf_hrec_t *hrec, int idx) HTS_RESULT_USED; + + /// Free up a header record and associated structures + /** @param hrec Header record + */ + HTSLIB_EXPORT void bcf_hrec_destroy(bcf_hrec_t *hrec); @@ -503,6 +703,7 @@ typedef struct { **************************************************************************/ /** See the description of bcf_hdr_subset() */ + HTSLIB_EXPORT int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap); /** @@ -512,13 +713,19 @@ typedef struct { * @src_hdr: the source header, used in bcf_read() * @src_line: line obtained by bcf_read() */ + HTSLIB_EXPORT int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *src_line); /** * bcf_get_variant_type[s]() - returns one of VCF_REF, VCF_SNP, etc */ + HTSLIB_EXPORT int bcf_get_variant_types(bcf1_t *rec); + + HTSLIB_EXPORT int 
bcf_get_variant_type(bcf1_t *rec, int ith_allele); + + HTSLIB_EXPORT int bcf_is_snp(bcf1_t *v); /** @@ -526,6 +733,7 @@ typedef struct { * @flt_ids: The filter IDs to set, numeric IDs returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS") * @n: Number of filters. If n==0, all filters are removed */ + HTSLIB_EXPORT int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n); /** * bcf_add_filter() - adds to the FILTER column @@ -533,16 +741,19 @@ typedef struct { * * If flt_id is PASS, all existing filters are removed first. If other than PASS, existing PASS is removed. */ + HTSLIB_EXPORT int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id); /** * bcf_remove_filter() - removes from the FILTER column * @flt_id: filter ID to remove, numeric ID returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS") * @pass: when set to 1 and no filters are present, set to PASS */ + HTSLIB_EXPORT int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass); /** * Returns 1 if present, 0 if absent, or -1 if filter does not exist. "PASS" and "." can be used interchangeably. 
*/ + HTSLIB_EXPORT int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter); /** * bcf_update_alleles() and bcf_update_alleles_str() - update REF and ALLT column @@ -550,35 +761,65 @@ typedef struct { * @nals: Number of alleles * @alleles_string: Comma-separated alleles, starting with the REF allele */ + HTSLIB_EXPORT int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals); + + HTSLIB_EXPORT int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string); /** * bcf_update_id() - sets new ID string * bcf_add_id() - adds to the ID string checking for duplicates */ + HTSLIB_EXPORT int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id); + + HTSLIB_EXPORT int bcf_add_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id); - /* + /** * bcf_update_info_*() - functions for updating INFO fields - * @hdr: the BCF header - * @line: VCF line to be edited - * @key: the INFO tag to be updated - * @values: pointer to the array of values. Pass NULL to remove the tag. - * @n: number of values in the array. When set to 0, the INFO tag is removed + * @param hdr: the BCF header + * @param line: VCF line to be edited + * @param key: the INFO tag to be updated + * @param values: pointer to the array of values. Pass NULL to remove the tag. + * @param n: number of values in the array. When set to 0, the INFO tag is removed + * @return 0 on success or negative value on error. * - * The @string in bcf_update_info_flag() is optional, @n indicates whether - * the flag is set or removed. + * The @p string in bcf_update_info_flag() is optional, + * @p n indicates whether the flag is set or removed. * - * Returns 0 on success or negative value on error. 
*/ #define bcf_update_info_int32(hdr,line,key,values,n) bcf_update_info((hdr),(line),(key),(values),(n),BCF_HT_INT) #define bcf_update_info_float(hdr,line,key,values,n) bcf_update_info((hdr),(line),(key),(values),(n),BCF_HT_REAL) #define bcf_update_info_flag(hdr,line,key,string,n) bcf_update_info((hdr),(line),(key),(string),(n),BCF_HT_FLAG) #define bcf_update_info_string(hdr,line,key,string) bcf_update_info((hdr),(line),(key),(string),1,BCF_HT_STR) + HTSLIB_EXPORT int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type); + /// Set or update 64-bit integer INFO values + /** + * @param hdr: the BCF header + * @param line: VCF line to be edited + * @param key: the INFO tag to be updated + * @param values: pointer to the array of values. Pass NULL to remove the tag. + * @param n: number of values in the array. When set to 0, the INFO tag is removed + * @return 0 on success or negative value on error. + * + * This function takes an int64_t values array as input. The data + * actually stored will be shrunk to the minimum size that can + * accept all of the values. + * + * INFO values outside of the range BCF_MIN_BT_INT32 to BCF_MAX_BT_INT32 + * can only be written to VCF files. 
+ */ + static inline int bcf_update_info_int64(const bcf_hdr_t *hdr, bcf1_t *line, + const char *key, + const int64_t *values, int n) + { + return bcf_update_info(hdr, line, key, values, n, BCF_HT_LONG); + } + /* * bcf_update_format_*() - functions for updating FORMAT fields * @values: pointer to the array of values, the same number of elements @@ -599,7 +840,11 @@ typedef struct { #define bcf_update_format_float(hdr,line,key,values,n) bcf_update_format((hdr),(line),(key),(values),(n),BCF_HT_REAL) #define bcf_update_format_char(hdr,line,key,values,n) bcf_update_format((hdr),(line),(key),(values),(n),BCF_HT_STR) #define bcf_update_genotypes(hdr,line,gts,n) bcf_update_format((hdr),(line),"GT",(gts),(n),BCF_HT_INT) // See bcf_gt_ macros below + + HTSLIB_EXPORT int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n); + + HTSLIB_EXPORT int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type); // Macros for setting genotypes correctly, for use with bcf_update_genotypes only; idx corresponds @@ -630,7 +875,10 @@ typedef struct { * Returns bcf_fmt_t* if the call succeeded, or returns NULL when the field * is not available. */ + HTSLIB_EXPORT bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key); + + HTSLIB_EXPORT bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key); /** @@ -641,34 +889,69 @@ typedef struct { * Returns bcf_fmt_t* / bcf_info_t*. These functions do not check if the index is valid * as their goal is to avoid the header lookup. 
*/ + HTSLIB_EXPORT bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id); + + HTSLIB_EXPORT bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id); /** * bcf_get_info_*() - get INFO values, integers or floats - * @hdr: BCF header - * @line: BCF record - * @tag: INFO tag to retrieve - * @dst: *dst is pointer to a memory location, can point to NULL - * @ndst: pointer to the size of allocated memory - * - * Returns negative value on error or the number of written values - * (including missing values) on success. bcf_get_info_string() returns - * on success the number of characters written excluding the null- - * terminating byte. bcf_get_info_flag() returns 1 when flag is set or 0 - * if not. - * - * List of return codes: - * -1 .. no such INFO tag defined in the header - * -2 .. clash between types defined in the header and encountered in the VCF record - * -3 .. tag is not present in the VCF record + * @param hdr: BCF header + * @param line: BCF record + * @param tag: INFO tag to retrieve + * @param dst: *dst is pointer to a memory location, can point to NULL + * @param ndst: pointer to the size of allocated memory + * @return >=0 on success + * -1 .. no such INFO tag defined in the header + * -2 .. clash between types defined in the header and encountered in the VCF record + * -3 .. tag is not present in the VCF record + * -4 .. the operation could not be completed (e.g. out of memory) + * + * Returns negative value on error or the number of values (including + * missing values) put in *dst on success. bcf_get_info_string() returns + * on success the number of characters stored excluding the nul- + * terminating byte. bcf_get_info_flag() does not store anything in *dst + * but returns 1 if the flag is set or 0 if not. + * + * *dst will be reallocated if it is not big enough (i.e. *ndst is too + * small) or NULL on entry. The new size will be stored in *ndst. 
*/ #define bcf_get_info_int32(hdr,line,tag,dst,ndst) bcf_get_info_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_INT) #define bcf_get_info_float(hdr,line,tag,dst,ndst) bcf_get_info_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_REAL) #define bcf_get_info_string(hdr,line,tag,dst,ndst) bcf_get_info_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_STR) #define bcf_get_info_flag(hdr,line,tag,dst,ndst) bcf_get_info_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_FLAG) + + HTSLIB_EXPORT int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type); + /// Put integer INFO values into an int64_t array + /** + * @param hdr: BCF header + * @param line: BCF record + * @param tag: INFO tag to retrieve + * @param dst: *dst is pointer to a memory location, can point to NULL + * @param ndst: pointer to the size of allocated memory + * @return >=0 on success + * -1 .. no such INFO tag defined in the header + * -2 .. clash between types defined in the header and encountered in the VCF record + * -3 .. tag is not present in the VCF record + * -4 .. the operation could not be completed (e.g. out of memory) + * + * Returns negative value on error or the number of values (including + * missing values) put in *dst on success. + * + * *dst will be reallocated if it is not big enough (i.e. *ndst is too + * small) or NULL on entry. The new size will be stored in *ndst. 
+ */ + static inline int bcf_get_info_int64(const bcf_hdr_t *hdr, bcf1_t *line, + const char *tag, int64_t **dst, + int *ndst) + { + return bcf_get_info_values(hdr, line, tag, + (void **) dst, ndst, BCF_HT_LONG); + } + /** * bcf_get_format_*() - same as bcf_get_info*() above * @@ -725,7 +1008,11 @@ typedef struct { #define bcf_get_format_float(hdr,line,tag,dst,ndst) bcf_get_format_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_REAL) #define bcf_get_format_char(hdr,line,tag,dst,ndst) bcf_get_format_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_STR) #define bcf_get_genotypes(hdr,line,dst,ndst) bcf_get_format_values(hdr,line,"GT",(void**)(dst),ndst,BCF_HT_INT) + + HTSLIB_EXPORT int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst); + + HTSLIB_EXPORT int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type); @@ -743,6 +1030,7 @@ typedef struct { * Returns -1 if string is not in dictionary, otherwise numeric ID which identifies * fields in BCF records. */ + HTSLIB_EXPORT int bcf_hdr_id2int(const bcf_hdr_t *hdr, int type, const char *id); #define bcf_hdr_int2id(hdr,type,int_id) ((hdr)->id[type][int_id].key) @@ -770,17 +1058,56 @@ typedef struct { */ #define bcf_hdr_id2length(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>8 & 0xf) #define bcf_hdr_id2number(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>12) - #define bcf_hdr_id2type(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>4 & 0xf) - #define bcf_hdr_id2coltype(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type] & 0xf) + #define bcf_hdr_id2type(hdr,type,int_id) (uint32_t)((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>4 & 0xf) + #define bcf_hdr_id2coltype(hdr,type,int_id) (uint32_t)((hdr)->id[BCF_DT_ID][int_id].val->info[type] & 0xf) #define bcf_hdr_idinfo_exists(hdr,type,int_id) ((int_id<0 || bcf_hdr_id2coltype(hdr,type,int_id)==0xf) ? 
0 : 1) #define bcf_hdr_id2hrec(hdr,dict_type,col_type,int_id) ((hdr)->id[(dict_type)==BCF_DT_CTG?BCF_DT_CTG:BCF_DT_ID][int_id].val->hrec[(dict_type)==BCF_DT_CTG?0:(col_type)]) + /// Convert BCF FORMAT data to string form + /** + * @param s kstring to write into + * @param n number of items in @p data + * @param type type of items in @p data + * @param data BCF format data + * @return 0 on success + * -1 if out of memory + */ + HTSLIB_EXPORT + int bcf_fmt_array(kstring_t *s, int n, int type, void *data); - void bcf_fmt_array(kstring_t *s, int n, int type, void *data); + HTSLIB_EXPORT uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr); - void bcf_enc_vchar(kstring_t *s, int l, const char *a); - void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize); - void bcf_enc_vfloat(kstring_t *s, int n, float *a); + /// Encode a variable-length char array in BCF format + /** + * @param s kstring to write into + * @param l length of input + * @param a input data to encode + * @return 0 on success; < 0 on error + */ + HTSLIB_EXPORT + int bcf_enc_vchar(kstring_t *s, int l, const char *a); + + /// Encode a variable-length integer array in BCF format + /** + * @param s kstring to write into + * @param n total number of items in @p a (<= 0 to encode BCF_BT_NULL) + * @param a input data to encode + * @param wsize vector length (<= 0 is equivalent to @p n) + * @return 0 on success; < 0 on error + * @note @p n should be an exact multiple of @p wsize + */ + HTSLIB_EXPORT + int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize); + + /// Encode a variable-length float array in BCF format + /** + * @param s kstring to write into + * @param n total number of items in @p a (<= 0 to encode BCF_BT_NULL) + * @param a input data to encode + * @return 0 on success; < 0 on error + */ + HTSLIB_EXPORT + int bcf_enc_vfloat(kstring_t *s, int n, float *a); /************************************************************************** @@ -794,12 +1121,53 @@ typedef struct { #define 
bcf_itr_destroy(iter) hts_itr_destroy(iter) #define bcf_itr_queryi(idx, tid, beg, end) hts_itr_query((idx), (tid), (beg), (end), bcf_readrec) #define bcf_itr_querys(idx, hdr, s) hts_itr_querys((idx), (s), (hts_name2id_f)(bcf_hdr_name2id), (hdr), hts_itr_query, bcf_readrec) - #define bcf_itr_next(htsfp, itr, r) hts_itr_next((htsfp)->fp.bgzf, (itr), (r), 0) + + static inline int bcf_itr_next(htsFile *htsfp, hts_itr_t *itr, void *r) { + if (htsfp->is_bgzf) + return hts_itr_next(htsfp->fp.bgzf, itr, r, 0); + + hts_log_error("Only bgzf compressed files can be used with iterators"); + errno = EINVAL; + return -2; + } +/// Load a BCF index +/** @param fn BCF file name + @return The index, or NULL if an error occurred. + @note This only works for BCF files. Consider synced_bcf_reader instead +which works for both BCF and VCF. +*/ #define bcf_index_load(fn) hts_idx_load(fn, HTS_FMT_CSI) #define bcf_index_seqnames(idx, hdr, nptr) hts_idx_seqnames((idx),(nptr),(hts_id2name_f)(bcf_hdr_id2name),(hdr)) +/// Load a BCF index from a given index file name +/** @param fn Input BAM/BCF/etc filename + @param fnidx The input index filename + @return The index, or NULL if an error occurred. + @note This only works for BCF files. Consider synced_bcf_reader instead +which works for both BCF and VCF. +*/ + HTSLIB_EXPORT hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx); +/// Load a BCF index from a given index file name +/** @param fn Input BAM/BCF/etc filename + @param fnidx The input index filename + @param flags Flags to alter behaviour (see description) + @return The index, or NULL if an error occurred. + @note This only works for BCF files. Consider synced_bcf_reader instead +which works for both BCF and VCF. 
+ + The @p flags parameter can be set to a combination of the following + values: + + HTS_IDX_SAVE_REMOTE Save a local copy of any remote indexes + HTS_IDX_SILENT_FAIL Fail silently if the index is not present + + Equivalent to hts_idx_load3(fn, fnidx, HTS_FMT_CSI, flags); +*/ + HTSLIB_EXPORT + hts_idx_t *bcf_index_load3(const char *fn, const char *fnidx, int flags); + /** * bcf_index_build() - Generate and save an index file * @fn: Input VCF(compressed)/BCF filename @@ -818,6 +1186,7 @@ typedef struct { * -3 .. format not indexable * -4 .. failed to create and/or save the index */ + HTSLIB_EXPORT int bcf_index_build(const char *fn, int min_shift); /** @@ -834,6 +1203,7 @@ typedef struct { * -3 .. format not indexable * -4 .. failed to create and/or save the index */ + HTSLIB_EXPORT int bcf_index_build2(const char *fn, const char *fnidx, int min_shift); /** @@ -851,8 +1221,29 @@ typedef struct { * -3 .. format not indexable * -4 .. failed to create and/or save the index */ + HTSLIB_EXPORT int bcf_index_build3(const char *fn, const char *fnidx, int min_shift, int n_threads); + /// Initialise fp->idx for the current format type, for VCF and BCF files. + /** @param fp File handle for the data file being written. + @param h BCF header structured (needed for BAI and CSI). + @param min_shift CSI bin size (CSI default is 14). + @param fnidx Filename to write index to. This pointer must remain valid + until after bcf_idx_save is called. + @return 0 on success, <0 on failure. + @note This must be called after the header has been written, but before + any other data. + */ + HTSLIB_EXPORT + int bcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx); + + /// Writes the index initialised with bcf_idx_init to disk. + /** @param fp File handle for the data file being written. + @return 0 on success, <0 on failure. 
+ */ + HTSLIB_EXPORT + int bcf_idx_save(htsFile *fp); + /******************* * Typed value I/O * *******************/ @@ -870,14 +1261,27 @@ typedef struct { enables to handle correctly vectors with different ploidy in presence of missing values. */ -#define bcf_int8_vector_end (INT8_MIN+1) -#define bcf_int16_vector_end (INT16_MIN+1) -#define bcf_int32_vector_end (INT32_MIN+1) +#define bcf_int8_vector_end (-127) /* INT8_MIN + 1 */ +#define bcf_int16_vector_end (-32767) /* INT16_MIN + 1 */ +#define bcf_int32_vector_end (-2147483647) /* INT32_MIN + 1 */ +#define bcf_int64_vector_end (-9223372036854775807LL) /* INT64_MIN + 1 */ #define bcf_str_vector_end 0 -#define bcf_int8_missing INT8_MIN -#define bcf_int16_missing INT16_MIN -#define bcf_int32_missing INT32_MIN +#define bcf_int8_missing (-128) /* INT8_MIN */ +#define bcf_int16_missing (-32767-1) /* INT16_MIN */ +#define bcf_int32_missing (-2147483647-1) /* INT32_MIN */ +#define bcf_int64_missing (-9223372036854775807LL - 1LL) /* INT64_MIN */ #define bcf_str_missing 0x07 + +// Limits on BCF values stored in given types. Max values are the same +// as for the underlying type. Min values are slightly different as +// the last 8 values for each type were reserved by BCFv2.2. +#define BCF_MAX_BT_INT8 (0x7f) /* INT8_MAX */ +#define BCF_MAX_BT_INT16 (0x7fff) /* INT16_MAX */ +#define BCF_MAX_BT_INT32 (0x7fffffff) /* INT32_MAX */ +#define BCF_MIN_BT_INT8 (-120) /* INT8_MIN + 8 */ +#define BCF_MIN_BT_INT16 (-32760) /* INT16_MIN + 8 */ +#define BCF_MIN_BT_INT32 (-2147483640) /* INT32_MIN + 8 */ + extern uint32_t bcf_float_vector_end; extern uint32_t bcf_float_missing; static inline void bcf_float_set(float *ptr, uint32_t value) @@ -901,80 +1305,101 @@ static inline int bcf_float_is_vector_end(float f) return u.i==bcf_float_vector_end ? 
1 : 0; } -static inline void bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str) +static inline int bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str) { + uint32_t e = 0; #define BRANCH(type_t, missing, vector_end) { \ type_t *ptr = (type_t*) (fmt->p + isample*fmt->size); \ int i; \ for (i=0; i<fmt->n && ptr[i]!=vector_end; i++) \ { \ - if ( i ) kputc("/|"[ptr[i]&1], str); \ - if ( !(ptr[i]>>1) ) kputc('.', str); \ - else kputw((ptr[i]>>1) - 1, str); \ + if ( i ) e |= kputc("/|"[ptr[i]&1], str) < 0; \ + if ( !(ptr[i]>>1) ) e |= kputc('.', str) < 0; \ + else e |= kputw((ptr[i]>>1) - 1, str) < 0; \ } \ - if (i == 0) kputc('.', str); \ + if (i == 0) e |= kputc('.', str) < 0; \ } switch (fmt->type) { case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; - case BCF_BT_NULL: kputc('.', str); break; - default: hts_log_error("Unexpected type %d", fmt->type); abort(); break; + case BCF_BT_NULL: e |= kputc('.', str) < 0; break; + default: hts_log_error("Unexpected type %d", fmt->type); return -2; } #undef BRANCH + return e == 0 ?
0 : -1; } -static inline void bcf_enc_size(kstring_t *s, int size, int type) +static inline int bcf_enc_size(kstring_t *s, int size, int type) { + uint32_t e = 0; if (size >= 15) { - kputc(15<<4|type, s); + e |= kputc(15<<4|type, s) < 0; if (size >= 128) { if (size >= 32768) { int32_t x = size; - kputc(1<<4|BCF_BT_INT32, s); - kputsn((char*)&x, 4, s); + e |= kputc(1<<4|BCF_BT_INT32, s) < 0; + e |= kputsn((char*)&x, 4, s) < 0; } else { int16_t x = size; - kputc(1<<4|BCF_BT_INT16, s); - kputsn((char*)&x, 2, s); + e |= kputc(1<<4|BCF_BT_INT16, s) < 0; + e |= kputsn((char*)&x, 2, s) < 0; } } else { - kputc(1<<4|BCF_BT_INT8, s); - kputc(size, s); + e |= kputc(1<<4|BCF_BT_INT8, s) < 0; + e |= kputc(size, s) < 0; } - } else kputc(size<<4|type, s); + } else e |= kputc(size<<4|type, s) < 0; + return e == 0 ? 0 : -1; } static inline int bcf_enc_inttype(long x) { - if (x <= INT8_MAX && x > bcf_int8_missing) return BCF_BT_INT8; - if (x <= INT16_MAX && x > bcf_int16_missing) return BCF_BT_INT16; + if (x <= BCF_MAX_BT_INT8 && x >= BCF_MIN_BT_INT8) return BCF_BT_INT8; + if (x <= BCF_MAX_BT_INT16 && x >= BCF_MIN_BT_INT16) return BCF_BT_INT16; return BCF_BT_INT32; } -static inline void bcf_enc_int1(kstring_t *s, int32_t x) +static inline int bcf_enc_int1(kstring_t *s, int32_t x) { + uint32_t e = 0; if (x == bcf_int32_vector_end) { - bcf_enc_size(s, 1, BCF_BT_INT8); - kputc(bcf_int8_vector_end, s); + e |= bcf_enc_size(s, 1, BCF_BT_INT8); + e |= kputc(bcf_int8_vector_end, s) < 0; } else if (x == bcf_int32_missing) { - bcf_enc_size(s, 1, BCF_BT_INT8); - kputc(bcf_int8_missing, s); - } else if (x <= INT8_MAX && x > bcf_int8_missing) { - bcf_enc_size(s, 1, BCF_BT_INT8); - kputc(x, s); - } else if (x <= INT16_MAX && x > bcf_int16_missing) { + e |= bcf_enc_size(s, 1, BCF_BT_INT8); + e |= kputc(bcf_int8_missing, s) < 0; + } else if (x <= BCF_MAX_BT_INT8 && x >= BCF_MIN_BT_INT8) { + e |= bcf_enc_size(s, 1, BCF_BT_INT8); + e |= kputc(x, s) < 0; + } else if (x <= BCF_MAX_BT_INT16 && x >= 
BCF_MIN_BT_INT16) { int16_t z = x; - bcf_enc_size(s, 1, BCF_BT_INT16); - kputsn((char*)&z, 2, s); + e |= bcf_enc_size(s, 1, BCF_BT_INT16); + e |= kputsn((char*)&z, 2, s) < 0; } else { int32_t z = x; - bcf_enc_size(s, 1, BCF_BT_INT32); - kputsn((char*)&z, 4, s); + e |= bcf_enc_size(s, 1, BCF_BT_INT32); + e |= kputsn((char*)&z, 4, s) < 0; } + return e == 0 ? 0 : -1; } -static inline int32_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q) +/// Return the value of a single typed integer. +/** @param p Pointer to input data block. + @param type One of the BCF_BT_INT* type codes + @param[out] q Location to store an updated value for p + @return The integer value, or zero if @p type is not valid. + +If @p type is not one of BCF_BT_INT8, BCF_BT_INT16, BCF_BT_INT32 or BCF_BT_INT64, zero +will be returned and @p *q will not be updated. Otherwise, the integer +value will be returned and @p *q will be set to the memory location +immediately following the integer value. + +Cautious callers can detect invalid type codes by checking that *q has +actually been updated. +*/ + +static inline int64_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q) { if (type == BCF_BT_INT8) { *q = (uint8_t*)p + 1; @@ -982,13 +1407,34 @@ static inline int32_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q) } else if (type == BCF_BT_INT16) { *q = (uint8_t*)p + 2; return le_to_i16(p); - } else { + } else if (type == BCF_BT_INT32) { *q = (uint8_t*)p + 4; return le_to_i32(p); + } else if (type == BCF_BT_INT64) { + *q = (uint8_t*)p + 8; + return le_to_i64(p); + } else { // Invalid type. + return 0; } } -static inline int32_t bcf_dec_typed_int1(const uint8_t *p, uint8_t **q) +/// Return the value of a single typed integer from a byte stream. +/** @param p Pointer to input data block. + @param[out] q Location to store an updated value for p + @return The integer value, or zero if the type code was not valid.
+ +Reads a one-byte type code from @p p, and uses it to decode an integer +value from the following bytes in @p p. + +If the type is not one of BCF_BT_INT8, BCF_BT_INT16 or BCF_BT_INT32, zero +will be returned and @p *q will be unchanged. Otherwise, the integer value will +be returned and @p *q will be set to the memory location immediately following +the integer value. + +Cautious callers can detect invalid type codes by checking that *q has +actually been updated. +*/ +static inline int64_t bcf_dec_typed_int1(const uint8_t *p, uint8_t **q) { return bcf_dec_int1(p + 1, *p&0xf, q); } diff --git a/htslib/vcf_sweep.h b/htslib/vcf_sweep.h index 07c340af2..a48590abe 100644 --- a/htslib/vcf_sweep.h +++ b/htslib/vcf_sweep.h @@ -1,7 +1,7 @@ /// @file htslib/vcf_sweep.h /// Forward/reverse sweep API. /* - Copyright (C) 2013 Genome Research Ltd. + Copyright (C) 2013-2015 Genome Research Ltd. Author: Petr Danecek @@ -35,10 +35,19 @@ extern "C" { typedef struct _bcf_sweep_t bcf_sweep_t; +HTSLIB_EXPORT bcf_sweep_t *bcf_sweep_init(const char *fname); + +HTSLIB_EXPORT void bcf_sweep_destroy(bcf_sweep_t *sw); + +HTSLIB_EXPORT bcf_hdr_t *bcf_sweep_hdr(bcf_sweep_t *sw); + +HTSLIB_EXPORT bcf1_t *bcf_sweep_fwd(bcf_sweep_t *sw); + +HTSLIB_EXPORT bcf1_t *bcf_sweep_bwd(bcf_sweep_t *sw); #ifdef __cplusplus diff --git a/htslib/vcfutils.h b/htslib/vcfutils.h index 4999df415..8395fe42c 100644 --- a/htslib/vcfutils.h +++ b/htslib/vcfutils.h @@ -44,6 +44,7 @@ struct kbitset_t; * -1 .. some allele index is out of bounds * -2 ..
could not remove alleles */ +HTSLIB_EXPORT int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line); /** @@ -54,8 +55,10 @@ int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line); * * If you have more than 31 alleles, then the integer bit mask will * overflow, so use bcf_remove_allele_set instead + * Returns 0 on success, <0 on error */ -void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int mask) HTS_DEPRECATED("Please use bcf_remove_allele_set instead"); +HTSLIB_EXPORT +int bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int mask) HTS_DEPRECATED("Please use bcf_remove_allele_set instead"); /** * bcf_remove_allele_set() - remove ALT alleles according to bitset @rm_set @@ -68,6 +71,7 @@ void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int mask) HTS_DEP * * Number=A,R,G INFO and FORMAT fields will be updated accordingly. */ +HTSLIB_EXPORT int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kbitset_t *rm_set); /** @@ -84,6 +88,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb * used (BCF_UN_INFO) and and if indv fields can be splitted * (BCF_UN_FMT). */ +HTSLIB_EXPORT int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which); @@ -109,6 +114,7 @@ int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which); #define GT_HAPL_R 4 #define GT_HAPL_A 5 #define GT_UNKN 6 +HTSLIB_EXPORT int bcf_gt_type(bcf_fmt_t *fmt_ptr, int isample, int *ial, int *jal); static inline int bcf_acgt2int(char c) diff --git a/htslib_vars.mk b/htslib_vars.mk index 97928473e..2c97e3633 100644 --- a/htslib_vars.mk +++ b/htslib_vars.mk @@ -1,6 +1,6 @@ # Makefile variables useful for third-party code using htslib's public API. # -# Copyright (C) 2013-2017 Genome Research Ltd. +# Copyright (C) 2013-2017, 2019 Genome Research Ltd. # # Author: John Marshall # @@ -26,28 +26,28 @@ # See htslib.mk for details.
htslib_bgzf_h = $(HTSPREFIX)htslib/bgzf.h $(htslib_hts_defs_h) -htslib_cram_h = $(HTSPREFIX)htslib/cram.h $(htslib_hts_h) -htslib_faidx_h = $(HTSPREFIX)htslib/faidx.h $(htslib_hts_defs_h) +htslib_cram_h = $(HTSPREFIX)htslib/cram.h $(htslib_hts_defs_h) $(htslib_hts_h) $(htslib_sam_h) +htslib_faidx_h = $(HTSPREFIX)htslib/faidx.h $(htslib_hts_defs_h) $(htslib_hts_h) htslib_hfile_h = $(HTSPREFIX)htslib/hfile.h $(htslib_hts_defs_h) htslib_hts_h = $(HTSPREFIX)htslib/hts.h $(htslib_hts_defs_h) $(htslib_hts_log_h) htslib_hts_defs_h = $(HTSPREFIX)htslib/hts_defs.h htslib_hts_endian_h = $(HTSPREFIX)htslib/hts_endian.h htslib_hts_log_h = $(HTSPREFIX)htslib/hts_log.h $(htslib_hts_defs_h) -htslib_hts_os_h = $(HTSPREFIX)htslib/hts_os.h +htslib_hts_os_h = $(HTSPREFIX)htslib/hts_os.h $(htslib_hts_defs_h) htslib_kbitset_h = $(HTSPREFIX)htslib/kbitset.h -htslib_kfunc_h = $(HTSPREFIX)htslib/kfunc.h -htslib_khash_h = $(HTSPREFIX)htslib/khash.h +htslib_kfunc_h = $(HTSPREFIX)htslib/kfunc.h $(htslib_hts_defs_h) +htslib_khash_h = $(HTSPREFIX)htslib/khash.h $(htslib_kstring_h) htslib_khash_str2int_h = $(HTSPREFIX)htslib/khash_str2int.h $(htslib_khash_h) htslib_klist_h = $(HTSPREFIX)htslib/klist.h -htslib_knetfile_h = $(HTSPREFIX)htslib/knetfile.h +htslib_knetfile_h = $(HTSPREFIX)htslib/knetfile.h $(htslib_hts_defs_h) htslib_kseq_h = $(HTSPREFIX)htslib/kseq.h htslib_ksort_h = $(HTSPREFIX)htslib/ksort.h -htslib_kstring_h = $(HTSPREFIX)htslib/kstring.h -htslib_regidx_h = $(HTSPREFIX)htslib/regidx.h +htslib_kstring_h = $(HTSPREFIX)htslib/kstring.h $(htslib_hts_defs_h) +htslib_regidx_h = $(HTSPREFIX)htslib/regidx.h $(htslib_hts_h) htslib_sam_h = $(HTSPREFIX)htslib/sam.h $(htslib_hts_h) htslib_synced_bcf_reader_h = $(HTSPREFIX)htslib/synced_bcf_reader.h $(htslib_hts_h) $(htslib_vcf_h) $(htslib_tbx_h) htslib_tbx_h = $(HTSPREFIX)htslib/tbx.h $(htslib_hts_h) -htslib_thread_pool_h = $(HTSPREFIX)htslib/thread_pool.h +htslib_thread_pool_h = $(HTSPREFIX)htslib/thread_pool.h $(htslib_hts_defs_h) 
htslib_vcf_h = $(HTSPREFIX)htslib/vcf.h $(htslib_hts_h) $(htslib_kstring_h) $(htslib_hts_defs_h) $(htslib_hts_endian_h) htslib_vcf_sweep_h = $(HTSPREFIX)htslib/vcf_sweep.h $(htslib_hts_h) $(htslib_vcf_h) htslib_vcfutils_h = $(HTSPREFIX)htslib/vcfutils.h $(htslib_vcf_h) diff --git a/kfunc.c b/kfunc.c index 323e70fcb..c3afa3c69 100644 --- a/kfunc.c +++ b/kfunc.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (C) 2010, 2013 Genome Research Ltd. + Copyright (C) 2010, 2013-2014 Genome Research Ltd. Copyright (C) 2011 Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining @@ -24,6 +24,7 @@ SOFTWARE. */ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include diff --git a/knetfile.c b/knetfile.c index 35de40664..f0a608ea4 100644 --- a/knetfile.c +++ b/knetfile.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2008 by Genome Research Ltd (GRL). + Copyright (c) 2008, 2012-2014, 2017 Genome Research Ltd (GRL). 2010 by Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining @@ -28,6 +28,7 @@ therefore I decide to heavily annotate this file, for Linux and Windows as well. -ac */ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include @@ -597,6 +598,7 @@ int knet_close(knetFile *fp) free(fp->host); free(fp->port); free(fp->response); free(fp->retr); // FTP specific free(fp->path); free(fp->http_host); // HTTP specific + free(fp->size_cmd); free(fp); return 0; } diff --git a/kstring.c b/kstring.c index 0c7dd73d0..04d9b3f12 100644 --- a/kstring.c +++ b/kstring.c @@ -1,6 +1,7 @@ /* The MIT License Copyright (C) 2011 by Attractive Chaos + Copyright (C) 2013-2018 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -23,6 +24,7 @@ SOFTWARE. 
*/ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include @@ -271,6 +273,30 @@ int kgetline(kstring_t *s, kgets_func *fgets_fn, void *fp) return 0; } +int kgetline2(kstring_t *s, kgets_func2 *fgets_fn, void *fp) +{ + size_t l0 = s->l; + + while (s->l == l0 || s->s[s->l-1] != '\n') { + if (s->m - s->l < 200) { + if (ks_resize(s, s->m + 200) < 0) + return EOF; + } + ssize_t len = fgets_fn(s->s + s->l, s->m - s->l, fp); + if (len <= 0) break; + s->l += len; + } + + if (s->l == l0) return EOF; + + if (s->l > l0 && s->s[s->l-1] == '\n') { + s->l--; + if (s->l > l0 && s->s[s->l-1] == '\r') s->l--; + } + s->s[s->l] = '\0'; + return 0; +} + /********************** * Boyer-Moore search * **********************/ @@ -282,12 +308,14 @@ static int *ksBM_prep(const ubyte_t *pat, int m) { int i, *suff, *prep, *bmGs, *bmBc; prep = (int*)calloc(m + 256, sizeof(int)); + if (!prep) return NULL; bmGs = prep; bmBc = prep + m; { // preBmBc() for (i = 0; i < 256; ++i) bmBc[i] = m; for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1; } suff = (int*)calloc(m, sizeof(int)); + if (!suff) { free(prep); return NULL; } { // suffixes() int f = 0, g; suff[m - 1] = m; @@ -324,6 +352,7 @@ void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep) const ubyte_t *str, *pat; str = (const ubyte_t*)_str; pat = (const ubyte_t*)_pat; prep = (_prep == 0 || *_prep == 0)? ksBM_prep(pat, m) : *_prep; + if (!prep) return NULL; if (_prep && *_prep == 0) *_prep = prep; bmGs = prep; bmBc = prep + m; j = 0; diff --git a/m4/hts_hide_dynamic_syms.m4 b/m4/hts_hide_dynamic_syms.m4 new file mode 100644 index 000000000..62ccb8eb2 --- /dev/null +++ b/m4/hts_hide_dynamic_syms.m4 @@ -0,0 +1,65 @@ +dnl @synopsis HTS_HIDE_DYNAMIC_SYMBOLS +dnl +dnl Turn on compiler options that prevent unwanted symbols from being exported +dnl by shared libraries. +dnl +dnl @author Rob Davies +dnl @license MIT/Expat +dnl +dnl Copyright (C) 2018 Genome Research Ltd. 
+dnl +dnl Permission is hereby granted, free of charge, to any person obtaining a copy +dnl of this software and associated documentation files (the "Software"), to +dnl deal in the Software without restriction, including without limitation the +dnl rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +dnl sell copies of the Software, and to permit persons to whom the Software is +dnl furnished to do so, subject to the following conditions: +dnl +dnl The above copyright notice and this permission notice shall be included in +dnl all copies or substantial portions of the Software. +dnl +dnl THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +dnl IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +dnl FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +dnl THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +dnl LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +dnl FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +dnl DEALINGS IN THE SOFTWARE. + +# SYNOPSIS +# +# HTS_TEST_CC_C_LD_FLAG(FLAG, FOUND_VAR) +# +# Test if FLAG can be used on both CFLAGS and LDFLAGS. If it works, +# variable FOUND_VAR is set to FLAG. 
+ +AC_DEFUN([HTS_TEST_CC_C_LD_FLAG], + [AS_VAR_PUSHDEF([hts_cv_check_flag],[hts_cv_check_$1])dnl + AC_CACHE_CHECK([whether the compiler accepts $1], + [hts_cv_check_flag], + [ac_check_save_cflags=$CFLAGS + ac_check_save_ldflags=$LDFLAGS + CFLAGS="$CFLAGS $1" + LDFLAGS="$LDFLAGS $1" + AC_LINK_IFELSE([AC_LANG_PROGRAM()], + [AS_VAR_SET([hts_cv_check_flag],[yes]) + AS_IF([test "x$2" != x],[eval AS_TR_SH([$2])="$1"])], + [AS_VAR_SET([hts_cv_check_flag],[no])]) + CFLAGS=$ac_check_save_cflags + LDFLAGS=$ac_check_save_ldflags]) + AS_VAR_POPDEF([hts_cv_check_flag])dnl +]) + +AC_DEFUN([HTS_HIDE_DYNAMIC_SYMBOLS], [ + # Test for flags to set default shared library visibility to hidden + # -fvisibility=hidden : GCC compatible + # -xldscope=hidden : SunStudio + ac_opt_found=no + m4_foreach_w([ac_opt],[-fvisibility=hidden -xldscope=hidden], + [AS_IF([test "x$ac_opt_found" = "xno"], + [HTS_TEST_CC_C_LD_FLAG(ac_opt,[ac_opt_found])]) + ]) + AS_IF([test "x$ac_opt_found" != "xno"], + [CFLAGS="$CFLAGS $ac_opt_found" + LDFLAGS="$LDFLAGS $ac_opt_found"]) +]) diff --git a/m4/hts_prog_cc_warnings.m4 b/m4/hts_prog_cc_warnings.m4 index b1b365349..f2aed9328 100644 --- a/m4/hts_prog_cc_warnings.m4 +++ b/m4/hts_prog_cc_warnings.m4 @@ -167,10 +167,10 @@ CFLAGS="$ac_arg_needed $CFLAGS"],[dnl AC_DEFUN([HTS_PROG_CC_WERROR], [ AC_ARG_ENABLE([werror], [AS_HELP_STRING([--enable-werror], [change warnings into errors, where supported])], - [enable_werror=yes], - []) + [], + [enable_werror=no]) - AS_IF([test "x$enable_werror" = xyes],[ + AS_IF([test "x$enable_werror" != xno],[ AC_MSG_CHECKING([for C compiler flags to error on warnings]) AC_CACHE_VAL(hts_cv_prog_cc_werror, [dnl hts_cv_prog_cc_werror="" diff --git a/md5.c b/md5.c index 93515a73d..1a43da500 100644 --- a/md5.c +++ b/md5.c @@ -46,6 +46,7 @@ * compile-time configuration. 
*/ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include diff --git a/multipart.c b/multipart.c index 14c2584dd..12d0df282 100644 --- a/multipart.c +++ b/multipart.c @@ -1,6 +1,6 @@ /* multipart.c -- GA4GH redirection and multipart backend for file streams. - Copyright (C) 2016 Genome Research Ltd. + Copyright (C) 2016-2017 Genome Research Ltd. Author: John Marshall @@ -22,6 +22,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include diff --git a/os/rand.c b/os/rand.c index ff0046d31..7ceafa882 100644 --- a/os/rand.c +++ b/os/rand.c @@ -62,8 +62,8 @@ _dorand48(unsigned short xseed[3]) xseed[2] = (unsigned short) accu; } -void -hts_srand48(long seed) +HTSLIB_EXPORT +void hts_srand48(long seed) { _rand48_seed[0] = RAND48_SEED_0; _rand48_seed[1] = (unsigned short) seed; @@ -74,8 +74,8 @@ hts_srand48(long seed) _rand48_add = RAND48_ADD; } -double -hts_erand48(unsigned short xseed[3]) +HTSLIB_EXPORT +double hts_erand48(unsigned short xseed[3]) { _dorand48(xseed); return ldexp((double) xseed[0], -48) + @@ -83,14 +83,14 @@ hts_erand48(unsigned short xseed[3]) ldexp((double) xseed[2], -16); } -double -hts_drand48(void) +HTSLIB_EXPORT +double hts_drand48(void) { return hts_erand48(_rand48_seed); } -long -hts_lrand48(void) +HTSLIB_EXPORT +long hts_lrand48(void) { _dorand48(_rand48_seed); return ((long) _rand48_seed[2] << 15) + ((long) _rand48_seed[1] >> 1); diff --git a/plugin.c b/plugin.c index 603a82a65..59fbc9420 100644 --- a/plugin.c +++ b/plugin.c @@ -1,6 +1,6 @@ /* plugin.c -- low-level path parsing and plugin functions. - Copyright (C) 2015 Genome Research Ltd. + Copyright (C) 2015-2016 Genome Research Ltd. 
Author: John Marshall @@ -44,7 +44,7 @@ static DIR *open_nextdir(struct hts_path_itr *itr) DIR *dir; while (1) { - const char *colon = strchr(itr->pathdir, ':'); + const char *colon = strchr(itr->pathdir, HTS_PATH_SEPARATOR_CHAR); if (colon == NULL) return NULL; itr->entry.l = 0; @@ -86,13 +86,13 @@ void hts_path_itr_setup(struct hts_path_itr *itr, const char *path, } while (1) { - size_t len = strcspn(path, ":"); + size_t len = strcspn(path, HTS_PATH_SEPARATOR_STR); if (len == 0) kputs(builtin_path, &itr->path); else kputsn(path, len, &itr->path); - kputc(':', &itr->path); + kputc(HTS_PATH_SEPARATOR_CHAR, &itr->path); path += len; - if (*path == ':') path++; + if (*path == HTS_PATH_SEPARATOR_CHAR) path++; else break; } diff --git a/probaln.c b/probaln.c index 2eb8a31cf..9b9442c5b 100644 --- a/probaln.c +++ b/probaln.c @@ -1,6 +1,7 @@ /* The MIT License Copyright (C) 2003-2006, 2008-2010 by Heng Li + Copyright (C) 2016-2017 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -23,6 +24,7 @@ SOFTWARE. */ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include @@ -259,7 +261,7 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu max /= sum; sum *= s[i]; // if everything works as is expected, sum == 1.0 if (state) state[i-1] = max_k; if (q) k = (int)(-4.343 * log(1. - max) + .499), q[i-1] = k > 100? 
99 : k; -#ifdef _MAIN +#ifdef PROBALN_MAIN k = 0; set_u(k, bw, 0, 0); fprintf(stderr, "(%.10lg,%.10lg) (%d,%d:%c,%c:%d) %lg\n", b[0][k], sum, i-1, max_k>>2, @@ -275,7 +277,7 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu return INT_MIN; } -#ifdef _MAIN +#ifdef PROBALN_MAIN #include int main(int argc, char *argv[]) { diff --git a/realn.c b/realn.c index e25fdf7ba..40b796b46 100644 --- a/realn.c +++ b/realn.c @@ -1,6 +1,6 @@ /* realn.c -- BAQ calculation and realignment. - Copyright (C) 2009-2011, 2014-2015 Genome Research Ltd. + Copyright (C) 2009-2011, 2014-2016, 2018 Genome Research Ltd. Portions copyright (C) 2009-2011 Broad Institute. Author: Heng Li @@ -23,6 +23,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include @@ -35,12 +36,13 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "htslib/hts.h" #include "htslib/sam.h" -int sam_cap_mapq(bam1_t *b, const char *ref, int ref_len, int thres) +int sam_cap_mapq(bam1_t *b, const char *ref, hts_pos_t ref_len, int thres) { uint8_t *seq = bam_get_seq(b), *qual = bam_get_qual(b); uint32_t *cigar = bam_get_cigar(b); bam1_core_t *c = &b->core; - int i, x, y, mm, q, len, clip_l, clip_q; + int i, y, mm, q, len, clip_l, clip_q; + hts_pos_t x; double t; if (thres < 0) thres = 40; // set the default mm = q = len = clip_l = clip_q = 0; @@ -50,7 +52,7 @@ int sam_cap_mapq(bam1_t *b, const char *ref, int ref_len, int thres) for (j = 0; j < l; ++j) { int c1, c2, z = y + j; if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds - c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]]; + c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(unsigned char)ref[x+j]]; if (c2 != 15 && c1 != 15 && qual[z] >= 13) { // not ambiguous ++len; if (c1 && c1 != c2 && qual[z] >= 13) { // mismatch @@ -101,9 +103,10 @@ static int realn_check_tag(const uint8_t *tg, enum htsLogLevel severity, return 0; } -int sam_prob_realn(bam1_t *b, const char *ref, int ref_len, int flag) +int sam_prob_realn(bam1_t *b, const char *ref, hts_pos_t ref_len, int flag) { - int k, i, bw, x, y, yb, ye, xb, xe, apply_baq = flag&1, extend_baq = flag>>1&1, redo_baq = flag&4, fix_bq = 0; + int k, bw, y, yb, ye, xb, xe, apply_baq = flag&1, extend_baq = flag>>1&1, redo_baq = flag&4, fix_bq = 0; + hts_pos_t i, x; uint32_t *cigar = bam_get_cigar(b); bam1_core_t *c = &b->core; probaln_par_t conf = { 0.001, 0.1, 10 }; @@ -220,6 +223,7 @@ int sam_prob_realn(bam1_t *b, const char *ref, int ref_len, int flag) if (!extend_baq) { // in this block, bq[] is capped by base quality qual[] for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) { int op = cigar[k]&0xf, l = cigar[k]>>4; + if (l == 0) continue; if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { // Sanity check running off the end of the sequence // Can only happen if the alignment is 
broken @@ -246,6 +250,7 @@ int sam_prob_realn(bam1_t *b, const char *ref, int ref_len, int flag) uint8_t *rght = tref; for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) { int op = cigar[k]&0xf, l = cigar[k]>>4; + if (l == 0) continue; if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { // Sanity check running off the end of the sequence // Can only happen if the alignment is broken diff --git a/regidx.c b/regidx.c index 874f9c9f9..67b356825 100644 --- a/regidx.c +++ b/regidx.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2014 Genome Research Ltd. + Copyright (C) 2014-2019 Genome Research Ltd. Author: Petr Danecek @@ -22,9 +22,10 @@ THE SOFTWARE. */ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include - +#include #include "htslib/hts.h" #include "htslib/kstring.h" #include "htslib/kseq.h" @@ -32,48 +33,73 @@ #include "htslib/regidx.h" #include "hts_internal.h" -#define LIDX_SHIFT 13 // number of insignificant index bits +#define MAX_COOR_0 REGIDX_MAX // CSI and hts_itr_query limit, 0-based + +#define iBIN(x) ((x)>>13) + +typedef struct +{ + hts_pos_t beg, end; +} +reg_t; + +typedef struct +{ + hts_pos_t pos; // position + uint32_t ireg; // index to reglist.reg and reglist.dat +} +pos_t; + +typedef struct reglist_t reglist_t; -// List of regions for one chromosome typedef struct { - int *idx, nidx; - int nregs, mregs; // n:used, m:alloced - reg_t *regs; - void *payload; + hts_pos_t beg, end; // query region + uint32_t ireg; // index of active region + regidx_t *ridx; + reglist_t *list; + int active; } -reglist_t; +itr_t_; + +// List of regions for one chromosome. 
+struct reglist_t +{ + uint32_t *idx, nidx; // index to list.reg+1 + uint32_t nreg, mreg; // n:used, m:allocated + reg_t *reg; // regions + uint8_t *dat; // payload data + char *seq; // sequence name + int unsorted; +}; // Container of all sequences -struct _regidx_t +struct regidx_t { - int nseq, mseq; // n:used, m:alloced - reglist_t *seq; // regions for each sequence - void *seq2regs; // hash for fast lookup from chr name to regions + int nseq, mseq; // n:used, m:alloced + reglist_t *seq; // regions for each sequence + void *seq2regs; // hash for fast lookup from chr name to regions char **seq_names; regidx_free_f free; // function to free any data allocated by regidx_parse_f regidx_parse_f parse; // parse one input line void *usr; // user data to pass to regidx_parse_f - - // temporary data for index initialization - kstring_t str; - int rid_prev, start_prev, end_prev; int payload_size; - void *payload; + void *payload; // temporary payload data set by regidx_parse_f (sequence is not known beforehand) + kstring_t str; }; int regidx_seq_nregs(regidx_t *idx, const char *seq) { int iseq; if ( khash_str2int_get(idx->seq2regs, seq, &iseq)!=0 ) return 0; // no such sequence - return idx->seq[iseq].nregs; + return idx->seq[iseq].nreg; } int regidx_nregs(regidx_t *idx) { - int i, nregs = 0; - for (i=0; inseq; i++) nregs += idx->seq[i].nregs; - return nregs; + int i, nreg = 0; + for (i=0; inseq; i++) nreg += idx->seq[i].nreg; + return nreg; } char **regidx_seq_names(regidx_t *idx, int *n) @@ -82,88 +108,137 @@ char **regidx_seq_names(regidx_t *idx, int *n) return idx->seq_names; } -int _regidx_build_index(regidx_t *idx) +int regidx_insert_list(regidx_t *idx, char *line, char delim) { - int iseq; - for (iseq=0; iseqnseq; iseq++) + kstring_t tmp = KS_INITIALIZE; + char *ss = line; + while ( *ss ) { - reglist_t *list = &idx->seq[iseq]; - int j,k, imax = 0; // max index bin - for (j=0; jnregs; j++) + char *se = ss; + while ( *se && *se!=delim ) se++; + kputsn(ss, se-ss, 
ks_clear(&tmp)); + if ( regidx_insert(idx,tmp.s) < 0 ) { - int ibeg = list->regs[j].start >> LIDX_SHIFT; - int iend = list->regs[j].end >> LIDX_SHIFT; - if ( imax < iend + 1 ) - { - int old_imax = imax; - imax = iend + 1; - kroundup32(imax); - list->idx = (int*) realloc(list->idx, imax*sizeof(int)); - for (k=old_imax; kidx[k] = -1; - } - if ( ibeg==iend ) - { - if ( list->idx[ibeg]<0 ) list->idx[ibeg] = j; - } - else - { - for (k=ibeg; k<=iend; k++) - if ( list->idx[k]<0 ) list->idx[k] = j; - } - list->nidx = iend + 1; + ks_free(&tmp); + return -1; } + if ( !*se ) break; + ss = se+1; } + ks_free(&tmp); return 0; } -int regidx_insert(regidx_t *idx, char *line) +static inline int cmp_regs(reg_t *a, reg_t *b) +{ + if ( a->beg < b->beg ) return -1; + if ( a->beg > b->beg ) return 1; + if ( a->end < b->end ) return 1; // longer intervals come first + if ( a->end > b->end ) return -1; + return 0; +} +static int cmp_reg_ptrs(const void *a, const void *b) { - if ( !line ) - return _regidx_build_index(idx); + return cmp_regs((reg_t*)a,(reg_t*)b); +} +static int cmp_reg_ptrs2(const void *a, const void *b) +{ + return cmp_regs(*((reg_t**)a),*((reg_t**)b)); +} - char *chr_from, *chr_to; - reg_t reg; - int ret = idx->parse(line,&chr_from,&chr_to,®,idx->payload,idx->usr); - if ( ret==-2 ) return -1; // error - if ( ret==-1 ) return 0; // skip the line +int regidx_push(regidx_t *idx, char *chr_beg, char *chr_end, hts_pos_t beg, hts_pos_t end, void *payload) +{ + if (beg < 0) beg = 0; + if (end < 0) end = 0; + if ( beg > MAX_COOR_0 ) beg = MAX_COOR_0; + if ( end > MAX_COOR_0 ) end = MAX_COOR_0; int rid; - idx->str.l = 0; - kputsn(chr_from, chr_to-chr_from+1, &idx->str); + if (kputsn(chr_beg, chr_end-chr_beg+1, ks_clear(&idx->str)) < 0) return -1; if ( khash_str2int_get(idx->seq2regs, idx->str.s, &rid)!=0 ) { + // new chromosome + int m_tmp = idx->mseq; + if (hts_resize(char*, idx->nseq + 1, &m_tmp, + &idx->seq_names, HTS_RESIZE_CLEAR) < 0) { + return -1; + } + if 
(hts_resize(reglist_t, idx->nseq + 1, &idx->mseq, + &idx->seq, HTS_RESIZE_CLEAR) < 0) { + return -1; + } + assert(m_tmp == idx->mseq); + idx->seq_names[idx->nseq] = strdup(idx->str.s); + rid = khash_str2int_inc(idx->seq2regs, idx->seq_names[idx->nseq]); idx->nseq++; - int m_prev = idx->mseq; - hts_expand0(reglist_t,idx->nseq,idx->mseq,idx->seq); - hts_expand0(char*,idx->nseq,m_prev,idx->seq_names); - idx->seq_names[idx->nseq-1] = strdup(idx->str.s); - rid = khash_str2int_inc(idx->seq2regs, idx->seq_names[idx->nseq-1]); } reglist_t *list = &idx->seq[rid]; - list->nregs++; - int m_prev = list->mregs; - hts_expand(reg_t,list->nregs,list->mregs,list->regs); - list->regs[list->nregs-1] = reg; - if ( idx->payload_size ) - { - if ( m_prev < list->mregs ) list->payload = realloc(list->payload,idx->payload_size*list->mregs); - memcpy((char*)list->payload + idx->payload_size*(list->nregs-1), idx->payload, idx->payload_size); + list->seq = idx->seq_names[rid]; + int mreg = list->mreg; + if (hts_resize(reg_t, list->nreg + 1, &list->mreg, &list->reg, 0) < 0) + return -1; + list->reg[list->nreg].beg = beg; + list->reg[list->nreg].end = end; + if ( idx->payload_size ) { + if ( mreg != list->mreg ) { + uint8_t *new_dat = realloc(list->dat, idx->payload_size*list->mreg); + if (!new_dat) return -1; + list->dat = new_dat; + } + memcpy(list->dat + idx->payload_size*list->nreg, payload, idx->payload_size); } + list->nreg++; + if ( !list->unsorted && list->nreg>1 && cmp_regs(&list->reg[list->nreg-2],&list->reg[list->nreg-1])>0 ) list->unsorted = 1; + return 0; +} - if ( idx->rid_prev==rid ) +int regidx_insert(regidx_t *idx, char *line) +{ + if ( !line ) return 0; + char *chr_from, *chr_to; + hts_pos_t beg,end; + int ret = idx->parse(line,&chr_from,&chr_to,&beg,&end,idx->payload,idx->usr); + if ( ret==-2 ) return -1; // error + if ( ret==-1 ) return 0; // skip the line + return regidx_push(idx, chr_from,chr_to,beg,end,idx->payload); +} + +regidx_t *regidx_init_string(const char *str, 
regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat) +{ + kstring_t tmp = KS_INITIALIZE; + regidx_t *idx = (regidx_t*) calloc(1,sizeof(regidx_t)); + if ( !idx ) return NULL; + + idx->free = free_f; + idx->parse = parser ? parser : regidx_parse_tab; + idx->usr = usr_dat; + idx->seq2regs = khash_str2int_init(); + if (!idx->seq2regs) goto fail; + idx->payload_size = payload_size; + if ( payload_size ) { + idx->payload = malloc(payload_size); + if (!idx->payload) goto fail; + } + + const char *ss = str; + while ( *ss ) { - if ( idx->start_prev > reg.start || (idx->start_prev==reg.start && idx->end_prev>reg.end) ) - { - hts_log_error("The regions are not sorted: %s:%d-%d is before %s:%d-%d", - idx->str.s,idx->start_prev+1,idx->end_prev+1,idx->str.s,reg.start+1,reg.end+1); - return -1; - } + while ( *ss && isspace_c(*ss) ) ss++; + const char *se = ss; + while ( *se && *se!='\r' && *se!='\n' ) se++; + if (kputsn(ss, se-ss, ks_clear(&tmp)) < 0) goto fail; + if (regidx_insert(idx, tmp.s) < 0) goto fail; + while ( *se && isspace_c(*se) ) se++; + ss = se; } - idx->rid_prev = rid; - idx->start_prev = reg.start; - idx->end_prev = reg.end; - return 0; + ks_free(&tmp); + return idx; + + fail: + regidx_destroy(idx); + ks_free(&tmp); + return NULL; } regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat) @@ -180,41 +255,52 @@ regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f fr parser = regidx_parse_bed; else if ( len>=4 && !strcasecmp(".bed",fname+len-4) ) parser = regidx_parse_bed; + else if ( len>=4 && !strcasecmp(".vcf",fname+len-4) ) + parser = regidx_parse_vcf; + else if ( len>=7 && !strcasecmp(".vcf.gz",fname+len-7) ) + parser = regidx_parse_vcf; else parser = regidx_parse_tab; } } + kstring_t str = KS_INITIALIZE; + htsFile *fp = NULL; + int ret; regidx_t *idx = (regidx_t*) calloc(1,sizeof(regidx_t)); + if (!idx) return NULL; idx->free = free_f; 
idx->parse = parser; idx->usr = usr_dat; idx->seq2regs = khash_str2int_init(); - idx->rid_prev = -1; - idx->start_prev = -1; - idx->end_prev = -1; + if (!idx->seq2regs) goto error; idx->payload_size = payload_size; - if ( payload_size ) idx->payload = malloc(payload_size); + if ( payload_size ) { + idx->payload = malloc(payload_size); + if (!idx->payload) goto error; + } if ( !fname ) return idx; - kstring_t str = {0,0,0}; - - htsFile *fp = hts_open(fname,"r"); + fp = hts_open(fname,"r"); if ( !fp ) goto error; - while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 ) - { + while ((ret = hts_getline(fp, KS_SEP_LINE, &str)) > 0 ) { if ( regidx_insert(idx, str.s) ) goto error; } - regidx_insert(idx, NULL); + if (ret < -1) goto error; - free(str.s); - hts_close(fp); + ret = hts_close(fp); + fp = NULL; + if ( ret != 0 ) { + hts_log_error("Close failed .. %s", fname); + goto error; + } + ks_free(&str); return idx; error: - free(str.s); + ks_free(&str); if ( fp ) hts_close(fp); regidx_destroy(idx); return NULL; @@ -223,16 +309,17 @@ regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f fr void regidx_destroy(regidx_t *idx) { int i, j; + if (!idx) return; for (i=0; inseq; i++) { reglist_t *list = &idx->seq[i]; if ( idx->free ) { - for (j=0; jnregs; j++) - idx->free((char*)list->payload + idx->payload_size*j); + for (j=0; jnreg; j++) + idx->free((char *)list->dat + idx->payload_size*j); } - free(list->payload); - free(list->regs); + free(list->dat); + free(list->reg); free(list->idx); } free(idx->seq_names); @@ -243,48 +330,138 @@ void regidx_destroy(regidx_t *idx) free(idx); } -int regidx_overlap(regidx_t *idx, const char *chr, uint32_t from, uint32_t to, regitr_t *itr) +static int reglist_build_index_(regidx_t *regidx, reglist_t *list) { - if ( itr ) itr->i = itr->n = 0; + int i; + if ( list->unsorted ) { + if ( !regidx->payload_size ) { + qsort(list->reg,list->nreg,sizeof(reg_t),cmp_reg_ptrs); + } else { + reg_t **ptr = malloc(sizeof(*ptr)*list->nreg); 
+ if (!ptr) return -1; + for (i=0; inreg; i++) ptr[i] = list->reg + i; + qsort(ptr,list->nreg,sizeof(*ptr),cmp_reg_ptrs2); + + uint8_t *tmp_dat = malloc(regidx->payload_size*list->nreg); + if (!tmp_dat) { free(ptr); return -1; } + for (i=0; inreg; i++) { + size_t iori = ptr[i] - list->reg; + memcpy(tmp_dat+i*regidx->payload_size, + list->dat+iori*regidx->payload_size, + regidx->payload_size); + } + free(list->dat); + list->dat = tmp_dat; + + reg_t *tmp_reg = (reg_t*) malloc(sizeof(reg_t)*list->nreg); + if (!tmp_reg) { free(ptr); return -1; } + for (i=0; inreg; i++) { + size_t iori = ptr[i] - list->reg; + tmp_reg[i] = list->reg[iori]; + } + free(ptr); + free(list->reg); + list->reg = tmp_reg; + list->mreg = list->nreg; + } + list->unsorted = 0; + } - int iseq; - if ( khash_str2int_get(idx->seq2regs, chr, &iseq)!=0 ) return 0; // no such sequence + list->nidx = 0; + uint32_t j,k, midx = 0; + // Find highest index bin. It's possible that we could just look at + // the last region, but go through the list in case some entries overlap. + for (j=0; jnreg; j++) { + int iend = iBIN(list->reg[j].end); + if (midx <= iend) midx = iend; + } + midx++; + uint32_t *new_idx = calloc(midx, sizeof(uint32_t)); + if (!new_idx) return -1; + free(list->idx); // Should be NULL on entry, but just in case... 
+ list->idx = new_idx; + list->nidx = midx; + + for (j=0; jnreg; j++) { + int ibeg = iBIN(list->reg[j].beg); + int iend = iBIN(list->reg[j].end); + if ( ibeg==iend ) { + if ( !list->idx[ibeg] ) list->idx[ibeg] = j + 1; + } else { + for (k=ibeg; k<=iend; k++) + if ( !list->idx[k] ) list->idx[k] = j + 1; + } + } + + return 0; +} + +int regidx_overlap(regidx_t *regidx, const char *chr, hts_pos_t beg, hts_pos_t end, regitr_t *regitr) +{ + if ( regitr ) regitr->seq = NULL; + + int iseq, ireg; + if ( khash_str2int_get(regidx->seq2regs, chr, &iseq)!=0 ) return 0; // no such sequence - reglist_t *list = &idx->seq[iseq]; - if ( !list->nregs ) return 0; + reglist_t *list = ®idx->seq[iseq]; + if ( !list->nreg ) return 0; - int i, ibeg = from>>LIDX_SHIFT; - int ireg = ibeg < list->nidx ? list->idx[ibeg] : list->idx[ list->nidx - 1 ]; - if ( ireg < 0 ) + if ( list->nreg==1 ) { - // linear search; if slow, replace with binary search - if ( ibeg > list->nidx ) ibeg = list->nidx; - for (i=ibeg - 1; i>=0; i--) - if ( list->idx[i] >=0 ) break; - ireg = i>=0 ? 
list->idx[i] : 0; + if ( beg > list->reg[0].end ) return 0; + if ( end < list->reg[0].beg ) return 0; + ireg = 0; } - for (i=ireg; inregs; i++) + else { - if ( list->regs[i].start > to ) return 0; // no match - if ( list->regs[i].end >= from && list->regs[i].start <= to ) break; // found + if ( !list->idx ) { + if (reglist_build_index_(regidx,list) < 0) return -1; + } + + int ibeg = iBIN(beg); + if ( ibeg >= list->nidx ) return 0; // beg is too big + + // find a matching region + uint32_t i = list->idx[ibeg]; + if ( !i ) + { + int iend = iBIN(end); + if ( iend > list->nidx ) iend = list->nidx; + for (i=ibeg; i<=iend; i++) + if ( list->idx[i] ) break; + if ( i>iend ) return 0; + i = list->idx[i]; + } + for (ireg=i-1; iregnreg; ireg++) + { + if ( list->reg[ireg].beg > end ) return 0; // no match, past the query region + if ( list->reg[ireg].end >= beg && list->reg[ireg].beg <= end ) break; // found + } + + if ( ireg >= list->nreg ) return 0; // no match } - if ( i>=list->nregs ) return 0; // no match + if ( !regitr ) return 1; // match, but no more info to save - if ( !itr ) return 1; + // may need to iterate over the matching regions later + itr_t_ *itr = (itr_t_*)regitr->itr; + itr->ridx = regidx; + itr->list = list; + itr->beg = beg; + itr->end = end; + itr->ireg = ireg; + itr->active = 0; - itr->i = 0; - itr->n = list->nregs - i; - itr->reg = &idx->seq[iseq].regs[i]; - if ( idx->payload_size ) - itr->payload = (char*)idx->seq[iseq].payload + i*idx->payload_size; - else - itr->payload = NULL; + regitr->seq = list->seq; + regitr->beg = list->reg[ireg].beg; + regitr->end = list->reg[ireg].end; + if ( regidx->payload_size ) + regitr->payload = list->dat + regidx->payload_size*ireg; return 1; } -int regidx_parse_bed(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr) +int regidx_parse_bed(const char *line, char **chr_beg, char **chr_end, hts_pos_t *beg, hts_pos_t *end, void *payload, void *usr) { char *ss = (char*) line; while ( *ss 
&& isspace_c(*ss) ) ss++; @@ -293,23 +470,30 @@ int regidx_parse_bed(const char *line, char **chr_beg, char **chr_end, reg_t *re char *se = ss; while ( *se && !isspace_c(*se) ) se++; - if ( !*se ) { hts_log_error("Could not parse bed line: %s", line); return -2; } *chr_beg = ss; *chr_end = se-1; + if ( !*se ) + { + // just the chromosome name + *beg = 0; + *end = MAX_COOR_0; + return 0; + } + ss = se+1; - reg->start = hts_parse_decimal(ss, &se, 0); + *beg = hts_parse_decimal(ss, &se, 0); if ( ss==se ) { hts_log_error("Could not parse bed line: %s", line); return -2; } ss = se+1; - reg->end = hts_parse_decimal(ss, &se, 0) - 1; + *end = hts_parse_decimal(ss, &se, 0) - 1; if ( ss==se ) { hts_log_error("Could not parse bed line: %s", line); return -2; } return 0; } -int regidx_parse_tab(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr) +int regidx_parse_tab(const char *line, char **chr_beg, char **chr_end, hts_pos_t *beg, hts_pos_t *end, void *payload, void *usr) { char *ss = (char*) line; while ( *ss && isspace_c(*ss) ) ss++; @@ -318,25 +502,185 @@ int regidx_parse_tab(const char *line, char **chr_beg, char **chr_end, reg_t *re char *se = ss; while ( *se && !isspace_c(*se) ) se++; - if ( !*se ) { hts_log_error("Could not parse bed line: %s", line); return -2; } *chr_beg = ss; *chr_end = se-1; + if ( !*se ) + { + // just the chromosome name + *beg = 0; + *end = MAX_COOR_0; + return 0; + } + ss = se+1; - reg->start = hts_parse_decimal(ss, &se, 0) - 1; - if ( ss==se ) { hts_log_error("Could not parse bed line: %s", line); return -2; } + *beg = hts_parse_decimal(ss, &se, 0); + if ( ss==se ) { hts_log_error("Could not parse tab line: %s", line); return -2; } + if ( *beg==0 ) { hts_log_error("Could not parse tab line, expected 1-based coordinate: %s", line); return -2; } + (*beg)--; if ( !se[0] || !se[1] ) - reg->end = reg->start; + *end = *beg; else { ss = se+1; - reg->end = hts_parse_decimal(ss, &se, 0); - if ( ss==se ) reg->end = 
reg->start; - else reg->end--; + *end = hts_parse_decimal(ss, &se, 0); + if ( ss==se || (*se && !isspace_c(*se)) ) *end = *beg; + else if ( *end==0 ) { hts_log_error("Could not parse tab line, expected 1-based coordinate: %s", line); return -2; } + else (*end)--; + } + return 0; +} + +int regidx_parse_vcf(const char *line, char **chr_beg, char **chr_end, hts_pos_t *beg, hts_pos_t *end, void *payload, void *usr) +{ + int ret = regidx_parse_tab(line, chr_beg, chr_end, beg, end, payload, usr); + if ( !ret ) *end = *beg; + return ret; +} + +int regidx_parse_reg(const char *line, char **chr_beg, char **chr_end, hts_pos_t *beg, hts_pos_t *end, void *payload, void *usr) +{ + char *ss = (char*) line; + while ( *ss && isspace_c(*ss) ) ss++; + if ( !*ss ) return -1; // skip blank lines + if ( *ss=='#' ) return -1; // skip comments + + char *se = ss; + while ( *se && *se!=':' ) se++; + + *chr_beg = ss; + *chr_end = se-1; + + if ( !*se ) + { + *beg = 0; + *end = MAX_COOR_0; + return 0; } + ss = se+1; + *beg = hts_parse_decimal(ss, &se, 0); + if ( ss==se ) { hts_log_error("Could not parse reg line: %s", line); return -2; } + if ( *beg==0 ) { hts_log_error("Could not parse reg line, expected 1-based coordinate: %s", line); return -2; } + (*beg)--; + + if ( !se[0] || !se[1] ) + *end = se[0]=='-' ? 
MAX_COOR_0 : *beg; + else + { + ss = se+1; + *end = hts_parse_decimal(ss, &se, 0); + if ( ss==se ) *end = *beg; + else if ( *end==0 ) { hts_log_error("Could not parse reg line, expected 1-based coordinate: %s", line); return -2; } + else (*end)--; + } return 0; } +regitr_t *regitr_init(regidx_t *regidx) +{ + regitr_t *regitr = (regitr_t*) calloc(1,sizeof(regitr_t)); + if (!regitr) return NULL; + regitr->itr = (itr_t_*) calloc(1,sizeof(itr_t_)); + if (!regitr->itr) { + free(regitr); + return NULL; + } + itr_t_ *itr = (itr_t_*) regitr->itr; + itr->ridx = regidx; + itr->list = NULL; + return regitr; +} + +void regitr_reset(regidx_t *regidx, regitr_t *regitr) +{ + itr_t_ *itr = (itr_t_*) regitr->itr; + memset(itr,0,sizeof(itr_t_)); + itr->ridx = regidx; +} + +void regitr_destroy(regitr_t *regitr) +{ + free(regitr->itr); + free(regitr); +} + +int regitr_overlap(regitr_t *regitr) +{ + if ( !regitr || !regitr->seq || !regitr->itr ) return 0; + + itr_t_ *itr = (itr_t_*) regitr->itr; + if ( !itr->active ) + { + // is this the first call after regidx_overlap? 
+ itr->active = 1; + itr->ireg++; + return 1; + } + + reglist_t *list = itr->list; + + int i; + for (i=itr->ireg; inreg; i++) + { + if ( list->reg[i].beg > itr->end ) return 0; // no match, past the query region + if ( list->reg[i].end >= itr->beg && list->reg[i].beg <= itr->end ) break; // found + } + + if ( i >= list->nreg ) return 0; // no match + + itr->ireg = i + 1; + regitr->seq = list->seq; + regitr->beg = list->reg[i].beg; + regitr->end = list->reg[i].end; + if ( itr->ridx->payload_size ) + regitr->payload = (char *)list->dat + itr->ridx->payload_size*i; + + return 1; +} + +int regitr_loop(regitr_t *regitr) +{ + if ( !regitr || !regitr->itr ) return 0; + + itr_t_ *itr = (itr_t_*) regitr->itr; + regidx_t *regidx = itr->ridx; + + if ( !itr->list ) // first time here + { + itr->list = regidx->seq; + itr->ireg = 0; + } + + size_t iseq = itr->list - regidx->seq; + if ( iseq >= regidx->nseq ) return 0; + + if ( itr->ireg >= itr->list->nreg ) + { + iseq++; + if ( iseq >= regidx->nseq ) return 0; // no more sequences, done + itr->ireg = 0; + itr->list = ®idx->seq[iseq]; + } + + regitr->seq = itr->list->seq; + regitr->beg = itr->list->reg[itr->ireg].beg; + regitr->end = itr->list->reg[itr->ireg].end; + if ( regidx->payload_size ) + regitr->payload = (char *)itr->list->dat + regidx->payload_size*itr->ireg; + itr->ireg++; + + return 1; +} + + +void regitr_copy(regitr_t *dst, regitr_t *src) +{ + itr_t_ *dst_itr = (itr_t_*) dst->itr; + itr_t_ *src_itr = (itr_t_*) src->itr; + *dst_itr = *src_itr; + *dst = *src; + dst->itr = dst_itr; +} diff --git a/region.c b/region.c new file mode 100644 index 000000000..8b570e0bf --- /dev/null +++ b/region.c @@ -0,0 +1,276 @@ +/* region.c -- Functions to create and free region lists + + Copyright (C) 2019 Genome Research Ltd. 
+ + Author: Valeriu Ohan + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
*/ + +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h +#include + +#include "htslib/hts.h" +#include "htslib/khash.h" + +typedef struct reglist +{ + uint32_t n, m; + hts_pair_pos_t *a; + int tid; +} reglist_t; + +KHASH_MAP_INIT_INT(reg, reglist_t) +typedef kh_reg_t reghash_t; + +static int compare_hts_pair_pos_t (const void *av, const void *bv) +{ + hts_pair_pos_t *a = (hts_pair_pos_t *) av; + hts_pair_pos_t *b = (hts_pair_pos_t *) bv; + if (a->beg < b->beg) return -1; + if (a->beg > b->beg) return 1; + if (a->end < b->end) return -1; + if (a->end > b->end) return 1; + + return 0; +} + +#if 0 +/** + * Good to have around for debugging + */ +static void reg_print(reghash_t *h) { + reglist_t *p; + khint_t k; + uint32_t i; + khint32_t key; + + if (!h) { + fprintf(stderr, "Hash table is empty!\n"); + return; + } + for (k = kh_begin(h); k < kh_end(h); k++) { + if (kh_exist(h,k)) { + key = kh_key(h,k); + fprintf(stderr, "Region: key %u tid %d\n", key, p->tid); + if ((p = &kh_val(h,k)) != NULL && p->n > 0) { + for (i=0; in; i++) { + fprintf(stderr, "\tinterval[%d]: %"PRIhts_pos"-%"PRIhts_pos"\n", i, + p->a[i].beg, p->a[i].end); + } + } else { + fprintf(stderr, "Region key %u has no intervals!\n", key); + } + } + } +} +#endif + +/** + * Sort and merge overlapping or adjacent intervals. + */ +static int reg_compact(reghash_t *h) { + khint_t i; + uint32_t j, new_n; + reglist_t *p; + int count = 0; + + if (!h) + return 0; + + for (i = kh_begin(h); i < kh_end(h); i++) { + if (!kh_exist(h,i) || !(p = &kh_val(h,i)) || !(p->n)) + continue; + + qsort(p->a, p->n, sizeof(p->a[0]), compare_hts_pair_pos_t); + for (new_n = 0, j = 1; j < p->n; j++) { + if (p->a[new_n].end < p->a[j].beg) { + p->a[++new_n].beg = p->a[j].beg; + p->a[new_n].end = p->a[j].end; + } else { + if (p->a[new_n].end < p->a[j].end) + p->a[new_n].end = p->a[j].end; + } + } + ++new_n; + if (p->n > new_n) { + // Shrink array to required size. 
+ hts_pair_pos_t *new_a = realloc(p->a, new_n * sizeof(p->a[0])); + if (new_a) p->a = new_a; + } + p->n = new_n; + count++; + } + + return count; +} + +static int reg_insert(reghash_t *h, int tid, hts_pos_t beg, hts_pos_t end) { + + khint_t k; + reglist_t *p; + + if (!h) + return -1; + + // Put reg in the hash table if not already there + k = kh_get(reg, h, tid); + if (k == kh_end(h)) { // absent from the hash table + int ret; + k = kh_put(reg, h, tid, &ret); + if (-1 == ret) { + return -1; + } + memset(&kh_val(h, k), 0, sizeof(reglist_t)); + kh_val(h, k).tid = tid; + } + p = &kh_val(h, k); + + // Add beg and end to the list + if (p->n == p->m) { + uint32_t new_m = p->m ? p->m<<1 : 4; + if (new_m == 0) return -1; + hts_pair_pos_t *new_a = realloc(p->a, new_m * sizeof(p->a[0])); + if (new_a == NULL) return -1; + p->m = new_m; + p->a = new_a; + } + p->a[p->n].beg = beg; + p->a[p->n++].end = end; + + return 0; +} + +static void reg_destroy(reghash_t *h) { + + khint_t k; + + if (!h) + return; + + for (k = 0; k < kh_end(h); ++k) { + if (kh_exist(h, k)) { + free(kh_val(h, k).a); + } + } + kh_destroy(reg, h); +} + +/** + * Take a char array of reg:interval elements and produce a hts_reglis_t with r_count elements. + */ +hts_reglist_t *hts_reglist_create(char **argv, int argc, int *r_count, void *hdr, hts_name2id_f getid) { + + if (!argv || argc < 1) + return NULL; + + reghash_t *h = NULL; + reglist_t *p; + hts_reglist_t *h_reglist = NULL; + + khint_t k; + int i, l_count = 0, tid; + const char *q; + hts_pos_t beg, end; + + /* First, transform the char array into a hash table */ + h = kh_init(reg); + if (!h) { + hts_log_error("Error when creating the region hash table"); + return NULL; + } + + for (i=0; itid; + h_reglist[l_count].intervals = p->a; + h_reglist[l_count].count = p->n; + p->a = NULL; // As we stole it. + + // After reg_compact(), list is ordered and non-overlapping, so... 
+ if (p->n > 0) { + h_reglist[l_count].min_beg = h_reglist[l_count].intervals[0].beg; + h_reglist[l_count].max_end = h_reglist[l_count].intervals[p->n - 1].end; + } else { + h_reglist[l_count].min_beg = 0; + h_reglist[l_count].max_end = 0; + } + + l_count++; + } + reg_destroy(h); + + return h_reglist; + +fail: + reg_destroy(h); + if(h_reglist) hts_reglist_free(h_reglist, l_count); + + return NULL; +} + +void hts_reglist_free(hts_reglist_t *reglist, int count) { + + int i; + if(reglist) { + for (i = 0; i < count; i++) { + if (reglist[i].intervals) + free(reglist[i].intervals); + } + free(reglist); + } +} diff --git a/sam.5 b/sam.5 index 66542bbff..46ba42c66 100644 --- a/sam.5 +++ b/sam.5 @@ -3,7 +3,7 @@ .SH NAME sam \- Sequence Alignment/Map file format .\" -.\" Copyright (C) 2009, 2013 Genome Research Ltd. +.\" Copyright (C) 2009, 2013-2014 Genome Research Ltd. .\" .\" Author: Heng Li .\" diff --git a/sam.c b/sam.c index aa947761d..688147633 100644 --- a/sam.c +++ b/sam.c @@ -1,6 +1,6 @@ /* sam.c -- SAM and BAM file I/O and manipulation. - Copyright (C) 2008-2010, 2012-2018 Genome Research Ltd. + Copyright (C) 2008-2010, 2012-2019 Genome Research Ltd. Copyright (C) 2010, 2012, 2013 Broad Institute. Author: Heng Li @@ -23,6 +23,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include @@ -32,18 +33,29 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include #include +#include +#include + +// Suppress deprecation message for cigar_tab, which we initialise +#include "htslib/hts_defs.h" +#undef HTS_DEPRECATED +#define HTS_DEPRECATED(message) + #include "htslib/sam.h" #include "htslib/bgzf.h" #include "cram/cram.h" #include "hts_internal.h" +#include "sam_internal.h" #include "htslib/hfile.h" #include "htslib/hts_endian.h" +#include "header.h" #include "htslib/khash.h" KHASH_DECLARE(s2i, kh_cstr_t, int64_t) -typedef khash_t(s2i) sdict_t; - +#ifndef EFTYPE +#define EFTYPE ENOEXEC +#endif #ifndef EOVERFLOW #define EOVERFLOW ERANGE #endif @@ -52,75 +64,163 @@ typedef khash_t(s2i) sdict_t; *** BAM header I/O *** **********************/ -bam_hdr_t *bam_hdr_init() +HTSLIB_EXPORT +const int8_t bam_cigar_table[256] = { + // 0 .. 47 + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + // 48 .. 63 (including =) + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, BAM_CEQUAL, -1, -1, + + // 64 .. 79 (including MIDNHB) + -1, -1, BAM_CBACK, -1, BAM_CDEL, -1, -1, -1, + BAM_CHARD_CLIP, BAM_CINS, -1, -1, -1, BAM_CMATCH, BAM_CREF_SKIP, -1, + + // 80 .. 95 (including SPX) + BAM_CPAD, -1, -1, BAM_CSOFT_CLIP, -1, -1, -1, -1, + BAM_CDIFF, -1, -1, -1, -1, -1, -1, -1, + + // 96 .. 127 + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + // 128 .. 
255 + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 +}; + +sam_hdr_t *sam_hdr_init() { - return (bam_hdr_t*)calloc(1, sizeof(bam_hdr_t)); + sam_hdr_t *bh = (sam_hdr_t*)calloc(1, sizeof(sam_hdr_t)); + if (bh == NULL) return NULL; + + bh->cigar_tab = bam_cigar_table; + return bh; } -void bam_hdr_destroy(bam_hdr_t *h) +void sam_hdr_destroy(sam_hdr_t *bh) { int32_t i; - if (h == NULL) return; - if (h->target_name) { - for (i = 0; i < h->n_targets; ++i) - free(h->target_name[i]); - free(h->target_name); - free(h->target_len); + + if (bh == NULL) return; + + if (bh->ref_count > 0) { + --bh->ref_count; + return; + } + + if (bh->target_name) { + for (i = 0; i < bh->n_targets; ++i) + free(bh->target_name[i]); + free(bh->target_name); + free(bh->target_len); + } + free(bh->text); + if (bh->hrecs) + sam_hrecs_free(bh->hrecs); + if (bh->sdict) + kh_destroy(s2i, (khash_t(s2i) *) bh->sdict); + free(bh); +} + +// Copy the sam_hdr_t::sdict hash, used to store the real lengths of long +// references before sam_hdr_t::hrecs is populated +int sam_hdr_dup_sdict(const sam_hdr_t *h0, sam_hdr_t *h) +{ + const khash_t(s2i) *src_long_refs = (khash_t(s2i) *) h0->sdict; + khash_t(s2i) *dest_long_refs = kh_init(s2i); + int i; + if (!dest_long_refs) return -1; + + for (i = 0; i < h->n_targets; i++) { + int ret; + khiter_t ksrc, kdest; + if (h->target_len[i] < UINT32_MAX) continue; + ksrc = kh_get(s2i, src_long_refs, h->target_name[i]); + if (ksrc == kh_end(src_long_refs)) continue; + kdest = kh_put(s2i, dest_long_refs, 
h->target_name[i], &ret); + if (ret < 0) { + kh_destroy(s2i, dest_long_refs); + return -1; + } + kh_val(dest_long_refs, kdest) = kh_val(src_long_refs, ksrc); } - free(h->text); free(h->cigar_tab); - if (h->sdict) kh_destroy(s2i, (sdict_t*)h->sdict); - free(h); + + h->sdict = dest_long_refs; + return 0; } -bam_hdr_t *bam_hdr_dup(const bam_hdr_t *h0) +sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0) { if (h0 == NULL) return NULL; - bam_hdr_t *h; - if ((h = bam_hdr_init()) == NULL) return NULL; + sam_hdr_t *h; + if ((h = sam_hdr_init()) == NULL) return NULL; // copy the simple data - h->n_targets = h0->n_targets; + h->n_targets = 0; h->ignore_sam_err = h0->ignore_sam_err; - h->l_text = h0->l_text; + h->l_text = 0; + // Then the pointery stuff - h->cigar_tab = NULL; - h->sdict = NULL; - // TODO Check for memory allocation failures - h->text = (char*)calloc(h->l_text + 1, 1); - memcpy(h->text, h0->text, h->l_text); - h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t)); - h->target_name = (char**)calloc(h->n_targets, sizeof(char*)); - int i; - for (i = 0; i < h->n_targets; ++i) { - h->target_len[i] = h0->target_len[i]; - h->target_name[i] = strdup(h0->target_name[i]); + + if (!h0->hrecs) { + h->target_len = (uint32_t*)calloc(h0->n_targets, sizeof(uint32_t)); + if (!h->target_len) goto fail; + h->target_name = (char**)calloc(h0->n_targets, sizeof(char*)); + if (!h->target_name) goto fail; + + int i; + for (i = 0; i < h0->n_targets; ++i) { + h->target_len[i] = h0->target_len[i]; + h->target_name[i] = strdup(h0->target_name[i]); + if (!h->target_name[i]) break; + } + h->n_targets = i; + if (i < h0->n_targets) goto fail; + + if (h0->sdict) { + if (sam_hdr_dup_sdict(h0, h) < 0) goto fail; + } } - return h; -} + if (h0->hrecs) { + kstring_t tmp = { 0, 0, NULL }; + if (sam_hrecs_rebuild_text(h0->hrecs, &tmp) != 0) { + free(ks_release(&tmp)); + goto fail; + } + + h->l_text = tmp.l; + h->text = ks_release(&tmp); -static bam_hdr_t *hdr_from_dict(sdict_t *d) -{ - bam_hdr_t 
*h; - khint_t k; - h = bam_hdr_init(); - h->sdict = d; - h->n_targets = kh_size(d); - // TODO Check for memory allocation failures - h->target_len = (uint32_t*)malloc(sizeof(uint32_t) * h->n_targets); - h->target_name = (char**)malloc(sizeof(char*) * h->n_targets); - for (k = kh_begin(d); k != kh_end(d); ++k) { - if (!kh_exist(d, k)) continue; - h->target_name[kh_val(d, k)>>32] = (char*)kh_key(d, k); - h->target_len[kh_val(d, k)>>32] = kh_val(d, k) & 0xffffffffUL; - kh_val(d, k) >>= 32; + if (sam_hdr_update_target_arrays(h, h0->hrecs, 0) != 0) + goto fail; + } else { + h->l_text = h0->l_text; + h->text = malloc(h->l_text + 1); + if (!h->text) goto fail; + memcpy(h->text, h0->text, h->l_text); + h->text[h->l_text] = '\0'; } + return h; + + fail: + sam_hdr_destroy(h); + return NULL; } -bam_hdr_t *bam_hdr_read(BGZF *fp) +sam_hdr_t *bam_hdr_read(BGZF *fp) { - bam_hdr_t *h; - char buf[4]; + sam_hdr_t *h; + uint8_t buf[4]; int magic_len, has_EOF; int32_t i, name_len, num_names = 0; size_t bufsize; @@ -134,19 +234,19 @@ bam_hdr_t *bam_hdr_read(BGZF *fp) } // read "BAM1" magic_len = bgzf_read(fp, buf, 4); - if (magic_len != 4 || strncmp(buf, "BAM\1", 4)) { + if (magic_len != 4 || memcmp(buf, "BAM\1", 4)) { hts_log_error("Invalid BAM binary header"); return 0; } - h = bam_hdr_init(); + h = sam_hdr_init(); if (!h) goto nomem; // read plain text and the number of reference sequences - bytes = bgzf_read(fp, &h->l_text, 4); + bytes = bgzf_read(fp, buf, 4); if (bytes != 4) goto read_err; - if (fp->is_be) ed_swap_4p(&h->l_text); + h->l_text = le_to_u32(buf); - bufsize = ((size_t) h->l_text) + 1; + bufsize = h->l_text + 1; if (bufsize < h->l_text) goto nomem; // so large that adding 1 overflowed h->text = (char*)malloc(bufsize); if (!h->text) goto nomem; @@ -220,32 +320,56 @@ bam_hdr_t *bam_hdr_read(BGZF *fp) clean: if (h != NULL) { h->n_targets = num_names; // ensure we free only allocated target_names - bam_hdr_destroy(h); + sam_hdr_destroy(h); } return NULL; } -int 
bam_hdr_write(BGZF *fp, const bam_hdr_t *h) +int bam_hdr_write(BGZF *fp, const sam_hdr_t *h) { int32_t i, name_len, x; + kstring_t hdr_ks = { 0, 0, NULL }; + char *text; + uint32_t l_text; + + if (!h) return -1; + + if (h->hrecs) { + if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0) return -1; + if (hdr_ks.l > INT32_MAX) { + hts_log_error("Header too long for BAM format"); + free(hdr_ks.s); + return -1; + } + text = hdr_ks.s; + l_text = hdr_ks.l; + } else { + if (h->l_text > INT32_MAX) { + hts_log_error("Header too long for BAM format"); + return -1; + } + text = h->text; + l_text = h->l_text; + } // write "BAM1" - if (bgzf_write(fp, "BAM\1", 4) < 0) return -1; + if (bgzf_write(fp, "BAM\1", 4) < 0) { free(hdr_ks.s); return -1; } // write plain text and the number of reference sequences if (fp->is_be) { - x = ed_swap_4(h->l_text); - if (bgzf_write(fp, &x, 4) < 0) return -1; - if (h->l_text) { - if (bgzf_write(fp, h->text, h->l_text) < 0) return -1; + x = ed_swap_4(l_text); + if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; } + if (l_text) { + if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; } } x = ed_swap_4(h->n_targets); - if (bgzf_write(fp, &x, 4) < 0) return -1; + if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; } } else { - if (bgzf_write(fp, &h->l_text, 4) < 0) return -1; - if (h->l_text) { - if (bgzf_write(fp, h->text, h->l_text) < 0) return -1; + if (bgzf_write(fp, &l_text, 4) < 0) { free(hdr_ks.s); return -1; } + if (l_text) { + if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; } } - if (bgzf_write(fp, &h->n_targets, 4) < 0) return -1; + if (bgzf_write(fp, &h->n_targets, 4) < 0) { free(hdr_ks.s); return -1; } } + free(hdr_ks.s); // write sequence names and lengths for (i = 0; i != h->n_targets; ++i) { char *p = h->target_name[i]; @@ -268,21 +392,9 @@ int bam_hdr_write(BGZF *fp, const bam_hdr_t *h) return 0; } -int bam_name2id(bam_hdr_t *h, const char *ref) -{ - sdict_t *d = (sdict_t*)h->sdict; - 
khint_t k; - if (h->sdict == 0) { - int i, absent; - d = kh_init(s2i); - for (i = 0; i < h->n_targets; ++i) { - k = kh_put(s2i, d, h->target_name[i], &absent); - kh_val(d, k) = i; - } - h->sdict = d; - } - k = kh_get(s2i, d, ref); - return k == kh_end(d)? -1 : kh_val(d, k); +const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid, + hts_pos_t *beg, hts_pos_t *end, int flags) { + return hts_parse_region(s, tid, beg, end, (hts_name2id_f)bam_name2id, h, flags); } /************************* @@ -294,7 +406,7 @@ bam1_t *bam_init1() return (bam1_t*)calloc(1, sizeof(bam1_t)); } -static int do_realloc_bam_data(bam1_t *b, size_t desired) +int sam_realloc_bam_data(bam1_t *b, size_t desired) { uint32_t new_m_data; uint8_t *new_data; @@ -304,49 +416,46 @@ static int do_realloc_bam_data(bam1_t *b, size_t desired) errno = ENOMEM; // Not strictly true but we can't store the size return -1; } - new_data = realloc(b->data, new_m_data); + if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) { + new_data = realloc(b->data, new_m_data); + } else { + if ((new_data = malloc(new_m_data)) != NULL) { + if (b->l_data > 0) + memcpy(new_data, b->data, + b->l_data < b->m_data ? 
b->l_data : b->m_data); + bam_set_mempolicy(b, bam_get_mempolicy(b) & (~BAM_USER_OWNS_DATA)); + } + } if (!new_data) return -1; b->data = new_data; b->m_data = new_m_data; return 0; } -static inline int realloc_bam_data(bam1_t *b, size_t desired) -{ - if (desired <= b->m_data) return 0; - return do_realloc_bam_data(b, desired); -} - -static inline int possibly_expand_bam_data(bam1_t *b, size_t bytes) { - uint32_t new_len = b->l_data + bytes; - - if (new_len > INT32_MAX || new_len < b->l_data) { - errno = ENOMEM; - return -1; - } - if (new_len <= b->m_data) return 0; - return do_realloc_bam_data(b, new_len); -} - void bam_destroy1(bam1_t *b) { if (b == 0) return; - free(b->data); free(b); + if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) { + free(b->data); + if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) != 0) { + // In case of reuse + b->data = NULL; + b->m_data = 0; + b->l_data = 0; + } + } + + if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) == 0) + free(b); } bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc) { - uint8_t *data = bdst->data; - int m_data = bdst->m_data; // backup data and m_data - if (m_data < bsrc->l_data) { // double the capacity - m_data = bsrc->l_data; kroundup32(m_data); - data = (uint8_t*)realloc(data, m_data); - } - memcpy(data, bsrc->data, bsrc->l_data); // copy var-len data - *bdst = *bsrc; // copy the rest - // restore the backup - bdst->m_data = m_data; - bdst->data = data; + if (realloc_bam_data(bdst, bsrc->l_data) < 0) return NULL; + memcpy(bdst->data, bsrc->data, bsrc->l_data); // copy var-len data + memcpy(&bdst->core, &bsrc->core, sizeof(bsrc->core)); // copy the rest + bdst->l_data = bsrc->l_data; + bdst->id = bsrc->id; return bdst; } @@ -355,10 +464,15 @@ bam1_t *bam_dup1(const bam1_t *bsrc) if (bsrc == NULL) return NULL; bam1_t *bdst = bam_init1(); if (bdst == NULL) return NULL; - return bam_copy1(bdst, bsrc); + if (bam_copy1(bdst, bsrc) == NULL) { + bam_destroy1(bdst); + return NULL; + } + return bdst; } -void 
bam_cigar2rqlens(int n_cigar, const uint32_t *cigar, int *rlen, int *qlen) +static void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar, + hts_pos_t *rlen, hts_pos_t *qlen) { int k; *rlen = *qlen = 0; @@ -370,25 +484,27 @@ void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar, int *rlen, int *qlen) } } -int bam_cigar2qlen(int n_cigar, const uint32_t *cigar) +hts_pos_t bam_cigar2qlen(int n_cigar, const uint32_t *cigar) { - int k, l; + int k; + hts_pos_t l; for (k = l = 0; k < n_cigar; ++k) if (bam_cigar_type(bam_cigar_op(cigar[k]))&1) l += bam_cigar_oplen(cigar[k]); return l; } -int bam_cigar2rlen(int n_cigar, const uint32_t *cigar) +hts_pos_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar) { - int k, l; + int k; + hts_pos_t l; for (k = l = 0; k < n_cigar; ++k) if (bam_cigar_type(bam_cigar_op(cigar[k]))&2) l += bam_cigar_oplen(cigar[k]); return l; } -int32_t bam_endpos(const bam1_t *b) +hts_pos_t bam_endpos(const bam1_t *b) { if (!(b->core.flag & BAM_FUNMAP) && b->core.n_cigar > 0) return b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); @@ -457,11 +573,36 @@ static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_ho for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]); } +// Fix bad records where qname is not terminated correctly. +static int fixup_missing_qname_nul(bam1_t *b) { + bam1_core_t *c = &b->core; + + // Note this is called before c->l_extranul is added to c->l_qname + if (c->l_extranul > 0) { + b->data[c->l_qname++] = '\0'; + c->l_extranul--; + } else { + if (b->l_data > INT_MAX - 4) return -1; + if (realloc_bam_data(b, b->l_data + 4) < 0) return -1; + b->l_data += 4; + b->data[c->l_qname++] = '\0'; + c->l_extranul = 3; + } + return 0; +} + +/* + * Note a second interface that returns a bam pointer instead would avoid bam_copy1 + * in multi-threaded handling. This may be worth considering for htslib2. 
+ */ int bam_read1(BGZF *fp, bam1_t *b) { bam1_core_t *c = &b->core; int32_t block_len, ret, i; uint32_t x[8], new_l_data; + + b->l_data = 0; + if ((ret = bgzf_read(fp, &block_len, 4)) != 4) { if (ret == 0) return -1; // normal end-of-file else return -2; // truncated @@ -473,14 +614,12 @@ int bam_read1(BGZF *fp, bam1_t *b) if (fp->is_be) { for (i = 0; i < 8; ++i) ed_swap_4p(x + i); } - c->tid = x[0]; c->pos = x[1]; + c->tid = x[0]; c->pos = (int32_t)x[1]; c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff; c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0; - if ((uint32_t) c->l_qname + c->l_extranul > 255) // l_qname would overflow - return -4; c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff; c->l_qseq = x[4]; - c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7]; + c->mtid = x[5]; c->mpos = (int32_t)x[6]; c->isize = (int32_t)x[7]; new_l_data = block_len - 32 + c->l_extranul; if (new_l_data > INT_MAX || c->l_qseq < 0 || c->l_qname < 1) return -4; @@ -491,6 +630,9 @@ int bam_read1(BGZF *fp, bam1_t *b) b->l_data = new_l_data; if (bgzf_read(fp, b->data, c->l_qname) != c->l_qname) return -4; + if (b->data[c->l_qname - 1] != '\0') { // Try to fix missing NUL termination + if (fixup_missing_qname_nul(b) < 0) return -4; + } for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0'; c->l_qname += c->l_extranul; if (b->l_data < c->l_qname || @@ -501,7 +643,7 @@ int bam_read1(BGZF *fp, bam1_t *b) return -4; if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency - int rlen, qlen; + hts_pos_t rlen, qlen; bam_cigar2rqlens(c->n_cigar, bam_get_cigar(b), &rlen, &qlen); if ((b->core.flag & BAM_FUNMAP)) rlen=1; b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + rlen, 14, 5); @@ -521,7 +663,18 @@ int bam_write1(BGZF *fp, const bam1_t *b) const bam1_core_t *c = &b->core; uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y; int i, ok; + if (c->l_qname - c->l_extranul > 255) { + hts_log_error("QNAME \"%s\" is longer than 254 
characters", bam_get_qname(b)); + errno = EOVERFLOW; + return -1; + } if (c->n_cigar > 0xffff) block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR + if (c->pos > INT_MAX || + c->mpos > INT_MAX || + c->isize < INT_MIN || c->isize > INT_MAX) { + hts_log_error("Positional data is too large for BAM format"); + return -1; + } x[0] = c->tid; x[1] = c->pos; x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul); @@ -547,10 +700,19 @@ int bam_write1(BGZF *fp, const bam1_t *b) } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag uint8_t buf[8]; uint32_t cigar_st, cigar_en, cigar[2]; + hts_pos_t cigreflen = bam_cigar2rlen(c->n_cigar, bam_get_cigar(b)); + if (cigreflen >= (1<<28)) { + // Length of reference covered is greater than the biggest + // CIGAR operation currently allowed. + hts_log_error("Record %s with %d CIGAR ops and ref length %"PRIhts_pos + " cannot be written in BAM. Try writing SAM or CRAM instead.\n", + bam_get_qname(b), c->n_cigar, cigreflen); + return -1; + } cigar_st = (uint8_t*)bam_get_cigar(b) - b->data; cigar_en = cigar_st + c->n_cigar * 4; cigar[0] = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP; - cigar[1] = (uint32_t)bam_cigar2rlen(c->n_cigar, bam_get_cigar(b)) << 4 | BAM_CREF_SKIP; + cigar[1] = (uint32_t)cigreflen << 4 | BAM_CREF_SKIP; u32_to_le(cigar[0], buf); u32_to_le(cigar[1], buf + 4); if (ok) ok = (bgzf_write(fp, buf, 8) >= 0); // write cigar: SN @@ -564,36 +726,101 @@ int bam_write1(BGZF *fp, const bam1_t *b) return ok? 4 + block_len : -1; } +/* + * Write a BAM file and append to the in-memory index simultaneously. 
+ */ +static int bam_write_idx1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) { + BGZF *bfp = fp->fp.bgzf; + + if (!fp->idx) + return bam_write1(bfp, b); + + uint32_t block_len = b->l_data - b->core.l_extranul + 32; + if (bgzf_flush_try(bfp, 4 + block_len) < 0) + return -1; + if (!bfp->mt) + hts_idx_amend_last(fp->idx, bgzf_tell(bfp)); + + int ret = bam_write1(bfp, b); + if (ret < 0) + return -1; + + if (bgzf_idx_push(bfp, fp->idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(bfp), !(b->core.flag&BAM_FUNMAP)) < 0) { + hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", + bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1); + ret = -1; + } + + return ret; +} + +/* + * Set the qname in a BAM record + */ +int bam_set_qname(bam1_t *rec, const char *qname) +{ + if (!rec) return -1; + if (!qname || !*qname) return -1; + + size_t old_len = rec->core.l_qname; + size_t new_len = strlen(qname) + 1; + if (new_len < 1 || new_len > 255) return -1; + + int extranul = (new_len%4 != 0) ? 
(4 - new_len%4) : 0; + + size_t new_data_len = rec->l_data - old_len + new_len + extranul; + if (realloc_bam_data(rec, new_data_len) < 0) return -1; + + // Make room + if (new_len + extranul != rec->core.l_qname) + memmove(rec->data + new_len + extranul, rec->data + rec->core.l_qname, rec->l_data - rec->core.l_qname); + // Copy in new name and pad if needed + memcpy(rec->data, qname, new_len); + int n; + for (n = 0; n < extranul; n++) rec->data[new_len + n] = '\0'; + + rec->l_data = new_data_len; + rec->core.l_qname = new_len + extranul; + rec->core.l_extranul = extranul; + + return 0; +} + /******************** *** BAM indexing *** ********************/ -static hts_idx_t *bam_index(BGZF *fp, int min_shift) +static hts_idx_t *sam_index(htsFile *fp, int min_shift) { int n_lvls, i, fmt, ret; bam1_t *b; hts_idx_t *idx; - bam_hdr_t *h; - h = bam_hdr_read(fp); + sam_hdr_t *h; + h = sam_hdr_read(fp); if (h == NULL) return NULL; if (min_shift > 0) { - int64_t max_len = 0, s; - for (i = 0; i < h->n_targets; ++i) - if (max_len < h->target_len[i]) max_len = h->target_len[i]; + hts_pos_t max_len = 0, s; + for (i = 0; i < h->n_targets; ++i) { + hts_pos_t len = sam_hdr_tid2len(h, i); + if (max_len < len) max_len = len; + } max_len += 256; for (n_lvls = 0, s = 1< s; ++n_lvls, s <<= 3); fmt = HTS_FMT_CSI; } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI; - idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp), min_shift, n_lvls); - bam_hdr_destroy(h); + idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls); b = bam_init1(); - while ((ret = bam_read1(fp, b)) >= 0) { - ret = hts_idx_push(idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(fp), !(b->core.flag&BAM_FUNMAP)); - if (ret < 0) goto err; // unsorted + while ((ret = sam_read1(fp, h, b)) >= 0) { + ret = hts_idx_push(idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)); + if (ret < 0) { // unsorted or doesn't fit + hts_log_error("Read '%s' with 
ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1); + goto err; + } } if (ret < -1) goto err; // corrupted BAM file - hts_idx_finish(idx, bgzf_tell(fp)); + hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf)); + sam_hdr_destroy(h); bam_destroy1(b); return idx; @@ -615,11 +842,19 @@ int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthre switch (fp->format.format) { case cram: + ret = cram_index_build(fp->fp.cram, fn, fnidx); break; case bam: - idx = bam_index(fp->fp.bgzf, min_shift); + case sam: + if (!fp->is_bgzf) { + hts_log_error("%s file \"%s\" not BGZF compressed", + fp->format.format == bam ? "BAM" : "SAM", fn); + ret = -1; + break; + } + idx = sam_index(fp, min_shift); if (idx) { ret = hts_idx_save_as(idx, fn, fnidx, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI); if (ret < 0) ret = -4; @@ -654,11 +889,68 @@ int bam_index_build(const char *fn, int min_shift) return sam_index_build2(fn, NULL, min_shift); } -static int bam_readrec(BGZF *fp, void *ignored, void *bv, int *tid, int *beg, int *end) +// Initialise fp->idx for the current format type. +// This must be called after the header has been written but no other data. +int sam_idx_init(htsFile *fp, sam_hdr_t *h, int min_shift, const char *fnidx) { + fp->fnidx = fnidx; + if (fp->format.format == bam || fp->format.format == bcf || + (fp->format.format == sam && fp->format.compression == bgzf)) { + int n_lvls, fmt = HTS_FMT_CSI; + if (min_shift > 0) { + int64_t max_len = 0, s; + int i; + for (i = 0; i < h->n_targets; ++i) + if (max_len < h->target_len[i]) max_len = h->target_len[i]; + max_len += 256; + for (n_lvls = 0, s = 1< s; ++n_lvls, s <<= 3); + + } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI; + + fp->idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls); + return fp->idx ? 
0 : -1; + } + + if (fp->format.format == cram) { + fp->fp.cram->idxfp = bgzf_open(fnidx, "wg"); + return fp->fp.cram->idxfp ? 0 : -1; + } + + return -1; +} + +// Finishes an index. Call afer the last record has been written. +// Returns 0 on success, <0 on failure. +int sam_idx_save(htsFile *fp) { + if (fp->format.format == bam || fp->format.format == bcf || + fp->format.format == vcf || fp->format.format == sam) { + int ret; + if ((ret = sam_state_destroy(fp)) < 0) { + errno = -ret; + return -1; + } + if (bgzf_flush(fp->fp.bgzf) < 0) + return -1; + hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf)); + + if (hts_idx_finish(fp->idx, bgzf_tell(fp->fp.bgzf)) < 0) + return -1; + + return hts_idx_save_as(fp->idx, NULL, fp->fnidx, hts_idx_fmt(fp->idx)); + + } else if (fp->format.format == cram) { + // flushed and closed by cram_close + } + + return 0; +} + +static int sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end) { + htsFile *fp = (htsFile *)fpv; bam1_t *b = bv; - int ret; - if ((ret = bam_read1(fp, b)) >= 0) { + fp->line.l = 0; + int ret = sam_read1(fp, fp->bam_header, b); + if (ret >= 0) { *tid = b->core.tid; *beg = b->core.pos; *end = bam_endpos(b); @@ -667,7 +959,16 @@ static int bam_readrec(BGZF *fp, void *ignored, void *bv, int *tid, int *beg, in } // This is used only with read_rest=1 iterators, so need not set tid/beg/end. 
-static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int *beg, int *end) +static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end) +{ + htsFile *fp = (htsFile *)fpv; + bam1_t *b = bv; + fp->line.l = 0; + int ret = sam_read1(fp, fp->bam_header, b); + return ret; +} + +static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end) { htsFile *fp = fpv; bam1_t *b = bv; @@ -693,6 +994,8 @@ static int cram_pseek(void *fp, int64_t offset, int whence) && (0 != cram_seek(fd, offset - fd->first_container, SEEK_CUR))) return -1; + fd->curr_position = offset; + if (fd->ctr) { cram_free_container(fd->ctr); if (fd->ctr_mt && fd->ctr_mt != fd->ctr) @@ -717,13 +1020,17 @@ static int64_t cram_ptell(void *fp) { cram_fd *fd = (cram_fd *)fp; cram_container *c; + cram_slice *s; int64_t ret = -1L; - if (fd && fd->fp) { - ret = htell(fd->fp); + if (fd) { if ((c = fd->ctr) != NULL) { - ret -= ((c->curr_slice < c->max_slice || c->curr_rec < c->num_records) ? c->offset + 1 : 0); + if ((s = c->slice) != NULL && s->max_rec) { + if ((c->curr_slice + s->curr_rec/s->max_rec) >= (c->max_slice + 1)) + fd->curr_position += c->offset + c->length; + } } + ret = fd->curr_position; } return ret; @@ -745,37 +1052,18 @@ static int64_t bam_ptell(void *fp) return bgzf_tell(fd); } -// This is used only with read_rest=1 iterators, so need not set tid/beg/end. -static int sam_bam_cram_readrec(BGZF *bgzfp, void *fpv, void *bv, int *tid, int *beg, int *end) -{ - htsFile *fp = fpv; - bam1_t *b = bv; - switch (fp->format.format) { - case bam: return bam_read1(bgzfp, b); - case cram: { - int ret = cram_get_bam_seq(fp->fp.cram, &b); - if (ret < 0) - return cram_eof(fp->fp.cram) ? 
-1 : -2; - if (bam_tag2cigar(b, 1, 1) < 0) - return -2; - return ret; - } - default: - // TODO Need headers available to implement this for SAM files - hts_log_error("Not implemented for SAM files"); - abort(); - } -} -hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx) +static hts_idx_t *index_load(htsFile *fp, const char *fn, const char *fnidx, int flags) { switch (fp->format.format) { case bam: - return fnidx? hts_idx_load2(fn, fnidx) : hts_idx_load(fn, HTS_FMT_BAI); + case sam: + return hts_idx_load3(fn, fnidx, HTS_FMT_BAI, flags); case cram: { if (cram_index_load(fp->fp.cram, fn, fnidx) < 0) return NULL; + // Cons up a fake "index" just pointing at the associated cram_fd: hts_cram_idx_t *idx = malloc(sizeof (hts_cram_idx_t)); if (idx == NULL) return NULL; @@ -789,12 +1077,21 @@ hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx) } } +hts_idx_t *sam_index_load3(htsFile *fp, const char *fn, const char *fnidx, int flags) +{ + return index_load(fp, fn, fnidx, flags); +} + +hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx) { + return index_load(fp, fn, fnidx, HTS_IDX_SAVE_REMOTE); +} + hts_idx_t *sam_index_load(htsFile *fp, const char *fn) { - return sam_index_load2(fp, fn, NULL); + return index_load(fp, fn, NULL, HTS_IDX_SAVE_REMOTE); } -static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec) +static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec) { const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t)); @@ -851,41 +1148,74 @@ static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, int beg, int end return iter; } -hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end) +hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end) { const hts_cram_idx_t *cidx = 
(const hts_cram_idx_t *) idx; if (idx == NULL) - return hts_itr_query(NULL, tid, beg, end, sam_bam_cram_readrec); + return hts_itr_query(NULL, tid, beg, end, sam_readrec_rest); else if (cidx->fmt == HTS_FMT_CRAI) - return cram_itr_query(idx, tid, beg, end, cram_readrec); + return cram_itr_query(idx, tid, beg, end, sam_readrec); else - return hts_itr_query(idx, tid, beg, end, bam_readrec); + return hts_itr_query(idx, tid, beg, end, sam_readrec); } static int cram_name2id(void *fdv, const char *ref) { cram_fd *fd = (cram_fd *) fdv; - return sam_hdr_name2ref(fd->header, ref); + return sam_hdr_name2tid(fd->header, ref); } -hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region) +hts_itr_t *sam_itr_querys(const hts_idx_t *idx, sam_hdr_t *hdr, const char *region) { const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; - if (cidx->fmt == HTS_FMT_CRAI) - return hts_itr_querys(idx, region, cram_name2id, cidx->cram, cram_itr_query, cram_readrec); - else - return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr, hts_itr_query, bam_readrec); + return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr, + cidx->fmt == HTS_FMT_CRAI ? 
cram_itr_query : hts_itr_query, + sam_readrec); +} + +hts_itr_t *sam_itr_regarray(const hts_idx_t *idx, sam_hdr_t *hdr, char **regarray, unsigned int regcount) +{ + const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; + hts_reglist_t *r_list = NULL; + int r_count = 0; + + if (!cidx || !hdr) + return NULL; + + hts_itr_t *itr = NULL; + if (cidx->fmt == HTS_FMT_CRAI) { + r_list = hts_reglist_create(regarray, regcount, &r_count, cidx->cram, cram_name2id); + if (!r_list) + return NULL; + itr = hts_itr_regions(idx, r_list, r_count, cram_name2id, cidx->cram, + hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell); + } else { + r_list = hts_reglist_create(regarray, regcount, &r_count, hdr, (hts_name2id_f)(bam_name2id)); + if (!r_list) + return NULL; + itr = hts_itr_regions(idx, r_list, r_count, (hts_name2id_f)(bam_name2id), hdr, + hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell); + } + + if (!itr) + hts_reglist_free(r_list, r_count); + + return itr; } -hts_itr_multi_t *sam_itr_regions(const hts_idx_t *idx, bam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount) +hts_itr_t *sam_itr_regions(const hts_idx_t *idx, sam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount) { const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; + + if(!cidx || !hdr || !reglist) + return NULL; + if (cidx->fmt == HTS_FMT_CRAI) return hts_itr_regions(idx, reglist, regcount, cram_name2id, cidx->cram, hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell); else return hts_itr_regions(idx, reglist, regcount, (hts_name2id_f)(bam_name2id), hdr, - hts_itr_multi_bam, bam_readrec, bam_pseek, bam_ptell); + hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell); } /********************** @@ -895,41 +1225,17 @@ hts_itr_multi_t *sam_itr_regions(const hts_idx_t *idx, bam_hdr_t *hdr, hts_regli #include "htslib/kseq.h" #include "htslib/kstring.h" -bam_hdr_t *sam_hdr_parse(int l_text, const char *text) +sam_hdr_t *sam_hdr_parse(size_t l_text, const char *text) { - const char *q, 
*r, *p; - khash_t(s2i) *d; - d = kh_init(s2i); - for (p = text; *p; ++p) { - if (strncmp(p, "@SQ\t", 4) == 0) { - char *sn = 0; - int ln = -1; - for (q = p + 4;; ++q) { - if (strncmp(q, "SN:", 3) == 0) { - q += 3; - for (r = q; *r != '\t' && *r != '\n' && *r != '\0'; ++r); - sn = (char*)calloc(r - q + 1, 1); - strncpy(sn, q, r - q); - q = r; - } else if (strncmp(q, "LN:", 3) == 0) - ln = strtol(q + 3, (char**)&q, 10); - while (*q != '\t' && *q != '\n' && *q != '\0') ++q; - if (*q == '\0' || *q == '\n') break; - } - p = q; - if (sn && ln >= 0) { - khint_t k; - int absent; - k = kh_put(s2i, d, sn, &absent); - if (!absent) { - hts_log_warning("Duplicated sequence '%s'", sn); - free(sn); - } else kh_val(d, k) = (int64_t)(kh_size(d) - 1)<<32 | ln; - } - } - while (*p != '\0' && *p != '\n') ++p; + sam_hdr_t *bh = sam_hdr_init(); + if (!bh) return NULL; + + if (sam_hdr_add_lines(bh, text, l_text) != 0) { + sam_hdr_destroy(bh); + return NULL; } - return hdr_from_dict(d); + + return bh; } // Minimal sanitisation of a header to ensure. @@ -941,7 +1247,7 @@ bam_hdr_t *sam_hdr_parse(int l_text, const char *text) // - syntax (eg checking tab separated fields). // - validating n_targets matches @SQ records. // - validating target lengths against @SQ records. -static bam_hdr_t *sam_hdr_sanitise(bam_hdr_t *h) { +static sam_hdr_t *sam_hdr_sanitise(sam_hdr_t *h) { if (!h) return NULL; @@ -949,7 +1255,8 @@ static bam_hdr_t *sam_hdr_sanitise(bam_hdr_t *h) { if (h->l_text == 0) return h; - uint32_t i, lnum = 0; + size_t i; + unsigned int lnum = 0; char *cp = h->text, last = '\n'; for (i = 0; i < h->l_text; i++) { // NB: l_text excludes terminating nul. This finds early ones. 
@@ -961,7 +1268,7 @@ static bam_hdr_t *sam_hdr_sanitise(bam_hdr_t *h) { lnum++; if (cp[i] != '@') { hts_log_error("Malformed SAM header at line %u", lnum); - bam_hdr_destroy(h); + sam_hdr_destroy(h); return NULL; } } @@ -970,7 +1277,7 @@ static bam_hdr_t *sam_hdr_sanitise(bam_hdr_t *h) { } if (i < h->l_text) { // Early nul found. Complain if not just padding. - uint32_t j = i; + size_t j = i; while (j < h->l_text && cp[j] == '\0') j++; if (j < h->l_text) hts_log_warning("Unexpected NUL character in header. Possibly truncated"); @@ -980,16 +1287,16 @@ static bam_hdr_t *sam_hdr_sanitise(bam_hdr_t *h) { if (last != '\n') { hts_log_warning("Missing trailing newline on SAM header. Possibly truncated"); - if (h->l_text == UINT32_MAX) { - hts_log_error("No room for extra newline"); - bam_hdr_destroy(h); - return NULL; - } + if (h->l_text < 2 || i >= h->l_text - 2) { + if (h->l_text >= SIZE_MAX - 2) { + hts_log_error("No room for extra newline"); + sam_hdr_destroy(h); + return NULL; + } - if (i >= h->l_text - 1) { cp = realloc(h->text, (size_t) h->l_text+2); if (!cp) { - bam_hdr_destroy(h); + sam_hdr_destroy(h); return NULL; } h->text = cp; @@ -1005,75 +1312,265 @@ static bam_hdr_t *sam_hdr_sanitise(bam_hdr_t *h) { return h; } -bam_hdr_t *sam_hdr_read(htsFile *fp) -{ - if (!fp) { - errno = EINVAL; - return NULL; - } +static sam_hdr_t *sam_hdr_create(htsFile* fp) { + kstring_t str = { 0, 0, NULL }; + khint_t k; + sam_hdr_t* h = sam_hdr_init(); + const char *q, *r; + char* sn = NULL; + khash_t(s2i) *d = kh_init(s2i); + khash_t(s2i) *long_refs = NULL; + if (!h || !d) + goto error; + + int ret, has_SQ = 0; + int next_c = '@'; + while (next_c == '@' && (ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) >= 0) { + if (fp->line.s[0] != '@') + break; - switch (fp->format.format) { - case bam: - return sam_hdr_sanitise(bam_hdr_read(fp->fp.bgzf)); + if (fp->line.l > 3 && strncmp(fp->line.s, "@SQ", 3) == 0) { + has_SQ = 1; + hts_pos_t ln = -1; + for (q = fp->line.s + 4;; ++q) { + if 
(strncmp(q, "SN:", 3) == 0) { + q += 3; + for (r = q;*r != '\t' && *r != '\n' && *r != '\0';++r); - case cram: - return sam_hdr_sanitise(cram_header_to_bam(fp->fp.cram->header)); + if (sn) { + hts_log_warning("SQ header line has more than one SN: tag"); + free(sn); + } + sn = (char*)calloc(r - q + 1, 1); + if (!sn) + goto error; - case sam: { - kstring_t str = { 0, 0, NULL }; - bam_hdr_t *h = NULL; - int ret, has_SQ = 0; - while ((ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) >= 0) { - if (fp->line.s[0] != '@') break; - if (fp->line.l > 3 && strncmp(fp->line.s,"@SQ",3) == 0) has_SQ = 1; - kputsn(fp->line.s, fp->line.l, &str); - kputc('\n', &str); - } - if (ret < -1) goto error; - if (! has_SQ && fp->fn_aux) { - kstring_t line = { 0, 0, NULL }; - hFILE *f = hopen(fp->fn_aux, "r"); - if (f == NULL) goto error; - while (line.l = 0, kgetline(&line, (kgets_func *) hgets, f) >= 0) { - char *tab = strchr(line.s, '\t'); - if (tab == NULL) continue; - kputs("@SQ\tSN:", &str); - kputsn(line.s, tab - line.s, &str); - kputs("\tLN:", &str); - kputl(atol(tab), &str); - kputc('\n', &str); + strncpy(sn, q, r - q); + q = r; + } else { + if (strncmp(q, "LN:", 3) == 0) + ln = strtoll(q + 3, (char**)&q, 10); + } + + while (*q != '\t' && *q != '\n' && *q != '\0') + ++q; + if (*q == '\0' || *q == '\n') + break; } - free(line.s); - if (hclose(f) != 0) { - hts_log_warning("Failed to close %s", fp->fn_aux); + if (sn) { + if (ln >= 0) { + int absent; + k = kh_put(s2i, d, sn, &absent); + if (absent < 0) + goto error; + + if (!absent) { + hts_log_warning("Duplicated sequence '%s'", sn); + free(sn); + } else { + if (ln >= UINT32_MAX) { + // Stash away ref length that + // doesn't fit in target_len array + int k2; + if (!long_refs) { + long_refs = kh_init(s2i); + if (!long_refs) + goto error; + } + k2 = kh_put(s2i, long_refs, sn, &absent); + if (absent < 0) + goto error; + kh_val(long_refs, k2) = ln; + kh_val(d, k) = ((int64_t) (kh_size(d) - 1) << 32 + | UINT32_MAX); + } else { + kh_val(d, k) 
= (int64_t) (kh_size(d) - 1) << 32 | ln; + } + } + } else { + hts_log_warning("Ignored @SQ SN:%s : bad or missing LN tag", sn); + free(sn); + } + } else { + hts_log_warning("Ignored @SQ line with missing SN: tag"); } + sn = NULL; } - if (str.l == 0) kputsn("", 0, &str); - h = sam_hdr_parse(str.l, str.s); - h->l_text = str.l; h->text = str.s; - return sam_hdr_sanitise(h); + if (kputsn(fp->line.s, fp->line.l, &str) < 0) + goto error; - error: - bam_hdr_destroy(h); - free(str.s); - return NULL; - } + if (kputc('\n', &str) < 0) + goto error; - default: - abort(); + if (fp->format.compression == bgzf) { + next_c = bgzf_peek(fp->fp.bgzf); + } else { + unsigned char nc; + ssize_t pret = hpeek(fp->fp.hfile, &nc, 1); + next_c = pret > 0 ? nc : pret - 1; + } + if (next_c < -1) + goto error; } -} + if (next_c != '@') + fp->line.l = 0; -int sam_hdr_write(htsFile *fp, const bam_hdr_t *h) -{ - if (!fp || !h) { - errno = EINVAL; - return -1; - } + if (ret < -1) + goto error; - switch (fp->format.format) { - case binary_format: - fp->format.category = sequence_data; + if (!has_SQ && fp->fn_aux) { + kstring_t line = { 0, 0, NULL }; + hFILE* f = hopen(fp->fn_aux, "r"); + int e = 0, absent; + if (f == NULL) + goto error; + + while (line.l = 0, kgetline(&line, (kgets_func*) hgets, f) >= 0) { + char* tab = strchr(line.s, '\t'); + hts_pos_t ln; + + if (tab == NULL) + continue; + + sn = (char*)calloc(tab-line.s+1, 1); + if (!sn) + break; + memcpy(sn, line.s, tab-line.s); + k = kh_put(s2i, d, sn, &absent); + if (absent < 0) + break; + + ln = strtoll(tab, NULL, 10); + + if (!absent) { + hts_log_warning("Duplicated sequence '%s'", sn); + free(sn); + } else { + if (ln >= UINT32_MAX) { + // Stash away ref length that + // doesn't fit in target_len array + khint_t k2; + int absent = -1; + if (!long_refs) { + long_refs = kh_init(s2i); + if (!long_refs) + goto error; + } + k2 = kh_put(s2i, long_refs, sn, &absent); + if (absent < 0) + goto error; + kh_val(long_refs, k2) = ln; + kh_val(d, k) = 
((int64_t) (kh_size(d) - 1) << 32 + | UINT32_MAX); + } else { + kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln; + } + has_SQ = 1; + } + + e |= kputs("@SQ\tSN:", &str) < 0; + e |= kputsn(line.s, tab - line.s, &str) < 0; + e |= kputs("\tLN:", &str) < 0; + e |= kputll(ln, &str) < 0; + e |= kputc('\n', &str) < 0; + if (e) + break; + } + + ks_free(&line); + if (hclose(f) != 0) { + hts_log_error("Error on closing %s", fp->fn_aux); + e = 1; + } + if (e) + goto error; + } + + if (has_SQ) { + // Populate the targets array + h->n_targets = kh_size(d); + + h->target_name = (char**) malloc(sizeof(char*) * h->n_targets); + if (!h->target_name) + goto error; + + h->target_len = (uint32_t*) malloc(sizeof(uint32_t) * h->n_targets); + if (!h->target_len) + goto error; + + for (k = kh_begin(d); k != kh_end(d); ++k) { + if (!kh_exist(d, k)) + continue; + + h->target_name[kh_val(d, k) >> 32] = (char*) kh_key(d, k); + h->target_len[kh_val(d, k) >> 32] = kh_val(d, k) & 0xffffffffUL; + kh_val(d, k) >>= 32; + } + } + + // Repurpose sdict to hold any references longer than UINT32_MAX + h->sdict = long_refs; + + kh_destroy(s2i, d); + + if (str.l == 0) + kputsn("", 0, &str); + h->l_text = str.l; + h->text = ks_release(&str); + fp->bam_header = sam_hdr_sanitise(h); + fp->bam_header->ref_count = 1; + + return fp->bam_header; + + error: + sam_hdr_destroy(h); + ks_free(&str); + kh_destroy(s2i, d); + kh_destroy(s2i, long_refs); + if (sn) free(sn); + return NULL; +} + +sam_hdr_t *sam_hdr_read(htsFile *fp) +{ + if (!fp) { + errno = EINVAL; + return NULL; + } + + switch (fp->format.format) { + case bam: + return sam_hdr_sanitise(bam_hdr_read(fp->fp.bgzf)); + + case cram: + return sam_hdr_sanitise(sam_hdr_dup(fp->fp.cram->header)); + + case sam: + return sam_hdr_create(fp); + + case empty_format: + errno = EPIPE; + return NULL; + + default: + errno = EFTYPE; + return NULL; + } +} + +int sam_hdr_write(htsFile *fp, const sam_hdr_t *h) +{ + if (!fp || !h) { + errno = EINVAL; + return -1; + } + + 
if (!h->hrecs && !h->text) + return 0; + + switch (fp->format.format) { + case binary_format: + fp->format.category = sequence_data; fp->format.format = bam; /* fall-through */ case bam: @@ -1082,9 +1579,7 @@ int sam_hdr_write(htsFile *fp, const bam_hdr_t *h) case cram: { cram_fd *fd = fp->fp.cram; - SAM_hdr *hdr = bam_header_to_cram((bam_hdr_t *)h); - if (! hdr) return -1; - if (cram_set_header(fd, hdr) < 0) return -1; + if (cram_set_header2(fd, h) < 0) return -1; if (fp->fn_aux) cram_load_reference(fd, fp->fn_aux); if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1; @@ -1096,31 +1591,77 @@ int sam_hdr_write(htsFile *fp, const bam_hdr_t *h) fp->format.format = sam; /* fall-through */ case sam: { - char *p; - hputs(h->text, fp->fp.hfile); - p = strstr(h->text, "@SQ\t"); // FIXME: we need a loop to make sure "@SQ\t" does not match something unwanted!!! - if (p == 0) { + char *text; + kstring_t hdr_ks = { 0, 0, NULL }; + size_t l_text; + ssize_t bytes; + int r = 0, no_sq = 0; + + if (h->hrecs) { + if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0) + return -1; + text = hdr_ks.s; + l_text = hdr_ks.l; + } else { + const char *p = NULL; + do { + const char *q = p == NULL ? 
h->text : p + 4; + p = strstr(q, "@SQ\t"); + } while (!(p == NULL || p == h->text || *(p - 1) == '\n')); + no_sq = p == NULL; + text = h->text; + l_text = h->l_text; + } + + if (fp->format.compression == bgzf) { + bytes = bgzf_write(fp->fp.bgzf, text, l_text); + } else { + bytes = hwrite(fp->fp.hfile, text, l_text); + } + free(hdr_ks.s); + if (bytes != l_text) + return -1; + + if (no_sq) { int i; for (i = 0; i < h->n_targets; ++i) { fp->line.l = 0; - kputsn("@SQ\tSN:", 7, &fp->line); kputs(h->target_name[i], &fp->line); - kputsn("\tLN:", 4, &fp->line); kputw(h->target_len[i], &fp->line); kputc('\n', &fp->line); - if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1; + r |= kputsn("@SQ\tSN:", 7, &fp->line) < 0; + r |= kputs(h->target_name[i], &fp->line) < 0; + r |= kputsn("\tLN:", 4, &fp->line) < 0; + r |= kputw(h->target_len[i], &fp->line) < 0; + r |= kputc('\n', &fp->line) < 0; + if (r != 0) + return -1; + + if (fp->format.compression == bgzf) { + bytes = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l); + } else { + bytes = hwrite(fp->fp.hfile, fp->line.s, fp->line.l); + } + if (bytes != fp->line.l) + return -1; } } - if ( hflush(fp->fp.hfile) != 0 ) return -1; + if (fp->format.compression == bgzf) { + if (bgzf_flush(fp->fp.bgzf) != 0) return -1; + } else { + if (hflush(fp->fp.hfile) != 0) return -1; + } } break; default: - abort(); + errno = EBADF; + return -1; } return 0; } -int sam_hdr_change_HD(bam_hdr_t *h, const char *key, const char *val) +static int old_sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val) { char *p, *q, *beg = NULL, *end = NULL, *newtext; + size_t new_l_text; if (!h || !key) return -1; @@ -1150,103 +1691,315 @@ int sam_hdr_change_HD(bam_hdr_t *h, const char *key, const char *val) } } if (beg == NULL) { // no @HD - if (h->l_text > UINT32_MAX - strlen(SAM_FORMAT_VERSION) - 9) + new_l_text = h->l_text; + if (new_l_text > SIZE_MAX - strlen(SAM_FORMAT_VERSION) - 9) return -1; - h->l_text += 
strlen(SAM_FORMAT_VERSION) + 8; + new_l_text += strlen(SAM_FORMAT_VERSION) + 8; if (val) { - if (h->l_text > UINT32_MAX - strlen(val) - 5) + if (new_l_text > SIZE_MAX - strlen(val) - 5) return -1; - h->l_text += strlen(val) + 4; + new_l_text += strlen(val) + 4; } - newtext = (char*)malloc(h->l_text + 1); + newtext = (char*)malloc(new_l_text + 1); if (!newtext) return -1; if (val) - snprintf(newtext, h->l_text + 1, + snprintf(newtext, new_l_text + 1, "@HD\tVN:%s\t%s:%s\n%s", SAM_FORMAT_VERSION, key, val, h->text); else - snprintf(newtext, h->l_text + 1, + snprintf(newtext, new_l_text + 1, "@HD\tVN:%s\n%s", SAM_FORMAT_VERSION, h->text); } else { // has @HD but different or no key - h->l_text = (beg - h->text) + (h->text + h->l_text - end); + new_l_text = (beg - h->text) + (h->text + h->l_text - end); if (val) { - if (h->l_text > UINT32_MAX - strlen(val) - 5) + if (new_l_text > SIZE_MAX - strlen(val) - 5) return -1; - h->l_text += strlen(val) + 4; + new_l_text += strlen(val) + 4; } - newtext = (char*)malloc(h->l_text + 1); + newtext = (char*)malloc(new_l_text + 1); if (!newtext) return -1; if (val) { - snprintf(newtext, h->l_text + 1, "%.*s\t%s:%s%s", + snprintf(newtext, new_l_text + 1, "%.*s\t%s:%s%s", (int) (beg - h->text), h->text, key, val, end); } else { //delete key - snprintf(newtext, h->l_text + 1, "%.*s%s", + snprintf(newtext, new_l_text + 1, "%.*s%s", (int) (beg - h->text), h->text, end); } } free(h->text); h->text = newtext; + h->l_text = new_l_text; return 0; } + +int sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val) +{ + if (!h || !key) + return -1; + + if (!h->hrecs) + return old_sam_hdr_change_HD(h, key, val); + + if (val) { + if (sam_hdr_update_line(h, "HD", NULL, NULL, key, val, NULL) != 0) + return -1; + } else { + if (sam_hdr_remove_tag_id(h, "HD", NULL, NULL, key) != 0) + return -1; + } + return sam_hdr_rebuild(h); +} /********************** *** SAM record I/O *** **********************/ -int sam_parse1(kstring_t *s, bam_hdr_t *h, 
bam1_t *b) +static int sam_parse_B_vals(char type, uint32_t n, char *in, char **end, + char *r, bam1_t *b) { -#define _read_token(_p) (_p); for (; *(_p) && *(_p) != '\t'; ++(_p)); if (*(_p) != '\t') goto err_ret; *(_p)++ = 0 -#define _read_token_aux(_p) (_p); for (; *(_p) && *(_p) != '\t'; ++(_p)); *(_p)++ = 0 // this is different in that it does not test *(_p)=='\t' -#define _get_mem(type_t, _x, _s, _l) ks_resize((_s), (_s)->l + (_l)); *(_x) = (type_t*)((_s)->s + (_s)->l); (_s)->l += (_l) + int orig_l = b->l_data; + char *q = in; + int32_t size; + size_t bytes; + int overflow = 0; + + size = aux_type2size(type); + if (size <= 0 || size > 4) { + hts_log_error("Unrecognized type B:%c", type); + return -1; + } + + // Ensure space for type + values + bytes = (size_t) n * (size_t) size; + if (bytes / size != n + || possibly_expand_bam_data(b, bytes + 2 + sizeof(uint32_t))) { + hts_log_error("Out of memory"); + return -1; + } + + b->data[b->l_data++] = 'B'; + b->data[b->l_data++] = type; + i32_to_le(n, b->data + b->l_data); + b->l_data += sizeof(uint32_t); + // This ensures that q always ends up at the next comma after + // reading a number even if it's followed by junk. It + // prevents the possibility of trying to read more than n items. 
+#define skip_to_comma_(q) do { while (*(q) > '\t' && *(q) != ',') (q)++; } while (0) + if (type == 'c') { + while (q < r) { + *(b->data + b->l_data) = hts_str2int(q + 1, &q, 8, &overflow); + b->l_data++; + skip_to_comma_(q); + } + } else if (type == 'C') { + while (q < r) { + if (*q != '-') { + *(b->data + b->l_data) = hts_str2uint(q + 1, &q, 8, &overflow); + b->l_data++; + } else { + overflow = 1; + } + skip_to_comma_(q); + } + } else if (type == 's') { + while (q < r) { + i16_to_le(hts_str2int(q + 1, &q, 16, &overflow), b->data + b->l_data); + b->l_data += 2; + skip_to_comma_(q); + } + } else if (type == 'S') { + while (q < r) { + if (*q != '-') { + u16_to_le(hts_str2uint(q + 1, &q, 16, &overflow), b->data + b->l_data); + b->l_data += 2; + } else { + overflow = 1; + } + skip_to_comma_(q); + } + } else if (type == 'i') { + while (q < r) { + i32_to_le(hts_str2int(q + 1, &q, 32, &overflow), b->data + b->l_data); + b->l_data += 4; + skip_to_comma_(q); + } + } else if (type == 'I') { + while (q < r) { + if (*q != '-') { + u32_to_le(hts_str2uint(q + 1, &q, 32, &overflow), b->data + b->l_data); + b->l_data += 4; + } else { + overflow = 1; + } + skip_to_comma_(q); + } + } else if (type == 'f') { + while (q < r) { + float_to_le(strtod(q + 1, &q), b->data + b->l_data); + b->l_data += 4; + skip_to_comma_(q); + } + } else { + hts_log_error("Unrecognized type B:%c", type); + return -1; + } + + if (!overflow) { + *end = q; + return 0; + } else { + int64_t max = 0, min = 0, val; + // Given type was incorrect. Try to rescue the situation. 
+ q = in; + overflow = 0; + b->l_data = orig_l; + // Find out what range of values is present + while (q < r) { + val = hts_str2int(q + 1, &q, 64, &overflow); + if (max < val) max = val; + if (min > val) min = val; + skip_to_comma_(q); + } + // Retry with appropriate type + if (!overflow) { + if (min < 0) { + if (min >= INT8_MIN && max <= INT8_MAX) { + return sam_parse_B_vals('c', n, in, end, r, b); + } else if (min >= INT16_MIN && max <= INT16_MAX) { + return sam_parse_B_vals('s', n, in, end, r, b); + } else if (min >= INT32_MIN && max <= INT32_MAX) { + return sam_parse_B_vals('i', n, in, end, r, b); + } + } else { + if (max < UINT8_MAX) { + return sam_parse_B_vals('C', n, in, end, r, b); + } else if (max <= UINT16_MAX) { + return sam_parse_B_vals('S', n, in, end, r, b); + } else if (max <= UINT32_MAX) { + return sam_parse_B_vals('I', n, in, end, r, b); + } + } + } + // If here then at least one of the values is too big to store + hts_log_error("Numeric value in B array out of allowed range"); + return -1; + } +#undef skip_to_comma_ +} + +static inline unsigned int parse_sam_flag(char *v, char **rv, int *overflow) { + if (*v >= '1' && *v <= '9') { + return hts_str2uint(v, rv, 16, overflow); + } + else if (*v == '0') { + // handle single-digit "0" directly; otherwise it's hex or octal + if (v[1] == '\t') { *rv = v+1; return 0; } + else { + unsigned long val = strtoul(v, rv, 0); + if (val > 65535) { *overflow = 1; return 65535; } + return val; + } + } + else { + // TODO implement symbolic flag letters + *rv = v; + return 0; + } +} + +int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b) +{ +#define _read_token(_p) (_p); do { char *tab = strchr((_p), '\t'); if (!tab) goto err_ret; *tab = '\0'; (_p) = tab + 1; } while (0) + +#if HTS_ALLOW_UNALIGNED != 0 && ULONG_MAX == 0xffffffffffffffff + +// Macro that operates on 64-bits at a time. 
+#define COPY_MINUS_N(to,from,n,l,failed) \ + do { \ + uint64_u *from8 = (uint64_u *)(from); \ + uint64_u *to8 = (uint64_u *)(to); \ + uint64_t uflow = 0; \ + size_t l8 = (l)>>3, i; \ + for (i = 0; i < l8; i++) { \ + to8[i] = from8[i] - (n)*0x0101010101010101UL; \ + uflow |= to8[i]; \ + } \ + for (i<<=3; i < (l); ++i) { \ + to[i] = from[i] - (n); \ + uflow |= to[i]; \ + } \ + failed = (uflow & 0x8080808080808080UL) > 0; \ + } while (0) + +#else + +// Basic version which operates a byte at a time +#define COPY_MINUS_N(to,from,n,l,failed) do { \ + uint8_t uflow = 0; \ + for (i = 0; i < (l); ++i) { \ + (to)[i] = (from)[i] - (n); \ + uflow |= (uint8_t) (to)[i]; \ + } \ + failed = (uflow & 0x80) > 0; \ + } while (0) + +#endif + +#define _get_mem(type_t, x, b, l) if (possibly_expand_bam_data((b), (l)) < 0) goto err_ret; *(x) = (type_t*)((b)->data + (b)->l_data); (b)->l_data += (l) #define _parse_err(cond, msg) do { if (cond) { hts_log_error(msg); goto err_ret; } } while (0) #define _parse_err_param(cond, msg, param) do { if (cond) { hts_log_error(msg, param); goto err_ret; } } while (0) #define _parse_warn(cond, msg) do { if (cond) { hts_log_warning(msg); } } while (0) uint8_t *t; + char *p = s->s, *q; - int i; - kstring_t str; + int i, overflow = 0; + hts_pos_t cigreflen; bam1_core_t *c = &b->core; - str.l = b->l_data = 0; - str.s = (char*)b->data; str.m = b->m_data; + b->l_data = 0; memset(c, 0, 32); - if (h->cigar_tab == 0) { - h->cigar_tab = (int8_t*) malloc(128); - for (i = 0; i < 128; ++i) - h->cigar_tab[i] = -1; - for (i = 0; BAM_CIGAR_STR[i]; ++i) - h->cigar_tab[(int)BAM_CIGAR_STR[i]] = i; - } + // qname q = _read_token(p); + _parse_warn(p - q <= 1, "empty query name"); - _parse_err(p - q > 252, "query name too long"); - kputsn_(q, p - q, &str); - for (c->l_extranul = 0; str.l % 4 != 0; c->l_extranul++) - kputc_('\0', &str); + _parse_err(p - q > 255, "query name too long"); + // resize large enough for name + extranul + if (possibly_expand_bam_data(b, (p - q) + 
4) < 0) goto err_ret; + memcpy(b->data + b->l_data, q, p-q); b->l_data += p-q; + + c->l_extranul = (4 - (b->l_data & 3)) & 3; + memcpy(b->data + b->l_data, "\0\0\0\0", c->l_extranul); + b->l_data += c->l_extranul; + c->l_qname = p - q + c->l_extranul; + // flag - c->flag = strtol(p, &p, 0); + c->flag = parse_sam_flag(p, &p, &overflow); if (*p++ != '\t') goto err_ret; // malformated flag + // chr q = _read_token(p); if (strcmp(q, "*")) { - _parse_err(h->n_targets == 0, "missing SAM header"); + _parse_err(h->n_targets == 0, "no SQ lines present in the header"); c->tid = bam_name2id(h, q); + _parse_err(c->tid < -1, "failed to parse header"); _parse_warn(c->tid < 0, "urecognized reference name; treated as unmapped"); } else c->tid = -1; + // pos - c->pos = strtol(p, &p, 10) - 1; + c->pos = hts_str2uint(p, &p, 63, &overflow) - 1; if (*p++ != '\t') goto err_ret; if (c->pos < 0 && c->tid >= 0) { _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped"); c->tid = -1; } if (c->tid < 0) c->flag |= BAM_FUNMAP; + // mapq - c->qual = strtol(p, &p, 10); + c->qual = hts_str2uint(p, &p, 8, &overflow); if (*p++ != '\t') goto err_ret; // cigar if (*p != '*') { @@ -1258,23 +2011,25 @@ int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b) _parse_err(n_cigar == 0, "no CIGAR operations"); _parse_err(n_cigar >= 2147483647, "too many CIGAR operations"); c->n_cigar = n_cigar; - _get_mem(uint32_t, &cigar, &str, c->n_cigar * sizeof(uint32_t)); - for (i = 0; i < c->n_cigar; ++i, ++q) { + _get_mem(uint32_t, &cigar, b, c->n_cigar * sizeof(uint32_t)); + for (i = 0; i < c->n_cigar; ++i) { int op; - cigar[i] = strtol(q, &q, 10)<= 128? -1 : h->cigar_tab[(int)*q]; + cigar[i] = hts_str2uint(q, &q, 28, &overflow)<flag&BAM_FUNMAP))? bam_cigar2rlen(c->n_cigar, cigar) : 1; + cigreflen = (!(c->flag&BAM_FUNMAP))? 
bam_cigar2rlen(c->n_cigar, cigar) : 1; } else { _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped"); c->flag |= BAM_FUNMAP; q = _read_token(p); - i = 1; + cigreflen = 1; } - c->bin = hts_reg2bin(c->pos, c->pos + i, 14, 5); + _parse_err(HTS_POS_MAX - cigreflen <= c->pos, + "read ends beyond highest supported position"); + c->bin = hts_reg2bin(c->pos, c->pos + cigreflen, 14, 5); // mate chr q = _read_token(p); if (strcmp(q, "=") == 0) { @@ -1283,163 +2038,842 @@ int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b) c->mtid = -1; } else { c->mtid = bam_name2id(h, q); + _parse_err(c->tid < -1, "failed to parse header"); _parse_warn(c->mtid < 0, "urecognized mate reference name; treated as unmapped"); } - // mpos - c->mpos = strtol(p, &p, 10) - 1; - if (*p++ != '\t') goto err_ret; - if (c->mpos < 0 && c->mtid >= 0) { - _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped"); - c->mtid = -1; + // mpos + c->mpos = hts_str2uint(p, &p, 63, &overflow) - 1; + if (*p++ != '\t') goto err_ret; + if (c->mpos < 0 && c->mtid >= 0) { + _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped"); + c->mtid = -1; + } + // tlen + c->isize = hts_str2int(p, &p, 64, &overflow); + if (*p++ != '\t') goto err_ret; + // seq + q = _read_token(p); + if (strcmp(q, "*")) { + _parse_err(p - q - 1 > INT32_MAX, "read sequence is too long"); + c->l_qseq = p - q - 1; + hts_pos_t ql = bam_cigar2qlen(c->n_cigar, (uint32_t*)(b->data + c->l_qname)); + _parse_err(c->n_cigar && ql != c->l_qseq, "CIGAR and query sequence are of different length"); + i = (c->l_qseq + 1) >> 1; + _get_mem(uint8_t, &t, b, i); + + unsigned int lqs2 = c->l_qseq&~1, i; + for (i = 0; i < lqs2; i+=2) + t[i>>1] = (seq_nt16_table[(unsigned char)q[i]] << 4) | seq_nt16_table[(unsigned char)q[i+1]]; + for (; i < c->l_qseq; ++i) + t[i>>1] = seq_nt16_table[(unsigned char)q[i]] << ((~i&1)<<2); + } else c->l_qseq = 0; + // qual + _get_mem(uint8_t, &t, b, 
c->l_qseq); + if (p[0] == '*' && (p[1] == '\t' || p[1] == '\0')) { + memset(t, 0xff, c->l_qseq); + p += 2; + } else { + int failed = 0; + _parse_err(s->l - (p - s->s) < c->l_qseq + || (p[c->l_qseq] != '\t' && p[c->l_qseq] != '\0'), + "SEQ and QUAL are of different length"); + COPY_MINUS_N(t, p, 33, c->l_qseq, failed); + _parse_err(failed, "invalid QUAL character"); + p += c->l_qseq + 1; + } + // aux + q = p; + p = s->s + s->l; + while (q < p) { + uint8_t type; + _parse_err(p - q < 5, "incomplete aux field"); + _parse_err(q[0] < '!' || q[1] < '!', "invalid aux tag id"); + // Copy over id + if (possibly_expand_bam_data(b, 2) < 0) goto err_ret; + memcpy(b->data + b->l_data, q, 2); b->l_data += 2; + q += 3; type = *q++; ++q; // q points to value + if (type != 'Z' && type != 'H') // the only zero length acceptable fields + _parse_err(*q <= '\t', "incomplete aux field"); + + // Ensure enough space for a double + type allocated. + if (possibly_expand_bam_data(b, 16) < 0) goto err_ret; + + if (type == 'A' || type == 'a' || type == 'c' || type == 'C') { + b->data[b->l_data++] = 'A'; + b->data[b->l_data++] = *q++; + } else if (type == 'i' || type == 'I') { + if (*q == '-') { + int32_t x = hts_str2int(q, &q, 32, &overflow); + if (x >= INT8_MIN) { + b->data[b->l_data++] = 'c'; + b->data[b->l_data++] = x; + } else if (x >= INT16_MIN) { + b->data[b->l_data++] = 's'; + i16_to_le(x, b->data + b->l_data); + b->l_data += 2; + } else { + b->data[b->l_data++] = 'i'; + i32_to_le(x, b->data + b->l_data); + b->l_data += 4; + } + } else { + uint32_t x = hts_str2uint(q, &q, 32, &overflow); + if (x <= UINT8_MAX) { + b->data[b->l_data++] = 'C'; + b->data[b->l_data++] = x; + } else if (x <= UINT16_MAX) { + b->data[b->l_data++] = 'S'; + u16_to_le(x, b->data + b->l_data); + b->l_data += 2; + } else { + b->data[b->l_data++] = 'I'; + u32_to_le(x, b->data + b->l_data); + b->l_data += 4; + } + } + } else if (type == 'f') { + b->data[b->l_data++] = 'f'; + float_to_le(strtod(q, &q), b->data + 
b->l_data); + b->l_data += sizeof(float); + } else if (type == 'd') { + b->data[b->l_data++] = 'd'; + double_to_le(strtod(q, &q), b->data + b->l_data); + b->l_data += sizeof(double); + } else if (type == 'Z' || type == 'H') { + char *end = strchr(q, '\t'); + if (!end) end = q + strlen(q); + _parse_err(type == 'H' && ((end-q)&1) != 0, + "hex field does not have an even number of digits"); + b->data[b->l_data++] = type; + if (possibly_expand_bam_data(b, end - q + 1) < 0) goto err_ret; + memcpy(b->data + b->l_data, q, end - q); + b->l_data += end - q; + b->data[b->l_data++] = '\0'; + q = end; + } else if (type == 'B') { + uint32_t n; + char *r; + type = *q++; // q points to the first ',' following the typing byte + _parse_err(*q && *q != ',' && *q != '\t', + "B aux field type not followed by ','"); + + for (r = q, n = 0; *r > '\t'; ++r) + if (*r == ',') ++n; + + if (sam_parse_B_vals(type, n, q, &q, r, b) < 0) + goto err_ret; + } else _parse_err_param(1, "unrecognized type %c", type); + + while (*q > '\t') { q++; } // Skip any junk to next tab + q++; + } + + _parse_err(overflow != 0, "numeric value out of allowed range"); + + if (bam_tag2cigar(b, 1, 1) < 0) + return -2; + return 0; + +#undef _parse_warn +#undef _parse_err +#undef _parse_err_param +#undef _get_mem +#undef _read_token +err_ret: + return -2; +} + +/* + * ----------------------------------------------------------------------------- + * SAM threading + */ +// Size of SAM text block (reading) +#define NM 240000 +// Number of BAM records (writing) +#define NB 1000 + +struct SAM_state; + +// Output job - a block of BAM records +typedef struct sp_bams { + struct sp_bams *next; + int serial; + + bam1_t *bams; + int nbams, abams; // used and alloc + + struct SAM_state *fd; +} sp_bams; + +// Input job - a block of SAM text +typedef struct sp_lines { + struct sp_lines *next; + int serial; + + char *data; + int data_size; + int alloc; + + struct SAM_state *fd; + sp_bams *bams; +} sp_lines; + +enum sam_cmd { + 
SAM_NONE = 0, + SAM_CLOSE, +}; + +typedef struct SAM_state { + sam_hdr_t *h; + + hts_tpool *p; + int own_pool; + pthread_mutex_t lines_m; + hts_tpool_process *q; + pthread_t dispatcher; + + sp_lines *lines; + sp_bams *bams; + + sp_bams *curr_bam; + int curr_idx; + int serial; + + // Be warned: moving these mutexes around in this struct can reduce + // threading performance by up to 70%! + pthread_mutex_t command_m; + pthread_cond_t command_c; + enum sam_cmd command; + + // One of the E* errno codes + int errcode; + + htsFile *fp; +} SAM_state; + +// Returns a SAM_state struct from a generic hFILE. +// +// Returns NULL on failure. +static SAM_state *sam_state_create(htsFile *fp) { + // Ideally sam_open wouldn't be a #define to hts_open but instead would + // be a redirect call with an additional 'S' mode. This in turn would + // correctly set the designed format to sam instead of a generic + // text_format. + if (fp->format.format != sam && fp->format.format != text_format) + return NULL; + + SAM_state *fd = calloc(1, sizeof(*fd)); + if (!fd) + return NULL; + + fp->state = fd; + fd->fp = fp; + + return fd; +} + +static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str); +static void *sam_format_worker(void *arg); + +static void sam_state_err(SAM_state *fd, int errcode) { + pthread_mutex_lock(&fd->command_m); + if (!fd->errcode) + fd->errcode = errcode; + pthread_mutex_unlock(&fd->command_m); +} + +static void sam_free_sp_bams(sp_bams *b) { + if (!b) + return; + + if (b->bams) { + int i; + for (i = 0; i < b->abams; i++) { + if (b->bams[i].data) + free(b->bams[i].data); + } + free(b->bams); + } + free(b); +} + +// Destroys the state produce by sam_state_create. 
+int sam_state_destroy(htsFile *fp) { + int ret = 0; + + if (!fp->state) + return 0; + + SAM_state *fd = fp->state; + if (fd->p) { + if (fd->h) { + // Notify sam_dispatcher we're closing + pthread_mutex_lock(&fd->command_m); + fd->command = SAM_CLOSE; + pthread_cond_signal(&fd->command_c); + ret = -fd->errcode; + if (!ret) hts_tpool_wake_dispatch(fd->q); // unstick the reader + pthread_mutex_unlock(&fd->command_m); + + if (fp->is_write) { + // Dispatch the last partial block. + sp_bams *gb = fd->curr_bam; + if (!ret && gb && gb->nbams > 0) + ret = hts_tpool_dispatch(fd->p, fd->q, sam_format_worker, gb); + + // Flush and drain output + hts_tpool_process_flush(fd->q); + pthread_mutex_lock(&fd->command_m); + if (!ret) ret = -fd->errcode; + pthread_mutex_unlock(&fd->command_m); + + while (!ret && !hts_tpool_process_empty(fd->q)) { + usleep(10000); + pthread_mutex_lock(&fd->command_m); + if (!ret) ret = -fd->errcode; + pthread_mutex_unlock(&fd->command_m); + } + hts_tpool_process_shutdown(fd->q); + } + + // Wait for it to acknowledge + pthread_join(fd->dispatcher, NULL); + if (!ret) ret = -fd->errcode; + } + + // Tidy up memory + if (fd->q) + hts_tpool_process_destroy(fd->q); + + if (fd->own_pool && fp->format.compression == no_compression) { + hts_tpool_destroy(fd->p); + fd->p = NULL; + } + pthread_mutex_destroy(&fd->lines_m); + pthread_mutex_destroy(&fd->command_m); + pthread_cond_destroy(&fd->command_c); + + sp_lines *l = fd->lines; + while (l) { + sp_lines *n = l->next; + free(l->data); + free(l); + l = n; + } + + sp_bams *b = fd->bams; + while (b) { + if (fd->curr_bam == b) + fd->curr_bam = NULL; + sp_bams *n = b->next; + sam_free_sp_bams(b); + b = n; + } + + if (fd->curr_bam) + sam_free_sp_bams(fd->curr_bam); + + // Decrement counter by one, maybe destroying too. + // This is to permit the caller using bam_hdr_destroy + // before sam_close without triggering decode errors + // in the background threads. 
+ bam_hdr_destroy(fd->h); + } + + free(fp->state); + fp->state = NULL; + return ret; +} + +// Run from one of the worker threads. +// Convert a passed in array of lines to array of BAMs, returning +// the result back to the thread queue. +static void *sam_parse_worker(void *arg) { + sp_lines *gl = (sp_lines *)arg; + sp_bams *gb = NULL; + char *lines = gl->data; + int i; + bam1_t *b; + SAM_state *fd = gl->fd; + + // Use a block of BAM structs we had earlier if available. + pthread_mutex_lock(&fd->lines_m); + if (fd->bams) { + gb = fd->bams; + fd->bams = gb->next; + } + pthread_mutex_unlock(&fd->lines_m); + + if (gb == NULL) { + gb = calloc(1, sizeof(*gb)); + if (!gb) { + return NULL; + } + gb->abams = 100; + gb->bams = b = calloc(gb->abams, sizeof(*b)); + if (!gb->bams) { + sam_state_err(fd, ENOMEM); + goto err; + } + gb->nbams = 0; + } + gb->serial = gl->serial; + gb->next = NULL; + + b = (bam1_t *)gb->bams; + if (!b) { + sam_state_err(fd, ENOMEM); + goto err; + } + + i = 0; + char *cp = lines, *cp_end = lines + gl->data_size; + while (cp < cp_end) { + if (i >= gb->abams) { + int old_abams = gb->abams; + gb->abams *= 2; + b = (bam1_t *)realloc(gb->bams, gb->abams*sizeof(bam1_t)); + if (!b) { + sam_state_err(fd, ENOMEM); + goto err; + } + memset(&b[old_abams], 0, (gb->abams - old_abams)*sizeof(*b)); + gb->bams = b; + } + + // Ideally we'd get sam_parse1 to return the number of + // bytes decoded and to be able to stop on newline as + // well as \0. + // + // We can then avoid the additional strchr loop. + // It's around 6% of our CPU cost, albeit threadable. + // + // However this is an API change so for now we copy. + + char *nl = strchr(cp, '\n'); + nl = nl ? 
nl : cp_end; + if (*nl) *nl++ = '\0'; + kstring_t ks = {nl-cp, gl->alloc, cp}; + if (sam_parse1(&ks, fd->h, &b[i]) < 0) { + sam_state_err(fd, EIO); + goto err; + } + cp = nl; + i++; + } + gb->nbams = i; + + pthread_mutex_lock(&fd->lines_m); + gl->next = fd->lines; + fd->lines = gl; + pthread_mutex_unlock(&fd->lines_m); + return gb; + + err: + sam_free_sp_bams(gb); + return NULL; +} + +static void *sam_parse_eof(void *arg) { + return NULL; +} + +// Cleanup function - job for sam_parse_worker; result for sam_format_worker +static void cleanup_sp_lines(void *arg) { + sp_lines *gl = (sp_lines *)arg; + + if (!gl) return; + + // Should always be true for lines passed to / from thread workers + assert(gl->next == NULL); + + free(gl->data); + sam_free_sp_bams(gl->bams); + free(gl); +} + +// Cleanup function - result for sam_parse_worker; job for sam_format_worker +static void cleanup_sp_bams(void *arg) { + sam_free_sp_bams((sp_bams *) arg); +} + +// Runs in its own thread. +// Reads a block of text (SAM) and sends a new job to the thread queue to +// translate this to BAM. +static void *sam_dispatcher_read(void *vp) { + htsFile *fp = vp; + kstring_t line = {0}; + int line_frag = 0; + SAM_state *fd = fp->state; + sp_lines *l = NULL; + + // Pre-allocate buffer for left-over bits of line (exact size doesn't + // matter as it will grow if necessary). 
+ if (ks_resize(&line, 1000) < 0) + goto err; + + for (;;) { + // Check for command + pthread_mutex_lock(&fd->command_m); + switch (fd->command) { + + case SAM_CLOSE: + pthread_cond_signal(&fd->command_c); + pthread_mutex_unlock(&fd->command_m); + hts_tpool_process_destroy(fd->q); + fd->q = NULL; + goto tidyup; + + default: + break; + } + pthread_mutex_unlock(&fd->command_m); + + pthread_mutex_lock(&fd->lines_m); + if (fd->lines) { + // reuse existing line buffer + l = fd->lines; + fd->lines = l->next; + } + pthread_mutex_unlock(&fd->lines_m); + + if (l == NULL) { + // none to reuse, to create a new one + l = calloc(1, sizeof(*l)); + if (!l) + goto err; + l->alloc = NM+8; // +8 for optimisation in sam_parse1 + l->data = malloc(l->alloc); + if (!l->data) { + free(l); + l = NULL; + goto err; + } + l->fd = fd; + } + l->next = NULL; + + if (l->alloc+NM/2 < line_frag) { + char *rp = realloc(l->data, line_frag+NM/2); + if (!rp) + goto err; + l->alloc = line_frag+NM/2; + l->data = rp; + } + memcpy(l->data, line.s, line_frag); + + l->data_size = line_frag; + ssize_t nbytes; + longer_line: + if (fp->is_bgzf) + nbytes = bgzf_read(fp->fp.bgzf, l->data + line_frag, l->alloc - line_frag); + else + nbytes = hread(fp->fp.hfile, l->data + line_frag, l->alloc - line_frag); + if (nbytes < 0) { + sam_state_err(fd, EIO); + goto err; + } else if (nbytes == 0) + break; // EOF + l->data_size += nbytes; + + // trim to last \n. 
Maybe \r\n, but that's still fine + if (nbytes == l->alloc - line_frag) { + char *cp_end = l->data + l->data_size; + char *cp = cp_end-1; + + while (cp > (char *)l->data && *cp != '\n') + cp--; + + // entire buffer is part of a single line + if (cp == l->data) { + line_frag = l->data_size; + char *rp = realloc(l->data, l->alloc * 2); + if (!rp) + goto err; + l->alloc *= 2; + l->data = rp; + assert(l->alloc >= l->data_size); + assert(l->alloc >= line_frag); + assert(l->alloc >= l->alloc - line_frag); + goto longer_line; + } + cp++; + + // line holds the remainder of our line. + if (ks_resize(&line, cp_end - cp) < 0) + goto err; + memcpy(line.s, cp, cp_end - cp); + line_frag = cp_end - cp; + l->data_size = l->alloc - line_frag; + } else { + // out of buffer + line_frag = 0; + } + + l->serial = fd->serial++; + //fprintf(stderr, "Dispatching %p, %d bytes, serial %d\n", l, l->data_size, l->serial); + if (hts_tpool_dispatch3(fd->p, fd->q, sam_parse_worker, l, + cleanup_sp_lines, cleanup_sp_bams, 0) < 0) + goto err; + l = NULL; // Now "owned" by sam_parse_worker() + } + + if (hts_tpool_dispatch(fd->p, fd->q, sam_parse_eof, NULL) < 0) + goto err; + + // At EOF, wait for close request. + // (In future if we add support for seek, this is where we need to catch it.) 
+ for (;;) { + pthread_mutex_lock(&fd->command_m); + if (fd->command == SAM_NONE) + pthread_cond_wait(&fd->command_c, &fd->command_m); + switch (fd->command) { + case SAM_CLOSE: + pthread_cond_signal(&fd->command_c); + pthread_mutex_unlock(&fd->command_m); + hts_tpool_process_destroy(fd->q); + fd->q = NULL; + goto tidyup; + + default: + pthread_mutex_unlock(&fd->command_m); + break; + } } - // tlen - c->isize = strtol(p, &p, 10); - if (*p++ != '\t') goto err_ret; - // seq - q = _read_token(p); - if (strcmp(q, "*")) { - c->l_qseq = p - q - 1; - i = bam_cigar2qlen(c->n_cigar, (uint32_t*)(str.s + c->l_qname)); - _parse_err(c->n_cigar && i != c->l_qseq, "CIGAR and query sequence are of different length"); - i = (c->l_qseq + 1) >> 1; - _get_mem(uint8_t, &t, &str, i); - memset(t, 0, i); - for (i = 0; i < c->l_qseq; ++i) - t[i>>1] |= seq_nt16_table[(unsigned char)q[i]] << ((~i&1)<<2); - } else c->l_qseq = 0; - // qual - q = _read_token_aux(p); - _get_mem(uint8_t, &t, &str, c->l_qseq); - if (strcmp(q, "*")) { - _parse_err(p - q - 1 != c->l_qseq, "SEQ and QUAL are of different length"); - for (i = 0; i < c->l_qseq; ++i) t[i] = q[i] - 33; - } else memset(t, 0xff, c->l_qseq); - // aux - while (p < s->s + s->l) { - uint8_t type; - q = _read_token_aux(p); // FIXME: can be accelerated for long 'B' arrays - _parse_err(p - q - 1 < 5, "incomplete aux field"); - kputsn_(q, 2, &str); - q += 3; type = *q++; ++q; // q points to value - if (type != 'Z' && type != 'H') // the only zero length acceptable fields - _parse_err(p - q - 1 < 1, "incomplete aux field"); - // Ensure str has enough space for a double + type allocated. - // This is so we can stuff bigger integers and floats directly into - // the kstring. Sorry. 
- _parse_err(ks_resize(&str, str.l + 16), "out of memory"); + tidyup: + if (l) { + pthread_mutex_lock(&fd->lines_m); + l->next = fd->lines; + fd->lines = l; + pthread_mutex_unlock(&fd->lines_m); + } + free(line.s); - if (type == 'A' || type == 'a' || type == 'c' || type == 'C') { - kputc_('A', &str); - kputc_(*q, &str); - } else if (type == 'i' || type == 'I') { - if (*q == '-') { - long x = strtol(q, &q, 10); - if (x >= INT8_MIN) { - kputc_('c', &str); kputc_(x, &str); - } else if (x >= INT16_MIN) { - str.s[str.l++] = 's'; - i16_to_le(x, (uint8_t *) str.s + str.l); - str.l += 2; + return NULL; + + err: + sam_state_err(fd, ENOMEM); + hts_tpool_process_destroy(fd->q); + fd->q = NULL; + goto tidyup; +} + +// Runs in its own thread. +// Takes encoded blocks of SAM off the thread results queue and writes them +// to our output stream. +static void *sam_dispatcher_write(void *vp) { + htsFile *fp = vp; + SAM_state *fd = fp->state; + hts_tpool_result *r; + + // Iterates until result queue is shutdown, where it returns NULL. 
+ while ((r = hts_tpool_next_result_wait(fd->q))) { + sp_lines *gl = (sp_lines *)hts_tpool_result_data(r); + if (!gl) { + sam_state_err(fd, ENOMEM); + goto err; + } + + if (fp->idx) { + sp_bams *gb = gl->bams; + int i = 0, count = 0; + while (i < gl->data_size) { + int j = i; + while (i < gl->data_size && gl->data[i] != '\n') + i++; + if (i < gl->data_size) + i++; + + if (fp->format.compression == bgzf) { + if (bgzf_write(fp->fp.bgzf, &gl->data[j], i-j) != i-j) + goto err; } else { - str.s[str.l++] = 'i'; - i32_to_le(x, (uint8_t *) str.s + str.l); - str.l += 4; + if (hwrite(fp->fp.hfile, &gl->data[j], i-j) != i-j) + goto err; } - } else { - unsigned long x = strtoul(q, &q, 10); - if (x <= UINT8_MAX) { - kputc_('C', &str); kputc_(x, &str); - } else if (x <= UINT16_MAX) { - str.s[str.l++] = 'S'; - u16_to_le(x, (uint8_t *) str.s + str.l); - str.l += 2; + + bam1_t *b = &gb->bams[count++]; + if (fp->format.compression == bgzf) { + if (bgzf_idx_push(fp->fp.bgzf, fp->idx, + b->core.tid, b->core.pos, bam_endpos(b), + bgzf_tell(fp->fp.bgzf), + !(b->core.flag&BAM_FUNMAP)) < 0) { + sam_state_err(fd, errno ? errno : ENOMEM); + hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", + bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1); + goto err; + } } else { - str.s[str.l++] = 'I'; - u32_to_le(x, (uint8_t *) str.s + str.l); - str.l += 4; + if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b), + bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) { + sam_state_err(fd, errno ? 
errno : ENOMEM); + hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", + bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1); + goto err; + } } } - } else if (type == 'f') { - str.s[str.l++] = 'f'; - float_to_le(strtod(q, &q), (uint8_t *) str.s + str.l); - str.l += sizeof(float); - } else if (type == 'd') { - str.s[str.l++] = 'd'; - double_to_le(strtod(q, &q), (uint8_t *) str.s + str.l); - str.l += sizeof(double); - } else if (type == 'Z' || type == 'H') { - _parse_err(type == 'H' && !((p-q)&1), - "hex field does not have an even number of digits"); - kputc_(type, &str);kputsn_(q, p - q, &str); // note that this include the trailing NULL - } else if (type == 'B') { - int32_t n, size; - size_t bytes; - char *r; - _parse_err(p - q - 1 < 3, "incomplete B-typed aux field"); - type = *q++; // q points to the first ',' following the typing byte - size = aux_type2size(type); - _parse_err_param(size <= 0 || size > 4, - "unrecognized type B:%c", type); - _parse_err(*q && *q != ',', "B aux field type not followed by ','"); + assert(count == gb->nbams); - for (r = q, n = 0; *r; ++r) - if (*r == ',') ++n; + // Add bam array to free-list + pthread_mutex_lock(&fd->lines_m); + gb->next = fd->bams; + fd->bams = gl->bams; + gl->bams = NULL; + pthread_mutex_unlock(&fd->lines_m); + } else { + if (fp->format.compression == bgzf) { + if (bgzf_write(fp->fp.bgzf, gl->data, gl->data_size) != gl->data_size) + goto err; + } else { + if (hwrite(fp->fp.hfile, gl->data, gl->data_size) != gl->data_size) + goto err; + } + } - // Ensure space for type + values - bytes = (size_t) n * (size_t) size; - _parse_err(bytes / size != n - || ks_resize(&str, str.l + bytes + 2 + sizeof(uint32_t)), - "out of memory"); - str.s[str.l++] = 'B'; - str.s[str.l++] = type; - i32_to_le(n, (uint8_t *) str.s + str.l); - str.l += sizeof(uint32_t); - - // This ensures that q always 
ends up at the next comma after - // reading a number even if it's followed by junk. It - // prevents the possibility of trying to read more than n items. -#define _skip_to_comma(q, p) do { while ((q) < (p) && *(q) != ',') (q)++; } while (0) - - if (type == 'c') while (q + 1 < p) { int8_t x = strtol(q + 1, &q, 0); kputc_(x, &str); } - else if (type == 'C') while (q + 1 < p) { uint8_t x = strtoul(q + 1, &q, 0); kputc_(x, &str); } - else if (type == 's') while (q + 1 < p) { i16_to_le(strtol(q + 1, &q, 0), (uint8_t *) str.s + str.l); str.l += 2; _skip_to_comma(q, p); } - else if (type == 'S') while (q + 1 < p) { u16_to_le(strtoul(q + 1, &q, 0), (uint8_t *) str.s + str.l); str.l += 2; _skip_to_comma(q, p); } - else if (type == 'i') while (q + 1 < p) { i32_to_le(strtol(q + 1, &q, 0), (uint8_t *) str.s + str.l); str.l += 4; _skip_to_comma(q, p); } - else if (type == 'I') while (q + 1 < p) { u32_to_le(strtoul(q + 1, &q, 0), (uint8_t *) str.s + str.l); str.l += 4; _skip_to_comma(q, p); } - else if (type == 'f') while (q + 1 < p) { float_to_le(strtod(q + 1, &q), (uint8_t *) str.s + str.l); str.l += 4; _skip_to_comma(q, p); } - else _parse_err_param(1, "unrecognized type B:%c", type); - -#undef _skip_to_comma + hts_tpool_delete_result(r, 0); - } else _parse_err_param(1, "unrecognized type %c", type); + // Also updated by main thread + pthread_mutex_lock(&fd->lines_m); + gl->next = fd->lines; + fd->lines = gl; + pthread_mutex_unlock(&fd->lines_m); } - b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m; - if (bam_tag2cigar(b, 1, 1) < 0) - return -2; + + sam_state_err(fd, 0); // success + hts_tpool_process_destroy(fd->q); + fd->q = NULL; + return NULL; + + err: + sam_state_err(fd, EIO); + return (void *)-1; +} + +// Run from one of the worker threads. +// Convert a passed in array of BAMs (sp_bams) and converts to a block +// of text SAM records (sp_lines). 
+static void *sam_format_worker(void *arg) { + sp_bams *gb = (sp_bams *)arg; + sp_lines *gl = NULL; + int i; + SAM_state *fd = gb->fd; + htsFile *fp = fd->fp; + + // Use a block of SAM strings we had earlier if available. + pthread_mutex_lock(&fd->lines_m); + if (fd->lines) { + gl = fd->lines; + fd->lines = gl->next; + } + pthread_mutex_unlock(&fd->lines_m); + + if (gl == NULL) { + gl = calloc(1, sizeof(*gl)); + if (!gl) { + sam_state_err(fd, ENOMEM); + return NULL; + } + gl->alloc = gl->data_size = 0; + gl->data = NULL; + } + gl->serial = gb->serial; + gl->next = NULL; + + kstring_t ks = {0, gl->alloc, gl->data}; + + for (i = 0; i < gb->nbams; i++) { + if (sam_format1_append(fd->h, &gb->bams[i], &ks) < 0) { + sam_state_err(fd, EIO); + goto err; + } + kputc('\n', &ks); + } + + pthread_mutex_lock(&fd->lines_m); + gl->data_size = ks.l; + gl->alloc = ks.m; + gl->data = ks.s; + + if (fp->idx) { + // Keep hold of the bam array a little longer as + // sam_dispatcher_write needs to use them for building the index. 
+ gl->bams = gb; + } else { + // Add bam array to free-list + gb->next = fd->bams; + fd->bams = gb; + } + pthread_mutex_unlock(&fd->lines_m); + + return gl; + + err: + sam_free_sp_bams(gb); + if (gl) { + free(gl->data); + free(gl); + } + return NULL; +} + +int sam_set_thread_pool(htsFile *fp, htsThreadPool *p) { + if (fp->state) + return 0; + + if (!(fp->state = sam_state_create(fp))) + return -1; + SAM_state *fd = (SAM_state *)fp->state; + + pthread_mutex_init(&fd->lines_m, NULL); + pthread_mutex_init(&fd->command_m, NULL); + pthread_cond_init(&fd->command_c, NULL); + fd->p = p->pool; + int qsize = p->qsize; + if (!qsize) + qsize = 2*hts_tpool_size(fd->p); + fd->q = hts_tpool_process_init(fd->p, qsize, 0); + + if (fp->format.compression == bgzf) + return bgzf_thread_pool(fp->fp.bgzf, p->pool, p->qsize); + return 0; +} -#undef _parse_warn -#undef _parse_err -#undef _parse_err_param -#undef _get_mem -#undef _read_token_aux -#undef _read_token -err_ret: - b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m; - return -2; +int sam_set_threads(htsFile *fp, int nthreads) { + if (nthreads <= 0) + return 0; + + htsThreadPool p; + p.pool = hts_tpool_init(nthreads); + p.qsize = nthreads*2; + + int ret = sam_set_thread_pool(fp, &p); + if (ret < 0) + return ret; + + SAM_state *fd = (SAM_state *)fp->state; + fd->own_pool = 1; + + return 0; } -int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b) +// Returns 0 on success, +// -1 on EOF, +// <-1 on error +int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b) { switch (fp->format.format) { case bam: { int r = bam_read1(fp->fp.bgzf, b); - if (r >= 0) { + if (h && r >= 0) { if (b->core.tid >= h->n_targets || b->core.tid < -1 || - b->core.mtid >= h->n_targets || b->core.mtid < -1) + b->core.mtid >= h->n_targets || b->core.mtid < -1) { + errno = ERANGE; return -3; + } } return r; } @@ -1455,65 +2889,164 @@ int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b) } case sam: { - int ret; -err_recover: - if (fp->line.l == 0) { + // 
Consume 1st line after header parsing as it wasn't using peek + if (fp->line.l != 0) { + int ret = sam_parse1(&fp->line, h, b); + fp->line.l = 0; + return ret; + } + + if (fp->state) { + SAM_state *fd = (SAM_state *)fp->state; + + if (fp->format.compression == bgzf && fp->fp.bgzf->seeked) { + // We don't support multi-threaded SAM parsing with seeks yet. + int ret; + if ((ret = sam_state_destroy(fp)) < 0) { + errno = -ret; + return -2; + } + if (bgzf_seek(fp->fp.bgzf, fp->fp.bgzf->seeked, SEEK_SET) < 0) + return -1; + fp->fp.bgzf->seeked = 0; + goto err_recover; + } + + if (!fd->h) { + fd->h = h; + fd->h->ref_count++; + // Ensure hrecs is initialised now as we don't want multiple + // threads trying to do this simultaneously. + if (!fd->h->hrecs && sam_hdr_fill_hrecs(fd->h) < 0) + return -2; + + // We can only do this once we've got a header + if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_read, fp) != 0) + return -2; + } + + if (fd->h != h) { + hts_log_error("SAM multi-threaded decoding does not support changing header"); + return -1; + } + + sp_bams *gb = fd->curr_bam; + if (!gb) { + if (fd->errcode) { + // Incase reader failed + errno = fd->errcode; + return -2; + } + hts_tpool_result *r = hts_tpool_next_result_wait(fd->q); + if (!r) + return -2; + fd->curr_bam = gb = (sp_bams *)hts_tpool_result_data(r); + hts_tpool_delete_result(r, 0); + } + if (!gb) + return fd->errcode ? 
-2 : -1; + bam1_t *b_array = (bam1_t *)gb->bams; + if (fd->curr_idx < gb->nbams) + if (!bam_copy1(b, &b_array[fd->curr_idx++])) + return -2; + if (fd->curr_idx == gb->nbams) { + pthread_mutex_lock(&fd->lines_m); + gb->next = fd->bams; + fd->bams = gb; + pthread_mutex_unlock(&fd->lines_m); + + fd->curr_bam = NULL; + fd->curr_idx = 0; + } + + return 0; + + } else { + int ret; + err_recover: + ret = hts_getline(fp, KS_SEP_LINE, &fp->line); if (ret < 0) return ret; + + ret = sam_parse1(&fp->line, h, b); + fp->line.l = 0; + if (ret < 0) { + hts_log_warning("Parse error at line %lld", (long long)fp->lineno); + if (h->ignore_sam_err) goto err_recover; + } + return ret; } - ret = sam_parse1(&fp->line, h, b); - fp->line.l = 0; - if (ret < 0) { - hts_log_warning("Parse error at line %lld", (long long)fp->lineno); - if (h->ignore_sam_err) goto err_recover; - } - return ret; } + case empty_format: + errno = EPIPE; + return -3; + default: - abort(); + errno = EFTYPE; + return -3; } } -int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str) +static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str) { - int i; + int i, r = 0; uint8_t *s, *end; const bam1_core_t *c = &b->core; - str->l = 0; - kputsn(bam_get_qname(b), c->l_qname-1-c->l_extranul, str); kputc('\t', str); // query name - kputw(c->flag, str); kputc('\t', str); // flag + if (c->l_qname == 0) + return -1; + r |= kputsn_(bam_get_qname(b), c->l_qname-1-c->l_extranul, str); + r |= kputc_('\t', str); // query name + r |= kputw(c->flag, str); r |= kputc_('\t', str); // flag if (c->tid >= 0) { // chr - kputs(h->target_name[c->tid] , str); - kputc('\t', str); - } else kputsn("*\t", 2, str); - kputw(c->pos + 1, str); kputc('\t', str); // pos - kputw(c->qual, str); kputc('\t', str); // qual + r |= kputs(h->target_name[c->tid] , str); + r |= kputc_('\t', str); + } else r |= kputsn_("*\t", 2, str); + r |= kputll(c->pos + 1, str); r |= kputc_('\t', str); // pos + r |= kputw(c->qual, str); r |= 
kputc_('\t', str); // qual if (c->n_cigar) { // cigar uint32_t *cigar = bam_get_cigar(b); for (i = 0; i < c->n_cigar; ++i) { - kputw(bam_cigar_oplen(cigar[i]), str); - kputc(bam_cigar_opchr(cigar[i]), str); + r |= kputw(bam_cigar_oplen(cigar[i]), str); + r |= kputc_(bam_cigar_opchr(cigar[i]), str); } - } else kputc('*', str); - kputc('\t', str); - if (c->mtid < 0) kputsn("*\t", 2, str); // mate chr - else if (c->mtid == c->tid) kputsn("=\t", 2, str); + } else r |= kputc_('*', str); + r |= kputc_('\t', str); + if (c->mtid < 0) r |= kputsn_("*\t", 2, str); // mate chr + else if (c->mtid == c->tid) r |= kputsn_("=\t", 2, str); else { - kputs(h->target_name[c->mtid], str); - kputc('\t', str); + r |= kputs(h->target_name[c->mtid], str); + r |= kputc_('\t', str); } - kputw(c->mpos + 1, str); kputc('\t', str); // mate pos - kputw(c->isize, str); kputc('\t', str); // template len + r |= kputll(c->mpos + 1, str); r |= kputc_('\t', str); // mate pos + r |= kputll(c->isize, str); r |= kputc_('\t', str); // template len if (c->l_qseq) { // seq and qual uint8_t *s = bam_get_seq(b); - for (i = 0; i < c->l_qseq; ++i) kputc("=ACMGRSVTWYHKDBN"[bam_seqi(s, i)], str); - kputc('\t', str); + if (ks_resize(str, str->l+2+2*c->l_qseq) < 0) goto mem_err; + char *cp = str->s + str->l; + int lq2 = c->l_qseq / 2; + for (i = 0; i < lq2; i++) { + uint8_t b = s[i]; + cp[i*2+0] = "=ACMGRSVTWYHKDBN"[b>>4]; + cp[i*2+1] = "=ACMGRSVTWYHKDBN"[b&0xf]; + } + for (i *= 2; i < c->l_qseq; ++i) + cp[i] = "=ACMGRSVTWYHKDBN"[bam_seqi(s, i)]; + cp[i++] = '\t'; + cp += i; s = bam_get_qual(b); - if (s[0] == 0xff) kputc('*', str); - else for (i = 0; i < c->l_qseq; ++i) kputc(s[i] + 33, str); - } else kputsn("*\t*", 3, str); + i = 0; + if (s[0] == 0xff) { + cp[i++] = '*'; + } else { + for (i = 0; i < c->l_qseq; ++i) + cp[i]=s[i]+33; + } + cp[i] = 0; + cp += i; + str->l = cp - str->s; + } else r |= kputsn_("*\t*", 3, str); s = bam_get_aux(b); // aux end = b->data + b->l_data; @@ -1521,41 +3054,41 @@ int 
sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str) uint8_t type, key[2]; key[0] = s[0]; key[1] = s[1]; s += 2; type = *s++; - kputc('\t', str); kputsn((char*)key, 2, str); kputc(':', str); + r |= kputc_('\t', str); r |= kputsn_((char*)key, 2, str); r |= kputc_(':', str); if (type == 'A') { - kputsn("A:", 2, str); - kputc(*s, str); + r |= kputsn_("A:", 2, str); + r |= kputc_(*s, str); ++s; } else if (type == 'C') { - kputsn("i:", 2, str); - kputw(*s, str); + r |= kputsn_("i:", 2, str); + r |= kputw(*s, str); ++s; } else if (type == 'c') { - kputsn("i:", 2, str); - kputw(*(int8_t*)s, str); + r |= kputsn_("i:", 2, str); + r |= kputw(*(int8_t*)s, str); ++s; } else if (type == 'S') { if (end - s >= 2) { - kputsn("i:", 2, str); - kputuw(le_to_u16(s), str); + r |= kputsn_("i:", 2, str); + r |= kputuw(le_to_u16(s), str); s += 2; } else goto bad_aux; } else if (type == 's') { if (end - s >= 2) { - kputsn("i:", 2, str); - kputw(le_to_i16(s), str); + r |= kputsn_("i:", 2, str); + r |= kputw(le_to_i16(s), str); s += 2; } else goto bad_aux; } else if (type == 'I') { if (end - s >= 4) { - kputsn("i:", 2, str); - kputuw(le_to_u32(s), str); + r |= kputsn_("i:", 2, str); + r |= kputuw(le_to_u32(s), str); s += 4; } else goto bad_aux; } else if (type == 'i') { if (end - s >= 4) { - kputsn("i:", 2, str); - kputw(le_to_i32(s), str); + r |= kputsn_("i:", 2, str); + r |= kputw(le_to_i32(s), str); s += 4; } else goto bad_aux; } else if (type == 'f') { @@ -1570,8 +3103,8 @@ int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str) s += 8; } else goto bad_aux; } else if (type == 'Z' || type == 'H') { - kputc(type, str); kputc(':', str); - while (s < end && *s) kputc(*s++, str); + r |= kputc_(type, str); r |= kputc_(':', str); + while (s < end && *s) r |= kputc_(*s++, str); if (s >= end) goto bad_aux; ++s; @@ -1585,22 +3118,46 @@ int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str) s += 4; // now points to the start of the array if ((end - s) / 
sub_type_size < n) goto bad_aux; - kputsn("B:", 2, str); kputc(sub_type, str); // write the typing - for (i = 0; i < n; ++i) { // FIXME: for better performance, put the loop after "if" - kputc(',', str); - if ('c' == sub_type) { kputw(*(int8_t*)s, str); ++s; } - else if ('C' == sub_type) { kputw(*(uint8_t*)s, str); ++s; } - else if ('s' == sub_type) { kputw(le_to_i16(s), str); s += 2; } - else if ('S' == sub_type) { kputw(le_to_u16(s), str); s += 2; } - else if ('i' == sub_type) { kputw(le_to_i32(s), str); s += 4; } - else if ('I' == sub_type) { kputuw(le_to_u32(s), str); s += 4; } - else if ('f' == sub_type) { kputd(le_to_float(s), str); s += 4; } - else goto bad_aux; // Unknown sub-type + r |= kputsn_("B:", 2, str); r |= kputc_(sub_type, str); // write the type + switch (sub_type) { + case 'c': + if (ks_resize(str, str->l + n*2) < 0) goto mem_err; + for (i = 0; i < n; ++i) {r |= kputc_(',', str); r |= kputw(*(int8_t*)s, str); ++s;} + break; + case 'C': + if (ks_resize(str, str->l + n*2) < 0) goto mem_err; + for (i = 0; i < n; ++i) {r |= kputc_(',', str); r |= kputw(*(uint8_t*)s, str); ++s;} + break; + case 's': + if (ks_resize(str, str->l + n*4) < 0) goto mem_err; + for (i = 0; i < n; ++i) {r |= kputc_(',', str); r |= kputw(le_to_i16(s), str); s += 2; } + break; + case 'S': + if (ks_resize(str, str->l + n*4) < 0) goto mem_err; + for (i = 0; i < n; ++i) {r |= kputc_(',', str); r |= kputw(le_to_u16(s), str); s += 2; } + break; + case 'i': + if (ks_resize(str, str->l + n*6) < 0) goto mem_err; + for (i = 0; i < n; ++i) {r |= kputc_(',', str); r |= kputw(le_to_i32(s), str); s += 4; } + break; + case 'I': + if (ks_resize(str, str->l + n*6) < 0) goto mem_err; + for (i = 0; i < n; ++i) {r |= kputc_(',', str); r |= kputuw(le_to_u32(s), str); s += 4; } + break; + case 'f': + if (ks_resize(str, str->l + n*8) < 0) goto mem_err; + for (i = 0; i < n; ++i) {r |= kputc_(',', str); r |= kputd(le_to_float(s), str); s += 4; } + break; + default: + goto bad_aux; } } else { // 
Unknown type goto bad_aux; } } + r |= kputsn("", 0, str); // nul terminate + if (r < 0) goto mem_err; + return str->l; bad_aux: @@ -1608,9 +3165,22 @@ int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str) b->core.l_qname, bam_get_qname(b)); errno = EINVAL; return -1; + + mem_err: + hts_log_error("Out of memory"); + errno = ENOMEM; + return -1; +} + +int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str) +{ + str->l = 0; + return sam_format1_append(h, b, str); } -int sam_write1(htsFile *fp, const bam_hdr_t *h, const bam1_t *b) +// Sadly we need to be able to modify the bam_hdr here so we can +// reference count the structure. +int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) { switch (fp->format.format) { case binary_format: @@ -1618,7 +3188,7 @@ int sam_write1(htsFile *fp, const bam_hdr_t *h, const bam1_t *b) fp->format.format = bam; /* fall-through */ case bam: - return bam_write1(fp->fp.bgzf, b); + return bam_write_idx1(fp, h, b); case cram: return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b); @@ -1628,13 +3198,115 @@ int sam_write1(htsFile *fp, const bam_hdr_t *h, const bam1_t *b) fp->format.format = sam; /* fall-through */ case sam: - if (sam_format1(h, b, &fp->line) < 0) return -1; - kputc('\n', &fp->line); - if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1; - return fp->line.l; + if (fp->state) { + SAM_state *fd = (SAM_state *)fp->state; + + // Threaded output + if (!fd->h) { + // NB: discard const. We don't actually modify sam_hdr_t here, + // just data pointed to by it (which is a bit weasely still), + // but out cached pointer must be non-const as we want to + // destroy it later on and sam_hdr_destroy takes non-const. + // + // We do this because some tools do sam_hdr_destroy; sam_close + // while others do sam_close; sam_hdr_destroy. The former is an + // issue as we need the header still when flushing. 
+ fd->h = (sam_hdr_t *)h; + fd->h->ref_count++; + + if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_write, fp) != 0) + return -2; + } + + if (fd->h != h) { + hts_log_error("SAM multi-threaded decoding does not support changing header"); + return -2; + } + + // Find a suitable BAM array to copy to + sp_bams *gb = fd->curr_bam; + if (!gb) { + pthread_mutex_lock(&fd->lines_m); + if (fd->bams) { + fd->curr_bam = gb = fd->bams; + fd->bams = gb->next; + gb->next = NULL; + gb->nbams = 0; + pthread_mutex_unlock(&fd->lines_m); + } else { + pthread_mutex_unlock(&fd->lines_m); + if (!(gb = calloc(1, sizeof(*gb)))) return -1; + if (!(gb->bams = calloc(NB, sizeof(*gb->bams)))) { + free(gb); + return -1; + } + gb->nbams = 0; + gb->abams = NB; + gb->fd = fd; + fd->curr_idx = 0; + fd->curr_bam = gb; + } + } + + if (!bam_copy1(&gb->bams[gb->nbams++], b)) + return -2; + + // Dispatch if full + if (gb->nbams == NB) { + gb->serial = fd->serial++; + //fprintf(stderr, "Dispatch another %d bams\n", NB); + pthread_mutex_lock(&fd->command_m); + if (fd->errcode != 0) { + pthread_mutex_unlock(&fd->command_m); + return -fd->errcode; + } + if (hts_tpool_dispatch3(fd->p, fd->q, sam_format_worker, gb, + cleanup_sp_bams, + cleanup_sp_lines, 0) < 0) { + pthread_mutex_unlock(&fd->command_m); + return -1; + } + pthread_mutex_unlock(&fd->command_m); + fd->curr_bam = NULL; + } + + // Dummy value as we don't know how long it really is. + // We could track file sizes via a SAM_state field, but I don't think + // it is necessary. 
+ return 1; + } else { + if (sam_format1(h, b, &fp->line) < 0) return -1; + kputc('\n', &fp->line); + if (fp->format.compression == bgzf) { + if ( bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l ) return -1; + } else { + if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1; + } + + if (fp->idx) { + if (fp->format.compression == bgzf) { + if (bgzf_idx_push(fp->fp.bgzf, fp->idx, b->core.tid, b->core.pos, bam_endpos(b), + bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) { + hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", + bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1); + return -1; + } + } else { + if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b), + bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) { + hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", + bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1); + return -1; + } + } + } + + return fp->line.l; + } default: - abort(); + errno = EBADF; + return -1; } } @@ -1727,7 +3399,7 @@ static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end) switch (size) { case 'Z': case 'H': - while (*s && s < end) ++s; + while (s < end && *s) ++s; return s < end ? s + 1 : end; case 'B': if (end - s < 5) return NULL; @@ -2067,18 +3739,35 @@ double bam_auxB2f(const uint8_t *s, uint32_t idx) else return get_int_aux_val(s[1], s + 6, idx); } +static int find_file_extension(const char *fn, char ext_out[5]) +{ + const char *delim = fn ? strstr(fn, HTS_IDX_DELIM) : NULL, *ext; + if (!fn) return -1; + if (!delim) delim = fn + strlen(fn); + for (ext = delim; ext > fn && *ext != '.' && *ext != '/'; --ext) {} + if (*ext == '.' 
&& delim - ext == 3 && ext[1] == 'g' && ext[2] == 'z') { + // permit .sam.gz as a valid file extension + for (ext--; ext > fn && *ext != '.' && *ext != '/'; --ext) {} + } + if (*ext != '.' || delim - ext > 7 || delim - ext < 4) return -1; + memcpy(ext_out, ext + 1, delim - ext - 1); + ext_out[delim - ext - 1] = '\0'; + return 0; +} + int sam_open_mode(char *mode, const char *fn, const char *format) { // TODO Parse "bam5" etc for compression level if (format == NULL) { // Try to pick a format based on the filename extension - const char *ext = fn? strrchr(fn, '.') : NULL; - if (ext == NULL || strchr(ext, '/')) return -1; - return sam_open_mode(mode, fn, ext+1); + char extension[7]; + if (find_file_extension(fn, extension) < 0) return -1; + return sam_open_mode(mode, fn, extension); } else if (strcmp(format, "bam") == 0) strcpy(mode, "b"); else if (strcmp(format, "cram") == 0) strcpy(mode, "c"); else if (strcmp(format, "sam") == 0) strcpy(mode, ""); + else if (strcmp(format, "sam.gz") == 0) strcpy(mode, "z"); else return -1; return 0; @@ -2104,12 +3793,12 @@ char *sam_open_mode_opts(const char *fn, if (format == NULL) { // Try to pick a format based on the filename extension - const char *ext = fn? 
strrchr(fn, '.') : NULL; - if (ext == NULL || strchr(ext, '/')) { + char extension[7]; + if (find_file_extension(fn, extension) < 0) { free(mode_opts); return NULL; } - if (sam_open_mode(cp, fn, ext+1) == 0) { + if (sam_open_mode(cp, fn, extension) == 0) { return mode_opts; } else { free(mode_opts); @@ -2138,6 +3827,8 @@ char *sam_open_mode_opts(const char *fn, cp += 12; } else if (strncmp(format, "sam", format_len) == 0) { ; // format mode="" + } else if (strncmp(format, "sam.gz", format_len) == 0) { + *cp++ = 'z'; } else { free(mode_opts); return NULL; @@ -2211,14 +3902,15 @@ char *bam_flag2str(int flag) *******************/ typedef struct { - int k, x, y, end; + int k, y; + hts_pos_t x, end; } cstate_t; static cstate_t g_cstate_null = { -1, 0, 0, 0 }; typedef struct __linkbuf_t { bam1_t b; - int32_t beg, end; + hts_pos_t beg, end; cstate_t s; struct __linkbuf_t *next; bam_pileup_cd cd; @@ -2269,7 +3961,7 @@ static inline void mp_free(mempool_t *mp, lbnode_t *p) s->x: the reference coordinate of the start of s->k s->y: the query coordiante of the start of s->k */ -static inline int resolve_cigar2(bam_pileup1_t *p, int32_t pos, cstate_t *s) +static inline int resolve_cigar2(bam_pileup1_t *p, hts_pos_t pos, cstate_t *s) { #define _cop(c) ((c)&BAM_CIGAR_MASK) #define _cln(c) ((c)>>BAM_CIGAR_SHIFT) @@ -2281,14 +3973,15 @@ static inline int resolve_cigar2(bam_pileup1_t *p, int32_t pos, cstate_t *s) // determine the current CIGAR operation //fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y); if (s->k == -1) { // never processed + p->qpos = 0; if (c->n_cigar == 1) { // just one operation, save a loop if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0; } else { // find the first match or deletion for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) { int op = _cop(cigar[k]); int l = _cln(cigar[k]); - if (op == BAM_CMATCH || op == 
BAM_CDEL || op == BAM_CEQUAL || op == BAM_CDIFF) break; - else if (op == BAM_CREF_SKIP) s->x += l; + if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || + op == BAM_CEQUAL || op == BAM_CDIFF) break; else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l; } assert(k < c->n_cigar); @@ -2343,9 +4036,90 @@ static inline int resolve_cigar2(bam_pileup1_t *p, int32_t pos, cstate_t *s) } // cannot be other operations; otherwise a bug p->is_head = (pos == c->pos); p->is_tail = (pos == s->end); } + p->cigar_ind = s->k; return 1; } +/******************************* + *** Expansion of insertions *** + *******************************/ + +/* + * Fills out the kstring with the padded insertion sequence for the current + * location in 'p'. If this is not an insertion site, the string is blank. + * + * Returns the length of insertion string on success; + * -1 on failure. + */ +int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) { + int j, k, indel; + uint32_t *cigar; + + if (p->indel <= 0) { + if (ks_resize(ins, 1) < 0) + return -1; + ins->l = 0; + ins->s[0] = '\0'; + return 0; + } + + if (del_len) + *del_len = 0; + + // Measure indel length including pads + indel = 0; + k = p->cigar_ind+1; + cigar = bam_get_cigar(p->b); + while (k < p->b->core.n_cigar) { + switch (cigar[k] & BAM_CIGAR_MASK) { + case BAM_CPAD: + case BAM_CINS: + indel += (cigar[k] >> BAM_CIGAR_SHIFT); + break; + default: + k = p->b->core.n_cigar; + break; + } + k++; + } + ins->l = indel; + + // Produce sequence + if (ks_resize(ins, indel+1) < 0) + return -1; + indel = 0; + k = p->cigar_ind+1; + j = 1; + while (k < p->b->core.n_cigar) { + int l, c; + switch (cigar[k] & BAM_CIGAR_MASK) { + case BAM_CPAD: + for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++) + ins->s[indel++] = '*'; + break; + case BAM_CINS: + for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++, j++) { + c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), + p->qpos + j - p->is_del)]; + ins->s[indel++] = c; + } + break; + 
case BAM_CDEL: + // eg cigar 1M2I1D gives mpileup output in T+2AA-1C style + if (del_len) + *del_len = cigar[k]>>BAM_CIGAR_SHIFT; + // fall through + default: + k = p->b->core.n_cigar; + break; + } + k++; + } + ins->s[indel] = '\0'; + + return indel; +} + /*********************** *** Pileup iterator *** ***********************/ @@ -2357,7 +4131,8 @@ typedef khash_t(olap_hash) olap_hash_t; struct __bam_plp_t { mempool_t *mp; lbnode_t *head, *tail; - int32_t tid, pos, max_tid, max_pos; + int32_t tid, max_tid; + hts_pos_t pos, max_pos; int is_eof, max_plp, error, maxcnt; uint64_t id; bam_pileup1_t *plp; @@ -2389,9 +4164,10 @@ bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data) return iter; } -void bam_plp_init_overlaps(bam_plp_t iter) +int bam_plp_init_overlaps(bam_plp_t iter) { iter->overlaps = kh_init(olap_hash); // hash for tweaking quality of bases in overlapping reads + return iter->overlaps ? 0 : -1; } void bam_plp_destroy(bam_plp_t iter) @@ -2431,11 +4207,12 @@ void bam_plp_destructor(bam_plp_t plp, * @iseq: position in the sequence (rw) * @iref: position with respect to the beginning of the read (iref_pos - b->core.pos) (rw) * - * Returns BAM_CMATCH or -1 when there is no more cigar to process or the requested position is not covered. + * Returns BAM_CMATCH, -1 when there is no more cigar to process or the requested position is not covered, + * or -2 on error. 
*/ -static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref) +static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, hts_pos_t *icig, hts_pos_t *iseq, hts_pos_t *iref) { - int pos = *iref; + hts_pos_t pos = *iref; if ( pos < 0 ) return -1; *icig = 0; *iseq = 0; @@ -2463,12 +4240,12 @@ static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, int continue; } hts_log_error("Unexpected cigar %d", cig); - assert(0); + return -2; } *iseq = -1; return -1; } -static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref) +static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, hts_pos_t *icig, hts_pos_t *iseq, hts_pos_t *iref) { while ( *cigar < cigar_max ) { @@ -2486,51 +4263,55 @@ static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, in if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; } if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; } hts_log_error("Unexpected cigar %d", cig); - assert(0); + return -2; } *iseq = -1; *iref = -1; return -1; } -static void tweak_overlap_quality(bam1_t *a, bam1_t *b) +static int tweak_overlap_quality(bam1_t *a, bam1_t *b) { uint32_t *a_cigar = bam_get_cigar(a), *a_cigar_max = a_cigar + a->core.n_cigar; uint32_t *b_cigar = bam_get_cigar(b), *b_cigar_max = b_cigar + b->core.n_cigar; - int a_icig = 0, a_iseq = 0; - int b_icig = 0, b_iseq = 0; + hts_pos_t a_icig = 0, a_iseq = 0; + hts_pos_t b_icig = 0, b_iseq = 0; uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b); uint8_t *a_seq = bam_get_seq(a), *b_seq = bam_get_seq(b); - int iref = b->core.pos; - int a_iref = iref - a->core.pos; - int b_iref = iref - b->core.pos; + hts_pos_t iref = b->core.pos; + hts_pos_t a_iref = iref - a->core.pos; + hts_pos_t b_iref = iref - b->core.pos; int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max, &a_icig, 
&a_iseq, &a_iref); - if ( a_ret<0 ) return; // no overlap + if ( a_ret<0 ) return a_ret<-1 ? -1:0; // no overlap or error int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref); - if ( b_ret<0 ) return; // no overlap + if ( b_ret<0 ) return b_ret<-1 ? -1:0; // no overlap or error #if DBG - fprintf(stderr,"tweak %s n_cigar=%d %d .. %d-%d vs %d-%d\n", bam_get_qname(a), a->core.n_cigar, b->core.n_cigar, + fprintf(stderr,"tweak %s n_cigar=%d %d .. %d-%d vs %"PRIhts_pos"-%"PRIhts_pos"\n", bam_get_qname(a), a->core.n_cigar, b->core.n_cigar, a->core.pos+1,a->core.pos+bam_cigar2rlen(a->core.n_cigar,bam_get_cigar(a)), b->core.pos+1, b->core.pos+bam_cigar2rlen(b->core.n_cigar,bam_get_cigar(b))); #endif + int err = 0; while ( 1 ) { // Increment reference position - while ( a_iref>=0 && a_iref < iref - a->core.pos ) + while ( a_ret >= 0 && a_iref>=0 && a_iref < iref - a->core.pos ) a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref); - if ( a_ret<0 ) break; // done + if ( a_ret<0 ) { err = a_ret<-1?-1:0; break; } // done if ( iref < a_iref + a->core.pos ) iref = a_iref + a->core.pos; - while ( b_iref>=0 && b_iref < iref - b->core.pos ) + while ( b_ret >= 0 && b_iref>=0 && b_iref < iref - b->core.pos ) b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref); - if ( b_ret<0 ) break; // done + if ( b_ret<0 ) { err = b_ret<-1?-1:0; break; } // done if ( iref < b_iref + b->core.pos ) iref = b_iref + b->core.pos; iref++; if ( a_iref+a->core.pos != b_iref+b->core.pos ) continue; // only CMATCH positions, don't know what to do with indels + if (a_iseq > a->core.l_qseq || b_iseq > b->core.l_qseq) + return -1; // Fell off end of sequence, bad CIGAR? + if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) ) { #if DBG @@ -2564,37 +4345,48 @@ static void tweak_overlap_quality(bam1_t *a, bam1_t *b) #if DBG fprintf(stderr,"\n"); #endif + return err; } // Fix overlapping reads. 
Simple soft-clipping did not give good results. // Lowering qualities of unwanted bases is more selective and works better. // -static void overlap_push(bam_plp_t iter, lbnode_t *node) +// Returns 0 on success, -1 on failure +static int overlap_push(bam_plp_t iter, lbnode_t *node) { - if ( !iter->overlaps ) return; + if ( !iter->overlaps ) return 0; // mapped mates and paired reads only - if ( node->b.core.flag&BAM_FMUNMAP || !(node->b.core.flag&BAM_FPROPER_PAIR) ) return; + if ( node->b.core.flag&BAM_FMUNMAP || !(node->b.core.flag&BAM_FPROPER_PAIR) ) return 0; // no overlap possible, unless some wild cigar - if ( abs(node->b.core.isize) >= 2*node->b.core.l_qseq ) return; + if ( node->b.core.tid != node->b.core.mtid + || (llabs(node->b.core.isize) >= 2*node->b.core.l_qseq + && node->b.core.mpos >= node->end) // for those wild cigars + ) return 0; khiter_t kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(&node->b)); if ( kitr==kh_end(iter->overlaps) ) { - int ret; - kitr = kh_put(olap_hash, iter->overlaps, bam_get_qname(&node->b), &ret); - kh_value(iter->overlaps, kitr) = node; + // Only add reads where the mate is still to arrive + if (node->b.core.mpos >= node->b.core.pos) { + int ret; + kitr = kh_put(olap_hash, iter->overlaps, bam_get_qname(&node->b), &ret); + if (ret < 0) return -1; + kh_value(iter->overlaps, kitr) = node; + } } else { lbnode_t *a = kh_value(iter->overlaps, kitr); - tweak_overlap_quality(&a->b, &node->b); + int err = tweak_overlap_quality(&a->b, &node->b); kh_del(olap_hash, iter->overlaps, kitr); assert(a->end-1 == a->s.end); a->end = bam_endpos(&a->b); a->s.end = a->end - 1; + return err; } + return 0; } static void overlap_remove(bam_plp_t iter, const bam1_t *b) @@ -2621,7 +4413,7 @@ static void overlap_remove(bam_plp_t iter, const bam1_t *b) // Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. 
Returns // pointer to the piled records if next position is ready or NULL if there is not enough records in the // buffer yet (the current position is still the maximum position across all buffered reads). -const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) +const bam_pileup1_t *bam_plp64_next(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp) { if (iter->error) { *_n_plp = -1; return NULL; } *_n_plp = 0; @@ -2673,6 +4465,22 @@ const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_ return NULL; } +const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) +{ + hts_pos_t pos64 = 0; + const bam_pileup1_t *p = bam_plp64_next(iter, _tid, &pos64, _n_plp); + if (pos64 < INT_MAX) { + *_pos = pos64; + } else { + hts_log_error("Position %"PRId64" too large", pos64); + *_pos = INT_MAX; + iter->error = 1; + *_n_plp = -1; + return NULL; + } + return p; +} + int bam_plp_push(bam_plp_t iter, const bam1_t *b) { if (iter->error) return -1; @@ -2685,11 +4493,9 @@ int bam_plp_push(bam_plp_t iter, const bam1_t *b) overlap_remove(iter, b); return 0; } - bam_copy1(&iter->tail->b, b); - overlap_push(iter, iter->tail); -#ifndef BAM_NO_ID + if (bam_copy1(&iter->tail->b, b) == NULL) + return -1; iter->tail->b.id = iter->id++; -#endif iter->tail->beg = b->core.pos; iter->tail->end = bam_endpos(b); iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t @@ -2705,20 +4511,30 @@ int bam_plp_push(bam_plp_t iter, const bam1_t *b) } iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg; if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) { + lbnode_t *next = mp_alloc(iter->mp); + if (!next) { + iter->error = 1; + return -1; + } if (iter->plp_construct) iter->plp_construct(iter->data, b, &iter->tail->cd); - iter->tail->next = mp_alloc(iter->mp); + if (overlap_push(iter, iter->tail) < 0) { + mp_free(iter->mp, next); + iter->error = 1; + return 
-1; + } + iter->tail->next = next; iter->tail = iter->tail->next; } } else iter->is_eof = 1; return 0; } -const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) +const bam_pileup1_t *bam_plp64_auto(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp) { const bam_pileup1_t *plp; if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; } - if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp; + if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp; else { // no pileup line can be obtained; read alignments *_n_plp = 0; if (iter->is_eof) return 0; @@ -2728,16 +4544,35 @@ const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_ *_n_plp = -1; return 0; } - if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp; + if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp; // otherwise no pileup line can be returned; read the next alignment. } if ( ret < -1 ) { iter->error = ret; *_n_plp = -1; return 0; } - bam_plp_push(iter, 0); - if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp; + if (bam_plp_push(iter, 0) < 0) { + *_n_plp = -1; + return 0; + } + if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp; return 0; } } +const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) +{ + hts_pos_t pos64 = 0; + const bam_pileup1_t *p = bam_plp64_auto(iter, _tid, &pos64, _n_plp); + if (pos64 < INT_MAX) { + *_pos = pos64; + } else { + hts_log_error("Position %"PRId64" too large", pos64); + *_pos = INT_MAX; + iter->error = 1; + *_n_plp = -1; + return NULL; + } + return p; +} + void bam_plp_reset(bam_plp_t iter) { overlap_remove(iter, NULL); @@ -2762,7 +4597,8 @@ void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt) struct __bam_mplp_t { int n; - uint64_t min, *pos; + int32_t min_tid, *tid; + hts_pos_t min_pos, *pos; bam_plp_t *iter; int *n_plp; const bam_pileup1_t **plp; @@ -2773,24 +4609,28 @@ bam_mplp_t 
bam_mplp_init(int n, bam_plp_auto_f func, void **data) int i; bam_mplp_t iter; iter = (bam_mplp_t)calloc(1, sizeof(struct __bam_mplp_t)); - iter->pos = (uint64_t*)calloc(n, sizeof(uint64_t)); + iter->pos = (hts_pos_t*)calloc(n, sizeof(hts_pos_t)); + iter->tid = (int32_t*)calloc(n, sizeof(int32_t)); iter->n_plp = (int*)calloc(n, sizeof(int)); iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*)); iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t)); iter->n = n; - iter->min = (uint64_t)-1; + iter->min_pos = HTS_POS_MAX; + iter->min_tid = (uint32_t)-1; for (i = 0; i < n; ++i) { iter->iter[i] = bam_plp_init(func, data[i]); - iter->pos[i] = iter->min; + iter->pos[i] = iter->min_pos; + iter->tid[i] = iter->min_tid; } return iter; } -void bam_mplp_init_overlaps(bam_mplp_t iter) +int bam_mplp_init_overlaps(bam_mplp_t iter) { - int i; + int i, r = 0; for (i = 0; i < iter->n; ++i) - bam_plp_init_overlaps(iter->iter[i]); + r |= bam_plp_init_overlaps(iter->iter[i]); + return r == 0 ? 
0 : -1; } void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt) @@ -2804,28 +4644,45 @@ void bam_mplp_destroy(bam_mplp_t iter) { int i; for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]); - free(iter->iter); free(iter->pos); free(iter->n_plp); free(iter->plp); + free(iter->iter); free(iter->pos); free(iter->tid); + free(iter->n_plp); free(iter->plp); free(iter); } -int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp) +int bam_mplp64_auto(bam_mplp_t iter, int *_tid, hts_pos_t *_pos, int *n_plp, const bam_pileup1_t **plp) { int i, ret = 0; - uint64_t new_min = (uint64_t)-1; + hts_pos_t new_min_pos = HTS_POS_MAX; + uint32_t new_min_tid = (uint32_t)-1; for (i = 0; i < iter->n; ++i) { - if (iter->pos[i] == iter->min) { - int tid, pos; - iter->plp[i] = bam_plp_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]); + if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) { + int tid; + hts_pos_t pos; + iter->plp[i] = bam_plp64_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]); if ( iter->iter[i]->error ) return -1; - iter->pos[i] = iter->plp[i] ? 
(uint64_t)tid<<32 | pos : 0; + if (iter->plp[i]) { + iter->tid[i] = tid; + iter->pos[i] = pos; + } else { + iter->tid[i] = 0; + iter->pos[i] = 0; + } + } + if (iter->plp[i]) { + if (iter->tid[i] < new_min_tid) { + new_min_tid = iter->tid[i]; + new_min_pos = iter->pos[i]; + } else if (iter->tid[i] == new_min_tid && iter->pos[i] < new_min_pos) { + new_min_pos = iter->pos[i]; + } } - if (iter->plp[i] && iter->pos[i] < new_min) new_min = iter->pos[i]; } - iter->min = new_min; - if (new_min == (uint64_t)-1) return 0; - *_tid = new_min>>32; *_pos = (uint32_t)new_min; + iter->min_pos = new_min_pos; + iter->min_tid = new_min_tid; + if (new_min_pos == HTS_POS_MAX) return 0; + *_tid = new_min_tid; *_pos = new_min_pos; for (i = 0; i < iter->n; ++i) { - if (iter->pos[i] == iter->min) { // FIXME: valgrind reports "uninitialised value(s) at this line" + if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) { n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i]; ++ret; } else n_plp[i] = 0, plp[i] = 0; @@ -2833,13 +4690,31 @@ int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_p return ret; } +int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp) +{ + hts_pos_t pos64 = 0; + int ret = bam_mplp64_auto(iter, _tid, &pos64, n_plp, plp); + if (ret >= 0) { + if (pos64 < INT_MAX) { + *_pos = pos64; + } else { + hts_log_error("Position %"PRId64" too large", pos64); + *_pos = INT_MAX; + return -1; + } + } + return ret; +} + void bam_mplp_reset(bam_mplp_t iter) { int i; - iter->min = (uint64_t)-1; + iter->min_pos = HTS_POS_MAX; + iter->min_tid = (uint32_t)-1; for (i = 0; i < iter->n; ++i) { bam_plp_reset(iter->iter[i]); - iter->pos[i] = (uint64_t)-1; + iter->pos[i] = HTS_POS_MAX; + iter->tid[i] = (uint32_t)-1; iter->n_plp[i] = 0; iter->plp[i] = NULL; } diff --git a/sam_internal.h b/sam_internal.h new file mode 100644 index 000000000..642779f66 --- /dev/null +++ b/sam_internal.h @@ -0,0 +1,64 @@ +/* sam_internal.h 
-- internal functions; not part of the public API. + + Copyright (C) 2019 Genome Research Ltd. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#ifndef HTSLIB_SAM_INTERNAL_H +#define HTSLIB_SAM_INTERNAL_H + +#include +#include +#include "htslib/sam.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Used internally in the SAM format multi-threading. 
+int sam_state_destroy(samFile *fp); +int sam_set_thread_pool(htsFile *fp, htsThreadPool *p); +int sam_set_threads(htsFile *fp, int nthreads); + +// bam1_t data (re)allocation +int sam_realloc_bam_data(bam1_t *b, size_t desired); + +static inline int realloc_bam_data(bam1_t *b, size_t desired) +{ + if (desired <= b->m_data) return 0; + return sam_realloc_bam_data(b, desired); +} + +static inline int possibly_expand_bam_data(bam1_t *b, size_t bytes) { + size_t new_len = (size_t) b->l_data + bytes; + + if (new_len > INT32_MAX || new_len < bytes) { // Too big or overflow + errno = ENOMEM; + return -1; + } + if (new_len <= b->m_data) return 0; + return sam_realloc_bam_data(b, new_len); +} + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index 982f2bd02..4c55b9b8d 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -1,6 +1,6 @@ /* synced_bcf_reader.c -- stream through multiple VCF files. - Copyright (C) 2012-2014 Genome Research Ltd. + Copyright (C) 2012-2019 Genome Research Ltd. Author: Petr Danecek @@ -22,6 +22,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include @@ -29,6 +30,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include #include #include "htslib/synced_bcf_reader.h" @@ -38,18 +40,22 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/thread_pool.h" #include "bcf_sr_sort.h" -#define MAX_CSI_COOR 0x7fffffff // maximum indexable coordinate of .csi +// Maximum indexable coordinate of .csi, for default min_shift of 14. +// This comes out to about 17 Tbp. Limiting factor is the bin number, +// which is a uint32_t in CSI. The highest number of levels compatible +// with this is 10 (needs 31 bits). 
+#define MAX_CSI_COOR ((1LL << (14 + 30)) - 1) typedef struct { - uint32_t start, end; + hts_pos_t start, end; // records are marked for skipping have start>end } region1_t; typedef struct _region_t { - region1_t *regs; - int nregs, mregs, creg; + region1_t *regs; // regions will sorted and merged, redundant records marked for skipping have start>end + int nregs, mregs, creg; // creg: the current active region } region_t; @@ -60,9 +66,10 @@ typedef struct } aux_t; -static void _regions_add(bcf_sr_regions_t *reg, const char *chr, int start, int end); +static int _regions_add(bcf_sr_regions_t *reg, const char *chr, hts_pos_t start, hts_pos_t end); static bcf_sr_regions_t *_regions_init_string(const char *str); static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec); +static void _regions_sort_and_merge(bcf_sr_regions_t *reg); char *bcf_sr_strerror(int errnum) { @@ -316,6 +323,7 @@ int bcf_sr_add_reader(bcf_srs_t *files, const char *fname) _regions_add(files->regions, names[i], -1, -1); } free(names); + _regions_sort_and_merge(files->regions); } return 1; @@ -376,14 +384,14 @@ void bcf_sr_remove_reader(bcf_srs_t *files, int i) files->nreaders--; } - +#if DEBUG_SYNCED_READER void debug_buffer(FILE *fp, bcf_sr_t *reader) { int j; for (j=0; j<=reader->nbuffer; j++) { bcf1_t *line = reader->buffer[j]; - fprintf(fp,"\t%p\t%s%s\t%s:%d\t%s ", (void*)line,reader->fname,j==0?"*":" ",reader->header->id[BCF_DT_CTG][line->rid].key,line->pos+1,line->n_allele?line->d.allele[0]:""); + fprintf(fp,"\t%p\t%s%s\t%s:%"PRIhts_pos"\t%s ", (void*)line,reader->fname,j==0?"*":" ",reader->header->id[BCF_DT_CTG][line->rid].key,line->pos+1,line->n_allele?line->d.allele[0]:""); int k; for (k=1; kn_allele; k++) fprintf(fp," %s", line->d.allele[k]); fprintf(fp,"\n"); @@ -400,6 +408,7 @@ void debug_buffers(FILE *fp, bcf_srs_t *files) } fprintf(fp,"\n"); } +#endif static inline int has_filter(bcf_sr_t *reader, bcf1_t *line) { @@ -418,11 +427,11 @@ static inline int 
has_filter(bcf_sr_t *reader, bcf1_t *line) return 0; } -static int _reader_seek(bcf_sr_t *reader, const char *seq, int start, int end) +static int _reader_seek(bcf_sr_t *reader, const char *seq, hts_pos_t start, hts_pos_t end) { if ( end>=MAX_CSI_COOR ) { - hts_log_error("The coordinate is out of csi index limit: %d", end+1); + hts_log_error("The coordinate is out of csi index limit: %"PRIhts_pos, end+1); exit(1); } if ( reader->itr ) @@ -444,7 +453,7 @@ static int _reader_seek(bcf_sr_t *reader, const char *seq, int start, int end) reader->itr = bcf_itr_queryi(reader->bcf_idx,tid,start,end+1); } if (!reader->itr) { - hts_log_error("Could not seek: %s:%d-%d", seq, start + 1, end + 1); + hts_log_error("Could not seek: %s:%"PRIhts_pos"-%"PRIhts_pos, seq, start + 1, end + 1); assert(0); } return 0; @@ -467,8 +476,11 @@ static int _readers_next_region(bcf_srs_t *files) return 0; } - // No lines in the buffer, need to open new region or quit + // No lines in the buffer, need to open new region or quit. + int prev_iseq = files->regions->iseq; + hts_pos_t prev_end = files->regions->end; if ( bcf_sr_regions_next(files->regions)<0 ) return -1; + files->regions->prev_end = prev_iseq==files->regions->iseq ? 
prev_end : -1; for (i=0; inreaders; i++) _reader_seek(&files->readers[i],files->regions->seq_names[files->regions->iseq],files->regions->start,files->regions->end); @@ -479,13 +491,13 @@ static int _readers_next_region(bcf_srs_t *files) /* * _reader_fill_buffer() - buffers all records with the same coordinate */ -static void _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) +static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) { // Return if the buffer is full: the coordinate of the last buffered record differs - if ( reader->nbuffer && reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) return; + if ( reader->nbuffer && reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) return 0; // No iterator (sequence not present in this file) and not streaming - if ( !reader->itr && !files->streaming ) return; + if ( !reader->itr && !files->streaming ) return 0; // Fill the buffer with records starting at the same position int i, ret = 0; @@ -537,6 +549,9 @@ static void _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) bcf_subset_format(reader->header,reader->buffer[reader->nbuffer+1]); } + // prevent creation of duplicates from records overlapping multiple regions + if ( files->regions && reader->buffer[reader->nbuffer+1]->pos <= files->regions->prev_end ) continue; + // apply filter if ( !reader->nfilter_ids ) bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR); @@ -555,6 +570,7 @@ static void _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) tbx_itr_destroy(reader->itr); reader->itr = NULL; } + return 0; // FIXME: Check for more errs in this function } /* @@ -576,9 +592,10 @@ static void _reader_shift_buffer(bcf_sr_t *reader) reader->nbuffer = 0; // no other line } -int _reader_next_line(bcf_srs_t *files) +static int next_line(bcf_srs_t *files) { - int i, min_pos = INT_MAX; + int i; + hts_pos_t min_pos = HTS_POS_MAX; const char *chr = NULL; // Loop until next suitable line is found or all readers have finished 
@@ -603,7 +620,7 @@ int _reader_next_line(bcf_srs_t *files) else if ( min_pos==files->readers[i].buffer[1]->pos ) bcf_sr_sort_add_active(&BCF_SR_AUX(files)->sort, i); } - if ( min_pos==INT_MAX ) + if ( min_pos==HTS_POS_MAX ) { if ( !files->regions ) break; continue; @@ -619,7 +636,7 @@ int _reader_next_line(bcf_srs_t *files) for (i=0; inreaders; i++) if ( files->readers[i].nbuffer && files->readers[i].buffer[1]->pos==min_pos ) _reader_shift_buffer(&files->readers[i]); - min_pos = INT_MAX; + min_pos = HTS_POS_MAX; chr = NULL; continue; } @@ -635,11 +652,11 @@ int _reader_next_line(bcf_srs_t *files) int bcf_sr_next_line(bcf_srs_t *files) { if ( !files->targets_als ) - return _reader_next_line(files); + return next_line(files); while (1) { - int i, ret = _reader_next_line(files); + int i, ret = next_line(files); if ( !ret ) return ret; for (i=0; inreaders; i++) @@ -669,7 +686,7 @@ static void bcf_sr_seek_start(bcf_srs_t *readers) } -int bcf_sr_seek(bcf_srs_t *readers, const char *seq, int pos) +int bcf_sr_seek(bcf_srs_t *readers, const char *seq, hts_pos_t pos) { if ( !readers->regions ) return 0; bcf_sr_sort_reset(&BCF_SR_AUX(readers)->sort); @@ -762,9 +779,9 @@ int bcf_sr_set_samples(bcf_srs_t *files, const char *fname, int is_file) return 1; } -// Add a new region into a list sorted by start,end. On input the coordinates -// are 1-based, stored 0-based, inclusive. -static void _regions_add(bcf_sr_regions_t *reg, const char *chr, int start, int end) +// Add a new region into a list. On input the coordinates are 1-based, inclusive, then stored 0-based, +// inclusive. Sorting and merging step needed afterwards: qsort(..,cmp_regions) and merge_regions(). 
+static int _regions_add(bcf_sr_regions_t *reg, const char *chr, hts_pos_t start, hts_pos_t end) { if ( start==-1 && end==-1 ) { @@ -792,25 +809,48 @@ static void _regions_add(bcf_sr_regions_t *reg, const char *chr, int start, int } region_t *creg = ®->regs[iseq]; + hts_expand(region1_t,creg->nregs+1,creg->mregs,creg->regs); + creg->regs[creg->nregs].start = start; + creg->regs[creg->nregs].end = end; + creg->nregs++; - // the regions may not be sorted on input: binary search - int i, min = 0, max = creg->nregs - 1; - while ( min<=max ) + return 0; // FIXME: check for errs in this function +} + +static int regions_cmp(const void *aptr, const void *bptr) +{ + region1_t *a = (region1_t*)aptr; + region1_t *b = (region1_t*)bptr; + if ( a->start < b->start ) return -1; + if ( a->start > b->start ) return 1; + if ( a->end < b->end ) return -1; + if ( a->end > b->end ) return 1; + return 0; +} +static void regions_merge(region_t *reg) +{ + int i = 0, j; + while ( inregs ) { - i = (max+min)/2; - if ( start < creg->regs[i].start ) max = i - 1; - else if ( start > creg->regs[i].start ) min = i + 1; - else break; + j = i + 1; + while ( jnregs && reg->regs[i].end >= reg->regs[j].start ) + { + if ( reg->regs[i].end < reg->regs[j].end ) reg->regs[i].end = reg->regs[j].end; + reg->regs[j].start = 1; reg->regs[j].end = 0; // if beg>end, this region marked for skipping + j++; + } + i = j; } - if ( min>max || creg->regs[i].start!=start || creg->regs[i].end!=end ) +} +void _regions_sort_and_merge(bcf_sr_regions_t *reg) +{ + if ( !reg ) return; + + int i; + for (i=0; inseqs; i++) { - // no such region, insert a new one just after max - hts_expand(region1_t,creg->nregs+1,creg->mregs,creg->regs); - if ( ++max < creg->nregs ) - memmove(&creg->regs[max+1],&creg->regs[max],(creg->nregs - max)*sizeof(region1_t)); - creg->regs[max].start = start; - creg->regs[max].end = end; - creg->nregs++; + qsort(reg->regs[i].regs, reg->regs[i].nregs, sizeof(*reg->regs[i].regs), regions_cmp); + 
regions_merge(®->regs[i]); } } @@ -819,11 +859,11 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) { bcf_sr_regions_t *reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t)); reg->start = reg->end = -1; - reg->prev_start = reg->prev_seq = -1; + reg->prev_start = reg->prev_end = reg->prev_seq = -1; kstring_t tmp = {0,0,0}; const char *sp = str, *ep = str; - int from, to; + hts_pos_t from, to; while ( 1 ) { while ( *ep && *ep!=',' && *ep!=':' ) ep++; @@ -875,7 +915,7 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) // ichr,ifrom,ito are 0-based; // returns -1 on error, 0 if the line is a comment line, 1 on success -static int _regions_parse_line(char *line, int ichr,int ifrom,int ito, char **chr,char **chr_end,int *from,int *to) +static int _regions_parse_line(char *line, int ichr, int ifrom, int ito, char **chr, char **chr_end, hts_pos_t *from, hts_pos_t *to) { if (ifrom < 0 || ito < 0) return -1; *chr_end = NULL; @@ -938,11 +978,16 @@ static int _regions_parse_line(char *line, int ichr,int ifrom,int ito, char **ch bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr, int ifrom, int ito) { bcf_sr_regions_t *reg; - if ( !is_file ) return _regions_init_string(regions); + if ( !is_file ) + { + reg = _regions_init_string(regions); + _regions_sort_and_merge(reg); + return reg; + } reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t)); reg->start = reg->end = -1; - reg->prev_start = reg->prev_seq = -1; + reg->prev_start = reg->prev_end = reg->prev_seq = -1; reg->file = hts_open(regions, "rb"); if ( !reg->file ) @@ -952,7 +997,7 @@ bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr return NULL; } - reg->tbx = tbx_index_load(regions); + reg->tbx = tbx_index_load3(regions, NULL, HTS_IDX_SAVE_REMOTE|HTS_IDX_SILENT_FAIL); if ( !reg->tbx ) { int len = strlen(regions); @@ -965,7 +1010,8 @@ bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr 
while ( hts_getline(reg->file, KS_SEP_LINE, ®->line) > 0 ) { char *chr, *chr_end; - int from, to, ret; + hts_pos_t from, to; + int ret; ret = _regions_parse_line(reg->line.s, ichr,ifrom,abs(ito), &chr,&chr_end,&from,&to); if ( ret < 0 ) { @@ -987,6 +1033,7 @@ bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr } hts_close(reg->file); reg->file = NULL; if ( !reg->nseqs ) { free(reg); return NULL; } + _regions_sort_and_merge(reg); return reg; } @@ -1048,6 +1095,16 @@ int bcf_sr_regions_seek(bcf_sr_regions_t *reg, const char *seq) return -1; } +// Returns 0 on success, -1 when done +static int advance_creg(region_t *reg) +{ + int i = reg->creg + 1; + while ( inregs && reg->regs[i].start > reg->regs[i].end ) i++; // regions with start>end are marked to skip by merge_regions() + reg->creg = i; + if ( i>=reg->nregs ) return -1; + return 0; +} + int bcf_sr_regions_next(bcf_sr_regions_t *reg) { if ( reg->iseq<0 ) return -1; @@ -1059,8 +1116,7 @@ int bcf_sr_regions_next(bcf_sr_regions_t *reg) { while ( reg->iseq < reg->nseqs ) { - reg->regs[reg->iseq].creg++; - if ( reg->regs[reg->iseq].creg < reg->regs[reg->iseq].nregs ) break; + if ( advance_creg(®->regs[reg->iseq])==0 ) break; // a valid record was found reg->iseq++; } if ( reg->iseq >= reg->nseqs ) { reg->iseq = -1; return -1; } // no more regions left @@ -1072,7 +1128,8 @@ int bcf_sr_regions_next(bcf_sr_regions_t *reg) // reading from tabix char *chr, *chr_end; - int ichr = 0, ifrom = 1, ito = 2, is_bed = 0, from, to; + int ichr = 0, ifrom = 1, ito = 2, is_bed = 0; + hts_pos_t from, to; if ( reg->tbx ) { ichr = reg->tbx->conf.sc-1; @@ -1191,7 +1248,7 @@ static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *re return !(type & VCF_INDEL) ? 
1 : 0; } -int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, int start, int end) +int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_pos_t start, hts_pos_t end) { int iseq; if ( khash_str2int_get(reg->seq_hash, seq, &iseq)<0 ) return -1; // no such sequence @@ -1219,10 +1276,10 @@ int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, int start, in return -1; // no overlap } -void bcf_sr_regions_flush(bcf_sr_regions_t *reg) +int bcf_sr_regions_flush(bcf_sr_regions_t *reg) { - if ( !reg->missed_reg_handler || reg->prev_seq==-1 ) return; + if ( !reg->missed_reg_handler || reg->prev_seq==-1 ) return 0; while ( !bcf_sr_regions_next(reg) ) reg->missed_reg_handler(reg, reg->missed_reg_data); - return; + return 0; // FIXME: check for errs in this function } diff --git a/tabix.1 b/tabix.1 index 8bc37cac9..4e2cb1818 100644 --- a/tabix.1 +++ b/tabix.1 @@ -1,9 +1,10 @@ -.TH tabix 1 "18 July 2018" "htslib-1.9" "Bioinformatics tools" +.TH tabix 1 "6 December 2019" "htslib-1.10" "Bioinformatics tools" .SH NAME .PP tabix \- Generic indexer for TAB-delimited genome position files .\" .\" Copyright (C) 2009-2011 Broad Institute. +.\" Copyright (C) 2014, 2016, 2018 Genome Research Ltd. .\" .\" Author: Heng Li .\" @@ -134,6 +135,9 @@ file may not be sorted. Similar to .B -R but the entire input will be read sequentially and regions not listed in FILE will be skipped. +.TP +.BI "-D " +Do not download the index file before opening it. Valid for remote files only. .PP .SH EXAMPLE (grep ^"#" in.gff; grep -v ^"#" in.gff | sort -k1,1 -k4,4n) | bgzip > sorted.gff.gz; diff --git a/tabix.c b/tabix.c index 1335cd9dd..07e3d3beb 100644 --- a/tabix.c +++ b/tabix.c @@ -1,7 +1,7 @@ /* tabix.c -- Generic indexer for TAB-delimited genome position files. Copyright (C) 2009-2011 Broad Institute. - Copyright (C) 2010-2012, 2014-2018 Genome Research Ltd. + Copyright (C) 2010-2012, 2014-2019 Genome Research Ltd. 
Author: Heng Li @@ -79,7 +79,16 @@ int file_type(const char *fname) else if (l>=4 && strcasecmp(fname+l-5, ".cram") == 0) return IS_CRAM; htsFile *fp = hts_open(fname,"r"); - enum htsExactFormat format = fp->format.format; + if (!fp) { + if (errno == ENOEXEC) { + // hts_open() uses this to report that it didn't understand the + // file format. + error("Couldn't understand format of \"%s\"\n", fname); + } else { + error("Couldn't open \"%s\" : %s\n", fname, strerror(errno)); + } + } + enum htsExactFormat format = hts_get_format(fp)->format; hts_close(fp); if ( format == bcf ) return IS_BCF; if ( format == bam ) return IS_BAM; @@ -103,6 +112,9 @@ static char **parse_regions(char *regions_fname, char **argv, int argc, int *nre regidx_t *idx = regidx_init(regions_fname, NULL, NULL, 0, NULL); if ( !idx ) error("Could not read %s\n", regions_fname); + regitr_t *itr = regitr_init(idx); + if ( !itr ) error("Could not initialize an iterator over %s\n", regions_fname); + (*nregs) += regidx_nregs(idx); regs = (char**) malloc(sizeof(char*)*(*nregs)); @@ -110,17 +122,16 @@ static char **parse_regions(char *regions_fname, char **argv, int argc, int *nre char **seqs = regidx_seq_names(idx, &nseq); for (iseq=0; iseqbeg+1, itr->end+1); regs[ireg++] = strdup(str.s); - itr.i++; } } regidx_destroy(idx); + regitr_destroy(itr); } free(str.s); @@ -139,7 +150,7 @@ static char **parse_regions(char *regions_fname, char **argv, int argc, int *nre for (iseq=0; iseqprint_header ) - bcf_hdr_write(out,hdr); + if ( bcf_hdr_write(out,hdr)!=0 ) error("Failed to write to %s\n", fname); if ( !args->header_only ) { bcf1_t *rec = bcf_init(); @@ -172,7 +183,7 @@ static int query_regions(args_t *args, char *fname, char **regs, int nregs) while ( bcf_itr_next(fp, itr, rec) >=0 ) { if ( reg_idx && !regidx_overlap(reg_idx, bcf_seqname(hdr,rec),rec->pos,rec->pos+rec->rlen-1, NULL) ) continue; - bcf_write(out,hdr,rec); + if ( bcf_write(out,hdr,rec)!=0 ) error("Failed to write to %s\n", fname); } 
tbx_itr_destroy(itr); } @@ -182,9 +193,9 @@ static int query_regions(args_t *args, char *fname, char **regs, int nregs) bcf_hdr_destroy(hdr); hts_idx_destroy(idx); } - else if ( format==vcf || format==sam || format==unknown_format ) + else if ( format==vcf || format==sam || format==bed || format==text_format || format==unknown_format ) { - tbx_t *tbx = tbx_index_load(fname); + tbx_t *tbx = tbx_index_load3(fname, NULL, download ? HTS_IDX_SAVE_REMOTE : 0); if ( !tbx ) error("Could not load .tbi/.csi index of %s\n", fname); kstring_t str = {0,0,0}; if ( args->print_header ) @@ -206,7 +217,7 @@ static int query_regions(args_t *args, char *fname, char **regs, int nregs) if ( !itr ) continue; while (tbx_itr_next(fp, tbx, itr, &str) >= 0) { - if ( reg_idx && !regidx_overlap(reg_idx,seq[itr->curr_tid],itr->curr_beg,itr->curr_end, NULL) ) continue; + if ( reg_idx && !regidx_overlap(reg_idx,seq[itr->curr_tid],itr->curr_beg,itr->curr_end-1, NULL) ) continue; puts(str.s); } tbx_itr_destroy(itr); @@ -226,13 +237,13 @@ static int query_regions(args_t *args, char *fname, char **regs, int nregs) free(regs); return 0; } -static int query_chroms(char *fname) +static int query_chroms(char *fname, int download) { const char **seq; int i, nseq, ftype = file_type(fname); if ( ftype & IS_TXT || !ftype ) { - tbx_t *tbx = tbx_index_load(fname); + tbx_t *tbx = tbx_index_load3(fname, NULL, download ? 
HTS_IDX_SAVE_REMOTE : 0); if ( !tbx ) error("Could not load .tbi index of %s\n", fname); seq = tbx_seqnames(tbx, &nseq); for (i=0; i= 0) + while ((c = getopt_long(argc, argv, "hH?0b:c:e:fm:p:s:S:lr:CR:T:D", loptions,NULL)) >= 0) { switch (c) { @@ -443,19 +456,24 @@ int main(int argc, char *argv[]) if ( *tmp ) error("Could not parse argument: -S %s\n", optarg); detect = 0; break; + case 'D': + download_index = 0; + break; case 1: printf( "tabix (htslib) %s\n" -"Copyright (C) 2018 Genome Research Ltd.\n", hts_version()); +"Copyright (C) 2019 Genome Research Ltd.\n", hts_version()); return EXIT_SUCCESS; - default: return usage(); + case 2: + return usage(stdout, EXIT_SUCCESS); + default: return usage(stderr, EXIT_FAILURE); } } - if ( optind==argc ) return usage(); + if ( optind==argc ) return usage(stderr, EXIT_FAILURE); if ( list_chroms ) - return query_chroms(argv[optind]); + return query_chroms(argv[optind], download_index); if ( argc > optind+1 || args.header_only || args.regions_fname || args.targets_fname ) { @@ -463,7 +481,7 @@ int main(int argc, char *argv[]) char **regs = NULL; if ( !args.header_only ) regs = parse_regions(args.regions_fname, argv+optind+1, argc-optind-1, &nregs); - return query_regions(&args, argv[optind], regs, nregs); + return query_regions(&args, argv[optind], regs, nregs, download_index); } char *fname = argv[optind]; @@ -517,6 +535,7 @@ int main(int argc, char *argv[]) } free(idx_fname); + int ret; if ( ftype==IS_CRAM ) { if ( bam_index_build(fname, min_shift)!=0 ) error("bam_index_build failed: %s\n", fname); @@ -534,12 +553,29 @@ int main(int argc, char *argv[]) if ( bam_index_build(fname, min_shift)!=0 ) error("bam_index_build failed: %s\n", fname); return 0; } - if ( tbx_index_build(fname, min_shift, &conf)!=0 ) error("tbx_index_build failed: %s\n", fname); - return 0; + + switch (ret = tbx_index_build(fname, min_shift, &conf)) + { + case 0: + return 0; + case -2: + error("[tabix] the compression of '%s' is not BGZF\n", fname); + 
default: + error("tbx_index_build failed: %s\n", fname); + } } else // TBI index { - if ( tbx_index_build(fname, min_shift, &conf) ) error("tbx_index_build failed: %s\n", fname); - return 0; + switch (ret = tbx_index_build(fname, min_shift, &conf)) + { + case 0: + return 0; + case -2: + error("[tabix] the compression of '%s' is not BGZF\n", fname); + default: + error("tbx_index_build failed: %s\n", fname); + } } + + return 0; } diff --git a/tbx.c b/tbx.c index 1d4b6b50b..31c5fbe17 100644 --- a/tbx.c +++ b/tbx.c @@ -1,6 +1,6 @@ /* tbx.c -- tabix API functions. - Copyright (C) 2009, 2010, 2012-2015, 2017 Genome Research Ltd. + Copyright (C) 2009, 2010, 2012-2015, 2017-2019 Genome Research Ltd. Copyright (C) 2010-2012 Broad Institute. Author: Heng Li @@ -23,6 +23,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include @@ -38,10 +39,19 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "htslib/khash.h" KHASH_DECLARE(s2i, kh_cstr_t, int64_t) +HTSLIB_EXPORT const tbx_conf_t tbx_conf_gff = { 0, 1, 4, 5, '#', 0 }; + +HTSLIB_EXPORT const tbx_conf_t tbx_conf_bed = { TBX_UCSC, 1, 2, 3, '#', 0 }; + +HTSLIB_EXPORT const tbx_conf_t tbx_conf_psltbl = { TBX_UCSC, 15, 17, 18, '#', 0 }; + +HTSLIB_EXPORT const tbx_conf_t tbx_conf_sam = { TBX_SAM, 3, 4, 0, '@', 0 }; + +HTSLIB_EXPORT const tbx_conf_t tbx_conf_vcf = { TBX_VCF, 1, 2, 0, '#', 0 }; typedef struct { @@ -55,6 +65,7 @@ static inline int get_tid(tbx_t *tbx, const char *ss, int is_add) khint_t k; khash_t(s2i) *d; if (tbx->dict == 0) tbx->dict = kh_init(s2i); + if (!tbx->dict) return -1; // Out of memory d = (khash_t(s2i)*)tbx->dict; if (is_add) { int absent; @@ -92,7 +103,7 @@ int tbx_parse1(const tbx_conf_t *conf, int len, char *line, tbx_intv_t *intv) intv->ss = line + b; intv->se = line + i; } else if (id == conf->bc) { // here ->beg is 0-based. - intv->beg = intv->end = strtol(line + b, &s, 0); + intv->beg = intv->end = strtoll(line + b, &s, 0); if ( s==line+b ) return -1; // expected int if (!(conf->preset&TBX_UCSC)) --intv->beg; else ++intv->end; @@ -102,7 +113,7 @@ int tbx_parse1(const tbx_conf_t *conf, int len, char *line, tbx_intv_t *intv) if ((conf->preset&0xffff) == TBX_GENERIC) { if (id == conf->ec) { - intv->end = strtol(line + b, &s, 0); + intv->end = strtoll(line + b, &s, 0); if ( s==line+b ) return -1; // expected int } } else if ((conf->preset&0xffff) == TBX_SAM) { @@ -130,7 +141,7 @@ int tbx_parse1(const tbx_conf_t *conf, int len, char *line, tbx_intv_t *intv) s = strstr(line + b, ";END="); if (s) s += 5; } - if (s) intv->end = strtol(s, &s, 0); + if (s) intv->end = strtoll(s, &s, 0); line[i] = c; } } @@ -148,7 +159,8 @@ static inline int get_intv(tbx_t *tbx, kstring_t *str, tbx_intv_t *intv, int is_ if (tbx_parse1(&tbx->conf, str->l, str->s, intv) == 0) { int c = *intv->se; *intv->se = '\0'; intv->tid = get_tid(tbx, intv->ss, is_add); *intv->se = c; - return (intv->tid >= 0 
&& intv->beg >= 0 && intv->end >= 0)? 0 : -1; + if (intv->tid < 0) return -2; // get_tid out of memory + return (intv->beg >= 0 && intv->end >= 0)? 0 : -1; } else { char *type = NULL; switch (tbx->conf.preset&0xffff) @@ -164,20 +176,27 @@ static inline int get_intv(tbx_t *tbx, kstring_t *str, tbx_intv_t *intv, int is_ } } -int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, int *beg, int *end) +/* + * Called by tabix iterator to read the next record. + * Returns >= 0 on success + * -1 on EOF + * <= -2 on error + */ +int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, hts_pos_t *beg, hts_pos_t *end) { tbx_t *tbx = (tbx_t *) tbxv; kstring_t *s = (kstring_t *) sv; int ret; if ((ret = bgzf_getline(fp, '\n', s)) >= 0) { tbx_intv_t intv; - get_intv(tbx, s, &intv, 0); + if (get_intv(tbx, s, &intv, 0) < 0) + return -2; *tid = intv.tid; *beg = intv.beg; *end = intv.end; } return ret; } -void tbx_set_meta(tbx_t *tbx) +static int tbx_set_meta(tbx_t *tbx) { int i, l = 0, l_nm; uint32_t x[7]; @@ -188,6 +207,7 @@ void tbx_set_meta(tbx_t *tbx) memcpy(x, &tbx->conf, 24); name = (char**)malloc(sizeof(char*) * kh_size(d)); + if (!name) return -1; for (k = kh_begin(d), l = 0; k != kh_end(d); ++k) { if (!kh_exist(d, k)) continue; name[kh_val(d, k)] = (char*)kh_key(d, k); @@ -195,6 +215,7 @@ void tbx_set_meta(tbx_t *tbx) } l_nm = x[6] = l; meta = (uint8_t*)malloc(l_nm + 28); + if (!meta) { free(name); return -1; } if (ed_is_big()) for (i = 0; i < 7; ++i) x[i] = ed_swap_4(x[i]); @@ -206,6 +227,45 @@ void tbx_set_meta(tbx_t *tbx) } free(name); hts_idx_set_meta(tbx->idx, l, meta, 0); + return 0; +} + +// Minimal effort parser to extract reference length out of VCF header line +// This is used only used to adjust the number of levels if necessary, +// so not a major problem if it doesn't always work. 
+static void adjust_max_ref_len_vcf(const char *str, int64_t *max_ref_len) +{ + const char *ptr; + int64_t len; + if (strncmp(str, "##contig", 8) != 0) return; + ptr = strstr(str + 8, "length"); + if (!ptr) return; + for (ptr += 6; *ptr == ' ' || *ptr == '='; ptr++) {} + len = strtoll(ptr, NULL, 10); + if (*max_ref_len < len) *max_ref_len = len; +} + +// Same for sam files +static void adjust_max_ref_len_sam(const char *str, int64_t *max_ref_len) +{ + const char *ptr; + int64_t len; + if (strncmp(str, "@SQ", 3) != 0) return; + ptr = strstr(str + 3, "\tLN:"); + if (!ptr) return; + ptr += 4; + len = strtoll(ptr, NULL, 10); + if (*max_ref_len < len) *max_ref_len = len; +} + +// Adjusts number of levels if not big enough. This can happen for +// files with very large contigs. +static int adjust_n_lvls(int min_shift, int n_lvls, int64_t max_len) +{ + int64_t s = 1LL << (min_shift + n_lvls * 3); + max_len += 256; + for (; max_len > s; ++n_lvls, s <<= 3) {} + return n_lvls; } tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf) @@ -216,37 +276,59 @@ tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf) int64_t lineno = 0; uint64_t last_off = 0; tbx_intv_t intv; + int64_t max_ref_len = 0; str.s = 0; str.l = str.m = 0; tbx = (tbx_t*)calloc(1, sizeof(tbx_t)); + if (!tbx) return NULL; tbx->conf = *conf; if (min_shift > 0) n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3, fmt = HTS_FMT_CSI; else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_TBI; while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) { ++lineno; + if (str.s[0] == tbx->conf.meta_char && fmt == HTS_FMT_CSI) { + switch (tbx->conf.preset) { + case TBX_SAM: + adjust_max_ref_len_sam(str.s, &max_ref_len); break; + case TBX_VCF: + adjust_max_ref_len_vcf(str.s, &max_ref_len); break; + default: + break; + } + } if (lineno <= tbx->conf.line_skip || str.s[0] == tbx->conf.meta_char) { last_off = bgzf_tell(fp); continue; } if (first == 0) { + if (fmt == HTS_FMT_CSI) + n_lvls = adjust_n_lvls(min_shift, n_lvls, 
max_ref_len); tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls); + if (!tbx->idx) goto fail; first = 1; } - get_intv(tbx, &str, &intv, 1); - ret = hts_idx_push(tbx->idx, intv.tid, intv.beg, intv.end, bgzf_tell(fp), 1); - if (ret < 0) - { - free(str.s); - tbx_destroy(tbx); - return NULL; + ret = get_intv(tbx, &str, &intv, 1); + if (ret < -1) goto fail; // Out of memory + if (ret < 0) continue; // Skip unparsable lines + if (hts_idx_push(tbx->idx, intv.tid, intv.beg, intv.end, + bgzf_tell(fp), 1) < 0) { + goto fail; } } + if (ret < -1) goto fail; if ( !tbx->idx ) tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls); // empty file + if (!tbx->idx) goto fail; if ( !tbx->dict ) tbx->dict = kh_init(s2i); - hts_idx_finish(tbx->idx, bgzf_tell(fp)); - tbx_set_meta(tbx); + if (!tbx->dict) goto fail; + if (hts_idx_finish(tbx->idx, bgzf_tell(fp)) != 0) goto fail; + if (tbx_set_meta(tbx) != 0) goto fail; free(str.s); return tbx; + + fail: + free(str.s); + tbx_destroy(tbx); + return NULL; } void tbx_destroy(tbx_t *tbx) @@ -270,7 +352,7 @@ int tbx_index_build3(const char *fn, const char *fnidx, int min_shift, int n_thr int ret; if ((fp = bgzf_open(fn, "r")) == 0) return -1; if ( n_threads ) bgzf_mt(fp, n_threads, 256); - if ( bgzf_compression(fp) != bgzf ) { bgzf_close(fp); return -1; } + if ( bgzf_compression(fp) != bgzf ) { bgzf_close(fp); return -2; } tbx = tbx_index(fp, min_shift, conf); bgzf_close(fp); if ( !tbx ) return -1; @@ -289,14 +371,14 @@ int tbx_index_build(const char *fn, int min_shift, const tbx_conf_t *conf) return tbx_index_build3(fn, NULL, min_shift, 0, conf); } -tbx_t *tbx_index_load2(const char *fn, const char *fnidx) +static tbx_t *index_load(const char *fn, const char *fnidx, int flags) { tbx_t *tbx; uint8_t *meta; char *nm, *p; uint32_t l_meta, l_nm; tbx = (tbx_t*)calloc(1, sizeof(tbx_t)); - tbx->idx = fnidx? 
hts_idx_load2(fn, fnidx) : hts_idx_load(fn, HTS_FMT_TBI); + tbx->idx = hts_idx_load3(fn, fnidx, HTS_FMT_TBI, flags); if ( !tbx->idx ) { free(tbx); @@ -334,9 +416,19 @@ tbx_t *tbx_index_load2(const char *fn, const char *fnidx) return NULL; } +tbx_t *tbx_index_load3(const char *fn, const char *fnidx, int flags) +{ + return index_load(fn, fnidx, flags); +} + +tbx_t *tbx_index_load2(const char *fn, const char *fnidx) +{ + return index_load(fn, fnidx, 1); +} + tbx_t *tbx_index_load(const char *fn) { - return tbx_index_load2(fn, NULL); + return index_load(fn, NULL, 1); } const char **tbx_seqnames(tbx_t *tbx, int *n) @@ -345,11 +437,15 @@ const char **tbx_seqnames(tbx_t *tbx, int *n) if (d == NULL) { *n = 0; - return NULL; + return calloc(1, sizeof(char *)); } int tid, m = kh_size(d); const char **names = (const char**) calloc(m,sizeof(const char*)); khint_t k; + if (!names) { + *n = 0; + return NULL; + } for (k=kh_begin(d); k # @@ -29,7 +29,7 @@ use Getopt::Long; my %opts; -GetOptions(\%opts, 'noqual', 'noaux', 'notemplate', 'unknownrg', 'nomd', 'template-1', 'noflag', 'Baux'); +GetOptions(\%opts, 'noqual', 'noaux', 'notemplate', 'unknownrg', 'nomd', 'partialmd=i', 'template-1', 'noflag', 'Baux'); my ($fn1, $fn2) = @ARGV; open(my $fd1, "<", $fn1) || die $!; @@ -82,6 +82,23 @@ $ln2 =~ s/\tNM:i:\d+//; } + # Validate MD and NM only if partialmd & 'file' set, otherwise + # discard it. Ie: + # + # 1: if file 1 has NM/MD keep in file 2, othewise discard from file2 + # 2: if file 2 has NM/MD keep in file 1, othewise discard from file1 + # 3: if file 1 and file 2 both have NM/MD keep, otherwise discard. 
+ if (exists $opts{partialmd}) { + if ($opts{partialmd} & 2) { + $ln1 =~ s/\tNM:i:\d+// unless ($ln2 =~ /\tNM:i:\d+/); + $ln1 =~ s/\tMD:Z:[A-Z0-9^]*// unless ($ln2 =~ /\tMD:Z:[A-Z0-9^]*/); + } + if ($opts{partialmd} & 1) { + $ln2 =~ s/\tNM:i:\d+// unless ($ln1 =~ /\tNM:i:\d+/); + $ln2 =~ s/\tMD:Z:[A-Z0-9^]*// unless ($ln1 =~ /\tMD:Z:[A-Z0-9^]*/); + } + } + my @ln1 = split("\t", $ln1); my @ln2 = split("\t", $ln2); diff --git a/test/cross_validate.sh b/test/cross_validate.sh index 226f13f46..04704f6ef 100755 --- a/test/cross_validate.sh +++ b/test/cross_validate.sh @@ -1,5 +1,24 @@ #!/bin/sh +# Copyright (C) 2015, 2018 Genome Research Ltd. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. # # ----------------------------------------------------------------------------- # Author: James Bonfield. 
diff --git a/test/emptyfile b/test/emptyfile new file mode 100644 index 000000000..e69de29bb diff --git a/test/fieldarith.c b/test/fieldarith.c index 3627a893a..56968a0ab 100644 --- a/test/fieldarith.c +++ b/test/fieldarith.c @@ -1,6 +1,6 @@ /* test/fieldarith.c -- CIGAR field arithmetic test suite. - Copyright (C) 2013 Genome Research Ltd. + Copyright (C) 2013-2014 Genome Research Ltd. Author: John Marshall @@ -47,7 +47,7 @@ void check(const bam1_t *aln, const char *testname, const char *tag, int value) int main(int argc, char **argv) { - bam_hdr_t *header; + sam_hdr_t *header; bam1_t *aln = bam_init1(); int i; @@ -64,7 +64,7 @@ int main(int argc, char **argv) check(aln, "endpos", "XE", bam_endpos(aln)); } - bam_hdr_destroy(header); + sam_hdr_destroy(header); sam_close(in); } diff --git a/test/fuzz/hts_open_fuzzer.c b/test/fuzz/hts_open_fuzzer.c new file mode 100644 index 000000000..11a741aec --- /dev/null +++ b/test/fuzz/hts_open_fuzzer.c @@ -0,0 +1,156 @@ +/* test/fuzz/hts_open_fuzzer.c -- Fuzz driver for hts_open. + + Copyright (C) 2018 Google LLC. + Copyright (C) 2019 Genome Research Ltd. + + Author: Markus Kusano + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include + +#include +#include +#include +#include +#include +#include +#include "htslib/hfile.h" +#include "htslib/hts.h" +#include "htslib/sam.h" +#include "htslib/vcf.h" + +static void hts_close_or_abort(htsFile* file) { + if (hts_close(file) != 0) { + abort(); + } +} + +static void view_sam(htsFile *in) { + if (!in) { + return; + } + samFile *out = sam_open("/dev/null", "w"); + if (!out) { + abort(); + } + sam_hdr_t *hdr = sam_hdr_read(in); + if (hdr == NULL) { + hts_close_or_abort(out); + return; + } + + // This will force the header to be parsed. + (void) sam_hdr_count_lines(hdr, "SQ"); + + if (sam_hdr_write(out, hdr) != 0) { + sam_hdr_destroy(hdr); + hts_close_or_abort(out); + return; + } + bam1_t *b = bam_init1(); + if (b == NULL) { + sam_hdr_destroy(hdr); + hts_close_or_abort(out); + return; + } + while (sam_read1(in, hdr, b) >= 0) { + if (sam_write1(out, hdr, b) < 0) { + break; + } + } + bam_destroy1(b); + + sam_hdr_destroy(hdr); + hts_close_or_abort(out); +} + +/* Copy the VCF/BCF stream `in` to /dev/null: write the header, then copy + * records until bcf_read() or bcf_write() returns a negative value. Each + * error path releases what it acquired and returns, as view_sam() does. */ +static void view_vcf(htsFile *in) { + if (!in) { + return; + } + vcfFile *out = vcf_open("/dev/null", "w"); + if (!out) { + abort(); + } + bcf_hdr_t *hdr = bcf_hdr_read(in); + if (hdr == NULL) { + hts_close_or_abort(out); + return; + } + + if (bcf_hdr_write(out, hdr) != 0) { + bcf_hdr_destroy(hdr); + hts_close_or_abort(out); + return; + } + bcf1_t *rec = bcf_init(); + if (rec == NULL) { + bcf_hdr_destroy(hdr); + hts_close_or_abort(out); + return; + } + while (bcf_read(in, hdr, rec) >= 0) { + if (bcf_write(out, hdr, rec) < 0) { + break; + } + } + bcf_destroy(rec); + + bcf_hdr_destroy(hdr); + hts_close_or_abort(out); +} + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + hFILE *memfile; + uint8_t *copy = malloc(size); + if (copy == NULL) { 
+ abort(); + } + memcpy(copy, data, size); + // hopen does not take ownership of `copy`, but hts_hopen does. + memfile = hopen("mem:", "rb:", copy, size); + if (memfile == NULL) { + free(copy); + return 0; + } + + htsFile *ht_file = hts_hopen(memfile, "data", "rb"); + if (ht_file == NULL) { + if (hclose(memfile) != 0) { + abort(); + } + return 0; + } + switch (ht_file->format.category) { + case sequence_data: + view_sam(ht_file); + break; + case variant_data: + view_vcf(ht_file); + break; + default: + break; + } + hts_close_or_abort(ht_file); + return 0; +} diff --git a/test/header_syms.pl b/test/header_syms.pl new file mode 100755 index 000000000..fc9cfa303 --- /dev/null +++ b/test/header_syms.pl @@ -0,0 +1,104 @@ +#!/usr/bin/env perl +# +# Copyright (C) 2019 Genome Research Ltd. +# +# Author: Rob Davies +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# Script to extract symbol names from HTSlib header files. 
Used to +# check the shared library for missing exports. + +# Instead of implementing a full C parser, this attempts to do the minimum +# amount it can get away with by scrubbing out most of the header text and +# then looking through the rest for function declarations. + +# Roughly equivalent Exuberant-ctags command is: +# ctags -f - -n -I HTS_RESULT_USED -I HTS_DEPRECATED+ -I HTS_FORMAT+ \ +# -I KS_ATTR_PRINTF+ -I knet_win32_destroy+ -I knet_win32_init+ +# Unfortunately this is not the default ctags on all platforms, hence this +# script. + +use strict; +use warnings; +use Getopt::Long; + +# Use this option to show the processed version of the header text +# instead of the function list. +my $show_processed = 0; + +GetOptions('show-processed' => \$show_processed); + +# List of functions to strip from the output +my %ignore = map { $_ => 1 } qw(knet_win32_init knet_win32_destroy); + +foreach my $file (@ARGV) { + extract_symbols($file, $show_processed, \%ignore); +} + +sub extract_symbols { + my ($file, $show_processed, $ignore) = @_; + + local $/ = undef; + + open(my $f, '<', $file) || die "Couldn't open $file : $!\n"; + my $text = <$f>; + close($f) || die "Error reading $file : $!\n"; + + # Get rid of comments + $text =~ s#/\*.*?\*/##sg; + $text =~ s#//.*$##mg; + + # Remove extern "C" brackets + $text =~ s/#ifdef\s+__cplusplus.*?#endif//sg; + + # Remove #if 0 sections + $text =~ s/^\s*#\s*if\s+0\s+.*?#\s*endif\s//msg; + + # Remove #defines + $text =~ s/\n\s*?#\s*?define\s+(?:[^\n]+\\\n)*[^\n]+//sg; + + # Remove content inside curly braces + $text =~ s/(\{(?:(?>[^{}]+)|(?1))*\})/{}/sg; + + # Get rid of typedefs + $text =~ s/typedef\s+[^;]+;//sg; + + # Get rid of some macros + $text =~ s/HTS_RESULT_USED//g; + $text =~ s/HTSLIB_EXPORT//g; + + $text =~ s/HTS_DEPRECATED\s*?\(\"[^"]+\"?\)//g; + $text =~ s/HTS_FORMAT\s*?\(.*?\)//g; + $text =~ s/KS_ATTR_PRINTF\s*?\(.*?\)//g; + + # Get rid of static inline functions + $text =~ 
s/static\s+inline\s+(?:\S+\s+)+?(\S+)\s*(\((?:(?>[^()]+)|(?-1))*\))\s*{}//g; + + if ($show_processed) { + print $text; + return; + } + + # Find functions and print them + while ($text =~ m/^\s+(?:\S+\s+)+?(?:\*+\s*)?(\S+)\s*(\((?:(?>[^()]+)|(?-1))*\))\s*;/msg) { + next if (exists($ignore->{$1})); + print "$1\n"; + } +} diff --git a/test/hfile.c b/test/hfile.c index 7e09ba03c..fccb44454 100644 --- a/test/hfile.c +++ b/test/hfile.c @@ -1,6 +1,6 @@ /* test/hfile.c -- Test cases for low-level input/output streams. - Copyright (C) 2013-2014, 2016 Genome Research Ltd. + Copyright (C) 2013-2014, 2016, 2018 Genome Research Ltd. Author: John Marshall @@ -33,6 +33,7 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/hfile.h" #include "htslib/hts_defs.h" +#include "htslib/kstring.h" void HTS_NORETURN fail(const char *format, ...) { @@ -251,10 +252,10 @@ int main(void) if (strcmp(buffer, "hello, world!\x0A") != 0) fail("hread result"); if (hclose(fin) != 0) fail("hclose(\"data:...\")"); - fin = hopen("test/xx#blank.sam", "r"); - if (fin == NULL) fail("hopen(\"test/xx#blank.sam\") for reading"); - if (hread(fin, buffer, 100) != 0) fail("test/xx#blank.sam is non-empty"); - if (hclose(fin) != 0) fail("hclose(\"test/xx#blank.sam\") for reading"); + fin = hopen("test/emptyfile", "r"); + if (fin == NULL) fail("hopen(\"test/emptyfile\") for reading"); + if (hread(fin, buffer, 100) != 0) fail("test/emptyfile is non-empty"); + if (hclose(fin) != 0) fail("hclose(\"test/emptyfile\") for reading"); fin = hopen("data:,", "r"); if (fin == NULL) fail("hopen(\"data:\") for reading"); @@ -279,5 +280,30 @@ int main(void) fail("hread result for base64"); if (hclose(fin) != 0) fail("hclose(\"data:;base64,...\")"); + kstring_t kstr = { 0, 0, NULL }; + + if (strcmp(haddextension(&kstr, "foo/bar.bam", 0, ".bai"), + "foo/bar.bam.bai") != 0) fail("haddextension foo/bar.bam[.bai]"); + if (strcmp(haddextension(&kstr, "foo/bar.bam", 1, ".bai"), + "foo/bar.bai") != 0) fail("haddextension foo/bar[.bai]"); + 
if (strcmp(haddextension(&kstr, "foo.bar/baz", 1, ".bai"), + "foo.bar/baz.bai") != 0) fail("haddextension foo.bar/baz[.bai]"); + if (strcmp(haddextension(&kstr, "foo#bar.bam", 0, ".bai"), + "foo#bar.bam.bai") != 0) fail("haddextension foo#bar.bam[.bai]"); + if (strcmp(haddextension(&kstr, ".bam", 1, ".bai"), + ".bai") != 0) fail("haddextension [.bai]"); + if (strcmp(haddextension(&kstr, "foo", 1, ".csi"), + "foo.csi") != 0) fail("haddextension foo[.csi]"); + + if (strcmp(haddextension(&kstr, "http://host/bar.cram?a&b&c", 0, ".crai"), + "http://host/bar.cram.crai?a&b&c") != 0) + fail("haddextension http://host/bar.cram[.crai]?a&b&c"); + + if (strcmp(haddextension(&kstr, "http://host/bar.cram#frag", 1, ".crai"), + "http://host/bar.crai#frag") != 0) + fail("haddextension http://host/bar[.crai]#frag"); + + free(ks_release(&kstr)); + return EXIT_SUCCESS; } diff --git a/test/index.bam.bai b/test/index.bam.bai new file mode 100644 index 000000000..9d7f43d49 Binary files /dev/null and b/test/index.bam.bai differ diff --git a/test/index.bam.csi b/test/index.bam.csi new file mode 100644 index 000000000..a19a3164b Binary files /dev/null and b/test/index.bam.csi differ diff --git a/test/index.bcf.csi b/test/index.bcf.csi new file mode 100644 index 000000000..dbebfe13a Binary files /dev/null and b/test/index.bcf.csi differ diff --git a/test/index.cram.crai b/test/index.cram.crai new file mode 100644 index 000000000..acdfe6781 Binary files /dev/null and b/test/index.cram.crai differ diff --git a/test/index.sam b/test/index.sam new file mode 100644 index 000000000..1368988a9 --- /dev/null +++ b/test/index.sam @@ -0,0 +1,190 @@ +@HD VN:1.6 SO:coordinate +@SQ SN:CHROMOSOME_I LN:1009800 M5:8ede36131e0dbf3417807e48f77f3ebd +@SQ SN:CHROMOSOME_II LN:5000 M5:8e7993f7a93158587ee897d7287948ec +@SQ SN:CHROMOSOME_III LN:5000 M5:3adcb065e1cf74fafdbba1e8c352b323 +@SQ SN:CHROMOSOME_IV LN:5000 M5:251af66a69ee589c9f3757340ec2de6f +@SQ SN:CHROMOSOME_V LN:5000 M5:cf200a65fb754836dcc56b24b3170ee8 
+@SQ SN:CHROMOSOME_X LN:5000 M5:6f9368fd2192c89c613718399d2d31fc +@SQ SN:CHROMOSOME_MtDNA LN:5000 M5:cd05857ece6411f40257a565ccfe15bb +@PG ID:bowtie2 PN:bowtie2 VN:2.0.0-beta5 +SRR065390.17240207 16 CHROMOSOME_I 999901 42 100M * 0 0 ATGTTTACAGGACTTCAAGCAGAGGATTTTTCGATGATTGCCAAAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAG CACAC?CBBAA@?@?BADDBBDBBAB>DDDBBDDABBBCCADDDDDCBCBCCCDBDDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.15493040 0 CHROMOSOME_I 999912 42 100M * 0 0 ACTTCAAGCAGAGGATTTTTCGATGATTGCCAAAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDBCCBDBCCBDDA@>DC?5@?@@??:><<>8>39<37 AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.6144221 0 CHROMOSOME_I 999914 42 100M * 0 0 TTCAAGCAGAGGATTTTTCGATGATTGCCAAAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGG CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCDCCCCBDCDDBBDDBDBDD@BBB@DBABDB AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.8057275 0 CHROMOSOME_I 999916 42 100M * 0 0 CAAGCAGAGGATTTTTCGATGATTGCCAAAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTT CCCCCCCBCCC@CCCCCCCCCCC>BBB>BB?4CCCCCC;>====ACCCA@CCCBBCCBC;>@==>BBBBA?<;@<@######################## AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.24679913 16 CHROMOSOME_I 999917 42 100M * 0 0 AAGCAGAGGATTTTTCGATGATTGCCAAAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTT ==56>??>AB?>D>?A?DBDABBB=BDBDACDBBCCDBBBBDDCCCCCCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.25513175 0 CHROMOSOME_I 999934 42 100M * 0 0 ATGATTGCCAAAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCC 
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCBC@CADCDDAABA=B?=A=B.>AA?AADA########################## AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.17492782 0 CHROMOSOME_I 999935 42 100M * 0 0 TGATTGCCAAAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCDCDCCDCCBDCDDBDDBDD@BBBBBBACBBAB=AB>BBBAB>?BA@CAAA? AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.17146364 16 CHROMOSOME_I 999942 42 100M * 0 0 CAAAAATTGTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAG #######@/A@@<:BBBBB>ABBDADC@=DDBDDDCDCCCBBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-3 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:8T91 YT:Z:UU +SRR065390.14459471 16 CHROMOSOME_I 999944 42 100M * 0 0 AAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGC @@@@=B@CCCBAABACCC@DCCCCCDCCCCCCCCCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.6968616 16 CHROMOSOME_I 999947 42 100M * 0 0 ATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTA BDB>B@DDDD@DDDDBCACB@DCBCCACCCCCCC@CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.9052825 16 CHROMOSOME_I 999952 42 100M * 0 0 GGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACC ?B;DABDABDDBDDADCCCD@CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.22926164 0 CHROMOSOME_I 999967 42 100M * 0 0 TTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBBCCBCCCCCCCCDCCDCDDDDCCDACDCADBDDBBCBCBCCABBA@BABABCBABC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 
NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.27108093 16 CHROMOSOME_I 999969 42 100M * 0 0 AAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACN ##########AAAAA388333-533')''+AA8AAAAAAAAAA8AAAAAA67788AAAA888887AAA5AAAAAAAAAAAA8AAAAAAAA+*++)))))! AS:i:-1 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:99C0 YT:Z:UU +SRR065390.19145675 0 CHROMOSOME_I 999970 42 100M * 0 0 AGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCG CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCCBCCCCCCCCCCADCBDBBCBBBBBDCBABBBABAABB??DDAACCAACC>AC?C?= AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.22660118 16 CHROMOSOME_I 999972 42 100M * 0 0 CTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCC B9ABABDB>DBBBD8CBDCDBCDBCDBCBCCBCCCCCCCCCCCCCCC>CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.1589310 0 CHROMOSOME_I 999973 42 100M * 0 0 NTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCG !++((22221AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA7A8AAAAAAAA8AAAAAAAAAAAAA7A7AA768655 AS:i:-1 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:0T99 YT:Z:UU +SRR065390.32984687 0 CHROMOSOME_I 999978 42 100M * 0 0 GTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCT CCCCCCCCCCCBCCCCCCCCCCCCCCCCC@CC@CCCBCCCCCCBDACDCC>@B@CDBADB@BCBD@B=BBB@BD>C@BBCBACAABAB;D9<4:<66 AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.28347129 16 CHROMOSOME_I 999978 42 100M * 0 0 GTTATGTTTAGGCGTAGGCTTAGACATACGCTTAGGTTTCGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCT ##############################################@B?BB@A@ABBBDABD@DDBBB@@B;C@BACBC@CC@CCCCCBCCCCCCCCCCC AS:i:-10 XN:i:0 XM:i:5 XO:i:0 XG:i:0 NM:i:5 MD:Z:23G1T2G2G7A60 YT:Z:UU +SRR065390.17964692 16 CHROMOSOME_I 999984 42 100M * 0 0 
TTTGGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCTGATCAA #####@<@=<53.830;>.?A5@@?ABAAADBDBC<@CB@D@BCB@CBCDCDBBDC=C@C@CAAC@C@ACCCCCCCCCCCCCCCCCCCCC AS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:3A96 YT:Z:UU +SRR065390.16701032 0 CHROMOSOME_I 999987 42 100M * 0 0 AGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCTGATCAAACC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCCCCACCCCCCCCCCDCBCCCCCCDCCBAA@BBBBBC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.24060716 16 CHROMOSOME_I 999989 42 100M * 0 0 GCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCTGATCAAACCAA @8>68BD?B??B@DB>ABB?BA@A=ADBCC@?AA@CCBBCBCCDBCDCCBCBCCC@CCCCBCCCCCCCACCCCCCCCACCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.24907628 16 CHROMOSOME_I 999989 42 100M * 0 0 GCGTAGGCTGAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCTGATCAAACCAA ################################BDDBB?BB>?>BADABBBDBDBABDBDC;?>9=C?B>CC@CCCCDCCCCCCCCCCCCCCCCCCCCCCC AS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:9T90 YT:Z:UU +SRR065390.21366278 16 CHROMOSOME_I 999991 42 100M * 0 0 GTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCTGATCAAACCAAAG ######?9>A09=@?=>BBDBBBB8B>DBCDCCDCBCBCBDCCC@CCCCCCCBCCCCCCC@@CCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.27662957 0 CHROMOSOME_I 999995 42 100M * 0 0 GCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCTGATCAAACCAAAGAGTA CCCCCCCCCCCCCCCCCCC@ACCCCCCCCCCCCCCADCCCBC?CDDDDAC=BA?@B@DBDB>?>>D?#################### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.29477959 0 CHROMOSOME_I 999997 42 100M * 0 0 TTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCTGATCAAACCAAAGAGTAGG 
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC=AB?DAB@3=@8@=@?@ AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.13030274 16 CHROMOSOME_I 1000208 42 100M * 0 0 TCAATTAAACTGGACTACGACAATTATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTC 955576>0@BBBBBBDBBD?DABDDDDCD@DCDDCCDCDDCACBACCCCCCBCCCCCCCCCCCCCCBCCCCCCCCCCBBCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.18054898 16 CHROMOSOME_I 1000209 42 100M * 0 0 CAATTAAACTGGACTACGACAATTATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCC CAC@CAA?BC?D??BCABB8=>@@?#### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.26866653 16 CHROMOSOME_I 1000217 42 100M * 0 0 CTGGACTACGACAATTATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACC ###########??????4D;AA?AAD?A>>?CABCBABBBBAA@AD>ADAAC@CCCCBCCBCCC?CCCCCCCCCCBBCCCCCCCCCCCCCCCCCCCCBCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.23714265 0 CHROMOSOME_I 1000218 0 78M2I20M * 0 0 TGGACTACGACAATTATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGAGATCGGAAGAGCGGTTCAGCAGGAA CCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCADDCCBBBBBDBBBB+=7=0?==>A#################### AS:i:-48 XN:i:0 XM:i:16 XO:i:1 XG:i:2 NM:i:18 MD:Z:75T0T1T0T0G0T2T0T0T3C0T0T0T0T0T1C0 YT:Z:UU +SRR065390.20744360 16 CHROMOSOME_I 1000218 42 100M * 0 0 TGGACTACGACAATTATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCG #####@ABBBBDBD@BA@DCDBABBBBBDA>@CBBDBBAD=BBDCBACBCCCCCCCBCBCCCCACCCCCCCCCCBBCCCCCCCCCCCCCCCCCCCCBCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.3611567 16 CHROMOSOME_I 1000225 42 100M * 0 0 CGACAATTATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTT #####@<2@=BBBBAC=DBBB@BBACBBBB=C;BBCCBACC@CCACCCCBCCCCCCBCCCCCCCCCCBBCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU 
+SRR065390.10053218 16 CHROMOSOME_I 1000225 42 100M * 0 0 CGACAATTATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTT @@=@6AA=AAC?CAC>BB>?A>>CBB@@CBAD>CC;>C@BC>A################################################# AS:i:-8 XN:i:0 XM:i:4 XO:i:0 XG:i:0 NM:i:4 MD:Z:66A7A14C2A7 YT:Z:UU +SRR065390.21951837 0 CHROMOSOME_I 1000229 42 100M * 0 0 AATTATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCBCDCACCCCCCBCCB>AACCC@1/?@?CCC@@BABCB=?@@+:A?B###### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.21381202 0 CHROMOSOME_I 1000232 40 100M * 0 0 TATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGGTTTTTTTAGTTTTTTCTTTTTTCCCAATTTTTTTGGATA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCDCDC?=8@';4@AA############################################# AS:i:-16 XN:i:0 XM:i:8 XO:i:0 XG:i:0 NM:i:8 MD:Z:60A5G8C6A2G7A0G1A3 YT:Z:UU +SRR065390.22184926 16 CHROMOSOME_I 1000235 42 100M * 0 0 TGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCT ??CAACCBAADD?DBB?@>BBB;BABBBBB@>CCCDBCDBACCCCAACACACCACCC@@CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.17603173 0 CHROMOSOME_I 1000236 42 100M * 0 0 GGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTG CCCCCCCCCCCCCCCCCCCCCCCCACCCCCCCCDCCCCCCCCB>CAB@ACCC################################################ AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.17587471 16 CHROMOSOME_I 1000250 42 100M * 0 0 GAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTT 10?8;;?;AA??:AA@BBBBB?BDDDDDBCDA>@DDDCCCDACCCDDCCDCCCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.635026 0 CHROMOSOME_I 1000255 42 100M * 0 0 
TTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGA CCCCCCCCCCCCCCBCCCCCCCBD@CCCCB0:>8:=BBBBC6:=7@>?B?B43/+2>@@/@########## AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.33333470 0 CHROMOSOME_I 1000257 42 100M * 0 0 TTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTTTGTTT CCCCBCCCCCCC?CCC?CCCCDBCADCCCCCA@@:;CCCC?7.)8;>???-3>>;A?3?6;/2;>?A:24775=4B<@@<4)+75:70(4@>::)9,B>BB?BBD:>BADDD=ABBBDDDBD@DBCCCDCCDBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:0T99 YT:Z:UU +SRR065390.18670433 0 CHROMOSOME_I 1000260 40 100M * 0 0 TGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTTTGAATATCTGGGGATTTTTCGTTTTTTTTTTTTT CCCCCCCCCCCCC>CCCCBBC4A@ACCC8@;5/8;A?A/6,>==AAC6<@################################################## AS:i:-14 XN:i:0 XM:i:7 XO:i:0 XG:i:0 NM:i:7 MD:Z:65A0G11A4C7C1G0A5 YT:Z:UU +SRR065390.5800524 0 CHROMOSOME_I 1000261 42 100M * 0 0 GGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTTCCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTT CCCCCCCCCCCCCCCCCDDC*/,0/??/<<508BAA@@BCBCAC?BAADBCD@@@CBCCBA9CCCACCCCCCCCCDCCCCCC?CCCCCCCCCCCCCCCBCCCCCCCCCCCCCCCCBBBCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.1793614 16 CHROMOSOME_I 1000274 42 100M * 0 0 ACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTT A:CAADB=DBDD@CBACC>@CACCCCCCCDCCCCCCCCCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBBBCCCCCCCCBBBCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.20107270 0 CHROMOSOME_I 1000276 42 100M * 0 0 GTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAG CCCCCCCCCCCCCCCCCCCCCCC@@CCCCCCCCCCCCCCCCADDCCCCCDCC?ACACDCCCCC@CCCDCD@BCDCBB3>B@BCCC@@9=3BB?@B@>85; AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.8268806 16 CHROMOSOME_I 1000276 42 100M * 0 0 
GTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAG ##########D?:BBA>;BBABBAABBBBBDDB>DDDDBDCDDCDCDDCCCDCCCDCCCCDCCCCCCCCCCCCCCCCC@BBCCCCCCCCBBBCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.19264263 16 CHROMOSOME_I 1000280 42 100M * 0 0 CACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAGATCC ##BB?>CBABBB?:BBBBABABABB@DBCBBDAABDCCCCCCBCCCCCCCBCCCCCCCCCCCCCCCCCCCCCCCBBBCCCCCCCCBBBCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.18391831 0 CHROMOSOME_I 1000283 42 100M * 0 0 TTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAGATCCCCC CCCCCCCCCCCCCCCCBBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC?ACCCCCCCBCC@CC8BBCCCCCB@>A>CCCDDC@@@DBBBC?:CCDBAC;CDDDDCBDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBBBCCCCCCCC@BBCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.24029537 16 CHROMOSOME_I 1000284 42 100M * 0 0 TTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAGATCCCCCA DB>B8BB<9;?>ABDDAADB@DD@C@BBAABBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBBBCCCCCCCCBBBCCCCCCCCCCCCCDCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.28630205 0 CHROMOSOME_I 1000286 42 100M * 0 0 TTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAGATCCCCCACG CCCCCCCCCCCCC@BCCCCCCCCCCCCCCCCCDBCCCCCDDBBBCBCDCDB@=?BBBBDBBABBBBBB@@CBBDB>>>A>BCBCCB:;:>=<9:@A#### AS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:98A1 YT:Z:UU +SRR065390.15799530 0 CHROMOSOME_I 1000295 42 100M * 0 0 TTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAGATCCCCCAAGCCTAAGCCT CCCCCCCCCCCCCCCCCCCCCCAACCCCCCCCCACCBCBCCCC?B@CCCCB@93=@B5>BB>>3/77:7:B>CDBDDB@>;B>BBBBDACAAB@D@<9<9<7 AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.22494349 0 CHROMOSOME_I 1000297 42 100M * 0 0 
TGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAGATCCCCCAAGCCTAAGCCTAA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCACDCCCCCD@@CCDCDCBBDCDDDBADDDDCD>B;@>DAABBB@>5A>BDBB?6??@D?9@####### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.12445253 0 CHROMOSOME_I 1000298 42 100M * 0 0 GTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAGATCCCCCAAGCCTAAGCATAAC CCCCCCCCCCCCCCCCCCCCACCCCCCCCC@DCCCCCCCC?BACCBC@CBDCCACB?BBBCDC@@;4BCBABDC@B56?B@96=4A>BAB;;5;:@19A;@;;;6?BBBBB3BBB??@@@>@BBB;@AA@9@AA9BABBBAA@@AABAABAB@BB:;??>:?DBAB?BBDDBBABB;ACBDB?BBB@CCCBDD@CD@CCDBCDDDCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.27194079 0 CHROMOSOME_II 2920 42 100M * 0 0 CTAATTTTCAGAGAGACTGAAAGAGTTTAAAAGTTCTACCGACCACATCGAACCTACTCAAGCTAATAGAGTATGGACAATTGTGAACGGAGAGGTTCAA CCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCC=BBBCB?BBBA?BBBDB?>BB=CBCCAACAC;DAB=ACAC?##### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.21775125 16 CHROMOSOME_II 2934 42 100M * 0 0 AACTGAAAGAGTTTAAAAGTTCTACCGACCACATCGAACCTACTCAAGCTAATAGAGTATGGACAATTGTGAACGGAGAGGTTCAATGGAAGACTCCACC #####ABA?=<<=5=@BBA?=@>:A:7.44?B?8B@@>BBB=@B?ADBBBCBBACBD9CBD?A9?=A?.AABADDABBB@BABDDBACBBCCDCBCCDCCCCDCCCCDCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.3790175 16 CHROMOSOME_II 2944 42 100M * 0 0 GTTTAAAAGTTCTACCGACCACATCGAACCTACTCAAGCTAATAGAGTATGGACAATTGTGAACGGAGAGGTTCAATGGAAGACTCCACCGCGGTAAGTG 8BDD@:=7)/>B>ABBB?BB?>?DB@B:BBB?BBADDC@BDCDDCDBCDCCCBADCCCCCCCBCCCCCCCCCCCCCCCCCCCDCCCCDCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.4091455 0 CHROMOSOME_II 2946 42 100M * 0 0 TTAAAAGTTCTACCGACCACATCGAACCTACTCAAGCTAATAGAGTATGGACAATTGTGAACGGAGAGGTTCAATGGAAGACTCCACCGCGGTAAGTGTG CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC?CCCCCCCCCCDCCCCBCCCDACBCDCACC@C@CA@CBAAD=BBAADD06@##### AS:i:0 XN:i:0 XM:i:0 XO:i:0 
XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.8676436 0 CHROMOSOME_II 2947 42 100M * 0 0 TAAAAGTTCTACCGACCACATCGAACCTACTCAAGCTAATAGAGTATGGACAATTGTGAACGGAGAGGTTCAATGGAAGACTCCCCCGCGGTCCGTGTGC <:>:>/000/:<<:BAB?>8A?A;:A873;3?>?>A>>A8B############################################# AS:i:-8 XN:i:0 XM:i:4 XO:i:0 XG:i:0 NM:i:4 MD:Z:84A7A0A5T0 YT:Z:UU +SRR065390.28734084 0 CHROMOSOME_II 2948 42 100M * 0 0 AAAAGTTCTACCGACCACATCGAACCTACTCAAGCTAATAGAGTATGGACAATTGTGAACGGAGAGGTTCAATGGAAGACTCCACCGCGGTAAGTGTGTT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCBCBCDCBCCCBBDDDCADABADBBABB:BB=D?B<@B@>CA?CA>BACADAA########### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.10526869 0 CHROMOSOME_II 2956 40 100M * 0 0 TACCGACCACATCGAACCTACTCAAGCTAATAGAGTATGGACAATTGTGAACGGAGAGGTTCAATGGAAGACGCCACCGCGGGGAGGGGGGTTGTTTTAT CCCCCCCCCCCCCCCCCCCCADCBBDDDDDDDBBB8BA@B>6<:>9=789=0>D>AA<@<8B>1>A9>;@5=@8C:48;*AAA=<>9>9>>:>>AB?D>BBDBCBDBBCCBABBB>@CDCCBCDCAACCCCCACCCCCCCCBCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.10879394 16 CHROMOSOME_V 938 42 100M * 0 0 TATGTTTTTCTTGAAAATGTTATCAACACTGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCA B;B:B>@B?>@>7BBDABADADBBCBDCCBACBCCBBB@CCCCCBCCACACCCCCC>CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.1520161 0 CHROMOSOME_V 941 42 100M * 0 0 GTTTTTCTTGAAAATGTTATCAACACTGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATG CCCCCCCBBBCCCCCCCCCCCCCCCCCCCCACCCCCCDCC@CCCCCCCCCCCCCCCCCCACCCBCCAD=D@BC?C?C?C##################### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.17468019 16 CHROMOSOME_V 943 42 100M * 0 0 TTTTCTTGAAAATGTTATCAACACTGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAA >ABBBABBDDDB=DBCD?DDBDBDADDADDBDCCCCCCC=CCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCBCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 
YT:Z:UU +SRR065390.12403970 0 CHROMOSOME_V 949 42 100M * 0 0 TGAAAATGTTATCAACACTGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCC@CBBCACBC@?144:>><@@DAB?:=9@<>/>9?;=927= AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.16193993 16 CHROMOSOME_V 949 42 100M * 0 0 TGAAAATGTTATCAACACTGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTC ;;/67AAC@ADCCDBCDCCCCCCD@CCCCCCCC@CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCDC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.30032741 16 CHROMOSOME_V 950 42 100M * 0 0 GAAAATGTTATCAACACTGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCA AACBBAB?BB>BABBCDBBDABDBADDDDBDDBBADDDDBACCDCBDDDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.32455256 0 CHROMOSOME_V 956 42 100M * 0 0 GTTATCAACACTGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGGTATAATACAGCGACTCAATGAAAAAATCAAAAAAA CCCCACCCCCBB=?ABB?BBA?BAABBBBBB@BBABBBBBBBBBBBBBAA@BBBBBBB>B######################################## AS:i:-4 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:64T25C9 YT:Z:UU +SRR065390.15571530 16 CHROMOSOME_V 966 42 100M * 0 0 CTGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTT B;:B;B?D?@?BBBB5-=<:@@AA@BBA>BBADBBDDDDCDCDCDBBDCCCDCCCCCCCCCDCCCCCDCCCCCCCCCCCACCBBBCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.9595122 0 CHROMOSOME_V 967 42 100M * 0 0 TGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTG CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCC?@:AAA>C@CBB@@>?B=A?BBBBBCB>@/@>=>=>BB# AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.3600239 16 CHROMOSOME_V 969 42 100M * 0 0 
ATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCA DB>DBBBBA@AAB?DCA@CB@ABB@BB=AAAA>@==>>6/>:>5:688/85A?AAA>>657==BBB<;;;9>>8>>BBBB> AS:i:-6 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:28T25T45 YT:Z:UU +SRR065390.31266674 0 CHROMOSOME_V 971 42 100M * 0 0 AATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTTACTTTGCACG CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCADDCCBC@CBC5<5<7?:83;+471/0<4=8;??BBD(.94;9?@?################ AS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:89G10 YT:Z:UU +SRR065390.23187971 16 CHROMOSOME_V 972 42 100M * 0 0 ATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGC 647:0BBB?B==@?@@BDBDBBBDDDBDDBDBDDDCBCCCCBBCCCCCDCCBCCCCCCCCCCCBBBCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.28661392 16 CHROMOSOME_V 975 42 100M * 0 0 TGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTAT ACCACAA5BDABAA>BDBDBDCBCBA@DBDB>DBBBBBAABDBDBDDBCCCCDCCCCCDCCCCCCCCCCCCCCBBBCCCCCCCCCCDCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.1859967 0 CHROMOSOME_V 979 42 100M * 0 0 AAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGCAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCA>=>C<@@;:@A@A=53@?AB::?@CCACC=B/<;53;7BB:>B=::=A@?@?ACCC>C@CCCCCB:/&-7735@B7B>B?;@@CC@35A@@CCBC@######################### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.12435485 16 CHROMOSOME_V 981 42 100M * 0 0 ATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTTAA B<=BB>B@>>BBBD@>?DABBBBBDDDDDDDDCADCDCCDCCCDCDBCCCCCDCCCCCCCCCCCCCCBBBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.7485987 0 CHROMOSOME_V 983 42 100M * 0 0 
TATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGGTAAAA C@?C@CCCCCCCCCCCC@CCCCC@?C8CCC@BC?@CC############################################################### AS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:94T5 YT:Z:UU +SRR065390.17264189 0 CHROMOSOME_V 983 42 100M * 0 0 TATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACCCTATGGTTAAAA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC=C=/////=?5=;:@8???AA############################ AS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:87G12 YT:Z:UU +SRR065390.6356855 0 CHROMOSOME_V 986 42 100M * 0 0 AATTTAAAACTTAAACGAAGCTAAAATGTGGCTGGTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTTAAAAAAA CCBCCBCCCCCCCCCC@CC?@CCCCC@CCC>A=@.88/45+()/.=>2==BBCB659?9?'))10;9??############################### AS:i:-7 XN:i:0 XM:i:3 XO:i:0 XG:i:0 NM:i:3 MD:Z:34T63T0G0 YT:Z:UU +SRR065390.20107175 0 CHROMOSOME_V 989 42 100M * 0 0 TTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTTAAAAATGAAT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC@C@?./..):3872A=@=A<=:;=B>B>>87777@>&@9A@@@8:@>88 AS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:87T12 YT:Z:UU +SRR065390.6431660 16 CHROMOSOME_V 994 42 100M * 0 0 ACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTTAAAAATGAATGAATT AACBABABDC@@ADABBDDCDCDBCDDDCCDCDBCACCCBCCDCCCCCCCCBCC@@@CCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.4439503 16 CHROMOSOME_V 997 42 100M * 0 0 TAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTTAAAAATGAATGAATTCTT ###########B>:AAAAA@C@=;937<ACCC8@@@AABCC>@+/662BBBC?B>BBB?BBBB#################### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.11492188 0 CHROMOSOME_V 998 42 100M * 0 0 AAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTTAAAAATGAATGAATTCTTG 
CCCCCCCCCCCCCCCCDCCCCC>A@AAAAAACA??B@@BBD>BACACC08;;AAACB==/*/1//:=@99BBABA@;<@;<:9>>B??>B??:?6B??B9 AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.9605367 0 CHROMOSOME_V 999 42 100M * 0 0 AACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTTAAAAATGAATGAATTCTTGG CCCCCCCCCCCCCCCDCCCCC@C<>>A9<4=9>=B###################### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.29302896 0 CHROMOSOME_V 1000 42 100M * 0 0 ACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTTAAAAATGAATGAATTCTTGGT CCCCCCCCCCCCCCDCCCCC6?:??AABCCCC8?C@BCCCC@@5;><9>>>B>>AB=<)6=4:):9>>@@################ AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.13754 4 * 0 0 * * 0 0 TCGCTGCTGTGATGTTGCGTTTTTATCAGCACAAAGGCGGTCAGGCCGAGGCCTATTTTTTCCGGATCCAGCAGGGCGACTTTGCCGATAAGGATACCGT CCCCCCCCCCCCCCCCCCCCCCCCCBBCCCCCCC@CCCCCDCCCCCCCDCCCC@ACCCC@>>CCD?>>>@@@ YT:Z:UU +SRR065390.13765 4 * 0 0 * * 0 0 CGTGGTCGTGCCGGTTACAAGCCTGCCGTGAAAAGCCGTTTCAGTAAGTCAGCCAATAGCAAATTCTCCCATACTATCGCTTTTGCCTGATCCTGAACTT CCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCDCCACCCCCCCC@CCCCCCCCCACCCBB@?CBDABDDADB<=ABBB@B@BB@ YT:Z:UU +SRR065390.13778 4 * 0 0 * * 0 0 TTTTATACCAACAAAAAACGGAAAGCAGATAACCCAGCAGCCCGAGTAACAGTATCCGGGCATCCAGGCCAAAAGCTAACAGAGCCGCGATAAAATCCCA CCCCCCCCCCCCCCBBBBBCCCCCCCBCCCCCCCCCBCCCCCCCCCDCCCCCACCCCCCCCCDCCCCDCBCA@AC>@=@CC?B>CBBCC>=?8A8=?>66 YT:Z:UU +SRR065390.13779 4 * 0 0 * * 0 0 ATAATGGACAACTTTAATGGCAATCACTAAATCAACTCCGGCACCATTAACCGGTGGGACGTTATGGTGCGTCACTATTGCATTGTCATTAGCGACATTT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCC=CC?BCCBCCACBABCCDCDBDADBBCDCBD>DBBD==BAA:>5<> YT:Z:UU +SRR065390.13802 4 * 0 0 * * 0 0 AAGGCGTTTATTATATACACTCGCATGGCTTTTCTTCTGAAAATGTAGAATAATTGAGTAATTTTTAAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAG ??B:??????BBB>B99;;;>B>>>:BBBBB;;B=70///0-/01/BA>BABB>B>BBB@BB>>ABABA>BA>6BB88183,<8;<4>:@><>A>> YT:Z:UU +SRR065390.13808 4 * 0 0 * * 0 0 
GTTTGCTGACTGGCCAGCCAGCTCAAGGCATCAAAAGCGTCTTTGAGAGGGAAAGGAATATCGATAACCCGAATGCCGGGTGGCGCCGGAATCTCTTGCG CCCCBCCCBCCC@CCC@CCCAA:A=BB?BBBB@>B>A#################### YT:Z:UU +SRR065390.13853 4 * 0 0 * * 0 0 CTGGTACGTCACCACACGCCGCGATGGCGTCATCCACCGACTTCACCCACGTTACGCGATCGTCCGTACCCGGGTGACCGTTGGGGATAATATTTTTGCG #################################################################################################### YT:Z:UU +SRR065390.13861 4 * 0 0 * * 0 0 TTCAGAAACTGGATGAACAGTGCGCAGCCATCTGCAAATATGAATTAGTTCAAGTCACTCAAAAGCTATTTATTTGAATGGAAGAAATTTTTGAACTATA CCCCCCCCCCCBCCB@@CCCBCCBCCCCCCCCCCCCC?CCCC@CCC@C@CCC@CACCBCC?BBBC@C7CBCBCB@@ABCCBBBC=BABCCBBBBAB@@CA YT:Z:UU +SRR065390.13907 4 * 0 0 * * 0 0 CATTACCATTCAGTTGTATTGTTTGCGCACCAGAAAAATGAGACTGCACAGAATAAATTATACTGACCAGAAATTGTAAAATTCGTATATTCTTATTCAT 8998;9:;9;>9:9>?BABBAAA2A@@@@@>:3'3A################################################################ YT:Z:UU +SRR065390.13946 4 * 0 0 * * 0 0 TTTCCTCGAGTTCTTGATGAAATGGTCCATTATTTGTCAACCATTTATTTTTCCATATTTTTTCCAGGTAAGGCATGAATTCTGCAAGTTCCGGCAAAGA CCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCCBBBBCACCCCCCB@BCCCCBC?CCCCACCBCBCCCCC@BBDCDDBCA4@@A YT:Z:UU +SRR065390.13956 4 * 0 0 * * 0 0 CGGCGCAACAATACTCAGCAGTTAATTGCAAAGGTATCGCACACCATTAAAAGCATTAAGCCGGGAGTCGAATTTGGTGTTAGCCCGGCAGGCGTGTGGC CCCCCCCCCCCBCCCCCCCCCDCCCCCCCCCCCC>CCCCCCCCCCCDCCCCBDCCDDCBDCC@?@BA@B@B>BBABAABB6?BB>B@?B??2?=+>->60 YT:Z:UU +SRR065390.13964 4 * 0 0 * * 0 0 NTTGAGGTGCTCCAGTGGCTTCTGTTTCTATCAGCTGTCCCTCCTGTTCAGCTACTGACGGGGGGGTGCGCAACGGCAAAAGCACCGCCGGGCATCAGCG !))))++++*AAAAA8AAAA################################################################################ YT:Z:UU +SRR065390.13969 4 * 0 0 * * 0 0 CGGGCGATAGTCAAAAACTTATTTTCACAATTTTCGGCTAGGGAGTATATTTACAGTTAATTTGCGATGTGTTAGATCGGAAGAGCGGTTCAGCAGGAAT CCCCCCCCCCCCCCCCCCCCCCCCBBCCCCCCCCCCCCCCCCCCCACCCCCCCCCCDCCCACCCCCDCCACBCCCCCCBCBBCDCBCC?BCBBCBCBC;A YT:Z:UU +SRR065390.13978 4 * 0 0 * * 0 0 
AGACGGTAACTTTCAATTTGCACCCATGATTAAATTTTATGTTGATTAAAATAGAAGCAAAAATCATTACATTACACTACAAAATACGCCGAAATGTTAA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBACCCDCBCCCABCADCCAABC? YT:Z:UU +SRR065390.13985 4 * 0 0 * * 0 0 TAACCAAAAACTGGATTATGCAAATAACTAAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATATCGTATGCCGTCTTCTGCTTGAAAAAAAAA CCCCCCCCCCCCCC?CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCBCCDCDCCD@CBD5>@=:=><9A@3>=B?BB>CBACB?BBA YT:Z:UU +SRR065390.14000 4 * 0 0 * * 0 0 TAGGTGAGAAAAGCGTTATTGGTCCGGTATACCTGCGAAGCGACAAAGCAATAAGGCAACAATGGCAGGTAATGCTGCTCAAAAAAGCGTTTACTGATCC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCACCCCCCBADCCCB?@B>B@BADAAABBD@C5;B9?:?;ACABAB YT:Z:UU +SRR065390.14032 4 * 0 0 * * 0 0 GAAGGTCCAAGTGCCTTGAAGATAGAAAATTATAGCATTTCTCTTTAATTTCAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATATCGCATGA CCCACCCCCC>AAAACBCCCCCCCBCC?CCCCCCCCCCCCCCCCCCCCCCCB@CCCCCCBAC@CDCA/@B<;8=?@B>BC>?>?BB=:A########### YT:Z:UU +SRR065390.14061 4 * 0 0 * * 0 0 TGAAGCCGACAATTTGAGGCCAAACATCTTACATTCGACAGTAAATATTTGGGGATTAAGACTTATGTTAGATCGGAAGAGCGGTTCAGCAGGAATGCCG CCCCCCCCCCCCCCCCCCCCCCCCCCC=CCCCCCCCCCADCBCCCC=CCCCCCCBCCC=CBCCCCCCCABCCCCCCBACBC@CCBB;@B;?A@A@=?99A YT:Z:UU +SRR065390.14072 4 * 0 0 * * 0 0 TGAGTGAGGCTCAGGATTTTGAGTGAGGCTCAGGATTATGAGTGAGGTTGAAGAATTTGAGAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGAG @B=@@BB@B@<@BB>BB>>@BB@==2;:;8BBBBBB@B@@@:@?1B@B@B@3@@@>3;@;@<@?>;@B@@##### YT:Z:UU +SRR065390.14100 4 * 0 0 * * 0 0 AAGCCTGAGGGATAATTTTCGTCAAATTAAGGCAATTGCCGAGTGTTTCATCCCTGGCAAGCAGAACGGCTTTTTCGTTATTTATATCGGGAGAATTTAT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCBCDCDCDCCCCDBBCDADBBBABDDBBBBBBBBDCBCD?BABB>B>AB>BCABAA>C YT:Z:UU +SRR065390.14105 4 * 0 0 * * 0 0 AAATTGTCCCCAAATAAAACAATTCCAGTGATCTTCCGATTCTAGGTGCCAAATAACCCAAATAGTCACTGCATTAGTTTTTATCTCACTTTTCTCCCCC #################################################################################################### YT:Z:UU +SRR065390.14107 4 * 0 0 * * 0 0 
TGAAATTTCAAGAAAAATGTTAATTACCACCGTATTAAAAAAAAAAAACTTAAAATCAAAGATCGGAAAAGGGGTCAGGCAGGAATGCCAAAACCGACAC CCCCBCCCCCCCBCB>>>ACCCBCACCC?CCCCCCCCCCC?B########################################################## YT:Z:UU +SRR065390.14137 4 * 0 0 * * 0 0 CTGTGGCGTTTTTATCAAATTGGCAGAGCCACGTTCAGAGCTGAAAAAGCCACAGAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTTGGA BCCC@CCCCC?CCC>>CCBCCC?>C@CCCB=6?AA>=>3?>@?@86;86.@A@==378::68829>B9B############################### YT:Z:UU +SRR065390.14141 4 * 0 0 * * 0 0 GGTCACCAATCATAAGAGGAACAGCGACTGCACCTGCGTACATGACAAGGACGTGTTGCAGACCGAGTATGATCAGCTTTCCTGGTGATAGTATGCGCTC AAA@A?AA8:>A######################################################################################## YT:Z:UU +SRR065390.14162 4 * 0 0 * * 0 0 ATACTTCACCGGATGGTGGAATTAACGAAAACAACAACTGGTGTCACATCCCGCAGGCAAAAGAGGCAGCGGCTAACTAAGCGGCCTGCTGACTTTCTCG CCCCCCCCCCCCCCCC@CCCCCCCCCCCCCCCCBBCCCCCCDCDCCCCCCCCCC?C?CCCCCACD@CAD@AB<>@CB;6B#################### YT:Z:UU +SRR065390.14168 4 * 0 0 * * 0 0 TCGAGGGTGAGGGCGTCTGCCAGATCGGAAGAGCGGGTCAGCAGGAATGCCGAGACCGATATCGGATGCCGTCTCCTGCTGGACAAAAAATGAGAATGGG AACC@0@>@6:<>??>?BBBBB?+B6BBB>B?B=:?BBB=BBBBB>B######################################## YT:Z:UU +SRR065390.14173 4 * 0 0 * * 0 0 AAGAAACTCAACAAACCGGACTTGCAGGTGAAACTGATTCCGATTACCTCACAAAACCGTATTCCACTGCTGCAAAACGGCACTTTCGATTTTGAATGTG CCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC=@CCCCCCCCAA?ABC@CCA=CCCABDCCAABDD?BB@BBA YT:Z:UU +SRR065390.14178 4 * 0 0 * * 0 0 GCGCTTTGTTTACCTGATACGGAATTTCGTGGACGATAATGGTTTCACGACCGGTTTTGGCGTCAACTTCCACTTCTGCGCGAGCGCGGATATACACCTT BCCCCCCCCC@@CCCCCCCCCCCCCCCCCCA=AA>AAA=ACCBCCCCC?CCCCAB@ACCC?A?<CB?=CAB9B@BA################### YT:Z:UU +SRR065390.14182 4 * 0 0 * * 0 0 ATTTACTCTAATGTTCTGAAAAATAATTTACTCTAATGTTCTGCCAAATAATTTACTCTAACGTTCTGCCAAATAATTTACTCTAATGTTCTGCCAAATA CCCCCCCCCBCCCCCCCBCC@CCB@@@BCCCCCCC@CCCCCCBBCCCCCCBCCCC@CCC?CCC>>CCBCCCCA@CCCC;CBCCBDCCB@CCBCAACB@BB;B?B0B=8??9>??BB>B?@?B>A>A########## YT:Z:UU +SRR065390.14197 4 * 0 0 * * 0 0 
GTACCTCGCCGTTGTTCTCGACCTGTTCGCAAGAAAACCAGTGGGCTGGGCCATGTCGTTCTCGCCGGACAGCAGGCTCACCATGCAAGCGCTGGAAATG CCCCCCCCCCCCCDCBB=B@?BB@BBBBBB@@@B@B==BBB9B@@@@B@=BBB@BBB=@BBABBB@@@BB<@BA@BBB=B;B?BBACA YT:Z:UU +SRR065390.14284 4 * 0 0 * * 0 0 CGGTGCATGATGCGGATTCCAGGAATCAACGTACAGCGTCGGGCTAAACCAGAACCAGCCAATAATGCACAGACCGACGACCGGAATAATAACCCCCCAC BCACCCCCCCCBCCBCCCCCCCBCC@CCCCCCCAC@CBACCACCCCC@CBCCCCCCCA8CC?A@9@AB@9CACC8=81B@CC9CCCCCCC,<8??CBC@BB?@C@ACBCB################################################## YT:Z:UU +SRR065390.14312 4 * 0 0 * * 0 0 ACAGTAACATTCAACGTTAAATATGTTAATAAGACGTTGCATTATTGTCCTGAAGTTGAAGATAGCAGGTATGGCGGTTGGATAGCACGGCGTTGGTTTA CCCCCCCCCCCCCCCCCCCCBCCCCCCBCCCCCCCCCCCCCCCCCCCBCCCCCDCCCCCCCCCACCBC@?CACBC######################### YT:Z:UU +SRR065390.14331 4 * 0 0 * * 0 0 GAATAATGAAGATGATGCGACGCGTCTGGCGCGTTTGAACGAACGCTTTAAACGCGAAGGTAAACCGGAGTTGAAGAAGCTGGATGATCTACCTAAAGAT CCCCCCCCCCCCCCBCCBCCCCCCBCCCC@CCCB@CCCCACCDDCBCC?CAC@B@DABA?BAB@@@?C?C@BC?9A::>=@@C;?############### YT:Z:UU +SRR065390.14335 4 * 0 0 * * 0 0 TCCATTTGATGAACCTGAAGTTTAAGTATTGACTTGAGAGGAAAAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCT CCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCC@BBA>CCCCCCCCBBCCBC=CBCCCB################### YT:Z:UU +SRR065390.14342 4 * 0 0 * * 0 0 AAGTTCATGAATTAAAGCCGACTCAAACACTCTGTTTAAAAACTGGATAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGAGATCGTAGGCCGTC 0000079;9;AAAA?;;;>9>3>9BB8BBBB@############################ YT:Z:UU +SRR065390.14359 4 * 0 0 * * 0 0 GCATCAGTACGATAAAACGCGTACCGAACTACTGAATGATGTCGCAGGGGCGCTGGCTCTTGATGACAAACTCGGACGTAGCACCAATCAACTTTCCGGC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCBABACCBB@BBBBBB>BDBBDBBB>B@@@>>?BCBAC?CBC?> YT:Z:UU +SRR065390.14364 4 * 0 0 * * 0 0 GGTCGCCGATCCGATTTGCACTTTAACCACTTTCGGTAAAGAAACCGTTGTTAGTGAAAGCGAAAAACGCACAACGACCACTGATGACCCGCTACAGGTG CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC@@CCCCCCCCCBCCCCCCCCCACCDACCBCACA@CACCAA=B=CBC=ACAAAC=)>? 
YT:Z:UU +SRR065390.14392 4 * 0 0 * * 0 0 GTTATCCTTTTCCGTGATATGTGCGGTACTGCAGCGTATGCCGGCAAGGGTTGCAAACGGTGGTAGTGTGCAGGTTGACTGTTGGTCGGATTCCTCCACC CCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCDCCCCCCCCCBCCCA@CCB@AACB?CB?BA=A8@BACB?:===@C@-A6==?@@<@@AA##### YT:Z:UU +SRR065390.14393 4 * 0 0 * * 0 0 AGAAATTTACTGGCTCGCCGCAGCCAACTCCTCTTCTGACACCCCGGTAAAGCGCATGATGTCTGTAAGAGGGGCCCCGGATTCAAGCATTATTTTGGCT CCCCCCCCCCCCCCCBA9::<4A>AAAA:?A#################################### YT:Z:UU +SRR065390.14434 4 * 0 0 * * 0 0 GGTAGATTCCCATAAAAATCGCCAGCGGAATGGTGAACGCAACGGTATACGTTCCCCACGGGCTATGAGTCAGGGCTTTCACCACGATCATCGCCAGTAC DCACCCBCCCCCCCC>CBBCCCCCCCCCCCCCC?CCCCCCCCCCCACACCC@BCCCCBCD=ABB@BCBD?@@B6BC8B@B>BABCBB@AB=@2C###### YT:Z:UU diff --git a/test/index.sam.gz.bai b/test/index.sam.gz.bai new file mode 100644 index 000000000..0d41e3cad Binary files /dev/null and b/test/index.sam.gz.bai differ diff --git a/test/index.sam.gz.csi b/test/index.sam.gz.csi new file mode 100644 index 000000000..2992ed1b8 Binary files /dev/null and b/test/index.sam.gz.csi differ diff --git a/test/index.vcf b/test/index.vcf new file mode 100644 index 000000000..b32e2d293 --- /dev/null +++ b/test/index.vcf @@ -0,0 +1,728 @@ +##fileformat=VCFv4.2 +##FILTER= +##bcftoolsVersion=1.8-31-g9ba4024+htslib-1.8-32-g6e87a1e-dirty +##bcftoolsCommand=mpileup --fasta-ref /nfs/srpipe_references/references/Human/1000Genomes_hs37d5/all/fasta/hs37d5.fa test/index.bam +##reference=file:///nfs/srpipe_references/references/Human/1000Genomes_hs37d5/all/fasta/hs37d5.fa +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##ALT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT ERS220911 +1 9999919 . G <*> 0 . DP=1;I16=1,0,0,0,26,676,0,0,60,3600,0,0,0,0,0,0;QS=1,0;MQ0F=0 PL 0,3,26 +1 9999920 . T <*> 0 . DP=1;I16=1,0,0,0,34,1156,0,0,60,3600,0,0,1,1,0,0;QS=1,0;MQ0F=0 PL 0,3,34 +1 9999921 . A <*> 0 . DP=1;I16=1,0,0,0,33,1089,0,0,60,3600,0,0,2,4,0,0;QS=1,0;MQ0F=0 PL 0,3,33 +1 9999922 . A <*> 0 . DP=1;I16=1,0,0,0,34,1156,0,0,60,3600,0,0,3,9,0,0;QS=1,0;MQ0F=0 PL 0,3,34 +1 9999923 . T <*> 0 . DP=1;I16=1,0,0,0,35,1225,0,0,60,3600,0,0,4,16,0,0;QS=1,0;MQ0F=0 PL 0,3,35 +1 9999924 . C <*> 0 . DP=1;I16=1,0,0,0,35,1225,0,0,60,3600,0,0,5,25,0,0;QS=1,0;MQ0F=0 PL 0,3,35 +1 9999925 . C <*> 0 . DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,6,36,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +1 9999926 . C <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,7,49,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 9999927 . A <*> 0 . DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,8,64,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +1 9999928 . G <*> 0 . DP=1;I16=1,0,0,0,34,1156,0,0,60,3600,0,0,9,81,0,0;QS=1,0;MQ0F=0 PL 0,3,34 +1 9999929 . C <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,10,100,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 9999930 . A <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,11,121,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 9999931 . C <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,12,144,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 9999932 . T <*> 0 . 
DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,13,169,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 9999933 . T <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,14,196,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 9999934 . T <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,15,225,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 9999935 . A <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,16,256,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 9999936 . G <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,17,289,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 9999937 . G <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,18,324,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 9999938 . A <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,19,361,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 9999939 . G <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,20,400,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 9999940 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,21,441,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 9999941 . C <*> 0 . DP=1;I16=1,0,0,0,41,1681,0,0,60,3600,0,0,22,484,0,0;QS=1,0;MQ0F=0 PL 0,3,41 +1 9999942 . T <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,23,529,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 9999943 . A <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,24,576,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 9999944 . A <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 9999945 . G <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 9999946 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 9999947 . C <*> 0 . DP=1;I16=1,0,0,0,35,1225,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,35 +1 9999948 . A <*> 0 . DP=1;I16=1,0,0,0,34,1156,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,34 +1 9999949 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 9999950 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 9999951 . C <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 9999952 . A <*> 0 . 
DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 9999953 . G <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 9999954 . A <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 9999955 . T <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 9999956 . C <*> 0 . DP=1;I16=1,0,0,0,41,1681,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,41 +1 9999957 . A <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 9999958 . C <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 9999959 . T <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 9999960 . T <*> 0 . DP=1;I16=1,0,0,0,35,1225,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,35 +1 9999961 . G <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 9999962 . A <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 9999963 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 9999964 . A <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 9999965 . C <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 9999966 . C <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 9999967 . A <*> 0 . DP=1;I16=1,0,0,0,41,1681,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,41 +1 9999968 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 9999969 . G <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 9999970 . A <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 9999971 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 9999972 . T <*> 0 . 
DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +1 9999973 . T <*> 0 . DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +1 9999974 . A <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 9999975 . C <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 9999976 . A <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 9999977 . G <*> 0 . DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +1 9999978 . A <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 9999979 . C <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 9999980 . C <*> 0 . DP=1;I16=1,0,0,0,33,1089,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,33 +1 9999981 . A <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 9999982 . G <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 9999983 . C <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 9999984 . C <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 9999985 . T <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 9999986 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 9999987 . G <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 9999988 . C <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 9999989 . C <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 9999990 . G <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 9999991 . A <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 9999992 . C <*> 0 . 
DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +1 9999993 . A <*> 0 . DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +1 9999994 . C <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,24,576,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 9999995 . G <*> 0 . DP=1;I16=1,0,0,0,33,1089,0,0,60,3600,0,0,23,529,0,0;QS=1,0;MQ0F=0 PL 0,3,33 +1 9999996 . G <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,22,484,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 9999997 . C <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,21,441,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 9999998 . G <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,20,400,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 9999999 . A <*> 0 . DP=1;I16=1,0,0,0,31,961,0,0,60,3600,0,0,19,361,0,0;QS=1,0;MQ0F=0 PL 0,3,31 +1 10000000 . A <*> 0 . DP=1;I16=1,0,0,0,33,1089,0,0,60,3600,0,0,18,324,0,0;QS=1,0;MQ0F=0 PL 0,3,33 +1 10000001 . A <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,17,289,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10000002 . C <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,16,256,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10000003 . C <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,15,225,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10000004 . C <*> 0 . DP=1;I16=1,0,0,0,29,841,0,0,60,3600,0,0,14,196,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000005 . C <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,13,169,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10000006 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,12,144,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10000007 . T <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,11,121,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10000008 . C <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,10,100,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10000009 . T <*> 0 . DP=1;I16=1,0,0,0,43,1849,0,0,60,3600,0,0,9,81,0,0;QS=1,0;MQ0F=0 PL 0,3,43 +1 10000010 . C <*> 0 . DP=2;I16=1,1,0,0,59,2105,0,0,89,4441,0,0,8,64,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,59 +1 10000011 . T <*> 0 . DP=2;I16=1,1,0,0,76,2888,0,0,89,4441,0,0,8,50,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,67 +1 10000012 . A <*> 0 . 
DP=2;I16=1,1,0,0,77,2965,0,0,89,4441,0,0,8,40,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,67 +1 10000013 . C <*> 0 . DP=2;I16=1,1,0,0,66,2250,0,0,89,4441,0,0,8,34,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,66 +1 10000014 . A <*> 0 . DP=2;I16=1,1,0,0,67,2285,0,0,89,4441,0,0,8,32,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,67 +1 10000015 . A <*> 0 . DP=2;I16=1,1,0,0,69,2385,0,0,89,4441,0,0,8,34,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,65 +1 10000016 . T <*> 0 . DP=2;I16=1,1,0,0,75,2817,0,0,89,4441,0,0,8,40,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,68 +1 10000017 . A <*> 0 . DP=2;I16=1,1,0,0,67,2285,0,0,89,4441,0,0,8,50,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,58 +1 10000018 . A <*> 0 . DP=2;I16=1,1,0,0,64,2120,0,0,89,4441,0,0,8,64,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,55 +1 10000019 . A <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,9,81,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000020 . T <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,10,100,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000021 . T <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,11,121,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000022 . A <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,12,144,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000023 . A <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,13,169,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000024 . A <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,14,196,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000025 . A <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,15,225,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000026 . T <*> 0 . DP=1;I16=0,1,0,0,29,841,0,0,29,841,0,0,16,256,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000027 . A <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,29,841,0,0,17,289,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000028 . T <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,18,324,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000029 . T <*> 0 . DP=1;I16=0,1,0,0,41,1681,0,0,29,841,0,0,19,361,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000030 . A <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,20,400,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000031 . G <*> 0 . 
DP=1;I16=0,1,0,0,40,1600,0,0,29,841,0,0,21,441,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000032 . C <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,29,841,0,0,22,484,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000033 . T <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,23,529,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000034 . G <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,24,576,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000035 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000036 . G <*> 0 . DP=1;I16=0,1,0,0,42,1764,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000037 . C <*> 0 . DP=1;I16=0,1,0,0,34,1156,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000038 . A <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000039 . T <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000040 . G <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000041 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000042 . T <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000043 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000044 . G <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000045 . T <*> 0 . DP=1;I16=0,1,0,0,42,1764,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000046 . G <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000047 . T <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000048 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000049 . T <*> 0 . DP=1;I16=0,1,0,0,41,1681,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000050 . G <*> 0 . DP=1;I16=0,1,0,0,31,961,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000051 . C <*> 0 . 
DP=1;I16=0,1,0,0,16,256,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,16 +1 10000052 . T <*> 0 . DP=1;I16=0,1,0,0,31,961,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000053 . T <*> 0 . DP=1;I16=0,1,0,0,35,1225,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000054 . G <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000055 . T <*> 0 . DP=1;I16=0,1,0,0,33,1089,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000056 . A <*> 0 . DP=1;I16=0,1,0,0,22,484,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,22 +1 10000057 . G <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000058 . T <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000059 . C <*> 0 . DP=1;I16=0,1,0,0,34,1156,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000060 . C <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000061 . C <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000062 . A <*> 0 . DP=1;I16=0,1,0,0,34,1156,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000063 . G <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000064 . C <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000065 . T <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000066 . A <*> 0 . DP=1;I16=0,1,0,0,32,1024,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000067 . C <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000068 . T <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000069 . T <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000070 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000071 . G <*> 0 . 
DP=1;I16=0,1,0,0,33,1089,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000072 . C <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000073 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000074 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000075 . G <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000076 . C <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000077 . T <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000078 . G <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000079 . A <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000080 . G <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000081 . G <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000082 . T <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000083 . G <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000084 . G <*> 0 . DP=1;I16=0,1,0,0,41,1681,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000085 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,24,576,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000086 . A <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,23,529,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000087 . G <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,29,841,0,0,22,484,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000088 . A <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,21,441,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000089 . A <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,20,400,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000090 . T <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,19,361,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000091 . C <*> 0 . 
DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,18,324,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000092 . A <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,17,289,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000093 . T <*> 0 . DP=1;I16=0,1,0,0,33,1089,0,0,29,841,0,0,16,256,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000094 . C <*> 0 . DP=1;I16=0,1,0,0,41,1681,0,0,29,841,0,0,15,225,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000095 . C <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,14,196,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000096 . A <*> 0 . DP=1;I16=0,1,0,0,41,1681,0,0,29,841,0,0,13,169,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000097 . A <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,12,144,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000098 . G <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,11,121,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000099 . C <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,10,100,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000100 . C <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,9,81,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000101 . T <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,8,64,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000102 . T <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,7,49,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000103 . G <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,6,36,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000104 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,5,25,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000105 . A <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,4,16,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000106 . G <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,3,9,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000107 . G <*> 0 . DP=1;I16=0,1,0,0,33,1089,0,0,29,841,0,0,2,4,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000108 . C <*> 0 . DP=1;I16=0,1,0,0,32,1024,0,0,29,841,0,0,1,1,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10000109 . A <*> 0 . DP=1;I16=0,1,0,0,35,1225,0,0,29,841,0,0,0,0,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +2 4999907 . C <*> 0 . DP=1;I16=1,0,0,0,33,1089,0,0,60,3600,0,0,0,0,0,0;QS=1,0;MQ0F=0 PL 0,3,33 +2 4999908 . C <*> 0 . 
DP=1;I16=1,0,0,0,32,1024,0,0,60,3600,0,0,1,1,0,0;QS=1,0;MQ0F=0 PL 0,3,32 +2 4999909 . A <*> 0 . DP=1;I16=1,0,0,0,33,1089,0,0,60,3600,0,0,2,4,0,0;QS=1,0;MQ0F=0 PL 0,3,33 +2 4999910 . G <*> 0 . DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,3,9,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +2 4999911 . G <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,4,16,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +2 4999912 . A <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,5,25,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +2 4999913 . C <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,6,36,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +2 4999914 . A <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,7,49,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +2 4999915 . T <*> 0 . DP=1;I16=1,0,0,0,41,1681,0,0,60,3600,0,0,8,64,0,0;QS=1,0;MQ0F=0 PL 0,3,41 +2 4999916 . C <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,9,81,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +2 4999917 . C <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,10,100,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +2 4999918 . A <*> 0 . DP=1;I16=1,0,0,0,35,1225,0,0,60,3600,0,0,11,121,0,0;QS=1,0;MQ0F=0 PL 0,3,35 +2 4999919 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,12,144,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +2 4999920 . G <*> 0 . DP=1;I16=1,0,0,0,41,1681,0,0,60,3600,0,0,13,169,0,0;QS=1,0;MQ0F=0 PL 0,3,41 +2 4999921 . C <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,14,196,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +2 4999922 . T <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,15,225,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 4999923 . T <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,16,256,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +2 4999924 . T <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,17,289,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +2 4999925 . T <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,18,324,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +2 4999926 . C <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,19,361,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +2 4999927 . T <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,20,400,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +2 4999928 . G <*> 0 . 
DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,21,441,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +2 4999929 . T <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,22,484,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +2 4999930 . A <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,23,529,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 4999931 . A <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,24,576,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 4999932 . A <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 4999933 . T <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 4999934 . C <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 4999935 . T <*> 0 . DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +2 4999936 . T <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 4999937 . C <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 4999938 . T <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 4999939 . G <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 4999940 . A <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +2 4999941 . A <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 4999942 . A <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +2 4999943 . T <*> 0 . DP=1;I16=1,0,0,0,41,1681,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,41 +2 4999944 . C <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +2 4999945 . A <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 4999946 . A <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 4999947 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +2 4999948 . A <*> 0 . 
DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 4999949 . C <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 4999950 . A <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +2 4999951 . G <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +2 4999952 . A <*> 0 . DP=2;I16=1,1,0,0,64,2120,0,0,120,7200,0,0,25,625,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,64 +2 4999953 . G <*> 0 . DP=2;I16=1,1,0,0,60,1962,0,0,120,7200,0,0,26,626,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,60 +2 4999954 . T <*> 0 . DP=2;I16=1,1,0,0,67,2257,0,0,120,7200,0,0,27,629,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,67 +2 4999955 . C <*> 0 . DP=2;I16=1,1,0,0,57,1805,0,0,120,7200,0,0,28,634,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,57 +2 4999956 . T <*> 0 . DP=2;I16=1,1,0,0,77,2969,0,0,120,7200,0,0,29,641,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,77 +2 4999957 . C <*> 0 . DP=2;I16=1,1,0,0,60,2000,0,0,120,7200,0,0,30,650,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,60 +2 4999958 . A <*> 0 . DP=2;I16=1,1,0,0,71,2561,0,0,120,7200,0,0,31,661,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,71 +2 4999959 . C <*> 0 . DP=2;I16=1,1,0,0,77,2965,0,0,120,7200,0,0,32,674,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,77 +2 4999960 . A <*> 0 . DP=2;I16=1,1,0,0,71,2561,0,0,120,7200,0,0,33,689,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,71 +2 4999961 . A <*> 0 . DP=2;I16=1,1,0,0,71,2521,0,0,120,7200,0,0,34,706,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,71 +2 4999962 . G <*> 0 . DP=2;I16=1,1,0,0,79,3121,0,0,120,7200,0,0,35,725,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,79 +2 4999963 . C <*> 0 . DP=2;I16=1,1,0,0,70,2452,0,0,120,7200,0,0,36,746,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,70 +2 4999964 . C <*> 0 . DP=2;I16=1,1,0,0,78,3042,0,0,120,7200,0,0,37,769,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,78 +2 4999965 . T <*> 0 . DP=2;I16=1,1,0,0,79,3121,0,0,120,7200,0,0,38,794,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,79 +2 4999966 . T <*> 0 . DP=2;I16=1,1,0,0,76,2888,0,0,120,7200,0,0,39,821,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,76 +2 4999967 . 
A <*> 0 . DP=2;I16=1,1,0,0,79,3125,0,0,120,7200,0,0,40,850,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,79 +2 4999968 . A <*> 0 . DP=2;I16=1,1,0,0,73,2669,0,0,120,7200,0,0,41,881,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,73 +2 4999969 . T <*> 0 . DP=2;I16=1,1,0,0,73,2669,0,0,120,7200,0,0,42,914,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,73 +2 4999970 . T <*> 0 . DP=2;I16=1,1,0,0,78,3042,0,0,120,7200,0,0,43,949,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,78 +2 4999971 . C <*> 0 . DP=2;I16=1,1,0,0,80,3208,0,0,120,7200,0,0,44,986,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,80 +2 4999972 . T <*> 0 . DP=2;I16=1,1,0,0,75,2817,0,0,120,7200,0,0,45,1025,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,75 +2 4999973 . T <*> 0 . DP=2;I16=1,1,0,0,78,3044,0,0,120,7200,0,0,46,1066,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,78 +2 4999974 . G <*> 0 . DP=2;I16=1,1,0,0,76,2888,0,0,120,7200,0,0,47,1109,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,76 +2 4999975 . C <*> 0 . DP=2;I16=1,1,0,0,77,2965,0,0,120,7200,0,0,48,1154,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,77 +2 4999976 . A <*> 0 . DP=2;I16=1,1,0,0,72,2594,0,0,120,7200,0,0,49,1201,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,72 +2 4999977 . T <*> 0 . DP=2;I16=1,1,0,0,77,2965,0,0,120,7200,0,0,50,1250,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,77 +2 4999978 . T <*> 0 . DP=2;I16=1,1,0,0,80,3200,0,0,120,7200,0,0,50,1250,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,80 +2 4999979 . C <*> 0 . DP=2;I16=1,1,0,0,78,3044,0,0,120,7200,0,0,50,1250,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,78 +2 4999980 . T <*> 0 . DP=2;I16=1,1,0,0,81,3281,0,0,120,7200,0,0,50,1250,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,81 +2 4999981 . G <*> 0 . DP=2;I16=1,1,0,0,75,2813,0,0,120,7200,0,0,50,1250,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,75 +2 4999982 . T <*> 0 . DP=2;I16=1,1,0,0,72,2592,0,0,120,7200,0,0,49,1201,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,72 +2 4999983 . G <*> 0 . DP=2;I16=1,1,0,0,75,2813,0,0,120,7200,0,0,48,1154,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,75 +2 4999984 . C <*> 0 . DP=2;I16=1,1,0,0,76,2888,0,0,120,7200,0,0,47,1109,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,76 +2 4999985 . A <*> 0 . 
DP=2;I16=1,1,0,0,76,2890,0,0,120,7200,0,0,46,1066,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,76 +2 4999986 . C <*> 0 . DP=2;I16=1,1,0,0,76,2890,0,0,120,7200,0,0,45,1025,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,76 +2 4999987 . C <*> 0 . DP=2;I16=1,1,0,0,81,3281,0,0,120,7200,0,0,44,986,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,81 +2 4999988 . T <*> 0 . DP=2;I16=1,1,0,0,77,2965,0,0,120,7200,0,0,43,949,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,77 +2 4999989 . G <*> 0 . DP=2;I16=1,1,0,0,78,3044,0,0,120,7200,0,0,42,914,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,78 +2 4999990 . C <*> 0 . DP=2;I16=1,1,0,0,77,2965,0,0,120,7200,0,0,41,881,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,77 +2 4999991 . A <*> 0 . DP=2;I16=1,1,0,0,82,3362,0,0,120,7200,0,0,40,850,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,82 +2 4999992 . G <*> 0 . DP=2;I16=1,1,0,0,77,2965,0,0,120,7200,0,0,39,821,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,77 +2 4999993 . G <*> 0 . DP=2;I16=1,1,0,0,77,2969,0,0,120,7200,0,0,38,794,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,77 +2 4999994 . T <*> 0 . DP=2;I16=1,1,0,0,76,2890,0,0,120,7200,0,0,37,769,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,76 +2 4999995 . T <*> 0 . DP=2;I16=1,1,0,0,74,2738,0,0,120,7200,0,0,36,746,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,74 +2 4999996 . T <*> 0 . DP=2;I16=1,1,0,0,79,3121,0,0,120,7200,0,0,35,725,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,79 +2 4999997 . A <*> 0 . DP=2;I16=1,1,0,0,76,2888,0,0,120,7200,0,0,34,706,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,76 +2 4999998 . A <*> 0 . DP=2;I16=1,1,0,0,75,2813,0,0,120,7200,0,0,33,689,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,75 +2 4999999 . T C,<*> 0 . DP=2;I16=0,1,1,0,39,1521,38,1444,60,3600,60,3600,25,625,7,49;QS=0.506494,0.493506,0;SGB=-0.379885;RPB=1;MQB=1;MQSB=1;BQB=1;MQ0F=0 PL 32,0,33,35,36,68 +2 5000000 . A <*> 0 . DP=2;I16=1,1,0,0,76,2890,0,0,120,7200,0,0,31,661,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,76 +2 5000001 . T <*> 0 . DP=2;I16=1,1,0,0,76,2896,0,0,120,7200,0,0,30,650,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,76 +2 5000002 . T <*> 0 . DP=2;I16=1,1,0,0,77,2965,0,0,120,7200,0,0,29,641,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,77 +2 5000003 . A <*> 0 . 
DP=2;I16=1,1,0,0,77,2965,0,0,120,7200,0,0,28,634,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,77 +2 5000004 . C <*> 0 . DP=2;I16=1,1,0,0,79,3121,0,0,120,7200,0,0,27,629,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,79 +2 5000005 . A <*> 0 . DP=2;I16=1,1,0,0,71,2525,0,0,120,7200,0,0,26,626,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,71 +2 5000006 . T <*> 0 . DP=2;I16=1,1,0,0,70,2458,0,0,120,7200,0,0,25,625,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,70 +2 5000007 . G <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 5000008 . G <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +2 5000009 . A <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 5000010 . A <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 5000011 . G <*> 0 . DP=1;I16=0,1,0,0,35,1225,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,35 +2 5000012 . C <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 5000013 . C <*> 0 . DP=1;I16=0,1,0,0,33,1089,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,33 +2 5000014 . A <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 5000015 . A <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +2 5000016 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +2 5000017 . G <*> 0 . DP=1;I16=0,1,0,0,41,1681,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,41 +2 5000018 . C <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +2 5000019 . T <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +2 5000020 . T <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 5000021 . A <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 5000022 . A <*> 0 . 
DP=1;I16=0,1,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 5000023 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +2 5000024 . G <*> 0 . DP=1;I16=0,1,0,0,35,1225,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,35 +2 5000025 . C <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 5000026 . T <*> 0 . DP=2;I16=1,1,0,0,67,2357,0,0,120,7200,0,0,25,625,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,67 +2 5000027 . T <*> 0 . DP=2;I16=1,1,0,0,72,2600,0,0,120,7200,0,0,25,577,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,72 +2 5000028 . G <*> 0 . DP=2;I16=1,1,0,0,72,2610,0,0,120,7200,0,0,25,533,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,72 +2 5000029 . T <*> 0 . DP=2;I16=1,1,0,0,75,2825,0,0,120,7200,0,0,25,493,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,75 +2 5000030 . G <*> 0 . DP=2;I16=1,1,0,0,72,2594,0,0,120,7200,0,0,25,457,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,72 +2 5000031 . C <*> 0 . DP=2;I16=1,1,0,0,75,2813,0,0,120,7200,0,0,25,425,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,75 +2 5000032 . T <*> 0 . DP=2;I16=1,1,0,0,71,2525,0,0,120,7200,0,0,25,397,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,71 +2 5000033 . C <*> 0 . DP=2;I16=1,1,0,0,74,2738,0,0,120,7200,0,0,25,373,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,74 +2 5000034 . T <*> 0 . DP=2;I16=1,1,0,0,75,2813,0,0,120,7200,0,0,25,353,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,75 +2 5000035 . C <*> 0 . DP=2;I16=1,1,0,0,77,2965,0,0,120,7200,0,0,25,337,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,77 +2 5000036 . C <*> 0 . DP=2;I16=1,1,0,0,70,2452,0,0,120,7200,0,0,25,325,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,70 +2 5000037 . A <*> 0 . DP=2;I16=1,1,0,0,74,2740,0,0,120,7200,0,0,25,317,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,74 +2 5000038 . A <*> 0 . DP=2;I16=1,1,0,0,84,3530,0,0,120,7200,0,0,25,313,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,84 +2 5000039 . A <*> 0 . DP=2;I16=1,1,0,0,76,2888,0,0,120,7200,0,0,25,313,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,76 +2 5000040 . G <*> 0 . DP=2;I16=1,1,0,0,76,2888,0,0,120,7200,0,0,25,317,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,76 +2 5000041 . 
C <*> 0 . DP=2;I16=1,1,0,0,77,2965,0,0,120,7200,0,0,25,325,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,77 +2 5000042 . A <*> 0 . DP=2;I16=1,1,0,0,73,2665,0,0,120,7200,0,0,25,337,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,73 +2 5000043 . G <*> 0 . DP=2;I16=1,1,0,0,75,2813,0,0,120,7200,0,0,25,353,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,75 +2 5000044 . C <*> 0 . DP=2;I16=1,1,0,0,77,2969,0,0,120,7200,0,0,25,373,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,77 +2 5000045 . A <*> 0 . DP=2;I16=1,1,0,0,72,2594,0,0,120,7200,0,0,25,397,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,72 +2 5000046 . G <*> 0 . DP=2;I16=1,1,0,0,75,2813,0,0,120,7200,0,0,25,425,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,75 +2 5000047 . T <*> 0 . DP=2;I16=1,1,0,0,73,2665,0,0,120,7200,0,0,25,457,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,73 +2 5000048 . G <*> 0 . DP=2;I16=1,1,0,0,73,2665,0,0,120,7200,0,0,25,493,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,73 +2 5000049 . A <*> 0 . DP=2;I16=1,1,0,0,73,2689,0,0,120,7200,0,0,25,533,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,73 +2 5000050 . T <*> 0 . DP=2;I16=1,1,0,0,71,2545,0,0,120,7200,0,0,25,577,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,71 +2 5000051 . A <*> 0 . DP=2;I16=1,1,0,0,72,2610,0,0,120,7200,0,0,25,625,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,72 +2 5000052 . T <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +2 5000053 . G <*> 0 . DP=1;I16=1,0,0,0,34,1156,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,34 +2 5000054 . G <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +2 5000055 . T <*> 0 . DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +2 5000056 . T <*> 0 . DP=1;I16=1,0,0,0,32,1024,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,32 +2 5000057 . T <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +2 5000058 . G <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +2 5000059 . A <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 5000060 . C <*> 0 . 
DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +2 5000061 . T <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +2 5000062 . G <*> 0 . DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +2 5000063 . T <*> 0 . DP=1;I16=1,0,0,0,21,441,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,21 +2 5000064 . G <*> 0 . DP=1;I16=1,0,0,0,34,1156,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,34 +2 5000065 . T <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +2 5000066 . C <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 5000067 . C <*> 0 . DP=1;I16=1,0,0,0,41,1681,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,41 +2 5000068 . A <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 5000069 . C <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 5000070 . A <*> 0 . DP=1;I16=1,0,0,0,34,1156,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,34 +2 5000071 . C <*> 0 . DP=1;I16=1,0,0,0,35,1225,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,35 +2 5000072 . C <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 5000073 . C <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +2 5000074 . A <*> 0 . DP=1;I16=1,0,0,0,20,400,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,20 +2 5000075 . A <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 5000076 . A <*> 0 . DP=1;I16=1,0,0,0,35,1225,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,35 +2 5000077 . T <*> 0 . DP=1;I16=1,0,0,0,35,1225,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,35 +2 5000078 . C <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +2 5000079 . T <*> 0 . DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +2 5000080 . C <*> 0 . 
DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +2 5000081 . A <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +2 5000082 . T <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +2 5000083 . C <*> 0 . DP=1;I16=1,0,0,0,35,1225,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,35 +2 5000084 . T <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +2 5000085 . T <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 5000086 . G <*> 0 . DP=1;I16=1,0,0,0,32,1024,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,32 +2 5000087 . A <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +2 5000088 . A <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +2 5000089 . T <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +2 5000090 . T <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 5000091 . T <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +2 5000092 . T <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +2 5000093 . A <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +2 5000094 . G <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +2 5000095 . C <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +2 5000096 . T <*> 0 . DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +2 5000097 . G <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +2 5000098 . C <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 5000099 . C <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +2 5000100 . A <*> 0 . 
DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +2 5000101 . T <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,24,576,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 5000102 . A <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,23,529,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +2 5000103 . A <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,22,484,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +2 5000104 . C <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,21,441,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +2 5000105 . T <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,20,400,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +2 5000106 . C <*> 0 . DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,19,361,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +2 5000107 . C <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,18,324,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +2 5000108 . C <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,17,289,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +2 5000109 . A <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,16,256,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +2 5000110 . T <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,15,225,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +2 5000111 . G <*> 0 . DP=1;I16=1,0,0,0,33,1089,0,0,60,3600,0,0,14,196,0,0;QS=1,0;MQ0F=0 PL 0,3,33 +2 5000112 . T <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,13,169,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 5000113 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,12,144,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +2 5000114 . T <*> 0 . DP=1;I16=1,0,0,0,26,676,0,0,60,3600,0,0,11,121,0,0;QS=1,0;MQ0F=0 PL 0,3,26 +2 5000115 . T <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,10,100,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +2 5000116 . G <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,9,81,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 5000117 . T <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,8,64,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +2 5000118 . A <*> 0 . DP=1;I16=1,0,0,0,41,1681,0,0,60,3600,0,0,7,49,0,0;QS=1,0;MQ0F=0 PL 0,3,41 +2 5000119 . G <*> 0 . DP=1;I16=1,0,0,0,30,900,0,0,60,3600,0,0,6,36,0,0;QS=1,0;MQ0F=0 PL 0,3,30 +2 5000120 . A <*> 0 . 
DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,5,25,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +2 5000121 . A <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,4,16,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +2 5000122 . G <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,3,9,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 5000123 . G <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,2,4,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +2 5000124 . A <*> 0 . DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,1,1,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +2 5000125 . A <*> 0 . DP=1;I16=1,0,0,0,33,1089,0,0,60,3600,0,0,0,0,0,0;QS=1,0;MQ0F=0 PL 0,3,33 +10 2999980 . G <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,60,3600,0,0,0,0,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +10 2999981 . C <*> 0 . DP=2;I16=1,1,0,0,63,2045,0,0,120,7200,0,0,1,1,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,63 +10 2999982 . A <*> 0 . DP=2;I16=1,1,0,0,73,2677,0,0,120,7200,0,0,3,5,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,73 +10 2999983 . A <*> 0 . DP=2;I16=1,1,0,0,69,2393,0,0,120,7200,0,0,5,13,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,69 +10 2999984 . C <*> 0 . DP=2;I16=1,1,0,0,71,2525,0,0,120,7200,0,0,7,25,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,71 +10 2999985 . A <*> 0 . DP=2;I16=1,1,0,0,68,2314,0,0,120,7200,0,0,9,41,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,68 +10 2999986 . A <*> 0 . DP=2;I16=1,1,0,0,73,2669,0,0,120,7200,0,0,11,61,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,73 +10 2999987 . A <*> 0 . DP=2;I16=1,1,0,0,61,1945,0,0,120,7200,0,0,13,85,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,61 +10 2999988 . G <*> 0 . DP=3;I16=1,2,0,0,108,3896,0,0,180,10800,0,0,15,113,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,102 +10 2999989 . T <*> 0 . DP=3;I16=1,2,0,0,112,4190,0,0,180,10800,0,0,18,146,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,106 +10 2999990 . G <*> 0 . DP=3;I16=1,2,0,0,115,4409,0,0,180,10800,0,0,21,185,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,109 +10 2999991 . A <*> 0 . DP=3;I16=1,2,0,0,110,4082,0,0,180,10800,0,0,24,230,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,104 +10 2999992 . G <*> 0 . DP=3;I16=1,2,0,0,117,4569,0,0,180,10800,0,0,27,281,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,111 +10 2999993 . A <*> 0 . 
DP=3;I16=1,2,0,0,115,4411,0,0,180,10800,0,0,30,338,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,108 +10 2999994 . C <*> 0 . DP=3;I16=1,2,0,0,111,4145,0,0,180,10800,0,0,33,401,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,105 +10 2999995 . C <*> 0 . DP=3;I16=1,2,0,0,108,3944,0,0,180,10800,0,0,36,470,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,102 +10 2999996 . T <*> 0 . DP=3;I16=1,2,0,0,113,4257,0,0,180,10800,0,0,39,545,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,106 +10 2999997 . C <*> 0 . DP=3;I16=1,2,0,0,111,4109,0,0,180,10800,0,0,42,626,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,105 +10 2999998 . A <*> 0 . DP=3;I16=1,2,0,0,111,4121,0,0,180,10800,0,0,45,713,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,105 +10 2999999 . T <*> 0 . DP=3;I16=1,2,0,0,109,3961,0,0,180,10800,0,0,48,806,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,103 +10 3000000 . C <*> 0 . DP=3;I16=1,2,0,0,109,3979,0,0,180,10800,0,0,51,905,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,103 +10 3000001 . T <*> 0 . DP=3;I16=1,2,0,0,109,3969,0,0,180,10800,0,0,54,1010,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,103 +10 3000002 . C <*> 0 . DP=3;I16=1,2,0,0,109,3961,0,0,180,10800,0,0,57,1121,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,103 +10 3000003 . T <*> 0 . DP=3;I16=1,2,0,0,116,4494,0,0,180,10800,0,0,60,1238,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,110 +10 3000004 . A <*> 0 . DP=3;I16=1,2,0,0,118,4642,0,0,180,10800,0,0,63,1361,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,111 +10 3000005 . C <*> 0 . DP=3;I16=1,2,0,0,117,4569,0,0,180,10800,0,0,66,1490,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,110 +10 3000006 . A <*> 0 . DP=3;I16=1,2,0,0,112,4190,0,0,180,10800,0,0,68,1574,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,106 +10 3000007 . A <*> 0 . DP=3;I16=1,2,0,0,111,4113,0,0,180,10800,0,0,69,1611,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,105 +10 3000008 . A <*> 0 . DP=3;I16=1,2,0,0,115,4417,0,0,180,10800,0,0,70,1650,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,108 +10 3000009 . A <*> 0 . DP=3;I16=1,2,0,0,115,4411,0,0,180,10800,0,0,71,1691,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,109 +10 3000010 . A <*> 0 . DP=3;I16=1,2,0,0,117,4569,0,0,180,10800,0,0,72,1734,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,110 +10 3000011 . 
A <*> 0 . DP=3;I16=1,2,0,0,116,4494,0,0,180,10800,0,0,73,1779,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,109 +10 3000012 . T <*> 0 . DP=3;I16=1,2,0,0,112,4190,0,0,180,10800,0,0,74,1826,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,106 +10 3000013 . G <*> 0 . DP=3;I16=1,2,0,0,114,4334,0,0,180,10800,0,0,75,1875,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,108 +10 3000014 . A <*> 0 . DP=3;I16=1,2,0,0,117,4577,0,0,180,10800,0,0,75,1875,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,110 +10 3000015 . A <*> 0 . DP=3;I16=1,2,0,0,112,4182,0,0,180,10800,0,0,75,1875,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,105 +10 3000016 . A <*> 0 . DP=3;I16=1,2,0,0,116,4490,0,0,180,10800,0,0,75,1875,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,110 +10 3000017 . A <*> 0 . DP=3;I16=1,2,0,0,113,4259,0,0,180,10800,0,0,75,1875,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,106 +10 3000018 . G <*> 0 . DP=3;I16=1,2,0,0,110,4042,0,0,180,10800,0,0,75,1875,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,104 +10 3000019 . G <*> 0 . DP=3;I16=1,2,0,0,116,4488,0,0,180,10800,0,0,75,1875,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,109 +10 3000020 . T <*> 0 . DP=3;I16=1,2,0,0,115,4409,0,0,180,10800,0,0,75,1875,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,109 +10 3000021 . T <*> 0 . DP=3;I16=1,2,0,0,110,4046,0,0,180,10800,0,0,75,1875,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,104 +10 3000022 . A <*> 0 . DP=3;I16=1,2,0,0,105,3713,0,0,180,10800,0,0,75,1875,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,99 +10 3000023 . G <*> 0 . DP=3;I16=1,2,0,0,114,4334,0,0,180,10800,0,0,75,1875,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,107 +10 3000024 . C <*> 0 . DP=3;I16=1,2,0,0,113,4267,0,0,180,10800,0,0,75,1875,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,107 +10 3000025 . T <*> 0 . DP=3;I16=1,2,0,0,112,4182,0,0,180,10800,0,0,75,1875,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,106 +10 3000026 . T <*> 0 . DP=3;I16=1,2,0,0,119,4725,0,0,180,10800,0,0,75,1875,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,112 +10 3000027 . G <*> 0 . DP=3;I16=1,2,0,0,96,3464,0,0,180,10800,0,0,75,1875,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,93 +10 3000028 . G <*> 0 . 
DP=3;I16=1,2,0,0,113,4277,0,0,180,10800,0,0,75,1875,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,107 +10 3000029 . T <*> 0 . DP=4;I16=1,2,0,0,116,4490,0,0,180,10800,0,0,75,1875,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,110 +10 3000030 . G <*> 0 . DP=4;I16=1,3,0,0,152,5778,0,0,240,14400,0,0,76,1876,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,12,134 +10 3000031 . T <*> 0 . DP=4;I16=1,3,0,0,140,5100,0,0,240,14400,0,0,77,1879,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,12,122 +10 3000032 . A <*> 0 . DP=4;I16=1,3,0,0,137,4965,0,0,240,14400,0,0,78,1884,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,12,123 +10 3000033 . G <*> 0 . DP=4;I16=1,3,0,0,153,5853,0,0,240,14400,0,0,79,1891,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,12,135 +10 3000034 . T <*> 0 . DP=4;I16=1,3,0,0,140,4998,0,0,240,14400,0,0,80,1900,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,12,124 +10 3000035 . G <*> 0 . DP=4;I16=1,3,0,0,152,5794,0,0,240,14400,0,0,81,1911,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,12,133 +10 3000036 . G <*> 0 . DP=4;I16=1,3,0,0,150,5628,0,0,240,14400,0,0,82,1924,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,12,132 +10 3000037 . C <*> 0 . DP=4;I16=1,3,0,0,157,6165,0,0,240,14400,0,0,83,1939,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,12,138 +10 3000038 . A <*> 0 . DP=4;I16=1,2,0,0,110,4034,0,0,180,10800,0,0,75,1875,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,103 +10 3000039 . C <*> 0 . DP=4;I16=1,3,0,0,149,5553,0,0,240,14400,0,0,85,1975,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,12,131 +10 3000040 . A <*> 0 . DP=4;I16=1,3,0,0,137,4767,0,0,240,14400,0,0,86,1996,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,12,121 +10 3000041 . T <*> 0 . DP=4;I16=1,3,0,0,148,5514,0,0,240,14400,0,0,87,2019,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,12,130 +10 3000042 . A <*> 0 . DP=4;I16=1,3,0,0,135,4599,0,0,240,14400,0,0,88,2044,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,12,119 +10 3000043 . T <*> 0 . DP=4;I16=1,3,0,0,155,6011,0,0,240,14400,0,0,89,2071,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,12,136 +10 3000044 . C <*> 0 . DP=4;I16=1,3,0,0,150,5646,0,0,240,14400,0,0,90,2100,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,12,132 +10 3000045 . T <*> 0 . 
DP=4;I16=1,3,0,0,133,4559,0,0,240,14400,0,0,91,2131,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,12,118 +10 3000046 . G <*> 0 . DP=4;I16=1,3,0,0,146,5362,0,0,240,14400,0,0,92,2164,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,12,128 +10 3000047 . T <*> 0 . DP=4;I16=1,3,0,0,150,5630,0,0,240,14400,0,0,93,2199,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,12,131 +10 3000048 . G <*> 0 . DP=4;I16=1,3,0,0,155,6007,0,0,240,14400,0,0,94,2236,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,12,137 +10 3000049 . G <*> 0 . DP=4;I16=1,3,0,0,152,5814,0,0,240,14400,0,0,95,2275,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,12,135 +10 3000050 . T <*> 0 . DP=4;I16=1,3,0,0,147,5465,0,0,240,14400,0,0,96,2316,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,12,130 +10 3000051 . C <*> 0 . DP=4;I16=1,3,0,0,152,5782,0,0,240,14400,0,0,97,2359,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,12,133 +10 3000052 . C <*> 0 . DP=4;I16=1,3,0,0,153,5861,0,0,240,14400,0,0,98,2404,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,12,134 +10 3000053 . C <*> 0 . DP=4;I16=1,3,0,0,149,5595,0,0,240,14400,0,0,99,2451,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,12,131 +10 3000054 . A <*> 0 . DP=4;I16=1,3,0,0,144,5202,0,0,240,14400,0,0,100,2500,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,12,127 +10 3000055 . G <*> 0 . DP=4;I16=1,3,0,0,153,5859,0,0,240,14400,0,0,99,2451,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,12,135 +10 3000056 . C <*> 0 . DP=4;I16=1,3,0,0,157,6163,0,0,240,14400,0,0,97,2355,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,12,139 +10 3000057 . T <*> 0 . DP=5;I16=2,3,0,0,166,5624,0,0,300,18000,0,0,95,2263,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,15,145 +10 3000058 . A <*> 0 . DP=5;I16=2,3,0,0,187,7011,0,0,300,18000,0,0,94,2176,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,15,162 +10 3000059 . T <*> 0 . DP=5;I16=2,3,0,0,177,6285,0,0,300,18000,0,0,93,2095,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,15,154 +10 3000060 . T <*> 0 . DP=5;I16=2,3,0,0,182,6644,0,0,300,18000,0,0,92,2020,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,15,159 +10 3000061 . T <*> 0 . DP=5;I16=2,3,0,0,187,7011,0,0,300,18000,0,0,91,1951,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,15,162 +10 3000062 . G <*> 0 . 
DP=5;I16=2,3,0,0,188,7084,0,0,300,18000,0,0,90,1888,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,15,164 +10 3000063 . G <*> 0 . DP=5;I16=2,3,0,0,184,6802,0,0,300,18000,0,0,88,1782,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,15,160 +10 3000064 . G <*> 0 . DP=5;I16=2,3,0,0,187,6999,0,0,300,18000,0,0,86,1684,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,15,162 +10 3000065 . A <*> 0 . DP=5;I16=2,3,0,0,185,6865,0,0,300,18000,0,0,84,1594,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,15,161 +10 3000066 . G A,<*> 0 . DP=5;I16=1,1,1,2,71,2525,112,4186,120,7200,180,10800,39,821,43,691;QS=0.387978,0.612022,0;VDB=0.946712;SGB=-0.511536;RPB=1;MQB=1;MQSB=1;BQB=0.5;MQ0F=0 PL 91,0,56,97,65,152 +10 3000067 . G <*> 0 . DP=5;I16=2,3,0,0,190,7224,0,0,300,18000,0,0,80,1438,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,15,164 +10 3000068 . C <*> 0 . DP=5;I16=2,3,0,0,185,6877,0,0,300,18000,0,0,78,1372,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,15,161 +10 3000069 . T <*> 0 . DP=5;I16=2,3,0,0,183,6715,0,0,300,18000,0,0,76,1314,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,15,158 +10 3000070 . G <*> 0 . DP=5;I16=2,3,0,0,186,6922,0,0,300,18000,0,0,74,1264,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,15,161 +10 3000071 . A <*> 0 . DP=5;I16=2,3,0,0,183,6707,0,0,300,18000,0,0,72,1222,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,15,159 +10 3000072 . G <*> 0 . DP=5;I16=2,3,0,0,188,7080,0,0,300,18000,0,0,70,1188,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,15,164 +10 3000073 . A <*> 0 . DP=5;I16=2,3,0,0,179,6411,0,0,300,18000,0,0,68,1162,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,15,155 +10 3000074 . T <*> 0 . DP=5;I16=2,3,0,0,175,6187,0,0,300,18000,0,0,66,1144,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,15,153 +10 3000075 . A <*> 0 . DP=5;I16=2,3,0,0,172,5958,0,0,300,18000,0,0,64,1134,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,15,149 +10 3000076 . G <*> 0 . DP=5;I16=2,3,0,0,184,6776,0,0,300,18000,0,0,62,1132,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,15,159 +10 3000077 . G <*> 0 . DP=5;I16=2,3,0,0,182,6654,0,0,300,18000,0,0,60,1138,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,15,158 +10 3000078 . A <*> 0 . DP=5;I16=2,3,0,0,184,6784,0,0,300,18000,0,0,58,1152,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,15,159 +10 3000079 . 
G <*> 0 . DP=5;I16=2,3,0,0,181,6697,0,0,300,18000,0,0,56,1174,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,15,158 +10 3000080 . G <*> 0 . DP=4;I16=2,2,0,0,147,5435,0,0,240,14400,0,0,55,1203,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,12,135 +10 3000081 . A <*> 0 . DP=3;I16=1,2,0,0,114,4334,0,0,180,10800,0,0,55,1237,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,107 +10 3000082 . T <*> 0 . DP=3;I16=1,2,0,0,107,3841,0,0,180,10800,0,0,55,1275,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,101 +10 3000083 . C <*> 0 . DP=3;I16=1,2,0,0,111,4109,0,0,180,10800,0,0,54,1266,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,105 +10 3000084 . A <*> 0 . DP=3;I16=1,2,0,0,94,3054,0,0,180,10800,0,0,53,1259,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,88 +10 3000085 . C <*> 0 . DP=3;I16=1,2,0,0,112,4210,0,0,180,10800,0,0,52,1254,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,106 +10 3000086 . T <*> 0 . DP=3;I16=1,2,0,0,111,4145,0,0,180,10800,0,0,51,1251,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,105 +10 3000087 . T <*> 0 . DP=3;I16=1,2,0,0,112,4214,0,0,180,10800,0,0,50,1250,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,106 +10 3000088 . G <*> 0 . DP=2;I16=1,1,0,0,70,2450,0,0,120,7200,0,0,50,1250,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,70 +10 3000089 . A <*> 0 . DP=2;I16=1,1,0,0,75,2813,0,0,120,7200,0,0,50,1250,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,75 +10 3000090 . G <*> 0 . DP=2;I16=1,1,0,0,69,2385,0,0,120,7200,0,0,50,1250,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,69 +10 3000091 . C <*> 0 . DP=3;I16=1,2,0,0,109,4001,0,0,180,10800,0,0,50,1250,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,103 +10 3000092 . C <*> 0 . DP=3;I16=1,2,0,0,110,4058,0,0,180,10800,0,0,51,1251,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,104 +10 3000093 . C <*> 0 . DP=3;I16=1,2,0,0,107,3817,0,0,180,10800,0,0,52,1254,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,101 +10 3000094 . A <*> 0 . DP=3;I16=1,2,0,0,92,3026,0,0,180,10800,0,0,53,1259,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,88 +10 3000095 . A <*> 0 . DP=3;I16=1,2,0,0,102,3518,0,0,180,10800,0,0,54,1266,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,96 +10 3000096 . G <*> 0 . 
DP=3;I16=1,2,0,0,114,4332,0,0,180,10800,0,0,55,1275,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,108 +10 3000097 . A <*> 0 . DP=3;I16=1,2,0,0,110,4038,0,0,180,10800,0,0,56,1286,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,104 +10 3000098 . G <*> 0 . DP=3;I16=1,2,0,0,111,4109,0,0,180,10800,0,0,57,1299,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,105 +10 3000099 . G <*> 0 . DP=3;I16=1,2,0,0,113,4261,0,0,180,10800,0,0,58,1314,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,106 +10 3000100 . T <*> 0 . DP=3;I16=1,2,0,0,112,4230,0,0,180,10800,0,0,59,1331,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,106 +10 3000101 . C <*> 0 . DP=3;I16=1,2,0,0,111,4109,0,0,180,10800,0,0,60,1350,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,104 +10 3000102 . A <*> 0 . DP=3;I16=1,2,0,0,116,4488,0,0,180,10800,0,0,61,1371,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,110 +10 3000103 . A <*> 0 . DP=3;I16=1,2,0,0,114,4332,0,0,180,10800,0,0,62,1394,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,108 +10 3000104 . G <*> 0 . DP=3;I16=1,2,0,0,117,4565,0,0,180,10800,0,0,62,1370,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,111 +10 3000105 . G <*> 0 . DP=3;I16=1,2,0,0,118,4650,0,0,180,10800,0,0,62,1350,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,111 +10 3000106 . C <*> 0 . DP=3;I16=1,2,0,0,114,4370,0,0,180,10800,0,0,62,1334,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,107 +10 3000107 . T <*> 0 . DP=3;I16=1,2,0,0,111,4109,0,0,180,10800,0,0,62,1322,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,105 +10 3000108 . G <*> 0 . DP=3;I16=1,2,0,0,119,4721,0,0,180,10800,0,0,62,1314,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,112 +10 3000109 . C <*> 0 . DP=3;I16=1,2,0,0,110,4054,0,0,180,10800,0,0,62,1310,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,104 +10 3000110 . A <*> 0 . DP=3;I16=1,2,0,0,107,3821,0,0,180,10800,0,0,62,1310,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,101 +10 3000111 . G <*> 0 . DP=3;I16=1,2,0,0,121,4893,0,0,180,10800,0,0,62,1314,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,114 +10 3000112 . T <*> 0 . DP=3;I16=1,2,0,0,110,4034,0,0,180,10800,0,0,62,1322,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,104 +10 3000113 . G <*> 0 . 
DP=3;I16=1,2,0,0,112,4186,0,0,180,10800,0,0,62,1334,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,105 +10 3000114 . A <*> 0 . DP=3;I16=1,2,0,0,115,4409,0,0,180,10800,0,0,62,1350,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,108 +10 3000115 . G <*> 0 . DP=3;I16=1,2,0,0,113,4261,0,0,180,10800,0,0,62,1370,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,107 +10 3000116 . C <*> 0 . DP=3;I16=1,2,0,0,115,4417,0,0,180,10800,0,0,62,1394,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,108 +10 3000117 . C <*> 0 . DP=3;I16=1,2,0,0,115,4409,0,0,180,10800,0,0,61,1371,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,109 +10 3000118 . A <*> 0 . DP=3;I16=1,2,0,0,118,4642,0,0,180,10800,0,0,60,1350,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,112 +10 3000119 . T <*> 0 . DP=3;I16=1,2,0,0,108,3890,0,0,180,10800,0,0,59,1331,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,102 +10 3000120 . G <*> 0 . DP=3;I16=1,2,0,0,113,4259,0,0,180,10800,0,0,58,1314,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,107 +10 3000121 . A <*> 0 . DP=3;I16=1,2,0,0,106,3750,0,0,180,10800,0,0,57,1299,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,100 +10 3000122 . T <*> 0 . DP=3;I16=1,2,0,0,111,4109,0,0,180,10800,0,0,56,1286,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,105 +10 3000123 . T <*> 0 . DP=3;I16=1,2,0,0,115,4411,0,0,180,10800,0,0,55,1275,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,108 +10 3000124 . G <*> 0 . DP=3;I16=1,2,0,0,108,3890,0,0,180,10800,0,0,54,1266,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,101 +10 3000125 . C <*> 0 . DP=3;I16=1,2,0,0,113,4275,0,0,180,10800,0,0,53,1259,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,106 +10 3000126 . A <*> 0 . DP=3;I16=1,2,0,0,109,3977,0,0,180,10800,0,0,52,1254,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,103 +10 3000127 . T <*> 0 . DP=3;I16=1,2,0,0,111,4121,0,0,180,10800,0,0,51,1251,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,105 +10 3000128 . C <*> 0 . DP=3;I16=1,2,0,0,101,3489,0,0,180,10800,0,0,50,1250,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,9,96 +10 3000129 . C <*> 0 . DP=2;I16=1,1,0,0,78,3042,0,0,120,7200,0,0,50,1250,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,78 +10 3000130 . C <*> 0 . DP=2;I16=1,1,0,0,76,2888,0,0,120,7200,0,0,50,1250,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,76 +10 3000131 . 
T <*> 0 . DP=2;I16=1,1,0,0,76,2890,0,0,120,7200,0,0,50,1250,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,76 +10 3000132 . G <*> 0 . DP=2;I16=1,1,0,0,80,3202,0,0,120,7200,0,0,49,1201,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,80 +10 3000133 . C <*> 0 . DP=2;I16=1,1,0,0,74,2738,0,0,120,7200,0,0,48,1154,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,74 +10 3000134 . A <*> 0 . DP=2;I16=1,1,0,0,75,2817,0,0,120,7200,0,0,47,1109,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,75 +10 3000135 . C <*> 0 . DP=2;I16=1,1,0,0,74,2738,0,0,120,7200,0,0,46,1066,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,74 +10 3000136 . T <*> 0 . DP=2;I16=1,1,0,0,74,2756,0,0,120,7200,0,0,45,1025,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,74 +10 3000137 . C <*> 0 . DP=2;I16=1,1,0,0,75,2817,0,0,120,7200,0,0,44,986,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,75 +10 3000138 . C <*> 0 . DP=2;I16=1,1,0,0,77,2965,0,0,120,7200,0,0,43,949,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,77 +10 3000139 . A <*> 0 . DP=2;I16=1,1,0,0,78,3044,0,0,120,7200,0,0,42,914,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,78 +10 3000140 . A <*> 0 . DP=2;I16=1,1,0,0,73,2677,0,0,120,7200,0,0,41,881,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,73 +10 3000141 . C <*> 0 . DP=2;I16=1,1,0,0,77,2965,0,0,120,7200,0,0,40,850,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,77 +10 3000142 . C <*> 0 . DP=2;I16=1,1,0,0,81,3281,0,0,120,7200,0,0,39,821,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,81 +10 3000143 . T <*> 0 . DP=2;I16=1,1,0,0,77,2965,0,0,120,7200,0,0,38,794,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,77 +10 3000144 . G <*> 0 . DP=2;I16=1,1,0,0,80,3208,0,0,120,7200,0,0,37,769,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,80 +10 3000145 . G <*> 0 . DP=2;I16=1,1,0,0,78,3044,0,0,120,7200,0,0,36,746,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,78 +10 3000146 . G <*> 0 . DP=2;I16=1,1,0,0,78,3044,0,0,120,7200,0,0,35,725,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,78 +10 3000147 . T <*> 0 . DP=2;I16=1,1,0,0,73,2677,0,0,120,7200,0,0,34,706,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,73 +10 3000148 . G <*> 0 . DP=2;I16=1,1,0,0,79,3121,0,0,120,7200,0,0,33,689,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,79 +10 3000149 . A <*> 0 . 
DP=2;I16=1,1,0,0,71,2545,0,0,120,7200,0,0,32,674,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,71 +10 3000150 . C <*> 0 . DP=2;I16=1,1,0,0,74,2746,0,0,120,7200,0,0,31,661,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,74 +10 3000151 . A <*> 0 . DP=2;I16=1,1,0,0,77,2965,0,0,120,7200,0,0,30,650,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,77 +10 3000152 . G <*> 0 . DP=2;I16=1,1,0,0,77,2965,0,0,120,7200,0,0,29,641,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,77 +10 3000153 . A <*> 0 . DP=2;I16=1,1,0,0,72,2594,0,0,120,7200,0,0,28,634,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,72 +10 3000154 . G <*> 0 . DP=2;I16=1,1,0,0,75,2817,0,0,120,7200,0,0,27,629,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,75 +10 3000155 . C <*> 0 . DP=2;I16=1,1,0,0,56,1730,0,0,120,7200,0,0,26,626,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,56 +10 3000156 . A <*> 0 . DP=2;I16=1,1,0,0,67,2285,0,0,120,7200,0,0,25,625,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,67 +10 3000157 . A <*> 0 . DP=1;I16=0,1,0,0,31,961,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,31 +10 3000158 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +10 3000159 . A <*> 0 . DP=1;I16=0,1,0,0,29,841,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +10 3000160 . C <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +10 3000161 . C <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +10 3000162 . C <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +10 3000163 . T <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +10 3000164 . G <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +10 3000165 . T <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +10 3000166 . T <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,60,3600,0,0,24,576,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +10 3000167 . A <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,60,3600,0,0,23,529,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +10 3000168 . A <*> 0 . 
DP=1;I16=0,1,0,0,37,1369,0,0,60,3600,0,0,22,484,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +10 3000169 . T <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,60,3600,0,0,21,441,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +10 3000170 . A <*> 0 . DP=1;I16=0,1,0,0,34,1156,0,0,60,3600,0,0,20,400,0,0;QS=1,0;MQ0F=0 PL 0,3,34 +10 3000171 . T <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,60,3600,0,0,19,361,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +10 3000172 . T <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,60,3600,0,0,18,324,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +10 3000173 . T <*> 0 . DP=1;I16=0,1,0,0,35,1225,0,0,60,3600,0,0,17,289,0,0;QS=1,0;MQ0F=0 PL 0,3,35 +10 3000174 . A <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,60,3600,0,0,16,256,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +10 3000175 . A <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,60,3600,0,0,15,225,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +10 3000176 . C <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,60,3600,0,0,14,196,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +10 3000177 . A <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,60,3600,0,0,13,169,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +10 3000178 . T <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,60,3600,0,0,12,144,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +10 3000179 . T <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,60,3600,0,0,11,121,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +10 3000180 . A <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,60,3600,0,0,10,100,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +10 3000181 . T <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,60,3600,0,0,9,81,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +10 3000182 . C <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,60,3600,0,0,8,64,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +10 3000183 . T <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,60,3600,0,0,7,49,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +10 3000184 . A <*> 0 . DP=1;I16=0,1,0,0,35,1225,0,0,60,3600,0,0,6,36,0,0;QS=1,0;MQ0F=0 PL 0,3,35 +10 3000185 . C <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,60,3600,0,0,5,25,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +10 3000186 . C <*> 0 . DP=1;I16=0,1,0,0,35,1225,0,0,60,3600,0,0,4,16,0,0;QS=1,0;MQ0F=0 PL 0,3,35 +10 3000187 . A <*> 0 . DP=1;I16=0,1,0,0,35,1225,0,0,60,3600,0,0,3,9,0,0;QS=1,0;MQ0F=0 PL 0,3,35 +10 3000188 . G <*> 0 . 
DP=1;I16=0,1,0,0,33,1089,0,0,60,3600,0,0,2,4,0,0;QS=1,0;MQ0F=0 PL 0,3,33 +10 3000189 . A <*> 0 . DP=1;I16=0,1,0,0,29,841,0,0,60,3600,0,0,1,1,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +10 3000190 . A <*> 0 . DP=1;I16=0,1,0,0,26,676,0,0,60,3600,0,0,0,0,0,0;QS=1,0;MQ0F=0 PL 0,3,26 diff --git a/test/index.vcf.gz.csi b/test/index.vcf.gz.csi new file mode 100644 index 000000000..644832d83 Binary files /dev/null and b/test/index.vcf.gz.csi differ diff --git a/test/index.vcf.gz.tbi b/test/index.vcf.gz.tbi new file mode 100644 index 000000000..4d6e99781 Binary files /dev/null and b/test/index.vcf.gz.tbi differ diff --git a/test/longrefs/index.expected1.vcf b/test/longrefs/index.expected1.vcf new file mode 100644 index 000000000..e0e7f91ad --- /dev/null +++ b/test/longrefs/index.expected1.vcf @@ -0,0 +1,6 @@ +1 10010000100 . C <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,9,81,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000101 . T <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,8,64,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000102 . T <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,7,49,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000103 . G <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,6,36,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000104 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,5,25,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000105 . A <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,4,16,0,0;QS=1,0;MQ0F=0 PL 0,3,29 diff --git a/test/longrefs/index.expected2.vcf b/test/longrefs/index.expected2.vcf new file mode 100644 index 000000000..4898e2563 --- /dev/null +++ b/test/longrefs/index.expected2.vcf @@ -0,0 +1 @@ +1 10010000110 . G 0 . 
SVTYPE=DEL;SVLEN=-890;END=10010001000 PL 0,1,45 diff --git a/test/longrefs/index.vcf b/test/longrefs/index.vcf new file mode 100644 index 000000000..54c8e03d3 --- /dev/null +++ b/test/longrefs/index.vcf @@ -0,0 +1,216 @@ +##fileformat=VCFv4.2 +##FILTER= +##reference=file:10_gig_at_front.fa +##contig= +##ALT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##ALT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT ERS220911 +1 10009999919 . G <*> 0 . DP=1;I16=1,0,0,0,26,676,0,0,60,3600,0,0,0,0,0,0;QS=1,0;MQ0F=0 PL 0,3,26 +1 10009999920 . T <*> 0 . DP=1;I16=1,0,0,0,34,1156,0,0,60,3600,0,0,1,1,0,0;QS=1,0;MQ0F=0 PL 0,3,34 +1 10009999921 . A <*> 0 . DP=1;I16=1,0,0,0,33,1089,0,0,60,3600,0,0,2,4,0,0;QS=1,0;MQ0F=0 PL 0,3,33 +1 10009999922 . A <*> 0 . DP=1;I16=1,0,0,0,34,1156,0,0,60,3600,0,0,3,9,0,0;QS=1,0;MQ0F=0 PL 0,3,34 +1 10009999923 . T <*> 0 . DP=1;I16=1,0,0,0,35,1225,0,0,60,3600,0,0,4,16,0,0;QS=1,0;MQ0F=0 PL 0,3,35 +1 10009999924 . C <*> 0 . DP=1;I16=1,0,0,0,35,1225,0,0,60,3600,0,0,5,25,0,0;QS=1,0;MQ0F=0 PL 0,3,35 +1 10009999925 . C <*> 0 . DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,6,36,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +1 10009999926 . C <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,7,49,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999927 . A <*> 0 . DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,8,64,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +1 10009999928 . G <*> 0 . DP=1;I16=1,0,0,0,34,1156,0,0,60,3600,0,0,9,81,0,0;QS=1,0;MQ0F=0 PL 0,3,34 +1 10009999929 . C <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,10,100,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10009999930 . A <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,11,121,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999931 . C <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,12,144,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999932 . T <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,13,169,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999933 . T <*> 0 . 
DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,14,196,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999934 . T <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,15,225,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999935 . A <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,16,256,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999936 . G <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,17,289,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999937 . G <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,18,324,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999938 . A <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,19,361,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10009999939 . G <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,20,400,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999940 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,21,441,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999941 . C <*> 0 . DP=1;I16=1,0,0,0,41,1681,0,0,60,3600,0,0,22,484,0,0;QS=1,0;MQ0F=0 PL 0,3,41 +1 10009999942 . T <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,23,529,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999943 . A <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,24,576,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999944 . A <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999945 . G <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999946 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999947 . C <*> 0 . DP=1;I16=1,0,0,0,35,1225,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,35 +1 10009999948 . A <*> 0 . DP=1;I16=1,0,0,0,34,1156,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,34 +1 10009999949 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999950 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999951 . C <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10009999952 . A <*> 0 . 
DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999953 . G <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999954 . A <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10009999955 . T <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999956 . C <*> 0 . DP=1;I16=1,0,0,0,41,1681,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,41 +1 10009999957 . A <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999958 . C <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999959 . T <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999960 . T <*> 0 . DP=1;I16=1,0,0,0,35,1225,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,35 +1 10009999961 . G <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999962 . A <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999963 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999964 . A <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999965 . C <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999966 . C <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999967 . A <*> 0 . DP=1;I16=1,0,0,0,41,1681,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,41 +1 10009999968 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999969 . G <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999970 . A <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999971 . G <*> 0 . 
DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999972 . T <*> 0 . DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +1 10009999973 . T <*> 0 . DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +1 10009999974 . A <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10009999975 . C <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10009999976 . A <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999977 . G <*> 0 . DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +1 10009999978 . A <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10009999979 . C <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999980 . C <*> 0 . DP=1;I16=1,0,0,0,33,1089,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,33 +1 10009999981 . A <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10009999982 . G <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10009999983 . C <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999984 . C <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999985 . T <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999986 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999987 . G <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10009999988 . C <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999989 . C <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999990 . G <*> 0 . 
DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999991 . A <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999992 . C <*> 0 . DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +1 10009999993 . A <*> 0 . DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +1 10009999994 . C <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,24,576,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999995 . G <*> 0 . DP=1;I16=1,0,0,0,33,1089,0,0,60,3600,0,0,23,529,0,0;QS=1,0;MQ0F=0 PL 0,3,33 +1 10009999996 . G <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,22,484,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10009999997 . C <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,21,441,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999998 . G <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,20,400,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999999 . A <*> 0 . DP=1;I16=1,0,0,0,31,961,0,0,60,3600,0,0,19,361,0,0;QS=1,0;MQ0F=0 PL 0,3,31 +1 10010000000 . A <*> 0 . DP=1;I16=1,0,0,0,33,1089,0,0,60,3600,0,0,18,324,0,0;QS=1,0;MQ0F=0 PL 0,3,33 +1 10010000001 . A <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,17,289,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10010000002 . C <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,16,256,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10010000003 . C <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,15,225,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10010000004 . C <*> 0 . DP=1;I16=1,0,0,0,29,841,0,0,60,3600,0,0,14,196,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000005 . C <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,13,169,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10010000006 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,12,144,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10010000007 . T <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,11,121,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10010000008 . C <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,10,100,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10010000009 . T <*> 0 . 
DP=1;I16=1,0,0,0,43,1849,0,0,60,3600,0,0,9,81,0,0;QS=1,0;MQ0F=0 PL 0,3,43 +1 10010000010 . C <*> 0 . DP=2;I16=1,1,0,0,59,2105,0,0,89,4441,0,0,8,64,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,59 +1 10010000011 . T <*> 0 . DP=2;I16=1,1,0,0,76,2888,0,0,89,4441,0,0,8,50,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,67 +1 10010000012 . A <*> 0 . DP=2;I16=1,1,0,0,77,2965,0,0,89,4441,0,0,8,40,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,67 +1 10010000013 . C <*> 0 . DP=2;I16=1,1,0,0,66,2250,0,0,89,4441,0,0,8,34,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,66 +1 10010000014 . A <*> 0 . DP=2;I16=1,1,0,0,67,2285,0,0,89,4441,0,0,8,32,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,67 +1 10010000015 . A <*> 0 . DP=2;I16=1,1,0,0,69,2385,0,0,89,4441,0,0,8,34,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,65 +1 10010000016 . T <*> 0 . DP=2;I16=1,1,0,0,75,2817,0,0,89,4441,0,0,8,40,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,68 +1 10010000017 . A <*> 0 . DP=2;I16=1,1,0,0,67,2285,0,0,89,4441,0,0,8,50,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,58 +1 10010000018 . A <*> 0 . DP=2;I16=1,1,0,0,64,2120,0,0,89,4441,0,0,8,64,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,55 +1 10010000019 . A <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,9,81,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000020 . T <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,10,100,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000021 . T <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,11,121,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000022 . A <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,12,144,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000023 . A <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,13,169,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000024 . A <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,14,196,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000025 . A <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,15,225,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000026 . T <*> 0 . DP=1;I16=0,1,0,0,29,841,0,0,29,841,0,0,16,256,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000027 . A <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,29,841,0,0,17,289,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000028 . T <*> 0 . 
DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,18,324,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000029 . T <*> 0 . DP=1;I16=0,1,0,0,41,1681,0,0,29,841,0,0,19,361,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000030 . A <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,20,400,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000031 . G <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,29,841,0,0,21,441,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000032 . C <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,29,841,0,0,22,484,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000033 . T <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,23,529,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000034 . G <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,24,576,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000035 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000036 . G <*> 0 . DP=1;I16=0,1,0,0,42,1764,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000037 . C <*> 0 . DP=1;I16=0,1,0,0,34,1156,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000038 . A <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000039 . T <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000040 . G <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000041 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000042 . T <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000043 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000044 . G <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000045 . T <*> 0 . DP=1;I16=0,1,0,0,42,1764,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000046 . G <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000047 . T <*> 0 . 
DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000048 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000049 . T <*> 0 . DP=1;I16=0,1,0,0,41,1681,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000050 . G <*> 0 . DP=1;I16=0,1,0,0,31,961,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000051 . C <*> 0 . DP=1;I16=0,1,0,0,16,256,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,16 +1 10010000052 . T <*> 0 . DP=1;I16=0,1,0,0,31,961,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000053 . T <*> 0 . DP=1;I16=0,1,0,0,35,1225,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000054 . G <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000055 . T <*> 0 . DP=1;I16=0,1,0,0,33,1089,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000056 . A <*> 0 . DP=1;I16=0,1,0,0,22,484,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,22 +1 10010000057 . G <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000058 . T <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000059 . C <*> 0 . DP=1;I16=0,1,0,0,34,1156,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000060 . C <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000061 . C <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000062 . A <*> 0 . DP=1;I16=0,1,0,0,34,1156,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000063 . G <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000064 . C <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000065 . T <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000066 . A <*> 0 . 
DP=1;I16=0,1,0,0,32,1024,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000067 . C <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000068 . T <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000069 . T <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000070 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000071 . G <*> 0 . DP=1;I16=0,1,0,0,33,1089,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000072 . C <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000073 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000074 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000075 . G <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000076 . C <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000077 . T <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000078 . G <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000079 . A <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000080 . G <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000081 . G <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000082 . T <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000083 . G <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000084 . G <*> 0 . DP=1;I16=0,1,0,0,41,1681,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000085 . G <*> 0 . 
DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,24,576,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000086 . A <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,23,529,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000087 . G <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,29,841,0,0,22,484,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000088 . A <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,21,441,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000089 . A <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,20,400,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000090 . T <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,19,361,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000091 . C <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,18,324,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000092 . A <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,17,289,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000093 . T <*> 0 . DP=1;I16=0,1,0,0,33,1089,0,0,29,841,0,0,16,256,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000094 . C <*> 0 . DP=1;I16=0,1,0,0,41,1681,0,0,29,841,0,0,15,225,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000095 . C <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,14,196,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000096 . A <*> 0 . DP=1;I16=0,1,0,0,41,1681,0,0,29,841,0,0,13,169,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000097 . A <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,12,144,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000098 . G <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,11,121,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000099 . C <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,10,100,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000100 . C <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,9,81,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000101 . T <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,8,64,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000102 . T <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,7,49,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000103 . G <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,6,36,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000104 . G <*> 0 . 
DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,5,25,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000105 . A <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,4,16,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000106 . G <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,3,9,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000107 . G <*> 0 . DP=1;I16=0,1,0,0,33,1089,0,0,29,841,0,0,2,4,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000108 . C <*> 0 . DP=1;I16=0,1,0,0,32,1024,0,0,29,841,0,0,1,1,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000109 . A <*> 0 . DP=1;I16=0,1,0,0,35,1225,0,0,29,841,0,0,0,0,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000110 . G 0 . SVTYPE=DEL;SVLEN=-890;END=10010001000 PL 0,1,45 diff --git a/test/longrefs/longref.sam b/test/longrefs/longref.sam new file mode 100644 index 000000000..a2611f675 --- /dev/null +++ b/test/longrefs/longref.sam @@ -0,0 +1,96 @@ +@SQ SN:CHROMOSOME_I LN:10001009800 +SRR065390.14978392 16 CHROMOSOME_I 10000000002 1 27M1D73M * 0 0 CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-18 XS:i:-18 XN:i:0 XM:i:5 XO:i:1 XG:i:1 YT:Z:UU MD:Z:4A0G5G5G5G3^A73 NM:i:6 +SRR065390.921023 16 CHROMOSOME_I 10000000003 12 100M * 0 0 CTAAGCCTAAATCTAAGCCTAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###############################################???88:;98768700000<>:BBA?BBAB?BBBBBBBB>B>BB::;?:00000 AS:i:-6 XS:i:-13 XN:i:0 XM:i:3 XO:i:0 XG:i:0 YT:Z:UU MD:Z:10G0C10G77 NM:i:3 +SRR065390.1871511 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA 0:BB@>B<=B@???@=8@B>BB@CA@DACDCBBCCCA@CCCCACCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.3743423 16 CHROMOSOME_I 10000000003 1 100M * 0 0 
CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##################?6@:7<=@3=@ABAAB>BDBBABADABDDDBDDBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.4251890 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###########@BB=BCBBC?B>B;>B@@ADBBB@DBBBBDCCBBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.5238868 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA @,=@@D8D;?BBB>;?BBB==BB@D;>D>BBB>BBDDBA@@BCCB@=BACBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.8289592 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###############################A?@C9@@BC=AABDD@A@DC@CB=@BA?6@CCAAC@+CCCCCCCCCCCCCCC@CCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.14497557 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ######@:@@.>=><;;B>AB>>BB?B=>B=BD>BDADDD>CCDDDBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.15617929 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA D?;;D>?C>CBAAACD@BB?B>BBDB>@BBDDBDC@CBDDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.16049575 16 CHROMOSOME_I 10000000003 1 100M * 0 0 
CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #######################@??BB8BBB@@:AB@BDBCCDCBDCCCCACCCCCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.17838261 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################@>=?B@DCBDB>@D>DBADCDDD>CC@DCCCCBCCACCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.22711273 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################B<@=<:6/0307==72@@=?788==;AAA:@CCAACCC?CCAACCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.22922978 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##########################B=B>A@BBBC??=@=A@AC<><<8>C6CCCCC8CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.23087186 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ############@:73???@6;D?B>:>BBA?B<>B@B>@B>@>BCDCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.23506653 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###############A/=A5::87@:=>6@AA>@CDBA@ABCB?BC>CD>DDBDC@CCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.23791575 16 CHROMOSOME_I 10000000003 1 100M * 0 0 
CTAAGCCCCACCCCAACCCTTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##############################B4;:=B@>A@BCB@@ABCCBB@BCC@CCDCCDCCDCCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-12 XS:i:-12 XN:i:0 XM:i:6 XO:i:0 XG:i:0 YT:Z:UU MD:Z:7T0A1G2T2G3A79 NM:i:6 +SRR065390.25911768 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##############@8B@B?9=:A?=@DDB>;B6?DDBCABABB@DDCCBDBDCCDACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.26055380 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################################DAA><0=>=B;?BACDBDABCBBC@CACACACACCACCCCCCCCCCCCCCCCCCCCCCBCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.26121674 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################?:AA::@DAAA>B??@A4@=BBBBDDBDBDCCBDDBCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.30352568 16 CHROMOSOME_I 10000000003 7 100M * 0 0 CTAGGGCTAACCCTCAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################################################################A>>5A?CCC@CCCCCCCCCC?CC:C@A@==@A@A@ AS:i:-10 XS:i:-19 XN:i:0 XM:i:5 XO:i:0 XG:i:0 YT:Z:UU MD:Z:3A1C4G3A37G47 NM:i:5 +SRR065390.31341126 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ########################?AD?D@BCAABBBD@=DBCDBAACCDCAABCDCCCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.33653624 16 CHROMOSOME_I 10000000003 17 100M * 
0 0 CTAATCCTAGGCCTAAGCCCAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ####################################??8?000-+0000,@ABBBB@B:B@B>BB????>>>@@?::?6?>>;>>@ACCCCBCCBACCCC AS:i:-6 XS:i:-19 XN:i:0 XM:i:3 XO:i:0 XG:i:0 YT:Z:UU MD:Z:4G4A9T80 NM:i:3 +SRR065390.28043279 16 CHROMOSOME_I 10000000004 0 9M1I90M * 0 0 TCTTCCGATCTCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #####A>=7A6DD=@AA?>AAABC@CAABDBCBBABDADBADCABBBDCDCDCACDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCCC AS:i:-26 XS:i:-26 XN:i:0 XM:i:6 XO:i:1 XG:i:1 YT:Z:UU MD:Z:1A0A0G2T1A0G89 NM:i:7 +SRR065390.29270443 16 CHROMOSOME_I 10000000006 1 100M * 0 0 AGCCTAAGCCGAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC ###################################@:88@@>B>C>CCCCA@CCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCC AS:i:-4 XS:i:-4 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:10T2G86 NM:i:2 +SRR065390.1364843 16 CHROMOSOME_I 10000000011 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ####################@=A=8@:>@;@@=>>B8?C6CCCCCCCCCCACCCCBBCCCCCCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.10190875 16 CHROMOSOME_I 10000000011 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ##################@@@@@@;>BBB?>A6BAB?BB=BAB@?:A.<===@7:4::>8D@BABBACCCCAB@CCCDDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.13556211 0 CHROMOSOME_I 10000000011 0 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGATTGGAAGAGCGGCTAAGCAGGAACGCCGGGCTCGATCTCAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCBCDCCB>BBBBB########################################### AS:i:-50 XS:i:-50 XN:i:0 XM:i:25 XO:i:0 XG:i:0 YT:Z:UU 
MD:Z:57C0C1A0A0G0C0C0T0A0A1C6C0T0A1G1C0T0A0A1C2A0A0G0C2A3 NM:i:25 +SRR065390.20524775 16 CHROMOSOME_I 10000000011 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ############################?9<8B=?@C8A<@?@C8CBDCCC=CCCCC??@CCDCCCCCCCCCCCCCCCCCCCCDCCCCCCCDCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.20580336 16 CHROMOSOME_I 10000000011 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ############################?:>@?@=>@=0<:CB>@B=DCADB@CCCCC@CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.22573273 16 CHROMOSOME_I 10000000011 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ##################################A9;?@CBBDBA>BB;ABDB>AAA;=>=0943@########### AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.20870114 0 CHROMOSOME_I 10000000012 1 100M * 0 0 AGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCABCCCC=BBBCA@B>B?D;B=>9?############################ AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.3863623 16 CHROMOSOME_I 10000000012 1 100M * 0 0 CGCCTACGCCTACGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC ##############################?@BB>B@BCABBB?DC@DADC@DCDCACDCBCCCCCCCCCCC@CCCCCCCCCCCCCCC1CCCCCCCCCCC AS:i:-6 XS:i:-6 XN:i:0 XM:i:3 XO:i:0 XG:i:0 YT:Z:UU MD:Z:0A5A5A87 NM:i:3 +SRR065390.1659845 0 CHROMOSOME_I 10000000013 0 100M * 0 0 GCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAACCTAAGCCTAAGCCCAACCCTAAGACCGAGACCGAGACC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCACCCAB@CCC######################################### AS:i:-22 XS:i:-22 XN:i:0 XM:i:11 XO:i:0 XG:i:0 YT:Z:UU 
MD:Z:60G14T2G6C1T0A2C1T0A2C1T0 NM:i:11 +SRR065390.1567418 16 CHROMOSOME_I 10000000015 1 100M * 0 0 CACAGCCTACGTCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #########################################?:8A@<@>>BBB8>BBB@BBBB>@:??::87688:?:::?@<@@97866@?>@@;;>:< AS:i:-8 XS:i:-8 XN:i:0 XM:i:4 XO:i:0 XG:i:0 YT:Z:UU MD:Z:1T0A6A1C88 NM:i:4 +SRR065390.4996386 16 CHROMOSOME_I 10000000015 17 100M * 0 0 CCAAGCCGAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###################################@@@@A=BB@C>>DCCACCCCCCC@CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-4 XS:i:-22 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:1T5T92 NM:i:2 +SRR065390.14822977 16 CHROMOSOME_I 10000000015 1 100M * 0 0 CGAAGCCAGAGCCTAGGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ####################################B:B?:==2>6@B@@C>?>A@CB5@??@28C@CCCBC@CC?CC?A@CC:CBCCCCCCCCCCCCCC AS:i:-8 XS:i:-8 XN:i:0 XM:i:4 XO:i:0 XG:i:0 YT:Z:UU MD:Z:1T5T0A6A84 NM:i:4 +SRR065390.15148736 16 CHROMOSOME_I 10000000015 17 100M * 0 0 CTGAGCCGAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###########################CCBC<=C;9??<;==C@BCCCCC=CCCCACACACCBBCCCCCCCCCCCCCCCCCBCCCCCCCCCCCBCA?CCC AS:i:-4 XS:i:-21 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:2A4T92 NM:i:2 +SRR065390.18089757 16 CHROMOSOME_I 10000000015 1 100M * 0 0 CTGAGCCTGAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ########################A212.0:?.>8?BB?B<@@C?CCBCB;DCCCACDCCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-4 XS:i:-4 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:2A5A91 NM:i:2 +SRR065390.25601994 16 CHROMOSOME_I 10000000015 17 100M * 0 0 ATAAGCCTAATCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #####################???DD?BD?BDBB>ACBDBDDBDDDBDBDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-4 XS:i:-21 XN:i:0 
XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:0C9G89 NM:i:2 +SRR065390.29400981 16 CHROMOSOME_I 10000000015 17 100M * 0 0 CGAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###############################A<:?C>>BCABABC?AD>BDADDDBDBBDBDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-4 XS:i:-18 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:1T2G95 NM:i:2 +SRR065390.29022479 0 CHROMOSOME_I 10000000167 0 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAACCCTAAGCCTAATCCTATGCATAAACCTAAACAGAATCAAAAGAAAAATCCAATCT CCCCCCCCCACCCCCBCCCC?CCCCCCCD;?D?D################################ AS:i:-6 XS:i:-6 XN:i:0 XM:i:3 XO:i:0 XG:i:0 YT:Z:UU MD:Z:94C0T0A3 NM:i:3 +SRR065390.23298396 16 CHROMOSOME_I 10000000167 1 100M * 0 0 AAGCCTCGGCCTACGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC #####################A@><>B==BC@CCBB?BA'@>>;>>DADDDBDBADB?B6@7=;;7DBD?B<8=AA:4-9<@@1:@A################################ AS:i:-2 XS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 YT:Z:UU MD:Z:98C1 NM:i:1 +SRR065390.23263331 0 CHROMOSOME_I 10000000168 1 100M * 0 0 AGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCBCCCDCCDCDDDBBDA=B@BB@B>B>AB?@?BB>;;ACC>CAA@;9<5@############## AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.1428659 16 CHROMOSOME_I 10000000168 1 100M * 0 0 AGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC #######?DB@;>BBB::>:D=>D?BDDBBBBCCAC@DCCBDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.9270489 16 CHROMOSOME_I 10000000168 1 100M * 0 0 AGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC ##########?4=>@BAA>BB>AA@====3BBBBB;B?@C==CCC?@CCC?CCC?ACCCCCBCCCBCCCCBCCCCCCCCCCCCCC=BCCCCCACCCDCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU 
MD:Z:100 NM:i:0 +SRR065390.9538669 16 CHROMOSOME_I 10000000168 1 100M * 0 0 AGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC ##########@=?6??@B;BA@@@?.@?@@;D>A;DB@DBBBD>@DDDBADCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.15525407 16 CHROMOSOME_I 10000000168 1 100M * 0 0 AGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC ####################@37:0BC@@C@ACCAB?@CCACCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.18387934 16 CHROMOSOME_I 10000000168 1 100M * 0 0 AGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC ##########################@@A@4BDDBB@ACABB@8BCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.27778447 16 CHROMOSOME_I 10000000168 1 100M * 0 0 AGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC ###############@@B=;>89<>/8?<8@>=ABDCCDCC@CCACB@@C@9ACCCC;CCCC@CCAAB@@CCCCCBCCCCCCCBCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.4767844 0 CHROMOSOME_I 10000000170 1 100M * 0 0 CCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCCAAGCCTAAGCCTAACCCCA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCACCCCCCCCCCCDCC=CCBA=BCCACCBCC<@@@A@>A?D<5/772AA####################### AS:i:-6 XS:i:-6 XN:i:0 XM:i:3 XO:i:0 XG:i:0 YT:Z:UU MD:Z:80T14G2T1 NM:i:3 +SRR065390.6036148 0 CHROMOSOME_I 10000000170 1 100M * 0 0 CCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCCAAGCCTCAGACCA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCACCCCCCCBCCC=C########################################## AS:i:-8 XS:i:-8 XN:i:0 XM:i:4 XO:i:0 XG:i:0 YT:Z:UU MD:Z:86T6A2C1T1 
NM:i:4 +SRR065390.7523697 0 CHROMOSOME_I 10000000170 1 100M * 0 0 CCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTATACCTATGCATA 8773399<;8BBB>BAA<A################################### AS:i:-8 XS:i:-8 XN:i:0 XM:i:4 XO:i:0 XG:i:0 YT:Z:UU MD:Z:84C3A4A5A0 NM:i:4 +SRR065390.21777229 0 CHROMOSOME_I 10000000170 1 100M * 0 0 CCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCCAAGCCCAAGACCAAGCCAAGACCCC CCCCCCCCCCCCCCCCCCCCCCCCCCC@CCCCCCCCCCCCCCCCCBDABAA@48@############################################# AS:i:-18 XS:i:-18 XN:i:0 XM:i:9 XO:i:0 XG:i:0 YT:Z:UU MD:Z:74T5T3C1T5T1A0G2T0A0 NM:i:9 +SRR065390.22082412 0 CHROMOSOME_I 10000000170 1 100M * 0 0 CCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTTGGCCGCAGCCTCAGCCTGAACAGA CCCCACACCCCCCCC??:??@CCCC@9A>9?AA@AC>@CA@B-73>8=53@=:=A?><=>49778?################ AS:i:-4 XS:i:-4 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:88A5A5 NM:i:2 +SRR065390.32243033 0 CHROMOSOME_I 10000000170 1 100M * 0 0 CCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTACGACTA CCCCCCCCCCCACC@CCACCCCCCCCCCCCCCCCC@CADCCBBD@BB>=?A@9C@?C>A88?>8A?:@CCCCCCCCC:?>;:CCC?BCCCCACCCCCCCCCC AS:i:-39 XS:i:-39 XN:i:0 XM:i:18 XO:i:0 XG:i:0 YT:Z:UU MD:Z:0C0T0A0A2C0T2G1C0T0A0A0G0C1T0A1G1C64C10 NM:i:18 +SRR065390.28296401 16 CHROMOSOME_I 10000000171 1 100M * 0 0 CTAAGCCTAAGCCTAAGGCTAAGCCTAAACCCACGCCTAGGCCGAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##################################################BAADDDBBDDCCDCCCCCACDCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-12 XS:i:-12 XN:i:0 XM:i:6 XO:i:0 XG:i:0 YT:Z:UU MD:Z:17C10G2T1A5A3T56 NM:i:6 +SRR065390.1242089 0 CHROMOSOME_I 10000000173 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC A=@@?=?=8A3BB>>B@B>BAB@B@B77//8<;>5:@@@B6ABA@BA<@BB5):5;*83736?;;;@@=;6B>??##################### AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 
NM:i:0 +SRR065390.3872193 0 CHROMOSOME_I 10000000173 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCACCCCCCBCCCC@DCACD=ABCB@BCDDA@BA=BBB@C??@;:0A>?>B>?)?#################### AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.14566073 0 CHROMOSOME_I 10000000173 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCBCCCCCCCCCCCCCCCCCCCCCCCCACCCCCCCCCCAB=?CCCA6?AACABCCAC=1B@A@;B<@A@@;>?@>8BB?B#################### AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.18391952 0 CHROMOSOME_I 10000000173 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCCDCCCAADCCB?CBABD=A>?BB5:??:B;>?@AA?>3?;@(8>=>>/(5500;+@@6 AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.18719419 0 CHROMOSOME_I 10000000173 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCACCAC@@C@@B@DBBDBB################################ AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.23668023 0 CHROMOSOME_I 10000000173 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCCCCC@@ABDB@@BBB>DBABB@D@BDBAABAB>B>AA@??9:8>>A:255@###### AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.23826980 0 CHROMOSOME_I 10000000173 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCDCDBCDDBDDDABBBBDDBBBBBBB>D?#################### AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.28024258 0 
CHROMOSOME_I 10000000173 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCBACDDBC>DDBDB>BBBBB;?@BBB3@???=0<=>@@:@################ AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.30039772 0 CHROMOSOME_I 10000000173 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCDACDBBDDDDDBBBDBBD>BBAADAABAAC??B??######################### AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.5345749 16 CHROMOSOME_I 10000000173 1 100M * 0 0 GACCCAGACCCCGCGCCTAAGCCCAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ##########################################@BA=>AAA@;AAAA@AA9AAAA@BAA@:=@@@4A=?A@AAAAA:B@@BBBBB@>>>>> AS:i:-18 XS:i:-18 XN:i:0 XM:i:9 XO:i:0 XG:i:0 YT:Z:UU MD:Z:0A1G2T0A1G2T0A0A9T76 NM:i:9 +SRR065390.16932911 16 CHROMOSOME_I 10000000173 1 100M * 0 0 AACCCTAAACCTAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ###############B?BAA;;9>0A1BAAA@=CA*@CCCCACCCC@@?CAAB>AC=C?CCCCBCCBBCBCCCABCCBCA@CCCCCCBCCCCC?BCCCCC AS:i:-6 XS:i:-6 XN:i:0 XM:i:3 XO:i:0 XG:i:0 YT:Z:UU MD:Z:2G5G5G85 NM:i:3 +SRR065390.17106354 0 CHROMOSOME_I 10000000173 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCACCACC?CCADCCAC@BB@CBB@C?@A@@A>=B?BAABBABB6A>BBB:BBA=?DD??;D/<71; AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.22716808 0 CHROMOSOME_I 10000000174 1 100M * 0 0 AGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCGAAGCC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCABBBBB?################################### AS:i:-2 XS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 YT:Z:UU MD:Z:94T5 NM:i:1 +SRR065390.12986460 0 
CHROMOSOME_I 10000000176 1 100M * 0 0 CCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCATAATCGTAAGACTAAGAGCAAGCCTCAGCATA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCA?CCA############################### AS:i:-4 XS:i:-4 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:92T2G4 NM:i:2 +SRR065390.14729559 16 CHROMOSOME_I 10000000176 1 100M * 0 0 CCTACGCCCAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTA #########################?(4<=B@;BBBBCB?>BCCA?DCCACCCCCC@C;BBB??B<;9=C@BCAACBBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-8 XS:i:-8 XN:i:0 XM:i:4 XO:i:0 XG:i:0 YT:Z:UU MD:Z:2T0A0A6G88 NM:i:4 +SRR065390.26023345 0 CHROMOSOME_I 10000000177 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTCAGCCGAA CCCCCCCCCCCCCCCCCCCCCCCCCCBCCCCCCCCCCCCCA?CDADABDBDDBDDBAB>>BBBB@;>@BBB?A>CBBB<>>B@@4@?>>?0ABD@@###### AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.6149508 0 CHROMOSOME_I 10000000179 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCCCCDDCCBD=CCDB@@DABAB=ABB??>>@BB=BCBAB>>D;A?><>AA>?A==+@A AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.6618950 0 CHROMOSOME_I 10000000179 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCDCCCCCDCCBCAACBBCBB@DADABBDAB?CBB@B;?BB=B>>>?:? 
AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.7246333 0 CHROMOSOME_I 10000000179 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCCCCCCCCCCCCCCCCCCCACCCDCCCCCCCCCCCDCCBCD@CBBDCADADADBDABBDBDABDBCBBA>BAB>>AC9A################## AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.8266146 0 CHROMOSOME_I 10000000179 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAACCCTCAGCCGAGGCCTACGC CDCCCCCCCCCCCCCCCCCBCCCCCCDCCCCCCACDCCCCCDACBDCABCB@A=ABBB@BBD@DB?B################################# AS:i:-10 XS:i:-10 XN:i:0 XM:i:5 XO:i:0 XG:i:0 YT:Z:UU MD:Z:80G3A4T1A5A2 NM:i:5 +SRR065390.8986893 0 CHROMOSOME_I 10000000179 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCC@CCCCCCCCCCA@CCCCD=CCCDAABBDB>BDDBDB;BB@@B=@BDB:.A>>BB:@################ AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 diff --git a/test/longrefs/longref_itr.expected.sam b/test/longrefs/longref_itr.expected.sam new file mode 100644 index 000000000..6aca06706 --- /dev/null +++ b/test/longrefs/longref_itr.expected.sam @@ -0,0 +1,26 @@ +@SQ SN:CHROMOSOME_I LN:10001009800 +SRR065390.14978392 16 CHROMOSOME_I 10000000002 1 27M1D73M * 0 0 CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-18 XS:i:-18 XN:i:0 XM:i:5 XO:i:1 XG:i:1 YT:Z:UU MD:Z:4A0G5G5G5G3^A73 NM:i:6 +SRR065390.921023 16 CHROMOSOME_I 10000000003 12 100M * 0 0 CTAAGCCTAAATCTAAGCCTAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###############################################???88:;98768700000<>:BBA?BBAB?BBBBBBBB>B>BB::;?:00000 AS:i:-6 XS:i:-13 XN:i:0 XM:i:3 XO:i:0 XG:i:0 YT:Z:UU MD:Z:10G0C10G77 NM:i:3 
+SRR065390.1871511 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA 0:BB@>B<=B@???@=8@B>BB@CA@DACDCBBCCCA@CCCCACCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.3743423 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##################?6@:7<=@3=@ABAAB>BDBBABADABDDDBDDBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.4251890 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###########@BB=BCBBC?B>B;>B@@ADBBB@DBBBBDCCBBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.5238868 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA @,=@@D8D;?BBB>;?BBB==BB@D;>D>BBB>BBDDBA@@BCCB@=BACBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.8289592 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###############################A?@C9@@BC=AABDD@A@DC@CB=@BA?6@CCAAC@+CCCCCCCCCCCCCCC@CCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.14497557 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ######@:@@.>=><;;B>AB>>BB?B=>B=BD>BDADDD>CCDDDBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.15617929 16 
CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA D?;;D>?C>CBAAACD@BB?B>BBDB>@BBDDBDC@CBDDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.16049575 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #######################@??BB8BBB@@:AB@BDBCCDCBDCCCCACCCCCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.17838261 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################@>=?B@DCBDB>@D>DBADCDDD>CC@DCCCCBCCACCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.22711273 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################B<@=<:6/0307==72@@=?788==;AAA:@CCAACCC?CCAACCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.22922978 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##########################B=B>A@BBBC??=@=A@AC<><<8>C6CCCCC8CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.23087186 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ############@:73???@6;D?B>:>BBA?B<>B@B>@B>@>BCDCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.23506653 16 CHROMOSOME_I 10000000003 
1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###############A/=A5::87@:=>6@AA>@CDBA@ABCB?BC>CD>DDBDC@CCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.23791575 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCCCACCCCAACCCTTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##############################B4;:=B@>A@BCB@@ABCCBB@BCC@CCDCCDCCDCCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-12 XS:i:-12 XN:i:0 XM:i:6 XO:i:0 XG:i:0 YT:Z:UU MD:Z:7T0A1G2T2G3A79 NM:i:6 +SRR065390.25911768 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##############@8B@B?9=:A?=@DDB>;B6?DDBCABABB@DDCCBDBDCCDACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.26055380 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################################DAA><0=>=B;?BACDBDABCBBC@CACACACACCACCCCCCCCCCCCCCCCCCCCCCBCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.26121674 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################?:AA::@DAAA>B??@A4@=BBBBDDBDBDCCBDDBCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.30352568 16 CHROMOSOME_I 10000000003 7 100M * 0 0 CTAGGGCTAACCCTCAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################################################################A>>5A?CCC@CCCCCCCCCC?CC:C@A@==@A@A@ AS:i:-10 XS:i:-19 XN:i:0 XM:i:5 XO:i:0 XG:i:0 YT:Z:UU MD:Z:3A1C4G3A37G47 NM:i:5 +SRR065390.31341126 16 CHROMOSOME_I 
10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ########################?AD?D@BCAABBBD@=DBCDBAACCDCAABCDCCCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.33653624 16 CHROMOSOME_I 10000000003 17 100M * 0 0 CTAATCCTAGGCCTAAGCCCAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ####################################??8?000-+0000,@ABBBB@B:B@B>BB????>>>@@?::?6?>>;>>@ACCCCBCCBACCCC AS:i:-6 XS:i:-19 XN:i:0 XM:i:3 XO:i:0 XG:i:0 YT:Z:UU MD:Z:4G4A9T80 NM:i:3 diff --git a/test/longrefs/longref_multi.expected.sam b/test/longrefs/longref_multi.expected.sam new file mode 100644 index 000000000..997ead54c --- /dev/null +++ b/test/longrefs/longref_multi.expected.sam @@ -0,0 +1,46 @@ +@SQ SN:CHROMOSOME_I LN:10001009800 +SRR065390.14978392 16 CHROMOSOME_I 10000000002 1 27M1D73M * 0 0 CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-18 XS:i:-18 XN:i:0 XM:i:5 XO:i:1 XG:i:1 YT:Z:UU MD:Z:4A0G5G5G5G3^A73 NM:i:6 +SRR065390.921023 16 CHROMOSOME_I 10000000003 12 100M * 0 0 CTAAGCCTAAATCTAAGCCTAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###############################################???88:;98768700000<>:BBA?BBAB?BBBBBBBB>B>BB::;?:00000 AS:i:-6 XS:i:-13 XN:i:0 XM:i:3 XO:i:0 XG:i:0 YT:Z:UU MD:Z:10G0C10G77 NM:i:3 +SRR065390.1871511 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA 0:BB@>B<=B@???@=8@B>BB@CA@DACDCBBCCCA@CCCCACCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.3743423 16 CHROMOSOME_I 10000000003 1 100M * 0 0 
CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##################?6@:7<=@3=@ABAAB>BDBBABADABDDDBDDBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.4251890 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###########@BB=BCBBC?B>B;>B@@ADBBB@DBBBBDCCBBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.5238868 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA @,=@@D8D;?BBB>;?BBB==BB@D;>D>BBB>BBDDBA@@BCCB@=BACBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.8289592 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###############################A?@C9@@BC=AABDD@A@DC@CB=@BA?6@CCAAC@+CCCCCCCCCCCCCCC@CCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.14497557 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ######@:@@.>=><;;B>AB>>BB?B=>B=BD>BDADDD>CCDDDBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.15617929 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA D?;;D>?C>CBAAACD@BB?B>BBDB>@BBDDBDC@CBDDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.16049575 16 CHROMOSOME_I 10000000003 1 100M * 0 0 
CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #######################@??BB8BBB@@:AB@BDBCCDCBDCCCCACCCCCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.17838261 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################@>=?B@DCBDB>@D>DBADCDDD>CC@DCCCCBCCACCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.22711273 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################B<@=<:6/0307==72@@=?788==;AAA:@CCAACCC?CCAACCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.22922978 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##########################B=B>A@BBBC??=@=A@AC<><<8>C6CCCCC8CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.23087186 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ############@:73???@6;D?B>:>BBA?B<>B@B>@B>@>BCDCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.23506653 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###############A/=A5::87@:=>6@AA>@CDBA@ABCB?BC>CD>DDBDC@CCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.23791575 16 CHROMOSOME_I 10000000003 1 100M * 0 0 
CTAAGCCCCACCCCAACCCTTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##############################B4;:=B@>A@BCB@@ABCCBB@BCC@CCDCCDCCDCCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-12 XS:i:-12 XN:i:0 XM:i:6 XO:i:0 XG:i:0 YT:Z:UU MD:Z:7T0A1G2T2G3A79 NM:i:6 +SRR065390.25911768 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##############@8B@B?9=:A?=@DDB>;B6?DDBCABABB@DDCCBDBDCCDACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.26055380 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################################DAA><0=>=B;?BACDBDABCBBC@CACACACACCACCCCCCCCCCCCCCCCCCCCCCBCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.26121674 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################?:AA::@DAAA>B??@A4@=BBBBDDBDBDCCBDDBCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.30352568 16 CHROMOSOME_I 10000000003 7 100M * 0 0 CTAGGGCTAACCCTCAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################################################################A>>5A?CCC@CCCCCCCCCC?CC:C@A@==@A@A@ AS:i:-10 XS:i:-19 XN:i:0 XM:i:5 XO:i:0 XG:i:0 YT:Z:UU MD:Z:3A1C4G3A37G47 NM:i:5 +SRR065390.31341126 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ########################?AD?D@BCAABBBD@=DBCDBAACCDCAABCDCCCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.33653624 16 CHROMOSOME_I 10000000003 17 100M * 
0 0 CTAATCCTAGGCCTAAGCCCAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ####################################??8?000-+0000,@ABBBB@B:B@B>BB????>>>@@?::?6?>>;>>@ACCCCBCCBACCCC AS:i:-6 XS:i:-19 XN:i:0 XM:i:3 XO:i:0 XG:i:0 YT:Z:UU MD:Z:4G4A9T80 NM:i:3 +SRR065390.28043279 16 CHROMOSOME_I 10000000004 0 9M1I90M * 0 0 TCTTCCGATCTCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #####A>=7A6DD=@AA?>AAABC@CAABDBCBBABDADBADCABBBDCDCDCACDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCCC AS:i:-26 XS:i:-26 XN:i:0 XM:i:6 XO:i:1 XG:i:1 YT:Z:UU MD:Z:1A0A0G2T1A0G89 NM:i:7 +SRR065390.29270443 16 CHROMOSOME_I 10000000006 1 100M * 0 0 AGCCTAAGCCGAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC ###################################@:88@@>B>C>CCCCA@CCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCC AS:i:-4 XS:i:-4 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:10T2G86 NM:i:2 +SRR065390.1364843 16 CHROMOSOME_I 10000000011 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ####################@=A=8@:>@;@@=>>B8?C6CCCCCCCCCCACCCCBBCCCCCCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.10190875 16 CHROMOSOME_I 10000000011 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ##################@@@@@@;>BBB?>A6BAB?BB=BAB@?:A.<===@7:4::>8D@BABBACCCCAB@CCCDDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.13556211 0 CHROMOSOME_I 10000000011 0 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGATTGGAAGAGCGGCTAAGCAGGAACGCCGGGCTCGATCTCAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCBCDCCB>BBBBB########################################### AS:i:-50 XS:i:-50 XN:i:0 XM:i:25 XO:i:0 XG:i:0 YT:Z:UU 
MD:Z:57C0C1A0A0G0C0C0T0A0A1C6C0T0A1G1C0T0A0A1C2A0A0G0C2A3 NM:i:25 +SRR065390.20524775 16 CHROMOSOME_I 10000000011 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ############################?9<8B=?@C8A<@?@C8CBDCCC=CCCCC??@CCDCCCCCCCCCCCCCCCCCCCCDCCCCCCCDCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.20580336 16 CHROMOSOME_I 10000000011 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ############################?:>@?@=>@=0<:CB>@B=DCADB@CCCCC@CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.22573273 16 CHROMOSOME_I 10000000011 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ##################################A9;?@CBBDBA>BB;ABDB>AAA;=>=0943@########### AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.20870114 0 CHROMOSOME_I 10000000012 1 100M * 0 0 AGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCABCCCC=BBBCA@B>B?D;B=>9?############################ AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.3863623 16 CHROMOSOME_I 10000000012 1 100M * 0 0 CGCCTACGCCTACGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC ##############################?@BB>B@BCABBB?DC@DADC@DCDCACDCBCCCCCCCCCCC@CCCCCCCCCCCCCCC1CCCCCCCCCCC AS:i:-6 XS:i:-6 XN:i:0 XM:i:3 XO:i:0 XG:i:0 YT:Z:UU MD:Z:0A5A5A87 NM:i:3 +SRR065390.1659845 0 CHROMOSOME_I 10000000013 0 100M * 0 0 GCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAACCTAAGCCTAAGCCCAACCCTAAGACCGAGACCGAGACC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCACCCAB@CCC######################################### AS:i:-22 XS:i:-22 XN:i:0 XM:i:11 XO:i:0 XG:i:0 YT:Z:UU 
MD:Z:60G14T2G6C1T0A2C1T0A2C1T0 NM:i:11 +SRR065390.1567418 16 CHROMOSOME_I 10000000015 1 100M * 0 0 CACAGCCTACGTCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #########################################?:8A@<@>>BBB8>BBB@BBBB>@:??::87688:?:::?@<@@97866@?>@@;;>:< AS:i:-8 XS:i:-8 XN:i:0 XM:i:4 XO:i:0 XG:i:0 YT:Z:UU MD:Z:1T0A6A1C88 NM:i:4 +SRR065390.4996386 16 CHROMOSOME_I 10000000015 17 100M * 0 0 CCAAGCCGAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###################################@@@@A=BB@C>>DCCACCCCCCC@CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-4 XS:i:-22 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:1T5T92 NM:i:2 +SRR065390.14822977 16 CHROMOSOME_I 10000000015 1 100M * 0 0 CGAAGCCAGAGCCTAGGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ####################################B:B?:==2>6@B@@C>?>A@CB5@??@28C@CCCBC@CC?CC?A@CC:CBCCCCCCCCCCCCCC AS:i:-8 XS:i:-8 XN:i:0 XM:i:4 XO:i:0 XG:i:0 YT:Z:UU MD:Z:1T5T0A6A84 NM:i:4 +SRR065390.15148736 16 CHROMOSOME_I 10000000015 17 100M * 0 0 CTGAGCCGAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###########################CCBC<=C;9??<;==C@BCCCCC=CCCCACACACCBBCCCCCCCCCCCCCCCCCBCCCCCCCCCCCBCA?CCC AS:i:-4 XS:i:-21 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:2A4T92 NM:i:2 +SRR065390.18089757 16 CHROMOSOME_I 10000000015 1 100M * 0 0 CTGAGCCTGAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ########################A212.0:?.>8?BB?B<@@C?CCBCB;DCCCACDCCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-4 XS:i:-4 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:2A5A91 NM:i:2 +SRR065390.25601994 16 CHROMOSOME_I 10000000015 17 100M * 0 0 ATAAGCCTAATCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #####################???DD?BD?BDBB>ACBDBDDBDDDBDBDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-4 XS:i:-21 XN:i:0 
XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:0C9G89 NM:i:2 +SRR065390.29400981 16 CHROMOSOME_I 10000000015 17 100M * 0 0 CGAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###############################A<:?C>>BCABABC?AD>BDADDDBDBBDBDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-4 XS:i:-18 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:1T2G95 NM:i:2 diff --git a/test/maintainer/check_copyright.pl b/test/maintainer/check_copyright.pl new file mode 100755 index 000000000..161a7214c --- /dev/null +++ b/test/maintainer/check_copyright.pl @@ -0,0 +1,95 @@ +#!/usr/bin/perl +# check_copyright.pl : Basic source file checks for copyright boilerplate +# +# Author : Rob Davies +# +# Copyright (C) 2018 Genome Research Ltd. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
+ +use strict; +use warnings; +use File::Find; +use Getopt::Long; + +my $verbose = 0; +GetOptions('v' => \$verbose); + +my ($root) = @ARGV; +if (!$root) { + die "Usage: $0 [-v] \n"; +} +my $errors = 0; +find({ wanted => \&check, no_chdir=>1}, $root); +exit($errors ? 1 : 0); + +sub check { + # Only check C, perl and shell files + return unless (/(?:\.[ch]|\.pl|\.sh)$/); + + # Exclusions: + my %exclude = map { ("$root/$_", 1) } ( +'config.h', # Auto-generated +'version.h', # Auto-generated +'cram/rANS_byte.h', # "Public domain" +'os/lzma_stub.h', # "Public domain" +'os/rand.c'); # drand48 license + return if exists($exclude{$_}); + + my $remove_left = /\.[ch]$/ ? qr/\s*\*?\s*/ : qr/\s*#\s*/; + + return unless (-f $_); # Only check plain files + my $in; + if (!open($in, '<', $_)) { + print STDERR "Couldn't open $_ : $!\n"; + $errors++; + return; + } + my $count = 0; + my $copyright_found = 0; + my $license_found = ""; + my $line; + while ($count < 100 && ($line = <$in>)) { + $count++; + $line =~ s/^$remove_left//; + $line =~ s/\s+/ /g; + if ($line =~ /^Copyright\s+\([cC]\)\s+(?:19|20)\d\d[-, ]/) { + $copyright_found = 1; + } elsif ($line =~ /^Redistribution and use in source and binary forms/) { + $license_found = "BSD"; + } elsif ($line =~ /^Permission is hereby granted, free of charge/) { + $license_found = "MIT"; + } + last if ($copyright_found && $license_found); + } + if (!close($in)) { + print STDERR "Error on closing $_ : $!\n"; + $errors++; + return; + } + my $failed = (!$copyright_found || !$license_found); + if ($verbose || $failed) { + printf("$_ : %s%s\n", + $license_found ? $license_found : "no_license", + $copyright_found ? 
"" : " no_copyright_line"); + } + if ($failed) { + $errors++; + } +} diff --git a/test/maintainer/check_spaces.pl b/test/maintainer/check_spaces.pl new file mode 100755 index 000000000..0daf24406 --- /dev/null +++ b/test/maintainer/check_spaces.pl @@ -0,0 +1,92 @@ +#!/usr/bin/perl +# check_spaces.pl : Check source files for tabs and trailing spaces +# +# Author : Rob Davies +# +# Copyright (C) 2018 Genome Research Ltd. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +use strict; +use warnings; +use File::Find; +use Getopt::Long; + +my $verbose = 0; +GetOptions('v' => \$verbose); + +my ($root) = @ARGV; +if (!$root) { + die "Usage: $0 [-v] \n"; +} +my $errors = 0; +find({ wanted => \&check, no_chdir=>1}, $root); +exit($errors ? 
1 : 0); + +sub check { + # Only check C, perl and shell files + return unless (/(?:\.[ch]|\.pl|\.sh)$/); + + my %allow_tabs = map { ("$root/$_", 1) } ( +'kfunc.c', +'knetfile.c', +'kstring.c', +'md5.c', +'htslib/khash.h', +'htslib/kseq.h', +'htslib/ksort.h', +'htslib/kstring.h', +'htslib/knetfile.h', +'htslib/klist.h', +'htslib/kbitset.h', +'os/rand.c', +); + + my $check_tabs = !exists($allow_tabs{$_}); + + my $in; + if (!open($in, '<', $_)) { + print STDERR "Couldn't open $_ : $!\n"; + $errors++; + return; + } + my $tab = 0; + my $trailing = 0; + while (my $line = <$in>) { + chomp($line); + if ($check_tabs && $line =~ /\t/) { $tab = 1; } + if ($line =~ /\s$/) { $trailing = 1; } + } + if (!close($in)) { + print STDERR "Error on closing $_ : $!\n"; + $errors++; + return; + } + my $failed = ($tab || $trailing); + if ($verbose || $failed) { + my $msg = ($failed ? join(" ", + $tab ? ("includes_tabs") : (), + $trailing ? "trailing_spaces" : ()) + : "ok"); + print "$_ : $msg\n"; + } + if ($failed) { + $errors++; + } +} diff --git a/test/mpileup/c1#pad1.out b/test/mpileup/c1#pad1.out new file mode 100644 index 000000000..cbac51ad8 --- /dev/null +++ b/test/mpileup/c1#pad1.out @@ -0,0 +1,10 @@ +c1 1 9 ^!A^!A^!A^!A^!A^!A^!A^!A^!A +c1 2 9 AAAAAAAAA-3() +c1 3 9 CCCCCCCC* +c1 4 9 CCCCCCCC-1()* +c1 5 9 GGGG+6(GTTAAC)G+6(*TTAA*)G+6(GTT***)G+6(***AAC)*+6(**TA**)-1()*+6(GTTAAC)-3() +c1 6 9 CCCCCCC** +c1 7 9 GGGGGGGG* +c1 8 9 GGGGGGGG* +c1 9 9 TTTTTTTTT +c1 10 9 T$T$T$T$T$T$T$T$T$ diff --git a/test/mpileup/c1#pad1.sam b/test/mpileup/c1#pad1.sam new file mode 100644 index 000000000..93d88d1a4 --- /dev/null +++ b/test/mpileup/c1#pad1.sam @@ -0,0 +1,47 @@ +@HD VN:1.6 SO:coordinate +@SQ SN:c1 LN:10 +@CO +@CO Copyright (c) 2014,2018 Genome Research Ltd. 
+@CO +@CO Permission is hereby granted, free of charge, to any person obtaining +@CO a copy of this software and associated documentation files (the +@CO "Software"), to deal in the Software without restriction, including +@CO without limitation the rights to use, copy, modify, merge, publish, +@CO distribute, sublicense, and/or sell copies of the Software, and to +@CO permit persons to whom the Software is furnished to do so, subject +@CO to the following conditions: +@CO +@CO The above copyright notice and this permission notice shall be included +@CO in all copies or substantial portions of the Software. +@CO +@CO THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +@CO OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +@CO MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +@CO IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +@CO CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +@CO TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +@CO SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+@CO +@CO Ref AACCG******CGGTT +@CO 12345 67890 Depadded base numbering +@CO 1 +@CO +@CO s0a AACCG CGGTT 10M +@CO s0b AACCG CGGTT 10M +@CO s0c AACCG CGGTT 10M +@CO s1 AACCGGTTAACCGGTT 5M 6I 5M +@CO s2 AACCG*TTAA*CGGTT 5M 1P 4I 1P 5M +@CO s3 AACCGGTT***CGGTT 5M 3I 3P 5M +@CO s4 AACCG***AACCGGTT 5M 3P 3I 5M +@CO s5 AACC***TA***GGTT 4M 1D 2P 2I 2P 1D 4M +@CO s6 AA***GTTAAC***TT 2M 3D 6I 3D 2M +@CO +s0a 0 c1 1 0 10M * 0 0 AACCGCGGTT * +s0b 0 c1 1 0 10M * 0 0 AACCGCGGTT * +s0c 0 c1 1 0 10M * 0 0 AACCGCGGTT * +s1 0 c1 1 0 5M6I5M * 0 0 AACCGGTTAACCGGTT * +s2 0 c1 1 0 5M1P4I1P5M * 0 0 AACCGTTAACGGTT * +s3 0 c1 1 0 5M3I3P5M * 0 0 AACCGGTTCGGTT * +s4 0 c1 1 0 5M3P3I5M * 0 0 AACCGAACCGGTT * +s5 0 c1 1 0 4M1D2P2I2P1D4M * 0 0 AACCTAGGTT * +s6 0 c1 1 0 2M3D6I3D2M * 0 0 AAGTTAACTT * diff --git a/test/mpileup/c1#pad2.out b/test/mpileup/c1#pad2.out new file mode 100644 index 000000000..9cab78a87 --- /dev/null +++ b/test/mpileup/c1#pad2.out @@ -0,0 +1,10 @@ +c1 1 12 ^!A^!A^!A^!A^!A^!A^!A^!A^!A^!A^!A^!* +c1 2 12 AAAAAAAAAA-3()A* +c1 3 12 CCCCCCCCC*C* +c1 4 12 CCCCCCCCC-1()*C-2()* +c1 5 13 GGGGG+6(GTTAAC)G+6(*TTAA*)G+6(GTT***)G+6(***AAC)*+6(**TA**)-1()*+6(GTTAAC)-3()**+6(**TA**)-5()^!G+6(**TA**)$ +c1 6 12 CCCCCCCC**** +c1 7 12 GGGGGGGGG*G* +c1 8 12 GGGGGGGGG*G* +c1 9 12 TTTTTTTTTTT* +c1 10 12 T$T$T$T$T$T$T$T$T$T$T$*$ diff --git a/test/mpileup/c1#pad2.sam b/test/mpileup/c1#pad2.sam new file mode 100644 index 000000000..bbbdd1111 --- /dev/null +++ b/test/mpileup/c1#pad2.sam @@ -0,0 +1,55 @@ +@HD VN:1.6 SO:coordinate +@SQ SN:c1 LN:10 +@CO +@CO Copyright (c) 2014,2018 Genome Research Ltd. 
+@CO +@CO Permission is hereby granted, free of charge, to any person obtaining +@CO a copy of this software and associated documentation files (the +@CO "Software"), to deal in the Software without restriction, including +@CO without limitation the rights to use, copy, modify, merge, publish, +@CO distribute, sublicense, and/or sell copies of the Software, and to +@CO permit persons to whom the Software is furnished to do so, subject +@CO to the following conditions: +@CO +@CO The above copyright notice and this permission notice shall be included +@CO in all copies or substantial portions of the Software. +@CO +@CO THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +@CO OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +@CO MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +@CO IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +@CO CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +@CO TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +@CO SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+@CO +@CO Ref AACCG******CGGTT +@CO 12345 67890 Depadded base numbering +@CO 1 +@CO +@CO s0a AACCG CGGTT 10M +@CO s0b AACCG CGGTT 10M +@CO s0c AACCG CGGTT 10M +@CO s0d AACCG CGGTT 10M +@CO s1 AACCGGTTAACCGGTT 5M 6I 5M +@CO s2 AACCG*TTAA*CGGTT 5M 1P 4I 1P 5M +@CO s3 AACCGGTT***CGGTT 5M 3I 3P 5M +@CO s4 AACCG***AACCGGTT 5M 3P 3I 5M +@CO s5 AACC***TA***GGTT 4M 1D 2P 2I 2P 1D 4M +@CO s6 AA***GTTAAC***TT 2M 3D 6I 3D 2M +@CO s7 AACC* *GGTT 4M 2D 4M +@CO s8 *******TA******* 5D 2P 2I 2P 5D +@CO s9 G**TA** 1M 2P 2I 2P +@CO +s0a 0 c1 1 0 10M * 0 0 AACCGCGGTT * +s0b 0 c1 1 0 10M * 0 0 AACCGCGGTT * +s0c 0 c1 1 0 10M * 0 0 AACCGCGGTT * +s0d 0 c1 1 0 10M * 0 0 AACCGCGGTT * +s1 0 c1 1 0 5M6I5M * 0 0 AACCGGTTAACCGGTT * +s2 0 c1 1 0 5M1P4I1P5M * 0 0 AACCGTTAACGGTT * +s3 0 c1 1 0 5M3I3P5M * 0 0 AACCGGTTCGGTT * +s4 0 c1 1 0 5M3P3I5M * 0 0 AACCGAACCGGTT * +s5 0 c1 1 0 4M1D2P2I2P1D4M * 0 0 AACCTAGGTT * +s6 0 c1 1 0 2M3D6I3D2M * 0 0 AAGTTAACTT * +s7 0 c1 1 0 4M2D4M * 0 0 AACCGGTT * +s8 0 c1 1 0 5D2P2I2P5D * 0 0 TA * +s9 0 c1 5 0 1M2P2I2P * 0 0 GTA * diff --git a/test/mpileup/c1#pad3.out b/test/mpileup/c1#pad3.out new file mode 100644 index 000000000..d56eae596 --- /dev/null +++ b/test/mpileup/c1#pad3.out @@ -0,0 +1,5 @@ +c1 6 11 ^!C^!C^!C^!C^!C^!C^!C^!C^!*^!*^!* +c1 7 11 GGGGGGGGG*G +c1 8 11 GGGGGGGGG*G +c1 9 11 TTTTTTTTTTT +c1 10 11 T$T$T$T$T$T$T$T$T$T$T$ diff --git a/test/mpileup/c1#pad3.sam b/test/mpileup/c1#pad3.sam new file mode 100644 index 000000000..c5c043836 --- /dev/null +++ b/test/mpileup/c1#pad3.sam @@ -0,0 +1,53 @@ +@HD VN:1.6 SO:coordinate +@SQ SN:c1 LN:16 +@RG ID:p.sam SM:unknown LB:p.sam +@CO +@CO Copyright (c) 2014,2018 Genome Research Ltd. 
+@CO +@CO Permission is hereby granted, free of charge, to any person obtaining +@CO a copy of this software and associated documentation files (the +@CO "Software"), to deal in the Software without restriction, including +@CO without limitation the rights to use, copy, modify, merge, publish, +@CO distribute, sublicense, and/or sell copies of the Software, and to +@CO permit persons to whom the Software is furnished to do so, subject +@CO to the following conditions: +@CO +@CO The above copyright notice and this permission notice shall be included +@CO in all copies or substantial portions of the Software. +@CO +@CO THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +@CO OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +@CO MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +@CO IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +@CO CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +@CO TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +@CO SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+@CO +@CO Ref ***********CGGTT +@CO 12345 Depadded base numbering +@CO +@CO s0a AACCG******CGGTT 5I 6P 5M +@CO s0b AACCG******CGGTT 5I 6P 5M +@CO s0c AACCG******CGGTT 5I 6P 5M +@CO s0d AACCG******CGGTT 5I 6P 5M +@CO s1 AACCGGTTAACCGGTT 11I 5M +@CO s2 AACCG*TTAA*CGGTT 5I 1P 4I 1P 5M +@CO s3 AACCGGTT***CGGTT 8I 3P 5M +@CO s4 AACCG***AACCGGTT 5I 3P 3I 5M +@CO s5 AACC***TA***GGTT 4I 3P 2I 2P 1D 4M +@CO s6 AA***GTTAAC***TT 2I 3P 6I 3D 2M +@CO s7 AACC********GGTT 4I 7P 1D 4M +@CO s8 *******TA** 7P 2I 2P +@CO +s0a 0 c1 6 0 5I6P5M * 0 0 AACCGCGGTT * RG:Z:p.sam +s0b 0 c1 6 0 5I6P5M * 0 0 AACCGCGGTT * RG:Z:p.sam +s0c 0 c1 6 0 5I6P5M * 0 0 AACCGCGGTT * RG:Z:p.sam +s0d 0 c1 6 0 5I6P5M * 0 0 AACCGCGGTT * RG:Z:p.sam +s1 0 c1 6 0 11I5M * 0 0 AACCGGTTAACCGGTT * RG:Z:p.sam +s2 0 c1 6 0 5I1P4I1P5M * 0 0 AACCGTTAACGGTT * RG:Z:p.sam +s3 0 c1 6 0 8I3P5M * 0 0 AACCGGTTCGGTT * RG:Z:p.sam +s4 0 c1 6 0 5I3P3I5M * 0 0 AACCGAACCGGTT * RG:Z:p.sam +s5 0 c1 6 0 4I3P2I2P1D4M * 0 0 AACCTAGGTT * RG:Z:p.sam +s6 0 c1 6 0 2I3P6I3D2M * 0 0 AAGTTAACTT * RG:Z:p.sam +s7 0 c1 6 0 4I7P1D4M * 0 0 AACCGGTT * RG:Z:p.sam +s8 0 c1 6 0 7P2I2P * 0 0 TA !! RG:Z:p.sam diff --git a/test/mpileup/mp_D.out b/test/mpileup/mp_D.out new file mode 100644 index 000000000..656cedb5d --- /dev/null +++ b/test/mpileup/mp_D.out @@ -0,0 +1,11 @@ +z 2 3 ^!A^!A^!* +z 3 3 GG* +z 4 3 CCC +z 5 3 TT-3()T +z 6 3 T*T +z 7 3 A*A +z 8 3 G*G +z 9 3 CCC +z 10 3 AAA-2() +z 11 3 GG* +z 12 3 G$G$*$ diff --git a/test/mpileup/mp_D.sam b/test/mpileup/mp_D.sam new file mode 100644 index 000000000..73b95993e --- /dev/null +++ b/test/mpileup/mp_D.sam @@ -0,0 +1,36 @@ +@HD VN:1.6 SO:coordinate +@SQ SN:z LN:13 +@CO +@CO Copyright (c) 2018 Genome Research Ltd. 
+@CO +@CO Permission is hereby granted, free of charge, to any person obtaining +@CO a copy of this software and associated documentation files (the +@CO "Software"), to deal in the Software without restriction, including +@CO without limitation the rights to use, copy, modify, merge, publish, +@CO distribute, sublicense, and/or sell copies of the Software, and to +@CO permit persons to whom the Software is furnished to do so, subject +@CO to the following conditions: +@CO +@CO The above copyright notice and this permission notice shall be included +@CO in all copies or substantial portions of the Software. +@CO +@CO THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +@CO OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +@CO MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +@CO IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +@CO CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +@CO TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +@CO SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+@CO +@CO 1 +@CO 1234567890123 Depadded base numbering +@CO ref TAGCTTAGCAGGT +@CO +@CO s1 AGCTTAGCAGG 11M +@CO s2 AGCT***CAGG 4M 3D 4M +@CO s3 **CTTAGCA** 2D 7M 2D +@CO qual 01234567890 +@CO +s1 0 z 2 0 11M * 0 0 AGCTTAGCAGG 01234567890 +s2 0 z 2 0 4M3D4M * 0 0 AGCTCAGG 01237890 +s3 0 z 2 0 2D7M2D * 0 0 CTTAGCA 2345678 diff --git a/test/mpileup/mp_DI.out b/test/mpileup/mp_DI.out new file mode 100644 index 000000000..8a2ff2e2a --- /dev/null +++ b/test/mpileup/mp_DI.out @@ -0,0 +1,12 @@ +z 2 5 ^!A^!A^!A^!*^!* +z 3 5 GGG*+2(AA)*+2(*A) +z 4 5 CCCCC +z 5 5 TTTTT +z 6 5 TTTTT +z 7 5 AAAAA +z 8 5 G-2()G-2()G-2()GG +z 9 5 ***CC +z 10 5 *+2(TT)*+2(TT)$*+2(*T)$AA +z 11 3 GGG +z 12 3 GG$G$ +z 13 1 C$ diff --git a/test/mpileup/mp_DI.sam b/test/mpileup/mp_DI.sam new file mode 100644 index 000000000..dffd95c39 --- /dev/null +++ b/test/mpileup/mp_DI.sam @@ -0,0 +1,40 @@ +@HD VN:1.6 SO:coordinate +@SQ SN:z LN:13 +@CO +@CO Copyright (c) 2018 Genome Research Ltd. +@CO +@CO Permission is hereby granted, free of charge, to any person obtaining +@CO a copy of this software and associated documentation files (the +@CO "Software"), to deal in the Software without restriction, including +@CO without limitation the rights to use, copy, modify, merge, publish, +@CO distribute, sublicense, and/or sell copies of the Software, and to +@CO permit persons to whom the Software is furnished to do so, subject +@CO to the following conditions: +@CO +@CO The above copyright notice and this permission notice shall be included +@CO in all copies or substantial portions of the Software. +@CO +@CO THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +@CO OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +@CO MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+@CO IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +@CO CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +@CO TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +@CO SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +@CO +@CO 1 +@CO 123 4567890 123 Depadded base numbering +@CO ref TAG CTTAGCA GGT +@CO +@CO s1 AG CTTAG**TTGGC 7M 2D 2I 3M +@CO s2 AG CTTAG**TT 7M 2D 2I +@CO s4 AG CTTAG***T 7M 2D 1P 1I +@CO s3 **AACTTAGCA GG 2D 2I 9M +@CO s5 ***ACTTAGCA GG 2D 1P 1I 9M +@CO 01AB2345678AB901 +@CO +s1 0 z 2 0 7M2D2I3M * 0 0 AGCTTAGTTGGC 0123456AB901 +s2 0 z 2 0 7M2D2I * 0 0 AGCTTAGTT 0123456AB +s4 0 z 2 0 7M2D1P1I * 0 0 AGCTTAGT 0123456B +s3 0 z 2 0 2D2I9M * 0 0 AACTTAGCAGG AB234567890 +s5 0 z 2 0 2D1P1I9M * 0 0 ACTTAGCAGG B234567890 diff --git a/test/mpileup/mp_I.out b/test/mpileup/mp_I.out new file mode 100644 index 000000000..c437a49d9 --- /dev/null +++ b/test/mpileup/mp_I.out @@ -0,0 +1,11 @@ +z 2 3 ^!A^!A^!A +z 3 3 GGG +z 4 3 CCC +z 5 3 TT+3(CCC)T +z 6 3 TTT +z 7 3 AAA +z 8 3 GGG +z 9 3 CCC +z 10 3 AAA +z 11 3 GGG +z 12 3 G$G$G+2(=A)$ diff --git a/test/mpileup/mp_I.sam b/test/mpileup/mp_I.sam new file mode 100644 index 000000000..9f48e21d4 --- /dev/null +++ b/test/mpileup/mp_I.sam @@ -0,0 +1,41 @@ +@HD VN:1.6 SO:coordinate +@SQ SN:z LN:13 +@CO +@CO Copyright (c) 2018 Genome Research Ltd. +@CO +@CO Permission is hereby granted, free of charge, to any person obtaining +@CO a copy of this software and associated documentation files (the +@CO "Software"), to deal in the Software without restriction, including +@CO without limitation the rights to use, copy, modify, merge, publish, +@CO distribute, sublicense, and/or sell copies of the Software, and to +@CO permit persons to whom the Software is furnished to do so, subject +@CO to the following conditions: +@CO +@CO The above copyright notice and this permission notice shall be included +@CO in all copies or substantial portions of the Software. 
+@CO +@CO THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +@CO OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +@CO MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +@CO IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +@CO CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +@CO TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +@CO SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +@CO +@CO Various I combinations +@CO Starting with I isn't handled due to the pileup ordering. +@CO The record only appears in the pileup list when the first base +@CO is placed against the reference, which is too late. +@CO +@CO 1 +@CO 1 2345 6789012 3 Depadded base numbering +@CO ref T AGCT TAGCAGG T +@CO +@CO s1 AGCT TAGCAGG 11M +@CO s2 AGCTCCCTAGCAGG 4M 3I 7M +@CO s3 A=AGCT TAGCAGG=A 2I 11M 2I +@CO AB0123ABC4567890CD +@CO +s1 0 z 2 0 11M * 0 0 AGCTTAGCAGG 01234567890 +s2 0 z 2 0 4M3I7M * 0 0 AGCTCCCTAGCAGG 0123ABC4567890 +s2 0 z 2 0 2I11M2I * 0 0 A=AGCTTAGCAGG=A AB01234567890CD diff --git a/test/mpileup/mp_ID.out b/test/mpileup/mp_ID.out new file mode 100644 index 000000000..4f88f51e0 --- /dev/null +++ b/test/mpileup/mp_ID.out @@ -0,0 +1,12 @@ +z 2 3 ^!A^!A^!A +z 3 3 GGG +z 4 5 CCC^!*^!* +z 5 5 TTT** +z 6 5 TTTTT +z 7 5 AAAAA +z 8 5 G+2(TT)-2()G+2(TT)-2()G+2(T*)-2()GG +z 9 5 ***CC +z 10 5 **$*$AA +z 11 3 GGG +z 12 3 GG$G$ +z 13 1 C$ diff --git a/test/mpileup/mp_ID.sam b/test/mpileup/mp_ID.sam new file mode 100644 index 000000000..1b2197676 --- /dev/null +++ b/test/mpileup/mp_ID.sam @@ -0,0 +1,45 @@ +@HD VN:1.6 SO:coordinate +@SQ SN:z LN:13 +@CO +@CO Copyright (c) 2018 Genome Research Ltd. 
+@CO +@CO Permission is hereby granted, free of charge, to any person obtaining +@CO a copy of this software and associated documentation files (the +@CO "Software"), to deal in the Software without restriction, including +@CO without limitation the rights to use, copy, modify, merge, publish, +@CO distribute, sublicense, and/or sell copies of the Software, and to +@CO permit persons to whom the Software is furnished to do so, subject +@CO to the following conditions: +@CO +@CO The above copyright notice and this permission notice shall be included +@CO in all copies or substantial portions of the Software. +@CO +@CO THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +@CO OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +@CO MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +@CO IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +@CO CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +@CO TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +@CO SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +@CO +@CO Various I combinations +@CO Starting with I isn't handled due to the pileup ordering. +@CO The record only appears in the pileup list when the first base +@CO is placed against the reference, which is too late. 
+@CO +@CO 1 +@CO 123 45678 90123 Depadded base numbering +@CO ref TAG CTTAG CAGGT +@CO +@CO s1 AG CTTAGTT**GGC 7M 2I 2D 3M +@CO s2 AG CTTAGTT** 7M 2I 2D +@CO s3 AG CTTAGT*** 7M 1I 1P 2D +@CO s4 AA**TAG CAGG 2I 2D 7M +@CO s5 *A**TAG CAGG 1P 1I 2D 7M +@CO qual 01AB23456CD78901 +@CO +s1 0 z 2 0 7M2I2D3M * 0 0 AGCTTAGTTGGC 0123456CD901 +s2 0 z 2 0 7M2I2D * 0 0 AGCTTAGTT 0123456CD +s3 0 z 2 0 7M1I1P2D * 0 0 AGCTTAGT 0123456D +s4 0 z 4 0 2I2D7M * 0 0 AATAGCAGG AB4567890 +s5 0 z 4 0 1P1I2D7M * 0 0 ATAGCAGG B4567890 diff --git a/test/mpileup/mp_N.out b/test/mpileup/mp_N.out new file mode 100644 index 000000000..695e4634c --- /dev/null +++ b/test/mpileup/mp_N.out @@ -0,0 +1,11 @@ +z 2 3 ^!A^!A^!> +z 3 3 GG> +z 4 3 CCC +z 5 3 TTT +z 6 3 T>T +z 7 3 A>A +z 8 3 G>G +z 9 3 CCC +z 10 3 AAA +z 11 3 GG> +z 12 3 G$G$>$ diff --git a/test/mpileup/mp_N.sam b/test/mpileup/mp_N.sam new file mode 100644 index 000000000..cc9ab7784 --- /dev/null +++ b/test/mpileup/mp_N.sam @@ -0,0 +1,40 @@ +@HD VN:1.6 SO:coordinate +@SQ SN:z LN:13 +@CO +@CO Copyright (c) 2018 Genome Research Ltd. +@CO +@CO Permission is hereby granted, free of charge, to any person obtaining +@CO a copy of this software and associated documentation files (the +@CO "Software"), to deal in the Software without restriction, including +@CO without limitation the rights to use, copy, modify, merge, publish, +@CO distribute, sublicense, and/or sell copies of the Software, and to +@CO permit persons to whom the Software is furnished to do so, subject +@CO to the following conditions: +@CO +@CO The above copyright notice and this permission notice shall be included +@CO in all copies or substantial portions of the Software. +@CO +@CO THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +@CO OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +@CO MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+@CO IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +@CO CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +@CO TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +@CO SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +@CO +@CO Various I combinations +@CO Starting with I isn't handled due to the pileup ordering. +@CO The record only appears in the pileup list when the first base +@CO is placed against the reference, which is too late. +@CO +@CO 1 +@CO 1234567890123 Depadded base numbering +@CO ref TAGCTTAGCAGGT +@CO s1 AGCTTAGCAGG 11M +@CO s2 AGCT>>>CAGG 4M 3N 4M +@CO s3 >>CTTAGCA>> 2N 7M 2N +@CO qual 01234567890 +@CO +s1 0 z 2 0 11M * 0 0 AGCTTAGCAGG 01234567890 +s2 0 z 2 0 4M3N4M * 0 0 AGCTCAGG 01237890 +s3 0 z 2 0 2N7M2N * 0 0 CTTAGCA 2345678 diff --git a/test/mpileup/mp_N2.out b/test/mpileup/mp_N2.out new file mode 100644 index 000000000..baf168364 --- /dev/null +++ b/test/mpileup/mp_N2.out @@ -0,0 +1,13 @@ +z 1 6 ^!T^!T^!T^!T^!T^!T +z 2 6 AAAAAA +z 3 6 GGGGGG +z 4 6 C+2(AA)-5()C+2(A*)-5()C+2(*A)-5()C+2(AA)C+2(A*)C+2(*A) +z 5 6 ***>>> +z 6 6 ***>>> +z 7 6 ***>>> +z 8 6 ***>>> +z 9 6 *+2(TT)*+2(*T)*+2(T*)>+2(TT)>+2(*T)>+2(T*) +z 10 6 AAAAAA +z 11 6 GGGGGG +z 12 6 GGGGGG +z 13 6 T$T$T$T$T$T$ diff --git a/test/mpileup/mp_N2.sam b/test/mpileup/mp_N2.sam new file mode 100644 index 000000000..292cfcb58 --- /dev/null +++ b/test/mpileup/mp_N2.sam @@ -0,0 +1,46 @@ +@HD VN:1.6 SO:coordinate +@SQ SN:z LN:13 +@CO +@CO Copyright (c) 2018 Genome Research Ltd. 
+@CO +@CO Permission is hereby granted, free of charge, to any person obtaining +@CO a copy of this software and associated documentation files (the +@CO "Software"), to deal in the Software without restriction, including +@CO without limitation the rights to use, copy, modify, merge, publish, +@CO distribute, sublicense, and/or sell copies of the Software, and to +@CO permit persons to whom the Software is furnished to do so, subject +@CO to the following conditions: +@CO +@CO The above copyright notice and this permission notice shall be included +@CO in all copies or substantial portions of the Software. +@CO +@CO THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +@CO OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +@CO MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +@CO IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +@CO CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +@CO TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +@CO SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +@CO +@CO Various I combinations +@CO Starting with I isn't handled due to the pileup ordering. +@CO The record only appears in the pileup list when the first base +@CO is placed against the reference, which is too late. 
+@CO +@CO 1 +@CO 1234 56789 0123 Depadded base numbering +@CO ref TAGC TTAGC AGGT +@CO sD1 TAGCAA*****TTAGGT 4M 2I 5D 2I 4M +@CO sD2 TAGCA*******TAGGT 4M 1I 1P 5D 1P 1I 4M +@CO sD3 TAGC*A*****T*AGGT 4M 1P 1II 5D 1I 1P 4M +@CO sN1 TAGCAA>>>>>TTAGGT 4M 2I 5N 2I 4M +@CO sN2 TAGCA*>>>>>*TAGGT 4M 1I 1P 5N 1P 1I 4M +@CO sN3 TAGC*A>>>>>T*AGGT 4M 1P 1I 5N 1I 1P 4M +@CO qual ABCDEF GHIJKL +@CO +sD1 0 z 1 0 4M2I5D2I4M * 0 0 TAGCAATTAGGT ABCDEFGHIJKL +sD2 0 z 1 0 4M1I1P5D1P1I4M * 0 0 TAGCATAGGT ABCDEHIJKL +sD3 0 z 1 0 4M1P1II5D1I1P4M * 0 0 TAGCATAGGT ABCDFGIJKL +sN1 0 z 1 0 4M2I5N2I4M * 0 0 TAGCAATTAGGT ABCDEFGHIJKL +sN2 0 z 1 0 4M1I1P5N1P1I4M * 0 0 TAGCATAGGT ABCDEHIJKL +sN3 0 z 1 0 4M1P1I5N1I1P4M * 0 0 TAGCATAGGT ABCDFGIJKL diff --git a/test/mpileup/mp_P.out b/test/mpileup/mp_P.out new file mode 100644 index 000000000..471b7d7fc --- /dev/null +++ b/test/mpileup/mp_P.out @@ -0,0 +1,10 @@ +z 2 5 ^!A^!A^!A^!A^!A +z 3 5 GGGGG +z 4 5 CCCCC +z 5 5 TTTTT +z 6 5 TT+4(GGCC)T+4(GG**)T+4(*GC*)T+4(**CC) +z 7 5 AAAAA +z 8 5 GGGGG +z 9 5 CCCCC +z 10 5 AAAAA +z 11 5 G$G$G$G$G$ diff --git a/test/mpileup/mp_P.sam b/test/mpileup/mp_P.sam new file mode 100644 index 000000000..9a8c9d6cb --- /dev/null +++ b/test/mpileup/mp_P.sam @@ -0,0 +1,41 @@ +@HD VN:1.6 SO:coordinate +@SQ SN:z LN:13 +@CO +@CO Copyright (c) 2018 Genome Research Ltd. +@CO +@CO Permission is hereby granted, free of charge, to any person obtaining +@CO a copy of this software and associated documentation files (the +@CO "Software"), to deal in the Software without restriction, including +@CO without limitation the rights to use, copy, modify, merge, publish, +@CO distribute, sublicense, and/or sell copies of the Software, and to +@CO permit persons to whom the Software is furnished to do so, subject +@CO to the following conditions: +@CO +@CO The above copyright notice and this permission notice shall be included +@CO in all copies or substantial portions of the Software. 
+@CO +@CO THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +@CO OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +@CO MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +@CO IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +@CO CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +@CO TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +@CO SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +@CO +@CO Various I combinations with (P)ads +@CO +@CO 1 +@CO 12345 67890 Depadded base numbering +@CO ref AGCTT AGCAG +@CO +@CO s1 AGCTT AGCAG 10M +@CO s2 AGCTTGGCCAGCAG 5M 4I 5M +@CO s3 AGCTTGG**AGCAG 5M 2I 2P 5M +@CO s4 AGCTT*GC*AGCAG 5M 1P 2I 1P 5M +@CO s5 AGCTT**CCAGCAG 5M 2P 2I 5M +@CO qual 01234ABCD56789 +s1 0 z 2 0 10M * 0 0 AGCTTAGCAG 0123456789 +s2 0 z 2 0 5M4I5M * 0 0 AGCTTGGCCAGCAG 01234ABCD56789 +s3 0 z 2 0 5M2I2P5M * 0 0 AGCTTGGAGCAG 01234AB56789 +s4 0 z 2 0 5M1P2I1P5M * 0 0 AGCTTGCAGCAG 01234BC56789 +s5 0 z 2 0 5M2P2I5M * 0 0 AGCTTCCAGCAG 01234CD56789 diff --git a/test/mpileup/mpileup.tst b/test/mpileup/mpileup.tst new file mode 100644 index 000000000..4ffbd3481 --- /dev/null +++ b/test/mpileup/mpileup.tst @@ -0,0 +1,73 @@ +# Copyright (C) 2017-2018 Genome Research Ltd. +# +# Author: Robert Davies +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# First field: +# INIT = initialisation, not counted in testing +# P = expected to pass (zero return; expected output matches, if present) +# N = expected to return non-zero +# F = expected to fail +# +# Second field (P/N/F only): +# Filename of expected output. If '.', output is not checked +# +# Rest: +# Command to execute. $pileup is replaced with the path to the pileup test +# program + +# Deletions +P mp_D.out $pileup mp_D.sam +P mp_D.out $pileup -m mp_D.sam + +# Deletions followed by insertions +P mp_DI.out $pileup mp_DI.sam +P mp_DI.out $pileup -m mp_DI.sam + +# NB: pileup currently cannot return leading insertions. +# Test output reflects this. +# Insertions +P mp_I.out $pileup mp_I.sam +P mp_I.out $pileup -m mp_I.sam +P mp_P.out $pileup mp_P.sam +P mp_P.out $pileup -m mp_P.sam + +# Insertions followed by deletions +P mp_ID.out $pileup mp_ID.sam +P mp_ID.out $pileup -m mp_ID.sam + +# Ref skips +P mp_N.out $pileup mp_N.sam +P mp_N.out $pileup -m mp_N.sam + +# Ref skips and deletions +P mp_N2.out $pileup mp_N2.sam +P mp_N2.out $pileup -m mp_N2.sam + +# Various combinations of insertions, deletions and pads +P c1#pad1.out $pileup c1#pad1.sam +P c1#pad1.out $pileup -m c1#pad1.sam +P c1#pad2.out $pileup c1#pad2.sam +P c1#pad2.out $pileup -m c1#pad2.sam +P c1#pad3.out $pileup c1#pad3.sam +P c1#pad3.out $pileup -m c1#pad3.sam + +# Issue #852. Problem caused by alignments with entirely S/I ops in CIGAR. 
+P small.out $pileup -m small.bam diff --git a/test/mpileup/small.bam b/test/mpileup/small.bam new file mode 100644 index 000000000..94e479740 Binary files /dev/null and b/test/mpileup/small.bam differ diff --git a/test/mpileup/small.out b/test/mpileup/small.out new file mode 100644 index 000000000..b5c161024 --- /dev/null +++ b/test/mpileup/small.out @@ -0,0 +1,322 @@ +2 1 1 ^]T +2 2 1 G +2 3 1 G +2 4 1 A +2 5 1 G +2 6 1 A +2 7 1 G +2 8 1 C +2 9 1 A +2 10 1 C +2 11 1 A +2 12 1 T +2 13 1 A +2 14 1 A +2 15 1 C +2 16 1 T +2 17 1 T +2 18 1 G +2 19 1 G +2 20 1 G +2 21 1 T +2 22 1 G +2 23 1 A +2 24 1 G +2 25 1 A +2 26 1 T +2 27 1 G +2 28 1 A +2 29 1 T +2 30 1 G +2 31 2 A^]A +2 32 2 AA +2 33 2 AA +2 34 2 TT +2 35 2 GG +2 36 2 AA +2 37 2 GG +2 38 2 CC +2 39 2 AA +2 40 2 CC +2 41 2 TT +2 42 2 GG +2 43 2 GG +2 44 2 CC +2 45 2 TT +2 46 2 TT +2 47 2 TT +2 48 2 GG +2 49 2 GG +2 50 2 AA +2 51 2 GG +2 52 2 TT +2 53 2 CC +2 54 2 AA +2 55 2 CC +2 56 2 AA +2 57 2 CC +2 58 2 AA +2 59 2 GG +2 60 2 AA +2 61 2 CC +2 62 2 CC +2 63 2 AA +2 64 3 GG^]g +2 65 3 GGg +2 66 3 GGg +2 67 4 TTt^]t +2 68 4 CCcc +2 69 4 CCcc +2 70 4 AAaa +2 71 4 GGgg +2 72 4 GGgg +2 73 4 CCcc +2 74 4 GGgg +2 75 4 C$Ccc +2 76 3 Ccc +2 77 3 Ttt +2 78 3 Agg +2 79 3 Ttt +2 80 3 Aat +2 81 3 Ccc +2 82 3 Ccc +2 83 3 Aaa +2 84 3 Ttt +2 85 3 Aaa +2 86 3 Aaa +2 87 3 Ccc +2 88 3 Acc +2 89 3 Ccc +2 90 3 Ttt +2 91 3 Ccc +2 92 3 Tgt +2 93 3 Aaa +2 94 3 Ggg +2 95 3 Ttt +2 96 3 Ggg +2 97 3 Ggg +2 98 3 Ttt +2 99 3 Ggg +2 100 3 Ttt +2 101 3 Ggg +2 102 3 Ggg +2 103 3 Ccc +2 104 3 Ggg +2 105 3 G$gg +2 106 2 aa +2 107 2 aa +2 108 2 cc +2 109 2 cc +2 110 2 tt +2 111 2 cc +2 112 2 tt +2 113 2 cc +2 114 2 aa +2 115 2 gg +2 116 2 aa +2 117 2 cc +2 118 2 cc +2 119 2 tt +2 120 2 cc +2 121 2 cc +2 122 2 cc +2 123 2 aa +2 124 2 gg +2 125 2 cc +2 126 2 cc +2 127 2 aa +2 128 2 gg +2 129 2 aa +2 130 2 aa +2 131 2 aa +2 132 2 gg +2 133 2 gg +2 134 2 ag +2 135 2 aa +2 136 2 tt +2 137 2 cc +2 138 2 t$t$ +2 495 1 ^Ft +2 496 1 t +2 497 1 t +2 498 1 g 
+2 499 1 g +2 500 1 c +2 501 1 a +2 502 1 a +2 503 1 t +2 504 1 t +2 505 1 t +2 506 1 a +2 507 1 c +2 508 1 a +2 509 1 c +2 510 1 t +2 511 1 g +2 512 1 t +2 513 1 g +2 514 1 t +2 515 1 t +2 516 1 a +2 517 1 t +2 518 1 a +2 519 1 g +2 520 1 c +2 521 1 a +2 522 1 a +2 523 1 t +2 524 1 a +2 525 1 t +2 526 1 a +2 527 1 g +2 528 1 t +2 529 1 g +2 530 1 a +2 531 1 a +2 532 1 a +2 533 1 a +2 534 1 g +2 535 1 g +2 536 1 g +2 537 1 t +2 538 1 g +2 539 1 a +2 540 1 t +2 541 1 c +2 542 1 a +2 543 1 t +2 544 1 t +2 545 1 a +2 546 1 c +2 547 1 c +2 548 1 t +2 549 1 c +2 550 1 a +2 551 1 a +2 552 1 g +2 553 1 a +2 554 1 c +2 555 1 t +2 556 1 g +2 557 1 t +2 558 1 t +2 559 1 c +2 560 1 a +2 561 1 c +2 562 1 a +2 563 1 a +2 564 1 a +2 565 1 c +2 566 1 a +2 567 1 c +2 568 1 a +2 569 1 t$ +2 648 1 ^gA +2 649 1 C +2 650 1 G +2 651 1 C +2 652 1 A +2 653 1 C +2 654 1 C +2 655 1 C +2 656 1 T +2 657 1 C +2 658 1 T +2 659 1 A +2 660 1 T +2 661 1 C +2 662 1 C +2 663 1 C +2 664 1 C +2 665 1 A +2 666 1 C +2 667 1 A +2 668 1 T +2 669 1 A +2 670 1 A +2 671 1 A +2 672 1 T +2 673 1 C +2 674 1 T +2 675 1 A +2 676 1 T +2 677 1 A +2 678 1 C +2 679 1 A +2 680 1 A +2 681 1 C +2 682 2 A^>a +2 683 2 Cc +2 684 2 Tt +2 685 2 Cc +2 686 2 Aa +2 687 2 Cc +2 688 2 Cc +2 689 2 Cc +2 690 2 Tt +2 691 2 Cc +2 692 2 Tt +2 693 2 Aa +2 694 2 Cc +2 695 2 Aa +2 696 2 Cc +2 697 2 Cc +2 698 2 Cc +2 699 2 Aa +2 700 2 Cc +2 701 2 Aa +2 702 2 Tt +2 703 2 Aa +2 704 2 Cc +2 705 2 Aa +2 706 2 Tt +2 707 2 Cc +2 708 2 Tt +2 709 2 Aa +2 710 2 Tt +2 711 2 Aa +2 712 2 Cc +2 713 2 Aa +2 714 2 Aa +2 715 2 Cc +2 716 2 Aa +2 717 2 Cc +2 718 2 Gg +2 719 2 C$c +2 720 1 a +2 721 1 c +2 722 1 c +2 723 1 c +2 724 1 t +2 725 1 c +2 726 1 t +2 727 1 a +2 728 1 c +2 729 1 c +2 730 1 c +2 731 1 c +2 732 1 a +2 733 1 c +2 734 1 a +2 735 1 t +2 736 1 a +2 737 1 c +2 738 1 g +2 739 1 t +2 740 1 c +2 741 1 t +2 742 1 a +2 743 1 c +2 744 1 a +2 745 1 c +2 746 1 a +2 747 1 a +2 748 1 c +2 749 1 a +2 750 1 t +2 751 1 g +2 752 1 c +2 753 1 a +2 754 1 
c +2 755 1 g +2 756 1 c$ diff --git a/test/mpileup/test-pileup.sh b/test/mpileup/test-pileup.sh new file mode 100755 index 000000000..8a83cca6d --- /dev/null +++ b/test/mpileup/test-pileup.sh @@ -0,0 +1,34 @@ +#!/bin/sh +# +# Copyright (C) 2017-2018 Genome Research Ltd. +# +# Author: Robert Davies +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# Load in the test driver +. ../simple_test_driver.sh + +echo "Testing (m)pileup..." + +pileup="../pileup" + +test_driver $@ + +exit $? 
diff --git a/test/no_hdr_sq_1.bam b/test/no_hdr_sq_1.bam new file mode 100644 index 000000000..93d57a2d5 Binary files /dev/null and b/test/no_hdr_sq_1.bam differ diff --git a/test/no_hdr_sq_1.bam.csi b/test/no_hdr_sq_1.bam.csi new file mode 100644 index 000000000..e37fc635c Binary files /dev/null and b/test/no_hdr_sq_1.bam.csi differ diff --git a/test/no_hdr_sq_1.expected.sam b/test/no_hdr_sq_1.expected.sam new file mode 100644 index 000000000..ae05bb1ad --- /dev/null +++ b/test/no_hdr_sq_1.expected.sam @@ -0,0 +1,16 @@ +@CO SN:CHROMOSOME_I LN:1009800 +@CO SN:CHROMOSOME_II LN:5000 +@CO SN:CHROMOSOME_III LN:5000 +@CO SN:CHROMOSOME_IV LN:5000 +@CO SN:CHROMOSOME_V LN:5000 +@SQ SN:CHROMOSOME_I LN:1009800 +@SQ SN:CHROMOSOME_II LN:5000 +@SQ SN:CHROMOSOME_III LN:5000 +@SQ SN:CHROMOSOME_IV LN:5000 +@SQ SN:CHROMOSOME_V LN:5000 +I 16 CHROMOSOME_I 2 1 27M1D73M * 0 0 CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC XG:i:1 XM:i:5 XN:i:0 XO:i:1 XS:i:-18 AS:i:-18 YT:Z:UU +II.14978392 16 CHROMOSOME_I 2 1 27M1D73M * 0 0 CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC XG:i:1 XM:i:5 XN:i:0 XO:i:1 XS:i:-18 AS:i:-18 YT:Z:UU +III 16 CHROMOSOME_I 2 1 27M1D73M * 0 0 CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC XG:i:1 XM:i:5 XN:i:0 XO:i:1 XS:i:-18 AS:i:-18 YT:Z:UU +IV 16 CHROMOSOME_I 2 1 27M1D73M * 0 0 CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC XG:i:1 XM:i:5 XN:i:0 XO:i:1 
XS:i:-18 AS:i:-18 YT:Z:UU +V 16 CHROMOSOME_I 2 1 27M1D73M * 0 0 CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC XG:i:1 XM:i:5 XN:i:0 XO:i:1 XS:i:-18 AS:i:-18 YT:Z:UU +VI 2048 CHROMOSOME_I 2 1 27M100000D73M * 0 0 ACTAAGCCTAAGCCTAAGCCTAAGCCAATTATCGATTTCTGAAAAAATTATCGAATTTTCTAGAAATTTTGCAAATTTTTTCATAAAATTATCGATTTTA #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC diff --git a/test/pileup.c b/test/pileup.c new file mode 100644 index 000000000..7362f218c --- /dev/null +++ b/test/pileup.c @@ -0,0 +1,255 @@ +/* test/pileup.c -- simple pileup tester + + Copyright (C) 2014,2018-2019 Genome Research Ltd. + + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +/* + The output from this program isn't quite the same as that from + `samtools mpileup`. 
It doesn't print the reference base column, + it puts brackets around insertion sequences to make them easier to spot + and it writes empty brackets after a reported deletion. + + The output from `samtools mpileup` can be converted to the same format like + this: + +samtools mpileup -B -Q 0 in.bam | perl -lane \ + 'pop(@F); + splice(@F, 2, 1); + $F[3] =~ s/\+(\d+)([ACGTN]+)/sprintf("+%d(%s)%s",$1,substr($2,0,$1),substr($2,$1))/ieg; + $F[3] =~ s/\-(\d+)([ACGTN]+)/sprintf("-%d()%s",$1,substr($2,$1))/ieg; + print join("\t", @F);' + + */ + +#include + +#include +#include +#include +#include +#include +#include "htslib/sam.h" +#include "htslib/kstring.h" + +#define MIN(a,b) ((a)<(b)?(a):(b)) + +typedef struct ptest_t { + const char *fname; + samFile *fp; + sam_hdr_t *fp_hdr; +} ptest_t; + +static int readaln(void *data, bam1_t *b) { + ptest_t *g = (ptest_t*)data; + int ret; + + while (1) { + ret = sam_read1(g->fp, g->fp_hdr, b); + if (ret < 0) break; + if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue; + break; + } + + return ret; +} + +static int print_pileup_seq(const bam_pileup1_t *p, int n) { + kstring_t ks = { 0, 0, NULL }; + int i; + + for (i = 0; i < n; i++, p++) { + uint8_t *seq = bam_get_seq(p->b); + int del_len, is_rev = bam_is_rev(p->b); + + if (p->is_head) + putchar('^'), putchar('!'+MIN(p->b->core.qual,93)); + + if (p->is_del) + putchar(p->is_refskip ? (is_rev ? '<' : '>') : '*'); + else { + unsigned char c = seq_nt16_str[bam_seqi(seq, p->qpos)]; + putchar(is_rev ? tolower(c) : toupper(c)); + } + + del_len = -p->indel; + if (p->indel > 0) { + int j, len = bam_plp_insertion(p, &ks, &del_len); + if (len < 0) { + perror("bam_plp_insertion"); + goto fail; + } + printf("%+d(", len); + for (j = 0; j < len; j++) + putchar(is_rev ? 
+ tolower((uint8_t) ks.s[j]) : + toupper((uint8_t) ks.s[j])); + putchar(')'); + } + if (del_len > 0) { + printf("-%d()", del_len); + } + if (p->is_tail) + putchar('$'); + } + free(ks.s); + return 0; + + fail: + free(ks.s); + return -1; +} + +static int test_pileup(ptest_t *input) { + bam_plp_t plp = NULL; + const bam_pileup1_t *p; + int tid, pos, n = 0; + + plp = bam_plp_init(readaln, input); + if (!plp) { + perror("bam_plp_init"); + goto fail; + } + while ((p = bam_plp_auto(plp, &tid, &pos, &n)) != 0) { + if (tid < 0) break; + if (tid >= input->fp_hdr->n_targets) { + fprintf(stderr, + "bam_plp_auto returned tid %d >= header n_targets %d\n", + tid, input->fp_hdr->n_targets); + goto fail; + } + + printf("%s\t%d\t%d\t", input->fp_hdr->target_name[tid], pos+1, n); + + if (print_pileup_seq(p, n) < 0) + goto fail; + + putchar('\n'); + } + if (n < 0) { + fprintf(stderr, "bam_plp_auto failed for \"%s\"\n", input->fname); + goto fail; + } + + bam_plp_destroy(plp); + return 0; + + fail: + bam_plp_destroy(plp); + return -1; +} + +static int test_mpileup(ptest_t *input) { + bam_mplp_t iter = NULL; + const bam_pileup1_t *pileups[1] = { NULL }; + int n_plp[1] = { 0 }; + int tid, pos, n = 0; + + iter = bam_mplp_init(1, readaln, (void **) &input); + if (!iter) { + perror("bam_plp_init"); + goto fail; + } + if (bam_mplp_init_overlaps(iter) < 0) { + perror("bam_mplp_init_overlaps"); + goto fail; + } + + while ((n = bam_mplp_auto(iter, &tid, &pos, n_plp, pileups)) > 0) { + if (tid < 0) break; + if (tid >= input->fp_hdr->n_targets) { + fprintf(stderr, + "bam_mplp_auto returned tid %d >= header n_targets %d\n", + tid, input->fp_hdr->n_targets); + goto fail; + } + + printf("%s\t%d\t%d\t", input->fp_hdr->target_name[tid], pos+1, n_plp[0]); + + if (print_pileup_seq(pileups[0], n_plp[0]) < 0) + goto fail; + + putchar('\n'); + } + if (n < 0) { + fprintf(stderr, "bam_plp_auto failed for \"%s\"\n", input->fname); + goto fail; + } + + bam_mplp_destroy(iter); + return 0; + + fail: + 
bam_mplp_destroy(iter); + return -1; +} + +int main(int argc, char **argv) { + ptest_t g = { NULL, NULL, NULL }; + int use_mpileup = 0, opt; + + while ((opt = getopt(argc, argv, "m")) != -1) { + switch (opt) { + case 'm': + use_mpileup = 1; + break; + default: + fprintf(stderr, "Usage: %s [-m] \n", argv[0]); + return EXIT_FAILURE; + } + } + + if (optind >= argc) { + fprintf(stderr, "Usage: %s [-m] \n", argv[0]); + return EXIT_FAILURE; + } + + g.fname = argv[optind]; + g.fp = sam_open(g.fname, "r"); + if (!g.fp) { + fprintf(stderr, "Couldn't open \"%s\" : %s", g.fname, strerror(errno)); + goto fail; + } + g.fp_hdr = sam_hdr_read(g.fp); + if (!g.fp_hdr) { + fprintf(stderr, "Couldn't read header from \"%s\" : %s", + g.fname, strerror(errno)); + goto fail; + } + + if (use_mpileup) { + if (test_mpileup(&g) < 0) + goto fail; + } else { + if (test_pileup(&g) < 0) + goto fail; + } + + sam_hdr_destroy(g.fp_hdr); + sam_close(g.fp); + + return EXIT_SUCCESS; + + fail: + if (g.fp_hdr) sam_hdr_destroy(g.fp_hdr); + if (g.fp) sam_close(g.fp); + return EXIT_FAILURE; +} diff --git a/test/sam.c b/test/sam.c index dc0681d0f..b8936ee0c 100644 --- a/test/sam.c +++ b/test/sam.c @@ -1,6 +1,6 @@ /* test/sam.c -- SAM/BAM/CRAM API test cases. - Copyright (C) 2014-2017 Genome Research Ltd. + Copyright (C) 2014-2019 Genome Research Ltd. Author: John Marshall @@ -31,6 +31,8 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include +#include // Suppress message for faidx_fetch_nseq(), which we're intentionally testing #include "htslib/hts_defs.h" @@ -39,7 +41,18 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "htslib/sam.h" #include "htslib/faidx.h" -#include "htslib/kstring.h" +#include "htslib/khash.h" +#include "htslib/hts_log.h" + +KHASH_SET_INIT_STR(keep) +typedef khash_t(keep) *keephash_t; + +#ifndef HTS_VERSION +#error HTS_VERSION not defined +#endif +#if HTS_VERSION < 100900 +#error HTS_VERSION comparison incorrect +#endif int status; @@ -61,9 +74,9 @@ uint8_t *check_bam_aux_get(const bam1_t *aln, const char *tag, char type) uint8_t *p = bam_aux_get(aln, tag); if (p) { if (*p == type) return p; - else fail("%s field of type '%c', expected '%c'\n", tag, *p, type); + else fail("%s field of type '%c', expected '%c'", tag, *p, type); } - else fail("can't find %s field\n", tag); + else fail("can't find %s field", tag); return NULL; } @@ -75,18 +88,18 @@ static void check_int_B_array(bam1_t *aln, char *tag, uint32_t i; if (bam_auxB_len(p) != nvals) - fail("Wrong length reported for %s field, got %u, expected %u\n", + fail("Wrong length reported for %s field, got %u, expected %u", tag, bam_auxB_len(p), nvals); for (i = 0; i < nvals; i++) { if (bam_auxB2i(p, i) != vals[i]) { fail("Wrong value from bam_auxB2i for %s field index %u, " - "got %"PRId64" expected %"PRId64"\n", + "got %"PRId64" expected %"PRId64, tag, i, bam_auxB2i(p, i), vals[i]); } if (bam_auxB2f(p, i) != (double) vals[i]) { fail("Wrong value from bam_auxB2f for %s field index %u, " - "got %f expected %f\n", + "got %f expected %f", tag, i, bam_auxB2f(p, i), (double) vals[i]); } } @@ -214,15 +227,20 @@ static int test_update_array(bam1_t *aln, const char target_id[2], return 0; } +// This function uses bam_hdr_t etc as a check ensuring the legacy typedef +// and functions continue to compile successfully. 
static int aux_fields1(void) { static const char sam[] = "data:," "@SQ\tSN:one\tLN:1000\n" "@SQ\tSN:two\tLN:500\n" -"r1\t0\tone\t500\t20\t8M\t*\t0\t0\tATGCATGC\tqqqqqqqq\tXA:A:k\tXi:i:37\tXf:f:" xstr(PI) "\tXd:d:" xstr(E) "\tXZ:Z:" HELLO "\tXH:H:" BEEF "\tXB:B:c,-2,0,+2\tB0:B:i,-2147483648,-1,0,1,2147483647\tB1:B:I,0,1,2147483648,4294967295\tB2:B:s,-32768,-1,0,1,32767\tB3:B:S,0,1,32768,65535\tB4:B:c,-128,-1,0,1,127\tB5:B:C,0,1,127,255\tBf:B:f,-3.14159,2.71828\tZZ:i:1000000\tF2:d:2.46801\tY1:i:-2147483648\tY2:i:-2147483647\tY3:i:-1\tY4:i:0\tY5:i:1\tY6:i:2147483647\tY7:i:2147483648\tY8:i:4294967295\n"; +"r1\t0\tone\t500\t20\t8M\t*\t0\t0\tATGCATGC\tqqqqqqqq\tXA:A:k\tXi:i:37\tXf:f:" xstr(PI) "\tXd:d:" xstr(E) "\tXZ:Z:" HELLO "\tXH:H:" BEEF "\tXB:B:c,-2,0,+2\tB0:B:i,-2147483648,-1,0,1,2147483647\tB1:B:I,0,1,2147483648,4294967295\tB2:B:s,-32768,-1,0,1,32767\tB3:B:S,0,1,32768,65535\tB4:B:c,-128,-1,0,1,127\tB5:B:C,0,1,127,255\tBf:B:f,-3.14159,2.71828\tZZ:i:1000000\tF2:d:2.46801\tY1:i:-2147483648\tY2:i:-2147483647\tY3:i:-1\tY4:i:0\tY5:i:1\tY6:i:2147483647\tY7:i:2147483648\tY8:i:4294967295\n" +"r2\t0x8D\t*\t0\t0\t*\t*\t0\t0\tATGC\tqqqq\n" +; - // Canonical form of the alignment record above, as output by sam_format1() + // Canonical form of the alignment records above, as output by sam_format1() static const char r1[] = "r1\t0\tone\t500\t20\t8M\t*\t0\t0\tATGCATGC\tqqqqqqqq\tXi:i:37\tXf:f:3.14159\tXd:d:2.71828\tXZ:Z:" NEW_HELLO "\tXH:H:" BEEF "\tXB:B:c,-2,0,2\tB0:B:i,-2147483648,-1,0,1,2147483647\tB1:B:I,0,1,2147483648,4294967295\tB2:B:s,-32768,-1,0,1,32767\tB3:B:S,0,1,32768,65535\tB4:B:c,-128,-1,0,1,127\tB5:B:C,0,1,127,255\tBf:B:f,-3.14159,2.71828\tZZ:i:1000000\tF2:f:9.8765\tY1:i:-2147483648\tY2:i:-2147483647\tY3:i:-1\tY4:i:0\tY5:i:1\tY6:i:2147483647\tY7:i:2147483648\tY8:i:4294967295\tN0:i:-1234\tN1:i:1234\tN2:i:-2\tN3:i:3\tF1:f:4.5678\tN4:B:S,65535,32768,1,0\tN5:i:4242"; + static const char r2[] = "r2\t141\t*\t0\t0\t*\t*\t0\t0\tATGC\tqqqq"; samFile *in = sam_open(sam, "r"); 
bam_hdr_t *header = sam_hdr_read(in); @@ -298,13 +316,13 @@ static int aux_fields1(void) nvals = NELE(bfvals); if ((p = check_bam_aux_get(aln, "Bf", 'B')) != NULL) { if (bam_auxB_len(p) != nvals) - fail("Wrong length reported for Bf field, got %d, expected %zd\n", + fail("Wrong length reported for Bf field, got %d, expected %zd", bam_auxB_len(p), nvals); for (i = 0; i < nvals; i++) { if (bam_auxB2f(p, i) != bfvals[i]) { fail("Wrong value from bam_auxB2f for Bf field index %zd, " - "got %f expected %f\n", + "got %f expected %f", i, bam_auxB2f(p, i), bfvals[i]); } } @@ -435,24 +453,84 @@ static int aux_fields1(void) if (strcmp(ks.s, r1) != 0) fail("record formatted incorrectly: \"%s\"", ks.s); - - free(ks.s); } else fail("can't read record"); + if (sam_read1(in, header, aln) >= 0) { + if (sam_format1(header, aln, &ks) < 0) + fail("can't format record r2"); + + if (aln->core.flag != 0x8D) + fail("r2 flag value is 0x%X, expected 0x8D", aln->core.flag); + + if (strcmp(ks.s, r2) != 0) + fail("record r2 formatted incorrectly: \"%s\"", ks.s); + } + else fail("can't read record r2"); + bam_destroy1(aln); bam_hdr_destroy(header); sam_close(in); + free(ks.s); return 1; } +static void set_qname(void) +{ + static const char sam[] = "data:," +"@SQ\tSN:one\tLN:1000\n" +"@SQ\tSN:two\tLN:500\n" +"r1\t0\tone\t500\t20\t8M\t*\t0\t0\tATGCATGC\tqqqqqqqq\tXA:A:k\tXi:i:37\tXf:f:" xstr(PI) "\tXd:d:" xstr(E) "\tXZ:Z:" HELLO "\tXH:H:" BEEF "\tXB:B:c,-2,0,+2\tB0:B:i,-2147483648,-1,0,1,2147483647\tB1:B:I,0,1,2147483648,4294967295\tB2:B:s,-32768,-1,0,1,32767\tB3:B:S,0,1,32768,65535\tB4:B:c,-128,-1,0,1,127\tB5:B:C,0,1,127,255\tBf:B:f,-3.14159,2.71828\tZZ:i:1000000\tF2:d:2.46801\tY1:i:-2147483648\tY2:i:-2147483647\tY3:i:-1\tY4:i:0\tY5:i:1\tY6:i:2147483647\tY7:i:2147483648\tY8:i:4294967295\n" +"r22\t0x8D\t*\t0\t0\t*\t*\t0\t0\tATGC\tqqqq\n" +"r12345678\t0x8D\t*\t0\t0\t*\t*\t0\t0\tATGC\tqqqq\n" +; + + // Canonical form of the alignment records above, as output by sam_format1() + static const char 
r1[] = "r1\t0\tone\t500\t20\t8M\t*\t0\t0\tATGCATGC\tqqqqqqqq\tXA:A:k\tXi:i:37\tXf:f:3.14159\tXd:d:2.71828\tXZ:Z:" HELLO "\tXH:H:" BEEF "\tXB:B:c,-2,0,2\tB0:B:i,-2147483648,-1,0,1,2147483647\tB1:B:I,0,1,2147483648,4294967295\tB2:B:s,-32768,-1,0,1,32767\tB3:B:S,0,1,32768,65535\tB4:B:c,-128,-1,0,1,127\tB5:B:C,0,1,127,255\tBf:B:f,-3.14159,2.71828\tZZ:i:1000000\tF2:d:2.46801\tY1:i:-2147483648\tY2:i:-2147483647\tY3:i:-1\tY4:i:0\tY5:i:1\tY6:i:2147483647\tY7:i:2147483648\tY8:i:4294967295"; + static const char r2[] = "r234\t141\t*\t0\t0\t*\t*\t0\t0\tATGC\tqqqq"; + static const char r3[] = "xyz\t141\t*\t0\t0\t*\t*\t0\t0\tATGC\tqqqq"; + + samFile *in = sam_open(sam, "r"); + bam_hdr_t *header = sam_hdr_read(in); + bam1_t *aln = bam_init1(); + kstring_t ks = { 0, 0, NULL }; + + if (sam_read1(in, header, aln) >= 0) { + bam_set_qname(aln, "r1"); + if (sam_format1(header, aln, &ks) < 0) fail("can't format record"); + if (strcmp(ks.s, r1) != 0) fail("record formatted incorrectly:\nGot: \"%s\"\nExp: \"%s\"\n", ks.s, r1); + } + else fail("can't read record"); + + if (sam_read1(in, header, aln) >= 0) { + bam_set_qname(aln, "r234"); + if (sam_format1(header, aln, &ks) < 0) fail("can't format record"); + if (strcmp(ks.s, r2) != 0) fail("record formatted incorrectly:\nGot: \"%s\"\nExp: \"%s\"\n", ks.s, r2); + } + else fail("can't read record"); + + if (sam_read1(in, header, aln) >= 0) { + bam_set_qname(aln, "xyz"); + if (sam_format1(header, aln, &ks) < 0) fail("can't format record"); + if (strcmp(ks.s, r3) != 0) fail("record formatted incorrectly:\nGot: \"%s\"\nExp: \"%s\"\n", ks.s, r3); + } + else fail("can't read record"); + + bam_destroy1(aln); + bam_hdr_destroy(header); + sam_close(in); + free(ks.s); +} + static void iterators1(void) { hts_itr_destroy(sam_itr_queryi(NULL, HTS_IDX_REST, 0, 0)); hts_itr_destroy(sam_itr_queryi(NULL, HTS_IDX_NONE, 0, 0)); } +// This function uses bam_hdr_t etc as a check ensuring the legacy typedef +// and functions continue to compile successfully. 
static void copy_check_alignment(const char *infname, const char *informat, const char *outfname, const char *outmode, const char *outref) { @@ -503,11 +581,611 @@ static void copy_check_alignment(const char *infname, const char *informat, err: bam_destroy1(aln); + aln = NULL; bam_hdr_destroy(header); + header = NULL; + if (in) sam_close(in); + if (out) sam_close(out); +} + +static int check_target_names(sam_hdr_t *header, int expected_n_targets, + const char **expected_targets, + const int *expected_lengths) { + int i; + + // Check consistency of target_names array + if (!header->target_name) { + fail("target_name is NULL"); + return -1; + } + if (!header->target_len) { + fail("target_len is NULL"); + return -1; + } + if (header->n_targets != expected_n_targets) { + fail("header->n_targets (%d) != expected_n_targets (%d)", + header->n_targets, expected_n_targets); + return -1; + } + for (i = 0; i < expected_n_targets; i++) { + if (!header->target_name[i] + || strcmp(header->target_name[i], expected_targets[i]) != 0) { + fail("header->target_name[%d] (%s) != \"%s\"", + i, header->target_name[i] ? 
header->target_name[i] : "NULL", + expected_targets[i]); + return -1; + } + if (header->target_len[i] != expected_lengths[i]) { + fail("header->target_len[%d] (%d) != %d", + i, header->target_len[i], expected_lengths[i]); + return -1; + } + } + return 0; +} + +static void use_header_api(void) { + static const char header_text[] = "data:," + "@HD\tVN:1.4\tGO:group\tSS:coordinate:queryname\n" + "@SQ\tSN:ref0\tLN:100\n" + "@CO\tThis line below will be updated\n" + "@SQ\tSN:ref1\tLN:5001\tM5:983dalu9ue2\n" + "@SQ\tSN:ref1.5\tLN:5001\n" + "@CO\tThis line is good\n" + "@SQ\tSN:ref2\tLN:5002\n"; + + static const char rg_line[] = + { '@', 'R', 'G', '\t', 'I', 'D', ':', 'r', 'u', 'n', '1' }; + + static const char expected[] = + "@HD\tVN:1.5\tSO:coordinate\n" + "@CO\tThis line below will be updated\n" + "@SQ\tSN:ref1\tLN:5001\tM5:kja8u34a2q3\n" + "@CO\tThis line is good\n" + "@SQ\tSN:ref2\tLN:5002\n" + "@SQ\tSN:ref3\tLN:5003\n" + "@PG\tID:samtools\tPN:samtools\tVN:1.9\n" + "@RG\tID:run1\n" + "@RG\tID:run4\n"; + + static const char *expected_targets[] = { "ref1", "ref2", "ref3" }; + static const int expected_lengths[] = { 5001, 5002, 5003 }; + const int expected_n_targets = sizeof(expected_targets) / sizeof(char *); + + const char outfname[] = "test/sam_header.tmp.sam_"; + const char outmode[] = "w"; + FILE *inf = NULL; + char buffer[sizeof(expected) + 1024]; + + samFile *in = sam_open(header_text, "r"); + samFile *out = sam_open(outfname, outmode); + sam_hdr_t *header = NULL; + kstring_t ks = { 0, 0, NULL }; + size_t bytes; + int r; + const char *name; + + if (!in) { + fail("couldn't open file"); + goto err; + } + if (!out) { + fail("couldn't open %s with mode %s", outfname, outmode); + goto err; + } + + header = sam_hdr_read(in); + if (!header) { + fail("reading header from file"); + goto err; + } + r = sam_hdr_remove_tag_id(header, "HD", NULL, NULL, "GO"); + if (r != 1) { fail("sam_hdr_remove_tag_id"); goto err; } + + r = sam_hdr_update_hd(header, "VN", "1.5"); + if (r != 
0) { fail("sam_hdr_update_hd"); goto err; } + + r = sam_hdr_add_line(header, "SQ", "SN", "ref3", "LN", "5003", NULL); + if (r < 0) { fail("sam_hdr_add_line"); goto err; } + + r = sam_hdr_update_line(header, "SQ", "SN", "ref1", + "M5", "kja8u34a2q3", NULL); + if (r != 0) { fail("sam_hdr_update_line SQ"); goto err; } + + r = sam_hdr_add_pg(header, "samtools", "VN", "1.9", NULL); + if (r != 0) { fail("sam_hdr_add_pg"); goto err; } + + // Test addition with no newline or trailing NUL + r = sam_hdr_add_lines(header, rg_line, sizeof(rg_line)); + if (r != 0) { fail("sam_hdr_add_lines rg_line"); goto err; } + + // Test header line removal + r = sam_hdr_add_line(header, "RG", "ID", "run2", NULL); + if (r < 0) { fail("sam_hdr_add_line"); goto err; } + + r = sam_hdr_add_line(header, "RG", "ID", "run3", NULL); + if (r < 0) { fail("sam_hdr_add_line"); goto err; } + + r = sam_hdr_add_line(header, "RG", "ID", "run4", NULL); + if (r < 0) { fail("sam_hdr_add_line"); goto err; } + + r = sam_hdr_line_index(header, "RG", "run4"); + if (r != 3) { fail("sam_hdr_line_index - run4~3"); goto err; } + + r = sam_hdr_line_index(header, "RG", "run5"); + if (r != -1) { fail("sam_hdr_line_index - run5~-1"); goto err; } + + name = sam_hdr_line_name(header, "RG", 2); + if (!name || strcmp(name, "run3")) { fail("sam_hdr_line_name - 2~run3"); goto err; } + + name = sam_hdr_line_name(header, "RG", 10); + if (name) { fail("sam_hdr_line_name - 10~NULL"); goto err; } + + r = sam_hdr_remove_line_id(header, "RG", "ID", "run2"); + if (r < 0) { fail("sam_hdr_remove_line_id"); goto err; } + + r = sam_hdr_find_tag_id(header, "RG", "ID", "run3", "ID", &ks); + if (r < 0 || !ks.s || strcmp(ks.s, "run3") != 0) { + fail("sam_hdr_find_tag_id() expected \"run3\" got \"%s\"", + r == 0 && ks.s ? 
ks.s : "NULL"); + goto err; + } + + r = sam_hdr_remove_line_pos(header, "RG", 1); // Removes run3 + if (r < 0) { fail("sam_hdr_remove_line_pos"); goto err; } + + r = sam_hdr_remove_line_id(header, "SQ", "SN", "ref0"); + if (r < 0) { fail("sam_hdr_remove_line_id"); goto err; } + + r = sam_hdr_remove_line_pos(header, "SQ", 1); // Removes ref1.5 + if (r < 0) { fail("sam_hdr_remove_line_pos"); goto err; } + + r = sam_hdr_find_tag_id(header, "SQ", "SN", "ref1", "M5", &ks); + if (r < 0 || !ks.s || strcmp(ks.s, "kja8u34a2q3") != 0) { + fail("sam_hdr_find_tag_id() expected \"kja8u34a2q3\" got \"%s\"", + r == 0 && ks.s ? ks.s : "NULL"); + goto err; + } + + r = sam_hdr_line_index(header, "RG", "run4"); + if (r != 1) { fail("sam_hdr_line_index - run4~1"); goto err; } + + name = sam_hdr_line_name(header, "RG", 2); + if (name) { fail("sam_hdr_line_name - 2~NULL"); goto err; } + + r = sam_hdr_remove_tag_hd(header, "SS"); + if (r < 0) { + fail("sam_hdr_remove_tag_hd"); + } + + r = sam_hdr_find_hd(header, &ks); + if (r < 0 || !ks.s || strcmp(ks.s, "@HD\tVN:1.5") != 0) { + fail("sam_hdr_find_hd() expected \"@HD\tVN:1.5\" got \"%s\"", + r == 0 && ks.s ? ks.s : "NULL"); + } + + r = sam_hdr_find_tag_hd(header, "VN", &ks); + if (r < 0 || !ks.s || strcmp(ks.s, "1.5") != 0) { + fail("sam_hdr_find_tag_hd() expected \"1.5\" got \"%s\"", + r == 0 && ks.s ? 
ks.s : "NULL"); + } + + r = sam_hdr_update_hd(header, "SO", "coordinate"); + if (r < 0) { + fail("sam_hdr_update_hd"); + } + + if (check_target_names(header, expected_n_targets, expected_targets, + expected_lengths) < 0) { + goto err; + } + + if ((r = sam_hdr_count_lines(header, "HD")) != 1) { + fail("incorrect HD line count - expected 1, got %d", r); + goto err; + } + if ((r = sam_hdr_count_lines(header, "SQ")) != 3) { + fail("incorrect SQ line count - expected 3, got %d", r); + goto err; + } + if ((r = sam_hdr_count_lines(header, "PG")) != 1) { + fail("incorrect PG line count - expected 1, got %d", r); + goto err; + } + if ((r = sam_hdr_count_lines(header, "RG")) != 2) { + fail("incorrect RG line count - expected 2, got %d", r); + goto err; + } + if ((r = sam_hdr_count_lines(header, "CO")) != 2) { + fail("incorrect CO line count - expected 2, got %d", r); + goto err; + } + + if (sam_hdr_write(out, header) < 0) { + fail("writing headers to \"%s\"", outfname); + goto err; + } + r = sam_close(out); + out = NULL; + if (r < 0) { + fail("close \"%s\"", outfname); + goto err; + } + + inf = fopen(outfname, "r"); + if (!inf) { + fail("Opening written header \"%s\"", outfname); + goto err; + } + bytes = fread(buffer, 1, sizeof(buffer), inf); + if (bytes != sizeof(expected) - 1 || memcmp(buffer, expected, bytes) != 0) { + fail("edited header does not match expected version"); + fprintf(stderr, + "---------- Expected:\n%.*s\n" + "++++++++++ Got:\n%.*s\n" + "====================\n", + (int) sizeof(expected), expected, + (int) bytes, buffer); + goto err; + } + + free(ks_release(&ks)); + + err: + sam_hdr_destroy(header); + header = NULL; if (in) sam_close(in); if (out) sam_close(out); + if (inf) fclose(inf); + free(ks_release(&ks)); } +static void test_header_pg_lines(void) { + static const char header_text[] = "data:," + "@HD\tVN:1.5\n" + "@PG\tID:prog1\tPN:prog1\n" + "@PG\tID:prog2\tPN:prog2\tPP:prog1\n"; + + static const char expected[] = + "@HD\tVN:1.5\n" + 
"@PG\tID:prog1\tPN:prog1\n" + "@PG\tID:prog2\tPN:prog2\tPP:prog1\n" + "@PG\tID:prog3\tPN:prog3\tPP:prog2\n" + "@PG\tID:prog4\tPN:prog4\tPP:prog1\n" + "@PG\tID:prog5\tPN:prog5\tPP:prog2\n" + "@PG\tID:prog6\tPN:prog6\tPP:prog3\n" + "@PG\tID:prog6.1\tPN:prog6\tPP:prog4\n" + "@PG\tID:prog6.2\tPN:prog6\tPP:prog5\n" + "@PG\tPN:prog7\tID:my_id\tPP:prog6\n"; + + samFile *in = sam_open(header_text, "r"); + sam_hdr_t *header = NULL; + const char *text = NULL; + enum htsLogLevel old_log_level; + int r; + + if (!in) { + fail("couldn't open file"); + goto err; + } + + header = sam_hdr_read(in); + if (!header) { + fail("reading header from file"); + goto err; + } + + r = sam_hdr_add_pg(header, "prog3", NULL); + if (r != 0) { fail("sam_hdr_add_pg prog3"); goto err; } + + + r = sam_hdr_add_pg(header, "prog4", "PP", "prog1", NULL); + if (r != 0) { fail("sam_hdr_add_pg prog4"); goto err; } + + r = sam_hdr_add_line(header, "PG", "ID", + "prog5", "PN", "prog5", "PP", "prog2", NULL); + if (r != 0) { fail("sam_hdr_add_line @PG ID:prog5"); goto err; } + + r = sam_hdr_add_pg(header, "prog6", NULL); + if (r != 0) { fail("sam_hdr_add_pg prog6"); goto err; } + + r = sam_hdr_add_pg(header, "prog7", "ID", "my_id", "PP", "prog6", NULL); + if (r != 0) { fail("sam_hdr_add_pg prog7"); goto err; } + + text = sam_hdr_str(header); + if (!text) { fail("sam_hdr_str"); goto err; } + + // These should fail + old_log_level = hts_get_log_level(); + hts_set_log_level(HTS_LOG_OFF); + + r = sam_hdr_add_pg(header, "prog8", "ID", "my_id", NULL); + if (r == 0) { fail("sam_hdr_add_pg prog8 (unexpected success)"); goto err; } + + r = sam_hdr_add_pg(header, "prog9", "PP", "non-existent", NULL); + if (r == 0) { fail("sam_hdr_add_pg prog9 (unexpected success)"); goto err; } + + hts_set_log_level(old_log_level); + // End failing tests + + if (strcmp(text, expected) != 0) { + fail("edited header does not match expected version"); + fprintf(stderr, + "---------- Expected:\n%s\n" + "++++++++++ Got:\n%s\n" + 
"====================\n", + expected, text); + goto err; + } + + err: + sam_hdr_destroy(header); + header = NULL; + if (in) sam_close(in); + return; +} + +static void test_header_updates(void) { + static const char header_text[] = + "@HD\tVN:1.4\n" + "@SQ\tSN:chr1\tLN:100\n" + "@SQ\tSN:chr2\tLN:200\n" + "@SQ\tSN:chr3\tLN:300\n" + "@RG\tID:run1\n" + "@RG\tID:run2\n" + "@RG\tID:run3\n" + "@PG\tID:prog1\tPN:prog1\n"; + + static const char expected[] = + "@HD\tVN:1.4\n" + "@SQ\tSN:1\tLN:100\n" + "@SQ\tSN:chr2\tLN:2000\n" + "@SQ\tSN:chr3\tLN:300\n" + "@RG\tID:run1\tDS:hello\n" + "@RG\tID:aliquot2\n" + "@RG\tID:run3\n" + "@PG\tID:prog1\tPN:prog1\n"; + + static const char *expected_targets[] = { "1", "chr2", "chr3" }; + static const int expected_lengths[] = { 100, 2000, 300 }; + const int expected_n_targets = sizeof(expected_targets) / sizeof(char *); + + sam_hdr_t *header = sam_hdr_parse(sizeof(header_text) - 1, header_text); + const char *hdr_str; + int r, i, old_log_level; + + if (!header) { + fail("creating sam header"); + goto err; + } + + if (sam_hdr_name2tid(header, "chr1") != 0) { // Should now be unknown + fail("sam_hdr_name2tid(\"chr1\") != 0"); + goto err; + } + + r = sam_hdr_update_line(header, "SQ", "SN", "chr2", "LN", "2000", NULL); + if (r != 0) { fail("sam_hdr_update_line SQ SN chr2 LN 2000"); goto err; } + r = sam_hdr_update_line(header, "SQ", "SN", "chr1", "SN", "1", NULL); + if (r != 0) { fail("sam_hdr_update_line SQ SN chr1 SN 1"); goto err; } + r = sam_hdr_update_line(header, "RG", "ID", "run1", "DS", "hello", NULL); + if (r != 0) { fail("sam_hdr_update_line RG ID run1 DS hello"); goto err; } + r = sam_hdr_update_line(header, "RG", "ID", "run2", "ID", "aliquot2", NULL); + if (r != 0) { fail("sam_hdr_update_line RG ID run2 ID aliquot2"); goto err; } + + // These should fail + old_log_level = hts_get_log_level(); + hts_set_log_level(HTS_LOG_OFF); + + r = sam_hdr_update_line(header, "PG", "ID", "prog1", "ID", "prog2", NULL); + if (r == 0) { 
fail("sam_hdr_update_line PG ID prog1 ID prog2"); goto err; } + + r = sam_hdr_update_line(header, "SQ", "SN", "chr3", "SN", "chr2", NULL); + if (r == 0) { fail("sam_hdr_update_line SQ SN chr3 SN chr2"); goto err; } + + r = sam_hdr_update_line(header, "RG", "ID", "run3", "ID", "run1", NULL); + if (r == 0) { fail("sam_hdr_update_line RG ID run3 ID run1"); goto err; } + + hts_set_log_level(old_log_level); + // End failing tests + + if (check_target_names(header, expected_n_targets, expected_targets, + expected_lengths) < 0) { + goto err; + } + + for (i = 0; i < expected_n_targets; i++) { + if (sam_hdr_name2tid(header, expected_targets[i]) != i) { + fail("sam_hdr_name2tid unexpected result"); + goto err; + } + } + if (sam_hdr_name2tid(header, "chr1") != -1) { // Should now be unknown + fail("sam_hdr_name2tid(\"chr1\") != -1"); + goto err; + } + + hdr_str = sam_hdr_str(header); + if (!hdr_str || strcmp(hdr_str, expected) != 0) { + fail("edited header does not match expected version"); + fprintf(stderr, + "---------- Expected:\n%s\n" + "++++++++++ Got:\n%s\n" + "====================\n", + expected, hdr_str ? 
hdr_str : ""); + goto err; + } + + err: + sam_hdr_destroy(header); +} + +static void test_header_remove_lines(void) { + static const char header_text[] = + "@HD\tVN:1.4\n" + "@SQ\tSN:chr1\tLN:100\n" + "@SQ\tSN:chr2\tLN:200\n" + "@SQ\tSN:chr3\tLN:300\n" + "@RG\tID:run1\n" + "@RG\tID:run2\n" + "@RG\tID:run3\n" + "@PG\tID:prog1\tPN:prog1\n"; + + static const char expected[] = + "@HD\tVN:1.4\n" + "@SQ\tSN:chr1\tLN:100\n" + "@SQ\tSN:chr3\tLN:300\n" + "@PG\tID:prog1\tPN:prog1\n"; + + sam_hdr_t *header = sam_hdr_parse(sizeof(header_text) - 1, header_text); + keephash_t rh = kh_init(keep); + khint_t k; + const char *hdr_str; + int r = 0; + + if (!header) { + fail("creating sam header"); + goto err; + } + if (!rh) { + fail("creating keep hash table"); + goto err; + } + + kh_put(keep, rh, strdup("chr3"), &r); + if (r < 0) { fail("adding chr3 to hash table"); goto err; } + kh_put(keep, rh, strdup("chr1"), &r); + if (r < 0) { fail("adding chr1 to hash table"); goto err; } + + r = sam_hdr_remove_lines(header, "SQ", "SN", rh); + if (r != 0) { fail("sam_hdr_remove_lines SQ SN rh"); goto err; } + + r = sam_hdr_remove_lines(header, "RG", "ID", NULL); + if (r != 0) { fail("sam_hdr_remove_lines RG ID NULL"); goto err; } + + hdr_str = sam_hdr_str(header); + if (!hdr_str || strcmp(hdr_str, expected) != 0) { + fail("edited header does not match expected version"); + fprintf(stderr, + "---------- Expected:\n%s\n" + "++++++++++ Got:\n%s\n" + "====================\n", + expected, hdr_str ? hdr_str : ""); + goto err; + } + + err: + if (rh) { + for (k = 0; k < kh_end(rh); ++k) + if (kh_exist(rh, k)) free((char*)kh_key(rh, k)); + kh_destroy(keep, rh); + } + if (header) sam_hdr_destroy(header); +} + +static void check_ref_lookup(sam_hdr_t *header, const char *msg, ...) 
{ + const char *name; + va_list args; + va_start(args, msg); + while ((name = va_arg(args, const char *)) != NULL) { + int exp = va_arg(args, int); + int tid = sam_hdr_name2tid(header, name); + if (tid != exp) + fail("%s: altname \"%s\" => %d (expected %d)", msg, name, tid, exp); + } + va_end(args); +} + +static void test_header_ref_altnames(void) { + static const char initial_header[] = + "@SQ\tSN:1\tLN:100\tAN:chr1\n" + "@SQ\tSN:chr2\tAN:2\tLN:200\n" + "@SQ\tSN:3\tLN:300\n" + "@SQ\tSN:chrMT\tLN:16569\tAN:MT,chrM,M\n"; + + sam_hdr_t *header = sam_hdr_init(); + if (header == NULL) { fail("sam_hdr_init"); return; } + + if (sam_hdr_add_lines(header, initial_header, 0) < 0) + fail("sam_hdr_add_lines() for altnames"); + + check_ref_lookup(header, "initial", + "1", 0, "chr1", 0, "2", 1, "chr2", 1, "3", 2, + "chrMT", 3, "chrM", 3, "M", 3, "fred", -1, "barney", -1, + NULL); + + if (sam_hdr_add_line(header, "SQ", "AN", "fred", "LN", "500", "SN", "barney", NULL) < 0) + fail("sam_hdr_add_line() for altnames"); + + check_ref_lookup(header, "barney added", + "1", 0, "chr1", 0, "2", 1, "chr2", 1, "3", 2, + "chrMT", 3, "chrM", 3, "M", 3, "fred", 4, "barney", 4, + NULL); + + if (sam_hdr_remove_line_id(header, "SQ", "SN", "chr2") < 0) + fail("sam_hdr_remove_line_id() for altnames"); + + check_ref_lookup(header, "chr2 removed", + "1", 0, "chr1", 0, "2", -1, "chr2", -1, "3", 1, + "chrMT", 2, "chrM", 2, "M", 2, "fred", 3, "barney", 3, + NULL); + + if (sam_hdr_remove_tag_id(header, "SQ", "SN", "1", "AN") < 0) + fail("sam_hdr_remove_tag_id() for altnames"); + + check_ref_lookup(header, "1's AN removed", + "1", 0, "chr1", -1, "CM000663", -1, "2", -1, "chr2", -1, "3", 1, + "chrMT", 2, "chrM", 2, "M", 2, "fred", 3, "barney", 3, + NULL); + + sam_hdr_destroy(header); + + static const char initial_header_duplicates[] = + "@SQ\tSN:1\tLN:100\tAN:foo,2\n" + "@SQ\tSN:2\tLN:200\tAN:bar\n" + "@SQ\tSN:3\tLN:300\tAN:baz,3\n"; + + header = sam_hdr_init(); + if (header == NULL) { fail("sam_hdr_init"); 
return; } + + int old_log_level = hts_get_log_level(); + hts_set_log_level(HTS_LOG_ERROR); // Silence "Duplicate entry AN:2" warning + + if (sam_hdr_add_lines(header, initial_header_duplicates, 0) < 0) + fail("sam_hdr_add_lines() for altnames with duplicates"); + + hts_set_log_level(old_log_level); + + // Check "2" is SN:2 and not AN:2 + check_ref_lookup(header, "initial_header_duplicates", + "1", 0, "foo", 0, + "2", 1, "bar", 1, + "3", 2, "baz", 2, NULL); + + if (sam_hdr_remove_tag_id(header, "SQ", "SN", "1", "AN") < 0) + fail("sam_hdr_remove_tag_id() for duplicate altnames SN:1"); + + // Check "2" still works and "foo" does not + check_ref_lookup(header, "initial_header_duplicates", + "1", 0, "foo", -1, + "2", 1, "bar", 1, + "3", 2, "baz", 2, NULL); + + if (sam_hdr_remove_tag_id(header, "SQ", "SN", "3", "AN") < 0) + fail("sam_hdr_remove_tag_id() for duplicate altnames SN:3"); + + // Check "3" still works and "baz" does not + check_ref_lookup(header, "initial_header_duplicates", + "1", 0, "foo", -1, + "2", 1, "bar", 1, + "3", 2, "baz", -1, NULL); + + sam_hdr_destroy(header); +} + +#define ABC50 "abcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxy" +#define ABC250 ABC50 ABC50 ABC50 ABC50 ABC50 + static void samrecord_layout(void) { static const char qnames[] = "data:," @@ -516,18 +1194,25 @@ static void samrecord_layout(void) "bc\t0\tCHROMOSOME_II\t200\t10\t4M\t*\t0\t0\tATGC\tqqqq\n" "def\t0\tCHROMOSOME_II\t300\t10\t4M\t*\t0\t0\tATGC\tqqqq\n" "ghij\t0\tCHROMOSOME_II\t400\t10\t4M\t*\t0\t0\tATGC\tqqqq\n" -"klmno\t0\tCHROMOSOME_II\t500\t10\t4M\t*\t0\t0\tATGC\tqqqq\n"; +"klmno\t0\tCHROMOSOME_II\t500\t10\t4M\t*\t0\t0\tATGC\tqqqq\n" + ABC250 "\t0\tCHROMOSOME_II\t600\t10\t4M\t*\t0\t0\tATGC\tqqqq\n" + ABC250 "1\t0\tCHROMOSOME_II\t650\t10\t4M\t*\t0\t0\tATGC\tqqqq\n" + ABC250 "12\t0\tCHROMOSOME_II\t700\t10\t4M\t*\t0\t0\tATGC\tqqqq\n" + ABC250 "123\t0\tCHROMOSOME_II\t750\t10\t4M\t*\t0\t0\tATGC\tqqqq\n" + ABC250 "1234\t0\tCHROMOSOME_II\t800\t10\t4M\t*\t0\t0\tATGC\tqqqq\n" +; 
size_t bam1_t_size, bam1_t_size2; - bam1_t_size = 36 + sizeof (int) + 4 + sizeof (char *); -#ifndef BAM_NO_ID - bam1_t_size += 8; -#endif + assert(sizeof(hts_pos_t) == 8 || sizeof(hts_pos_t) == 4); + int core_size = sizeof(hts_pos_t) == 8 ? 48 : 36; + bam1_t_size = (core_size + sizeof(int) + sizeof(char *) + sizeof(uint64_t) + + 2 * sizeof(uint32_t)); bam1_t_size2 = bam1_t_size + 4; // Account for padding on some platforms - if (sizeof (bam1_core_t) != 36) - fail("sizeof bam1_core_t is %zu, expected 36", sizeof (bam1_core_t)); + if (sizeof (bam1_core_t) != core_size) + fail("sizeof bam1_core_t is %zu, expected %d", + sizeof (bam1_core_t), core_size); if (sizeof (bam1_t) != bam1_t_size && sizeof (bam1_t) != bam1_t_size2) fail("sizeof bam1_t is %zu, expected either %zu or %zu", @@ -541,6 +1226,196 @@ static void samrecord_layout(void) "test/sam_alignment.tmp.sam_", "w", NULL); } +static int check_ref_lengths(const sam_hdr_t *header, + const hts_pos_t *expected_lengths, + int num_refs, const char *hdr_name) +{ + int i; + for (i = 0; i < num_refs; i++) { + hts_pos_t ln = sam_hdr_tid2len(header, i); + if (ln != expected_lengths[i]) { + fail("Wrong length for %s ref %d : " + "expected %"PRIhts_pos" got %"PRIhts_pos"\n", + hdr_name, i, expected_lengths[i], ln); + return -1; + } + } + return 0; +} + +static void check_big_ref(int parse_header) +{ + static const char sam_text[] = "data:," + "@HD\tVN:1.4\n" + "@SQ\tSN:large#1\tLN:5000000000\n" + "@SQ\tSN:small#1\tLN:100\n" + "@SQ\tSN:large#2\tLN:9223372034707292158\n" + "@SQ\tSN:small#2\tLN:1\n" + "r1\t0\tlarge#1\t4999999000\t50\t8M\t*\t0\t0\tACGTACGT\tabcdefgh\n" + "r2\t0\tsmall#1\t1\t50\t8M\t*\t0\t0\tACGTACGT\tabcdefgh\n" + "r3\t0\tlarge#2\t9223372034707292000\t50\t8M\t*\t0\t0\tACGTACGT\tabcdefgh\n" + "p1\t99\tlarge#2\t1\t50\t8M\t=\t9223372034707292150\t9223372034707292158\tACGTACGT\tabcdefgh\n" + "p1\t147\tlarge#2\t9223372034707292150\t50\t8M\t=\t1\t-9223372034707292158\tACGTACGT\tabcdefgh\n" + 
"r4\t0\tsmall#2\t2\t50\t8M\t*\t0\t0\tACGTACGT\tabcdefgh\n"; + const hts_pos_t expected_lengths[] = { + 5000000000LL, 100LL, 9223372034707292158LL, 1LL + }; + const int expected_tids[] = { + 0, 1, 2, 2, 2, 3 + }; + const int expected_mtid[] = { + -1, -1, -1, 2, 2, -1 + }; + const hts_pos_t expected_positions[] = { + 4999999000LL - 1, 1LL - 1, 9223372034707292000LL - 1, 1LL - 1, + 9223372034707292150LL - 1, 2LL - 1 + }; + const hts_pos_t expected_mpos[] = { + -1, -1, -1, 9223372034707292150LL - 1, 1LL - 1, -1 + }; + samFile *in = NULL, *out = NULL; + sam_hdr_t *header = NULL, *dup_header = NULL; + bam1_t *aln = bam_init1(); + const int num_refs = sizeof(expected_lengths) / sizeof(expected_lengths[0]); + const int num_align = sizeof(expected_tids) / sizeof(expected_tids[0]); + const char *outfname = "test/sam_big_ref.tmp.sam_"; + int i, r; + char buffer[sizeof(sam_text) + 1024]; + FILE *inf = NULL; + size_t bytes; + + if (!aln) { + fail("Out of memory"); + goto cleanup; + } + + in = sam_open(sam_text, "r"); + if (!in) { + fail("Opening SAM file"); + goto cleanup; + } + out = sam_open(outfname, "w"); + if (!out) { + fail("Opening output SAM file \"%s\"", outfname); + goto cleanup; + } + header = sam_hdr_read(in); + if (!header) { + fail("Reading SAM header"); + goto cleanup; + } + if (parse_header) { + // This will force the header to be parsed + if (sam_hdr_count_lines(header, "SQ") != num_refs) { + fail("Wrong number of SQ lines in header"); + goto cleanup; + } + } + if (check_ref_lengths(header, expected_lengths, num_refs, "header") < 0) + goto cleanup; + + dup_header = sam_hdr_dup(header); + if (!dup_header) { + fail("Failed to duplicate header"); + } + + if (check_ref_lengths(dup_header, expected_lengths, + num_refs, "duplicate header") < 0) + goto cleanup; + + if (sam_hdr_count_lines(dup_header, "SQ") != num_refs) { + fail("Wrong number of SQ lines in duplicate header"); + goto cleanup; + } + + if (check_ref_lengths(dup_header, expected_lengths, + num_refs, 
"parsed duplicate header") < 0) + goto cleanup; + + if (sam_hdr_write(out, header) < 0) { + fail("Failed to write SAM header"); + goto cleanup; + } + i = 0; + while ((r = sam_read1(in, header, aln)) >= 0) { + if (i >= num_align) { + fail("Too many alignment records.\n"); + goto cleanup; + } + if (aln->core.tid != expected_tids[i]) { + fail("Wrong tid for record %d : expected %d got %d\n", + i, expected_tids[i], aln->core.tid); + goto cleanup; + } + if (aln->core.mtid != expected_mtid[i]) { + fail("Wrong mate tid for record %d : expected %d got %d\n", + i, expected_mtid[i], aln->core.mtid); + goto cleanup; + } + if (aln->core.pos != expected_positions[i]) { + fail("Wrong position for record %d : " + "expected %"PRIhts_pos" got %"PRIhts_pos"\n", + i, expected_positions[i], aln->core.pos); + } + if (aln->core.mpos != expected_mpos[i]) { + fail("Wrong mate position for record %d : " + "expected %"PRIhts_pos" got %"PRIhts_pos"\n", + i, expected_mpos[i], aln->core.mpos); + } + if (sam_write1(out, header, aln) < 0) { + fail("Failed to write alignment record %d\n", i); + goto cleanup; + } + i++; + } + if (r < -1) { + fail("Error reading SAM alignment\n"); + goto cleanup; + } + if (i < num_align) { + fail("Not enough alignment records\n"); + goto cleanup; + } + r = sam_close(in); in = NULL; + if (r < 0) { + fail("sam_close(in)"); + goto cleanup; + } + r = sam_close(out); out = NULL; + if (r < 0) { + fail("sam_close(out)"); + goto cleanup; + } + + inf = fopen(outfname, "r"); + if (!inf) { + fail("Opening \"%s\"", outfname); + goto cleanup; + } + bytes = fread(buffer, 1, sizeof(buffer), inf); + if (bytes != sizeof(sam_text) - 7 + || memcmp(buffer, sam_text + 6, bytes - 7) != 0) { + fail("Output file does not match original version"); + fprintf(stderr, + "---------- Expected:\n%.*s\n" + "++++++++++ Got:\n%.*s\n" + "====================\n", + (int) sizeof(sam_text) - 7, sam_text + 6, + (int) bytes, buffer); + goto cleanup; + } + + cleanup: + bam_destroy1(aln); + 
sam_hdr_destroy(header); + sam_hdr_destroy(dup_header); + if (in) sam_close(in); + if (out) sam_close(out); + if (inf) fclose(inf); + unlink(outfname); + return; +} + static void faidx1(const char *filename) { int n, n_exp = 0, n_fq_exp = 0; @@ -549,10 +1424,10 @@ static void faidx1(const char *filename) faidx_t *fai; fin = fopen(filename, "rb"); - if (fin == NULL) fail("can't open %s\n", filename); + if (fin == NULL) fail("can't open %s", filename); sprintf(tmpfilename, "%s.tmp", filename); fout = fopen(tmpfilename, "wb"); - if (fout == NULL) fail("can't create temporary %s\n", tmpfilename); + if (fout == NULL) fail("can't create temporary %s", tmpfilename); while (fgets(line, sizeof line, fin)) { if (line[0] == '>') n_exp++; if (line[0] == '+' && line[1] == '\n') n_fq_exp++; @@ -581,6 +1456,45 @@ static void faidx1(const char *filename) fai_destroy(fai); } +static void test_empty_sam_file(const char *filename) +{ + samFile *in = sam_open(filename, "r"); + if (in) { + enum htsExactFormat format = hts_get_format(in)->format; + bam1_t *aln = bam_init1(); + sam_hdr_t *header = sam_hdr_read(in); + int ret = sam_read1(in, header, aln); + + if (format != empty_format) + fail("detected %s as %d (expected empty_format)", filename, format); + if (header) + fail("sam_hdr_read() from %s should fail", filename); + if (ret >= -1) + fail("sam_read1() from %s returned %d but should fail", filename, ret); + + bam_destroy1(aln); + sam_hdr_destroy(header); + sam_close(in); + } + else fail("can't open %s to read as SAM", filename); +} + +static void test_text_file(const char *filename, int nexp) +{ + htsFile *in = hts_open(filename, "r"); + if (in) { + kstring_t str = KS_INITIALIZE; + int ret, n = 0; + while ((ret = hts_getline(in, '\n', &str)) >= 0) n++; + if (ret != -1) fail("hts_getline got an error from %s", filename); + if (n != nexp) fail("hts_getline read %d lines from %s (expected %d)", n, filename, nexp); + + hts_close(in); + free(str.s); + } + else fail("can't open %s to 
read as text", filename); +} + static void check_enum1(void) { // bgzf_compression() returns int, but enjoys this correspondence @@ -589,6 +1503,306 @@ static void check_enum1(void) if (bgzf != 2) fail("bgzf is %d", bgzf); } +static void check_cigar_tab(void) +{ + int i, n_neg = 0; + + for (i = 0; i < 256; ++i) + if (bam_cigar_table[i] < 0) n_neg++; + + if (n_neg + strlen(BAM_CIGAR_STR) != 256) + fail("bam_cigar_table has %d unset entries", n_neg); + + for (i = 0; BAM_CIGAR_STR[i]; ++i) + if (bam_cigar_table[(unsigned char) BAM_CIGAR_STR[i]] != i) + fail("bam_cigar_table['%c'] is not %d", BAM_CIGAR_STR[i], i); +} + +#define MAX_RECS 1000 +#define SEQ_LEN 100 +#define REC_LENGTH 150 // Undersized so some won't fit. + +static int generator(const char *name) +{ + FILE *f = fopen(name, "w"); + char *ref = NULL; + char qual[101]; + size_t i; + uint32_t lfsr = 0xbadcafe; + int res = -1; + + if (!f) { + fail("Couldn't open \"%s\"", name); + return -1; + } + + ref = malloc(MAX_RECS + SEQ_LEN + 1); + if (!ref) goto cleanup; + for (i = 0; i < MAX_RECS + SEQ_LEN; i++) { + // Linear-feedback shift register to make random reference + lfsr ^= lfsr << 13; + lfsr ^= lfsr >> 17; + lfsr ^= lfsr << 5; + ref[i] = "ACGT"[lfsr & 3]; + } + ref[MAX_RECS + SEQ_LEN] = '\0'; + for (i = 0; i < SEQ_LEN; i++) { + qual[i] = 'A' + (i & 0xf); + } + + if (fputs("@HD\tVN:1.4\n", f) < 0) goto cleanup; + if (fprintf(f, "@SQ\tSN:ref1\tLN:%u\n", MAX_RECS + SEQ_LEN) < 0) + goto cleanup; + for (i = 0; i < MAX_RECS; i++) { + if (fprintf(f, "read%zu\t0\tref1\t%zu\t64\t100M\t*\t0\t0\t%.*s\t%.*s\n", + i + 1, i + 1, SEQ_LEN, ref + i, SEQ_LEN, qual) < 0) + goto cleanup; + } + + if (fclose(f) == 0) + res = 0; + f = NULL; + + cleanup: + if (f) fclose(f); + free(ref); + return res; +} + +static int read_data_block(const char *in_name, samFile *fp_in, + const char *out_name, samFile *fp_out, + sam_hdr_t *header, bam1_t *recs, size_t max_recs, + uint8_t *buffer, size_t bufsz, size_t *nrecs_out) { + size_t buff_used 
= 0, nrecs; + uint32_t new_m_data; + int ret = -1, res = -1; + + for (nrecs = 0; nrecs < max_recs; nrecs++) { + bam_set_mempolicy(&recs[nrecs], + BAM_USER_OWNS_STRUCT|BAM_USER_OWNS_DATA); + + recs[nrecs].data = &buffer[buff_used]; + recs[nrecs].m_data = bufsz - buff_used; + + res = sam_read1(fp_in, header, &recs[nrecs]); + if (res < 0) break; // EOF or error + + if (fp_out) { + if (sam_write1(fp_out, header, &recs[nrecs]) < 0) { + nrecs++; // To return correct count + fail("sam_write1() to \"%s\"", out_name); + goto out; + } + } + + if ((bam_get_mempolicy(&recs[nrecs]) & BAM_USER_OWNS_DATA) == 0) { + continue; // Data not put in buffer + } + + new_m_data = ((uint32_t) recs[nrecs].l_data + 7) & (~7U); + if (new_m_data < recs[nrecs].m_data) recs[nrecs].m_data = new_m_data; + + buff_used += recs[nrecs].m_data; + } + if (res < -1) { + fail("sam_read1() from \"%s\" failed", in_name); + } else { + ret = 0; + } + + out: + *nrecs_out = nrecs; + return ret; +} + +static void test_mempolicy(void) +{ + size_t bufsz = MAX_RECS * REC_LENGTH, nrecs = 0, i; + bam1_t *recs = calloc(MAX_RECS, sizeof(bam1_t)); + uint8_t *buffer = malloc(bufsz); + const char *fname = "test/sam_alignment.tmp.sam"; + const char *bam_name = "test/sam_alignment.tmp.bam"; + const char *cram_name = "test/sam_alignment.tmp.cram"; + const char tag_text[] = /* array, not pointer, so sizeof(tag_text) - 1 is the text length */ + "lengthy text ... lengthy text ... lengthy text ... lengthy text ... " + "lengthy text ... lengthy text ... lengthy text ... lengthy text ... " + "lengthy text ... lengthy text ... lengthy text ... lengthy text ... " + "lengthy text ... lengthy text ... lengthy text ... lengthy text ... " + "lengthy text ... lengthy text ... lengthy text ... lengthy text ... 
"; + int res = 0; + samFile *fp = NULL, *bam_fp = NULL, *cram_fp = NULL; + htsFormat cram_fmt; + sam_hdr_t *header = NULL; + + if (!recs || !buffer) { + fail("Allocating buffer"); + goto cleanup; + } + + memset(&cram_fmt, 0, sizeof(cram_fmt)); + + // Make test file + if (generator(fname) < 0) + goto cleanup; + + // Open and read header + fp = sam_open(fname, "r"); + if (!fp) { + fail("sam_open(\"%s\")", fname); + goto cleanup; + } + + bam_fp = sam_open(bam_name, "wb"); + if (!bam_fp) { /* test the handle just opened, not the SAM input */ + fail("sam_open(\"%s\")", bam_name); + goto cleanup; + } + + header = sam_hdr_read(fp); + if (!header) { + fail("read header from \"%s\"", fname); + goto cleanup; + } + + if (sam_hdr_write(bam_fp, header) < 0) { + fail("sam_hdr_write() to \"%s\"", bam_name); + goto cleanup; + } + + if (read_data_block(fname, fp, bam_name, bam_fp, header, recs, + MAX_RECS, buffer, bufsz, &nrecs) < 0) + goto cleanup; + + res = sam_close(bam_fp); + bam_fp = NULL; + if (res < 0) { + fail("sam_close(\"%s\")", bam_name); + goto cleanup; + } + + // Add a big tag to some records so they no longer fit in the allocated + // buffer space. + for (i = 0; i < MAX_RECS; i += 11) { + if (bam_aux_update_str(&recs[i], "ZZ", + sizeof(tag_text) - 1, tag_text) < 0) { + fail("bam_aux_update_str()"); + goto cleanup; + } + } + + // Delete all the records. bam_destroy1() should free the data + // for the ones that were expanded. 
+ for (i = 0; i < nrecs; i++) { + bam_destroy1(&recs[i]); + } + + res = sam_close(fp); + fp = NULL; + if (res < 0) { + fail("sam_close(\"%s\")", fname); + goto cleanup; + } + + // Same test but reading BAM, writing CRAM + nrecs = 0; + sam_hdr_destroy(header); + header = NULL; + + bam_fp = sam_open(bam_name, "r"); + if (!bam_fp) { + fail("sam_open(\"%s\", \"r\")", bam_name); + goto cleanup; + } + + if (hts_parse_format(&cram_fmt, "cram,no_ref") < 0) { + fail("hts_parse_format"); + goto cleanup; + } + cram_fp = hts_open_format(cram_name, "wc", &cram_fmt); + if (!cram_fp) { + fail("hts_open_format(\"%s\", \"wc\")", cram_name); + goto cleanup; + } + + header = sam_hdr_read(bam_fp); + if (!header) { + fail("read header from \"%s\"", bam_name); + goto cleanup; + } + + if (sam_hdr_write(cram_fp, header) < 0) { + fail("sam_hdr_write() to \"%s\"", cram_name); + goto cleanup; + } + + if (read_data_block(bam_name, bam_fp, cram_name, cram_fp, header, recs, + MAX_RECS, buffer, bufsz, &nrecs) < 0) + goto cleanup; + + res = sam_close(cram_fp); + cram_fp = NULL; + if (res < 0) { + fail("sam_close(\"%s\")", cram_name); + goto cleanup; + } + + for (i = 0; i < MAX_RECS; i += 11) { + if (bam_aux_update_str(&recs[i], "ZZ", + sizeof(tag_text) - 1, tag_text) < 0) { + fail("bam_aux_update_str()"); + goto cleanup; + } + } + + for (i = 0; i < nrecs; i++) { + bam_destroy1(&recs[i]); + } + + // Now try reading the cram file + nrecs = 0; + sam_hdr_destroy(header); + header = NULL; + + cram_fp = sam_open(cram_name, "r"); + if (!cram_fp) { + fail("sam_open(\"%s\", \"r\")", cram_name); + goto cleanup; + } + + header = sam_hdr_read(cram_fp); + if (!header) { + fail("read header from \"%s\"", cram_name); + goto cleanup; + } + + if (read_data_block(cram_name, cram_fp, NULL, NULL, header, recs, + MAX_RECS, buffer, bufsz, &nrecs) < 0) + goto cleanup; + + for (i = 0; i < MAX_RECS; i += 11) { + if (bam_aux_update_str(&recs[i], "ZZ", + sizeof(tag_text) - 1, tag_text) < 0) { + 
fail("bam_aux_update_str()"); + goto cleanup; + } + } + + cleanup: + sam_hdr_destroy(header); + if (fp) sam_close(fp); + if (bam_fp) sam_close(bam_fp); + if (cram_fp) sam_close(cram_fp); + + for (i = 0; i < nrecs; i++) { + bam_destroy1(&recs[i]); + } + free(buffer); + free(recs); + if (cram_fmt.specific) { + hts_opt_free(cram_fmt.specific); + } +} + int main(int argc, char **argv) { int i; @@ -598,7 +1812,22 @@ int main(int argc, char **argv) aux_fields1(); iterators1(); samrecord_layout(); + use_header_api(); + test_header_pg_lines(); + test_header_updates(); + test_header_remove_lines(); + test_header_ref_altnames(); + test_empty_sam_file("test/emptyfile"); + test_text_file("test/emptyfile", 0); + test_text_file("test/xx#pair.sam", 7); + test_text_file("test/xx.fa", 7); + test_text_file("test/fastqs.fq", 500); check_enum1(); + check_cigar_tab(); + check_big_ref(0); + check_big_ref(1); + test_mempolicy(); + set_qname(); for (i = 1; i < argc; i++) faidx1(argv[i]); return status; diff --git a/test/simple_test_driver.sh b/test/simple_test_driver.sh new file mode 100644 index 000000000..7e4a3bdc2 --- /dev/null +++ b/test/simple_test_driver.sh @@ -0,0 +1,190 @@ +#!/bin/sh +# simple_test_driver.sh -- shell functions for test scripts +# +# Copyright (C) 2017-2018 Genome Research Ltd. +# +# Author: Robert Davies +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# Executes a single test and compares against the expected output +run_test() { + # Expected result: pass (P) / fail (F) / nonzero exit (N) + p="$1"; shift + # File with expected output (empty or '.' if none) + e="$1"; shift + # Test result + r="P" + # Why the test failed + y="" + if [ "x$test_iter" = "x" ] + then + test_iter=1 + else + test_iter=`expr $test_iter + 1` + fi + result=`eval ${@+"$@"} 2>_err.tmp > _out.tmp` + if [ $? != 0 ] + then + if [ "$p" != "N" ] + then + # Expected zero exit code, got non-zero + r="F" + y="exit_code" + else + # Expected non-zero exit code and got it + r="P" + fi + elif [ "$p" = "N" ] + then + # Expected non-zero exit code, but got zero + r="F" + y="exit_code" + elif [ "x$e" != "x" -a "$e" != "." ] + then + sed -n 's/.*/&/p' _out.tmp > _out.tmp2 + if cmp -s _out.tmp2 "$e" + then + # Output was as expected + r="P" + rm -f _out.tmp _out.tmp2 _err.tmp + else + # Output differed + r="F" + y="output" + fi + else + # Expected zero exit code and got it. + r="P" + rm -f _out.tmp _out.tmp2 _err.tmp + fi + + if [ "$r" = "F" ] + then + # Test failed + case "$p" in + [PN]) + echo "FAIL : $@" + if [ "x$e" != "x" -a "$e" != "." 
] + then + keep_output="FAIL-$e.${test_iter}" + else + keep_output="FAIL.${test_iter}" + fi + mv _out.tmp "${keep_output}.out" + mv _err.tmp "${keep_output}.err" + nufail=`expr $nufail + 1` + if [ "$y" = "exit_code" ] + then + if [ "$p" != "N" ] + then + echo "Got non-zero exit code" + else + echo "Got unexpected zero exit code" + fi + echo "See ${keep_output}.{out,err} for output" + else + echo "Output differed from expected result" + echo "Compare $e ${keep_output}.out" + fi + ;; + *) + echo "XFAIL: $@" + nefail=`expr $nefail + 1` + ;; + esac + else + # Test passed + case "$p" in + "P") + echo "PASS : $@" + nepass=`expr $nepass + 1` + ;; + "N") + echo "PASS : $@ (must exit non-zero)" + nepass=`expr $nepass + 1` + ;; + *) + echo "XPASS: $@" + nupass=`expr $nupass + 1` + ;; + esac + fi +} + +# Reads in a file containing a list of tests and executes them. +# The format for the file is: +# First field: +# INIT = initialisation, not counted in testing +# P = expected to pass (zero return; expected output matches, if present) +# N = expected to return non-zero +# F = expected to fail +# +# Second field (P/N/F only): +# Filename of expected output. If '.', output is not checked +# +# Rest: +# Shell command to execute. The command is executed using `eval` so +# all normal shell substitutions will be done first. + +test_driver() { + nupass=0; nepass=0 + nufail=0; nefail=0 + + exec 9<"$1" + while read -r line <&9 + do + set -- $line + case $1 in + "#"*) # skip comments + ;; + "") # skip blank lines too + ;; + + "INIT") + shift + eval ${@+"$@"} > /dev/null + if [ $? 
!= 0 ] + then + echo "INIT FAIL: $@" + return 1 + fi + ;; + + *) + p=$1;shift + o=$1;shift + run_test "$p" "$o" ${@+"$@"} + ;; + esac + done + exec 9<&- + + echo "" + echo "Expected passes: $nepass" + echo "Unexpected passes: $nupass" + echo "Expected failures: $nefail" + echo "Unexpected failures: $nufail" + if [ "$nupass" -gt 0 -o "$nufail" -gt 0 ] + then + return 1 + else + return 0 + fi +} diff --git a/test/tabix.out b/test/tabix.out new file mode 100644 index 000000000..0e61ac759 --- /dev/null +++ b/test/tabix.out @@ -0,0 +1 @@ +1 10000060 . C <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 diff --git a/test/tabix/test-tabix.sh b/test/tabix/test-tabix.sh index 4a4b0456a..e9a5a9c5a 100755 --- a/test/tabix/test-tabix.sh +++ b/test/tabix/test-tabix.sh @@ -1,6 +1,6 @@ #!/bin/sh # -# Copyright (C) 2017 Genome Research Ltd. +# Copyright (C) 2017-2018 Genome Research Ltd. # # Author: Robert Davies # @@ -22,162 +22,14 @@ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. -# Executes a single test and compares against the expected output -run_test() { - # Expected result: pass (P) / fail (F) / nonzero exit (N) - p="$1"; shift - # File with expected output (empty or '.' if none) - e="$1"; shift - # Test result - r="P" - # Why the test failed - y="" - if [ "x$test_iter" = "x" ] - then - test_iter=1 - else - test_iter=`expr $test_iter + 1` - fi - result=`eval ${@+"$@"} 2>_err.tmp > _out.tmp` - if [ $? != 0 ] - then - if [ "$p" != "N" ] - then - # Expected zero exit code, got non-zero - r="F" - y="exit_code" - else - # Expected non-zero exit code and got it - r="P" - fi - elif [ "$p" = "N" ] - then - # Expected non-zero exit code, but got zero - r="F" - y="exit_code" - elif [ "x$e" != "x" -a "$e" != "." 
] - then - sed -n 's/.*/&/p' _out.tmp > _out.tmp2 - if cmp -s _out.tmp2 "$e" - then - # Output was as expected - r="P" - rm -f _out.tmp _out.tmp2 _err.tmp - else - # Output differed - r="F" - y="output" - fi - else - # Expected zero exit code and got it. - r="P" - rm -f _out.tmp _out.tmp2 _err.tmp - fi - - if [ "$r" = "F" ] - then - # Test failed - case "$p" in - [PN]) - echo "FAIL : $@" - if [ "x$e" != "x" -a "$e" != "." ] - then - keep_output="FAIL-$e.${test_iter}" - else - keep_output="FAIL.${test_iter}" - fi - mv _out.tmp "${keep_output}.out" - mv _err.tmp "${keep_output}.err" - nufail=`expr $nufail + 1` - if [ "$y" = "exit_code" ] - then - if [ "$p" != "N" ] - then - echo "Got non-zero exit code" - else - echo "Got unexpected zero exit code" - fi - echo "See ${keep_output}.{out,err} for output" - else - echo "Output differed from expected result" - echo "Compare $e ${keep_output}.out" - fi - ;; - *) - echo "XFAIL: $@" - nefail=`expr $nefail + 1` - ;; - esac - else - # Test passed - case "$p" in - "P") - echo "PASS : $@" - nepass=`expr $nepass + 1` - ;; - "N") - echo "PASS : $@ (must exit non-zero)" - nepass=`expr $nepass + 1` - ;; - *) - echo "XPASS: $@" - nupass=`expr $nupass + 1` - ;; - esac - fi -} - -tabix_test() { - nupass=0; nepass=0 - nufail=0; nefail=0 - - exec 9<"$1" - while read -r line <&9 - do - set -- $line - case $1 in - "#"*) # skip comments - ;; - "") # skip blank lines too - ;; - - "INIT") - shift - eval ${@+"$@"} > /dev/null - if [ $? != 0 ] - then - echo "INIT FAIL: $@" - return 1 - fi - ;; - - *) - p=$1;shift - o=$1;shift - run_test "$p" "$o" ${@+"$@"} - ;; - esac - done - exec 9<&- - - echo "" - echo "Expected passes: $nepass" - echo "Unexpected passes: $nupass" - echo "Expected failures: $nefail" - echo "Unexpected failures: $nufail" - if [ "$nupass" -gt 0 -o "$nufail" -gt 0 ] - then - return 1 - else - return 0 - fi -} +# Load in the test driver +. ../simple_test_driver.sh echo "Testing tabix..." 
bgzip="../../bgzip" tabix="../../tabix" -tabix_test $@ +test_driver $@ exit $? diff --git a/test/test-bcf-sr.c b/test/test-bcf-sr.c index ebe93904a..ee0aadedc 100644 --- a/test/test-bcf-sr.c +++ b/test/test-bcf-sr.c @@ -31,6 +31,7 @@ #include #include #include +#include #include void error(const char *format, ...) @@ -103,7 +104,7 @@ int main(int argc, char *argv[]) { if ( !bcf_sr_has_line(sr,i) ) continue; bcf1_t *rec = bcf_sr_get_line(sr, i); - printf("%s:%d", bcf_seqname(bcf_sr_get_header(sr,i),rec),rec->pos+1); + printf("%s:%"PRIhts_pos, bcf_seqname(bcf_sr_get_header(sr,i),rec),rec->pos+1); break; } diff --git a/test/test-bcf-sr.pl b/test/test-bcf-sr.pl index 7a5887c2f..51e4fa164 100755 --- a/test/test-bcf-sr.pl +++ b/test/test-bcf-sr.pl @@ -1,9 +1,27 @@ #!/usr/bin/env perl +# test-bcf-sr.pl -- Test bcf synced reader's allele pairing # -# Author: petr.danecek@sanger +# Copyright (C) 2017-2018 Genome Research Ltd. # -# Test bcf synced reader's allele pairing +# Author: petr.danecek@sanger # +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. use strict; use warnings; diff --git a/test/test-bcf-translate.c b/test/test-bcf-translate.c index f799c340e..b3ddac59b 100644 --- a/test/test-bcf-translate.c +++ b/test/test-bcf-translate.c @@ -1,6 +1,6 @@ /* test/test-bcf-translate.c - Copyright (C) 2017 Genome Research Ltd. + Copyright (C) 2017-2018 Genome Research Ltd. Author: Petr Danecek @@ -27,6 +27,15 @@ #include #include +void error(const char *format, ...) +{ + va_list ap; + va_start(ap, format); + vfprintf(stderr, format, ap); + va_end(ap); + exit(-1); +} + int main(int argc, char **argv) { char *fname = argc>1 ? argv[1] : "/dev/null"; @@ -63,12 +72,12 @@ int main(int argc, char **argv) bcf_hdr_add_sample(hdr1,"SMPL2"); bcf_hdr_add_sample(hdr2,"SMPL1"); bcf_hdr_add_sample(hdr2,"SMPL2"); - bcf_hdr_sync(hdr1); - bcf_hdr_sync(hdr2); + if (bcf_hdr_sync(hdr1) < 0) error("bcf_hdr_sync(hdr1)"); + if (bcf_hdr_sync(hdr2) < 0) error("bcf_hdr_sync(hdr2)"); hdr2 = bcf_hdr_merge(hdr2,hdr1); - bcf_hdr_sync(hdr2); - bcf_hdr_write(fp, hdr2); + if (bcf_hdr_sync(hdr2) < 0) error("bcf_hdr_sync(hdr2) after merge"); + if ( bcf_hdr_write(fp, hdr2)!=0 ) error("Failed to write to %s\n", fname); bcf1_t *rec = bcf_init1(); rec->rid = bcf_hdr_name2id(hdr1, "1"); @@ -91,7 +100,7 @@ int main(int argc, char **argv) bcf_update_format_int32(hdr1, rec, "FMT2", NULL, 0); bcf_translate(hdr2, hdr1, rec); - bcf_write(fp, hdr2, rec); + if ( bcf_write(fp, hdr2, rec)!=0 ) error("Faild to write to %s\n", fname); // Clean bcf_destroy1(rec); diff --git a/test/test-parse-reg.c b/test/test-parse-reg.c new file mode 100644 index 000000000..c4d490f3d --- /dev/null +++ b/test/test-parse-reg.c @@ -0,0 +1,204 @@ +/* + Copyright (C) 2018-2019 Genome Research Ltd. 
+ + Author: James Bonfield + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +/* + Test region description parser. + Usage: test-parse-reg [-c] file.bam region + test-parse-reg [-c] -m file.bam region,region... + test-parse-reg -t + + -c is chr:pos is a single base coordinate, ie chr:pos-pos, + otherwise it is chr:pos- + -m is multi-region list. 
+ -t runs built-in tests + + ./test/test-parse-reg -c -m test/colons.bam "{chr1:100-200},{chr1}:100-200,{chr1:100-200}:100,{chr1,chr3},chr1:" +*/ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +void reg_expected(sam_hdr_t *hdr, const char *reg, int flags, + char *reg_exp, int tid_exp, hts_pos_t beg_exp, hts_pos_t end_exp) { + const char *reg_out; + int tid_out = -1; + hts_pos_t beg_out = -1, end_out = -1; + + reg_out = sam_parse_region(hdr, reg, &tid_out, &beg_out, &end_out, flags); + + if ((reg_out != NULL) != (reg_exp != NULL) || + (reg_out && reg_exp && strcmp(reg_out, reg_exp) != 0) || + (reg_exp && tid_out != tid_exp) || + (reg_exp && beg_out != beg_exp) || + (reg_exp && end_out != end_exp)) { + fprintf(stderr, "Parsing \"%s\" expected return \"%s\", %d:%"PRIhts_pos"-%"PRIhts_pos", " + "but got \"%s\", %d:%"PRIhts_pos"-%"PRIhts_pos"\n", + reg, + reg_exp?reg_exp:"(null)", tid_exp, beg_exp, end_exp, + reg_out?reg_out:"(null)", tid_out, beg_out, end_out); + exit(1); + } +} + +int reg_test(char *fn) { + samFile *fp; + sam_hdr_t *hdr; + + if (!(fp = sam_open(fn, "r"))) + return 1; + + if (!(hdr = sam_hdr_read(fp))) + return 1; + + // 0 chr1 + // 1 chr1:100 + // 2 chr1:100-200 + // 3 chr2:100-200 + // 4 chr3 + // 5 chr1,chr3 + + // Check range extensions. 
+ reg_expected(hdr, "chr1", 0, "", 0, 0, HTS_POS_MAX); + reg_expected(hdr, "chr1:50", 0, "", 0, 49, HTS_POS_MAX); + reg_expected(hdr, "chr1:50", HTS_PARSE_ONE_COORD, "", 0, 49, 50); + reg_expected(hdr, "chr1:50-100", 0, "", 0, 49, 100); + reg_expected(hdr, "chr1:50-", 0, "", 0, 49, HTS_POS_MAX); + reg_expected(hdr, "chr1:-50", 0, "", 0, 0, 50); + + // Check quoting + fprintf(stderr, "Expected error: "); + reg_expected(hdr, "chr1:100-200", 0, NULL, 0, 0, 0); // ambiguous + reg_expected(hdr, "{chr1}:100-200", 0, "", 0, 99, 200); + reg_expected(hdr, "{chr1:100-200}", 0, "", 2, 0, HTS_POS_MAX); + reg_expected(hdr, "{chr1:100-200}:100-200", 0, "", 2, 99, 200); + reg_expected(hdr, "{chr2:100-200}:100-200", 0, "", 3, 99, 200); + reg_expected(hdr, "chr2:100-200:100-200", 0, "", 3, 99, 200); + reg_expected(hdr, "chr2:100-200", 0, "", 3, 0, HTS_POS_MAX); + + // Check numerics + reg_expected(hdr, "chr3", 0, "", 4, 0, HTS_POS_MAX); + reg_expected(hdr, "chr3:", 0, "", 4, 0, HTS_POS_MAX); + reg_expected(hdr, "chr3:1000-1500", 0, "", 4, 999, 1500); + reg_expected(hdr, "chr3:1,000-1,500", 0, "", 4, 999, 1500); + reg_expected(hdr, "chr3:1k-1.5K", 0, "", 4, 999, 1500); + reg_expected(hdr, "chr3:1e3-1.5e3", 0, "", 4, 999, 1500); + reg_expected(hdr, "chr3:1e3-15e2", 0, "", 4, 999, 1500); + + // Check list mode + reg_expected(hdr, "chr1,chr3", HTS_PARSE_LIST, "chr3", 0, 0, HTS_POS_MAX); + fprintf(stderr, "Expected error: "); + reg_expected(hdr, "chr1:100-200,chr3", HTS_PARSE_LIST, NULL, 0, 0, 0); // ambiguous + reg_expected(hdr, "{chr1,chr3}", HTS_PARSE_LIST, "", 5, 0, HTS_POS_MAX); + reg_expected(hdr, "{chr1,chr3},chr1", HTS_PARSE_LIST, "chr1", 5, 0, HTS_POS_MAX); + // incorrect usage; first reg is valid (but not what user expects). 
+ reg_expected(hdr, "chr3:1,000-1,500", HTS_PARSE_LIST | HTS_PARSE_ONE_COORD, "000-1,500", 4, 0, 1); + + // More expected failures + reg_expected(hdr, "chr2", 0, NULL, 0, 0, 0); + reg_expected(hdr, "chr1,", 0, NULL, 0, 0, 0); + fprintf(stderr, "Expected error: "); + reg_expected(hdr, "{chr1", 0, NULL, 0, 0, 0); + reg_expected(hdr, "chr1:10-10", 0, "", 0, 9, 10); // OK + reg_expected(hdr, "chr1:10-9", 0, NULL, 0, 0, 0); // Issue#353 + fprintf(stderr, "Expected error: "); + reg_expected(hdr, "chr1:x", 0, NULL, 0, 0, 0); + fprintf(stderr, "Expected error: "); + reg_expected(hdr, "chr1:1-y", 0, NULL, 0, 0, 0); + fprintf(stderr, "Expected error: "); + reg_expected(hdr, "chr1:1,chr3", 0, NULL, 0, 0, 0); + + sam_hdr_destroy(hdr); + sam_close(fp); + + exit(0); +} + +int main(int argc, char **argv) { + sam_hdr_t *hdr; + samFile *fp; + int flags = 0; + + while (argc > 1) { + if (strcmp(argv[1], "-m") == 0) { + flags |= HTS_PARSE_LIST; + argc--; argv++; + continue; + } + + if (strcmp(argv[1], "-c") == 0) { + flags |= HTS_PARSE_ONE_COORD; + argc--; argv++; + continue; + } + + // Automatic mode for test harness + if (strcmp(argv[1], "-t") == 0) + reg_test(argv[2]); + + break; + } + + // Interactive mode for debugging + if (argc != 3) { + fprintf(stderr, "Usage: test-parse-reg [-m] [-c] region[,region]...\n"); + exit(1); + } + + if (!(fp = sam_open(argv[1], "r"))) { + perror(argv[1]); + exit(1); + } + + if (!(hdr = sam_hdr_read(fp))) { + fprintf(stderr, "Couldn't read header\n"); + exit(1); + } + + const char *reg = argv[2]; + while (*reg) { + int tid; + hts_pos_t beg, end; + reg = sam_parse_region(hdr, reg, &tid, &beg, &end, flags); + if (!reg) { + fprintf(stderr, "Failed to parse region\n"); + exit(1); + } + printf("%-20s %12"PRIhts_pos" %12"PRIhts_pos"\n", + tid == -1 ? 
"*" : hdr->target_name[tid], + beg, end); + } + + sam_hdr_destroy(hdr); + sam_close(fp); + + return 0; +} diff --git a/test/test-regidx.c b/test/test-regidx.c index 30844a672..dcb065f7d 100644 --- a/test/test-regidx.c +++ b/test/test-regidx.c @@ -1,6 +1,8 @@ /* test/test-regidx.c -- Regions index test harness. - Copyright (C) 2014 Genome Research Ltd. + gcc -g -Wall -O0 -I. -I../htslib/ -L../htslib regidx.c -o test-regidx test-regidx.c -lhts + + Copyright (C) 2014,2016,2018 Genome Research Ltd. Author: Petr Danecek @@ -24,15 +26,42 @@ */ #include - #include #include #include +#include #include -#include -#include "hts_internal.h" +#include +#include +#include "htslib/kstring.h" +#include "htslib/regidx.h" +#include "htslib/hts_defs.h" +#include "textutils_internal.h" + +static int verbose = 0; + +HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) +static void debug(const char *format, ...) +{ + if ( verbose<2 ) return; + va_list ap; + va_start(ap, format); + vfprintf(stderr, format, ap); + va_end(ap); +} + +HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) +static void info(const char *format, ...) +{ + if ( verbose<1 ) return; + va_list ap; + va_start(ap, format); + vfprintf(stderr, format, ap); + va_end(ap); +} -void error(const char *format, ...) +HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) +static void error(const char *format, ...) { va_list ap; va_start(ap, format); @@ -41,10 +70,10 @@ void error(const char *format, ...) 
exit(-1); } -int custom_parse(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr) +int custom_parse(const char *line, char **chr_beg, char **chr_end, hts_pos_t *beg, hts_pos_t *end, void *payload, void *usr) { // Use the standard parser for CHROM,FROM,TO - int i, ret = regidx_parse_tab(line,chr_beg,chr_end,reg,NULL,NULL); + int i, ret = regidx_parse_tab(line,chr_beg,chr_end,beg,end,NULL,NULL); if ( ret!=0 ) return ret; // Skip the fields that were parsed above @@ -73,7 +102,44 @@ void custom_free(void *payload) free(*dat); } -int main(int argc, char **argv) +void test_sequential_access(void) +{ + // Init index with no file name, we will insert the regions manually + regidx_t *idx = regidx_init(NULL,custom_parse,custom_free,sizeof(char*),NULL); + if ( !idx ) error("init failed\n"); + + // Insert regions + kstring_t str = {0,0,0}; + int i, n = 10; + for (i=0; ibeg!=itr->end || itr->beg+1!=10*(i+1) ) error("listing failed, expected %d, found %"PRIhts_pos"\n",10*(i+1),itr->beg+1); + str.l = 0; + ksprintf(&str,"%"PRIhts_pos, itr->beg+1); + if ( strcmp(regitr_payload(itr,char*),str.s) ) error("listing failed, expected payload \"%s\", found \"%s\"\n",str.s,regitr_payload(itr,char*)); + i++; + } + if ( i!=n ) error("Expected %d regions, listed %d\n", n,i); + debug("ok: listed %d regions\n", n); + + // Clean up + regitr_destroy(itr); + regidx_destroy(idx); + free(str.s); +} + +void test_custom_payload(void) { // Init index with no file name, we will insert the regions manually regidx_t *idx = regidx_init(NULL,custom_parse,custom_free,sizeof(char*),NULL); @@ -85,32 +151,320 @@ int main(int argc, char **argv) line = "1 20000000 20000001 1:20000000-20000001"; if ( regidx_insert(idx,line)!=0 ) error("insert failed: %s\n", line); line = "1 20000002 20000002 1:20000002-20000002"; if ( regidx_insert(idx,line)!=0 ) error("insert failed: %s\n", line); line = "1 30000000 30000000 1:30000000-30000000"; if ( regidx_insert(idx,line)!=0 ) error("insert 
failed: %s\n", line); - - // Finish initialization - regidx_insert(idx,NULL); + line = "1 8000000000 8000000000 1:8000000000-8000000000"; if ( regidx_insert(idx,line)!=0 ) error("insert failed: %s\n", line); // Test - regitr_t itr; - int from, to; + regitr_t *itr = regitr_init(idx); + hts_pos_t from, to; from = to = 10000000; - if ( !regidx_overlap(idx,"1",from-1,to-1,&itr) ) error("query failed: 1:%d-%d\n",from,to); - if ( strcmp("1:10000000-10000000",REGITR_PAYLOAD(itr,char*)) ) error("query failed: 1:%d-%d vs %s\n", from,to,REGITR_PAYLOAD(itr,char*)); - if ( !regidx_overlap(idx,"1",from-2,to-1,&itr) ) error("query failed: 1:%d-%d\n",from-1,to); - if ( !regidx_overlap(idx,"1",from-2,to+3,&itr) ) error("query failed: 1:%d-%d\n",from-1,to+2); - if ( regidx_overlap(idx,"1",from-2,to-2,&itr) ) error("query failed: 1:%d-%d\n",from-1,to-1); + if ( !regidx_overlap(idx,"1",from-1,to-1,itr) ) error("query failed: 1:%"PRIhts_pos"-%"PRIhts_pos"\n",from,to); + if ( strcmp("1:10000000-10000000",regitr_payload(itr,char*)) ) error("query failed: 1:%"PRIhts_pos"-%"PRIhts_pos" vs %s\n", from,to,regitr_payload(itr,char*)); + if ( !regidx_overlap(idx,"1",from-2,to-1,itr) ) error("query failed: 1:%"PRIhts_pos"-%"PRIhts_pos"\n",from-1,to); + if ( !regidx_overlap(idx,"1",from-2,to+3,itr) ) error("query failed: 1:%"PRIhts_pos"-%"PRIhts_pos"\n",from-1,to+2); + if ( regidx_overlap(idx,"1",from-2,to-2,itr) ) error("query failed: 1:%"PRIhts_pos"-%"PRIhts_pos"\n",from-1,to-1); from = to = 20000000; - if ( !regidx_overlap(idx,"1",from-1,to-1,&itr) ) error("query failed: 1:%d-%d\n",from,to); + if ( !regidx_overlap(idx,"1",from-1,to-1,itr) ) error("query failed: 1:%"PRIhts_pos"-%"PRIhts_pos"\n",from,to); from = to = 20000002; - if ( !regidx_overlap(idx,"1",from-1,to-1,&itr) ) error("query failed: 1:%d-%d\n",from,to); + if ( !regidx_overlap(idx,"1",from-1,to-1,itr) ) error("query failed: 1:%"PRIhts_pos"-%"PRIhts_pos"\n",from,to); from = to = 30000000; - if ( 
!regidx_overlap(idx,"1",from-1,to-1,&itr) ) error("query failed: 1:%d-%d\n",from,to); + if ( !regidx_overlap(idx,"1",from-1,to-1,itr) ) error("query failed: 1:%"PRIhts_pos"-%"PRIhts_pos"\n",from,to); + + from = to = 8000000000; + if ( !regidx_overlap(idx,"1",from-1,to-1,itr) ) error("query failed: 1:%"PRIhts_pos"-%"PRIhts_pos"\n",from,to); + + // This shouldn't bring anything back + from &= 0xffffffffU; + to &= 0xffffffffU; + if ( regidx_overlap(idx,"1",from-1,to-1,itr) ) error("query should not succeed: 1:%"PRIhts_pos"-%"PRIhts_pos"\n",from,to); // Clean up + regitr_destroy(itr); regidx_destroy(idx); +} + +void get_random_region(uint32_t min, uint32_t max, uint32_t *beg, uint32_t *end) +{ + long int b = rand(), e = rand(); + *beg = min + (float)b * (max-min) / RAND_MAX; + *end = *beg + (float)e * (max-*beg) / RAND_MAX; +} + +void test_random(int nregs, uint32_t min, uint32_t max) +{ + min--; + max--; + + // Init index with no file name, we will insert the regions manually + regidx_t *idx = regidx_init(NULL,custom_parse,custom_free,sizeof(char*),NULL); + if ( !idx ) error("init failed\n"); + + // Test region + uint32_t beg,end; + get_random_region(min,max,&beg,&end); + + // Insert regions + int i, nexp = 0; + kstring_t str = {0,0,0}; + for (i=0; i=beg && b<=end ) nexp++; + } + + // Test + regitr_t *itr = regitr_init(idx); + int nhit = 0, ret = regidx_overlap(idx,"1",beg,end,itr); + if ( nexp && !ret ) error("query failed, expected %d overlap(s), found none: %d-%d\n", nexp,beg+1,end+1); + if ( !nexp && ret ) error("query failed, expected no overlaps, found some: %d-%d\n", beg+1,end+1); + while ( ret && regitr_overlap(itr) ) + { + str.l = 0; + ksprintf(&str,"1:%"PRIhts_pos"-%"PRIhts_pos"",itr->beg+1,itr->end+1); + if ( strcmp(str.s,regitr_payload(itr,char*)) ) + error("query failed, incorrect payload: %s vs %s (%d-%d)\n",str.s,regitr_payload(itr,char*),beg+1,end+1); + if ( itr->beg > end || itr->end < beg ) + error("query failed, incorrect hit: %d-%d vs 
%"PRIhts_pos"-%"PRIhts_pos", payload %s\n", beg+1,end+1,itr->beg+1,itr->end+1,regitr_payload(itr,char*)); + nhit++; + } + if ( nexp!=nhit ) error("query failed, expected %d overlap(s), found %d: %d-%d\n",nexp,nhit,beg+1,end+1); + debug("ok: found %d overlaps\n", nexp); + + // Clean up + regitr_destroy(itr); + regidx_destroy(idx); + free(str.s); +} +void test_explicit(char *tgt, char *qry, char *exp) +{ + regidx_t *idx = regidx_init(NULL,regidx_parse_reg,NULL,0,NULL); + + char *beg = tgt, *end, *exp_ori = exp; + kstring_t str = {0,0,0}; + while ( *beg ) + { + end = tgt; + while ( *end && *end!=';' ) end++; + str.l = 0; + kputsn(beg, end-beg, &str); + debug("insert: %s\n", str.s); + if ( regidx_insert(idx,str.s)!=0 ) error("insert failed: %s\n", str.s); + beg = *end ? end + 1 : end; + } + + beg = qry; + while ( *beg ) + { + end = qry; + while ( *end && *end!=';' ) end++; + str.l = 0; + kputsn(beg, end-beg, &str); + beg = *end ? end + 1 : end; + + char *chr_beg, *chr_end; + hts_pos_t reg_beg, reg_end; + if ( regidx_parse_reg(str.s, &chr_beg, &chr_end, ®_beg, ®_end, NULL, NULL)!=0 ) error("could not parse: %s in %s\n", str.s, qry); + chr_end[1] = 0; + int hit = regidx_overlap(idx,chr_beg,reg_beg,reg_end,NULL); + if ( *exp=='1' ) + { + if ( !hit ) + { + error("query failed, there should be a hit .. %s:%"PRIhts_pos"-%"PRIhts_pos"\n",chr_beg, reg_beg+1, reg_end+1); + } + else + { + debug("ok: overlap found for %s:%"PRIhts_pos"-%"PRIhts_pos"\n",chr_beg,reg_beg+1,reg_end+1); + } + } + else if ( *exp=='0' ) + { + if ( hit ) + { + error("query failed, there should be no hit .. 
%s:%"PRIhts_pos"-%"PRIhts_pos"\n",chr_beg,reg_beg+1,reg_end+1); + } + else + { + debug("ok: no overlap found for %s:%"PRIhts_pos"-%"PRIhts_pos"\n",chr_beg,reg_beg+1,reg_end+1); + } + } + else error("could not parse: %s\n", exp_ori); + exp++; + } + + free(str.s); + regidx_destroy(idx); +} + +void create_line_bed(char *line, char *chr, int start, int end) +{ + sprintf(line,"%s\t%d\t%d\n",chr,start-1,end); +} +void create_line_tab(char *line, char *chr, int start, int end) +{ + sprintf(line,"%s\t%d\t%d\n",chr,start,end); +} +void create_line_reg(char *line, char *chr, int start, int end) +{ + sprintf(line,"%s:%d-%d\n",chr,start,end); +} + +typedef void (*set_line_f)(char *line, char *chr, int start, int end); + +void test(set_line_f set_line, regidx_parse_f parse) +{ + regidx_t *idx = regidx_init(NULL,parse,NULL,0,NULL); + if ( !idx ) error("init failed\n"); + + char line[250], *chr = "1"; + int i, n = 10, start, end, nhit; + for (i=1; ibeg > end-1 || itr->end < start-1 ) error("query failed, incorrect region: %"PRIhts_pos"-%"PRIhts_pos" for %d-%d\n",itr->beg+1,itr->end+1,start,end); + debug("\t %"PRIhts_pos"-%"PRIhts_pos"\n",itr->beg+1,itr->end+1); + nhit++; + } + if ( nhit!=1 ) error("query failed, expected one hit, found %d: %s:%d-%d\n",nhit,chr,start,end); + + + // one hit + start = end = 10*i+1; + if ( !regidx_overlap(idx,chr,start-1,end-1,itr) ) error("query failed, there should be a hit: %s:%d-%d\n",chr,start,end); + debug("ok: overlap(s) found for %s:%d-%d\n",chr,start,end); + nhit = 0; + while ( regitr_overlap(itr) ) + { + if ( itr->beg > end-1 || itr->end < start-1 ) error("query failed, incorrect region: %"PRIhts_pos"-%"PRIhts_pos" for %d-%d\n",itr->beg+1,itr->end+1,start,end); + debug("\t %"PRIhts_pos"-%"PRIhts_pos"\n",itr->beg+1,itr->end+1); + nhit++; + } + if ( nhit!=1 ) error("query failed, expected one hit, found %d: %s:%d-%d\n",nhit,chr,start,end); + + + // two hits + start = 10*i; end = start+1; + if ( !regidx_overlap(idx,chr,start-1,end-1,itr) ) 
error("query failed, there should be a hit: %s:%d-%d\n",chr,start,end); + debug("ok: overlap(s) found for %s:%d-%d\n",chr,start,end); + nhit = 0; + while ( regitr_overlap(itr) ) + { + if ( itr->beg > end-1 || itr->end < start-1 ) error("query failed, incorrect region: %"PRIhts_pos"-%"PRIhts_pos" for %d-%d\n",itr->beg+1,itr->end+1,start,end); + debug("\t %"PRIhts_pos"-%"PRIhts_pos"\n",itr->beg+1,itr->end+1); + nhit++; + } + if ( nhit!=2 ) error("query failed, expected two hits, found %d: %s:%d-%d\n",nhit,chr,start,end); + + // fully contained interval, one hit + start = 20000*i - 5000; end = 20000*i + 3000; + set_line(line,chr,start,end); + if ( !regidx_overlap(idx,chr,start-1,end-1,itr) ) error("query failed, there should be a hit: %s:%d-%d\n",chr,start,end); + debug("ok: overlap(s) found for %s:%d-%d\n",chr,start,end); + nhit = 0; + while ( regitr_overlap(itr) ) + { + if ( itr->beg > end-1 || itr->end < start-1 ) error("query failed, incorrect region: %"PRIhts_pos"-%"PRIhts_pos" for %d-%d\n",itr->beg+1,itr->end+1,start,end); + debug("\t %"PRIhts_pos"-%"PRIhts_pos"\n",itr->beg+1,itr->end+1); + nhit++; + } + if ( nhit!=1 ) error("query failed, expected one hit, found %d: %s:%d-%d\n",nhit,chr,start,end); + } + regitr_destroy(itr); + regidx_destroy(idx); +} + +static void usage(void) +{ + fprintf(stderr, "Usage: test-regidx [OPTIONS]\n"); + fprintf(stderr, "Options:\n"); + fprintf(stderr, " -h, --help this help message\n"); + fprintf(stderr, " -s, --seed random seed\n"); + fprintf(stderr, " -v, --verbose increase verbosity by giving multiple times\n"); + + exit(1); +} + +int main(int argc, char **argv) +{ + static struct option loptions[] = + { + {"help",0,0,'h'}, + {"verbose",0,0,'v'}, + {"seed",1,0,'s'}, + {0,0,0,0} + }; + int c; + int seed = (int)time(NULL); + while ((c = getopt_long(argc, argv, "hvs:",loptions,NULL)) >= 0) + { + switch (c) + { + case 's': seed = atoi(optarg); break; + case 'v': verbose++; break; + default: usage(); break; + } + } + + info("Testing 
sequential access\n"); + test_sequential_access(); + + info("Testing TAB\n"); + test(create_line_tab,regidx_parse_tab); + + info("Testing REG\n"); + test(create_line_reg,regidx_parse_reg); + + info("Testing BED\n"); + test(create_line_bed,regidx_parse_bed); + + info("Testing custom payload\n"); + test_custom_payload(); + + info("Testing cases encountered in past\n"); + test_explicit("12:2064519-2064763","12:2064488-2067434","1"); + + int i, ntest = 1000, nreg = 50; + srand(seed); + info("%d randomized tests, %d regions per test. Random seed is %d\n", ntest,nreg,seed); + for (i=0; i @@ -30,72 +30,105 @@ DEALINGS IN THE SOFTWARE. */ #include #include +void error(const char *format, ...) +{ + va_list ap; + va_start(ap, format); + vfprintf(stderr, format, ap); + va_end(ap); + if (strrchr(format, '\n') == NULL) fputc('\n', stderr); + exit(-1); +} + +#define STRINGIFY(x) #x +#define check0(x) ((x) == 0 ? (void) 0 : error("Failed: %s", STRINGIFY(x))) + void write_bcf(char *fname) { // Init htsFile *fp = hts_open(fname,"wb"); + if (!fp) error("Failed to open \"%s\" : %s", fname, strerror(errno)); bcf_hdr_t *hdr = bcf_hdr_init("w"); + if (!hdr) error("bcf_hdr_init : %s", strerror(errno)); bcf1_t *rec = bcf_init1(); + if (!rec) error("bcf_init1 : %s", strerror(errno)); // Create VCF header kstring_t str = {0,0,0}; - bcf_hdr_append(hdr, "##fileDate=20090805"); - bcf_hdr_append(hdr, "##FORMAT="); - bcf_hdr_append(hdr, "##INFO="); - bcf_hdr_append(hdr, "##FILTER="); - bcf_hdr_append(hdr, "##unused="); - bcf_hdr_append(hdr, "##unused=unformatted text 1"); - bcf_hdr_append(hdr, "##unused=unformatted text 2"); - bcf_hdr_append(hdr, "##contig="); - bcf_hdr_append(hdr, "##source=myImputationProgramV3.1"); - bcf_hdr_append(hdr, "##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta"); - bcf_hdr_append(hdr, "##contig="); - bcf_hdr_append(hdr, "##phasing=partial"); - bcf_hdr_append(hdr, "##INFO="); - bcf_hdr_append(hdr, "##INFO="); - bcf_hdr_append(hdr, "##INFO="); - 
bcf_hdr_append(hdr, "##INFO="); - bcf_hdr_append(hdr, "##INFO="); - bcf_hdr_append(hdr, "##INFO="); - bcf_hdr_append(hdr, "##FILTER="); - bcf_hdr_append(hdr, "##FILTER="); - bcf_hdr_append(hdr, "##FORMAT="); - bcf_hdr_append(hdr, "##FORMAT="); - bcf_hdr_append(hdr, "##FORMAT="); - bcf_hdr_append(hdr, "##FORMAT="); - bcf_hdr_append(hdr, "##FORMAT="); - - bcf_hdr_add_sample(hdr, "NA00001"); - bcf_hdr_add_sample(hdr, "NA00002"); - bcf_hdr_add_sample(hdr, "NA00003"); - bcf_hdr_add_sample(hdr, NULL); // to update internal structures - bcf_hdr_write(fp, hdr); + check0(bcf_hdr_append(hdr, "##fileDate=20090805")); + check0(bcf_hdr_append(hdr, "##FORMAT=")); + check0(bcf_hdr_append(hdr, "##INFO=")); + check0(bcf_hdr_append(hdr, "##FILTER=")); + check0(bcf_hdr_append(hdr, "##unused=")); + check0(bcf_hdr_append(hdr, "##unused=unformatted text 1")); + check0(bcf_hdr_append(hdr, "##unused=unformatted text 2")); + check0(bcf_hdr_append(hdr, "##contig=")); + check0(bcf_hdr_append(hdr, "##source=myImputationProgramV3.1")); + check0(bcf_hdr_append(hdr, "##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta")); + check0(bcf_hdr_append(hdr, "##contig=")); + check0(bcf_hdr_append(hdr, "##phasing=partial")); + check0(bcf_hdr_append(hdr, "##INFO=")); + check0(bcf_hdr_append(hdr, "##INFO=")); + check0(bcf_hdr_append(hdr, "##INFO=")); + check0(bcf_hdr_append(hdr, "##INFO=")); + check0(bcf_hdr_append(hdr, "##INFO=")); + check0(bcf_hdr_append(hdr, "##INFO=")); + check0(bcf_hdr_append(hdr, "##INFO=")); + check0(bcf_hdr_append(hdr, "##FILTER=")); + check0(bcf_hdr_append(hdr, "##FILTER=")); + check0(bcf_hdr_append(hdr, "##FORMAT=")); + check0(bcf_hdr_append(hdr, "##FORMAT=")); + check0(bcf_hdr_append(hdr, "##FORMAT=")); + check0(bcf_hdr_append(hdr, "##FORMAT=")); + check0(bcf_hdr_append(hdr, "##FORMAT=")); + + // Try a few header modifications + bcf_hdr_remove(hdr, BCF_HL_CTG, "Unused"); + check0(bcf_hdr_append(hdr, "##contig=")); + bcf_hdr_remove(hdr, BCF_HL_FMT, "TS"); + 
check0(bcf_hdr_append(hdr, "##FORMAT=")); + bcf_hdr_remove(hdr, BCF_HL_INFO, "NEG"); + check0(bcf_hdr_append(hdr, "##INFO=")); + bcf_hdr_remove(hdr, BCF_HL_FLT, "s50"); + check0(bcf_hdr_append(hdr, "##FILTER=")); + + check0(bcf_hdr_add_sample(hdr, "NA00001")); + check0(bcf_hdr_add_sample(hdr, "NA00002")); + check0(bcf_hdr_add_sample(hdr, "NA00003")); + check0(bcf_hdr_add_sample(hdr, NULL)); // to update internal structures + if ( bcf_hdr_write(fp, hdr)!=0 ) error("Failed to write to %s\n", fname); // Add a record - // 20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. + // 20 14370 rs6054257 G A 29 PASS NS=3;DP=14;NEG=-127;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. // .. CHROM rec->rid = bcf_hdr_name2id(hdr, "20"); // .. POS rec->pos = 14369; // .. ID - bcf_update_id(hdr, rec, "rs6054257"); + check0(bcf_update_id(hdr, rec, "rs6054257")); // .. REF and ALT - bcf_update_alleles_str(hdr, rec, "G,A"); + check0(bcf_update_alleles_str(hdr, rec, "G,A")); // .. QUAL rec->qual = 29; // .. FILTER int32_t tmpi = bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS"); - bcf_update_filter(hdr, rec, &tmpi, 1); + check0(bcf_update_filter(hdr, rec, &tmpi, 1)); // .. 
INFO tmpi = 3; - bcf_update_info_int32(hdr, rec, "NS", &tmpi, 1); + check0(bcf_update_info_int32(hdr, rec, "NS", &tmpi, 1)); + tmpi = 500; + check0(bcf_update_info_int32(hdr, rec, "DP", &tmpi, 1)); + tmpi = 100000; + check0(bcf_update_info_int32(hdr, rec, "DP", &tmpi, 1)); tmpi = 14; - bcf_update_info_int32(hdr, rec, "DP", &tmpi, 1); + check0(bcf_update_info_int32(hdr, rec, "DP", &tmpi, 1)); + tmpi = -127; + check0(bcf_update_info_int32(hdr, rec, "NEG", &tmpi, 1)); float tmpf = 0.5; - bcf_update_info_float(hdr, rec, "AF", &tmpf, 1); - bcf_update_info_flag(hdr, rec, "DB", NULL, 1); - bcf_update_info_flag(hdr, rec, "H2", NULL, 1); + check0(bcf_update_info_float(hdr, rec, "AF", &tmpf, 1)); + check0(bcf_update_info_flag(hdr, rec, "DB", NULL, 1)); + check0(bcf_update_info_flag(hdr, rec, "H2", NULL, 1)); // .. FORMAT int32_t *tmpia = (int*)malloc(bcf_hdr_nsamples(hdr)*2*sizeof(int)); tmpia[0] = bcf_gt_phased(0); @@ -104,50 +137,66 @@ void write_bcf(char *fname) tmpia[3] = bcf_gt_phased(0); tmpia[4] = bcf_gt_unphased(1); tmpia[5] = bcf_gt_unphased(1); - bcf_update_genotypes(hdr, rec, tmpia, bcf_hdr_nsamples(hdr)*2); + check0(bcf_update_genotypes(hdr, rec, tmpia, bcf_hdr_nsamples(hdr)*2)); tmpia[0] = 48; tmpia[1] = 48; tmpia[2] = 43; - bcf_update_format_int32(hdr, rec, "GQ", tmpia, bcf_hdr_nsamples(hdr)); + check0(bcf_update_format_int32(hdr, rec, "GQ", tmpia, bcf_hdr_nsamples(hdr))); + tmpia[0] = 0; + tmpia[1] = 0; + tmpia[2] = 1; + check0(bcf_update_format_int32(hdr, rec, "DP", tmpia, bcf_hdr_nsamples(hdr))); + tmpia[0] = 1; + tmpia[1] = 100000; + tmpia[2] = 1; + check0(bcf_update_format_int32(hdr, rec, "DP", tmpia, bcf_hdr_nsamples(hdr))); tmpia[0] = 1; tmpia[1] = 8; tmpia[2] = 5; - bcf_update_format_int32(hdr, rec, "DP", tmpia, bcf_hdr_nsamples(hdr)); + check0(bcf_update_format_int32(hdr, rec, "DP", tmpia, bcf_hdr_nsamples(hdr))); tmpia[0] = 51; tmpia[1] = 51; tmpia[2] = 51; tmpia[3] = 51; tmpia[4] = bcf_int32_missing; tmpia[5] = bcf_int32_missing; - 
bcf_update_format_int32(hdr, rec, "HQ", tmpia, bcf_hdr_nsamples(hdr)*2); + check0(bcf_update_format_int32(hdr, rec, "HQ", tmpia, bcf_hdr_nsamples(hdr)*2)); char *tmp_str[] = {"String1","SomeOtherString2","YetAnotherString3"}; - bcf_update_format_string(hdr, rec, "TS", (const char**)tmp_str, 3); - bcf_write1(fp, hdr, rec); - - // 20 1110696 . A G,T 67 . NS=2;DP=10;AF=0.333,.;AA=T;DB GT 2 1 ./. + check0(bcf_update_format_string(hdr, rec, "TS", (const char**)tmp_str, 3)); + tmp_str[0] = "LongerStringRequiringBufferReallocation"; + check0(bcf_update_format_string(hdr, rec, "TS", (const char**)tmp_str, 3)); + tmp_str[0] = "String1"; + check0(bcf_update_format_string(hdr, rec, "TS", (const char**)tmp_str, 3)); + if ( bcf_write1(fp, hdr, rec)!=0 ) error("Failed to write to %s\n", fname); + + // 20 1110696 . A G,T 67 . NS=2;DP=10;NEG=-128;AF=0.333,.;AA=T;DB GT 2 1 ./. bcf_clear1(rec); rec->rid = bcf_hdr_name2id(hdr, "20"); rec->pos = 1110695; - bcf_update_alleles_str(hdr, rec, "A,G,T"); + check0(bcf_update_alleles_str(hdr, rec, "A,G,T")); rec->qual = 67; tmpi = 2; - bcf_update_info_int32(hdr, rec, "NS", &tmpi, 1); + check0(bcf_update_info_int32(hdr, rec, "NS", &tmpi, 1)); tmpi = 10; - bcf_update_info_int32(hdr, rec, "DP", &tmpi, 1); + check0(bcf_update_info_int32(hdr, rec, "DP", &tmpi, 1)); + tmpi = -128; + check0(bcf_update_info_int32(hdr, rec, "NEG", &tmpi, 1)); float *tmpfa = (float*)malloc(2*sizeof(float)); tmpfa[0] = 0.333; bcf_float_set_missing(tmpfa[1]); - bcf_update_info_float(hdr, rec, "AF", tmpfa, 2); - bcf_update_info_string(hdr, rec, "AA", "T"); - bcf_update_info_flag(hdr, rec, "DB", NULL, 1); + check0(bcf_update_info_float(hdr, rec, "AF", tmpfa, 2)); + check0(bcf_update_info_string(hdr, rec, "AA", "SHORT")); + check0(bcf_update_info_string(hdr, rec, "AA", "LONGSTRING")); + check0(bcf_update_info_string(hdr, rec, "AA", "T")); + check0(bcf_update_info_flag(hdr, rec, "DB", NULL, 1)); tmpia[0] = bcf_gt_phased(2); tmpia[1] = bcf_int32_vector_end; tmpia[2] = 
bcf_gt_phased(1); tmpia[3] = bcf_int32_vector_end; tmpia[4] = bcf_gt_missing; tmpia[5] = bcf_gt_missing; - bcf_update_genotypes(hdr, rec, tmpia, bcf_hdr_nsamples(hdr)*2); - bcf_write1(fp, hdr, rec); + check0(bcf_update_genotypes(hdr, rec, tmpia, bcf_hdr_nsamples(hdr)*2)); + if ( bcf_write1(fp, hdr, rec)!=0 ) error("Failed to write to %s\n", fname); free(tmpia); free(tmpfa); @@ -167,12 +216,17 @@ void write_bcf(char *fname) void bcf_to_vcf(char *fname) { htsFile *fp = hts_open(fname,"rb"); + if (!fp) error("Failed to open \"%s\" : %s", fname, strerror(errno)); bcf_hdr_t *hdr = bcf_hdr_read(fp); + if (!hdr) error("bcf_hdr_read : %s", strerror(errno)); bcf1_t *rec = bcf_init1(); + if (!rec) error("bcf_init1 : %s", strerror(errno)); char *gz_fname = (char*) malloc(strlen(fname)+4); + if (!gz_fname) error("malloc : %s", strerror(errno)); snprintf(gz_fname,strlen(fname)+4,"%s.gz",fname); htsFile *out = hts_open(gz_fname,"wg"); + if (!out) error("Couldn't open \"%s\" : %s\n", gz_fname, strerror(errno)); bcf_hdr_t *hdr_out = bcf_hdr_dup(hdr); bcf_hdr_remove(hdr_out,BCF_HL_STR,"unused"); @@ -181,31 +235,32 @@ void bcf_to_vcf(char *fname) bcf_hdr_remove(hdr_out,BCF_HL_INFO,"UI"); bcf_hdr_remove(hdr_out,BCF_HL_FMT,"UF"); bcf_hdr_remove(hdr_out,BCF_HL_CTG,"Unused"); - bcf_hdr_write(out, hdr_out); - - while ( bcf_read1(fp, hdr, rec)>=0 ) + if ( bcf_hdr_write(out, hdr_out)!=0 ) error("Failed to write to %s\n", fname); + int r; + while ((r = bcf_read1(fp, hdr, rec)) >= 0) { - bcf_write1(out, hdr_out, rec); + if ( bcf_write1(out, hdr_out, rec)!=0 ) error("Failed to write to %s\n", fname); // Test problems caused by bcf1_sync: the data block // may be realloced, also the unpacked structures must // get updated. 
- bcf_unpack(rec, BCF_UN_STR); - bcf_update_id(hdr, rec, 0); - bcf_update_format_int32(hdr, rec, "GQ", NULL, 0); + check0(bcf_unpack(rec, BCF_UN_STR)); + check0(bcf_update_id(hdr, rec, 0)); + check0(bcf_update_format_int32(hdr, rec, "GQ", NULL, 0)); bcf1_t *dup = bcf_dup(rec); // force bcf1_sync call - bcf_write1(out, hdr_out, dup); + if ( bcf_write1(out, hdr_out, dup)!=0 ) error("Failed to write to %s\n", fname); bcf_destroy1(dup); - bcf_update_alleles_str(hdr_out, rec, "G,A"); + check0(bcf_update_alleles_str(hdr_out, rec, "G,A")); int32_t tmpi = 99; - bcf_update_info_int32(hdr_out, rec, "DP", &tmpi, 1); + check0(bcf_update_info_int32(hdr_out, rec, "DP", &tmpi, 1)); int32_t tmpia[] = {9,9,9}; - bcf_update_format_int32(hdr_out, rec, "DP", tmpia, 3); + check0(bcf_update_format_int32(hdr_out, rec, "DP", tmpia, 3)); - bcf_write1(out, hdr_out, rec); + if ( bcf_write1(out, hdr_out, rec)!=0 ) error("Failed to write to %s\n", fname); } + if (r < -1) error("bcf_read1"); bcf_destroy1(rec); bcf_hdr_destroy(hdr); @@ -250,7 +305,9 @@ void bcf_to_vcf(char *fname) void iterator(const char *fname) { htsFile *fp = hts_open(fname, "r"); + if (!fp) error("Failed to open \"%s\" : %s", fname, strerror(errno)); bcf_hdr_t *hdr = bcf_hdr_read(fp); + if (!hdr) error("bcf_hdr_read : %s", strerror(errno)); hts_idx_t *idx; hts_itr_t *iter; @@ -276,12 +333,16 @@ void iterator(const char *fname) void test_get_info_values(const char *fname) { htsFile *fp = hts_open(fname, "r"); + if (!fp) error("Failed to open \"%s\" : %s", fname, strerror(errno)); bcf_hdr_t *hdr = bcf_hdr_read(fp); + if (!hdr) error("bcf_hdr_read : %s", strerror(errno)); bcf1_t *line = bcf_init(); - - while (bcf_read(fp, hdr, line) == 0) + if (!line) error("bcf_init : %s", strerror(errno)); + int r; + while ((r = bcf_read(fp, hdr, line)) == 0) { float *afs = 0; + int32_t *negs = NULL; int count = 0; int ret = bcf_get_info_float(hdr, line, "AF", &afs, &count); @@ -303,7 +364,23 @@ void test_get_info_values(const char *fname) } 
free(afs); + + int32_t expected = (line->pos == 14369)? -127 : -128; + count = 0; + ret = bcf_get_info_int32(hdr, line, "NEG", &negs, &count); + if (ret != 1 || negs[0] != expected) + { + if (ret < 0) + fprintf(stderr, "NEG should be %d, got error ret=%d\n", expected, ret); + else if (ret == 0) + fprintf(stderr, "NEG should be %d, got no entries\n", expected); + else + fprintf(stderr, "NEG should be %d, got %d entries (first is %d)\n", expected, ret, negs[0]); + exit(1); + } + free(negs); } + if (r < -1) error("bcf_read"); bcf_destroy(line); bcf_hdr_destroy(hdr); @@ -314,15 +391,18 @@ void write_format_values(const char *fname) { // Init htsFile *fp = hts_open(fname, "wb"); + if (!fp) error("Failed to open \"%s\" : %s", fname, strerror(errno)); bcf_hdr_t *hdr = bcf_hdr_init("w"); + if (!hdr) error("bcf_hdr_init : %s", strerror(errno)); bcf1_t *rec = bcf_init1(); + if (!rec) error("bcf_init1 : %s", strerror(errno)); // Create VCF header - bcf_hdr_append(hdr, "##contig="); - bcf_hdr_append(hdr, "##FORMAT="); - bcf_hdr_add_sample(hdr, "S"); - bcf_hdr_add_sample(hdr, NULL); // to update internal structures - bcf_hdr_write(fp, hdr); + check0(bcf_hdr_append(hdr, "##contig=")); + check0(bcf_hdr_append(hdr, "##FORMAT=")); + check0(bcf_hdr_add_sample(hdr, "S")); + check0(bcf_hdr_add_sample(hdr, NULL)); // to update internal structures + if ( bcf_hdr_write(fp, hdr)!=0 ) error("Failed to write to %s\n", fname); // Add a record // .. 
FORMAT @@ -331,8 +411,8 @@ void write_format_values(const char *fname) test[1] = 47.11f; bcf_float_set_vector_end(test[2]); test[3] = -1.2e-13; - bcf_update_format_float(hdr, rec, "TF", test, 4); - bcf_write1(fp, hdr, rec); + check0(bcf_update_format_float(hdr, rec, "TF", test, 4)); + if ( bcf_write1(fp, hdr, rec)!=0 ) error("Failed to write to %s\n", fname); bcf_destroy1(rec); bcf_hdr_destroy(hdr); diff --git a/test/test-vcf-api.out b/test/test-vcf-api.out index d3bb73e64..dd2f4f984 100644 --- a/test/test-vcf-api.out +++ b/test/test-vcf-api.out @@ -13,16 +13,17 @@ ##INFO= ##INFO= ##FILTER= -##FILTER= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= +##INFO= +##FILTER= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 -20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ:TS 0|0:48:1:51,51:String1 1|0:48:8:51,51:SomeOtherString2 1/1:43:5:.,.:YetAnotherString3 -20 14370 . G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:DP:HQ:TS 0|0:1:51,51:String1 1|0:8:51,51:SomeOtherString2 1/1:5:.,.:YetAnotherString3 -20 14370 . G A 29 PASS NS=3;DP=99;AF=0.5;DB;H2 GT:DP:HQ:TS 0|0:9:51,51:String1 1|0:9:51,51:SomeOtherString2 1/1:9:.,.:YetAnotherString3 -20 1110696 . A G,T 67 . NS=2;DP=10;AF=0.333,.;AA=T;DB GT 2 1 ./. -20 1110696 . A G,T 67 . NS=2;DP=10;AF=0.333,.;AA=T;DB GT 2 1 ./. -20 1110696 . G A 67 . NS=2;DP=99;AF=0.333,.;AA=T;DB GT:DP 2:9 1:9 ./.:9 +20 14370 rs6054257 G A 29 PASS NS=3;DP=14;NEG=-127;AF=0.5;DB;H2 GT:GQ:DP:HQ:TS 0|0:48:1:51,51:String1 1|0:48:8:51,51:SomeOtherString2 1/1:43:5:.,.:YetAnotherString3 +20 14370 . G A 29 PASS NS=3;DP=14;NEG=-127;AF=0.5;DB;H2 GT:DP:HQ:TS 0|0:1:51,51:String1 1|0:8:51,51:SomeOtherString2 1/1:5:.,.:YetAnotherString3 +20 14370 . G A 29 PASS NS=3;DP=99;NEG=-127;AF=0.5;DB;H2 GT:DP:HQ:TS 0|0:9:51,51:String1 1|0:9:51,51:SomeOtherString2 1/1:9:.,.:YetAnotherString3 +20 1110696 . A G,T 67 . NS=2;DP=10;NEG=-128;AF=0.333,.;AA=T;DB GT 2 1 ./. +20 1110696 . A G,T 67 . NS=2;DP=10;NEG=-128;AF=0.333,.;AA=T;DB GT 2 1 ./. 
+20 1110696 . G A 67 . NS=2;DP=99;NEG=-128;AF=0.333,.;AA=T;DB GT:DP 2:9 1:9 ./.:9 diff --git a/test/test-vcf-sweep.c b/test/test-vcf-sweep.c index 44c2fb140..159636611 100644 --- a/test/test-vcf-sweep.c +++ b/test/test-vcf-sweep.c @@ -1,6 +1,6 @@ /* test/test-vcf-sweep.c -- VCF test harness. - Copyright (C) 2013 Genome Research Ltd. + Copyright (C) 2013-2014 Genome Research Ltd. Author: Petr Danecek diff --git a/test/test.pl b/test/test.pl index 6e3830046..3e3081c5b 100755 --- a/test/test.pl +++ b/test/test.pl @@ -1,6 +1,6 @@ #!/usr/bin/env perl # -# Copyright (C) 2012-2013 Genome Research Ltd. +# Copyright (C) 2012-2019 Genome Research Ltd. # # Author: Petr Danecek # @@ -33,9 +33,21 @@ my $opts = parse_params(); +test_bgzip($opts, 0); +test_bgzip($opts, 4); + +ce_fa_to_md5_cache($opts); +test_index($opts, 0); +test_index($opts, 4); + +test_multi_ref($opts,0); +test_multi_ref($opts,4); + test_view($opts,0); test_view($opts,4); +test_MD($opts); + test_vcf_api($opts,out=>'test-vcf-api.out'); test_vcf_sweep($opts,out=>'test-vcf-sweep.out'); test_vcf_various($opts); @@ -106,8 +118,8 @@ sub parse_params $$opts{bin} = $FindBin::RealBin; $$opts{bin} =~ s{/test/?$}{}; if ($^O =~ /^msys/) { - $$opts{path} = cygpath($$opts{path}); - $$opts{bin} = cygpath($$opts{bin}); + $$opts{path} = cygpath($$opts{path}); + $$opts{bin} = cygpath($$opts{bin}); } return $opts; @@ -127,9 +139,9 @@ sub _cmd } else { - # Example of how to embed Valgrind into the testing framework. - # TEST_PRECMD="valgrind --leak-check=full --suppressions=$ENV{HOME}/valgrind.supp" make check - $cmd = "$ENV{TEST_PRECMD} $cmd" if exists $ENV{TEST_PRECMD}; + # Example of how to embed Valgrind into the testing framework. 
+ # TEST_PRECMD="valgrind --leak-check=full --suppressions=$ENV{HOME}/valgrind.supp" make check + $cmd = "$ENV{TEST_PRECMD} $cmd" if exists $ENV{TEST_PRECMD}; # child exec('bash', '-o','pipefail','-c', $cmd) or error("Cannot execute the command [/bin/sh -o pipefail -c $cmd]: $!"); @@ -204,6 +216,65 @@ sub test_cmd } passed($opts,$test); } + +# Run cmd, producing file out, and compare contents against exp +sub test_compare +{ + my ($opts,$cmd,$exp_fn,$out_fn, %args) = @_; + my ($package, $filename, $line, $test)=caller(1); + $test =~ s/^.+:://; + + print "$test:\n\t$cmd\n"; + + my ($ret,$stdout) = _cmd($cmd); + if ( $ret ) { failed($opts,$test); return; } + + local $/; + my ($exp,$out) = ("",""); + if ( exists($args{"gz"}) ) { + if ( open(my $fh,'-|',"$$opts{bin}/bgzip -d < $exp_fn") ) { + $exp = <$fh>; + close($fh); + } else { + failed($opts,$test,"bgzip -d < $exp_fn $!"); return; + } + } else { + if ( open(my $fh,'<',$exp_fn) ) { + $exp = <$fh>; + close($fh); + } else { + failed($opts,$test,"$exp_fn $!"); return; + } + } + + if ( exists($args{"gz"}) ) { + if ( open(my $fh,'-|',"$$opts{bin}/bgzip -d < $out_fn") ) { + $out = <$fh>; + close($fh); + } else { + failed($opts,$test,"bgzip -d < $out_fn $!"); return; + } + } else { + if ( open(my $fh,'<',$out_fn) ) { + $out = <$fh>; + close($fh); + } else { + failed($opts,$test,"$out_fn $!"); return; + } + } + + if (exists($args{fix_newlines})) { + $exp =~ s/\015\012/\n/g; + $out =~ s/\015\012/\n/g; + } + + if ( $exp ne $out ) + { + failed($opts,$test,"The outputs differ:\n\t\t$exp_fn\n\t\t$out_fn"); + return; + } + passed($opts,$test); +} sub failed { my ($opts,$test,$reason) = @_; @@ -232,9 +303,146 @@ sub is_file_newer return 0; } +sub ce_fa_to_md5_cache { + my ($opts) = @_; + + # These should really be worked out from the file contents, but + # pre-calculating them avoids a dependency on Digest::MD5 + my %csums = (CHROMOSOME_I => '8ede36131e0dbf3417807e48f77f3ebd', + CHROMOSOME_II => '8e7993f7a93158587ee897d7287948ec', 
+ CHROMOSOME_III => '3adcb065e1cf74fafdbba1e8c352b323', + CHROMOSOME_IV => '251af66a69ee589c9f3757340ec2de6f', + CHROMOSOME_V => 'cf200a65fb754836dcc56b24b3170ee8', + CHROMOSOME_X => '6f9368fd2192c89c613718399d2d31fc', + CHROMOSOME_MtDNA => 'cd05857ece6411f40257a565ccfe15bb'); + + my $m5_dir = "$$opts{tmp}/md5"; + if (!-d $m5_dir) { + mkdir($m5_dir) || die "Couldn't make directory $m5_dir\n"; + } + my $out; + open(my $fa, '<', "$$opts{path}/ce.fa") + || die "Couldn't open $$opts{path}/ce.fa : $!\n"; + my $name = ''; + while (<$fa>) { + chomp; + if (/^>(\S+)/) { + if ($out) { + close($out) || die "Error closing $m5_dir/$csums{$name} : $!\n"; + } + $name = $1; + if (!exists($csums{$name})) { + die "Unexpected fasta entry : $name\n"; + } + open($out, '>', "$m5_dir/$csums{$name}") + } else { + if (!$out) { + die "$$opts{path}/ce.fa : Got data before fasta header\n"; + } + $_ = uc($_); + s/\s+//g; + print $out $_; + } + } + if ($out) { + close($out) || die "Error closing $m5_dir/$csums{$name} : $!\n"; + } + close($fa) || die "Error reading $$opts{path}/ce.fa : $!\n"; + $$opts{m5_dir} = $m5_dir; +} + # The tests -------------------------- +sub test_bgzip { + my ($opts, $threads) = @_; + + my $at = $threads ? "-@ $threads" : ''; + my $data = "$$opts{path}/ce.fa"; + my $compressed = "$$opts{tmp}/ce.fa.$threads.gz"; + my $compressed_copy = "$$opts{tmp}/ce.fa.$threads.copy.gz"; + my $uncompressed = "$$opts{tmp}/ce.fa.$threads.uncomp"; + my $offset = 1055584; # Start of MT in ce.fa + my $uncompressed_part = "$$opts{tmp}/ce.fa.$threads.part"; + my $uncompressed_part2 = "$$opts{tmp}/ce.fa.$threads.part2"; + my $expected_part = "$$opts{tmp}/ce.fa.$threads.tail"; + my $index = "${compressed}.gzi"; + my $test = sprintf('%s %2s threads', 'bgzip round-trip', + $threads ? 
$threads : 'no'); + + # Round-trip test + print "$test: "; + my $c = "$$opts{bin}/bgzip $at -i -I '$index' < '$data' > '$compressed'"; + my ($ret, $out) = _cmd($c); + if ($ret) { + failed($opts, $test, "non-zero exit from $c"); + return; + } + $c = "$$opts{bin}/bgzip $at -d < '$compressed' > '$uncompressed'"; + ($ret, $out) = _cmd($c); + if ($ret) { + failed($opts, $test, "non-zero exit from $c"); + return; + } + $c = "cmp '$data' '$uncompressed'"; + ($ret, $out) = _cmd($c); + if ($ret) { + failed($opts, $test, $out ? $out : "'$data' '$uncompressed' differ"); + return; + } + passed($opts,$test); + + # Extract from an offset + $test = sprintf('%s %2s threads', 'bgzip -b', + $threads ? $threads : 'no'); + print "$test: "; + $c = sprintf("tail -c +%d '%s' > '%s'", $offset + 1, $data, $expected_part); + ($ret, $out) = _cmd($c); + if ($ret) { + failed($opts, $test, "non-zero exit from $c"); + return; + } + $c = "$$opts{bin}/bgzip $at -b $offset -d '$compressed' > $uncompressed_part"; + ($ret, $out) = _cmd($c); + if ($ret) { + failed($opts, $test, "non-zero exit from $c"); + return; + } + $c = "cmp '$expected_part' '$uncompressed_part'"; + ($ret, $out) = _cmd($c); + if ($ret) { + failed($opts, $test, + $out ? $out : "'$expected_part' '$uncompressed_part' differ"); + return; + } + passed($opts,$test); + + # Extract from an offset with named index + $test = sprintf('%s %2s threads', 'bgzip -b -I', + $threads ? $threads : 'no'); + print "$test: "; + $c = "cp '$compressed' '$compressed_copy'"; + ($ret, $out) = _cmd($c); + if ($ret) { + failed($opts, $test, "non-zero exit from $c"); + return; + } + $c = "$$opts{bin}/bgzip $at -b $offset -d -I '$index' '$compressed_copy' > $uncompressed_part2"; + ($ret, $out) = _cmd($c); + if ($ret) { + failed($opts, $test, "non-zero exit from $c"); + return; + } + $c = "cmp '$expected_part' '$uncompressed_part2'"; + ($ret, $out) = _cmd($c); + if ($ret) { + failed($opts, $test, + $out ? 
$out : "'$expected_part' '$uncompressed_part2' differ"); + return; + } + passed($opts,$test); +} + my $test_view_failures; sub testv { my ($opts, $cmd) = @_; @@ -251,6 +459,60 @@ sub testv { } } +sub fake_multi_ref_data +{ + open(SAM, ">multi_ref.tmp.sam") || die; + for (my $r=0;$r<1000;$r++) { + print SAM "\@SQ\tSN:c$r\tLN:10000\n"; + } + + # Single ref + my $rnum=0; + for (my $p=1;$p<1000;$p++) { + print SAM "X\t0\tc$rnum\t$p\t40\t10M\t*\t0\t0\tCCTAGCCCTA\tB?8B?BACCD\n"; + } + + # Multi ref; 1 seq per ref + for (my $r=1;$r<300;$r++) { + print SAM "X\t0\tc$rnum\t1\t40\t10M\t*\t0\t0\tCCTAGCCCTA\tB?8B?BACCD\n"; + $rnum++; + } + + # Single ref again + for (my $p=1;$p<1000;$p++) { + print SAM "X\t0\tc$rnum\t$p\t40\t10M\t*\t0\t0\tCCTAGCCCTA\tB?8B?BACCD\n"; + } + + # Multi ref; 1 seq per ref + for (my $r=1;$r<300;$r++) { + print SAM "X\t0\tc$rnum\t1\t40\t10M\t*\t0\t0\tCCTAGCCCTA\tB?8B?BACCD\n"; + $rnum++; + } + close(SAM); +} + +sub test_multi_ref +{ + my ($opts, $nthreads) = @_; + my $tv_args = $nthreads ? 
"-\@$nthreads" : ""; + + fake_multi_ref_data; + print "test_view testing multi-ref CRAM modes:\n"; + $test_view_failures = 0; + + for (my $mf = -1; $mf <= 1; $mf++) { + testv $opts, "./test_view $tv_args -o seqs_per_slice=100 -o no_ref=1 -o multi_seq_per_slice=$mf -S -C multi_ref.tmp.sam > multi_ref.tmp.cram"; + testv $opts, "./test_view $tv_args multi_ref.tmp.cram > multi_ref.tmp.sam_"; + testv $opts, "./compare_sam.pl multi_ref.tmp.sam multi_ref.tmp.sam_"; + } + + if ($test_view_failures == 0) { + passed($opts, "multi-ref conversions"); + } else { + failed($opts, "multi-ref conversions", "$test_view_failures subtests failed"); + } +} + sub test_view { my ($opts, $nthreads) = @_; @@ -313,8 +575,8 @@ sub test_view # CRAM2 -> CRAM3 testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=3.0 $cram.cram > $cram"; - # CRAM3 -> CRAM3 + multi-slice - testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=3.0 -o seqs_per_slice=7 -o slices_per_container=5 $cram.cram > $cram"; + # CRAM3 -> CRAM3 + multi-slice + testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=3.0 -o seqs_per_slice=7 -o slices_per_container=5 $cram.cram > $cram"; testv $opts, "./test_view $tv_args $cram > $cram.sam_"; testv $opts, "./compare_sam.pl $md $sam $cram.sam_"; @@ -342,12 +604,188 @@ sub test_view # command line and nowhere else. REF_PATH should also point to nowhere # (currently done by the Makefile). This is to test the refseq reference # counting and reload (Issue #654). 
+ print "test_view testing region queries:\n"; + $test_view_failures = 0; + my $regions = "CHROMOSOME_II:2980-2980 CHROMOSOME_IV:1500-1500 CHROMOSOME_II:2980-2980 CHROMOSOME_I:1000-1100"; testv $opts, "./test_view $tv_args -i reference=ce.fa range.cram $regions > range.tmp"; testv $opts, "./compare_sam.pl range.tmp range.out"; testv $opts, "./test_view $tv_args range.bam $regions > range.tmp"; testv $opts, "./compare_sam.pl range.tmp range.out"; + + if ($test_view_failures == 0) { + passed($opts, "range.cram tests"); + } else { + failed($opts, "range.cram tests", "$test_view_failures subtests failed"); + } + + # Test BAM files with references in targets list but no corresponding @SQ + # lines in the text header. + print "test_view testing BAM files with absent \@SQ lines:\n"; + $test_view_failures = 0; + testv $opts, "./test_view $tv_args -p no_hdr_sq_1.tmp.sam no_hdr_sq_1.bam"; + testv $opts, "./compare_sam.pl no_hdr_sq_1.tmp.sam no_hdr_sq_1.expected.sam"; + + # Try a range query to ensure id <-> name mapping works + # Input only has reads from CHROMOSOME_I, so same "expected" file is used + testv $opts, "./test_view $tv_args -p no_hdr_sq_1.chr1.tmp.sam no_hdr_sq_1.bam CHROMOSOME_I"; + testv $opts, "./compare_sam.pl no_hdr_sq_1.chr1.tmp.sam no_hdr_sq_1.expected.sam"; + if ($test_view_failures == 0) { + passed($opts, "no_hdr_sq tests"); + } else { + failed($opts, "no_hdr_sq tests", "$test_view_failures subtests failed"); + } + + # File with large (> 2Gbases) positions + # Only works for SAM at the moment, but we can still round-trip it. 
+ print "test_view testing large (> 2Gbases) positions:\n"; + $test_view_failures = 0; + testv $opts, "./test_view $tv_args -z -p longrefs/longref.tmp.sam.gz -x longrefs/longref.tmp.sam.gz.csi.otf -m 14 longrefs/longref.sam"; + testv $opts, "./test_view $tv_args -p longrefs/longref.tmp.sam_ longrefs/longref.tmp.sam.gz"; + testv $opts, "./compare_sam.pl longrefs/longref.sam longrefs/longref.tmp.sam_"; + + # CRAM disabled for now as the positions cannot be 32-bit. (These tests are useful for + # checking SQ headers only.) + # testv $opts, "./test_view $tv_args -C -o no_ref -p longrefs/longref.tmp.cram longrefs/longref.sam"; + # testv $opts, "./test_view $tv_args -p longrefs/longref.tmp.sam_ longrefs/longref.tmp.cram"; + # testv $opts, "./compare_sam.pl longrefs/longref.sam longrefs/longref.tmp.sam_"; + + # Build index and compare with on-the-fly one made earlier. + test_compare $opts, "$$opts{path}/test_index -c longrefs/longref.tmp.sam.gz", "longrefs/longref.tmp.sam.gz.csi.otf", "longrefs/longref.tmp.sam.gz.csi", gz=>1; + + # Large position iterator tests + testv $opts, "./test_view $tv_args -p longrefs/longref_itr.tmp.sam longrefs/longref.tmp.sam.gz CHROMOSOME_I:10000000000-10000000003"; + testv $opts, "./compare_sam.pl longrefs/longref_itr.expected.sam longrefs/longref_itr.tmp.sam"; + testv $opts, "./test_view $tv_args -M -p longrefs/longref_multi.tmp.sam longrefs/longref.tmp.sam.gz CHROMOSOME_I:10000000000-10000000003 CHROMOSOME_I:10000000100-10000000110"; + testv $opts, "./compare_sam.pl longrefs/longref_multi.expected.sam longrefs/longref_multi.tmp.sam"; + + # VCF round trip + unlink("longrefs/index.tmp.vcf.gz.csi"); # To stop vcf_hdr_read from reading a stale index + testv $opts, "./test_view $tv_args -z -p longrefs/index.tmp.vcf.gz -x longrefs/index.tmp.vcf.gz.csi.otf -m 14 longrefs/index.vcf"; + testv $opts, "./test_view $tv_args -p longrefs/index.tmp.vcf_ longrefs/index.tmp.vcf.gz"; + testv $opts, "cmp longrefs/index.vcf longrefs/index.tmp.vcf_"; + + # 
Build index and compare with on-the-fly one made earlier. + test_compare $opts, "$$opts{path}/test_index -c longrefs/index.tmp.vcf.gz", "longrefs/index.tmp.vcf.gz.csi.otf", "longrefs/index.tmp.vcf.gz.csi", gz=>1; + + # test_view can't do indexed look-ups on vcf, but we can use tabix + test_compare $opts, "$$opts{bin}/tabix longrefs/index.tmp.vcf.gz 1:10010000100-10010000105 > longrefs/index.tmp.tabix1.vcf", "longrefs/index.expected1.vcf", "longrefs/index.tmp.tabix1.vcf", fix_newlines => 1; + test_compare $opts, "$$opts{bin}/tabix longrefs/index.tmp.vcf.gz 1:10010000120-10010000130 > longrefs/index.tmp.tabix2.vcf", "longrefs/index.expected2.vcf", "longrefs/index.tmp.tabix2.vcf", fix_newlines => 1; + + if ($test_view_failures == 0) { + passed($opts, "large position tests"); + } else { + failed($opts, "large position tests", "$test_view_failures subtests failed"); + } +} + +# Tests CRAM's ability to correctly preserve MD and NM, irrespective of whether +# they are correct. +sub test_MD +{ + my ($opts) = @_; + + foreach my $sam (glob("*#MD*.sam")) { + my ($base, $ref) = ($sam =~ /((.*)#.*)\.sam/); + $ref .= ".fa"; + + my $bam = "$base.tmp.bam"; + my $cram = "$base.tmp.cram"; + + print "\ntest_MD testing $sam, ref $ref:\n"; + $test_view_failures = 0; + $cram = "$base.tmp.cram"; + + # Forcibly store MD and NM and don't auto-generate. + # ALL NM/MD should match and be present only when originally present + testv $opts, "./test_view -o store_nm=1 -o store_md=1 -t $ref -C $sam > $cram"; + testv $opts, "./test_view -i decode_md=0 -D $cram > $cram.sam_"; + testv $opts, "./compare_sam.pl $sam $cram.sam_"; + + # Skip auto-MD generation; check MD iff in output file. + # (NB this does not check that all erroneous values are stored.) 
+ testv $opts, "./test_view -t $ref -C $sam > $cram"; + testv $opts, "./test_view -i decode_md=0 -D $cram > $cram.sam_"; + testv $opts, "./compare_sam.pl -partialmd=2 $sam $cram.sam_"; + + # Also check we haven't added NM or MD needlessly for xx#MD.sam. + # This file has no errors so without auto-generation there must be + # no NM or MD records. + if ($sam eq "xx#MD.sam") { + print " Checking for MD/NM in $sam\n"; + open(my $fh, "<$cram.sam_") || die; + while (<$fh>) { + if (/(MD|NM):/) { + print STDERR "Failed\nLine contains MD/NM:\n$_"; + $test_view_failures++; + last; + } + } + close($fh); + } + + # Force auto-MD generation; check MD iff in input file. + # This will ensure any erroneous values have been round-tripped. + testv $opts, "./test_view -t $ref -C $sam > $cram"; + testv $opts, "./test_view -i decode_md=1 -D $cram > $cram.sam_"; + testv $opts, "./compare_sam.pl -partialmd=1 $sam $cram.sam_"; + + if ($test_view_failures == 0) { + passed($opts, "$sam MD tests"); + } else { + failed($opts, "$sam MD tests", "$test_view_failures subtests failed"); + } + } +} + +sub test_index +{ + my ($opts, $nthreads) = @_; + $nthreads = $nthreads ? 
"-\@$nthreads" : ""; + + # BAM + test_compare($opts,"$$opts{path}/test_view $nthreads -l 0 -b -m 14 -x $$opts{tmp}/index.bam.csi $$opts{path}/index.sam > $$opts{tmp}/index.bam", "$$opts{tmp}/index.bam.csi", "$$opts{path}/index.bam.csi", gz=>1); + unlink("$$opts{tmp}/index.bam.csi"); + test_compare($opts,"$$opts{path}/test_index -c $$opts{tmp}/index.bam", "$$opts{tmp}/index.bam.csi", "$$opts{path}/index.bam.csi", gz=>1); + test_compare($opts,"$$opts{path}/test_view $nthreads -l 0 -b -m 0 -x $$opts{tmp}/index.bam.bai $$opts{path}/index.sam > $$opts{tmp}/index.bam", "$$opts{tmp}/index.bam.bai", "$$opts{path}/index.bam.bai"); + unlink("$$opts{tmp}/index.bam.bai"); + test_compare($opts,"$$opts{path}/test_index -b $$opts{tmp}/index.bam", "$$opts{tmp}/index.bam.bai", "$$opts{path}/index.bam.bai"); + + # SAM + test_compare($opts,"$$opts{path}/test_view $nthreads -l 0 -z -m 14 -x $$opts{tmp}/index.sam.gz.csi $$opts{path}/index.sam > $$opts{tmp}/index.sam.gz", "$$opts{tmp}/index.sam.gz.csi", "$$opts{path}/index.sam.gz.csi", gz=>1); + unlink("$$opts{tmp}/index.bam.bai"); + test_compare($opts,"$$opts{path}/test_index -c $$opts{tmp}/index.sam.gz", "$$opts{tmp}/index.sam.gz.csi", "$$opts{path}/index.sam.gz.csi", gz=>1); + test_compare($opts,"$$opts{path}/test_view $nthreads -l 0 -z -m 0 -x $$opts{tmp}/index.sam.gz.bai $$opts{path}/index.sam > $$opts{tmp}/index.sam.gz", "$$opts{tmp}/index.sam.gz.bai", "$$opts{path}/index.sam.gz.bai"); + unlink("$$opts{tmp}/index.sam.gz.bai"); + test_compare($opts,"$$opts{path}/test_index -b $$opts{tmp}/index.sam.gz", "$$opts{tmp}/index.sam.gz.bai", "$$opts{path}/index.sam.gz.bai"); + + # CRAM + local $ENV{REF_PATH} = $$opts{m5_dir}; + test_compare($opts,"$$opts{path}/test_view $nthreads -l 0 -C -x $$opts{tmp}/index.cram.crai $$opts{path}/index.sam > $$opts{tmp}/index.cram", "$$opts{tmp}/index.cram.crai", "$$opts{path}/index.cram.crai", gz=>1); + unlink("$$opts{tmp}/index.cram.crai"); + test_compare($opts,"$$opts{path}/test_index 
$$opts{tmp}/index.cram", "$$opts{tmp}/index.cram.crai", "$$opts{path}/index.cram.crai", gz=>1); + + # BCF + test_compare($opts,"$$opts{path}/test_view $nthreads -l 0 -b -m 14 -x $$opts{tmp}/index.bcf.csi $$opts{path}/index.vcf > $$opts{tmp}/index.bcf", "$$opts{tmp}/index.bcf.csi", "$$opts{path}/index.bcf.csi", gz=>1); + unlink("$$opts{tmp}/index.bcf.csi"); + test_compare($opts,"$$opts{path}/test_index -c $$opts{tmp}/index.bcf", "$$opts{tmp}/index.bcf.csi", "$$opts{path}/index.bcf.csi", gz=>1); + + # VCF + test_compare($opts,"$$opts{path}/test_view $nthreads -l 0 -z -m 14 -x $$opts{tmp}/index.vcf.gz.csi $$opts{path}/index.vcf > $$opts{tmp}/index.vcf.gz", "$$opts{tmp}/index.vcf.gz.csi", "$$opts{path}/index.vcf.gz.csi", gz=>1); + unlink("$$opts{tmp}/index.vcf.gz.csi"); + test_compare($opts,"$$opts{path}/test_index -c $$opts{tmp}/index.vcf.gz", "$$opts{tmp}/index.vcf.gz.csi", "$$opts{path}/index.vcf.gz.csi", gz=>1); + test_compare($opts,"$$opts{path}/test_view $nthreads -l 0 -z -m 0 -x $$opts{tmp}/index.vcf.gz.tbi $$opts{path}/index.vcf > $$opts{tmp}/index.vcf.gz", "$$opts{tmp}/index.vcf.gz.tbi", "$$opts{path}/index.vcf.gz.tbi", gz=>1); + unlink("$$opts{tmp}/index.vcf.gz.tbi"); + test_compare($opts,"$$opts{path}/test_index -t $$opts{tmp}/index.vcf.gz", "$$opts{tmp}/index.vcf.gz.tbi", "$$opts{path}/index.vcf.gz.tbi", gz=>1); + + # Tabix and custom index names + _cmd("$$opts{bin}/tabix -fp vcf $$opts{tmp}/index.vcf.gz"); + my $wtmp = $$opts{tmp}; + if ($^O =~ /^msys/) { + $wtmp =~ s/\//\\\\/g; + } + test_cmd($opts,out=>'tabix.out',cmd=>"$$opts{bin}/tabix $wtmp/index.vcf.gz##idx##$wtmp/index.vcf.gz.tbi 1:10000060-10000060"); } sub test_vcf_api @@ -385,19 +823,19 @@ sub write_multiblock_bgzf { my $tmp = "$name.tmp"; open(my $out, '>', $name) || die "Couldn't open $name $!\n"; for (my $i = 0; $i < @$frags; $i++) { - local $/; - open(my $f, '>', $tmp) || die "Couldn't open $tmp : $!\n"; - print $f $frags->[$i]; - close($f) || die "Error writing to $tmp: $!\n"; - open(my 
$bgz, '-|', "$$opts{bin}/bgzip -c $tmp") - || die "Couldn't open pipe to bgzip: $!\n"; - my $compressed = <$bgz>; - close($bgz) || die "Error running bgzip\n"; - if ($i < $#$frags) { - # Strip EOF block - $compressed =~ s/\x1f\x8b\x08\x04\x00{5}\xff\x06\x00\x42\x43\x02\x00\x1b\x00\x03\x00{9}$//; - } - print $out $compressed; + local $/; + open(my $f, '>', $tmp) || die "Couldn't open $tmp : $!\n"; + print $f $frags->[$i]; + close($f) || die "Error writing to $tmp: $!\n"; + open(my $bgz, '-|', "$$opts{bin}/bgzip -c $tmp") + || die "Couldn't open pipe to bgzip: $!\n"; + my $compressed = <$bgz>; + close($bgz) || die "Error running bgzip\n"; + if ($i < $#$frags) { + # Strip EOF block + $compressed =~ s/\x1f\x8b\x08\x04\x00{5}\xff\x06\x00\x42\x43\x02\x00\x1b\x00\x03\x00{9}$//; + } + print $out $compressed; } close($out) || die "Error writing to $name: $!\n"; unlink($tmp); @@ -416,14 +854,14 @@ sub test_rebgzip my ($ret, $out) = _cmd("cmp $mb $$opts{path}/bgziptest.txt.gz"); if (!$ret && $out eq '') { # If it does, use the original - test_cmd($opts, %args, out => "bgziptest.txt.gz", - cmd => "$$opts{bin}/bgzip -I $$opts{path}/bgziptest.txt.gz.gzi -c -g $$opts{path}/bgziptest.txt"); + test_cmd($opts, %args, out => "bgziptest.txt.gz", + cmd => "$$opts{bin}/bgzip -I $$opts{path}/bgziptest.txt.gz.gzi -c -g $$opts{path}/bgziptest.txt"); } else { - # Otherwise index the one we just made and test that - print "test_rebgzip: Alternate zlib/deflate library detected\n"; - cmd("$$opts{bin}/bgzip -I $mb.gzi -r $mb"); - test_cmd($opts, %args, out => "bgziptest.txt.tmp.gz", - cmd => "$$opts{bin}/bgzip -I $mb.gzi -c -g $$opts{path}/bgziptest.txt"); + # Otherwise index the one we just made and test that + print "test_rebgzip: Alternate zlib/deflate library detected\n"; + cmd("$$opts{bin}/bgzip -I $mb.gzi -r $mb"); + test_cmd($opts, %args, out => "bgziptest.txt.tmp.gz", + cmd => "$$opts{bin}/bgzip -I $mb.gzi -c -g $$opts{path}/bgziptest.txt"); } } @@ -473,7 +911,10 @@ sub test_logging 
print "$test:\n"; print "\t$cmd\n"; my ($ret,$out) = _cmd($cmd); - if ( $ret ) { failed($opts,$test); } + if ( $ret ) { + print $out; + failed($opts,$test); + } else { passed($opts,$test); } } diff --git a/test/test_bgzf.c b/test/test_bgzf.c index 93194ac80..c06b0c87d 100644 --- a/test/test_bgzf.c +++ b/test/test_bgzf.c @@ -1,6 +1,6 @@ /* test/test_bgzf.c -- bgzf unit tests - Copyright (C) 2017 Genome Research Ltd + Copyright (C) 2017, 2019 Genome Research Ltd Author: Robert Davies @@ -31,7 +31,9 @@ DEALINGS IN THE SOFTWARE. #include #include #include +#include #include +#include #include "htslib/bgzf.h" #include "htslib/hfile.h" #include "hfile_internal.h" @@ -242,6 +244,39 @@ static int try_bgzf_index_dump(BGZF *fp, const char *bname, const char *suffix, return 0; } +static int64_t try_bgzf_tell(BGZF *fp, const char *name, const char *func) { + int64_t told = bgzf_tell(fp); + if (told < 0) { + fprintf(stderr, "%s : %s %s : %s\n", + func, "Error telling in", + name, strerror(errno)); + return -1; + } + + return told; +} + +static int64_t try_bgzf_tell_expect(BGZF *fp, int64_t expected, const char *name, const char *func) { + int64_t told = try_bgzf_tell(fp, name, func); + if (told != expected) { + fprintf(stderr, "%s : Unexpected value (%" PRId64 ") from bgzf_tell on %s; " + "expected %" PRId64 "\n", + func, told, name, expected); + return -1; + } + return told; +} + +static int try_bgzf_seek(BGZF *fp, int64_t pos, int whence, + const char *name, const char *func) { + if (bgzf_seek(fp, pos, whence) < 0) { + fprintf(stderr, "%s : Error from bgzf_seek(%s, %" PRId64 ", %d) : %s\n", + func, name, pos, whence, strerror(errno)); + return -1; + } + return 0; +} + static int try_bgzf_useek(BGZF *fp, long uoffset, int where, const char *name, const char *func) { if (bgzf_useek(fp, uoffset, where) < 0) { @@ -265,6 +300,22 @@ static int try_bgzf_getc(BGZF *fp, size_t pos, int expected, return c; } +static int try_skip(BGZF *fp, size_t count, + const char *name, const char 
*func) { + size_t i; + int c; + for (i = 0; i < count; i++) { + c = bgzf_getc(fp); + if (c < 0) { + fprintf(stderr, + "%s : Error from bgzf_getc on %s\n", + func, name); + return -1; + } + } + return 0; +} + static int compare_buffers(const unsigned char *b1, const unsigned char *b2, size_t l1, size_t l2, const char *name1, const char *name2, @@ -575,12 +626,13 @@ static int test_check_EOF(char *name, int expected) { return try_bgzf_close(&bgz, name, __func__); } -static int test_index_seek_getc(Files *f, const char *mode, - int cache_size, int nthreads) { +static int test_index_useek_getc(Files *f, const char *mode, + int cache_size, int nthreads) { BGZF* bgz = NULL; ssize_t bg_put; - size_t i, j, iskip = f->ltext / 10; + size_t i, j, k, iskip = f->ltext / 10; int is_uncompressed = strchr(mode, 'u') != NULL; + size_t offsets[3] = { 0, 100, 50 }; bgz = try_bgzf_open(f->tmp_bgzf, mode, __func__); if (!bgz) goto fail; @@ -612,13 +664,16 @@ static int test_index_seek_getc(Files *f, const char *mode, } for (i = 0; i < f->ltext; i += iskip) { - if (try_bgzf_useek(bgz, i, SEEK_SET, f->tmp_bgzf, __func__) != 0) { - goto fail; - } + for (k = 0; k < sizeof(offsets) / sizeof(offsets[0]); k++) { + size_t o = offsets[k]; + if (try_bgzf_useek(bgz, i + o, SEEK_SET, f->tmp_bgzf, __func__) != 0) { + goto fail; + } - for (j = 0; j < 16 && i + j < f->ltext; j++) { - if (try_bgzf_getc(bgz, i + j, f->text[i + j], - f->tmp_bgzf, __func__) < 0) goto fail; + for (j = 0; j < 16 && i + o + j < f->ltext; j++) { + if (try_bgzf_getc(bgz, i + o + j, f->text[i + o + j], + f->tmp_bgzf, __func__) < 0) goto fail; + } } } @@ -663,6 +718,157 @@ static int test_index_seek_getc(Files *f, const char *mode, return -1; } +static int test_tell_seek_getc(Files *f, const char *mode, + int cache_size, int nthreads) { + + BGZF* bgz = NULL; + ssize_t bg_put; + size_t num_points = 10; + size_t i, j, k, iskip = f->ltext / num_points; + size_t offsets[3] = { 0, 100, 50 }; + size_t points[num_points]; + int64_t 
point_vos[num_points]; + + bgz = try_bgzf_open(f->tmp_bgzf, mode, __func__); + if (!bgz) goto fail; + + for (i = 0; i < num_points; i++) { + point_vos[i] = try_bgzf_tell(bgz, f->tmp_bgzf, __func__); + if (point_vos[i] < 0) goto fail; + points[i] = i * iskip; + bg_put = try_bgzf_write(bgz, f->text + i * iskip, iskip, f->tmp_bgzf, __func__); + if (bg_put < 0) goto fail; + } + + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + + bgz = try_bgzf_open(f->tmp_bgzf, "r", __func__); + if (!bgz) goto fail; + + if (nthreads > 0 && try_bgzf_mt(bgz, nthreads, __func__) != 0) goto fail; + + for (i = 0; i < f->ltext; i += iskip) { + for (k = 0; k < sizeof(offsets) / sizeof(offsets[0]); k++) { + size_t o = offsets[k]; + + if (try_bgzf_seek(bgz, point_vos[i/iskip], SEEK_SET, f->tmp_bgzf, __func__) != 0) { + goto fail; + } + if (try_bgzf_tell_expect(bgz, point_vos[i/iskip], f->tmp_bgzf, __func__) < 0) { + goto fail; + } + + if (try_skip(bgz, o, f->tmp_bgzf, __func__) != 0) { + goto fail; + } + for (j = 0; j < 16 && i + o + j < f->ltext; j++) { + if (try_bgzf_getc(bgz, i + o + j, f->text[i + o + j], + f->tmp_bgzf, __func__) < 0) goto fail; + } + } + } + + if (try_bgzf_seek(bgz, 0, SEEK_SET, f->tmp_bgzf, __func__) != 0) { + goto fail; + } + if (try_bgzf_tell_expect(bgz, 0, f->tmp_bgzf, __func__) < 0) { + goto fail; + } + for (j = 0; j < 70000 && j < f->ltext; j++) { // Should force a block load + if (try_bgzf_getc(bgz, j, f->text[j], + f->tmp_bgzf, __func__) < 0) goto fail; + } + + if (cache_size > 0) { + size_t mid = points[num_points / 2]; + int64_t mid_vo = point_vos[num_points / 2]; + bgzf_set_cache_size(bgz, cache_size); + + for (i = 0; i < 10; i++) { + if (try_bgzf_seek(bgz, 0, SEEK_SET, f->tmp_bgzf, __func__) != 0) { + goto fail; + } + if (try_bgzf_tell_expect(bgz, 0, f->tmp_bgzf, __func__) < 0) { + goto fail; + } + for (j = 0; j < 64 && j < f->ltext; j++) { + if (try_bgzf_getc(bgz, j, f->text[j], + f->tmp_bgzf, __func__) < 0) goto fail; + } + + if 
(try_bgzf_seek(bgz, mid_vo, SEEK_SET, + f->tmp_bgzf, __func__) != 0) { + goto fail; + } + if (try_bgzf_tell_expect(bgz, mid_vo, f->tmp_bgzf, __func__) < 0) { + goto fail; + } + for (j = 0; j < 64 && j + mid < f->ltext; j++) { + if (try_bgzf_getc(bgz, j + mid, f->text[j + mid], + f->tmp_bgzf, __func__) < 0) goto fail; + } + } + } + + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + + return 0; + + fail: + if (bgz) bgzf_close(bgz); + return -1; +} + +static int test_tell_read(Files *f, const char *mode) { + + BGZF* bgz = NULL; + ssize_t bg_put; + size_t num_points = 10; + size_t i, iskip = f->ltext / num_points; + int64_t point_vos[num_points]; + + unsigned char *bg_buf = calloc(iskip+1,1); + if (!bg_buf) return -1; + + bgz = try_bgzf_open(f->tmp_bgzf, mode, __func__); + if (!bgz) goto fail; + + for (i = 0; i < num_points; i++) { + point_vos[i] = try_bgzf_tell(bgz, f->tmp_bgzf, __func__); + if (point_vos[i] < 0) goto fail; + bg_put = try_bgzf_write(bgz, f->text + i * iskip, iskip, f->tmp_bgzf, __func__); + if (bg_put < 0) goto fail; + } + + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + + bgz = try_bgzf_open(f->tmp_bgzf, "r", __func__); + if (!bgz) goto fail; + + for (i = 0; i < f->ltext; i += iskip) { + if (try_bgzf_tell_expect(bgz, point_vos[i/iskip], f->tmp_bgzf, __func__) < 0) { + goto fail; + } + if (try_bgzf_read(bgz, bg_buf, iskip, f->tmp_bgzf, __func__) < 0) { + goto fail; + } + if (compare_buffers(f->text+i, bg_buf, iskip, iskip, + f->tmp_bgzf, f->tmp_bgzf, __func__) != 0) { + goto fail; + } + } + + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + free(bg_buf); + return 0; + + fail: + fprintf(stderr, "%s: failed\n", __func__); + if (bgz) bgzf_close(bgz); + free(bg_buf); + return -1; +} + static int test_bgzf_getline(Files *f, const char *mode, int nthreads) { BGZF* bgz = NULL; ssize_t bg_put; @@ -761,15 +967,32 @@ int main(int argc, char **argv) { if (test_index_load_dump(&f) != 0) goto out; // Index 
building on the fly and bgzf_useek - if (test_index_seek_getc(&f, "w", 1000000, 0) != 0) goto out; + if (test_index_useek_getc(&f, "w", 1000000, 0) != 0) goto out; // Index building on the fly and bgzf_useek, with threads - // ** Not implemented yet ** - // if (test_index_seek_getc(&f, "w", 1000000, 1) != 0) goto out; - // if (test_index_seek_getc(&f, "w", 1000000, 2) != 0) goto out; + if (test_index_useek_getc(&f, "w", 1000000, 1) != 0) goto out; + if (test_index_useek_getc(&f, "w", 1000000, 2) != 0) goto out; // bgzf_useek on an uncompressed file - if (test_index_seek_getc(&f, "wu", 0, 0) != 0) goto out; + if (test_index_useek_getc(&f, "wu", 0, 0) != 0) goto out; + + // bgzf_tell and bgzf_seek + if (test_tell_seek_getc(&f, "w", 0, 0) != 0) goto out; + if (test_tell_seek_getc(&f, "wu", 0, 0) != 0) goto out; + if (test_tell_seek_getc(&f, "w", 1000000, 0) != 0) goto out; + if (test_tell_seek_getc(&f, "wu", 1000000, 0) != 0) goto out; + if (test_tell_seek_getc(&f, "w", 0, 1) != 0) goto out; + if (test_tell_seek_getc(&f, "w", 0, 2) != 0) goto out; + if (test_tell_seek_getc(&f, "wu", 0, 1) != 0) goto out; + if (test_tell_seek_getc(&f, "wu", 0, 2) != 0) goto out; + if (test_tell_seek_getc(&f, "w", 1000000, 1) != 0) goto out; + if (test_tell_seek_getc(&f, "w", 1000000, 2) != 0) goto out; + if (test_tell_seek_getc(&f, "wu", 1000000, 1) != 0) goto out; + if (test_tell_seek_getc(&f, "wu", 1000000, 2) != 0) goto out; + + // bgzf_tell and bgzf_read + if (test_tell_read(&f, "w") != 0) goto out; + if (test_tell_read(&f, "wu") != 0) goto out; // getline if (test_bgzf_getline(&f, "w", 0) != 0) goto out; diff --git a/test/test_index.c b/test/test_index.c new file mode 100644 index 000000000..1eae9090f --- /dev/null +++ b/test/test_index.c @@ -0,0 +1,83 @@ +/* test/test_index.c -- simple tool to build an index, for the test harness. + + Copyright (C) 2018 Genome Research Ltd. 
+ + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include +#include +#include + +#include "htslib/sam.h" +#include "htslib/vcf.h" + +void usage(FILE *fp) { + fprintf(fp, "Usage: test_index [opts] in.{sam.gz,bam,cram}|in.{vcf.gz,bcf}\n\n"); + fprintf(fp, " -b Use BAI index (BAM, SAM)\n"); + fprintf(fp, " -c Use CSI index (BAM, SAM, VCF, BCF)\n"); + fprintf(fp, " -t Use TBI index (VCF) \n"); + fprintf(fp, " -m bits Adjust min_shift; implies CSI\n"); + fprintf(fp, "\nThe default index format is CSI for sam/bam/vcf/bcf and CRAI for crams\n"); + exit(fp == stderr ? 
1 : 0); +} + +int main(int argc, char **argv) { + int c, min_shift = 14; + + while ((c = getopt(argc, argv, "bctm:")) >= 0) { + switch (c) { + case 't': case 'b': min_shift = 0; break; + case 'c': min_shift = 14; break; + case 'm': min_shift = atoi(optarg); break; + case 'h': usage(stdout); + default: usage(stderr); + } + } + + if (optind >= argc) usage(stderr); + + htsFile *in = hts_open(argv[optind], "r"); + if (!in) { + fprintf(stderr, "Error opening \"%s\"\n", argv[optind]); + exit(1); + } + + int ret; + if (in->format.format == sam || + in->format.format == bam || + in->format.format == cram) { + ret = sam_index_build(argv[optind], min_shift); + } else { + ret = bcf_index_build(argv[optind], min_shift); + } + + if (ret < 0) { + fprintf(stderr, "Failed to build index for \"%s\"\n", argv[optind]); + exit(1); + } + + if (hts_close(in) < 0) { + fprintf(stderr, "Error closing \"%s\"\n", argv[optind]); + exit(1); + } + + return 0; +} diff --git a/test/test_kstring.c b/test/test_kstring.c new file mode 100644 index 000000000..5389a8ef1 --- /dev/null +++ b/test/test_kstring.c @@ -0,0 +1,213 @@ +/* test_kstring.c -- kstring unit tests + + Copyright (C) 2018 Genome Research Ltd. + + Author: Rob Davies + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include + +#include +#include +#include +#include +#include +#include + +#include + +static inline void clamp(int64_t *val, int64_t min, int64_t max) { + if (*val < min) *val = min; + if (*val > max) *val = max; +} + +static int test_kputuw_from_to(kstring_t *str, unsigned int s, unsigned int e) { + unsigned int i = s; + + for (;;) { + str->l = 0; + memset(str->s, 0xff, str->m); + if (kputuw(i, str) < 0 || !str->s) { + perror("kputuw"); + return -1; + } + if (str->l >= str->m || str->s[str->l] != '\0') { + fprintf(stderr, "No NUL termination on string from kputuw\n"); + return -1; + } + if (i != strtoul(str->s, NULL, 10)) { + fprintf(stderr, + "kputuw wrote the wrong value, expected %u, got %s\n", + i, str->s); + return -1; + } + if (i >= e) break; + i++; + } + return 0; +} + +static int test_kputuw(int64_t start, int64_t end) { + kstring_t str = { 0, 0, NULL }; + int64_t val; + + str.s = malloc(2); + if (!str.s) { + perror("malloc"); + return -1; + } + str.m = 2; + + for (val = 0; val < UINT_MAX; val = val == 0 ? 1 : val * 10) { + unsigned int s = val == 0 ? 
0 : val - 5; + unsigned int e = val + 5; + + if (test_kputuw_from_to(&str, s, e) < 0) { + free(ks_release(&str)); + return -1; + } + } + + if (test_kputuw_from_to(&str, UINT_MAX - 5, UINT_MAX) < 0) { + free(ks_release(&str)); + return -1; + } + + str.m = 1; // Force a resize + clamp(&start, 0, UINT_MAX); + clamp(&end, 0, UINT_MAX); + + if (test_kputuw_from_to(&str, start, end) < 0) { + free(ks_release(&str)); + return -1; + } + + free(ks_release(&str)); + + return 0; +} + +static int test_kputw_from_to(kstring_t *str, int s, int e) { + int i = s; + + for (;;) { + str->l = 0; + memset(str->s, 0xff, str->m); + if (kputw(i, str) < 0 || !str->s) { + perror("kputw"); + return -1; + } + if (str->l >= str->m || str->s[str->l] != '\0') { + fprintf(stderr, "No NUL termination on string from kputw\n"); + return -1; + } + if (i != strtol(str->s, NULL, 10)) { + fprintf(stderr, + "kputw wrote the wrong value, expected %u, got %s\n", + i, str->s); + return -1; + } + if (i >= e) break; + i++; + } + return 0; +} + +static int test_kputw(int64_t start, int64_t end) { + kstring_t str = { 0, 0, NULL }; + int64_t val; + + str.s = malloc(2); + if (!str.s) { + perror("malloc"); + return -1; + } + str.m = 2; + + for (val = 1; val < INT_MAX; val *= 10) { + if (test_kputw_from_to(&str, val > 5 ? val - 5 : 0, val + 5) < 0) { + free(ks_release(&str)); + return -1; + } + } + + for (val = -1; val > INT_MIN; val *= 10) { + if (test_kputw_from_to(&str, val - 5, val < -5 ? 
val + 5 : 0) < 0) { + free(ks_release(&str)); + return -1; + } + } + + if (test_kputw_from_to(&str, INT_MAX - 5, INT_MAX) < 0) { + free(ks_release(&str)); + return -1; + } + + if (test_kputw_from_to(&str, INT_MIN, INT_MIN + 5) < 0) { + free(ks_release(&str)); + return -1; + } + + str.m = 1; // Force a resize + clamp(&start, INT_MIN, INT_MAX); + clamp(&end, INT_MIN, INT_MAX); + + if (test_kputw_from_to(&str, start, end) < 0) { + free(ks_release(&str)); + return -1; + } + + free(ks_release(&str)); + + return 0; +} + +int main(int argc, char **argv) { + int opt, res = EXIT_SUCCESS; + int64_t start = 0; + int64_t end = 0; + char *test = NULL; + + while ((opt = getopt(argc, argv, "e:s:t:")) != -1) { + switch (opt) { + case 's': + start = strtoll(optarg, NULL, 0); + break; + case 'e': + end = strtoll(optarg, NULL, 0); + break; + case 't': + test = optarg; + break; + default: + fprintf(stderr, "Usage : %s [-s ] [-e ] [-t ]\n", + argv[0]); + return EXIT_FAILURE; + } + } + + if (!test || strcmp(test, "kputuw") == 0) + if (test_kputuw(start, end) != 0) res = EXIT_FAILURE; + + if (!test || strcmp(test, "kputw") == 0) + if (test_kputw(start, end) != 0) res = EXIT_FAILURE; + + return res; +} diff --git a/test/test_realn.c b/test/test_realn.c index 0364d7659..9cdc2e9e7 100644 --- a/test/test_realn.c +++ b/test/test_realn.c @@ -47,7 +47,7 @@ int main(int argc, char **argv) { char *ref_seq = NULL; char modew[8] = "w"; faidx_t *fai = NULL; - bam_hdr_t *hdr = NULL; + sam_hdr_t *hdr = NULL; bam1_t *rec = NULL; int c, res, last_ref = -1, ref_len = 0; int adjust = 0, extended = 0, recalc = 0, flags = 0; @@ -151,7 +151,7 @@ int main(int argc, char **argv) { goto fail; } - bam_hdr_destroy(hdr); + sam_hdr_destroy(hdr); bam_destroy1(rec); free(ref_seq); fai_destroy(fai); @@ -159,7 +159,7 @@ int main(int argc, char **argv) { return EXIT_SUCCESS; fail: - if (hdr) bam_hdr_destroy(hdr); + if (hdr) sam_hdr_destroy(hdr); if (rec) bam_destroy1(rec); if (in) hts_close(in); if (out) hts_close(out); 
diff --git a/test/test_str2int.c b/test/test_str2int.c new file mode 100644 index 000000000..c513a62d3 --- /dev/null +++ b/test/test_str2int.c @@ -0,0 +1,155 @@ +/* test_parse_int.c -- Test integer string conversion + + Copyright (C) 2019 Genome Research Ltd. + + Author: Rob Davies + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + + +#include +#include +#include +#include +#include +#include +#include "textutils_internal.h" + +// Test hts_str2int() and hts_str2uint() on various values around the +// maximum (or minimum for negative numbers) allowed for the given +// number of bits. Ensures that the failed flag is set when the output +// isn't going to fit, that the correct value is returned and that +// 'end' points to the character following the number. 
+static int check_str2int(int verbose) { + char buffer[64], *end; + int64_t val; + uint64_t num, uval; + int failed = 0, efail, i, offset; + const char sentinal = '#'; + + // Positive value (unsigned) + for (i = 1; i < 64; i++) { + num = (1ULL << i) - 1; + for (offset = i < 5 ? -(1LL << (i - 1)) : -16; offset <= 30; offset++) { + efail = (offset > 0); + snprintf(buffer, sizeof(buffer), "%" PRIu64 "%c", + num + offset, sentinal); + + uval = hts_str2uint(buffer, &end, i, &failed); + if (failed != efail || uval != (!efail ? num + offset : num) + || *end != sentinal) { + fprintf(stderr, "hts_str2uint failed: %d bit " + "%s %"PRIu64" '%c' %d (%d)\n", + i, buffer, uval, *end, failed, efail); + return -1; + } else if (verbose) { + fprintf(stderr, "hts_str2uint OK: %d bit " + "%s %"PRIu64" '%c' %d (%d)\n", + i, buffer, uval, *end, failed, efail); + } + failed = 0; + } + + // Positive value (signed) + for (offset = i < 5 ? -(1LL << (i - 1)) : -16; offset <= 30; offset++) { + efail = (offset > 0); + snprintf(buffer, sizeof(buffer), "%" PRIu64 "%c", + num + offset, sentinal); + + val = hts_str2int(buffer, &end, i + 1, &failed); + if (failed != efail || val != (!efail ? num + offset : num) + || *end != sentinal) { + fprintf(stderr, + "hts_str2int failed: %d bit " + "%s %"PRId64" '%c' %d (%d)\n", + i + 1, buffer, val, *end, failed, efail); + return -1; + } else if (verbose) { + fprintf(stderr, "hts_str2int OK: %d bit " + "%s %"PRId64" '%c' %d (%d)\n", + i + 1, buffer, val, *end, failed, efail); + } + failed = 0; + } + + // Negative value (signed) + for (offset = i < 5 ? -(1LL << (i - 1)) : -16; offset <= 30; offset++) { + efail = (offset > 0); + snprintf(buffer, sizeof(buffer), "-%" PRIu64 "%c", + num + offset + 1, sentinal); + + val = hts_str2int(buffer, &end, i + 1, &failed); + // Cast of val to unsigned in this comparison avoids undefined + // behaviour when checking INT64_MIN. + if (failed != efail + || -((uint64_t) val) != (!efail ? 
num + offset + 1 : num + 1) + || *end != sentinal) { + fprintf(stderr, + "hts_str2int failed: %d bit " + "%s %"PRId64" '%c' %d (%d)\n", + i + 1, buffer, val, *end, failed, efail); + return -1; + } else if (verbose) { + fprintf(stderr, "hts_str2int OK: %d bit " + "%s %"PRId64" '%c' %d (%d)\n", + i + 1, buffer, val, *end, failed, efail); + } + failed = 0; + } + } + + // Special case for UINT64_MAX + for (offset = 0; offset <= 999; offset++) { + efail = offset > 615; + snprintf(buffer, sizeof(buffer), "18446744073709551%03d%c", + offset, sentinal); + uval = hts_str2uint(buffer, &end, 64, &failed); + if (failed != efail + || uval != (efail ? UINT64_MAX : 18446744073709551000ULL + offset) + || *end != sentinal) { + fprintf(stderr, "hts_str2uint failed: 64 bit %s " + "%"PRIu64" '%c' %d (%d)\n", + buffer, uval, *end, failed, efail); + return -1; + } else if (verbose) { + fprintf(stderr, "hts_str2uint OK: 64 bit " + "%s %"PRIu64" '%c' %d (%d)\n", + buffer, uval, *end, failed, efail); + } + } + return 0; +} + +int main(int argc, char **argv) { + int verbose = 0, opt, res; + + while ((opt = getopt(argc, argv, "v")) != -1) { + switch (opt) { + case 'v': + verbose = 1; + break; + default: + fprintf(stderr, "Usage: %s [-v]\n", argv[0]); + return EXIT_FAILURE; + } + } + + res = check_str2int(verbose); + return res ? EXIT_FAILURE : EXIT_SUCCESS; +} diff --git a/test/test_view.c b/test/test_view.c index 693b91817..aadf0c267 100644 --- a/test/test_view.c +++ b/test/test_view.c @@ -1,7 +1,7 @@ /* test/test_view.c -- simple view tool, purely for use in a test harness. Copyright (C) 2012 Broad Institute. - Copyright (C) 2013-2014 Genome Research Ltd. + Copyright (C) 2013-2019 Genome Research Ltd. Author: Heng Li @@ -33,56 +33,294 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include "cram/cram.h" - #include "htslib/sam.h" +#include "htslib/vcf.h" +#include "htslib/hts_log.h" + +struct opts { + char *fn_ref; + int flag; + int clevel; + int ignore_sam_err; + int nreads; + int extra_hdr_nuls; + int benchmark; + int nthreads; + int multi_reg; + char *index; + int min_shift; +}; enum test_op { READ_COMPRESSED = 1, - WRITE_COMPRESSED = 2, + WRITE_BINARY_COMP = 2, // eg bam, bcf READ_CRAM = 4, WRITE_CRAM = 8, WRITE_UNCOMPRESSED = 16, + WRITE_COMPRESSED = 32, // eg vcf.gz, sam.gz }; +int sam_loop(int argc, char **argv, int optind, struct opts *opts, htsFile *in, htsFile *out) { + int r = 0; + sam_hdr_t *h = NULL; + hts_idx_t *idx = NULL; + bam1_t *b = NULL; + + h = sam_hdr_read(in); + if (h == NULL) { + fprintf(stderr, "Couldn't read header for \"%s\"\n", argv[optind]); + return EXIT_FAILURE; + } + h->ignore_sam_err = opts->ignore_sam_err; + if (opts->extra_hdr_nuls > 0) { + char *new_text = realloc(h->text, h->l_text + opts->extra_hdr_nuls); + if (new_text == NULL) { + fprintf(stderr, "Error reallocing header text\n"); + goto fail; + } + h->text = new_text; + memset(&h->text[h->l_text], 0, opts->extra_hdr_nuls); + h->l_text += opts->extra_hdr_nuls; + } + + b = bam_init1(); + if (b == NULL) { + fprintf(stderr, "Out of memory allocating BAM struct\n"); + goto fail; + } + + /* CRAM output */ + if ((opts->flag & WRITE_CRAM) && opts->fn_ref) { + // Create CRAM references arrays + int ret = hts_set_fai_filename(out, opts->fn_ref); + + if (ret != 0) + goto fail; + } + + if (!opts->benchmark && sam_hdr_write(out, h) < 0) { + fprintf(stderr, "Error writing output header.\n"); + goto fail; + } + + if (opts->index) { + if (sam_idx_init(out, h, opts->min_shift, opts->index) < 0) { + fprintf(stderr, "Failed to initialise index\n"); + goto fail; + } + } + + if (optind + 1 < argc && !(opts->flag & READ_COMPRESSED)) { // BAM input and has a region + int i; + if ((idx = sam_index_load(in, argv[optind])) == 0) { + fprintf(stderr, "[E::%s] fail to 
load the BAM index\n", __func__); + goto fail; + } + if (opts->multi_reg) { + hts_itr_t *iter = sam_itr_regarray(idx, h, &argv[optind + 1], argc - optind-1); + if (!iter) + goto fail; + while ((r = sam_itr_next(in, iter, b)) >= 0) { + if (!opts->benchmark && sam_write1(out, h, b) < 0) { + fprintf(stderr, "Error writing output.\n"); + hts_itr_destroy(iter); + goto fail; + } + if (opts->nreads && --opts->nreads == 0) + break; + } + hts_itr_destroy(iter); + if (r < -1) { + fprintf(stderr, "Error reading input.\n"); + goto fail; + } + } else { + for (i = optind + 1; i < argc; ++i) { + hts_itr_t *iter; + if ((iter = sam_itr_querys(idx, h, argv[i])) == 0) { + fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, argv[i]); + goto fail; + } + while ((r = sam_itr_next(in, iter, b)) >= 0) { + if (!opts->benchmark && sam_write1(out, h, b) < 0) { + fprintf(stderr, "Error writing output.\n"); + hts_itr_destroy(iter); + goto fail; + } + if (opts->nreads && --opts->nreads == 0) + break; + } + hts_itr_destroy(iter); + if (r < -1) { + fprintf(stderr, "Error reading input.\n"); + goto fail; + } + } + } + hts_idx_destroy(idx); idx = NULL; + } else while ((r = sam_read1(in, h, b)) >= 0) { + if (!opts->benchmark && sam_write1(out, h, b) < 0) { + fprintf(stderr, "Error writing output.\n"); + goto fail; + } + if (opts->nreads && --opts->nreads == 0) + break; + } + + if (r < -1) { + fprintf(stderr, "Error parsing input.\n"); + goto fail; + } + + if (opts->index) { + if (sam_idx_save(out) < 0) { + fprintf(stderr, "Error saving index\n"); + goto fail; + } + } + + bam_destroy1(b); + sam_hdr_destroy(h); + + return 0; + fail: + if (b) bam_destroy1(b); + if (h) sam_hdr_destroy(h); + if (idx) hts_idx_destroy(idx); + + return 1; +} + +int vcf_loop(int argc, char **argv, int optind, struct opts *opts, htsFile *in, htsFile *out) { + bcf_hdr_t *h = bcf_hdr_read(in); + bcf1_t *b = bcf_init1(); + hts_idx_t *idx; + int i, exit_code = 0, r = 0; + + if (!h) + return 1; + if (!b) + return 1; + 
+ if (!opts->benchmark && bcf_hdr_write(out, h) < 0) + return 1; + + if (opts->index) { + if (bcf_idx_init(out, h, opts->min_shift, opts->index) < 0) { + fprintf(stderr, "Failed to initialise index\n"); + return 1; + } + } + + if (optind + 1 < argc) { + // A series of regions. + if ((idx = bcf_index_load(argv[optind])) == 0) { + fprintf(stderr, "[E::%s] fail to load the BVCF index\n", __func__); + return 1; + } + + for (i = optind + 1; i < argc; i++) { + hts_itr_t *iter; + if ((iter = bcf_itr_querys(idx, h, argv[i])) == 0) { + fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, argv[i]); + continue; + } + while ((r = bcf_itr_next(in, iter, b)) >= 0) { + if (!opts->benchmark && bcf_write1(out, h, b) < 0) { + fprintf(stderr, "Error writing output.\n"); + exit_code = 1; + break; + } + if (opts->nreads && --opts->nreads == 0) + break; + } + if (r < -1) { + fprintf(stderr, "Error reading input.\n"); + exit_code = 1; + } + hts_itr_destroy(iter); + if (exit_code != 0) break; + } + + hts_idx_destroy(idx); + + } else { + // Whole file + while ((r = bcf_read1(in, h, b)) >= 0) { + if (!opts->benchmark && bcf_write1(out, h, b) < 0) { + fprintf(stderr, "Error writing output.\n"); + exit_code = 1; + break; + } + if (opts->nreads && --opts->nreads == 0) + break; + } + if (r < -1) { + fprintf(stderr, "Error reading input.\n"); + exit_code = 1; + } + } + + if (exit_code == 0 && opts->index) { + if (bcf_idx_save(out) < 0) { + fprintf(stderr, "Error saving index\n"); + exit_code = 1; + } + } + + bcf_destroy1(b); + bcf_hdr_destroy(h); + return exit_code; +} + int main(int argc, char *argv[]) { - samFile *in; - char *fn_ref = 0; - int flag = 0, c, clevel = -1, ignore_sam_err = 0; + htsFile *in, *out; char moder[8]; - bam_hdr_t *h; - bam1_t *b; - htsFile *out; char modew[800]; - int r = 0, exit_code = 0; + int c, exit_code = EXIT_SUCCESS; hts_opt *in_opts = NULL, *out_opts = NULL; - int nreads = 0; - int extra_hdr_nuls = 0; - int benchmark = 0; - int nthreads = 0; // shared 
pool - int multi_reg = 0; - - while ((c = getopt(argc, argv, "DSIt:i:bCul:o:N:BZ:@:M")) >= 0) { + char *out_fn = "-"; + + struct opts opts; + opts.fn_ref = NULL; + opts.flag = 0; + opts.clevel = -1; + opts.ignore_sam_err = 0; + opts.nreads = 0; + opts.extra_hdr_nuls = 0; + opts.benchmark = 0; + opts.nthreads = 0; // shared pool + opts.multi_reg = 0; + opts.index = NULL; + opts.min_shift = 0; + + while ((c = getopt(argc, argv, "DSIt:i:bzCul:o:N:BZ:@:Mx:m:p:v")) >= 0) { switch (c) { - case 'D': flag |= READ_CRAM; break; - case 'S': flag |= READ_COMPRESSED; break; - case 'I': ignore_sam_err = 1; break; - case 't': fn_ref = optarg; break; + case 'D': opts.flag |= READ_CRAM; break; + case 'S': opts.flag |= READ_COMPRESSED; break; + case 'I': opts.ignore_sam_err = 1; break; + case 't': opts.fn_ref = optarg; break; case 'i': if (hts_opt_add(&in_opts, optarg)) return 1; break; - case 'b': flag |= WRITE_COMPRESSED; break; - case 'C': flag |= WRITE_CRAM; break; - case 'u': flag |= WRITE_UNCOMPRESSED; break; // eg u-BAM not SAM - case 'l': clevel = atoi(optarg); flag |= WRITE_COMPRESSED; break; + case 'b': opts.flag |= WRITE_BINARY_COMP; break; + case 'z': opts.flag |= WRITE_COMPRESSED; break; + case 'C': opts.flag |= WRITE_CRAM; break; + case 'u': opts.flag |= WRITE_UNCOMPRESSED; break; // eg u-BAM not SAM + case 'l': opts.clevel = atoi(optarg); break; case 'o': if (hts_opt_add(&out_opts, optarg)) return 1; break; - case 'N': nreads = atoi(optarg); break; - case 'B': benchmark = 1; break; - case 'Z': extra_hdr_nuls = atoi(optarg); break; - case 'M': multi_reg = 1; break; - case '@': nthreads = atoi(optarg); break; + case 'N': opts.nreads = atoi(optarg); break; + case 'B': opts.benchmark = 1; break; + case 'Z': opts.extra_hdr_nuls = atoi(optarg); break; + case 'M': opts.multi_reg = 1; break; + case '@': opts.nthreads = atoi(optarg); break; + case 'x': opts.index = optarg; break; + case 'm': opts.min_shift = atoi(optarg); break; + case 'p': out_fn = optarg; break; + case 'v': 
hts_verbose++; break; } } if (argc == optind) { - fprintf(stderr, "Usage: test_view [-DSI] [-t fn_ref] [-i option=value] [-bC] [-l level] [-o option=value] [-N num_reads] [-B] [-Z hdr_nuls] [-@ num_threads] || [region]\n"); + fprintf(stderr, "Usage: test_view [-DSI] [-t fn_ref] [-i option=value] [-bC] [-l level] [-o option=value] [-N num_reads] [-B] [-Z hdr_nuls] [-@ num_threads] [-x index_fn] [-m min_shift] [-p out] [-v] || [region]\n"); fprintf(stderr, "\n"); fprintf(stderr, "-D: read CRAM format (mode 'c')\n"); fprintf(stderr, "-S: read compressed BCF, BAM, FAI (mode 'b')\n"); @@ -90,7 +328,8 @@ int main(int argc, char *argv[]) fprintf(stderr, "-t: fn_ref: load CRAM references from the specificed fasta file instead of @SQ headers when writing a CRAM file\n"); fprintf(stderr, "-i: option=value: set an option for CRAM input\n"); fprintf(stderr, "\n"); - fprintf(stderr, "-b: write compressed BCF, BAM, FAI (mode 'b')\n"); + fprintf(stderr, "-b: write binary compressed BCF, BAM, FAI (mode 'b')\n"); + fprintf(stderr, "-z: write text compressed VCF.gz, SAM.gz (mode 'z')\n"); fprintf(stderr, "-C: write CRAM format (mode 'c')\n"); fprintf(stderr, "-l 0-9: set zlib compression level\n"); fprintf(stderr, "-o option=value: set an option for CRAM output\n"); @@ -100,66 +339,35 @@ int main(int argc, char *argv[]) fprintf(stderr, "-M: use hts_itr_multi iterator\n"); fprintf(stderr, "-Z hdr_nuls: append specified number of null bytes to the SAM header\n"); fprintf(stderr, "-@ num_threads: use thread pool with specified number of threads\n\n"); + fprintf(stderr, "-x fn: write index to fn\n"); + fprintf(stderr, "-m min_shift: specifies BAI/CSI bin size; 0 is BAI(BAM) or TBI(VCF), 14 is CSI default\n"); + fprintf(stderr, "-p out_fn: output to out_fn instead of stdout\n"); + fprintf(stderr, "-v: increase verbosity\n"); fprintf(stderr, "The region list entries should be specified as 'reg:beg-end', with intervals of a region being disjunct and sorted by the starting coordinate.\n"); 
return 1; } strcpy(moder, "r"); - if (flag & READ_CRAM) strcat(moder, "c"); - else if ((flag & READ_COMPRESSED) == 0) strcat(moder, "b"); + if (opts.flag & READ_CRAM) strcat(moder, "c"); + else if ((opts.flag & READ_COMPRESSED) == 0) strcat(moder, "b"); - in = sam_open(argv[optind], moder); + in = hts_open(argv[optind], moder); if (in == NULL) { fprintf(stderr, "Error opening \"%s\"\n", argv[optind]); return EXIT_FAILURE; } - h = sam_hdr_read(in); - if (h == NULL) { - fprintf(stderr, "Couldn't read header for \"%s\"\n", argv[optind]); - return EXIT_FAILURE; - } - h->ignore_sam_err = ignore_sam_err; - if (extra_hdr_nuls) { - char *new_text = realloc(h->text, h->l_text + extra_hdr_nuls); - if (new_text == NULL) { - fprintf(stderr, "Error reallocing header text\n"); - return EXIT_FAILURE; - } - h->text = new_text; - memset(&h->text[h->l_text], 0, extra_hdr_nuls); - h->l_text += extra_hdr_nuls; - } - - b = bam_init1(); strcpy(modew, "w"); - if (clevel >= 0 && clevel <= 9) sprintf(modew + 1, "%d", clevel); - if (flag & WRITE_CRAM) strcat(modew, "c"); - else if (flag & WRITE_COMPRESSED) strcat(modew, "b"); - else if (flag & WRITE_UNCOMPRESSED) strcat(modew, "bu"); - out = hts_open("-", modew); + if (opts.clevel >= 0 && opts.clevel <= 9) sprintf(modew + 1, "%d", opts.clevel); + if (opts.flag & WRITE_CRAM) strcat(modew, "c"); + else if (opts.flag & WRITE_BINARY_COMP) strcat(modew, "b"); + else if (opts.flag & WRITE_COMPRESSED) strcat(modew, "z"); + else if (opts.flag & WRITE_UNCOMPRESSED) strcat(modew, "bu"); + out = hts_open(out_fn, modew); if (out == NULL) { fprintf(stderr, "Error opening standard output\n"); return EXIT_FAILURE; } - /* CRAM output */ - if (flag & WRITE_CRAM) { - int ret; - - // Parse input header and use for CRAM output - out->fp.cram->header = sam_hdr_parse_(h->text, h->l_text); - - // Create CRAM references arrays - if (fn_ref) - ret = cram_set_option(out->fp.cram, CRAM_OPT_REFERENCE, fn_ref); - else - // Attempt to fill out a cram->refs[] array from 
@SQ headers - ret = cram_set_option(out->fp.cram, CRAM_OPT_REFERENCE, NULL); - - if (ret != 0) - return EXIT_FAILURE; - } - // Process any options; currently cram only. if (hts_opt_apply(in, in_opts)) return EXIT_FAILURE; @@ -171,8 +379,8 @@ int main(int argc, char *argv[]) // Create and share the thread pool htsThreadPool p = {NULL, 0}; - if (nthreads > 0) { - p.pool = hts_tpool_init(nthreads); + if (opts.nthreads > 0) { + p.pool = hts_tpool_init(opts.nthreads); if (!p.pool) { fprintf(stderr, "Error creating thread pool\n"); exit_code = 1; @@ -182,137 +390,33 @@ int main(int argc, char *argv[]) } } - if (!benchmark && sam_hdr_write(out, h) < 0) { - fprintf(stderr, "Error writing output header.\n"); - exit_code = 1; - } - if (optind + 1 < argc && !(flag & READ_COMPRESSED)) { // BAM input and has a region - int i; - hts_idx_t *idx; - if ((idx = sam_index_load(in, argv[optind])) == 0) { - fprintf(stderr, "[E::%s] fail to load the BAM index\n", __func__); - return 1; - } - if (multi_reg) { - int reg_count = 0; - hts_reglist_t *reg_list = calloc(argc-(optind+1), sizeof(*reg_list)); - if (!reg_list) - return 1; - - // We need a public function somewhere to turn an array of region strings - // into a region list, but for testing this will suffice for now. - // Consider moving a derivation of this into htslib proper sometime. - for (i = optind + 1; i < argc; ++i) { - int j; - uint32_t beg, end; - char *cp = strrchr(argv[i], ':'); - if (cp) *cp = 0; - - for (j = 0; j < reg_count; j++) - if (strcmp(reg_list[j].reg, argv[i]) == 0) - break; - if (j == reg_count) { - reg_list[reg_count++].reg = argv[i]; - if (strcmp(".", argv[i]) == 0) { - reg_list[j].tid = HTS_IDX_START; - - } else if (strcmp("*", argv[i]) == 0) { - reg_list[j].tid = HTS_IDX_NOCOOR; - - } else { - int k; // need the header API here! 
- for (k = 0; k < h->n_targets; k++) - if (strcmp(h->target_name[k], argv[i]) == 0) - break; - if (k == h->n_targets) - return 1; - reg_list[j].tid = k; - reg_list[j].min_beg = h->target_len[k]; - reg_list[j].max_end = 0; - } - } + int ret; + switch (hts_get_format(in)->category) { + case sequence_data: + ret = sam_loop(argc, argv, optind, &opts, in, out); + break; - hts_reglist_t *r = ®_list[j]; - r->intervals = realloc(r->intervals, ++r->count * sizeof(*r->intervals)); - if (!r->intervals) - return 1; - beg = 1; - end = r->tid >= 0 ? h->target_len[r->tid] : 0; - if (cp) { - *cp = 0; - // hts_parse_reg() is better, but awkward here - sscanf(cp+1, "%d-%d", &beg, &end); - } - r->intervals[r->count-1].beg = beg-1; // BED syntax - r->intervals[r->count-1].end = end; - - if (r->min_beg > beg) - r->min_beg = beg; - if (r->max_end < end) - r->max_end = end; - } + case variant_data: + ret = vcf_loop(argc, argv, optind, &opts, in, out); + break; - hts_itr_multi_t *iter = sam_itr_regions(idx, h, reg_list, reg_count); - if (!iter) - return 1; - while ((r = sam_itr_multi_next(in, iter, b)) >= 0) { - if (!benchmark && sam_write1(out, h, b) < 0) { - fprintf(stderr, "Error writing output.\n"); - exit_code = 1; - break; - } - if (nreads && --nreads == 0) - break; - } - hts_itr_multi_destroy(iter); - } else { - for (i = optind + 1; i < argc; ++i) { - hts_itr_t *iter; - if ((iter = sam_itr_querys(idx, h, argv[i])) == 0) { - fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, argv[i]); - continue; - } - while ((r = sam_itr_next(in, iter, b)) >= 0) { - if (!benchmark && sam_write1(out, h, b) < 0) { - fprintf(stderr, "Error writing output.\n"); - exit_code = 1; - break; - } - if (nreads && --nreads == 0) - break; - } - hts_itr_destroy(iter); - } - } - hts_idx_destroy(idx); - } else while ((r = sam_read1(in, h, b)) >= 0) { - if (!benchmark && sam_write1(out, h, b) < 0) { - fprintf(stderr, "Error writing output.\n"); - exit_code = 1; - break; - } - if (nreads && --nreads == 
0) - break; + default: + fprintf(stderr, "Unsupported or unknown category of data in input file\n"); + return EXIT_FAILURE; } - if (r < -1) { - fprintf(stderr, "Error parsing input.\n"); - exit_code = 1; - } + if (ret != 0) + exit_code = EXIT_FAILURE; - r = sam_close(out); - if (r < 0) { + ret = hts_close(out); + if (ret < 0) { fprintf(stderr, "Error closing output.\n"); - exit_code = 1; + exit_code = EXIT_FAILURE; } - - bam_destroy1(b); - bam_hdr_destroy(h); - - r = sam_close(in); - if (r < 0) { + ret = hts_close(in); + if (ret < 0) { fprintf(stderr, "Error closing input.\n"); - exit_code = 1; + exit_code = EXIT_FAILURE; } if (p.pool) diff --git a/test/thrash_threads7.c b/test/thrash_threads7.c new file mode 100644 index 000000000..99f71e31c --- /dev/null +++ b/test/thrash_threads7.c @@ -0,0 +1,119 @@ +/* The MIT/Expat License + +Copyright (C) 2017-2018 Genome Research Ltd. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
+ */ +/* + * Test for thread lock-ups caused by a race condition on the queue list + * where the process tpool_worker is working on could get detached just + * after it finished running a job. This would result on the pointer + * to the next process to be searched for work being set to NULL, which + * stopped all the workers from finding anything to do. + */ + + +#include + +#include +#include +#include +#include +#include +#include "htslib/thread_pool.h" + + +void *job(void *v) { + unsigned int *usecs = (unsigned int *) v; + usleep(*usecs); + return NULL; +} + +int main(int argc, char *argv[]) { + int run_for_secs = 120; + int num_threads = 8; + int num_jobs = 8, count = 0, n_proc = 8, i; + struct timeval end, now; + hts_tpool *p = NULL; + hts_tpool_process *q[n_proc]; + + p = hts_tpool_init(num_threads); + if (!p) { + perror("hts_tpool_init"); + exit(EXIT_FAILURE); + } + + for (i = 0; i < n_proc; i++) { + q[i] = hts_tpool_process_init(p, 10, 1); + if (!q[i]) { + perror("hts_tpool_process_init"); + exit(EXIT_FAILURE); + } + } + + if (gettimeofday(&end, NULL) != 0) { + perror("gettimeofday"); + exit(EXIT_FAILURE); + } + + end.tv_sec += run_for_secs; + + do { + unsigned int *t; + int qnum = rand() % n_proc; + t = malloc(num_jobs * sizeof(*t)); + if (!t) { + perror("malloc"); + exit(EXIT_FAILURE); + } + if ((count++ & 15) == 0) { + fprintf(stderr, "\r%d ", count); + alarm(10); + } + for (i = 0; i < num_jobs; i++) { + t[i] = 1000; + if (hts_tpool_dispatch(p, q[qnum], job, &t[i]) < 0) { + perror("hts_tpool_dispatch"); + exit(EXIT_FAILURE); + } + } + hts_tpool_process_flush(q[qnum]); + hts_tpool_process_destroy(q[qnum]); + free(t); + q[qnum] = hts_tpool_process_init(p, 10, 1); + if (!q[qnum]) { + perror("hts_tpool_process_init"); + exit(EXIT_FAILURE); + } + + if (gettimeofday(&now, NULL) != 0) { + perror("gettimeofday"); + exit(EXIT_FAILURE); + } + } while (now.tv_sec < end.tv_sec + || (now.tv_sec == end.tv_sec && now.tv_usec < end.tv_usec)); + for (i = 0; i < n_proc; 
i++) { + hts_tpool_process_flush(q[i]); + hts_tpool_process_destroy(q[i]); + } + hts_tpool_destroy(p); + fprintf(stderr, "\n"); + + return EXIT_SUCCESS; +} diff --git a/test/xx#MD.sam b/test/xx#MD.sam new file mode 100644 index 000000000..7f70f370f --- /dev/null +++ b/test/xx#MD.sam @@ -0,0 +1,22 @@ +@SQ SN:zz LN:30 +@CO All MD and NM should match the stored values +a1 0 zz 6 1 10M * 0 0 AAAAATTTTT * co:Z:no fields +a2 0 zz 6 1 10M * 0 0 AAAAGGTTTT * +a3 0 zz 6 1 10M * 0 0 GAAAATTTTG * +i1 0 zz 6 1 5M1I5M * 0 0 AAAAAGTTTTT * +i2 0 zz 6 1 5M3I5M * 0 0 AAAAAGGGTTTTT * +i3 0 zz 6 1 10M2I * 0 0 AAAAATTTTTCC * +i4 0 zz 6 1 10M2P2I * 0 0 AAAAATTTTTCC * +d1 0 zz 6 1 5M10D5M * 0 0 AAAAACCCCC * +d2 0 zz 6 1 5M10N5M * 0 0 AAAAACCCCC * +sid 0 zz 6 1 1S4M10D5I4M1S * 0 0 AAAAAGGGGGCCCCC * +A1 0 zz 6 1 10M * 0 0 AAAAATTTTT * MD:Z:10 NM:i:0 co:Z:correct fields +A2 0 zz 6 1 10M * 0 0 AAAAGGTTTT * MD:Z:4A0T4 NM:i:2 +A3 0 zz 6 1 10M * 0 0 GAAAATTTTG * MD:Z:0A8T0 NM:i:2 +I1 0 zz 6 1 5M1I5M * 0 0 AAAAAGTTTTT * MD:Z:10 NM:i:1 +I2 0 zz 6 1 5M3I5M * 0 0 AAAAAGGGTTTTT * MD:Z:10 NM:i:3 +I3 0 zz 6 1 10M2I * 0 0 AAAAATTTTTCC * MD:Z:10 NM:i:2 +I4 0 zz 6 1 10M2P2I * 0 0 AAAAATTTTTCC * MD:Z:10 NM:i:2 +D1 0 zz 6 1 5M10D5M * 0 0 AAAAACCCCC * MD:Z:5^TTTTTTTTTT5 NM:i:10 +D2 0 zz 6 1 5M10N5M * 0 0 AAAAACCCCC * MD:Z:10 NM:i:0 +SID 0 zz 6 1 1S4M10D5I4M1S * 0 0 AAAAAGGGGGCCCCC * MD:Z:4^ATTTTTTTTT0T3 NM:i:16 diff --git a/test/xx#MD2.sam b/test/xx#MD2.sam new file mode 100644 index 000000000..b586b359b --- /dev/null +++ b/test/xx#MD2.sam @@ -0,0 +1,20 @@ +@SQ SN:zz LN:30 +@CO All MD and/or NM should differ to the stored values +a1 0 zz 6 1 10M * 0 0 AAAAATTTTT * MD:Z:9 NM:i:0 co:Z:MD incorrect fields +a2 0 zz 6 1 10M * 0 0 AAAAGGTTTT * MD:Z:4A0A4 NM:i:2 +a3 0 zz 6 1 10M * 0 0 GAAAATTTTG * MD:Z:0G8T0 NM:i:2 +i1 0 zz 6 1 5M1I5M * 0 0 AAAAAGTTTTT * MD:Z:11 NM:i:1 +i2 0 zz 6 1 5M3I5M * 0 0 AAAAAGGGTTTTT * MD:Z:1A1 NM:i:3 +i3 0 zz 6 1 10M2I * 0 0 AAAAATTTTTCC * MD:Z:12 NM:i:2 +d1 0 zz 6 1 5M10D5M * 0 0 
AAAAACCCCC * MD:Z:5^CTTTTTTTTT5 NM:i:10 +d2 0 zz 6 1 5M10N5M * 0 0 AAAAACCCCC * MD:Z:9 NM:i:0 +sid 0 zz 6 1 1S4M10D5I4M1S * 0 0 AAAAAGGGGGCCCCC * MD:Z:4^TTTTTTTTT0T3 NM:i:16 +A1 0 zz 6 1 10M * 0 0 AAAAATTTTT * MD:Z:10 NM:i:1 co:Z:NM incorrect fields +A2 0 zz 6 1 10M * 0 0 AAAAGGTTTT * MD:Z:4A0T4 NM:i:0 +A3 0 zz 6 1 10M * 0 0 GAAAATTTTG * MD:Z:0A8T0 NM:i:0 +I1 0 zz 6 1 5M1I5M * 0 0 AAAAAGTTTTT * MD:Z:10 NM:i:0 +I2 0 zz 6 1 5M3I5M * 0 0 AAAAAGGGTTTTT * MD:Z:10 NM:i:0 +I3 0 zz 6 1 10M2I * 0 0 AAAAATTTTTCC * MD:Z:10 NM:i:0 +D1 0 zz 6 1 5M10D5M * 0 0 AAAAACCCCC * MD:Z:5^TTTTTTTTTT5 NM:i:11 +D2 0 zz 6 1 5M10N5M * 0 0 AAAAACCCCC * MD:Z:10 NM:i:1 +SID 0 zz 6 1 1S4M10D5I4M1S * 0 0 AAAAAGGGGGCCCCC * MD:Z:4^ATTTTTTTTT0T3 NM:i:1 diff --git a/test/xx#blank.sam b/test/xx#blank.sam index e69de29bb..df026756b 100644 --- a/test/xx#blank.sam +++ b/test/xx#blank.sam @@ -0,0 +1 @@ +@CO No useful headers or records (0-length file is not considered SAM) diff --git a/test/xx.fa b/test/xx.fa index a233f7d92..faa1fb03d 100644 --- a/test/xx.fa +++ b/test/xx.fa @@ -2,4 +2,6 @@ AAAAAAAAAATTTTTTTTTT >yy AAAAAAAAAATTTTTTTTTT +>zz +AAAAAAAAAATTTTTTTTTTCCCCCCCCCC diff --git a/test/xx.fa.fai b/test/xx.fa.fai index 97b1a3b06..279eeec4b 100644 --- a/test/xx.fa.fai +++ b/test/xx.fa.fai @@ -1,2 +1,3 @@ xx 20 4 20 21 yy 20 29 20 21 +zz 30 54 30 31 diff --git a/textutils.c b/textutils.c index dc688b4a6..0c6db25d5 100644 --- a/textutils.c +++ b/textutils.c @@ -1,6 +1,6 @@ /* textutils.c -- non-bioinformatics utility routines for text etc. - Copyright (C) 2016 Genome Research Ltd. + Copyright (C) 2016, 2018-2019 Genome Research Ltd. Author: John Marshall @@ -22,6 +22,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include @@ -29,6 +30,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "htslib/hfile.h" #include "htslib/kstring.h" +#include "htslib/sam.h" // For stringify_argv() declaration #include "hts_internal.h" @@ -159,20 +161,21 @@ static char *sscan_string(char *s) } } -static void fscan_string(hFILE *fp, kstring_t *d) +static int fscan_string(hFILE *fp, kstring_t *d) { int c, d1, d2, d3, d4; + uint32_t e = 0; while ((c = hgetc(fp)) != EOF) switch (c) { case '\\': - if ((c = hgetc(fp)) == EOF) return; + if ((c = hgetc(fp)) == EOF) return e == 0 ? 0 : -1; switch (c) { - case 'b': kputc('\b', d); break; - case 'f': kputc('\f', d); break; - case 'n': kputc('\n', d); break; - case 'r': kputc('\r', d); break; - case 't': kputc('\t', d); break; - default: kputc(c, d); break; + case 'b': e |= kputc('\b', d) < 0; break; + case 'f': e |= kputc('\f', d) < 0; break; + case 'n': e |= kputc('\n', d) < 0; break; + case 'r': e |= kputc('\r', d) < 0; break; + case 't': e |= kputc('\t', d) < 0; break; + default: e |= kputc(c, d) < 0; break; case 'u': if ((c = hgetc(fp)) != EOF && (d1 = dehex(c)) >= 0 && (c = hgetc(fp)) != EOF && (d2 = dehex(c)) >= 0 && @@ -180,19 +183,20 @@ static void fscan_string(hFILE *fp, kstring_t *d) (c = hgetc(fp)) != EOF && (d4 = dehex(c)) >= 0) { char buf[8]; char *lim = encode_utf8(buf, d1 << 12 | d2 << 8 | d3 << 4 | d4); - kputsn(buf, lim - buf, d); + e |= kputsn(buf, lim - buf, d) < 0; } break; } break; case '"': - return; + return e == 0 ? 0 : -1; default: - kputc(c, d); + e |= kputc(c, d) < 0; break; } + return e == 0 ? 
0 : -1; } static char token_type(hts_json_token *token) @@ -215,22 +219,27 @@ static char token_type(hts_json_token *token) } } +HTSLIB_EXPORT hts_json_token * hts_json_alloc_token() { return calloc(1, sizeof(hts_json_token)); } +HTSLIB_EXPORT char hts_json_token_type(hts_json_token *token) { return token->type; } +HTSLIB_EXPORT void hts_json_free_token(hts_json_token *token) { free(token); } +HTSLIB_EXPORT char *hts_json_token_str(hts_json_token *token) { return token->str; } +HTSLIB_EXPORT char hts_json_snext(char *str, size_t *state, hts_json_token *token) { char *s = &str[*state >> 2]; @@ -280,6 +289,7 @@ char hts_json_snext(char *str, size_t *state, hts_json_token *token) #undef STATE } +HTSLIB_EXPORT char hts_json_fnext(struct hFILE *fp, hts_json_token *token, kstring_t *kstr) { char peek; @@ -377,6 +387,8 @@ static char snext(void *arg1, void *arg2, hts_json_token *token) { return hts_json_snext(arg1, arg2, token); } + +HTSLIB_EXPORT char hts_json_sskip_value(char *str, size_t *state, char type) { return skip_value(type, snext, str, state); @@ -386,6 +398,8 @@ static char fnext(void *arg1, void *arg2, hts_json_token *token) { return hts_json_fnext(arg1, token, arg2); } + +HTSLIB_EXPORT char hts_json_fskip_value(struct hFILE *fp, char type) { kstring_t str = { 0, 0, NULL }; @@ -393,3 +407,42 @@ char hts_json_fskip_value(struct hFILE *fp, char type) free(str.s); return ret; } + +/* + * A function to help with construction of CL tags in @PG records. + * Takes an argc, argv pair and returns a single space-separated string. + * This string should be deallocated by the calling function. 
+ * + * Returns malloced char * on success + * NULL on failure + */ +char *stringify_argv(int argc, char *argv[]) { + char *str, *cp; + size_t nbytes = 1; + int i, j; + + /* Allocate */ + for (i = 0; i < argc; i++) { + if (i > 0) nbytes += 1; + nbytes += strlen(argv[i]); + } + if (!(str = malloc(nbytes))) + return NULL; + + /* Copy */ + cp = str; + for (i = 0; i < argc; i++) { + if (i > 0) *cp++ = ' '; + j = 0; + while (argv[i][j]) { + if (argv[i][j] == '\t') + *cp++ = ' '; + else + *cp++ = argv[i][j]; + j++; + } + } + *cp++ = 0; + + return str; +} diff --git a/textutils_internal.h b/textutils_internal.h index 5edd5d293..b9c68ccc3 100644 --- a/textutils_internal.h +++ b/textutils_internal.h @@ -1,6 +1,6 @@ /* textutils_internal.h -- non-bioinformatics utility routines for text etc. - Copyright (C) 2016,2018 Genome Research Ltd. + Copyright (C) 2016,2018,2019 Genome Research Ltd. Author: John Marshall @@ -65,7 +65,7 @@ typedef struct hts_json_token hts_json_token; /// Allocate an empty JSON token structure, for use with hts_json_* functions /** @return An empty token on success; NULL on failure */ -hts_json_token * hts_json_alloc_token(); +hts_json_token *hts_json_alloc_token(void); /// Free a JSON token void hts_json_free_token(hts_json_token *token); @@ -125,6 +125,8 @@ or array. */ char hts_json_sskip_value(char *str, size_t *state, char type); +struct hFILE; + /// Read one JSON token from a file /** @param fp The file stream @param token On return, filled in with the token read @@ -148,7 +150,6 @@ or array. */ char hts_json_fskip_value(struct hFILE *fp, char type); - // The functions operate on ints such as are returned by fgetc(), // i.e., characters represented as unsigned-char-valued ints, or EOF. 
// To operate on plain chars (and to avoid warnings on some platforms), @@ -163,11 +164,132 @@ static inline int isdigit_c(char c) { return isdigit((unsigned char) c); } static inline int isgraph_c(char c) { return isgraph((unsigned char) c); } static inline int islower_c(char c) { return islower((unsigned char) c); } static inline int isprint_c(char c) { return isprint((unsigned char) c); } +static inline int ispunct_c(char c) { return ispunct((unsigned char) c); } static inline int isspace_c(char c) { return isspace((unsigned char) c); } static inline int isupper_c(char c) { return isupper((unsigned char) c); } +static inline int isxdigit_c(char c) { return isxdigit((unsigned char) c); } static inline char tolower_c(char c) { return tolower((unsigned char) c); } static inline char toupper_c(char c) { return toupper((unsigned char) c); } +// Faster replacements for strtol, for use when parsing lots of numbers. +// Note that these only handle base 10 and do not skip leading whitespace + +/// Convert a string to a signed integer, with overflow detection +/** @param[in] in Input string + @param[out] end Returned end pointer + @param[in] bits Bits available for the converted value + @param[out] failed Location of overflow flag + @return String value converted to an int64_t + +Converts a signed decimal string to an int64_t. The string should +consist of an optional '+' or '-' sign followed by one or more of +the digits 0 to 9. The output value will be limited to fit in the +given number of bits (including the sign bit). If the value is too big, +the largest possible value will be returned and *failed will be set to 1. + +The address of the first character following the converted number will +be stored in *end. + +Both end and failed must be non-NULL. 
+ */ +static inline int64_t hts_str2int(const char *in, char **end, int bits, + int *failed) { + uint64_t n = 0, limit = (1ULL << (bits - 1)) - 1; + uint32_t fast = (bits - 1) * 1000 / 3322 + 1; // log(10)/log(2) ~= 3.322 + const unsigned char *v = (const unsigned char *) in; + const unsigned int ascii_zero = '0'; // Prevents conversion to signed + unsigned char d; + int neg = 1; + + switch(*v) { + case '-': + neg=-1; + limit++; /* fall through */ + case '+': + v++; + break; + default: + break; + } + + while (--fast && *v>='0' && *v<='9') + n = n*10 + *v++ - ascii_zero; + + if (!fast) { + uint64_t limit_d_10 = limit / 10; + uint64_t limit_m_10 = limit - 10 * limit_d_10; + while ((d = *v - ascii_zero) < 10) { + if (n < limit_d_10 || (n == limit_d_10 && d <= limit_m_10)) { + n = n*10 + d; + v++; + } else { + do { v++; } while (*v - ascii_zero < 10); + n = limit; + *failed = 1; + break; + } + } + } + + *end = (char *)v; + + return (n && neg < 0) ? -((int64_t) (n - 1)) - 1 : n; +} + +/// Convert a string to an unsigned integer, with overflow detection +/** @param[in] in Input string + @param[out] end Returned end pointer + @param[in] bits Bits available for the converted value + @param[out] failed Location of overflow flag + @return String value converted to a uint64_t + +Converts an unsigned decimal string to a uint64_t. The string should +consist of an optional '+' sign followed by one or more of the digits 0 +to 9. The output value will be limited to fit in the given number of bits. +If the value is too big, the largest possible value will be returned +and *failed will be set to 1. + +The address of the first character following the converted number will +be stored in *end. + +Both end and failed must be non-NULL. + */ + +static inline uint64_t hts_str2uint(const char *in, char **end, int bits, + int *failed) { + uint64_t n = 0, limit = (bits < 64 ? 
(1ULL << bits) : 0) - 1; + const unsigned char *v = (const unsigned char *) in; + const unsigned int ascii_zero = '0'; // Prevents conversion to signed + uint32_t fast = bits * 1000 / 3322 + 1; // log(10)/log(2) ~= 3.322 + unsigned char d; + + if (*v == '+') + v++; + + while (--fast && *v>='0' && *v<='9') + n = n*10 + *v++ - ascii_zero; + + if (!fast) { + uint64_t limit_d_10 = limit / 10; + uint64_t limit_m_10 = limit - 10 * limit_d_10; + while ((d = *v - ascii_zero) < 10) { + if (n < limit_d_10 || (n == limit_d_10 && d <= limit_m_10)) { + n = n*10 + d; + v++; + } else { + do { v++; } while (*v - ascii_zero < 10); + n = limit; + *failed = 1; + break; + } + } + } + + *end = (char *)v; + return n; +} + + #ifdef __cplusplus } #endif diff --git a/thread_pool.c b/thread_pool.c index 53f2704b9..bbb2d14fe 100644 --- a/thread_pool.c +++ b/thread_pool.c @@ -1,6 +1,6 @@ /* thread_pool.c -- A pool of generic worker threads - Copyright (c) 2013-2017 Genome Research Ltd. + Copyright (c) 2013-2019 Genome Research Ltd. Author: James Bonfield @@ -23,6 +23,7 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #ifndef TEST_MAIN +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #endif @@ -40,6 +41,9 @@ DEALINGS IN THE SOFTWARE. */ #include "thread_pool_internal.h" +static void hts_tpool_process_detach_locked(hts_tpool *p, + hts_tpool_process *q); + //#define DEBUG #ifdef DEBUG @@ -96,11 +100,15 @@ static int hts_tpool_add_result(hts_tpool_job *j, void *data) { return 0; } - if (!(r = malloc(sizeof(*r)))) + if (!(r = malloc(sizeof(*r)))) { + pthread_mutex_unlock(&q->p->pool_m); + hts_tpool_process_shutdown(q); return -1; + } r->next = NULL; r->data = data; + r->result_cleanup = j->result_cleanup; r->serial = j->serial; q->n_output++; @@ -298,13 +306,17 @@ int hts_tpool_process_sz(hts_tpool_process *q) { * This sets the shutdown flag and wakes any threads waiting on process * condition variables. 
*/ -void hts_tpool_process_shutdown(hts_tpool_process *q) { - pthread_mutex_lock(&q->p->pool_m); +static void hts_tpool_process_shutdown_locked(hts_tpool_process *q) { q->shutdown = 1; pthread_cond_broadcast(&q->output_avail_c); pthread_cond_broadcast(&q->input_not_full_c); pthread_cond_broadcast(&q->input_empty_c); pthread_cond_broadcast(&q->none_processing_c); +} + +void hts_tpool_process_shutdown(hts_tpool_process *q) { + pthread_mutex_lock(&q->p->pool_m); + hts_tpool_process_shutdown_locked(q); pthread_mutex_unlock(&q->p->pool_m); } @@ -355,6 +367,7 @@ hts_tpool_process *hts_tpool_process_init(hts_tpool *p, int qsize, int in_only) q->output_tail = NULL; q->next_serial = 0; q->curr_serial = 0; + q->no_more_input = 0; q->n_input = 0; q->n_output = 0; q->n_processing= 0; @@ -381,11 +394,19 @@ void hts_tpool_process_destroy(hts_tpool_process *q) { if (!q) return; + // Prevent dispatch from queuing up any more jobs. + // We want to reset (and flush) the queue here, before + // we set the shutdown flag, but we need to avoid races + // with queue more input during reset. 
+ pthread_mutex_lock(&q->p->pool_m); + q->no_more_input = 1; + pthread_mutex_unlock(&q->p->pool_m); + // Ensure it's fully drained before destroying the queue hts_tpool_process_reset(q, 0); pthread_mutex_lock(&q->p->pool_m); - hts_tpool_process_detach(q->p, q); - hts_tpool_process_shutdown(q); + hts_tpool_process_detach_locked(q->p, q); + hts_tpool_process_shutdown_locked(q); // Maybe a worker is scanning this queue, so delay destruction if (--q->ref_count > 0) { @@ -429,10 +450,10 @@ void hts_tpool_process_attach(hts_tpool *p, hts_tpool_process *q) { pthread_mutex_unlock(&p->pool_m); } -void hts_tpool_process_detach(hts_tpool *p, hts_tpool_process *q) { - pthread_mutex_lock(&p->pool_m); +static void hts_tpool_process_detach_locked(hts_tpool *p, + hts_tpool_process *q) { if (!p->q_head || !q->prev || !q->next) - goto done; + return; hts_tpool_process *curr = p->q_head, *first = curr; do { @@ -450,8 +471,11 @@ void hts_tpool_process_detach(hts_tpool *p, hts_tpool_process *q) { curr = curr->next; } while (curr != first); +} - done: +void hts_tpool_process_detach(hts_tpool *p, hts_tpool_process *q) { + pthread_mutex_lock(&p->pool_m); + hts_tpool_process_detach_locked(p, q); pthread_mutex_unlock(&p->pool_m); } @@ -477,18 +501,15 @@ static void *tpool_worker(void *arg) { hts_tpool *p = w->p; hts_tpool_job *j; - for (;;) { + pthread_mutex_lock(&p->pool_m); + while (!p->shutdown) { // Pop an item off the pool queue - pthread_mutex_lock(&p->pool_m); assert(p->q_head == 0 || (p->q_head->prev && p->q_head->next)); int work_to_do = 0; hts_tpool_process *first = p->q_head, *q = first; do { - if (p->shutdown) - break; - // Iterate over queues, finding one with jobs and also // room to put the result. 
//if (q && q->input_head && !hts_tpool_process_output_full(q)) { @@ -501,15 +522,6 @@ static void *tpool_worker(void *arg) { if (q) q = q->next; } while (q && q != first); - if (p->shutdown) { - shutdown: -#ifdef DEBUG - fprintf(stderr, "%d: Shutting down\n", worker_id(p)); -#endif - pthread_mutex_unlock(&p->pool_m); - return NULL; - } - if (!work_to_do) { // We scanned all queues and cannot process any, so we wait. p->nwaiting++; @@ -536,8 +548,7 @@ static void *tpool_worker(void *arg) { } p->nwaiting--; - pthread_mutex_unlock(&p->pool_m); - continue; // To outer for(;;) loop. + continue; // To outer loop. } // Otherwise work_to_do, so process as many items in this queue as @@ -574,21 +585,35 @@ static void *tpool_worker(void *arg) { DBG_OUT(stderr, "%d: Processing queue %p, serial %"PRId64"\n", worker_id(j->p), q, j->serial); - hts_tpool_add_result(j, j->func(j->arg)); + if (hts_tpool_add_result(j, j->func(j->arg)) < 0) + goto err; //memset(j, 0xbb, sizeof(*j)); free(j); pthread_mutex_lock(&p->pool_m); } - if (--q->ref_count == 0) // we were the last user + if (--q->ref_count == 0) { // we were the last user hts_tpool_process_destroy(q); - else + } else { // Out of jobs on this queue, so restart search from next one. // This is equivalent to "work-stealing". 
- p->q_head = q->next; - - pthread_mutex_unlock(&p->pool_m); + if (p->q_head) + p->q_head = p->q_head->next; + } } + + shutdown: + pthread_mutex_unlock(&p->pool_m); +#ifdef DEBUG + fprintf(stderr, "%d: Shutting down\n", worker_id(p)); +#endif + return NULL; + + err: +#ifdef DEBUG + fprintf(stderr, "%d: Failed to add result\n", worker_id(p)); +#endif + return NULL; } static void wake_next_worker(hts_tpool_process *q, int locked) { @@ -718,7 +743,7 @@ int hts_tpool_size(hts_tpool *p) { */ int hts_tpool_dispatch(hts_tpool *p, hts_tpool_process *q, void *(*func)(void *arg), void *arg) { - return hts_tpool_dispatch2(p, q, func, arg, 0); + return hts_tpool_dispatch3(p, q, func, arg, NULL, NULL, 0); } /* @@ -729,7 +754,15 @@ int hts_tpool_dispatch(hts_tpool *p, hts_tpool_process *q, * nonblock -1 => add task regardless of whether queue is full (over-size) */ int hts_tpool_dispatch2(hts_tpool *p, hts_tpool_process *q, - void *(*func)(void *arg), void *arg, int nonblock) { + void *(*func)(void *arg), void *arg, int nonblock) { + return hts_tpool_dispatch3(p, q, func, arg, NULL, NULL, nonblock); +} + +int hts_tpool_dispatch3(hts_tpool *p, hts_tpool_process *q, + void *(*exec_func)(void *arg), void *arg, + void (*job_cleanup)(void *arg), + void (*result_cleanup)(void *data), + int nonblock) { hts_tpool_job *j; pthread_mutex_lock(&p->pool_m); @@ -737,7 +770,7 @@ int hts_tpool_dispatch2(hts_tpool *p, hts_tpool_process *q, DBG_OUT(stderr, "Dispatching job for queue %p, serial %"PRId64"\n", q, q->curr_serial); - if (q->n_input >= q->qsize && nonblock == 1) { + if ((q->no_more_input || q->n_input >= q->qsize) && nonblock == 1) { pthread_mutex_unlock(&p->pool_m); errno = EAGAIN; return -1; @@ -747,17 +780,21 @@ int hts_tpool_dispatch2(hts_tpool *p, hts_tpool_process *q, pthread_mutex_unlock(&p->pool_m); return -1; } - j->func = func; + j->func = exec_func; j->arg = arg; + j->job_cleanup = job_cleanup; + j->result_cleanup = result_cleanup; j->next = NULL; j->p = p; j->q = q; 
j->serial = q->curr_serial++; if (nonblock == 0) { - while (q->n_input >= q->qsize && !q->shutdown && !q->wake_dispatch) + while ((q->no_more_input || q->n_input >= q->qsize) && + !q->shutdown && !q->wake_dispatch) { pthread_cond_wait(&q->input_not_full_c, &q->p->pool_m); - if (q->shutdown) { + } + if (q->no_more_input || q->shutdown) { free(j); pthread_mutex_unlock(&p->pool_m); return -1; @@ -817,39 +854,39 @@ void hts_tpool_wake_dispatch(hts_tpool_process *q) { * -1 on failure */ int hts_tpool_process_flush(hts_tpool_process *q) { - int i; - hts_tpool *p = q->p; - - DBG_OUT(stderr, "Flushing pool %p\n", p); - - // Drains the queue - pthread_mutex_lock(&p->pool_m); - - // Wake up everything for the final sprint! - for (i = 0; i < p->tsize; i++) - if (p->t_stack[i]) - pthread_cond_signal(&p->t[i].pending_c); - - // Ensure there is room for the final sprint. - // Shouldn't be possible to get here, but just incase. - if (q->qsize < q->n_output + q->n_input + q->n_processing) - q->qsize = q->n_output + q->n_input + q->n_processing; - - // Wait for n_input and n_processing to hit zero. - while (q->n_input || q->n_processing) { - while (q->n_input) - pthread_cond_wait(&q->input_empty_c, &p->pool_m); - if (q->shutdown) break; - while (q->n_processing) - pthread_cond_wait(&q->none_processing_c, &p->pool_m); - if (q->shutdown) break; + int i; + hts_tpool *p = q->p; + + DBG_OUT(stderr, "Flushing pool %p\n", p); + + // Drains the queue + pthread_mutex_lock(&p->pool_m); + + // Wake up everything for the final sprint! + for (i = 0; i < p->tsize; i++) + if (p->t_stack[i]) + pthread_cond_signal(&p->t[i].pending_c); + + // Ensure there is room for the final sprint. + // Shouldn't be possible to get here, but just incase. + if (q->qsize < q->n_output + q->n_input + q->n_processing) + q->qsize = q->n_output + q->n_input + q->n_processing; + + // Wait for n_input and n_processing to hit zero. 
+ while (q->n_input || q->n_processing) { + while (q->n_input) + pthread_cond_wait(&q->input_empty_c, &p->pool_m); + if (q->shutdown) break; + while (q->n_processing) + pthread_cond_wait(&q->none_processing_c, &p->pool_m); + if (q->shutdown) break; } - pthread_mutex_unlock(&p->pool_m); + pthread_mutex_unlock(&p->pool_m); - DBG_OUT(stderr, "Flushed complete for pool %p, queue %p\n", p, q); + DBG_OUT(stderr, "Flushed complete for pool %p, queue %p\n", p, q); - return 0; + return 0; } /* @@ -865,30 +902,42 @@ int hts_tpool_process_flush(hts_tpool_process *q) { * -1 on failure */ int hts_tpool_process_reset(hts_tpool_process *q, int free_results) { + hts_tpool_job *j, *jn, *j_head; + hts_tpool_result *r, *rn, *r_head; + pthread_mutex_lock(&q->p->pool_m); // prevent next_result from returning data during our flush q->next_serial = INT_MAX; - // Purge any queued input not yet being acted upon - hts_tpool_job *j, *jn; - for (j = q->input_head; j; j = jn) { + // Remove any queued input not yet being acted upon + j_head = q->input_head; + q->input_head = q->input_tail = NULL; + q->n_input = 0; + + // Remove any queued output, thus ensuring we have room to flush. + r_head = q->output_head; + q->output_head = q->output_tail = NULL; + q->n_output = 0; + pthread_mutex_unlock(&q->p->pool_m); + + // Release memory. This can be done unlocked now the lists have been + // removed from the queue + for (j = j_head; j; j = jn) { //fprintf(stderr, "Discard input %d\n", j->serial); jn = j->next; + if (j->job_cleanup) j->job_cleanup(j->arg); free(j); } - q->input_head = q->input_tail = NULL; - q->n_input = 0; - // Purge any queued output, thus ensuring we have room to flush. 
- hts_tpool_result *r, *rn; - for (r = q->output_head; r; r = rn) { + for (r = r_head; r; r = rn) { //fprintf(stderr, "Discard output %d\n", r->serial); rn = r->next; + if (r->result_cleanup) { + r->result_cleanup(r->data); + r->data = NULL; + } hts_tpool_delete_result(r, free_results); } - q->output_head = q->output_tail = NULL; - q->n_output = 0; - pthread_mutex_unlock(&q->p->pool_m); // Wait for any jobs being processed to complete. // (TODO: consider how to cancel any currently processing jobs. @@ -896,13 +945,9 @@ int hts_tpool_process_reset(hts_tpool_process *q, int free_results) { if (hts_tpool_process_flush(q) != 0) return -1; - // Discard any new output. + // Remove any new output. pthread_mutex_lock(&q->p->pool_m); - for (r = q->output_head; r; r = rn) { - //fprintf(stderr, "Discard output %d\n", r->serial); - rn = r->next; - hts_tpool_delete_result(r, free_results); - } + r_head = q->output_head; q->output_head = q->output_tail = NULL; q->n_output = 0; @@ -911,6 +956,17 @@ int hts_tpool_process_reset(hts_tpool_process *q, int free_results) { pthread_cond_signal(&q->input_not_full_c); pthread_mutex_unlock(&q->p->pool_m); + // Discard unwanted output + for (r = r_head; r; r = rn) { + //fprintf(stderr, "Discard output %d\n", r->serial); + rn = r->next; + if (r->result_cleanup) { + r->result_cleanup(r->data); + r->data = NULL; + } + hts_tpool_delete_result(r, free_results); + } + return 0; } diff --git a/thread_pool_internal.h b/thread_pool_internal.h index 8ecbbad93..c56061450 100644 --- a/thread_pool_internal.h +++ b/thread_pool_internal.h @@ -56,6 +56,8 @@ extern "C" { typedef struct hts_tpool_job { void *(*func)(void *arg); void *arg; + void (*job_cleanup)(void *arg); + void (*result_cleanup)(void *data); struct hts_tpool_job *next; struct hts_tpool *p; @@ -68,6 +70,7 @@ typedef struct hts_tpool_job { */ struct hts_tpool_result { struct hts_tpool_result *next; + void (*result_cleanup)(void *data); uint64_t serial; // sequential number for ordering void 
*data; // result itself }; @@ -104,6 +107,7 @@ struct hts_tpool_process { uint64_t next_serial; // next serial for output uint64_t curr_serial; // current serial (next input) + int no_more_input; // disable dispatching of more jobs int n_input; // no. items in input queue; was njobs int n_output; // no. items in output queue int n_processing; // no. items being processed (executing) diff --git a/vcf.5 b/vcf.5 index 47e833ac0..35d60c1f3 100644 --- a/vcf.5 +++ b/vcf.5 @@ -4,7 +4,7 @@ vcf \- Variant Call Format .\" .\" Copyright (C) 2011 Broad Institute. -.\" Copyright (C) 2013 Genome Research Ltd. +.\" Copyright (C) 2013-2014 Genome Research Ltd. .\" .\" Author: Heng Li .\" diff --git a/vcf.c b/vcf.c index b63325794..059485948 100644 --- a/vcf.c +++ b/vcf.c @@ -1,7 +1,7 @@ /* vcf.c -- VCF/BCF API functions. Copyright (C) 2012, 2013 Broad Institute. - Copyright (C) 2012-2017 Genome Research Ltd. + Copyright (C) 2012-2019 Genome Research Ltd. Portions copyright (C) 2014 Intel Corporation. Author: Heng Li @@ -24,6 +24,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include @@ -33,6 +34,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include #include "htslib/vcf.h" @@ -43,16 +45,22 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "htslib/hts_endian.h" #include "htslib/khash_str2int.h" #include "htslib/kstring.h" +#include "htslib/sam.h" #include "htslib/khash.h" KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t) typedef khash_t(vdict) vdict_t; #include "htslib/kseq.h" - +HTSLIB_EXPORT uint32_t bcf_float_missing = 0x7F800001; + +HTSLIB_EXPORT uint32_t bcf_float_vector_end = 0x7F800002; + +HTSLIB_EXPORT uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 }; static const char *dump_char(char *buffer, char c) @@ -86,38 +94,59 @@ static char *find_chrom_header_line(char *s) *** VCF header parser *** *************************/ -int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s) +static int bcf_hdr_add_sample_len(bcf_hdr_t *h, const char *s, size_t len) { if ( !s ) return 0; + if (len == 0) len = strlen(s); const char *ss = s; - while ( !*ss && isspace_c(*ss) ) ss++; - if ( !*ss ) + while ( *ss && isspace_c(*ss) && ss - s < len) ss++; + if ( !*ss || ss - s == len) { hts_log_error("Empty sample name: trailing spaces/tabs in the header line?"); - abort(); + return -1; } vdict_t *d = (vdict_t*)h->dict[BCF_DT_SAMPLE]; int ret; - char *sdup = strdup(s); + char *sdup = malloc(len + 1); + if (!sdup) return -1; + memcpy(sdup, s, len); + sdup[len] = 0; + + // Ensure space is available in h->samples + size_t n = kh_size(d); + char **new_samples = realloc(h->samples, sizeof(char*) * (n + 1)); + if (!new_samples) { + free(sdup); + return -1; + } + h->samples = new_samples; + int k = kh_put(vdict, d, sdup, &ret); + if (ret < 0) { + free(sdup); + return -1; + } if (ret) { // absent kh_val(d, k) = bcf_idinfo_def; - kh_val(d, k).id = kh_size(d) - 1; + kh_val(d, k).id = n; } else { hts_log_error("Duplicated sample name '%s'", s); free(sdup); return -1; } - int n = kh_size(d); - h->samples = (char**) realloc(h->samples,sizeof(char*)*n); - h->samples[n-1] = sdup; + h->samples[n] = sdup; 
h->dirty = 1; return 0; } -int bcf_hdr_parse_sample_line(bcf_hdr_t *h, const char *str) +int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s) +{ + return bcf_hdr_add_sample_len(h, s, 0); +} + +int HTS_RESULT_USED bcf_hdr_parse_sample_line(bcf_hdr_t *h, const char *str) { int ret = 0; int i = 0; @@ -126,16 +155,12 @@ int bcf_hdr_parse_sample_line(bcf_hdr_t *h, const char *str) for (p = q = str;; ++q) { if (*q != '\t' && *q != 0 && *q != '\n') continue; if (++i > 9) { - char *s = (char*)malloc(q - p + 1); - strncpy(s, p, q - p); - s[q - p] = 0; - if ( bcf_hdr_add_sample(h,s) < 0 ) ret = -1; - free(s); + if ( bcf_hdr_add_sample_len(h, p, q - p) < 0 ) ret = -1; } - if (*q == 0 || *q == '\n') break; + if (*q == 0 || *q == '\n' || ret < 0) break; p = q + 1; } - bcf_hdr_add_sample(h,NULL); + return ret; } @@ -148,9 +173,12 @@ int bcf_hdr_sync(bcf_hdr_t *h) khint_t k; if ( h->n[i] < kh_size(d) ) { + bcf_idpair_t *new_idpair; // this should be true only for i=2, BCF_DT_SAMPLE + new_idpair = (bcf_idpair_t*) realloc(h->id[i], kh_size(d)*sizeof(bcf_idpair_t)); + if (!new_idpair) return -1; h->n[i] = kh_size(d); - h->id[i] = (bcf_idpair_t*) realloc(h->id[i], kh_size(d)*sizeof(bcf_idpair_t)); + h->id[i] = new_idpair; } for (k=kh_begin(d); kkey); if ( hrec->value ) free(hrec->value); int i; @@ -181,23 +210,47 @@ void bcf_hrec_destroy(bcf_hrec_t *hrec) // Copies all fields except IDX. 
bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec) { + int save_errno; bcf_hrec_t *out = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t)); + if (!out) return NULL; + out->type = hrec->type; - if ( hrec->key ) out->key = strdup(hrec->key); - if ( hrec->value ) out->value = strdup(hrec->value); + if ( hrec->key ) { + out->key = strdup(hrec->key); + if (!out->key) goto fail; + } + if ( hrec->value ) { + out->value = strdup(hrec->value); + if (!out->value) goto fail; + } out->nkeys = hrec->nkeys; out->keys = (char**) malloc(sizeof(char*)*hrec->nkeys); + if (!out->keys) goto fail; out->vals = (char**) malloc(sizeof(char*)*hrec->nkeys); + if (!out->vals) goto fail; int i, j = 0; for (i=0; inkeys; i++) { if ( hrec->keys[i] && !strcmp("IDX",hrec->keys[i]) ) continue; - if ( hrec->keys[i] ) out->keys[j] = strdup(hrec->keys[i]); - if ( hrec->vals[i] ) out->vals[j] = strdup(hrec->vals[i]); + if ( hrec->keys[i] ) { + out->keys[j] = strdup(hrec->keys[i]); + if (!out->keys[j]) goto fail; + } + if ( hrec->vals[i] ) { + out->vals[j] = strdup(hrec->vals[i]); + if (!out->vals[j]) goto fail; + } j++; } if ( i!=j ) out->nkeys -= i-j; // IDX was omitted return out; + + fail: + save_errno = errno; + hts_log_error("%s", strerror(errno)); + bcf_hrec_destroy(out); + errno = save_errno; + return NULL; } void bcf_hrec_debug(FILE *fp, bcf_hrec_t *hrec) @@ -227,25 +280,42 @@ void bcf_header_debug(bcf_hdr_t *hdr) } } -void bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, int len) +int bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, size_t len) { - int n = ++hrec->nkeys; - hrec->keys = (char**) realloc(hrec->keys, sizeof(char*)*n); - hrec->vals = (char**) realloc(hrec->vals, sizeof(char*)*n); - assert( len ); - hrec->keys[n-1] = (char*) malloc((len+1)*sizeof(char)); - memcpy(hrec->keys[n-1],str,len); - hrec->keys[n-1][len] = 0; - hrec->vals[n-1] = NULL; + char **tmp; + size_t n = hrec->nkeys + 1; + assert(len > 0 && len < SIZE_MAX); + tmp = realloc(hrec->keys, sizeof(char*)*n); + if (!tmp) return -1; + 
hrec->keys = tmp; + tmp = realloc(hrec->vals, sizeof(char*)*n); + if (!tmp) return -1; + hrec->vals = tmp; + + hrec->keys[hrec->nkeys] = (char*) malloc((len+1)*sizeof(char)); + if (!hrec->keys[hrec->nkeys]) return -1; + memcpy(hrec->keys[hrec->nkeys],str,len); + hrec->keys[hrec->nkeys][len] = 0; + hrec->vals[hrec->nkeys] = NULL; + hrec->nkeys = n; + return 0; } -void bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, int len, int is_quoted) +int bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, size_t len, int is_quoted) { - if ( !str ) { hrec->vals[i] = NULL; return; } - if ( hrec->vals[i] ) free(hrec->vals[i]); + if ( hrec->vals[i] ) { + free(hrec->vals[i]); + hrec->vals[i] = NULL; + } + if ( !str ) return 0; if ( is_quoted ) { + if (len >= SIZE_MAX - 3) { + errno = ENOMEM; + return -1; + } hrec->vals[i] = (char*) malloc((len+3)*sizeof(char)); + if (!hrec->vals[i]) return -1; hrec->vals[i][0] = '"'; memcpy(&hrec->vals[i][1],str,len); hrec->vals[i][len+1] = '"'; @@ -253,21 +323,40 @@ void bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, int len, int is_ } else { + if (len == SIZE_MAX) { + errno = ENOMEM; + return -1; + } hrec->vals[i] = (char*) malloc((len+1)*sizeof(char)); + if (!hrec->vals[i]) return -1; memcpy(hrec->vals[i],str,len); hrec->vals[i][len] = 0; } + return 0; } -void hrec_add_idx(bcf_hrec_t *hrec, int idx) +int hrec_add_idx(bcf_hrec_t *hrec, int idx) { - int n = ++hrec->nkeys; - hrec->keys = (char**) realloc(hrec->keys, sizeof(char*)*n); - hrec->vals = (char**) realloc(hrec->vals, sizeof(char*)*n); - hrec->keys[n-1] = strdup("IDX"); + int n = hrec->nkeys + 1; + char **tmp = (char**) realloc(hrec->keys, sizeof(char*)*n); + if (!tmp) return -1; + hrec->keys = tmp; + + tmp = (char**) realloc(hrec->vals, sizeof(char*)*n); + if (!tmp) return -1; + hrec->vals = tmp; + + hrec->keys[hrec->nkeys] = strdup("IDX"); + if (!hrec->keys[hrec->nkeys]) return -1; + kstring_t str = {0,0,0}; - kputw(idx, &str); - hrec->vals[n-1] = str.s; + 
if (kputw(idx, &str) < 0) { + free(hrec->keys[hrec->nkeys]); + return -1; + } + hrec->vals[hrec->nkeys] = str.s; + hrec->nkeys = n; + return 0; } int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key) @@ -293,11 +382,13 @@ bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len) const char *q = p; while ( *q && *q!='=' && *q != '\n' ) q++; - int n = q-p; + ptrdiff_t n = q-p; if ( *q!='=' || !n ) { *len = q-line+1; return NULL; } // wrong format bcf_hrec_t *hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t)); + if (!hrec) return NULL; hrec->key = (char*) malloc(sizeof(char)*(n+1)); + if (!hrec->key) goto fail; memcpy(hrec->key,p,n); hrec->key[n] = 0; @@ -306,6 +397,7 @@ bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len) { while ( *q && *q!='\n' ) q++; hrec->value = (char*) malloc((q-p+1)*sizeof(char)); + if (!hrec->value) goto fail; memcpy(hrec->value, p, q-p); hrec->value[q-p] = 0; *len = q - line + (*q ? 1 : 0); // Skip \n but not \0 @@ -339,7 +431,7 @@ bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len) bcf_hrec_destroy(hrec); return NULL; } - bcf_hrec_add_key(hrec, p, q-p-m); + if (bcf_hrec_add_key(hrec, p, q-p-m) < 0) goto fail; p = ++q; while ( *q && *q==' ' ) { p++; q++; } int quoted = *p=='"' ? 1 : 0; @@ -358,7 +450,8 @@ bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len) } const char *r = q; while ( r > p && r[-1] == ' ' ) r--; - bcf_hrec_set_val(hrec, hrec->nkeys-1, p, r-p, quoted); + if (bcf_hrec_set_val(hrec, hrec->nkeys-1, p, r-p, quoted) < 0) + goto fail; if ( quoted && *q=='"' ) q++; if ( *q=='>' ) { nopen--; q++; } } @@ -373,55 +466,80 @@ bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len) *len = q - line + (*q ? 
1 : 0); return hrec; + + fail: + bcf_hrec_destroy(hrec); + return NULL; } static int bcf_hdr_set_idx(bcf_hdr_t *hdr, int dict_type, const char *tag, bcf_idinfo_t *idinfo) { + size_t new_n; + // If available, preserve existing IDX if ( idinfo->id==-1 ) - idinfo->id = hdr->n[dict_type]++; + idinfo->id = hdr->n[dict_type]; else if ( idinfo->id < hdr->n[dict_type] && hdr->id[dict_type][idinfo->id].key ) { hts_log_error("Conflicting IDX=%d lines in the header dictionary, the new tag is %s", idinfo->id, tag); - exit(1); + errno = EINVAL; + return -1; } - if ( idinfo->id >= hdr->n[dict_type] ) hdr->n[dict_type] = idinfo->id+1; - hts_expand0(bcf_idpair_t,hdr->n[dict_type],hdr->m[dict_type],hdr->id[dict_type]); + new_n = idinfo->id >= hdr->n[dict_type] ? idinfo->id+1 : hdr->n[dict_type]; + if (hts_resize(bcf_idpair_t, new_n, &hdr->m[dict_type], + &hdr->id[dict_type], HTS_RESIZE_CLEAR)) { + return -1; + } + hdr->n[dict_type] = new_n; // NB: the next kh_put call can invalidate the idinfo pointer, therefore - // we leave it unassigned here. It myst be set explicitly in bcf_hdr_sync. + // we leave it unassigned here. It must be set explicitly in bcf_hdr_sync. 
hdr->id[dict_type][idinfo->id].key = tag; return 0; } -// returns: 1 when hdr needs to be synced, 0 otherwise -int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) +// returns: 1 when hdr needs to be synced, -1 on error, 0 otherwise +static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) { // contig - int i,j, ret; + int i, ret, replacing = 0; khint_t k; char *str; + if ( !strcmp(hrec->key, "contig") ) { + hts_pos_t len = 0; hrec->type = BCF_HL_CTG; // Get the contig ID ($str) and length ($j) i = bcf_hrec_find_key(hrec,"length"); - if ( i<0 ) j = 0; - else if ( sscanf(hrec->vals[i],"%d",&j)!=1 ) return 0; + if ( i<0 ) len = 0; + else { + char *end = hrec->vals[i]; + len = strtoll(hrec->vals[i], &end, 10); + if (end == hrec->vals[i] || len < 0) return 0; + } i = bcf_hrec_find_key(hrec,"ID"); if ( i<0 ) return 0; str = strdup(hrec->vals[i]); + if (!str) return -1; // Register in the dictionary vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_CTG]; khint_t k = kh_get(vdict, d, str); - if ( k != kh_end(d) ) { free(str); return 0; } // already present - k = kh_put(vdict, d, str, &ret); + if ( k != kh_end(d) ) { // already present + free(str); + if (kh_val(d, k).hrec[0] != NULL) // and not removed + return 0; + replacing = 1; + } else { + k = kh_put(vdict, d, str, &ret); + if (ret < 0) { free(str); return -1; } + } int idx = bcf_hrec_find_key(hrec,"IDX"); if ( idx!=-1 ) @@ -430,6 +548,10 @@ int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) idx = strtol(hrec->vals[idx], &tmp, 10); if ( *tmp || idx < 0 || idx >= INT_MAX - 1) { + if (!replacing) { + kh_del(vdict, d, k); + free(str); + } hts_log_warning("Error parsing the IDX tag, skipping"); return 0; } @@ -437,10 +559,20 @@ int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) kh_val(d, k) = bcf_idinfo_def; kh_val(d, k).id = idx; - kh_val(d, k).info[0] = j; + kh_val(d, k).info[0] = len; kh_val(d, k).hrec[0] = hrec; - bcf_hdr_set_idx(hdr, BCF_DT_CTG, kh_key(d,k), &kh_val(d,k)); - if ( idx==-1 ) 
hrec_add_idx(hrec, kh_val(d,k).id); + if (bcf_hdr_set_idx(hdr, BCF_DT_CTG, kh_key(d,k), &kh_val(d,k)) < 0) { + if (!replacing) { + kh_del(vdict, d, k); + free(str); + } + return -1; + } + if ( idx==-1 ) { + if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) { + return -1; + } + } return 1; } @@ -514,6 +646,7 @@ int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) if ( !id ) return 0; str = strdup(id); + if (!str) return -1; vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_ID]; k = kh_get(vdict, d, str); @@ -524,26 +657,45 @@ int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) if ( kh_val(d, k).hrec[info&0xf] ) return 0; kh_val(d, k).info[info&0xf] = info; kh_val(d, k).hrec[info&0xf] = hrec; - if ( idx==-1 ) hrec_add_idx(hrec, kh_val(d, k).id); + if ( idx==-1 ) { + if (hrec_add_idx(hrec, kh_val(d, k).id) < 0) { + return -1; + } + } return 1; } k = kh_put(vdict, d, str, &ret); + if (ret < 0) { + free(str); + return -1; + } kh_val(d, k) = bcf_idinfo_def; kh_val(d, k).info[info&0xf] = info; kh_val(d, k).hrec[info&0xf] = hrec; kh_val(d, k).id = idx; - bcf_hdr_set_idx(hdr, BCF_DT_ID, kh_key(d,k), &kh_val(d,k)); - if ( idx==-1 ) hrec_add_idx(hrec, kh_val(d,k).id); + if (bcf_hdr_set_idx(hdr, BCF_DT_ID, kh_key(d,k), &kh_val(d,k)) < 0) { + kh_del(vdict, d, k); + free(str); + return -1; + } + if ( idx==-1 ) { + if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) { + return -1; + } + } return 1; } int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) { + int res; if ( !hrec ) return 0; hrec->type = BCF_HL_GEN; - if ( !bcf_hdr_register_hrec(hdr,hrec) ) + res = bcf_hdr_register_hrec(hdr,hrec); + if (res < 0) return -1; + if ( !res ) { // If one of the hashed field, then it is already present if ( hrec->type != BCF_HL_GEN ) @@ -568,10 +720,13 @@ int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) } // New record, needs to be added - int n = ++hdr->nhrec; - hdr->hrec = (bcf_hrec_t**) realloc(hdr->hrec, n*sizeof(bcf_hrec_t*)); - hdr->hrec[n-1] = hrec; + int n = hdr->nhrec + 1; + 
bcf_hrec_t **new_hrec = realloc(hdr->hrec, n*sizeof(bcf_hrec_t*)); + if (!new_hrec) return -1; + hdr->hrec = new_hrec; + hdr->hrec[hdr->nhrec] = hrec; hdr->dirty = 1; + hdr->nhrec = n; return hrec->type==BCF_HL_GEN ? 0 : 1; } @@ -638,23 +793,32 @@ void bcf_hdr_check_sanity(bcf_hdr_t *hdr) int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt) { - int len, needs_sync = 0, done = 0; + int len, done = 0; char *p = htxt; // Check sanity: "fileformat" string must come as first bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,p,&len); if ( !hrec || !hrec->key || strcasecmp(hrec->key,"fileformat") ) hts_log_warning("The first line should be ##fileformat; is the VCF/BCF header broken?"); - needs_sync += bcf_hdr_add_hrec(hdr, hrec); + if (bcf_hdr_add_hrec(hdr, hrec) < 0) { + bcf_hrec_destroy(hrec); + return -1; + } // The filter PASS must appear first in the dictionary hrec = bcf_hdr_parse_line(hdr,"##FILTER=",&len); - needs_sync += bcf_hdr_add_hrec(hdr, hrec); + if (bcf_hdr_add_hrec(hdr, hrec) < 0) { + bcf_hrec_destroy(hrec); + return -1; + } // Parse the whole header do { while (NULL != (hrec = bcf_hdr_parse_line(hdr, p, &len))) { - needs_sync += bcf_hdr_add_hrec(hdr, hrec); + if (bcf_hdr_add_hrec(hdr, hrec) < 0) { + bcf_hrec_destroy(hrec); + return -1; + } p += len; } @@ -685,10 +849,12 @@ int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt) return -1; } - int ret = bcf_hdr_parse_sample_line(hdr,p); - bcf_hdr_sync(hdr); + if (bcf_hdr_parse_sample_line(hdr,p) < 0) + return -1; + if (bcf_hdr_sync(hdr) < 0) + return -1; bcf_hdr_check_sanity(hdr); - return ret; + return 0; } int bcf_hdr_append(bcf_hdr_t *hdr, const char *line) @@ -696,7 +862,8 @@ int bcf_hdr_append(bcf_hdr_t *hdr, const char *line) int len; bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr, (char*) line, &len); if ( !hrec ) return -1; - bcf_hdr_add_hrec(hdr, hrec); + if (bcf_hdr_add_hrec(hdr, hrec) < 0) + return -1; return 0; } @@ -775,19 +942,26 @@ void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key) int 
bcf_hdr_printf(bcf_hdr_t *hdr, const char *fmt, ...) { + char tmp[256], *line = tmp; va_list ap; va_start(ap, fmt); - int n = vsnprintf(NULL, 0, fmt, ap) + 2; + int n = vsnprintf(line, sizeof(tmp), fmt, ap); va_end(ap); - char *line = (char*)malloc(n); - va_start(ap, fmt); - vsnprintf(line, n, fmt, ap); - va_end(ap); + if (n >= sizeof(tmp)) { + n++; // For trailing NUL + line = (char*)malloc(n); + if (!line) + return -1; + + va_start(ap, fmt); + vsnprintf(line, n, fmt, ap); + va_end(ap); + } int ret = bcf_hdr_append(hdr, line); - free(line); + if (line != tmp) free(line); return ret; } @@ -807,7 +981,7 @@ const char *bcf_hdr_get_version(const bcf_hdr_t *hdr) return hrec->value; } -void bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version) +int bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version) { bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL); if ( !hrec ) @@ -824,6 +998,7 @@ void bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version) hrec->value = strdup(version); } hdr->dirty = 1; + return 0; // FIXME: check for errs in this function (return < 0 if so) } bcf_hdr_t *bcf_hdr_init(const char *mode) @@ -910,7 +1085,7 @@ bcf_hdr_t *bcf_hdr_read(htsFile *hfp) size_t hlen; char *htxt = NULL; if (bgzf_read(fp, buf, 4) != 4) goto fail; - hlen = buf[0] | (buf[1] << 8) | (buf[2] << 16) | (buf[3] << 24); + hlen = buf[0] | (buf[1] << 8) | (buf[2] << 16) | ((size_t) buf[3] << 24); if (hlen >= SIZE_MAX) { errno = ENOMEM; goto fail; } htxt = (char*)malloc(hlen + 1); if (!htxt) goto fail; @@ -932,9 +1107,17 @@ int bcf_hdr_write(htsFile *hfp, bcf_hdr_t *h) errno = EINVAL; return -1; } - if ( h->dirty ) bcf_hdr_sync(h); - if (hfp->format.format == vcf || hfp->format.format == text_format) + if ( h->dirty ) { + if (bcf_hdr_sync(h) < 0) return -1; + } + hfp->format.category = variant_data; + if (hfp->format.format == vcf || hfp->format.format == text_format) { + hfp->format.format = vcf; return vcf_hdr_write(hfp, h); + } + + if 
(hfp->format.format == binary_format) + hfp->format.format = bcf; kstring_t htxt = {0,0,0}; bcf_hdr_format(h, 1, &htxt); @@ -1002,6 +1185,9 @@ void bcf_empty(bcf1_t *v) free(v->d.allele); free(v->d.flt); free(v->d.info); free(v->d.fmt); if (v->d.var ) free(v->d.var); free(v->shared.s); free(v->indiv.s); + memset(&v->d,0,sizeof(v->d)); + memset(&v->shared,0,sizeof(v->shared)); + memset(&v->indiv,0,sizeof(v->indiv)); } void bcf_destroy(bcf1_t *v) @@ -1013,20 +1199,27 @@ void bcf_destroy(bcf1_t *v) static inline int bcf_read1_core(BGZF *fp, bcf1_t *v) { - uint32_t x[8]; + union { + uint32_t i; + float f; + } x[8]; ssize_t ret; if ((ret = bgzf_read(fp, x, 32)) != 32) { if (ret == 0) return -1; return -2; } bcf_clear1(v); - x[0] -= 24; // to exclude six 32-bit integers - if (ks_resize(&v->shared, x[0]) != 0) return -2; - if (ks_resize(&v->indiv, x[1]) != 0) return -2; - memcpy(v, x + 2, 16); - v->n_allele = x[6]>>16; v->n_info = x[6]&0xffff; - v->n_fmt = x[7]>>24; v->n_sample = x[7]&0xffffff; - v->shared.l = x[0], v->indiv.l = x[1]; + if (x[0].i < 24) return -2; + x[0].i -= 24; // to exclude six 32-bit integers + if (ks_resize(&v->shared, x[0].i) != 0) return -2; + if (ks_resize(&v->indiv, x[1].i) != 0) return -2; + v->rid = x[2].i; + v->pos = x[3].i; + v->rlen = x[4].i; + v->qual = x[5].f; + v->n_allele = x[6].i>>16; v->n_info = x[6].i&0xffff; + v->n_fmt = x[7].i>>24; v->n_sample = x[7].i&0xffffff; + v->shared.l = x[0].i, v->indiv.l = x[1].i; // silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4 if ( (!v->indiv.l || !v->n_sample) && v->n_fmt ) v->n_fmt = 0; @@ -1042,7 +1235,6 @@ static inline int bcf_read1_core(BGZF *fp, bcf1_t *v) static int bcf_dec_typed_int1_safe(uint8_t *p, uint8_t *end, uint8_t **q, int32_t *val) { - uint32_t v; uint32_t t; if (end - p < 2) return -1; t = *p++ & 0xf; @@ -1054,13 +1246,11 @@ static int bcf_dec_typed_int1_safe(uint8_t *p, uint8_t *end, uint8_t **q, } else if (t == BCF_BT_INT16) { if 
(end - p < 2) return -1; *q = p + 2; - v = p[0] | (p[1] << 8); - *val = v < 0x8000 ? v : -((int32_t) (0xffff - v)) - 1; + *val = le_to_i16(p); } else if (t == BCF_BT_INT32) { if (end - p < 4) return -1; *q = p + 4; - v = p[0] | (p[1] << 8) | (p[2] << 16) | (p[3] << 24); - *val = v < 0x80000000UL ? v : -((int32_t) (0xffffffffUL - v)) - 1; + *val = le_to_i32(p); } else { return -1; } @@ -1143,18 +1333,21 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) { reports = 0; if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared; if (num > 0) { + bytes = (size_t) num << bcf_type_shift[type]; if (((1 << type) & is_integer) == 0) { hts_log_warning("Bad BCF record: Invalid %s type %d (%s)", "FILTER", type, get_type_name(type)); err |= BCF_ERR_TAG_INVALID; - } - bytes = (size_t) num << bcf_type_shift[type]; - if (end - ptr < bytes) goto bad_shared; - for (i = 0; i < num; i++) { - int32_t key = bcf_dec_int1(ptr, type, &ptr); - if (key < 0 || key >= hdr->n[BCF_DT_ID]) { - if (!reports++ || hts_verbose >= HTS_LOG_DEBUG) - hts_log_warning("Bad BCF record: Invalid %s id %d", "FILTER", key); - err |= BCF_ERR_TAG_UNDEF; + if (end - ptr < bytes) goto bad_shared; + ptr += bytes; + } else { + if (end - ptr < bytes) goto bad_shared; + for (i = 0; i < num; i++) { + int32_t key = bcf_dec_int1(ptr, type, &ptr); + if (key < 0 || key >= hdr->n[BCF_DT_ID]) { + if (!reports++ || hts_verbose >= HTS_LOG_DEBUG) + hts_log_warning("Bad BCF record: Invalid %s id %d", "FILTER", key); + err |= BCF_ERR_TAG_UNDEF; + } } } } @@ -1164,7 +1357,8 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) { for (i = 0; i < rec->n_info; i++) { int32_t key = -1; if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_shared; - if (key < 0 || key >= hdr->n[BCF_DT_ID]) { + if (key < 0 || key >= hdr->n[BCF_DT_ID] + || hdr->id[BCF_DT_ID][key].key == NULL) { if (!reports++ || hts_verbose >= HTS_LOG_DEBUG) hts_log_warning("Bad BCF record: Invalid %s id %d", "INFO", 
key); err |= BCF_ERR_TAG_UNDEF; @@ -1205,15 +1399,15 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) { rec->errcode |= err; - return err ? -1 : 0; + return err ? -2 : 0; // Return -2 so bcf_read() reports an error bad_shared: hts_log_error("Bad BCF record - shared section malformed or too short"); - return -1; + return -2; bad_indiv: hts_log_error("Bad BCF record - individuals section malformed or too short"); - return -1; + return -2; } static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt); @@ -1267,7 +1461,7 @@ int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) return bcf_subset_format(h,v); } -int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, int *beg, int *end) +int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, hts_pos_t *beg, hts_pos_t *end) { bcf1_t *v = (bcf1_t *) vv; int ret; @@ -1276,31 +1470,40 @@ int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, int *beg, int *end) return ret; } -static inline void bcf1_sync_id(bcf1_t *line, kstring_t *str) +static inline int bcf1_sync_id(bcf1_t *line, kstring_t *str) { // single typed string - if ( line->d.id && strcmp(line->d.id, ".") ) bcf_enc_vchar(str, strlen(line->d.id), line->d.id); - else bcf_enc_size(str, 0, BCF_BT_CHAR); + if ( line->d.id && strcmp(line->d.id, ".") ) { + return bcf_enc_vchar(str, strlen(line->d.id), line->d.id); + } else { + return bcf_enc_size(str, 0, BCF_BT_CHAR); + } } -static inline void bcf1_sync_alleles(bcf1_t *line, kstring_t *str) +static inline int bcf1_sync_alleles(bcf1_t *line, kstring_t *str) { // list of typed strings int i; - for (i=0; in_allele; i++) - bcf_enc_vchar(str, strlen(line->d.allele[i]), line->d.allele[i]); + for (i=0; in_allele; i++) { + if (bcf_enc_vchar(str, strlen(line->d.allele[i]), line->d.allele[i]) < 0) + return -1; + } if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]); + return 0; } -static inline void bcf1_sync_filter(bcf1_t *line, kstring_t *str) 
+static inline int bcf1_sync_filter(bcf1_t *line, kstring_t *str) { // typed vector of integers - if ( line->d.n_flt ) bcf_enc_vint(str, line->d.n_flt, line->d.flt, -1); - else bcf_enc_vint(str, 0, 0, -1); + if ( line->d.n_flt ) { + return bcf_enc_vint(str, line->d.n_flt, line->d.flt, -1); + } else { + return bcf_enc_vint(str, 0, 0, -1); + } } -static inline void bcf1_sync_info(bcf1_t *line, kstring_t *str) +static inline int bcf1_sync_info(bcf1_t *line, kstring_t *str) { // pairs of typed vectors - int i, irm = -1; + int i, irm = -1, e = 0; for (i=0; in_info; i++) { bcf_info_t *info = &line->d.info[i]; @@ -1310,7 +1513,7 @@ static inline void bcf1_sync_info(bcf1_t *line, kstring_t *str) if ( irm < 0 ) irm = i; continue; } - kputsn_(info->vptr - info->vptr_off, info->vptr_len + info->vptr_off, str); + e |= kputsn_(info->vptr - info->vptr_off, info->vptr_len + info->vptr_off, str) < 0; if ( irm >=0 ) { bcf_info_t tmp = line->d.info[irm]; line->d.info[irm] = line->d.info[i]; line->d.info[i] = tmp; @@ -1318,6 +1521,7 @@ static inline void bcf1_sync_info(bcf1_t *line, kstring_t *str) } } if ( irm>=0 ) line->n_info = irm; + return e == 0 ? 0 : -1; } static int bcf1_sync(bcf1_t *line) @@ -1500,10 +1704,12 @@ bcf1_t *bcf_dup(bcf1_t *src) int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v) { - if ( h->dirty ) bcf_hdr_sync(h); + if ( h->dirty ) { + if (bcf_hdr_sync(h) < 0) return -1; + } if ( bcf_hdr_nsamples(h)!=v->n_sample ) { - hts_log_error("Broken VCF record, the number of columns at %s:%d does not match the number of samples (%d vs %d)", + hts_log_error("Broken VCF record, the number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)", bcf_seqname(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h)); return -1; } @@ -1517,21 +1723,33 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v) // header. At this point, the header must have been printed, // proceeding would lead to a broken BCF file. 
Errors must be checked // and cleared by the caller before we can proceed. - hts_log_error("Unchecked error (%d), exiting", v->errcode); - exit(1); + hts_log_error("Unchecked error (%d)", v->errcode); + return -1; } bcf1_sync(v); // check if the BCF record was modified BGZF *fp = hfp->fp.bgzf; - uint32_t x[8]; - x[0] = v->shared.l + 24; // to include six 32-bit integers - x[1] = v->indiv.l; - memcpy(x + 2, v, 16); - x[6] = (uint32_t)v->n_allele<<16 | v->n_info; - x[7] = (uint32_t)v->n_fmt<<24 | v->n_sample; + union { + uint32_t i; + float f; + } x[8]; + x[0].i = v->shared.l + 24; // to include six 32-bit integers + x[1].i = v->indiv.l; + x[2].i = v->rid; + x[3].i = v->pos; + x[4].i = v->rlen; + x[5].f = v->qual; + x[6].i = (uint32_t)v->n_allele<<16 | v->n_info; + x[7].i = (uint32_t)v->n_fmt<<24 | v->n_sample; if ( bgzf_write(fp, x, 32) != 32 ) return -1; if ( bgzf_write(fp, v->shared.s, v->shared.l) != v->shared.l ) return -1; if ( bgzf_write(fp, v->indiv.s, v->indiv.l) != v->indiv.l ) return -1; + + if (hfp->idx) { + if (hts_idx_push(hfp->idx, v->rid, v->pos, v->pos + v->rlen, bgzf_tell(fp), 1) < 0) + return -1; + } + return 0; } @@ -1539,11 +1757,36 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v) *** VCF header I/O *** **********************/ +static int add_missing_contig_hrec(bcf_hdr_t *h, const char *name) { + bcf_hrec_t *hrec = calloc(1, sizeof(bcf_hrec_t)); + int save_errno; + if (!hrec) goto fail; + + hrec->key = strdup("contig"); + if (!hrec->key) goto fail; + + if (bcf_hrec_add_key(hrec, "ID", strlen("ID")) < 0) goto fail; + if (bcf_hrec_set_val(hrec, hrec->nkeys-1, name, strlen(name), 0) < 0) + goto fail; + if (bcf_hdr_add_hrec(h, hrec) < 0) + goto fail; + return 0; + + fail: + save_errno = errno; + hts_log_error("%s", strerror(errno)); + if (hrec) bcf_hrec_destroy(hrec); + errno = save_errno; + return -1; +} + bcf_hdr_t *vcf_hdr_read(htsFile *fp) { kstring_t txt, *s = &fp->line; int ret; bcf_hdr_t *h; + tbx_t *idx = NULL; + const char **names = 
NULL; h = bcf_hdr_init("r"); if (!h) { hts_log_error("Failed to allocate bcf header"); @@ -1551,6 +1794,7 @@ bcf_hdr_t *vcf_hdr_read(htsFile *fp) } txt.l = txt.m = 0; txt.s = 0; while ((ret = hts_getline(fp, KS_SEP_LINE, s)) >= 0) { + int e = 0; if (s->l == 0) continue; if (s->s[0] != '#') { hts_log_error("No sample line"); @@ -1566,17 +1810,21 @@ bcf_hdr_t *vcf_hdr_read(htsFile *fp) while (tmp.l = 0, kgetline(&tmp, (kgets_func *) hgets, f) >= 0) { char *tab = strchr(tmp.s, '\t'); if (tab == NULL) continue; - kputs("##contig=\n", 2, &txt); + e |= (kputs("##contig=\n", 2, &txt) < 0); } free(tmp.s); if (hclose(f) != 0) { - hts_log_warning("Failed to close %s", fp->fn_aux); + hts_log_error("Error on closing %s", fp->fn_aux); + goto error; } + if (e) goto error; } - kputsn(s->s, s->l, &txt); - kputc('\n', &txt); + if (kputsn(s->s, s->l, &txt) < 0) goto error; + if (kputc('\n', &txt) < 0) goto error; if (s->s[1] != '#') break; } if ( ret < -1 ) goto error; @@ -1588,31 +1836,31 @@ bcf_hdr_t *vcf_hdr_read(htsFile *fp) if ( bcf_hdr_parse(h, txt.s) < 0 ) goto error; // check tabix index, are all contigs listed in the header? 
add the missing ones - tbx_t *idx = tbx_index_load(fp->fn); + idx = tbx_index_load3(fp->fn, NULL, HTS_IDX_SAVE_REMOTE|HTS_IDX_SILENT_FAIL); if ( idx ) { int i, n, need_sync = 0; - const char **names = tbx_seqnames(idx, &n); + names = tbx_seqnames(idx, &n); + if (!names) goto error; for (i=0; ikey = strdup("contig"); - bcf_hrec_add_key(hrec, "ID", strlen("ID")); - bcf_hrec_set_val(hrec, hrec->nkeys-1, (char*) names[i], strlen(names[i]), 0); - bcf_hdr_add_hrec(h, hrec); + if (add_missing_contig_hrec(h, names[i]) < 0) goto error; need_sync = 1; } + if ( need_sync ) { + if (bcf_hdr_sync(h) < 0) goto error; + } free(names); tbx_destroy(idx); - if ( need_sync ) - bcf_hdr_sync(h); } free(txt.s); return h; error: + if (idx) tbx_destroy(idx); + free(names); free(txt.s); if (h) bcf_hdr_destroy(h); return NULL; @@ -1620,46 +1868,62 @@ bcf_hdr_t *vcf_hdr_read(htsFile *fp) int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname) { - int i, n; + int i = 0, n = 0, save_errno; char **lines = hts_readlines(fname, &n); if ( !lines ) return 1; for (i=0; ivalue ) { int j, nout = 0; - ksprintf(str, "##%s=<", hrec->key); + e |= ksprintf(str, "##%s=<", hrec->key) < 0; for (j=0; jnkeys; j++) { // do not output IDX if output is VCF if ( !is_bcf && !strcmp("IDX",hrec->keys[j]) ) continue; - if ( nout ) kputc(',',str); - ksprintf(str,"%s=%s", hrec->keys[j], hrec->vals[j]); + if ( nout ) e |= kputc(',',str) < 0; + e |= ksprintf(str,"%s=%s", hrec->keys[j], hrec->vals[j]) < 0; nout++; } - ksprintf(str,">\n"); + e |= ksprintf(str,">\n") < 0; } else - ksprintf(str,"##%s=%s\n", hrec->key,hrec->value); + e |= ksprintf(str,"##%s=%s\n", hrec->key,hrec->value) < 0; + + return e == 0 ? 
0 : -1; } -void bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str) +int bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str) { - _bcf_hrec_format(hrec,0,str); + return _bcf_hrec_format(hrec,0,str); } int bcf_hdr_format(const bcf_hdr_t *hdr, int is_bcf, kstring_t *str) @@ -1726,9 +1990,9 @@ int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h) *** Typed value I/O *** ***********************/ -void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize) +int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize) { - int32_t max = INT32_MIN + 1, min = INT32_MAX; + int32_t max = INT32_MIN, min = INT32_MAX; int i; if (n <= 0) bcf_enc_size(s, 0, BCF_BT_NULL); else if (n == 1) bcf_enc_int1(s, a[0]); @@ -1739,13 +2003,13 @@ void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize) if (max < a[i]) max = a[i]; if (min > a[i]) min = a[i]; } - if (max <= INT8_MAX && min > bcf_int8_vector_end) { + if (max <= BCF_MAX_BT_INT8 && min >= BCF_MIN_BT_INT8) { bcf_enc_size(s, wsize, BCF_BT_INT8); for (i = 0; i < n; ++i) if ( a[i]==bcf_int32_vector_end ) kputc(bcf_int8_vector_end, s); else if ( a[i]==bcf_int32_missing ) kputc(bcf_int8_missing, s); else kputc(a[i], s); - } else if (max <= INT16_MAX && min > bcf_int16_vector_end) { + } else if (max <= BCF_MAX_BT_INT16 && min >= BCF_MIN_BT_INT16) { uint8_t *p; bcf_enc_size(s, wsize, BCF_BT_INT16); ks_resize(s, s->l + n * sizeof(int16_t)); @@ -1772,6 +2036,26 @@ void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize) s->l += n * sizeof(int32_t); } } + + return 0; // FIXME: check for errs in this function +} + +static int bcf_enc_long1(kstring_t *s, int64_t x) { + uint32_t e = 0; + if (x <= BCF_MAX_BT_INT32 && x >= BCF_MIN_BT_INT32) + return bcf_enc_int1(s, x); + if (x == bcf_int64_vector_end) { + e |= bcf_enc_size(s, 1, BCF_BT_INT8); + e |= kputc(bcf_int8_vector_end, s) < 0; + } else if (x == bcf_int64_missing) { + e |= bcf_enc_size(s, 1, BCF_BT_INT8); + e |= kputc(bcf_int8_missing, s) < 0; + } else { + e |= 
bcf_enc_size(s, 1, BCF_BT_INT64); + e |= ks_expand(s, 8); + if (e == 0) { u64_to_le(x, (uint8_t *) s->s + s->l); s->l += 8; } + } + return e == 0 ? 0 : -1; } static inline int serialize_float_array(kstring_t *s, size_t n, const float *a) { @@ -1792,33 +2076,35 @@ static inline int serialize_float_array(kstring_t *s, size_t n, const float *a) return 0; } -void bcf_enc_vfloat(kstring_t *s, int n, float *a) +int bcf_enc_vfloat(kstring_t *s, int n, float *a) { assert(n >= 0); bcf_enc_size(s, n, BCF_BT_FLOAT); serialize_float_array(s, n, a); + return 0; // FIXME: check for errs in this function } -void bcf_enc_vchar(kstring_t *s, int l, const char *a) +int bcf_enc_vchar(kstring_t *s, int l, const char *a) { bcf_enc_size(s, l, BCF_BT_CHAR); kputsn(a, l, s); + return 0; // FIXME: check for errs in this function } -void bcf_fmt_array(kstring_t *s, int n, int type, void *data) +int bcf_fmt_array(kstring_t *s, int n, int type, void *data) { int j = 0; + uint32_t e = 0; if (n == 0) { - kputc('.', s); - return; + return kputc('.', s) >= 0 ? 0 : -1; } if (type == BCF_BT_CHAR) { char *p = (char*)data; for (j = 0; j < n && *p; ++j, ++p) { - if ( *p==bcf_str_missing ) kputc('.', s); - else kputc(*p, s); + if ( *p==bcf_str_missing ) e |= kputc('.', s) < 0; + else e |= kputc(*p, s) < 0; } } else @@ -1831,7 +2117,7 @@ void bcf_fmt_array(kstring_t *s, int n, int type, void *data) if ( is_vector_end ) break; \ if ( j ) kputc(',', s); \ if ( is_missing ) kputc('.', s); \ - else kprint; \ + else e |= kprint < 0; \ } \ } switch (type) { @@ -1843,6 +2129,7 @@ void bcf_fmt_array(kstring_t *s, int n, int type, void *data) } #undef BRANCH } + return e == 0 ? 
0 : -1; } uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr) @@ -1859,18 +2146,21 @@ uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr) typedef struct { int key, max_m, size, offset; - uint64_t is_gt:1, max_g:31, max_l:32; + uint32_t is_gt:1, max_g:31; + uint32_t max_l; uint32_t y; uint8_t *buf; } fmt_aux_t; -static inline void align_mem(kstring_t *s) +static inline int align_mem(kstring_t *s) { + int e = 0; if (s->l&7) { uint64_t zero = 0; int l = ((s->l + 7)>>3<<3) - s->l; - kputsn((char*)&zero, l, s); + e = kputsn((char*)&zero, l, s) < 0; } + return e == 0 ? 0 : -1; } // p,q is the start and the end of the FORMAT field @@ -1891,7 +2181,8 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p char *end = s->s + s->l; if ( q>=end ) { - hts_log_error("FORMAT column with no sample columns starting at %s:%d", s->s, v->pos+1); + hts_log_error("FORMAT column with no sample columns starting at %s:%"PRIhts_pos"", s->s, v->pos+1); + v->errcode |= BCF_ERR_NCOLS; return -1; } @@ -1906,7 +2197,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) { if (j >= MAX_N_FMT) { v->errcode |= BCF_ERR_LIMITS; - hts_log_error("FORMAT column at %s:%d lists more identifiers than htslib can handle", + hts_log_error("FORMAT column at %s:%"PRIhts_pos" lists more identifiers than htslib can handle", bcf_seqname(h,v), v->pos+1); return -1; } @@ -1926,10 +2217,13 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p ksprintf(&tmp, "##FORMAT=", t); bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l); free(tmp.s); - if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h); + int res = hrec ? 
bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1; + if (res < 0) bcf_hrec_destroy(hrec); + if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h); + k = kh_get(vdict, d, t); v->errcode = BCF_ERR_TAG_UNDEF; - if (k == kh_end(d)) { + if (res || k == kh_end(d)) { hts_log_error("Could not add dummy header for FORMAT '%s'", t); v->errcode |= BCF_ERR_TAG_INVALID; return -1; @@ -1975,9 +2269,10 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p j++; if ( j>=v->n_fmt ) { - hts_log_error("Incorrect number of FORMAT fields at %s:%d", + hts_log_error("Incorrect number of FORMAT fields at %s:%"PRIhts_pos"", h->id[BCF_DT_CTG][v->rid].key, v->pos+1); - exit(1); + v->errcode |= BCF_ERR_NCOLS; + return -1; } } else break; @@ -2003,7 +2298,8 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p } else { hts_log_error("The format type %d is currently not supported", f->y>>4&0xf); - abort(); // I do not know how to do with Flag in the genotype fields + v->errcode |= BCF_ERR_TAG_INVALID; + return -1; } align_mem(mem); f->offset = mem->l; @@ -2038,11 +2334,20 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p if (z->is_gt) { // genotypes int32_t is_phased = 0, *x = (int32_t*)(z->buf + z->size * m); for (l = 0;; ++t) { - if (*t == '.') ++t, x[l++] = is_phased; - else x[l++] = (strtol(t, &t, 10) + 1) << 1 | is_phased; -#if THOROUGH_SANITY_CHECKS - assert( 0 ); // success of strtol,strtod not checked -#endif + if (*t == '.') { + ++t, x[l++] = is_phased; + } else { + char *tt = t; + errno = 0; + long val = strtol(t, &t, 10); + if (errno == ERANGE || val > (INT32_MAX>>1)-1 || val < 0) { + hts_log_error("Unsupported value:'%s' (too large or negative)", tt); + return -1; + } else { + x[l] = (val + 1) << 1 | is_phased; + l++; + } + } is_phased = (*t == '|'); if (*t != '|' && *t != '/') break; } @@ -2071,7 +2376,11 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p } if ( !l ) 
bcf_float_set_missing(x[l++]); // An empty field, insert missing value for (; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]); - } else abort(); + } else { + hts_log_error("Unknown FORMAT field type %d", z->y>>4&0xf); + v->errcode |= BCF_ERR_TAG_INVALID; + return -1; + } if (*t == '\0') { break; @@ -2081,7 +2390,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p } else { char buffer[8]; - hts_log_error("Invalid character '%s' in '%s' FORMAT field at %s:%d", + hts_log_error("Invalid character '%s' in '%s' FORMAT field at %s:%"PRIhts_pos"", dump_char(buffer, *t), h->id[BCF_DT_ID][z->key].key, bcf_seqname(h,v), v->pos+1); v->errcode |= BCF_ERR_CHAR; return -1; @@ -2140,14 +2449,14 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p if ( v->n_sample!=bcf_hdr_nsamples(h) ) { - hts_log_error("Number of columns at %s:%d does not match the number of samples (%d vs %d)", + hts_log_error("Number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)", bcf_seqname(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h)); v->errcode |= BCF_ERR_NCOLS; return -1; } if ( v->indiv.l > 0xffffffff ) { - hts_log_error("The FORMAT at %s:%d is too long", bcf_seqname(h,v), v->pos+1); + hts_log_error("The FORMAT at %s:%"PRIhts_pos" is too long", bcf_seqname(h,v), v->pos+1); v->errcode |= BCF_ERR_LIMITS; // Error recovery: return -1 if this is a critical error or 0 if we want to ignore the FORMAT and proceed @@ -2167,7 +2476,9 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) ks_tokaux_t aux; int max_n_flt = 0, max_n_val = 0; int32_t *flt_a = NULL, *val_a = NULL; - int ret = -1; + int ret = -2; + const uint32_t MAX_ALLELES = 65535; // n_allele is 16 bits + const uint32_t MAX_INFO = 65535; // n_info is 16 bits if (!s || !h || !v || !(s->s)) return ret; @@ -2194,10 +2505,12 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) ksprintf(&tmp, "##contig=", p); bcf_hrec_t *hrec = 
bcf_hdr_parse_line(h,tmp.s,&l); free(tmp.s); - if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h); + int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1; + if (res < 0) bcf_hrec_destroy(hrec); + if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h); k = kh_get(vdict, d, p); v->errcode = BCF_ERR_CTG_UNDEF; - if (k == kh_end(d)) { + if (res || k == kh_end(d)) { hts_log_error("Could not add dummy header for contig '%s'", p); v->errcode |= BCF_ERR_CTG_INVALID; goto err; @@ -2205,7 +2518,14 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) } v->rid = kh_val(d, k).id; } else if (i == 1) { // POS - v->pos = atoi(p) - 1; + errno = 0; + v->pos = strtoll(p, NULL, 10); + if (errno == ERANGE || v->pos == INT64_MIN) { + hts_log_error("Position value '%s' is too large", p); + goto err; + } else { + v->pos -= 1; + } } else if (i == 2) { // ID if (strcmp(p, ".")) bcf_enc_vchar(str, q - p, p); else bcf_enc_size(str, 0, BCF_BT_CHAR); @@ -2216,6 +2536,12 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) if (strcmp(p, ".")) { for (r = t = p;; ++r) { if (*r == ',' || *r == 0) { + if (v->n_allele == MAX_ALLELES) { + hts_log_error("Too many ALT alleles at %s:%"PRIhts_pos, + bcf_seqname(h,v), v->pos+1); + v->errcode |= BCF_ERR_LIMITS; + goto err; + } bcf_enc_vchar(str, r - t, t); t = r + 1; ++v->n_allele; @@ -2260,10 +2586,12 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) ksprintf(&tmp, "##FILTER=", t); bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l); free(tmp.s); - if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h); + int res = hrec ? 
bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1; + if (res < 0) bcf_hrec_destroy(hrec); + if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h); k = kh_get(vdict, d, t); v->errcode = BCF_ERR_TAG_UNDEF; - if (k == kh_end(d)) { + if (res || k == kh_end(d)) { hts_log_error("Could not add dummy header for FILTER '%s'", t); v->errcode |= BCF_ERR_TAG_INVALID; goto err; @@ -2285,6 +2613,12 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) int c; char *val, *end; if (*r != ';' && *r != '=' && *r != 0) continue; + if (v->n_info == MAX_INFO) { + hts_log_error("Too many INFO entries at %s:%"PRIhts_pos, + bcf_seqname(h,v), v->pos+1); + v->errcode |= BCF_ERR_LIMITS; + goto err; + } val = end = 0; c = *r; *r = 0; if (c == '=') { @@ -2302,10 +2636,12 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) ksprintf(&tmp, "##INFO=", key); bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l); free(tmp.s); - if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h); + int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1; + if (res < 0) bcf_hrec_destroy(hrec); + if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h); k = kh_get(vdict, d, key); v->errcode = BCF_ERR_TAG_UNDEF; - if (k == kh_end(d)) { + if (res || k == kh_end(d)) { hts_log_error("Could not add dummy header for INFO '%s'", key); v->errcode |= BCF_ERR_TAG_INVALID; goto err; @@ -2336,29 +2672,39 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) val_a = z; } if ((y>>4&0xf) == BCF_HT_INT) { - for (i = 0, t = val; i < n_val; ++i, ++t) + // Allow first value only to be 64 bit + // (for large END value) + int64_t v64 = strtoll(val, &te, 10); + if ( te==val ) { // conversion failed + val_a[0] = bcf_int32_missing; + v64 = bcf_int64_missing; + } else { + val_a[0] = v64 >= BCF_MIN_BT_INT32 && v64 <= BCF_MAX_BT_INT32 ? 
v64 : bcf_int32_missing; + } + for (t = te; *t && *t != ','; t++); + if (*t == ',') ++t; + for (i = 1; i < n_val; ++i, ++t) { val_a[i] = strtol(t, &te, 10); if ( te==t ) // conversion failed - { val_a[i] = bcf_int32_missing; - while ( *te && *te!=',' ) te++; - } - t = te; + for (t = te; *t && *t != ','; t++); } - bcf_enc_vint(str, n_val, val_a, -1); - if (strcmp(key, "END") == 0) v->rlen = val_a[0] - v->pos; + if (n_val == 1) { + bcf_enc_long1(str, v64); + } else { + bcf_enc_vint(str, n_val, val_a, -1); + } + if (strcmp(key, "END") == 0) + v->rlen = v64 - v->pos; } else if ((y>>4&0xf) == BCF_HT_REAL) { float *val_f = (float *)val_a; for (i = 0, t = val; i < n_val; ++i, ++t) { val_f[i] = strtod(t, &te); if ( te==t ) // conversion failed - { bcf_float_set_missing(val_f[i]); - while ( *te && *te!=',' ) te++; - } - t = te; + for (t = te; *t && *t != ','; t++); } bcf_enc_vfloat(str, n_val, val_f); } @@ -2372,7 +2718,7 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) } else if (i == 8) {// FORMAT free(flt_a); free(val_a); - return vcf_parse_format(s, h, v, p, q); + return vcf_parse_format(s, h, v, p, q) == 0 ? 0 : -2; } } @@ -2421,6 +2767,7 @@ static inline uint8_t *bcf_unpack_info_core1(uint8_t *ptr, bcf_info_t *info) else if (info->type == BCF_BT_INT32) info->v1.i = le_to_i32(ptr); else if (info->type == BCF_BT_FLOAT) info->v1.f = le_to_float(ptr); else if (info->type == BCF_BT_INT16) info->v1.i = le_to_i16(ptr); + else if (info->type == BCF_BT_INT64) info->v1.i = le_to_i64(ptr); } ptr += info->len << bcf_type_shift[info->type]; info->vptr_len = ptr - info->vptr; @@ -2501,7 +2848,7 @@ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) int i; bcf_unpack((bcf1_t*)v, BCF_UN_ALL); kputs(h->id[BCF_DT_CTG][v->rid].key, s); // CHROM - kputc('\t', s); kputw(v->pos + 1, s); // POS + kputc('\t', s); kputll(v->pos + 1, s); // POS kputc('\t', s); kputs(v->d.id ? 
v->d.id : ".", s); // ID kputc('\t', s); // REF if (v->n_allele > 0) kputs(v->d.allele[0], s); @@ -2531,8 +2878,9 @@ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) if ( !z->vptr ) continue; if ( !first ) kputc(';', s); first = 0; - if (z->key >= h->n[BCF_DT_ID]) { - hts_log_error("Invalid BCF, the INFO index is too large"); + if (z->key < 0 || z->key >= h->n[BCF_DT_ID]) { + hts_log_error("Invalid BCF, the INFO index %d is %s", + z->key, z->key < 0 ? "negative" : "too large"); errno = EINVAL; return -1; } @@ -2546,6 +2894,7 @@ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) case BCF_BT_INT8: if ( z->v1.i==bcf_int8_missing ) kputc('.', s); else kputw(z->v1.i, s); break; case BCF_BT_INT16: if ( z->v1.i==bcf_int16_missing ) kputc('.', s); else kputw(z->v1.i, s); break; case BCF_BT_INT32: if ( z->v1.i==bcf_int32_missing ) kputc('.', s); else kputw(z->v1.i, s); break; + case BCF_BT_INT64: if ( z->v1.i==bcf_int64_missing ) kputc('.', s); else kputll(z->v1.i, s); break; case BCF_BT_FLOAT: if ( bcf_float_is_missing(z->v1.f) ) kputc('.', s); else kputd(z->v1.f, s); break; case BCF_BT_CHAR: kputc(z->v1.i, s); break; default: hts_log_error("Unexpected type %d", z->type); exit(1); break; @@ -2621,6 +2970,16 @@ int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l); else ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l); + + if (fp->idx) { + int tid; + if ((tid = hts_idx_tbi_name(fp->idx, v->rid, bcf_seqname(h, v))) < 0) + return -1; + + if (hts_idx_push(fp->idx, tid, v->pos, v->pos + v->rlen, bgzf_tell(fp->fp.bgzf), 1) < 0) + return -1; + } + return ret==fp->line.l ? 0 : -1; } @@ -2641,26 +3000,41 @@ int bcf_hdr_id2int(const bcf_hdr_t *h, int which, const char *id) *** BCF indexing *** ********************/ +// Calculate number of index levels given min_shift and the header contig +// list. Also returns number of contigs in *nids_out. 
+static int idx_calc_n_lvls_ids(const bcf_hdr_t *h, int min_shift, + int starting_n_lvls, int *nids_out) +{ + int n_lvls, i, nids = 0; + int64_t max_len = 0, s; + + for (i = 0; i < h->n[BCF_DT_CTG]; ++i) + { + if ( !h->id[BCF_DT_CTG][i].val ) continue; + if ( max_len < h->id[BCF_DT_CTG][i].val->info[0] ) + max_len = h->id[BCF_DT_CTG][i].val->info[0]; + nids++; + } + if ( !max_len ) max_len = (1LL<<31) - 1; // In case contig line is broken. + max_len += 256; + s = 1LL << (min_shift + starting_n_lvls * 3); + for (n_lvls = starting_n_lvls; max_len > s; ++n_lvls, s <<= 3); + + if (nids_out) *nids_out = nids; + return n_lvls; +} + hts_idx_t *bcf_index(htsFile *fp, int min_shift) { - int n_lvls, i; + int n_lvls; bcf1_t *b = NULL; hts_idx_t *idx = NULL; bcf_hdr_t *h; - int64_t max_len = 0, s; int r; h = bcf_hdr_read(fp); if ( !h ) return NULL; int nids = 0; - for (i = 0; i < h->n[BCF_DT_CTG]; ++i) - { - if ( !h->id[BCF_DT_CTG][i].val ) continue; - if ( max_len < h->id[BCF_DT_CTG][i].val->info[0] ) max_len = h->id[BCF_DT_CTG][i].val->info[0]; - nids++; - } - if ( !max_len ) max_len = ((int64_t)1<<31) - 1; // In case contig line is broken. - max_len += 256; - for (n_lvls = 0, s = 1< s; ++n_lvls, s <<= 3); + n_lvls = idx_calc_n_lvls_ids(h, min_shift, 0, &nids); idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls); if (!idx) goto fail; b = bcf_init1(); @@ -2688,6 +3062,11 @@ hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx) return fnidx? 
hts_idx_load2(fn, fnidx) : bcf_index_load(fn); } +hts_idx_t *bcf_index_load3(const char *fn, const char *fnidx, int flags) +{ + return hts_idx_load3(fn, fnidx, HTS_FMT_CSI, flags); +} + int bcf_index_build3(const char *fn, const char *fnidx, int min_shift, int n_threads) { htsFile *fp; @@ -2700,13 +3079,18 @@ int bcf_index_build3(const char *fn, const char *fnidx, int min_shift, int n_thr if ( fp->format.compression!=bgzf ) { hts_close(fp); return -3; } switch (fp->format.format) { case bcf: - idx = bcf_index(fp, min_shift); - if (idx) { - ret = hts_idx_save_as(idx, fn, fnidx, HTS_FMT_CSI); - if (ret < 0) ret = -4; - hts_idx_destroy(idx); + if (!min_shift) { + hts_log_error("TBI indices for BCF files are not supported"); + ret = -1; + } else { + idx = bcf_index(fp, min_shift); + if (idx) { + ret = hts_idx_save_as(idx, fn, fnidx, HTS_FMT_CSI); + if (ret < 0) ret = -4; + hts_idx_destroy(idx); + } + else ret = -1; } - else ret = -1; break; case vcf: @@ -2737,13 +3121,80 @@ int bcf_index_build(const char *fn, int min_shift) return bcf_index_build3(fn, NULL, min_shift, 0); } +// Initialise fp->idx for the current format type. +// This must be called after the header has been written but no other data. 
+static int vcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) { + int n_lvls, fmt; + + if (min_shift == 0) { + min_shift = 14; + n_lvls = 5; + fmt = HTS_FMT_TBI; + } else { + // Set initial n_lvls to match tbx_index() + int starting_n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3; + // Increase if necessary + n_lvls = idx_calc_n_lvls_ids(h, min_shift, starting_n_lvls, NULL); + fmt = HTS_FMT_CSI; + } + + fp->idx = hts_idx_init(0, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls); + if (!fp->idx) return -1; + + // Tabix meta data, added even in CSI for VCF + uint8_t conf[4*7]; + u32_to_le(TBX_VCF, conf+0); // fmt + u32_to_le(1, conf+4); // name col + u32_to_le(2, conf+8); // beg col + u32_to_le(0, conf+12); // end col + u32_to_le('#', conf+16); // comment + u32_to_le(0, conf+20); // n.skip + u32_to_le(0, conf+24); // ref name len + if (hts_idx_set_meta(fp->idx, sizeof(conf)*sizeof(*conf), (uint8_t *)conf, 1) < 0) { + hts_idx_destroy(fp->idx); + fp->idx = NULL; + return -1; + } + fp->fnidx = fnidx; + + return 0; +} + +// Initialise fp->idx for the current format type. +// This must be called after the header has been written but no other data. +int bcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) { + int n_lvls, nids = 0; + + if (fp->format.format == vcf) + return vcf_idx_init(fp, h, min_shift, fnidx); + + if (!min_shift) + min_shift = 14; + + n_lvls = idx_calc_n_lvls_ids(h, min_shift, 0, &nids); + + fp->idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls); + if (!fp->idx) return -1; + fp->fnidx = fnidx; + + return 0; +} + +// Finishes an index. Call afer the last record has been written. +// Returns 0 on success, <0 on failure. +// +// NB: same format as SAM/BAM as it uses bgzf. 
+int bcf_idx_save(htsFile *fp) { + return sam_idx_save(fp); +} + /***************** *** Utilities *** *****************/ int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src) { - int i, ndst_ori = dst->nhrec, need_sync = 0, ret = 0; + int i, ndst_ori = dst->nhrec, need_sync = 0, ret = 0, res; for (i=0; inhrec; i++) { if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value ) @@ -2758,8 +3209,11 @@ int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src) // to bcf_hdr_combine() and make this optional? if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break; } - if ( j>=ndst_ori ) - need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); + if ( j>=ndst_ori ) { + res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); + if (res < 0) return -1; + need_sync += res; + } } else if ( src->hrec[i]->type==BCF_HL_STR ) { @@ -2768,8 +3222,11 @@ int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src) if ( j>=0 ) { bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key); - if ( !rec ) - need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); + if ( !rec ) { + res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); + if (res < 0) return -1; + need_sync += res; + } } } else @@ -2778,9 +3235,11 @@ int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src) assert( j>=0 ); // this should always be true for valid VCFs bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL); - if ( !rec ) - need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); - else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT ) + if ( !rec ) { + res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); + if (res < 0) return -1; + need_sync += res; + } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT ) { // Check that both records are of the same type. The bcf_hdr_id2length // macro cannot be used here because dst header is not synced yet. 
@@ -2803,7 +3262,9 @@ int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src) } } } - if ( need_sync ) bcf_hdr_sync(dst); + if ( need_sync ) { + if (bcf_hdr_sync(dst) < 0) return -1; + } return ret; } @@ -2823,7 +3284,7 @@ bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src) return dst; } - int i, ndst_ori = dst->nhrec, need_sync = 0, ret = 0; + int i, ndst_ori = dst->nhrec, need_sync = 0, ret = 0, res; for (i=0; inhrec; i++) { if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value ) @@ -2838,8 +3299,11 @@ bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src) // to bcf_hdr_combine() and make this optional? if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break; } - if ( j>=ndst_ori ) - need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); + if ( j>=ndst_ori ) { + res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); + if (res < 0) return NULL; + need_sync += res; + } } else if ( src->hrec[i]->type==BCF_HL_STR ) { @@ -2848,8 +3312,11 @@ bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src) if ( j>=0 ) { bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key); - if ( !rec ) - need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); + if ( !rec ) { + res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); + if (res < 0) return NULL; + need_sync += res; + } } } else @@ -2858,9 +3325,11 @@ bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src) assert( j>=0 ); // this should always be true for valid VCFs bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL); - if ( !rec ) - need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); - else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT ) + if ( !rec ) { + res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); + if (res < 0) return NULL; + need_sync += res; + } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT 
) { // Check that both records are of the same type. The bcf_hdr_id2length // macro cannot be used here because dst header is not synced yet. @@ -2883,9 +3352,12 @@ bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src) } } } - if ( need_sync ) bcf_hdr_sync(dst); + if ( need_sync ) { + if (bcf_hdr_sync(dst) < 0) return NULL; + } return dst; } + int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line) { int i; @@ -3077,19 +3549,24 @@ int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file) int i, narr = bit_array_size(bcf_hdr_nsamples(hdr)); hdr->keep_samples = (uint8_t*) calloc(narr,1); + if (!hdr->keep_samples) return -1; hdr->nsamples_ori = bcf_hdr_nsamples(hdr); if ( !samples ) { // exclude all samples - bcf_hdr_nsamples(hdr) = 0; khint_t k; - vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE]; + vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE], *new_dict; + new_dict = kh_init(vdict); + if (!new_dict) return -1; + + bcf_hdr_nsamples(hdr) = 0; + for (k = kh_begin(d); k != kh_end(d); ++k) if (kh_exist(d, k)) free((char*)kh_key(d, k)); kh_destroy(vdict, d); - hdr->dict[BCF_DT_SAMPLE] = kh_init(vdict); - bcf_hdr_sync(hdr); + hdr->dict[BCF_DT_SAMPLE] = new_dict; + if (bcf_hdr_sync(hdr) < 0) return -1; return 0; } @@ -3120,32 +3597,55 @@ int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file) bcf_hdr_nsamples(hdr) = 0; for (i=0; insamples_ori; i++) if ( bit_array_test(hdr->keep_samples,i) ) bcf_hdr_nsamples(hdr)++; + if ( !bcf_hdr_nsamples(hdr) ) { free(hdr->keep_samples); hdr->keep_samples=NULL; } else { + // Make new list and dictionary with desired samples char **samples = (char**) malloc(sizeof(char*)*bcf_hdr_nsamples(hdr)); + vdict_t *new_dict, *d; + int k, res; + if (!samples) return -1; + + new_dict = kh_init(vdict); + if (!new_dict) { + free(samples); + return -1; + } idx = 0; - for (i=0; insamples_ori; i++) - if ( bit_array_test(hdr->keep_samples,i) ) samples[idx++] = strdup(hdr->samples[i]); - 
free(hdr->samples); - hdr->samples = samples; + for (i=0; insamples_ori; i++) { + if ( bit_array_test(hdr->keep_samples,i) ) { + samples[idx] = hdr->samples[i]; + k = kh_put(vdict, new_dict, hdr->samples[i], &res); + if (res < 0) { + free(samples); + kh_destroy(vdict, new_dict); + return -1; + } + kh_val(new_dict, k) = bcf_idinfo_def; + kh_val(new_dict, k).id = idx; + idx++; + } + } - // delete original samples from the dictionary - vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE]; - int k; + // Delete desired samples from old dictionary, so we don't free them + d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE]; + for (i=0; i < idx; i++) { + int k = kh_get(vdict, d, samples[i]); + if (k < kh_end(d)) kh_del(vdict, d, k); + } + + // Free everything else for (k = kh_begin(d); k != kh_end(d); ++k) if (kh_exist(d, k)) free((char*)kh_key(d, k)); kh_destroy(vdict, d); + hdr->dict[BCF_DT_SAMPLE] = new_dict; - // add the subset back - hdr->dict[BCF_DT_SAMPLE] = d = kh_init(vdict); - for (i=0; isamples[i], &ignore); - kh_val(d, k) = bcf_idinfo_def; - kh_val(d, k).id = kh_size(d) - 1; - } - bcf_hdr_sync(hdr); + free(hdr->samples); + hdr->samples = samples; + + if (bcf_hdr_sync(hdr) < 0) + return -1; } return ret; @@ -3184,7 +3684,7 @@ int bcf_is_snp(bcf1_t *v) bcf_unpack(v, BCF_UN_STR); for (i = 0; i < v->n_allele; ++i) { - if ( v->d.allele[i][1]==0 ) continue; + if ( v->d.allele[i][1]==0 && v->d.allele[i][0]!='*' ) continue; // mpileup's allele, see also below. This is not completely satisfactory, // a general library is here narrowly tailored to fit samtools. @@ -3198,12 +3698,13 @@ int bcf_is_snp(bcf1_t *v) static void bcf_set_variant_type(const char *ref, const char *alt, variant_t *var) { + if ( *alt == '*' && !alt[1] ) { var->n = 0; var->type = VCF_OVERLAP; return; } // overlapping variant + // The most frequent case if ( !ref[1] && !alt[1] ) { if ( *alt == '.' 
|| *ref==*alt ) { var->n = 0; var->type = VCF_REF; return; } if ( *alt == 'X' ) { var->n = 0; var->type = VCF_REF; return; } // mpileup's X allele shouldn't be treated as variant - if ( *alt == '*' ) { var->n = 0; var->type = VCF_REF; return; } var->n = 1; var->type = VCF_SNP; return; } if ( alt[0]=='<' ) @@ -3257,7 +3758,7 @@ static void bcf_set_variant_type(const char *ref, const char *alt, variant_t *va // should do also complex events, SVs, etc... } -static void bcf_set_variant_types(bcf1_t *b) +static int bcf_set_variant_types(bcf1_t *b) { if ( !(b->unpacked & BCF_UN_STR) ) bcf_unpack(b, BCF_UN_STR); bcf_dec_t *d = &b->d; @@ -3268,12 +3769,15 @@ static void bcf_set_variant_types(bcf1_t *b) } int i; b->d.var_type = 0; + d->var[0].type = VCF_REF; + d->var[0].n = 0; for (i=1; in_allele; i++) { bcf_set_variant_type(d->allele[0],d->allele[i], &d->var[i]); b->d.var_type |= d->var[i].type; //fprintf(stderr,"[set_variant_type] %d %s %s -> %d %d .. %d\n", b->pos+1,d->allele[0],d->allele[i],d->var[i].type,d->var[i].n, b->d.var_type); } + return 0; } int bcf_get_variant_types(bcf1_t *rec) @@ -3331,6 +3835,14 @@ int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const v else bcf_enc_vchar(&str, strlen((char*)values), (char*)values); } + else if ( type==BCF_HT_LONG ) + { + if (n != 1) { + hts_log_error("Only storing a single BCF_HT_LONG value is supported"); + abort(); + } + bcf_enc_long1(&str, *(int64_t *) values); + } else { hts_log_error("The type %d not implemented yet", type); @@ -3353,7 +3865,8 @@ int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const v } else { - assert( !inf->vptr_free ); // fix the caller or improve here: this has been modified before + if ( inf->vptr_free ) + free(inf->vptr - inf->vptr_off); bcf_unpack_info_core1((uint8_t*)str.s, inf); inf->vptr_free = 1; line->d.shared_dirty |= BCF1_DIRTY_INF; @@ -3371,7 +3884,11 @@ int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const v } 
line->unpacked |= BCF_UN_INFO; - if ( n==1 && !strcmp("END",key) ) line->rlen = ((int32_t*)values)[0] - line->pos; + if ( n==1 && !strcmp("END",key) ) { + assert(type == BCF_HT_INT || type == BCF_HT_LONG); + int64_t end = type == BCF_HT_INT ? *(int32_t *) values : *(int64_t *) values; + line->rlen = end - line->pos; + } return 0; } @@ -3493,7 +4010,8 @@ int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const } else { - assert( !fmt->p_free ); // fix the caller or improve here: this has been modified before + if ( fmt->p_free ) + free(fmt->p - fmt->p_off); bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt); fmt->p_free = 1; line->d.indiv_dirty = 1; @@ -3713,7 +4231,7 @@ int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, voi { int i, ret = -4, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag); if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,tag_id) ) return -1; // no such INFO field in the header - if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=type ) return -2; // expected different type + if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=(type & 0xff) ) return -2; // expected different type if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO); @@ -3737,7 +4255,15 @@ int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, voi } // Make sure the buffer is big enough - int size1 = type==BCF_HT_INT ? 
sizeof(int32_t) : sizeof(float); + int size1; + switch (type) { + case BCF_HT_INT: size1 = sizeof(int32_t); break; + case BCF_HT_LONG: size1 = sizeof(int64_t); break; + case BCF_HT_REAL: size1 = sizeof(float); break; + default: + hts_log_error("Unexpected output type %d", type); + return -2; + } if ( *ndst < info->len ) { *ndst = info->len; @@ -3758,11 +4284,28 @@ int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, voi ret = j; \ } while (0) switch (info->type) { - case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, p==bcf_int8_missing, p==bcf_int8_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); break; - case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); break; - case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); break; + case BCF_BT_INT8: + if (type == BCF_HT_LONG) { + BRANCH(int8_t, le_to_i8, p==bcf_int8_missing, p==bcf_int8_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t); + } else { + BRANCH(int8_t, le_to_i8, p==bcf_int8_missing, p==bcf_int8_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); + } + break; + case BCF_BT_INT16: + if (type == BCF_HT_LONG) { + BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t); + } else { + BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); + } + break; + case BCF_BT_INT32: + if (type == BCF_HT_LONG) { + BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t); break; + } else { + BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); break; + } case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set(tmp, p), float); 
break; - default: hts_log_error("Unexpected type %d", info->type); exit(1); + default: hts_log_error("Unexpected type %d", info->type); return -2; } #undef BRANCH return ret; // set by BRANCH diff --git a/vcf_sweep.c b/vcf_sweep.c index 799e78af3..234ea881f 100644 --- a/vcf_sweep.c +++ b/vcf_sweep.c @@ -1,6 +1,6 @@ /* vcf_sweep.c -- forward/reverse sweep API. - Copyright (C) 2013 Genome Research Ltd. + Copyright (C) 2013-2014, 2019 Genome Research Ltd. Author: Petr Danecek @@ -22,6 +22,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include "htslib/vcf_sweep.h" @@ -49,8 +50,8 @@ struct _bcf_sweep_t }; BGZF *hts_get_bgzfp(htsFile *fp); -int hts_useek(htsFile *file, long uoffset, int where); -long hts_utell(htsFile *file); +int hts_useek(htsFile *file, off_t uoffset, int where); +off_t hts_utell(htsFile *file); static inline int sw_rec_equal(bcf_sweep_t *sw, bcf1_t *rec) { @@ -66,7 +67,7 @@ static inline int sw_rec_equal(bcf_sweep_t *sw, bcf1_t *rec) return 1; } -static void sw_rec_save(bcf_sweep_t *sw, bcf1_t *rec) +static int sw_rec_save(bcf_sweep_t *sw, bcf1_t *rec) { sw->lrid = rec->rid; sw->lpos = rec->pos; @@ -78,11 +79,13 @@ static void sw_rec_save(bcf_sweep_t *sw, bcf1_t *rec) sw->lals_len = len; hts_expand(char, len, sw->mlals, sw->lals); memcpy(sw->lals, rec->d.allele[0], len); + + return 0; // FIXME: check for errs in this function } -static void sw_fill_buffer(bcf_sweep_t *sw) +static int sw_fill_buffer(bcf_sweep_t *sw) { - if ( !sw->iidx ) return; + if ( !sw->iidx ) return 0; sw->iidx--; int ret = hts_useek(sw->file, sw->idx[sw->iidx], 0); @@ -102,6 +105,8 @@ static void sw_fill_buffer(bcf_sweep_t *sw) rec = &sw->rec[sw->nrec]; } sw_rec_save(sw, &sw->rec[0]); + + return 0; // FIXME: check for errs in this function } bcf_sweep_t 
*bcf_sweep_init(const char *fname) @@ -146,7 +151,7 @@ bcf1_t *bcf_sweep_fwd(bcf_sweep_t *sw) { if ( sw->direction==SW_BWD ) sw_seek(sw, SW_FWD); - long pos = hts_utell(sw->file); + off_t pos = hts_utell(sw->file); bcf1_t *rec = &sw->rec[0]; int ret = bcf_read1(sw->file, sw->hdr, rec); diff --git a/vcfutils.c b/vcfutils.c index 97cd52541..0c70977b3 100644 --- a/vcfutils.c +++ b/vcfutils.c @@ -1,6 +1,6 @@ /* vcfutils.c -- allele-related utility functions. - Copyright (C) 2012-2016 Genome Research Ltd. + Copyright (C) 2012-2018 Genome Research Ltd. Author: Petr Danecek @@ -22,7 +22,9 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include +#include #include "htslib/vcfutils.h" #include "htslib/kbitset.h" @@ -64,12 +66,12 @@ int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which) case BCF_BT_INT8: BRANCH_INT(int8_t); break; case BCF_BT_INT16: BRANCH_INT(int16_t); break; case BCF_BT_INT32: BRANCH_INT(int32_t); break; - default: hts_log_error("Unexpected type %d at %s:%d", ac_type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; + default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, ac_type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; } #undef BRANCH_INT if ( anid[BCF_DT_CTG][line->rid].key, line->pos+1); + hts_log_error("Incorrect AN/AC counts at %s:%"PRIhts_pos, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); } ac[0] = an - nac; @@ -98,7 +100,7 @@ int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which) if ( bcf_gt_is_missing(p[ial]) ) continue; /* missing allele */ \ if ( p[ial]>>1 > line->n_allele ) \ { \ - hts_log_error("Incorrect allele (\"%d\") in %s at %s:%d", (p[ial]>>1)-1, header->samples[i], header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \ + 
hts_log_error("Incorrect allele (\"%d\") in %s at %s:%"PRIhts_pos, (p[ial]>>1)-1, header->samples[i], header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \ exit(1); \ } \ ac[(p[ial]>>1)-1]++; \ @@ -109,7 +111,7 @@ int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which) case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break; case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break; - default: hts_log_error("Unexpected type %d at %s:%d", fmt_gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; + default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, fmt_gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; } #undef BRANCH_INT return 1; @@ -188,7 +190,7 @@ int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line) if ( p[ial]==vector_end ) break; /* smaller ploidy */ \ if ( bcf_gt_is_missing(p[ial]) ) continue; /* missing allele */ \ if ( (p[ial]>>1)-1 >= line->n_allele ) { \ - hts_log_error("Allele index is out of bounds at %s:%d", header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \ + hts_log_error("Allele index is out of bounds at %s:%"PRIhts_pos, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \ ret = -1; \ goto clean; \ } \ @@ -200,7 +202,7 @@ int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line) case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break; case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break; - default: hts_log_error("Unexpected GT %d at %s:%d", + default: hts_log_error("Unexpected GT %d at %s:%"PRIhts_pos, gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos + 1); goto clean; } @@ -222,7 +224,7 @@ int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line) return ret ? 
ret : nrm; } -void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int rm_mask) +int bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int rm_mask) { int i; kbitset_t *rm_set = kbs_init(line->n_allele); @@ -231,6 +233,8 @@ void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int rm_mask) bcf_remove_allele_set(header, line, rm_set); kbs_destroy(rm_set); + + return 0; // FIXME: check for errs in this function } int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kbitset_t *rm_set) @@ -263,7 +267,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb int nR_new = line->n_allele-nrm; if ( nR_new<=0 ) // should not be able to remove reference allele { - hts_log_error("Cannot remove reference allele at %s:%d [%d]", + hts_log_error("Cannot remove reference allele at %s:%"PRIhts_pos" [%d]", bcf_seqname(header,line), line->pos+1, nR_new); goto err; } @@ -294,7 +298,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb mdat_bytes = mdat * size; if ( nret<0 ) { - hts_log_error("Could not access INFO/%s at %s:%d [%d]", + hts_log_error("Could not access INFO/%s at %s:%"PRIhts_pos" [%d]", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); goto err; } @@ -332,7 +336,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb if ( j==1 && s == '.' ) continue; // missing if ( j!=nexp ) { - hts_log_error("Unexpected number of values in INFO/%s at %s:%d; expected Number=%c=%d, but found %d", + hts_log_error("Unexpected number of values in INFO/%s at %s:%"PRIhts_pos"; expected Number=%c=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, vlen==BCF_VL_A ? 'A' : 'R', nexp, j); goto err; } @@ -363,7 +367,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb if ( n==1 && s == '.' 
) continue; // missing if ( n!=nG_ori ) { - hts_log_error("Unexpected number of values in INFO/%s at %s:%d; expected Number=G=%d, but found %d", + hts_log_error("Unexpected number of values in INFO/%s at %s:%"PRIhts_pos"; expected Number=G=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nG_ori, n); goto err; } @@ -372,7 +376,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)str.s, str.l, type); if ( nret<0 ) { - hts_log_error("Could not update INFO/%s at %s:%d [%d]", + hts_log_error("Could not update INFO/%s at %s:%"PRIhts_pos" [%d]", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); goto err; } @@ -404,7 +408,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb { if ( nret!=nA_ori ) { - hts_log_error("Unexpected number of values in INFO/%s at %s:%d; expected Number=A=%d, but found %d", + hts_log_error("Unexpected number of values in INFO/%s at %s:%"PRIhts_pos"; expected Number=A=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nA_ori, nret); goto err; } @@ -416,7 +420,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb { if ( nret!=nR_ori ) { - hts_log_error("Unexpected number of values in INFO/%s at %s:%d; expected Number=R=%d, but found %d", + hts_log_error("Unexpected number of values in INFO/%s at %s:%"PRIhts_pos"; expected Number=R=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nR_ori, nret); goto err; } @@ -448,7 +452,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb { if ( nret!=nG_ori ) { - hts_log_error("Unexpected number of values in INFO/%s at %s:%d; expected Number=R=%d, but found %d", + hts_log_error("Unexpected number of values 
in INFO/%s at %s:%"PRIhts_pos"; expected Number=R=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nG_ori, nret); goto err; } @@ -482,7 +486,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)dat, ndat, type); if ( nret<0 ) { - hts_log_error("Could not update INFO/%s at %s:%d [%d]", + hts_log_error("Could not update INFO/%s at %s:%"PRIhts_pos" [%d]", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); goto err; } @@ -508,7 +512,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb int al = bcf_gt_allele(ptr[j]); if ( !( al=0 ) ) { - hts_log_error("Problem updating genotypes at %s:%d [ al=0 :: al=%d,nR_ori=%d,map[al]=%d ]", + hts_log_error("Problem updating genotypes at %s:%"PRIhts_pos" [ al=0 :: al=%d,nR_ori=%d,map[al]=%d ]", bcf_seqname(header,line), line->pos+1, al, nR_ori, map[al]); goto err; } @@ -519,7 +523,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb nret = bcf_update_genotypes(header, line, (void*)dat, nret*line->n_sample); if ( nret<0 ) { - hts_log_error("Could not update FORMAT/GT at %s:%d [%d]", + hts_log_error("Could not update FORMAT/GT at %s:%"PRIhts_pos" [%d]", bcf_seqname(header,line), line->pos+1, nret); goto err; } @@ -546,7 +550,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb mdat_bytes = mdat * size; if ( nret<0 ) { - hts_log_error("Could not access FORMAT/%s at %s:%d [%d]", + hts_log_error("Could not access FORMAT/%s at %s:%"PRIhts_pos" [%d]", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); goto err; } @@ -587,7 +591,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb if ( k_src==1 && s == '.' 
) continue; // missing if ( k_src!=nexp ) { - hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=%c=%d, but found %d", + hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRIhts_pos"; expected Number=%c=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, vlen==BCF_VL_A ? 'A' : 'R', nexp, k_src); goto err; } @@ -612,7 +616,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb if ( nexp==1 && s == '.' ) continue; // missing if ( nexp!=nG_ori && nexp!=nR_ori ) { - hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=G=%d(diploid) or %d(haploid), but found %d", + hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRIhts_pos"; expected Number=G=%d(diploid) or %d(haploid), but found %d", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nG_ori, nR_ori, nexp); goto err; } @@ -657,7 +661,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb } if ( k_src!=nR_ori ) { - hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=G=%d(haploid), but found %d", + hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRIhts_pos"; expected Number=G=%d(haploid), but found %d", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nR_ori, k_src); goto err; } @@ -669,7 +673,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)str.s, str.l, type); if ( nret<0 ) { - hts_log_error("Could not update FORMAT/%s at %s:%d [%d]", + hts_log_error("Could not update FORMAT/%s at %s:%"PRIhts_pos" [%d]", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); goto err; } @@ -705,7 +709,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const 
struct kb { if ( nori!=nA_ori ) { - hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=A=%d, but found %d", + hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRIhts_pos"; expected Number=A=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nA_ori, nori); goto err; } @@ -717,7 +721,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb { if ( nori!=nR_ori ) { - hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=R=%d, but found %d", + hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRIhts_pos"; expected Number=R=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nR_ori, nori); goto err; } @@ -753,7 +757,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb { if ( nori!=nG_ori ) { - hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=G=%d, but found %d", + hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRIhts_pos"; expected Number=G=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nG_ori, nori); goto err; } @@ -806,7 +810,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)dat, ndat, type); if ( nret<0 ) { - hts_log_error("Could not update FORMAT/%s at %s:%d [%d]", + hts_log_error("Could not update FORMAT/%s at %s:%"PRIhts_pos" [%d]", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); goto err; } diff --git a/version.sh b/version.sh index 3b3a8c5ee..be7b29cdf 100755 --- a/version.sh +++ b/version.sh @@ -1,7 +1,30 @@ #!/bin/sh +# version.sh -- Script to build the htslib version string +# +# Author : James Bonfield +# +# Copyright (C) 2017-2018 Genome Research Ltd. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. # Master version, for use in tarballs or non-git source copies -VERSION=1.9 +VERSION=1.10 # If we have a git clone, then check against the current tag if [ -e .git ] @@ -22,9 +45,9 @@ then v3=`expr "$VERSION" : '[0-9]*.[0-9]*.\([0-9]*\)'` if [ -z "`expr "$VERSION" : '^\([0-9.]*\)$'`" ] then - VERSION="$v1.$v2.255" + VERSION="$v1.$v2.255" else - VERSION="$v1.$v2${v3:+.}$v3" + VERSION="$v1.$v2${v3:+.}$v3" fi fi