diff --git a/.dockerignore b/.dockerignore index 91b832e..8ad355b 100644 --- a/.dockerignore +++ b/.dockerignore @@ -3,3 +3,6 @@ tests/*/output/* tests/*/temp.fasta tests/hrefpkg-build/hrefpkg *.ssi +*-env +deenurp.egg-info +build diff --git a/.travis.yml b/.travis.yml index b5b9a2f..2d12726 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,7 @@ -sudo: false language: python -python: 2.7 +python: + - 3.8 + - 3.9 cache: # must specify pip dir @@ -11,13 +12,6 @@ cache: env: global: - # These two environment variables could be set by Travis itself, or Travis - # could configure itself in /etc/, ~/, or inside of the virtual - # environments. In any case if these two values get configured then end - # users only need to enable the pip cache and manually run pip wheel before - # running pip install. - - PIP_WHEEL_DIR=$HOME/.cache/pip/wheels - - PIP_FIND_LINKS=file://$HOME/.cache/pip/wheels - VIRTUAL_ENV=deenurp-env addons: @@ -32,7 +26,7 @@ install: - "bin/bootstrap.sh $VIRTUAL_ENV" script: - - python setup.py test + - python3 -m deenurp.test - tests/run.sh notifications: diff --git a/CHANGES.rst b/CHANGES.rst index 4513c7d..528aa69 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -2,6 +2,11 @@ Changes for deenurp ===================== +0.3.0 +===== + +* Python 3 support, dependency upgrades, and bug fixes + 0.2.7 ===== diff --git a/Dockerfile b/Dockerfile index eb2188a..850f01e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,30 +1,21 @@ -FROM ubuntu:18.04 -MAINTAINER sminot@fredhutch.org +FROM python:3.9 +LABEL org.opencontainers.image.authors="sminot@fredhutch.org,nhoffman@uw.edu,crosenth@uw.edu" # Install prerequisites -RUN apt-get update && apt-get install --assume-yes --no-install-recommends \ - build-essential \ - gfortran \ - git \ - liblapack-dev \ - libopenblas-dev \ - make \ - python-dev \ - python-pip \ - python2.7 \ - unzip \ - wget +RUN apt-get update && \ +apt-get upgrade --assume-yes && \ +apt-get install --assume-yes --no-install-recommends git wget # Add files RUN mkdir /usr/local/share/deenurp/ ADD bin /usr/local/share/deenurp/bin ADD tests /usr/local/share/deenurp/tests ADD deenurp /usr/local/share/deenurp/deenurp -ADD deenurp.py setup.py requirements.txt MANIFEST.in /usr/local/share/deenurp/ +ADD deenurp.py setup.py requirements.txt /usr/local/share/deenurp/ # Install deenurp and dependencies RUN cd /usr/local/share/deenurp/ && \ - PYTHON=/usr/bin/python2.7 \ + PYTHON=/usr/local/bin/python3 \ DEENURP=/usr/local/share/deenurp/ \ bin/bootstrap.sh /usr/local/ @@ -32,17 +23,12 @@ RUN cd /usr/local/share/deenurp/ && \ RUN rm -rf /var/lib/apt/lists/* && \ rm -rf /root/.cache/pip && \ rm -rf /usr/local/share/deenurp/src && \ - apt-get purge -y --auto-remove \ - build-essential \ - unzip \ - git \ - python-dev \ - make + apt-get purge -y --auto-remove git # create some mount points RUN mkdir -p /app /fh /mnt /run/shm # Run tests -RUN python -m deenurp.test && \ +RUN python3 -m deenurp.test && \ cd /usr/local/share/deenurp && \ tests/run.sh diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 793ab0b..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,4 +0,0 @@ -include requirements.txt -include README.rst -include deenurp/data/* -include deenurp/test/data/* diff --git a/README.rst b/README.rst index 7889048..1740b69 100644 --- a/README.rst +++ b/README.rst @@ -12,9 +12,13 @@ The Easy Way * confirm availability of necessary libraries to compile dependencies (on Ubuntu: ``sudo apt-get install gfortran libopenblas-dev liblapack-dev``) -* Install Python 2.7 -* run 
`bin/bootstrap.sh` -* run `source bin-env/bin/activate` +* Install Python >= 3.8 and create a virtual environment +:: + + % python3 -m venv bin-env + % source bin-env/bin/activate + % bin/bootstrap.sh + the `deenurp` executable should now be on your `$PATH` @@ -25,7 +29,7 @@ See required system libraries above. First, install binary dependencies: -* Python 2.7 +* Python 3 - pip, for installing python dependencies (http://www.pip-installer.org/) - Python packages: @@ -46,6 +50,12 @@ Finally, install:: python setup.py install +The Docker Way +============== + +Deenurp can be run from a Docker image, which can be built locally from the Dockerfile +or pulled from Docker Hub: ``docker pull nghoffman/deenurp:v0.3.0`` + De-novo reference set creation ============================== diff --git a/Singularity b/Singularity deleted file mode 100644 index 67950dd..0000000 --- a/Singularity +++ /dev/null @@ -1,41 +0,0 @@ -Bootstrap:docker -From:ubuntu:18.04 - -%setup - mkdir ${SINGULARITY_ROOTFS}/src - -%files - bin /src/bin - tests /src/tests - deenurp /src/deenurp - setup.py /src/ - requirements.txt /src/ - MANIFEST.in /src/ - -%post - apt-get update && apt-get install --assume-yes --no-install-recommends \ - build-essential \ - gfortran \ - git \ - liblapack-dev \ - libopenblas-dev \ - make \ - python-dev \ - python-pip \ - python2.7 \ - unzip \ - wget - cd /src/ && \ - PYTHON=/usr/bin/python2.7 \ - DEENURP=/src/ \ - bin/bootstrap.sh /usr/local/ - rm -rf /var/lib/apt/lists/* - rm -rf /root/.cache/pip - rm -rf /src/src - apt-get purge -y --auto-remove \ - build-essential \ - unzip \ - git \ - python-dev \ - make - mkdir -p /fh /app /mnt # create bind points diff --git a/bin/bootstrap.sh b/bin/bootstrap.sh index b7ca66a..53dc22a 100755 --- a/bin/bootstrap.sh +++ b/bin/bootstrap.sh @@ -14,23 +14,9 @@ set -e # installs deenurp and dependencies to $VIRTUAL_ENV if defined; # otherwise creates a virtualenv locally. -# Will attempt to install python packages from wheels if $PIP_FIND_LINKS is defined -# and pip --use-wheel is specified - -# set $PIP_WHEEL_DIR and $PIP_FIND_LINKS in the parent environment if -# desired - -# Will attempt to create wheels if $PIP_WHEEL_DIR is defined -# see https://pip.pypa.io/en/latest/user_guide.html#environment-variables - - mkdir -p src SRCDIR=$(readlink -f src) -if [[ -n "$PIP_WHEEL_DIR" ]]; then - mkdir -p "$PIP_WHEEL_DIR" -fi - srcdir(){ tar -tf $1 | head -1 } @@ -44,7 +30,7 @@ else fi if [[ -z $PYTHON ]]; then - PYTHON=$(which python2) + PYTHON=$(which python3) fi # Defines the default source directory for deenurp as the parent of @@ -55,65 +41,24 @@ fi VENV_VERSION=15.1.0 PPLACER_BUILD=1.1.alpha19 -INFERNAL_VERSION=1.1.2 +INFERNAL_VERSION=1.1.4 RAXML_VERSION=8.0.5 MUSCLE_VERSION=3.8.31 VSEARCH_VERSION=2.6.2 -check_version(){ - # usage: check_version module version-string - "$PYTHON" <<EOF &> /dev/null -import $1 -from distutils.version import LooseVersion -assert LooseVersion($1.__version__) >= LooseVersion("$2") -EOF -} - -# create virtualenv if necessary, downloading source if available -# version is not up to date. -VENV_URL="https://github.com/pypa/virtualenv/archive/${VENV_VERSION}" +# create virtualenv if [[ ! -f "${venv:?}/bin/activate" ]]; then - # if the system virtualenv is up to date, use it - if check_version virtualenv $VENV_VERSION; then - echo "using $(which virtualenv) (version $(virtualenv --version))" - virtualenv "$venv" - else - echo "downloading virtualenv version $VENV_VERSION" - if [[ ! 
-f src/virtualenv-${VENV_VERSION}/virtualenv.py ]]; then - mkdir -p src - (cd src && \ - wget --quiet -nc ${VENV_URL}.tar.gz && \ - tar -xf ${VENV_VERSION}.tar.gz) - fi - "$PYTHON" src/virtualenv-${VENV_VERSION}/virtualenv.py "$venv" - fi + $PYTHON -m venv $venv else echo "virtualenv $venv already exists" fi source $venv/bin/activate - # full path; set by activate venv=$VIRTUAL_ENV - -# Preserve the order of installation. The requirements are sorted so -# that secondary (and higher-order) dependencies appear first. See -# bin/pipdeptree2requirements.py. We use --no-deps to prevent various -# packages from being repeatedly installed, uninstalled, reinstalled, -# etc. Also, enfoprcing the order of installation ensures that -# install-time dependencies are met (`pip install -r requirements.txt` -# fails due to a install-time dependency that cogent has for numpy) -pip2 install -U pip - -# install pysqlite and updated sqlite3 libraries -wget --quiet -O - \ - https://raw.githubusercontent.com/fhcrc/taxtastic/master/dev/install_pysqlite.sh | bash - -while read pkg; do - pip2 install "$pkg" --no-deps --upgrade -done < <(/bin/grep -v -E '^#|^$' "$DEENURP/requirements.txt") - -pip2 install "$DEENURP" +pip install -U pip wheel +pip install -r "$DEENURP/requirements.txt" +pip install "$DEENURP" # install pplacer and accompanying python scripts PPLACER_DIR=pplacer-Linux-v${PPLACER_BUILD} @@ -128,11 +73,12 @@ if pplacer_is_installed; then $venv/bin/pplacer --version else mkdir -p src && \ - (cd src && \ - wget -nc --quiet https://github.com/matsen/pplacer/releases/download/v$PPLACER_BUILD/$PPLACER_ZIP && \ - unzip -o $PPLACER_ZIP && \ - cp $PPLACER_DIR/{pplacer,guppy,rppr} $venv/bin && \ - pip2 install -U $PPLACER_DIR/scripts) + (cd src \ + && wget -nc --quiet https://github.com/matsen/pplacer/releases/download/v$PPLACER_BUILD/$PPLACER_ZIP \ + && unzip -o $PPLACER_ZIP \ + && cp $PPLACER_DIR/{pplacer,guppy,rppr} $venv/bin \ + # && pip2 install -U $PPLACER_DIR/scripts \ + ) # confirm that we have installed the requested build if ! pplacer_is_installed; then echo -n "Error: you requested pplacer build $PPLACER_BUILD " @@ -218,4 +164,3 @@ else cd muscle${MUSCLE_VERSION}/src && \ ./mk && cp muscle $venv/bin) fi - diff --git a/bin/pipdeptree2requirements.py b/bin/pipdeptree2requirements.py index 960de26..65c7ef5 100755 --- a/bin/pipdeptree2requirements.py +++ b/bin/pipdeptree2requirements.py @@ -21,5 +21,5 @@ deps = set() for __, pkg in sorted(lines): if pkg not in deps: - print pkg + print(pkg) deps.add(pkg) diff --git a/deenurp/__init__.py b/deenurp/__init__.py index ddbaf5c..d61df57 100644 --- a/deenurp/__init__.py +++ b/deenurp/__init__.py @@ -24,7 +24,7 @@ import os import pkgutil import sys -import util +from . import util log = logging.getLogger(__name__) @@ -89,9 +89,8 @@ def setup_logging(namespace): '%(funcName)s %(lineno)s %(message)s') datefmt = '%Y-%m-%d %H:%M:%S' - logging.basicConfig(stream=namespace.log, format=log_format, - level=loglevel, log_format=log_format, - datefmt=datefmt) + logging.basicConfig( + stream=namespace.log, format=log_format, level=loglevel, datefmt=datefmt) def parse_version(parser): @@ -105,7 +104,7 @@ def parse_args(parser): parser.add_argument('-l', '--log', metavar='FILE', default=sys.stdout, - type=util.file_opener('a', buffering=0), # append + type=util.file_opener('a'), # append help='Send logging to a file') parser.add_argument('-v', '--verbose', @@ -128,7 +127,7 @@ def parse_subcommands(parser, argv): """ Setup all sub-commands """ - import subcommands + from . 
import subcommands subparsers = parser.add_subparsers(dest='subparser_name') @@ -158,7 +157,7 @@ try: imp = '{}.{}'.format(subcommands.__name__, name) mod = importlib.import_module(imp) - except Exception, e: + except Exception as e: log.error('error importing subcommand {}'.format(name)) log.error(e) continue diff --git a/deenurp/outliers.py b/deenurp/outliers.py index 8654f52..3928d5c 100644 --- a/deenurp/outliers.py +++ b/deenurp/outliers.py @@ -33,7 +33,7 @@ spl = line.split() assert len(spl) == N + 1 taxa.append(spl.pop(0)) - distmat[row, :] = map(float, spl) + distmat[row, :] = list(map(float, spl)) return taxa, distmat @@ -48,7 +48,7 @@ cmd = ['FastTree', '-nt', '-makematrix', fasta] - with tempfile.TemporaryFile('rw') as stdout, open(os.devnull) as devnull: + with tempfile.TemporaryFile('w+') as stdout, open(os.devnull) as devnull: proc = subprocess.Popen(cmd, stdout=stdout, stderr=devnull) proc.communicate() stdout.flush() @@ -98,14 +98,20 @@ """ + # A previous implementation used a masked_array to guard against na + # values, but apparently this results in undefined behavior: + # https://github.com/numpy/numpy/issues/14716 + # use a masked array in case there are any nan - ma = np.ma.masked_array(distmat, np.isnan(distmat)) + # ma = np.ma.masked_array(distmat, np.isnan(distmat)) # index of most central element. - medoid = find_medoid(ma) + # medoid = find_medoid(ma) + medoid = find_medoid(distmat) # distance from each element to most central element - dists = ma[medoid, :] + # dists = ma[medoid, :] + dists = distmat[medoid, :] to_prune = dists > radius return medoid, dists, to_prune @@ -147,12 +153,13 @@ def outliers_by_cluster(distmat, t, D, min_size=1, cluster_type='single', **kwar # all of the sequences log.warning('no clusters were found') - medoids = pd.DataFrame.from_items([ - ('cluster', [-1]), - ('count', [len(clusters)]), - ('medoid', [find_medoid(distmat)]), - ('dist', [None]) - ]) + medoids = pd.DataFrame({ + 'cluster': [-1], + 'count': [len(clusters)], + 'medoid': [find_medoid(distmat)], + 'dist': [None], + }) + to_prune = pd.Series([False for x in clusters]) else: medoids = find_cluster_medoids(distmat, clusters) @@ -192,7 +199,7 @@ def scipy_cluster(X, module, t, **kwargs): Z = fun(y) clusters = scipy.cluster.hierarchy.fcluster(Z, t, **args) title = 'scipy.cluster.hierarchy.{} {}'.format( - module, ' '.join('%s=%s' % item for item in args.items())) + module, ' '.join('%s=%s' % item for item in list(args.items()))) return clusters, title @@ -237,7 +244,7 @@ def find_cluster_medoids(X, clusters): zip([0 if c == -1 else 1 for c in uclusters], counts, uclusters), reverse=True) - __, counts, uclusters = zip(*tallies) + __, counts, uclusters = list(zip(*tallies)) medoids = [(None if cluster == -1 else find_medoid(X, clusters == cluster)) for _, _, cluster in tallies] @@ -245,12 +252,12 @@ # measure distances from the medoid of the first (largest) cluster dists = [None if medoid is None else X[medoids[0], medoid] for medoid in medoids] - return pd.DataFrame.from_items([ - ('cluster', uclusters), - ('count', counts), - ('medoid', medoids), - ('dist', dists) - ]) + return pd.DataFrame({ + 'cluster': uclusters, + 'count': counts, + 'medoid': medoids, + 'dist': dists, + }) def choose_clusters(df, min_size, max_dist): @@ -317,11 +324,11 @@ def mds(X, taxa, n_jobs=1): n_jobs=n_jobs) if np.all(X == 0): - df = 
pd.DataFrame.from_items([ - ('seqname', taxa), - ('x', np.zeros(n)), - ('y', np.zeros(n)) - ]) + df = pd.DataFrame({ + 'seqname': taxa, + 'x': np.zeros(n), + 'y': np.zeros(n), + }) else: mds_fit = mds.fit_transform(X) df = pd.DataFrame(mds_fit, columns=['x', 'y']) diff --git a/deenurp/search.py b/deenurp/search.py index a0668b0..4b5d939 100644 --- a/deenurp/search.py +++ b/deenurp/search.py @@ -13,9 +13,7 @@ from deenurp import uclust from Bio import SeqIO -from .util import SingletonDefaultDict, memoize - -_ntf = tempfile.NamedTemporaryFile +from .util import SingletonDefaultDict, memoize, ntf SELECT_THRESHOLD = 0.05 SEARCH_THRESHOLD = 0.90 @@ -123,7 +121,7 @@ def _search(con, quiet=True, select_threshold=SELECT_THRESHOLD, cursor = con.cursor() count = 0 ref_name = p['ref_fasta'] - with open(p['ref_meta']) as fp: + with open(p['ref_meta'], 'r') as fp: cluster_info = _load_cluster_info(fp, p['group_field']) @memoize @@ -140,7 +138,8 @@ def get_seq_id(name): cursor.execute(sql, [name]) return cursor.fetchone()[0] - with _ntf(prefix='usearch') as uc_fp: + with ntf(prefix='usearch') as uc_fp: + uc_fp.close() uclust.search( ref_name, p['fasta_file'], @@ -153,16 +152,16 @@ def get_seq_id(name): # import shutil # shutil.copy(uc_fp.name, '.') - records = uclust.parse_uclust_out(uc_fp) - records = (i for i in records if i.type == - 'H' and i.pct_id >= p['search_identity'] * 100.0) + records = uclust.parse_uclust_out(uc_fp.name) + records = (i for i in records + if i.type == 'H' and i.pct_id >= p['search_identity'] * 100.0) by_seq = uclust.hits_by_sequence(records) by_seq = select_hits(by_seq, select_threshold) sql = """ -INSERT INTO best_hits (sequence_id, hit_idx, ref_id, pct_id) -VALUES (?, ?, ?, ?) -""" + INSERT INTO best_hits (sequence_id, hit_idx, ref_id, pct_id) + VALUES (?, ?, ?, ?) + """ for _, hits in by_seq: # Drop clusters from blacklist hits = ( @@ -225,7 +224,7 @@ def get_sample_id(sample_name): seq_count += 1 if sequence.id not in weights: continue - for sample, weight in weights[sequence.id].items(): + for sample, weight in list(weights[sequence.id].items()): sample_id = get_sample_id(sample) cursor.execute("""INSERT INTO sequences_samples (sequence_id, sample_id, weight) @@ -244,11 +243,13 @@ def _create_tables( search_identity=SEARCH_IDENTITY, quiet=True, group_field='cluster'): + schema = os.path.join(os.path.dirname(__file__), 'data', 'search.schema') cursor = con.cursor() cursor.executescript(open(schema).read().strip()) + # Save parameters - rows = [(k, locals().get(k)) for k in _PARAMS.keys()] + rows = [(k, v) for k, v in locals().items() if k in _PARAMS] cursor.executemany("INSERT INTO params VALUES (?, ?)", rows) diff --git a/deenurp/select.py b/deenurp/select.py index 5903de4..6f74b73 100644 --- a/deenurp/select.py +++ b/deenurp/select.py @@ -19,7 +19,7 @@ from . 
import util, wrap from .config import DEFAULT_THREADS -from .util import as_fasta, tempdir +from .util import as_fasta, tempdir, ntf from .wrap import (cmalign, as_refpkg, redupfile_of_seqs, rppr_min_adcl, guppy_redup, pplacer, esl_sfetch) @@ -64,12 +64,10 @@ def _cluster(sequences, threshold=CLUSTER_THRESHOLD): """ sequences = list(sequences) assert sequences - with as_fasta(sequences) as fasta_name, \ - tempfile.NamedTemporaryFile(prefix='uc-') as ntf: - - uclust.cluster(fasta_name, ntf.name, pct_id=threshold, quiet=True) - ntf.seek(0) - r = list(uclust.cluster_seeds(fasta_name, ntf)) + with as_fasta(sequences) as fasta_name, ntf(prefix='uc-') as uc: + uc.close() + uclust.cluster(fasta_name, uc.name, pct_id=threshold, quiet=True) + r = list(uclust.cluster_seeds(fasta_name, uc.name)) logging.debug("Clustered %d to %d", len(sequences), len(r)) return r @@ -115,7 +113,7 @@ def select_sequences_for_cluster( # the operation below assumes unique identifiers for the set of # ref and query seqs, so ensure that this is the case for seq in query_seqs: - seq.id = seq.id + hashlib.md5(seq.id).hexdigest()[:8] + seq.id = seq.id + hashlib.md5(seq.id.encode('utf-8')).hexdigest()[:8] c = itertools.chain(ref_seqs, query_seqs) @@ -127,10 +125,10 @@ def select_sequences_for_cluster( redupfile_of_seqs(query_seqs) as redup_path: jplace = pplacer(rp.path, fasta, out_dir=placedir(), threads=1) + # Redup guppy_redup(jplace, redup_path, placedir('redup.jplace')) - prune_leaves = set( - rppr_min_adcl(placedir('redup.jplace'), keep_leaves)) + prune_leaves = set(rppr_min_adcl(placedir('redup.jplace'), keep_leaves)) result = frozenset(i.id for i in ref_seqs) - prune_leaves assert len(result) == keep_leaves @@ -223,10 +221,13 @@ def sequences_hitting_cluster(con, cluster_name): def esl_sfetch_seqs(sequence_file, sequence_names, fa_idx): """ """ - with tempfile.NamedTemporaryFile(prefix='esl', suffix='.fasta') as tf: + with ntf('wb', prefix='esl', suffix='.fasta') as tf: + # esl_sfetch() writes binary data, so we close and reopen the + # file to access the sequence data in text mode esl_sfetch(sequence_file, sequence_names, tf, fa_idx) - tf.seek(0) - return list(SeqIO.parse(tf, 'fasta')) + tf.close() + with open(tf.name, 'r') as seqs: + return list(SeqIO.parse(seqs, 'fasta')) def get_total_weight_per_sample(con): @@ -306,11 +307,11 @@ def choose_references( sample_weights = get_sample_weights(deenurp_db, cluster_seq_names) norm_sw = dict() - for k, v in sample_weights.items(): + for k, v in list(sample_weights.items()): norm_sw[k] = v / sample_total_weights[k] max_sample, max_weight = max( - norm_sw.items(), key=operator.itemgetter(1)) + list(norm_sw.items()), key=operator.itemgetter(1)) logging.info( 'Cluster %s: Max hit by %s: %.3f%%, %d hits', diff --git a/deenurp/subcommands/add_reps.py b/deenurp/subcommands/add_reps.py index 9b82a2e..fbbdb70 100644 --- a/deenurp/subcommands/add_reps.py +++ b/deenurp/subcommands/add_reps.py @@ -37,7 +37,7 @@ def action(args): tax = taxonomy.Taxonomy(create_engine('sqlite:///{0}'.format(args.tax_db)), ncbi.ranks) - sequence_ids = set(k for k, v in tax_map.items() + sequence_ids = set(k for k, v in list(tax_map.items()) if v and tax.lineage(tax_id=v).get(args.rank) in tax_ids) # Fetch @@ -47,4 +47,4 @@ def action(args): if r.id in sequence_ids: args.outfile.write('{}\n{}\n'.format(r.description, r.seq)) count += 1 - print 'selected', count, 'sequences' + print('selected', count, 'sequences') diff --git a/deenurp/subcommands/cluster_refs.py b/deenurp/subcommands/cluster_refs.py index 
effb725..1fe0a8e 100644 --- a/deenurp/subcommands/cluster_refs.py +++ b/deenurp/subcommands/cluster_refs.py @@ -28,6 +28,7 @@ def build_parser(p): [default: %(default).3f]""") p.add_argument('-i', '--cluster-id', default=0.985, type=float, help="""Cluster ID [default: %(default).3f]""") + def cluster_identify_redundant(named_sequence_file, named_ids, to_cluster, threshold=0.97): with util.ntf(suffix='.uc', prefix='to_cluster') as tf: @@ -38,12 +39,13 @@ def cluster_identify_redundant(named_sequence_file, named_ids, to_cluster, maxrejects=100) # Uclust.search renames to tf, need a new handle. - records = uclust.parse_uclust_out(tf) + records = uclust.parse_uclust_out(tf.name) hits = (i.query_label for i in records if i.type == 'H' and i.pct_id >= threshold * 100.0) return frozenset(hits) + def taxonomic_clustered(taxonomy, cluster_rank): """ Generate tax_id, sequence_id_set tuples for each tax_id at cluster_rank @@ -52,6 +54,7 @@ def taxonomic_clustered(taxonomy, cluster_rank): return ((node.tax_id, frozenset(node.subtree_sequence_ids())) for node in nodes) + def identify_otus_unnamed(seq_file, cluster_similarity): """ Generates sequence ids in a cluster @@ -64,10 +67,11 @@ def identify_otus_unnamed(seq_file, cluster_similarity): # Sort and cluster uclust.cluster( seq_file, tf.name, pct_id=cluster_similarity, quiet=True) - clusters = uclust.sequences_by_cluster(uclust.parse_uclust_out(tf)) + clusters = uclust.sequences_by_cluster(uclust.parse_uclust_out(tf.name)) for _, sequences in clusters: yield [i.query_label for i in sequences] + def action(a): # index fasta file fa_idx = wrap.read_seq_file(a.named_sequence_file) @@ -154,7 +158,7 @@ def add_cluster(i): seqinfo_records = (seqinfo.get(i, {'seqname': i}) for i in done) seqinfo_records = (add_cluster(i) for i in seqinfo_records) - fields = list(seqinfo.values()[0].keys()) + fields = list(list(seqinfo.values())[0].keys()) fields.append('cluster') w = csv.DictWriter(fp, fields, quoting=csv.QUOTE_NONNUMERIC, lineterminator='\n') diff --git a/deenurp/subcommands/expand_named.py b/deenurp/subcommands/expand_named.py index 2e8663b..83cc3f2 100644 --- a/deenurp/subcommands/expand_named.py +++ b/deenurp/subcommands/expand_named.py @@ -136,8 +136,8 @@ def action(a): w.writerows(i for i in r if i['seqname'] not in overlap) if 'cluster' in fn: rows = ({'seqname': k, 'tax_id': v, 'inferred_tax_id': 'yes', 'cluster': v} - for k, v in update_hits.items()) + for k, v in list(update_hits.items())) else: rows = ({'seqname': k, 'tax_id': v, 'inferred_tax_id': 'yes'} - for k, v in update_hits.items()) + for k, v in list(update_hits.items())) w.writerows(rows) diff --git a/deenurp/subcommands/fill_lonely.py b/deenurp/subcommands/fill_lonely.py index 2ab3558..308d18a 100644 --- a/deenurp/subcommands/fill_lonely.py +++ b/deenurp/subcommands/fill_lonely.py @@ -102,7 +102,7 @@ def build_parser(p): p.add_argument('output', help="""Output file (fasta)""", - type=argparse.FileType('w')) + type=argparse.FileType('wb')) p.add_argument('output_seqinfo', help="""Destination to write seqinfo for new representatives""", @@ -209,7 +209,7 @@ def action(args): additional_reps.add(s) logging.info("%d additional references", len(additional_reps)) - with open(args.chosen_fasta) as fp, args.output as ofp: + with open(args.chosen_fasta, 'rb') as fp, args.output as ofp: shutil.copyfileobj(fp, ofp) wrap.esl_sfetch(args.search_fasta, additional_reps, ofp, fa_idx) diff --git a/deenurp/subcommands/filter_outliers.py b/deenurp/subcommands/filter_outliers.py index ec8fc75..0da6d2c 
100644 --- a/deenurp/subcommands/filter_outliers.py +++ b/deenurp/subcommands/filter_outliers.py @@ -135,7 +135,7 @@ def build_parser(p): output_group.add_argument( '--output-seqs', help="""REQUIRED destination for sequences""", required=True, - type=argparse.FileType('w'), metavar='FILE') + type=argparse.FileType('wb'), metavar='FILE') output_group.add_argument( '--filtered-seqinfo', type=argparse.FileType('w'), metavar='FILE', help="""Path to write filtered sequence info""") @@ -233,22 +230,19 @@ def distmat_muscle(sequence_file, prefix, maxiters=wrap.MUSCLE_MAXITERS): with util.ntf(prefix=prefix, suffix='.fasta') as a_fasta: wrap.muscle_files(sequence_file, a_fasta.name, maxiters=maxiters) a_fasta.flush() - taxa, distmat = outliers.fasttree_dists(a_fasta.name) return taxa, distmat -def distmat_cmalign( - sequence_file, - prefix, - cpu=wrap.CMALIGN_THREADS, - min_bitscore=10): +def distmat_cmalign(sequence_file, prefix, cpu=wrap.CMALIGN_THREADS, + min_bitscore=10): - with util.ntf(prefix=prefix, suffix='.aln') as a_sto, \ - util.ntf(prefix=prefix, suffix='.fasta') as a_fasta: + with util.ntf('w+', prefix=prefix, suffix='.aln') as a_sto, \ + util.ntf('w+', prefix=prefix, suffix='.fasta') as a_fasta: scores = wrap.cmalign_files(sequence_file, a_sto.name, cpu=cpu) + a_sto.seek(0) low_scores = scores['bit_sc'] < min_bitscore if low_scores.any(): @@ -294,8 +291,10 @@ def parse_usearch_allpairs(filename, seqnames): nseqs = len(seqnames) distmat = numpy.repeat(0.0, nseqs ** 2) distmat.shape = (nseqs, nseqs) - ii = pd.match(data['query'], seqnames) - jj = pd.match(data['target'], seqnames) + + idx = dict(zip(seqnames, range(nseqs))) + ii = [idx[name] for name in data['query']] + jj = [idx[name] for name in data['target']] # usearch_allpairs_files returns comparisons corresponding to a # triangular matrix, whereas vsearch_allpairs_files returns all @@ -428,7 +427,7 @@ def mock_filter(seqs, keep): empty = numpy.repeat(numpy.nan, len(seqs)) return pd.DataFrame({ - 'seqname': seqs, + 'seqname': list(seqs), 'centroid': empty, 'dist': empty, 'is_out': numpy.repeat(not keep, len(seqs))}) @@ -492,7 +491,7 @@ def filter_worker(tax_id, def action(a): # itemize sequences provided in the input file fa_idx = wrap.read_seq_file(a.sequence_file) - seqnames = fa_idx.keys() + seqnames = list(fa_idx.keys()) # Load taxonomy with a.taxonomy as fp: @@ -618,7 +617,7 @@ def action(a): log.exception( "Error in child process: %s", exception) executor.shutdown(wait=False) - traceback.print_tb(f._traceback) + traceback.print_tb(exception.__traceback__) raise exception info = futs.pop(f) diff --git a/deenurp/subcommands/hrefpkg_build.py b/deenurp/subcommands/hrefpkg_build.py index a374c32..5d1dcfb 100644 --- a/deenurp/subcommands/hrefpkg_build.py +++ b/deenurp/subcommands/hrefpkg_build.py @@ -106,8 +106,8 @@ def action(a): hrefpkgs = [] futs = {} with open(j('index.csv'), 'w') as fp, \ - open(j('train.fasta'), 'w') as train_fp, \ - open(j('test.fasta'), 'w') as test_fp, \ + open(j('train.fasta'), 'wb') as train_fp, \ + open(j('test.fasta'), 'wb') as test_fp, \ futures.ThreadPoolExecutor(a.threads) as executor: def log_hrefpkg(tax_id): path = j(tax_id + '.refpkg') @@ -125,8 +125,8 @@ def log_hrefpkg(tax_id): continue f = executor.submit(tax_id_refpkg, node.tax_id, taxonomy, seqinfo, - a.sequence_file, fa_idx, output_dir=a.output_dir, test_file=test_fp, - train_file=train_fp) + a.sequence_file, fa_idx, output_dir=a.output_dir, + test_file=test_fp, train_file=train_fp) futs[f] = node.tax_id, node.name while futs: @@ -135,7 +135,8 
@@ def log_hrefpkg(tax_id): tax_id, name = futs.pop(f) r = f.result() if r: - logging.info("Finished refpkg for %s (%s) [%d remaining]", name, tax_id, len(pending)) + logging.info( + 'Finished refpkg for %s (%s) [%d remaining]', name, tax_id, len(pending)) log_hrefpkg(tax_id) assert len(futs) == len(pending) @@ -158,7 +159,7 @@ def find_nodes(taxonomy, index_rank, want_rank='species'): moving up a rank if no species-level nodes with sequences exist. """ ranks = taxonomy.ranks - rdict = dict(zip(ranks, xrange(len(ranks)))) + rdict = dict(list(zip(ranks, list(range(len(ranks)))))) assert index_rank in rdict assert want_rank in rdict @@ -229,14 +230,14 @@ def sequence_names(f): sequence_ids = frozenset(taxonomy.subtree_sequence_ids()) with util.ntf(prefix='aln_fasta', suffix='.fasta') as tf, \ - util.ntf(prefix='seq_info', suffix='.csv') as seq_info_fp, \ - util.ntf(prefix='taxonomy', suffix='.csv') as tax_fp: + util.ntf('w', prefix='seq_info', suffix='.csv') as seq_info_fp, \ + util.ntf('w', prefix='taxonomy', suffix='.csv') as tax_fp: wrap.esl_sfetch(sequence_file, sequence_ids, tf, fa_idx) tf.close() # Seqinfo file r = (i for i in seqinfo if i['seqname'] in sequence_ids) - w = csv.DictWriter(seq_info_fp, seqinfo[0].keys(), lineterminator='\n', + w = csv.DictWriter(seq_info_fp, list(seqinfo[0].keys()), lineterminator='\n', quoting=csv.QUOTE_NONNUMERIC) w.writeheader() w.writerows(r) @@ -252,7 +253,7 @@ def sequence_names(f): rp.update_file('taxonomy', tax_fp.name) rp.update_file('profile', wrap.CM) - for k, v in meta.items(): + for k, v in list(meta.items()): rp.update_metadata(k, v) rp.commit_transaction() @@ -280,12 +281,12 @@ def tax_id_refpkg(tax_id, full_tax, seqinfo, sequence_file, fa_idx, Build a reference package containing all descendants of tax_id from an index reference package. 
""" - with util.ntf(prefix='taxonomy', suffix='.csv') as tax_fp, \ - util.ntf(prefix='aln_sto', suffix='.sto') as sto_fp, \ - util.ntf(prefix='aln_fasta', suffix='.fasta') as fasta_fp, \ + with util.ntf('w', prefix='taxonomy', suffix='.csv') as tax_fp, \ + util.ntf('w+', prefix='aln_sto', suffix='.sto') as sto_fp, \ + util.ntf('w', prefix='aln_fasta', suffix='.fasta') as fasta_fp, \ util.ntf(prefix='tree', suffix='.tre') as tree_fp, \ util.ntf(prefix='tree', suffix='.stats') as stats_fp, \ - util.ntf(prefix='seq_info', suffix='.csv') as seq_info_fp: + util.ntf('w', prefix='seq_info', suffix='.csv') as seq_info_fp: # Subset taxonomy n = full_tax.get_node(tax_id) @@ -295,7 +296,7 @@ def tax_id_refpkg(tax_id, full_tax, seqinfo, sequence_file, fa_idx, tax_fp.close() # Subset seq_info - w = csv.DictWriter(seq_info_fp, seqinfo[0].keys(), + w = csv.DictWriter(seq_info_fp, list(seqinfo[0].keys()), quoting=csv.QUOTE_NONNUMERIC) w.writeheader() rows = [i for i in seqinfo if i['tax_id'] in descendants] @@ -311,8 +312,8 @@ def tax_id_refpkg(tax_id, full_tax, seqinfo, sequence_file, fa_idx, keep_seq_ids |= frozenset(keep) l = len(rest) if l >= 2 * PER_TAXON: - train_seq_ids |= frozenset(rest[:l / 2]) - test_seq_ids |= frozenset(rest[l / 2:]) + train_seq_ids |= frozenset(rest[:l // 2]) + test_seq_ids |= frozenset(rest[l // 2:]) # Picked rows = [sinfo[i] for i in keep_seq_ids] @@ -320,12 +321,13 @@ def tax_id_refpkg(tax_id, full_tax, seqinfo, sequence_file, fa_idx, seq_info_fp.close() # Fetch sequences - with tempfile.NamedTemporaryFile() as tf: - wrap.esl_sfetch(sequence_file, - keep_seq_ids, tf, fa_idx) - # Rewind - tf.seek(0) - sequences = list(SeqIO.parse(tf, 'fasta')) + with util.ntf() as tf: + wrap.esl_sfetch(sequence_file, keep_seq_ids, tf, fa_idx) + tf.close() + # reopen in text mode and read extracted sequences + with open(tf.name) as seqfile: + sequences = list(SeqIO.parse(seqfile, 'fasta')) + logging.info("Tax id %s: %d sequences", tax_id, len(sequences)) if len(set(str(i.seq) for i in sequences)) == 1: @@ -349,8 +351,9 @@ def tax_id_refpkg(tax_id, full_tax, seqinfo, sequence_file, fa_idx, aligned = wrap.cmalign(sequences, output=sto_fp) aligned = list(aligned) assert aligned + # Tree - wrap.fasttree(aligned, log_path=stats_fp.name, output_fp=tree_fp, threads=1, gtr=True) + wrap.fasttree(aligned, log_path=stats_fp.name, output_fp=tree_fp.name, threads=1, gtr=True) tree_fp.close() sto_fp.close() SeqIO.write(aligned, fasta_fp, 'fasta') @@ -366,7 +369,7 @@ def tax_id_refpkg(tax_id, full_tax, seqinfo, sequence_file, fa_idx, try: rp.update_phylo_model('FastTree', stats_fp.name) except: - print >> sys.stderr, stats_fp.read() + print(stats_fp.read(), file=sys.stderr) raise rp.update_file('profile', wrap.CM) rp.commit_transaction() @@ -403,7 +406,7 @@ def w(*args): partition_count = int(partition_prop * child_count) logging.info("Pruning %d/%d from %s-%s", partition_count, child_count, node.tax_id, node.name) - prune = set(random.sample(range(len(children)), partition_count)) + prune = set(random.sample(list(range(len(children))), partition_count)) # Lists of taxa to prune from the individual partitions p1_prune = [n.tax_id for i, n in enumerate(children) if i in prune] diff --git a/deenurp/subcommands/rdp_sequence_filter.py b/deenurp/subcommands/rdp_sequence_filter.py index e010f57..26314a2 100644 --- a/deenurp/subcommands/rdp_sequence_filter.py +++ b/deenurp/subcommands/rdp_sequence_filter.py @@ -65,7 +65,7 @@ def action(a): accepted = 0 rejected = 0 - for sequence, info in itertools.izip(sequences, 
reader): + for sequence, info in zip(sequences, reader): assert sequence.id == info['seqname'] # Check quality diff --git a/deenurp/subcommands/transfer_names.py b/deenurp/subcommands/transfer_names.py index f892bea..479855f 100644 --- a/deenurp/subcommands/transfer_names.py +++ b/deenurp/subcommands/transfer_names.py @@ -147,7 +147,7 @@ def action(args): w = csv.DictWriter(new_seq_info, ref_seq_info_reader.fieldnames) w.writeheader() - w.writerows(ref_seq_info.values()) + w.writerows(list(ref_seq_info.values())) new_seq_info.close() args.refpkg.start_transaction() diff --git a/deenurp/test/__main__.py b/deenurp/test/__main__.py index 3a6806a..ec55a73 100644 --- a/deenurp/test/__main__.py +++ b/deenurp/test/__main__.py @@ -9,13 +9,13 @@ suite = test.suite() outcome = suite.run(result) if outcome.wasSuccessful(): - print('ok: ' + str(outcome)) + print(('ok: ' + str(outcome))) else: - print('--> {} failures:'.format(len(outcome.failures))) + print(('--> {} failures:'.format(len(outcome.failures)))) for testcase, tb in outcome.failures: msg = str(testcase) - print('=' * len(msg)) - print(msg + '\n') - print(tb.strip()) - print('=' * len(msg)) + print(('=' * len(msg))) + print((msg + '\n')) + print((tb.strip())) + print(('=' * len(msg))) sys.exit(1) diff --git a/deenurp/test/test_outliers.py b/deenurp/test/test_outliers.py index efbed44..ab8efc6 100644 --- a/deenurp/test/test_outliers.py +++ b/deenurp/test/test_outliers.py @@ -8,7 +8,7 @@ try: import numpy as np import pandas as pd -except ImportError, err: +except ImportError as err: # prefer errors within tests over failure at the time the test # suites are assembled print(err) @@ -133,7 +133,7 @@ def test_mds_02(self): try: wrap.require_executable(wrap.VSEARCH) -except MissingDependencyError, e: +except MissingDependencyError as e: vsearch_available = False else: vsearch_available = True diff --git a/deenurp/test/test_search.py b/deenurp/test/test_search.py index a1472fe..5d6993c 100644 --- a/deenurp/test/test_search.py +++ b/deenurp/test/test_search.py @@ -1,6 +1,6 @@ import collections import os.path -from cStringIO import StringIO +from io import StringIO import unittest from deenurp import search @@ -58,5 +58,5 @@ def test_basic(self): expected = [ ('seq1', [TestHit('seq1', 't2', 99.9)]), ('seq2', [TestHit('seq2', 't6', 98.4)])] - self.assertItemsEqual(expected, r) + self.assertCountEqual(expected, r) diff --git a/deenurp/test/test_subcommand_extract_genbank.py b/deenurp/test/test_subcommand_extract_genbank.py index cac7c1c..a6e7db8 100644 --- a/deenurp/test/test_subcommand_extract_genbank.py +++ b/deenurp/test/test_subcommand_extract_genbank.py @@ -7,7 +7,7 @@ import os import unittest -from cStringIO import StringIO +from io import StringIO from deenurp.subcommands import ncbi_extract_genbank diff --git a/deenurp/test/test_subcommand_filter_outliers.py b/deenurp/test/test_subcommand_filter_outliers.py index a4106d7..93cd15f 100644 --- a/deenurp/test/test_subcommand_filter_outliers.py +++ b/deenurp/test/test_subcommand_filter_outliers.py @@ -1,5 +1,6 @@ import unittest +import pandas as pd from Bio import SeqIO from deenurp import wrap @@ -22,6 +23,14 @@ def test_parse_usearch_allpairs(self): distmat = filter_outliers.parse_usearch_allpairs(filename, seqnames) self.assertEqual(len(seqnames), distmat.shape[0]) + # confirm pairwise comparisons in the file + tab = pd.read_table(filename, header=None, names=filter_outliers.BLAST6NAMES) + for __, row in tab.iterrows(): + dist = distmat[seqnames.index(row['query']), 
seqnames.index(row['target'])] + self.assertAlmostEqual(dist, 1 - (row['pct_id'] / 100.0)) + + @unittest.skipUnless(which(wrap.VSEARCH), "{} not found.".format(wrap.VSEARCH)) def test_distmat_pairwise_vsearch(self): infile = util.data_path('e_faecalis.head.fasta') diff --git a/deenurp/test/test_util.py b/deenurp/test/test_util.py index 3cbbc2e..25a7295 100644 --- a/deenurp/test/test_util.py +++ b/deenurp/test/test_util.py @@ -15,7 +15,7 @@ def test_nokey(self): def test_key(self): keys = ('n', 's') v = [(1, 'test'), (2, 'test'), (2, 'other')] - l = [dict(zip(keys, i)) for i in v] + l = [dict(list(zip(keys, i))) for i in v] expected1 = [{'n': 1, 's': 'test'}, {'n': 2, 's': 'test'}] actual1 = util.unique(l, key=operator.itemgetter('n')) diff --git a/deenurp/test/test_wrap.py b/deenurp/test/test_wrap.py index fe019ee..3f79064 100644 --- a/deenurp/test/test_wrap.py +++ b/deenurp/test/test_wrap.py @@ -45,14 +45,15 @@ def test_as_refpkg(self): self.assertTrue(os.path.isdir(refpkg.path)) if which('rppr'): - out = subprocess.check_output(['rppr', 'check', '-c', refpkg.path]) - self.assertTrue('OK!' in out, out) + job = subprocess.run(['rppr', 'check', '-c', refpkg.path], + capture_output=True, text=True) + self.assertTrue('OK!' in job.stdout) @unittest.skipUnless(which('rppr'), "rppr not found") class RpprMinAdclTreeTestCase(unittest.TestCase): def setUp(self): - self.tf = tempfile.NamedTemporaryFile(prefix='adcl', suffix='.tre') + self.tf = tempfile.NamedTemporaryFile('w+', prefix='adcl', suffix='.tre') self.tf.write("((C000721552:0.20692,C002038857:0.00015)0.844:0.01031,C002038856:0.00014,((C002963332:0.08558,(C001550734:0.06763,((C000004779:0.03889,C002963310:0.04622)0.633:0.00151,(C002963318:0.00014,C002963266:0.00014)0.697:0.00016)0.992:0.15253)0.889:0.07332)0.924:0.07668,C002038858:0.01032)0.907:0.00014);\n") self.tf.flush() @@ -70,7 +71,7 @@ def test_min_adcl_prune7(self): try: wrap.require_executable(wrap.VSEARCH) -except MissingDependencyError, e: +except MissingDependencyError as e: vsearch_available = False else: vsearch_available = True @@ -89,7 +90,7 @@ def test_vsearch_version_fail(self): wrap._require_vsearch_version, version='5.0') def test_vsearch_allpairs_files(self): - with deenurp.util.ntf(suffix='.blast6out', mode='rw') as outfile: + with deenurp.util.ntf(suffix='.blast6out', mode='w+') as outfile: wrap.vsearch_allpairs_files(self.sequencefile, outfile.name) self.assertTrue(os.path.exists(outfile.name)) outfile.flush() diff --git a/deenurp/uclust.py b/deenurp/uclust.py index e33a3dd..a610aa1 100644 --- a/deenurp/uclust.py +++ b/deenurp/uclust.py @@ -28,8 +28,9 @@ DEFAULT_PCT_ID = 0.99 # For parsing .uc format -UCLUST_HEADERS = ['type', 'cluster_number', 'size', 'pct_id', 'strand', - 'query_start', 'seed_start', 'alignment', 'query_label', 'target_label'] +UCLUST_HEADERS = ['type', 'cluster_number', 'size', 'pct_id', + 'strand', 'query_start', 'seed_start', 'alignment', + 'query_label', 'target_label'] UCLUST_TYPES = {'cluster_number': int, 'pct_id': float, 'query_start': int, 'seed_start': int, 'size': int} @@ -44,10 +45,11 @@ def _handle(s, *args, **kwargs): If s is a string, opens s and yields the open file. - Otherwise, has no effect. + Otherwise, raises ValueError: pass a file name, not an open handle. """ - if isinstance(s, basestring): + if isinstance(s, str): with open(s, *args, **kwargs) as fp: yield fp else: - yield s + raise ValueError('try passing in a string instead') @@ -65,7 +67,7 @@ def _check_call(cmd, **kwargs): Log and run command. 
Additional arguments are passed to ``subprocess.check_call`` """ - cmd = map(str, cmd) + cmd = list(map(str, cmd)) logging.debug(' '.join(cmd)) subprocess.check_call(cmd, **kwargs) @@ -88,8 +90,9 @@ """ Parse the results of running UCLUST, returning UClustRecords. - ucout_fp can be file name or file handle. + ucout_fp must be a file name. """ + with _handle(ucout_fp) as fp: # Skip comments rows = (i.rstrip() for i in fp if not i.startswith('#')) @@ -101,10 +104,12 @@ def parse_uclust_as_df(ucout_fp): dtype = {'type': str, 'query_label': str, 'target_label': str, 'alignment': str} - df = pd.read_csv(ucout_fp, sep='\t', na_values='*', names=UCLUST_HEADERS, dtype=dtype) + df = pd.read_csv( + ucout_fp, sep='\t', na_values='*', names=UCLUST_HEADERS, dtype=dtype) # define target_label as query_label for seed sequences - df['target_label'] = np.where(df['type'] == 'S', df['query_label'], df['target_label']) + df['target_label'] = np.where( + df['type'] == 'S', df['query_label'], df['target_label']) return df @@ -303,6 +308,6 @@ def guppy_redup_from_uclust(uclust_records, sample_map=None): clusters[number][sample].count += 1 rows = [(seeds[num], dedup_seq.id, dedup_seq.count) - for num, samples in clusters.items() - for dedup_seq in samples.values()] + for num, samples in list(clusters.items()) + for dedup_seq in list(samples.values())] return rows diff --git a/deenurp/util.py b/deenurp/util.py index f760208..a90355e 100644 --- a/deenurp/util.py +++ b/deenurp/util.py @@ -23,7 +23,7 @@ def apply_df_status(func, df, msg=''): """ tmp_column = 'index_number' row_count = float(len(df)) - df[tmp_column] = xrange(int(row_count)) + df[tmp_column] = range(int(row_count)) msg += ' {:.0%}\r' def apply_func(item, msg): @@ -120,13 +120,13 @@ @contextlib.contextmanager -def ntf(**kwargs): +def ntf(*args, **kwargs): """ Near-clone of tempfile.NamedTemporaryFile, but the file is deleted when the context manager exits, rather than when it's closed. 
""" kwargs['delete'] = False - tf = tempfile.NamedTemporaryFile(**kwargs) + tf = tempfile.NamedTemporaryFile(*args, **kwargs) try: with tf: yield tf @@ -135,17 +135,16 @@ def ntf(**kwargs): @contextlib.contextmanager -def tempcopy(path, **kwargs): - """ - Create a temporary copy of ``path``, available for the duration of the - context manager +def tempcopy(path): + """Create a temporary copy of ``path``, available for the duration of + the context manager + """ + prefix, suffix = os.path.splitext(os.path.basename(path)) - a = {'prefix': prefix, 'suffix': suffix} - a.update(kwargs) - with open(path) as fp, ntf(**a) as tf: - shutil.copyfileobj(fp, tf) + with ntf(prefix=prefix, suffix=suffix) as tf: tf.close() + shutil.copyfile(path, tf.name) yield tf.name @@ -179,7 +178,7 @@ def as_fasta(sequences, **kwargs): """ if 'suffix' not in kwargs: kwargs['suffix'] = '.fasta' - with ntf(**kwargs) as tf: + with ntf('w+', **kwargs) as tf: SeqIO.write(sequences, tf, 'fasta') tf.flush() tf.close() diff --git a/deenurp/wrap.py b/deenurp/wrap.py index 6082c35..57226ee 100644 --- a/deenurp/wrap.py +++ b/deenurp/wrap.py @@ -10,7 +10,7 @@ import subprocess import re from distutils.version import LooseVersion -from cStringIO import StringIO +from io import StringIO import pandas as pd @@ -52,10 +52,9 @@ def as_refpkg(sequences, name='temp.refpkg', threads=FASTTREE_THREADS): tempdir(prefix='refpkg') as refpkg_dir: log_fp.close() - - fasttree(sequences, log_path=log_fp.name, output_fp=tree_fp, gtr=True, - threads=threads) tree_fp.close() + fasttree(sequences, log_path=log_fp.name, output_fp=tree_fp.name, + gtr=True, threads=threads) rp = Refpkg(refpkg_dir(name), create=True) rp.update_metadata('locus', '') @@ -63,11 +62,11 @@ def as_refpkg(sequences, name='temp.refpkg', threads=FASTTREE_THREADS): rp.update_file('tree', tree_fp.name) # FASTA and Stockholm alignment - with ntf(suffix='.fasta') as f: + with ntf('w', suffix='.fasta') as f: SeqIO.write(sequences, f, 'fasta') f.close() rp.update_file('aln_fasta', f.name) - with ntf(suffix='.sto') as f: + with ntf('w', suffix='.sto') as f: SeqIO.write(sequences, f, 'stockholm') f.close() rp.update_file('aln_sto', f.name) @@ -76,8 +75,8 @@ def as_refpkg(sequences, name='temp.refpkg', threads=FASTTREE_THREADS): @contextlib.contextmanager -def redupfile_of_seqs(sequences, **kwargs): - with ntf(**kwargs) as tf: +def redupfile_of_seqs(sequences): + with ntf('w') as tf: writer = csv.writer(tf, lineterminator='\n') rows = ((s.id, s.id, s.annotations.get('weight', 1.0)) for s in sequences) writer.writerows(rows) @@ -89,9 +88,10 @@ def redupfile_of_seqs(sequences, **kwargs): def fasttree(sequences, output_fp, log_path=None, quiet=True, gtr=False, gamma=False, threads=FASTTREE_THREADS, prefix=None): - if len(sequences) < 3: + nseqs = len(sequences) + if nseqs < 3: raise ValueError( - 'at least 3 sequences are required but {} were provided'.format(len(sequences))) + f'at least 3 sequences are required but {nseqs} were provided') executable = 'FastTreeMP' if threads and threads > 1 else 'FastTree' if executable == 'FastTreeMP' and not which('FastTreeMP'): @@ -102,27 +102,25 @@ def fasttree(sequences, output_fp, log_path=None, quiet=True, env = os.environ.copy() if threads: env['OMP_NUM_THREADS'] = str(threads) - cmd = (prefix or []) + [executable, '-nt'] - for k, v in (('-gtr', gtr), ('-gamma', gamma), ('-quiet', quiet)): - if v: - cmd.append(k) - if log_path is not None: - cmd.extend(['-log', log_path]) - logging.debug(' '.join(cmd)) + with ntf('w', suffix='.fasta') as fasta: + 
assert SeqIO.write(sequences, fasta, 'fasta') + fasta.flush() - with ntf() as stderr: - p = subprocess.Popen(cmd, stdout=output_fp, stdin=subprocess.PIPE, - stderr=stderr, env=env) + cmd = (prefix or []) + [executable] + opts = [('-gtr', gtr), ('-gamma', gamma), ('-quiet', quiet)] + cmd.extend([k for k, v in opts if v]) - count = SeqIO.write(sequences, p.stdin, 'fasta') - assert count - p.stdin.close() - p.wait() - if not p.returncode == 0: - stderr.seek(0) - logging.error(stderr.read()) - raise subprocess.CalledProcessError(p.returncode, cmd) + if log_path: + cmd.extend(['-log', log_path]) + + cmd.extend(['-out', output_fp, '-nt', fasta.name]) + logging.debug(' '.join(cmd)) + + job = subprocess.run(cmd, capture_output=True, text=True, env=env) + if job.returncode != 0: + logging.error(job.stderr) + raise subprocess.CalledProcessError(job.returncode, cmd) def guppy_redup(placefile, redup_file, output): @@ -175,8 +173,8 @@ def rppr_min_adcl(jplace, leaves, algorithm='pam', posterior_prob=False, if always_include: cmd.extend(('--always-include', always_include)) logging.debug(' '.join(cmd)) - output = subprocess.check_output(cmd) - return output.splitlines() + job = subprocess.run(cmd, capture_output=True, text=True, check=True) + return job.stdout.strip().splitlines() def rppr_min_adcl_tree(newick_file, leaves, algorithm='pam', always_include=None): @@ -200,9 +198,8 @@ def _require_cmalign_11(cmalign='cmalign'): """ Check for cmalign version 1.1, raising an error if not found """ version_str = 'INFERNAL 1.1' cmd = [cmalign, '-h'] - o = subprocess.check_output(cmd) - if version_str not in o: + o = subprocess.run(cmd, capture_output=True, text=True) + if version_str not in o.stdout: msg = ('cmalign 1.1 not found. ' 'Expected {0} in output of "{1}", got:\n{2}').format( - version_str, ' '.join(cmd), o) + version_str, ' '.join(cmd), o.stdout) @@ -213,56 +210,64 @@ def cmalign_scores(text): """ Parse stdout of cmalign into a data.frame """ + dtypes = { "idx": int, "seq_name": str, "length": int, "cm_from": int, "cm_to": int, "trunc": str, "bit_sc": float, "avg_pp": str, "band_calc": float, "alignment": float, "total": float, "mem": float } + return pd.read_csv( StringIO(text), comment="#", delim_whitespace=True, dtype=dtypes, index_col='seq_name', names=list(dtypes.keys()) ) - header_rexp = re.compile(r'^#\s+idx') - lines = [] - for line in text.splitlines(): - if header_rexp.search(line): - line = ' ' + line[1:].replace(' (Mb)', '') - # replace single spaces - line = re.sub(r'(?= `version` """ - cmd = [vsearch, '--version'] - p = subprocess.Popen( - cmd, - stderr=subprocess.PIPE, - stdout=open(os.devnull, 'w')) - __, stderr = p.communicate() - vsearch = re.search(r'^vsearch v(?P<vstr>\d+\.\d+\.[^_]+)', stderr) + output = subprocess.run([vsearch, '--version'], capture_output=True, text=True) + vsearch = re.search(r'^vsearch v(?P<vstr>\d+\.\d+\.[^_]+)', output.stderr) ver = vsearch.groupdict()['vstr'] if LooseVersion(ver) < LooseVersion(version): @@ -305,32 +305,31 @@ def vsearch_allpairs_files(input_file, output_file, executable=VSEARCH, '--blast6out', output_file] logging.info(' '.join(cmd)) - p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - logging.debug(p.stdout.read().strip()) - error = p.stderr.read().strip() - if p.wait() != 0: + job = subprocess.run(cmd, capture_output=True, text=True) + logging.debug(job.stdout) + + if job.returncode != 0: # TODO: preserve output files (input_file, output_file) - raise subprocess.CalledProcessError(p.returncode, error) + raise 
subprocess.CalledProcessError(job.returncode, job.stderr) def muscle_files(input_file, output_file, maxiters=MUSCLE_MAXITERS): - cmd = ['muscle'] + cmd = [ + 'muscle', + '-in', input_file, + '-out', output_file, + # TODO: set value based on number of sequences? + '-maxiters', str(maxiters), + ] + logging.debug(' '.join(cmd)) require_executable(cmd[0]) - cmd.extend(['-in', input_file]) - cmd.extend(['-out', output_file]) - - # TODO: set value based on number of sequences? - cmd.extend(['-maxiters', str(maxiters)]) + job = subprocess.run(cmd, capture_output=True, text=True) + logging.debug(job.stdout) - logging.debug(' '.join(cmd)) - p = subprocess.Popen(cmd, - stdout=subprocess.PIPE, stderr=subprocess.PIPE) - logging.debug(p.stdout.read().strip()) - error = p.stderr.read().strip() - if p.wait() != 0: + if job.returncode != 0: # TODO: preserve output files (input_file, output_file) - raise subprocess.CalledProcessError(p.returncode, error) + raise subprocess.CalledProcessError(job.returncode, job.stderr) def read_seq_file(sequence_file): @@ -358,7 +357,7 @@ def esl_sfetch(sequence_file, name_iter, output_fp, fa_idx): """ Fetch sequences named in name_iter from sequence_file, indexing if - necessary, writing to output_fp. + necessary, writing binary data to open file object output_fp. """ count = 0 with open(sequence_file, 'rb') as fi: diff --git a/distribute_setup.py b/distribute_setup.py deleted file mode 100644 index a1cc2a1..0000000 --- a/distribute_setup.py +++ /dev/null @@ -1,546 +0,0 @@ -#!python -"""Bootstrap distribute installation - -If you want to use setuptools in your package's setup.py, just include this -file in the same directory with it, and add this to the top of your setup.py:: - - from distribute_setup import use_setuptools - use_setuptools() - -If you want to require a specific version of setuptools, set a download -mirror, or use an alternate download directory, you can do so by supplying -the appropriate options to ``use_setuptools()``. - -This file can also be run as a script to install or upgrade setuptools. 
-""" -import os -import shutil -import sys -import time -import fnmatch -import tempfile -import tarfile -import optparse - -from distutils import log - -try: - from site import USER_SITE -except ImportError: - USER_SITE = None - -try: - import subprocess - - def _python_cmd(*args): - args = (sys.executable,) + args - return subprocess.call(args) == 0 - -except ImportError: - # will be used for python 2.3 - def _python_cmd(*args): - args = (sys.executable,) + args - # quoting arguments if windows - if sys.platform == 'win32': - def quote(arg): - if ' ' in arg: - return '"%s"' % arg - return arg - args = [quote(arg) for arg in args] - return os.spawnl(os.P_WAIT, sys.executable, *args) == 0 - -DEFAULT_VERSION = "0.6.34" -DEFAULT_URL = "http://pypi.python.org/packages/source/d/distribute/" -SETUPTOOLS_FAKED_VERSION = "0.6c11" - -SETUPTOOLS_PKG_INFO = """\ -Metadata-Version: 1.0 -Name: setuptools -Version: %s -Summary: xxxx -Home-page: xxx -Author: xxx -Author-email: xxx -License: xxx -Description: xxx -""" % SETUPTOOLS_FAKED_VERSION - - -def _install(tarball, install_args=()): - # extracting the tarball - tmpdir = tempfile.mkdtemp() - log.warn('Extracting in %s', tmpdir) - old_wd = os.getcwd() - try: - os.chdir(tmpdir) - tar = tarfile.open(tarball) - _extractall(tar) - tar.close() - - # going in the directory - subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0]) - os.chdir(subdir) - log.warn('Now working in %s', subdir) - - # installing - log.warn('Installing Distribute') - if not _python_cmd('setup.py', 'install', *install_args): - log.warn('Something went wrong during the installation.') - log.warn('See the error message above.') - # exitcode will be 2 - return 2 - finally: - os.chdir(old_wd) - shutil.rmtree(tmpdir) - - -def _build_egg(egg, tarball, to_dir): - # extracting the tarball - tmpdir = tempfile.mkdtemp() - log.warn('Extracting in %s', tmpdir) - old_wd = os.getcwd() - try: - os.chdir(tmpdir) - tar = tarfile.open(tarball) - _extractall(tar) - tar.close() - - # going in the directory - subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0]) - os.chdir(subdir) - log.warn('Now working in %s', subdir) - - # building an egg - log.warn('Building a Distribute egg in %s', to_dir) - _python_cmd('setup.py', '-q', 'bdist_egg', '--dist-dir', to_dir) - - finally: - os.chdir(old_wd) - shutil.rmtree(tmpdir) - # returning the result - log.warn(egg) - if not os.path.exists(egg): - raise IOError('Could not build the egg.') - - -def _do_download(version, download_base, to_dir, download_delay): - egg = os.path.join(to_dir, 'distribute-%s-py%d.%d.egg' - % (version, sys.version_info[0], sys.version_info[1])) - if not os.path.exists(egg): - tarball = download_setuptools(version, download_base, - to_dir, download_delay) - _build_egg(egg, tarball, to_dir) - sys.path.insert(0, egg) - import setuptools - setuptools.bootstrap_install_from = egg - - -def use_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL, - to_dir=os.curdir, download_delay=15, no_fake=True): - # making sure we use the absolute path - to_dir = os.path.abspath(to_dir) - was_imported = 'pkg_resources' in sys.modules or \ - 'setuptools' in sys.modules - try: - try: - import pkg_resources - if not hasattr(pkg_resources, '_distribute'): - if not no_fake: - _fake_setuptools() - raise ImportError - except ImportError: - return _do_download(version, download_base, to_dir, download_delay) - try: - pkg_resources.require("distribute>=" + version) - return - except pkg_resources.VersionConflict: - e = sys.exc_info()[1] - if was_imported: - 
sys.stderr.write( - "The required version of distribute (>=%s) is not available,\n" - "and can't be installed while this script is running. Please\n" - "install a more recent version first, using\n" - "'easy_install -U distribute'." - "\n\n(Currently using %r)\n" % (version, e.args[0])) - sys.exit(2) - else: - del pkg_resources, sys.modules['pkg_resources'] # reload ok - return _do_download(version, download_base, to_dir, - download_delay) - except pkg_resources.DistributionNotFound: - return _do_download(version, download_base, to_dir, - download_delay) - finally: - if not no_fake: - _create_fake_setuptools_pkg_info(to_dir) - - -def download_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL, - to_dir=os.curdir, delay=15): - """Download distribute from a specified location and return its filename - - `version` should be a valid distribute version number that is available - as an egg for download under the `download_base` URL (which should end - with a '/'). `to_dir` is the directory where the egg will be downloaded. - `delay` is the number of seconds to pause before an actual download - attempt. - """ - # making sure we use the absolute path - to_dir = os.path.abspath(to_dir) - try: - from urllib.request import urlopen - except ImportError: - from urllib2 import urlopen - tgz_name = "distribute-%s.tar.gz" % version - url = download_base + tgz_name - saveto = os.path.join(to_dir, tgz_name) - src = dst = None - if not os.path.exists(saveto): # Avoid repeated downloads - try: - log.warn("Downloading %s", url) - src = urlopen(url) - # Read/write all in one block, so we don't create a corrupt file - # if the download is interrupted. - data = src.read() - dst = open(saveto, "wb") - dst.write(data) - finally: - if src: - src.close() - if dst: - dst.close() - return os.path.realpath(saveto) - - -def _no_sandbox(function): - def __no_sandbox(*args, **kw): - try: - from setuptools.sandbox import DirectorySandbox - if not hasattr(DirectorySandbox, '_old'): - def violation(*args): - pass - DirectorySandbox._old = DirectorySandbox._violation - DirectorySandbox._violation = violation - patched = True - else: - patched = False - except ImportError: - patched = False - - try: - return function(*args, **kw) - finally: - if patched: - DirectorySandbox._violation = DirectorySandbox._old - del DirectorySandbox._old - - return __no_sandbox - - -def _patch_file(path, content): - """Will backup the file then patch it""" - f = open(path) - existing_content = f.read() - f.close() - if existing_content == content: - # already patched - log.warn('Already patched.') - return False - log.warn('Patching...') - _rename_path(path) - f = open(path, 'w') - try: - f.write(content) - finally: - f.close() - return True - -_patch_file = _no_sandbox(_patch_file) - - -def _same_content(path, content): - f = open(path) - existing_content = f.read() - f.close() - return existing_content == content - - -def _rename_path(path): - new_name = path + '.OLD.%s' % time.time() - log.warn('Renaming %s to %s', path, new_name) - os.rename(path, new_name) - return new_name - - -def _remove_flat_installation(placeholder): - if not os.path.isdir(placeholder): - log.warn('Unkown installation at %s', placeholder) - return False - found = False - for file in os.listdir(placeholder): - if fnmatch.fnmatch(file, 'setuptools*.egg-info'): - found = True - break - if not found: - log.warn('Could not locate setuptools*.egg-info') - return - - log.warn('Moving elements out of the way...') - pkg_info = os.path.join(placeholder, file) - if 
os.path.isdir(pkg_info): - patched = _patch_egg_dir(pkg_info) - else: - patched = _patch_file(pkg_info, SETUPTOOLS_PKG_INFO) - - if not patched: - log.warn('%s already patched.', pkg_info) - return False - # now let's move the files out of the way - for element in ('setuptools', 'pkg_resources.py', 'site.py'): - element = os.path.join(placeholder, element) - if os.path.exists(element): - _rename_path(element) - else: - log.warn('Could not find the %s element of the ' - 'Setuptools distribution', element) - return True - -_remove_flat_installation = _no_sandbox(_remove_flat_installation) - - -def _after_install(dist): - log.warn('After install bootstrap.') - placeholder = dist.get_command_obj('install').install_purelib - _create_fake_setuptools_pkg_info(placeholder) - - -def _create_fake_setuptools_pkg_info(placeholder): - if not placeholder or not os.path.exists(placeholder): - log.warn('Could not find the install location') - return - pyver = '%s.%s' % (sys.version_info[0], sys.version_info[1]) - setuptools_file = 'setuptools-%s-py%s.egg-info' % \ - (SETUPTOOLS_FAKED_VERSION, pyver) - pkg_info = os.path.join(placeholder, setuptools_file) - if os.path.exists(pkg_info): - log.warn('%s already exists', pkg_info) - return - - log.warn('Creating %s', pkg_info) - try: - f = open(pkg_info, 'w') - except EnvironmentError: - log.warn("Don't have permissions to write %s, skipping", pkg_info) - return - try: - f.write(SETUPTOOLS_PKG_INFO) - finally: - f.close() - - pth_file = os.path.join(placeholder, 'setuptools.pth') - log.warn('Creating %s', pth_file) - f = open(pth_file, 'w') - try: - f.write(os.path.join(os.curdir, setuptools_file)) - finally: - f.close() - -_create_fake_setuptools_pkg_info = _no_sandbox( - _create_fake_setuptools_pkg_info -) - - -def _patch_egg_dir(path): - # let's check if it's already patched - pkg_info = os.path.join(path, 'EGG-INFO', 'PKG-INFO') - if os.path.exists(pkg_info): - if _same_content(pkg_info, SETUPTOOLS_PKG_INFO): - log.warn('%s already patched.', pkg_info) - return False - _rename_path(path) - os.mkdir(path) - os.mkdir(os.path.join(path, 'EGG-INFO')) - pkg_info = os.path.join(path, 'EGG-INFO', 'PKG-INFO') - f = open(pkg_info, 'w') - try: - f.write(SETUPTOOLS_PKG_INFO) - finally: - f.close() - return True - -_patch_egg_dir = _no_sandbox(_patch_egg_dir) - - -def _before_install(): - log.warn('Before install bootstrap.') - _fake_setuptools() - - -def _under_prefix(location): - if 'install' not in sys.argv: - return True - args = sys.argv[sys.argv.index('install') + 1:] - for index, arg in enumerate(args): - for option in ('--root', '--prefix'): - if arg.startswith('%s=' % option): - top_dir = arg.split('root=')[-1] - return location.startswith(top_dir) - elif arg == option: - if len(args) > index: - top_dir = args[index + 1] - return location.startswith(top_dir) - if arg == '--user' and USER_SITE is not None: - return location.startswith(USER_SITE) - return True - - -def _fake_setuptools(): - log.warn('Scanning installed packages') - try: - import pkg_resources - except ImportError: - # we're cool - log.warn('Setuptools or Distribute does not seem to be installed.') - return - ws = pkg_resources.working_set - try: - setuptools_dist = ws.find( - pkg_resources.Requirement.parse('setuptools', replacement=False) - ) - except TypeError: - # old distribute API - setuptools_dist = ws.find( - pkg_resources.Requirement.parse('setuptools') - ) - - if setuptools_dist is None: - log.warn('No setuptools distribution found') - return - # detecting if it was already faked - 
setuptools_location = setuptools_dist.location - log.warn('Setuptools installation detected at %s', setuptools_location) - - # if --root or --preix was provided, and if - # setuptools is not located in them, we don't patch it - if not _under_prefix(setuptools_location): - log.warn('Not patching, --root or --prefix is installing Distribute' - ' in another location') - return - - # let's see if its an egg - if not setuptools_location.endswith('.egg'): - log.warn('Non-egg installation') - res = _remove_flat_installation(setuptools_location) - if not res: - return - else: - log.warn('Egg installation') - pkg_info = os.path.join(setuptools_location, 'EGG-INFO', 'PKG-INFO') - if (os.path.exists(pkg_info) and - _same_content(pkg_info, SETUPTOOLS_PKG_INFO)): - log.warn('Already patched.') - return - log.warn('Patching...') - # let's create a fake egg replacing setuptools one - res = _patch_egg_dir(setuptools_location) - if not res: - return - log.warn('Patching complete.') - _relaunch() - - -def _relaunch(): - log.warn('Relaunching...') - # we have to relaunch the process - # pip marker to avoid a relaunch bug - _cmd1 = ['-c', 'install', '--single-version-externally-managed'] - _cmd2 = ['-c', 'install', '--record'] - if sys.argv[:3] == _cmd1 or sys.argv[:3] == _cmd2: - sys.argv[0] = 'setup.py' - args = [sys.executable] + sys.argv - sys.exit(subprocess.call(args)) - - -def _extractall(self, path=".", members=None): - """Extract all members from the archive to the current working - directory and set owner, modification time and permissions on - directories afterwards. `path' specifies a different directory - to extract to. `members' is optional and must be a subset of the - list returned by getmembers(). - """ - import copy - import operator - from tarfile import ExtractError - directories = [] - - if members is None: - members = self - - for tarinfo in members: - if tarinfo.isdir(): - # Extract directories with a safe mode. - directories.append(tarinfo) - tarinfo = copy.copy(tarinfo) - tarinfo.mode = 448 # decimal for oct 0700 - self.extract(tarinfo, path) - - # Reverse sort directories. - if sys.version_info < (2, 4): - def sorter(dir1, dir2): - return cmp(dir1.name, dir2.name) - directories.sort(sorter) - directories.reverse() - else: - directories.sort(key=operator.attrgetter('name'), reverse=True) - - # Set correct owner, mtime and filemode on directories. 
- for tarinfo in directories: - dirpath = os.path.join(path, tarinfo.name) - try: - self.chown(tarinfo, dirpath) - self.utime(tarinfo, dirpath) - self.chmod(tarinfo, dirpath) - except ExtractError: - e = sys.exc_info()[1] - if self.errorlevel > 1: - raise - else: - self._dbg(1, "tarfile: %s" % e) - - -def _build_install_args(options): - """ - Build the arguments to 'python setup.py install' on the distribute package - """ - install_args = [] - if options.user_install: - if sys.version_info < (2, 6): - log.warn("--user requires Python 2.6 or later") - raise SystemExit(1) - install_args.append('--user') - return install_args - -def _parse_args(): - """ - Parse the command line for options - """ - parser = optparse.OptionParser() - parser.add_option( - '--user', dest='user_install', action='store_true', default=False, - help='install in user site package (requires Python 2.6 or later)') - parser.add_option( - '--download-base', dest='download_base', metavar="URL", - default=DEFAULT_URL, - help='alternative URL from where to download the distribute package') - options, args = parser.parse_args() - # positional arguments are ignored - return options - -def main(version=DEFAULT_VERSION): - """Install or upgrade setuptools and EasyInstall""" - options = _parse_args() - tarball = download_setuptools(download_base=options.download_base) - return _install(tarball, _build_install_args(options)) - -if __name__ == '__main__': - sys.exit(main()) diff --git a/requirements.in b/requirements.in deleted file mode 100644 index d078113..0000000 --- a/requirements.in +++ /dev/null @@ -1,10 +0,0 @@ -numpy -cython -pandas -scipy -scikit-learn -hdbscan -biopython -taxtastic -futures -seqmagick diff --git a/requirements.txt b/requirements.txt index 2ad65b0..512b913 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,32 +1,24 @@ -# this file is order-dependent! 
-
-# to add new dependencies:
-# - create a virtualenv
-# - install packages: pip install -r requirements.in
-# - install pipdeptree
-# - run `pipdeptree -f --nowarn | bin/pipdeptree2requirements.py` to generate a new requirements.txt
-# - manually replace any references to github repos
-# - correct any incompatibilities (eg, seqmagick running under python2 requires biopython <= 1.66
-
-MarkupSafe==1.0
-numpy==1.14.0
-setuptools==38.4.0
-six==1.11.0
-DendroPy==4.3.0
-Jinja2==2.10
-PyYAML==3.12
-SQLAlchemy==1.2.0
-biopython==1.66
-decorator==4.1.2
+biopython==1.79
+Cython==0.29.28
+decorator==5.1.1
+DendroPy==4.5.2
 fastalite==0.3
-psycopg2==2.7.3.2
-python-dateutil==2.6.1
-pytz==2017.3
-scikit-learn==0.19.1
-Cython==0.27.3
-futures==3.2.0
-hdbscan==0.8.11
-pandas==0.22.0
-scipy==1.0.0
-seqmagick==0.6.2
-taxtastic==0.8.5
+greenlet==1.1.2
+hdbscan==0.8.28
+Jinja2==3.0.3
+joblib==1.1.0
+MarkupSafe==2.1.0
+numpy==1.22.2
+pandas==1.4.1
+psycopg2-binary==2.9.3
+pygtrie==2.4.2
+python-dateutil==2.8.2
+pytz==2021.3
+PyYAML==6.0
+scikit-learn==1.0.2
+scipy==1.8.0
+seqmagick==0.8.4
+six==1.16.0
+SQLAlchemy==1.4.32
+taxtastic==0.9.2
+threadpoolctl==3.1.0
diff --git a/setup.py b/setup.py
index 22682be..7bbe388 100644
--- a/setup.py
+++ b/setup.py
@@ -1,27 +1,44 @@
 import os
 import sys
 import subprocess
+
 from setuptools import setup, find_packages, Command
 
-# Fix for `setup.py test`
-# See http://bugs.python.org/issue15881
-try:
-    import multiprocessing
-    from concurrent import futures
-except ImportError:
-    pass
+datadir = 'deenurp/data'
+version_file = f'{datadir}/version.txt'
 
 subprocess.call(
-    ('mkdir -p {data} && '
-     'git describe --tags --dirty > {data}/{file}.tmp '
-     '&& mv {data}/{file}.tmp {data}/{file} '
-     '|| rm -f {data}/{file}.tmp').format(data='deenurp/data', file='version.txt'),
+    (f'mkdir -p {datadir} && '
+     f'git describe --tags --dirty > {version_file}.tmp '
+     f'&& mv {version_file}.tmp {version_file} '
+     f'|| rm -f {version_file}.tmp'),
     shell=True, stderr=open(os.devnull, "w"))
 
 # import must follow 'git describe' command above to update version
 from deenurp import __version__
 
 
+class CheckVersion(Command):
+    description = 'Confirm that the stored package version is correct'
+    user_options = []
+
+    def initialize_options(self):
+        pass
+
+    def finalize_options(self):
+        pass
+
+    def run(self):
+        with open(version_file) as f:
+            stored_version = f.read().strip()
+
+        # text=True decodes the output so the comparison below is str vs str
+        git_version = subprocess.check_output(
+            ['git', 'describe', '--tags', '--dirty'], text=True).strip()
+
+        assert stored_version == git_version
+        print('the current version is', stored_version)
+
+
 class run_audit(Command):
     """Audits source code using PyFlakes for following issues:
@@ -41,7 +58,7 @@ def run(self):
         try:
             import pyflakes.scripts.pyflakes as flakes
         except ImportError:
-            print "Audit requires PyFlakes installed in your system."
+            print("Audit requires PyFlakes installed in your system.")
             sys.exit(-1)
 
         warns = 0
@@ -53,17 +70,30 @@ def run(self):
             if file != '__init__.py' and file.endswith('.py'):
                 warns += flakes.checkPath(os.path.join(root, file))
         if warns > 0:
-            print "Audit finished with total %d warnings." % warns
+            print("Audit finished with total %d warnings." % warns)
         else:
-            print "No problems found in sourcecode."
-
-
-setup(name='deenurp',
-      version=__version__,
-      package_data={'deenurp': ['data/*', 'test/data/*']},
-      entry_points={
-          'console_scripts': {'deenurp = deenurp:main'}},
-      cmdclass={'audit': run_audit},
-      test_suite='deenurp.test.suite',
-      packages=find_packages(exclude=['tests'])
-      )
+            print("No problems found in source code.")
+
+
+setup(
+    name='deenurp',
+    version=__version__,
+    package_data={'deenurp': ['data/*', 'test/data/*']},
+    entry_points={
+        'console_scripts': {'deenurp = deenurp:main'}},
+    cmdclass={'audit': run_audit, 'check_version': CheckVersion},
+    test_suite='deenurp.test.suite',
+    packages=find_packages(exclude=['tests']),
+    python_requires='>=3.8',
+    install_requires=[
+        'numpy',
+        'cython',
+        'pandas',
+        'scipy',
+        'scikit-learn',
+        'hdbscan',
+        'biopython',
+        'taxtastic',
+        'seqmagick',
+    ],
+)
diff --git a/tests/hrefpkg-build/run.sh b/tests/hrefpkg-build/run.sh
index 324385f..96d734b 100755
--- a/tests/hrefpkg-build/run.sh
+++ b/tests/hrefpkg-build/run.sh
@@ -6,4 +6,5 @@ BASE=../rdp_10_30_named1200bp_subset
 rm -rf hrefpkg
 mkdir hrefpkg
 DEENURP=${DEENURP-../../deenurp.py}
-$DEENURP hrefpkg_build --index-rank=family $BASE.fasta $BASE.seqinfo.csv $BASE.taxonomy.csv --output-dir hrefpkg
+$DEENURP hrefpkg_build --index-rank=family \
+    $BASE.fasta $BASE.seqinfo.csv $BASE.taxonomy.csv --output-dir hrefpkg
diff --git a/tests/run.sh b/tests/run.sh
index b88ee54..69d668e 100755
--- a/tests/run.sh
+++ b/tests/run.sh
@@ -10,6 +10,7 @@ while read subdir; do
    if echo $subdir | grep -qv -E '^#'; then
        echo $subdir
        (cd $TESTS_DIR/$subdir && ./run.sh)
+        # (cd $TESTS_DIR/$subdir && bash -v ./run.sh)
    fi
 done <
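
# Usage sketch (illustrative, appended for reference): the new `check_version`
# setup.py command compares the version stamp written to deenurp/data/version.txt
# by the `git describe` call at the top of setup.py against the live
# `git describe --tags --dirty` output, and fails on any mismatch.
# Assuming a tagged git checkout:
#
#     python3 setup.py check_version
#     # the current version is 0.3.0
#
# The printed tag ("0.3.0") is an example, not guaranteed output.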