From 1231e965e5a5c849e3242030742b6f6200b2de68 Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Thu, 1 Dec 2022 18:03:08 -0500 Subject: [PATCH] Remove pandas dependency (#273) * removing MemoryCache and pandas requirement * got rid of six dependency * got rid of two different requirements lists * added unit test for shell --- pyensembl/__init__.py | 2 - pyensembl/common.py | 1 - pyensembl/database.py | 1 - pyensembl/genome.py | 5 +- pyensembl/memory_cache.py | 136 ---------------------------- pyensembl/sequence_data.py | 7 +- pyensembl/version.py | 2 +- requirements.txt | 9 +- setup.py | 13 +-- test/common.py | 2 - test/test_download_cache.py | 2 - test/test_gene_objects.py | 2 - test/test_id_length.py | 2 - test/test_locus.py | 2 - test/test_memory_cache.py | 94 ------------------- test/test_missing_genome_sources.py | 2 - test/test_mouse.py | 2 - test/test_release_versions.py | 2 - test/test_search.py | 3 +- test/test_serialization.py | 3 - test/test_timings.py | 2 - test/test_transcript_objects.py | 2 - test/test_ucsc_gtf.py | 3 +- 23 files changed, 13 insertions(+), 286 deletions(-) delete mode 100644 pyensembl/memory_cache.py delete mode 100644 test/test_memory_cache.py diff --git a/pyensembl/__init__.py b/pyensembl/__init__.py index 606b85e..08d18fd 100644 --- a/pyensembl/__init__.py +++ b/pyensembl/__init__.py @@ -10,7 +10,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .memory_cache import MemoryCache from .database import Database from .download_cache import DownloadCache from .ensembl_release import EnsemblRelease, cached_release @@ -41,7 +40,6 @@ __all__ = [ "__version__", - "MemoryCache", "DownloadCache", "Database", "EnsemblRelease", diff --git a/pyensembl/common.py b/pyensembl/common.py index 610aa2c..532907a 100644 --- a/pyensembl/common.py +++ b/pyensembl/common.py @@ -14,7 +14,6 @@ from functools import wraps - def dump_pickle(obj, filepath): with open(filepath, "wb") as f: # use lower protocol for compatibility between Python 2 and Python 3 diff --git a/pyensembl/database.py b/pyensembl/database.py index 96a9443..04e7809 100644 --- a/pyensembl/database.py +++ b/pyensembl/database.py @@ -16,7 +16,6 @@ import datacache from typechecks import require_integer, require_string - from gtfparse import read_gtf, create_missing_features from .common import memoize diff --git a/pyensembl/genome.py b/pyensembl/genome.py index cb61483..7b7232c 100644 --- a/pyensembl/genome.py +++ b/pyensembl/genome.py @@ -21,7 +21,6 @@ from serializable import Serializable -from .memory_cache import MemoryCache from .download_cache import DownloadCache from .database import Database from .exon import Exon @@ -108,7 +107,6 @@ def __init__( copy_local_files_to_cache=self.copy_local_files_to_cache, install_string_function=self.install_string, cache_directory_path=cache_directory_path) - self.memory_cache = MemoryCache() self._init_lazy_fields() @property @@ -435,8 +433,7 @@ def __hash__(self): def clear_cache(self): """ - Clear any in-memory cached values and short-lived on-disk - materializations from MemoryCache + Clear any in-memory cached values """ for maybe_fn in self.__dict__.values(): # clear cache associated with all memoization decorators, diff --git a/pyensembl/memory_cache.py b/pyensembl/memory_cache.py deleted file mode 100644 index 60859c1..0000000 --- a/pyensembl/memory_cache.py +++ /dev/null @@ -1,136 +0,0 @@ -# Copyright (c) 2015. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Cache and serializing the results of expensive computations. Used in pyensembl -primarily to cache the heavy-weight parsing of GTF files and various -filtering operations on Ensembl entries. - -A piece of data is returned from one of three sources: -1) Cache cold. Run the user-supplied compute_fn. -2) Cache warm on disk. Parse or unpickle the serialized result into memory. -3) Cache warm in memory. Return cached object. -""" - -import logging -from os import remove, stat -from os.path import exists - -import pandas as pd - -from .common import load_pickle, dump_pickle - - -logger = logging.getLogger(__name__) - - -class MemoryCache(object): - """ - In-memory and on-disk caching of long-running queries and computations. - """ - def __init__(self): - self._memory_cache = {} - - def is_empty(self, filename): - return stat(filename).st_size == 0 - - def delete_file(self, path): - if exists(path): - logger.info("Deleting cached file %s", path) - remove(path) - - def remove_from_cache(self, key): - if key in self._memory_cache: - del self._memory_cache[key] - self.delete_file(key) - - def clear_cached_objects(self): - for key in self._memory_cache.keys(): - self.delete_file(key) - self._memory_cache.clear() - - def _read_csv(self, csv_path): - logger.info("Reading Dataframe from %s", csv_path) - df = pd.read_csv(csv_path) - if 'seqname' in df: - # by default, Pandas will infer the type as int, - # then switch to str when it hits non-numerical - # chromosomes. Make sure whole column has the same type - df['seqname'] = df['seqname'].map(str) - return df - - def _write_csv(self, df, csv_path, chunksize=10**5): - """ - Parameters - ---------- - df : pandas.DataFrame - - csv_path : str - - chunksize : int - Number of rows to write at a time. Helps to limit memory - consumption while writing a CSV. - """ - logger.info("Saving DataFrame to %s", csv_path) - df.to_csv(csv_path, index=False, chunksize=chunksize) - - def cached_dataframe(self, csv_path, compute_fn): - """ - If a CSV path is in the _memory_cache, then return that cached value. - - If we've already saved the DataFrame as a CSV then load it. - - Otherwise run the provided `compute_fn`, and store its result - in memory and and save it as a CSV. - """ - if not csv_path.endswith(".csv"): - raise ValueError("Invalid path '%s', must be a CSV file" % csv_path) - - if csv_path in self._memory_cache: - return self._memory_cache[csv_path] - - if exists(csv_path) and not self.is_empty(csv_path): - df = self._read_csv(csv_path) - else: - df = compute_fn() - if not isinstance(df, pd.DataFrame): - raise TypeError( - "Expected compute_fn to return DataFrame, got %s : %s" % ( - df, type(df))) - self._write_csv(df, csv_path) - self._memory_cache[csv_path] = df - return df - - def cached_object(self, path, compute_fn): - """ - If `cached_object` has already been called for a value of `path` in this - running Python instance, then it should have a cached value in the - _memory_cache; return that value. - - If this function was never called before with a particular value of - `path`, then call compute_fn, and pickle it to `path`. - - If `path` already exists, unpickle it and store that value in - _memory_cache. - """ - if path in self._memory_cache: - return self._memory_cache[path] - - if exists(path) and not self.is_empty(path): - obj = load_pickle(path) - else: - obj = compute_fn() - dump_pickle(obj, path) - self._memory_cache[path] = obj - return obj diff --git a/pyensembl/sequence_data.py b/pyensembl/sequence_data.py index 2e5a218..f72d379 100644 --- a/pyensembl/sequence_data.py +++ b/pyensembl/sequence_data.py @@ -14,10 +14,7 @@ from os.path import exists, abspath, split, join import logging from collections import Counter - -from six.moves import cPickle as pickle -from six import string_types - +import pickle from .common import (load_pickle, dump_pickle) from .fasta import parse_fasta_dictionary @@ -34,7 +31,7 @@ def __init__( fasta_paths, cache_directory_path=None): - if isinstance(fasta_paths, string_types): + if type(fasta_paths) is str: fasta_paths = [fasta_paths] self.fasta_paths = [abspath(path) for path in fasta_paths] diff --git a/pyensembl/version.py b/pyensembl/version.py index a33997d..04188a1 100644 --- a/pyensembl/version.py +++ b/pyensembl/version.py @@ -1 +1 @@ -__version__ = '2.1.0' +__version__ = '2.2.0' diff --git a/requirements.txt b/requirements.txt index 2b254da..70c87f5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,8 @@ typechecks>=0.0.2 -numpy>=1.7 -pandas>=0.15 datacache>=1.1.4 memoized-property>=1.0.2 -nose>=1.3.3 tinytimer -six>=1.9.0 +gtfparse>=1.3.0 +serializable +nose>=1.3.3 pylint>=1.4.4 -gtfparse>=1.1.0 -serializable \ No newline at end of file diff --git a/setup.py b/setup.py index 4626a50..aa2605e 100644 --- a/setup.py +++ b/setup.py @@ -41,6 +41,9 @@ raise RuntimeError('Cannot find version information') if __name__ == '__main__': + with open("requirements.txt") as f: + requirements = [l.strip() for l in f] + setup( name=package_name, version=version, @@ -63,15 +66,7 @@ 'Programming Language :: Python', 'Topic :: Scientific/Engineering :: Bio-Informatics', ], - install_requires=[ - "typechecks>=0.0.2", - "pandas>=0.15", - "datacache>=1.1.4", - "memoized-property>=1.0.2", - "gtfparse>=1.1.0", - "serializable", - "tinytimer", - ], + install_requires=requirements, long_description=readme_markdown, long_description_content_type='text/markdown', packages=[package_name], diff --git a/test/common.py b/test/common.py index c457505..9b20c3b 100644 --- a/test/common.py +++ b/test/common.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import - import functools from pyensembl import ( diff --git a/test/test_download_cache.py b/test/test_download_cache.py index 878e322..03c7da6 100644 --- a/test/test_download_cache.py +++ b/test/test_download_cache.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import - from nose.tools import assert_raises, ok_ from pyensembl.download_cache import ( DownloadCache, diff --git a/test/test_gene_objects.py b/test/test_gene_objects.py index 31bd759..2258f43 100644 --- a/test/test_gene_objects.py +++ b/test/test_gene_objects.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import - from nose.tools import eq_ from .common import test_ensembl_releases diff --git a/test/test_id_length.py b/test/test_id_length.py index f6ae8ef..7371cd4 100644 --- a/test/test_id_length.py +++ b/test/test_id_length.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import - from .common import major_releases from nose.tools import nottest diff --git a/test/test_locus.py b/test/test_locus.py index 26e4840..a1af6fd 100644 --- a/test/test_locus.py +++ b/test/test_locus.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import - from pyensembl.locus import Locus from pyensembl.normalization import normalize_chromosome diff --git a/test/test_memory_cache.py b/test/test_memory_cache.py deleted file mode 100644 index ef253f4..0000000 --- a/test/test_memory_cache.py +++ /dev/null @@ -1,94 +0,0 @@ -from __future__ import absolute_import - -import tempfile - -from pyensembl import MemoryCache - -import pandas as pd -from nose.tools import raises - -memory_cache = MemoryCache() - -class Counter(object): - """ - Use this class to count how many times a function gets called by - cached_object and cached_dataframe. - """ - def __init__(self): - self.count = 0 - - def increment(self): - self.count += 1 - return self.count - - def increment_dataframe(self): - value = self.increment() - return pd.DataFrame({'x': [value]}) - -def test_cached_object_with_tempfile(): - """ - test_cached_object_with_tempfile : A temporary file exists before - calling into compute_cache.cached_object but is empty, should be treated - as if result has never been computed before (rather than trying to load - the empty file). - """ - counter = Counter() - with tempfile.NamedTemporaryFile() as f: - # call repeatedly to test the hot and cold cache logic - result = memory_cache.cached_object( - f.name, compute_fn=counter.increment) - assert result == 1, "Expected result=1, got %s" % (result,) - assert counter.count == 1, \ - "Expected compute_fn to be called once, got %s" % (counter.count,) - - -def test_cached_dataframe_with_tempfile(): - """ - test_cached_dataframe_with_tempfile : A temporary file exists before - calling into compute_cache.cached_dataframe but is empty, - should be treated as if result has never been computed before - (rather than trying to load the empty file). - """ - counter = Counter() - with tempfile.NamedTemporaryFile(suffix='.csv') as f: - # call repeatedly to test hot and cold cache logic - for _ in range(2): - df = memory_cache.cached_dataframe( - f.name, compute_fn=counter.increment_dataframe) - # get counter value from inside of dataframe - result = df['x'].iloc[0] - assert result == 1, \ - "Expected result=1, got %s" % (result,) - assert counter.count == 1, \ - "Expected compute_fn to be called once, got %s" % ( - counter.count,) - -def test_cached_dataframe_returns_correct_type(): - def make_a_dataframe(): - return pd.DataFrame({'x': [0, 1, 2]}) - with tempfile.NamedTemporaryFile(suffix='.csv') as f: - # call repeatedly to test the cold and hot cache logic - for _ in range(2): - df = memory_cache.cached_dataframe( - f.name, compute_fn=make_a_dataframe) - assert isinstance(df, pd.DataFrame), \ - "Expected DataFrame, got %s : %s" % (df, type(df)) - -def test_cached_object_with_list_returns_correct_type(): - def make_a_list(): - return [1, 2, 3] - with tempfile.NamedTemporaryFile() as f: - # call repeatedly to test the cold and hot cache logic - for _ in range(2): - df = memory_cache.cached_object( - f.name, compute_fn=make_a_list) - assert isinstance(df, list), \ - "Expected list, got %s : %s" % (df, type(df)) - -@raises(Exception) -def test_dataframe_path_must_be_csv(): - # compute_cache should raise an exception when filename doesn't - # end with .csv extension - memory_cache.cached_dataframe( - csv_path="tempfile_not_csv", - compute_fn=lambda _: pd.DataFrame({'x': []})) diff --git a/test/test_missing_genome_sources.py b/test/test_missing_genome_sources.py index 6236e9b..35a4f41 100644 --- a/test/test_missing_genome_sources.py +++ b/test/test_missing_genome_sources.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import - from pyensembl import Genome from nose.tools import eq_, ok_, assert_raises diff --git a/test/test_mouse.py b/test/test_mouse.py index 88ee532..24a0b4a 100644 --- a/test/test_mouse.py +++ b/test/test_mouse.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import - from nose.tools import eq_, with_setup from .data import ( diff --git a/test/test_release_versions.py b/test/test_release_versions.py index c9847b7..4eca06f 100644 --- a/test/test_release_versions.py +++ b/test/test_release_versions.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import - from pyensembl import EnsemblRelease, MAX_ENSEMBL_RELEASE from nose.tools import raises diff --git a/test/test_search.py b/test/test_search.py index 135550c..b04688e 100644 --- a/test/test_search.py +++ b/test/test_search.py @@ -1,7 +1,6 @@ -from __future__ import absolute_import +from nose.tools import eq_ from pyensembl import find_nearest_locus -from nose.tools import eq_ from .common import test_ensembl_releases @test_ensembl_releases() diff --git a/test/test_serialization.py b/test/test_serialization.py index 5072272..d90b6b7 100644 --- a/test/test_serialization.py +++ b/test/test_serialization.py @@ -1,5 +1,3 @@ -# Copyright (c) 2016. Mount Sinai School of Medicine -# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,7 +10,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import absolute_import import pickle from nose.tools import eq_, with_setup from pyensembl import Genome, Transcript, Gene, Exon diff --git a/test/test_timings.py b/test/test_timings.py index 708de75..a948886 100644 --- a/test/test_timings.py +++ b/test/test_timings.py @@ -1,5 +1,3 @@ -from __future__ import print_function, absolute_import - from pyensembl import genome_for_reference_name from tinytimer import benchmark diff --git a/test/test_transcript_objects.py b/test/test_transcript_objects.py index 9e11e1c..79d08d5 100644 --- a/test/test_transcript_objects.py +++ b/test/test_transcript_objects.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import - from pyensembl import Locus, cached_release from nose.tools import eq_, assert_not_equal, assert_greater diff --git a/test/test_ucsc_gtf.py b/test/test_ucsc_gtf.py index 7a8ffd7..3e4a9e5 100644 --- a/test/test_ucsc_gtf.py +++ b/test/test_ucsc_gtf.py @@ -1,7 +1,6 @@ -from __future__ import absolute_import +from nose.tools import eq_ from pyensembl import Genome, Database -from nose.tools import eq_ from .common import TemporaryDirectory from .data import data_path