From 1231e965e5a5c849e3242030742b6f6200b2de68 Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Thu, 1 Dec 2022 18:03:08 -0500
Subject: [PATCH] Remove pandas dependency (#273)

* Remove MemoryCache and the pandas requirement

* Drop the six dependency

* Consolidate the two separate requirements lists: setup.py now reads install_requires from requirements.txt

* Add unit test for shell
---
 pyensembl/__init__.py               |   2 -
 pyensembl/common.py                 |   1 -
 pyensembl/database.py               |   1 -
 pyensembl/genome.py                 |   5 +-
 pyensembl/memory_cache.py           | 136 ----------------------------
 pyensembl/sequence_data.py          |   7 +-
 pyensembl/version.py                |   2 +-
 requirements.txt                    |   9 +-
 setup.py                            |  13 +--
 test/common.py                      |   2 -
 test/test_download_cache.py         |   2 -
 test/test_gene_objects.py           |   2 -
 test/test_id_length.py              |   2 -
 test/test_locus.py                  |   2 -
 test/test_memory_cache.py           |  94 -------------------
 test/test_missing_genome_sources.py |   2 -
 test/test_mouse.py                  |   2 -
 test/test_release_versions.py       |   2 -
 test/test_search.py                 |   3 +-
 test/test_serialization.py          |   3 -
 test/test_timings.py                |   2 -
 test/test_transcript_objects.py     |   2 -
 test/test_ucsc_gtf.py               |   3 +-
 23 files changed, 13 insertions(+), 286 deletions(-)
 delete mode 100644 pyensembl/memory_cache.py
 delete mode 100644 test/test_memory_cache.py

diff --git a/pyensembl/__init__.py b/pyensembl/__init__.py
index 606b85e..08d18fd 100644
--- a/pyensembl/__init__.py
+++ b/pyensembl/__init__.py
@@ -10,7 +10,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .memory_cache import MemoryCache
 from .database import Database
 from .download_cache import DownloadCache
 from .ensembl_release import EnsemblRelease, cached_release
@@ -41,7 +40,6 @@
 
 __all__ = [
     "__version__",
-    "MemoryCache",
     "DownloadCache",
     "Database",
     "EnsemblRelease",
diff --git a/pyensembl/common.py b/pyensembl/common.py
index 610aa2c..532907a 100644
--- a/pyensembl/common.py
+++ b/pyensembl/common.py
@@ -14,7 +14,6 @@
 
 from functools import wraps
 
-
 def dump_pickle(obj, filepath):
     with open(filepath, "wb") as f:
         # use lower protocol for compatibility between Python 2 and Python 3
diff --git a/pyensembl/database.py b/pyensembl/database.py
index 96a9443..04e7809 100644
--- a/pyensembl/database.py
+++ b/pyensembl/database.py
@@ -16,7 +16,6 @@
 
 import datacache
 from typechecks import require_integer, require_string
-
 from gtfparse import read_gtf, create_missing_features
 
 from .common import memoize
diff --git a/pyensembl/genome.py b/pyensembl/genome.py
index cb61483..7b7232c 100644
--- a/pyensembl/genome.py
+++ b/pyensembl/genome.py
@@ -21,7 +21,6 @@
 
 from serializable import Serializable
 
-from .memory_cache import MemoryCache
 from .download_cache import DownloadCache
 from .database import Database
 from .exon import Exon
@@ -108,7 +107,6 @@ def __init__(
             copy_local_files_to_cache=self.copy_local_files_to_cache,
             install_string_function=self.install_string,
             cache_directory_path=cache_directory_path)
-        self.memory_cache = MemoryCache()
         self._init_lazy_fields()
 
     @property
@@ -435,8 +433,7 @@ def __hash__(self):
 
     def clear_cache(self):
         """
-        Clear any in-memory cached values and short-lived on-disk
-        materializations from MemoryCache
+        Clear any in-memory cached values
         """
         for maybe_fn in self.__dict__.values():
             # clear cache associated with all memoization decorators,
diff --git a/pyensembl/memory_cache.py b/pyensembl/memory_cache.py
deleted file mode 100644
index 60859c1..0000000
--- a/pyensembl/memory_cache.py
+++ /dev/null
@@ -1,136 +0,0 @@
-# Copyright (c) 2015. Mount Sinai School of Medicine
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-Cache and serializing the results of expensive computations. Used in pyensembl
-primarily to cache the heavy-weight parsing of GTF files and various
-filtering operations on Ensembl entries.
-
-A piece of data is returned from one of three sources:
-1) Cache cold. Run the user-supplied compute_fn.
-2) Cache warm on disk. Parse or unpickle the serialized result into memory.
-3) Cache warm in memory. Return cached object.
-"""
-
-import logging
-from os import remove, stat
-from os.path import exists
-
-import pandas as pd
-
-from .common import load_pickle, dump_pickle
-
-
-logger = logging.getLogger(__name__)
-
-
-class MemoryCache(object):
-    """
-    In-memory and on-disk caching of long-running queries and computations.
-    """
-    def __init__(self):
-        self._memory_cache = {}
-
-    def is_empty(self, filename):
-        return stat(filename).st_size == 0
-
-    def delete_file(self, path):
-        if exists(path):
-            logger.info("Deleting cached file %s", path)
-            remove(path)
-
-    def remove_from_cache(self, key):
-        if key in self._memory_cache:
-            del self._memory_cache[key]
-        self.delete_file(key)
-
-    def clear_cached_objects(self):
-        for key in self._memory_cache.keys():
-            self.delete_file(key)
-        self._memory_cache.clear()
-
-    def _read_csv(self, csv_path):
-        logger.info("Reading Dataframe from %s", csv_path)
-        df = pd.read_csv(csv_path)
-        if 'seqname' in df:
-            # by default, Pandas will infer the type as int,
-            # then switch to str when it hits non-numerical
-            # chromosomes. Make sure whole column has the same type
-            df['seqname'] = df['seqname'].map(str)
-        return df
-
-    def _write_csv(self, df, csv_path, chunksize=10**5):
-        """
-        Parameters
-        ----------
-        df : pandas.DataFrame
-
-        csv_path : str
-
-        chunksize : int
-            Number of rows to write at a time. Helps to limit memory
-            consumption while writing a CSV.
-        """
-        logger.info("Saving DataFrame to %s", csv_path)
-        df.to_csv(csv_path, index=False, chunksize=chunksize)
-
-    def cached_dataframe(self, csv_path, compute_fn):
-        """
-        If a CSV path is in the _memory_cache, then return that cached value.
-
-        If we've already saved the DataFrame as a CSV then load it.
-
-        Otherwise run the provided `compute_fn`, and store its result
-        in memory and and save it as a CSV.
-        """
-        if not csv_path.endswith(".csv"):
-            raise ValueError("Invalid path '%s', must be a CSV file" % csv_path)
-
-        if csv_path in self._memory_cache:
-            return self._memory_cache[csv_path]
-
-        if exists(csv_path) and not self.is_empty(csv_path):
-            df = self._read_csv(csv_path)
-        else:
-            df = compute_fn()
-            if not isinstance(df, pd.DataFrame):
-                raise TypeError(
-                    "Expected compute_fn to return DataFrame, got %s : %s" % (
-                        df, type(df)))
-            self._write_csv(df, csv_path)
-        self._memory_cache[csv_path] = df
-        return df
-
-    def cached_object(self, path, compute_fn):
-        """
-        If `cached_object` has already been called for a value of `path` in this
-        running Python instance, then it should have a cached value in the
-         _memory_cache; return that value.
-
-        If this function was never called before with a particular value of
-        `path`, then call compute_fn, and pickle it to `path`.
-
-        If `path` already exists, unpickle it and store that value in
-        _memory_cache.
-        """
-        if path in self._memory_cache:
-            return self._memory_cache[path]
-
-        if exists(path) and not self.is_empty(path):
-            obj = load_pickle(path)
-        else:
-            obj = compute_fn()
-            dump_pickle(obj, path)
-        self._memory_cache[path] = obj
-        return obj
diff --git a/pyensembl/sequence_data.py b/pyensembl/sequence_data.py
index 2e5a218..f72d379 100644
--- a/pyensembl/sequence_data.py
+++ b/pyensembl/sequence_data.py
@@ -14,10 +14,7 @@
 from os.path import exists, abspath, split, join
 import logging
 from collections import Counter
-
-from six.moves import cPickle as pickle
-from six import string_types
-
+import pickle 
 from .common import (load_pickle, dump_pickle)
 from .fasta import parse_fasta_dictionary
 
@@ -34,7 +31,7 @@ def __init__(
             fasta_paths,
             cache_directory_path=None):
 
-        if isinstance(fasta_paths, string_types):
+        if type(fasta_paths) is str:
             fasta_paths = [fasta_paths]
 
         self.fasta_paths = [abspath(path) for path in fasta_paths]
diff --git a/pyensembl/version.py b/pyensembl/version.py
index a33997d..04188a1 100644
--- a/pyensembl/version.py
+++ b/pyensembl/version.py
@@ -1 +1 @@
-__version__ = '2.1.0'
+__version__ = '2.2.0'
diff --git a/requirements.txt b/requirements.txt
index 2b254da..70c87f5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,11 +1,8 @@
 typechecks>=0.0.2
-numpy>=1.7
-pandas>=0.15
 datacache>=1.1.4
 memoized-property>=1.0.2
-nose>=1.3.3
 tinytimer
-six>=1.9.0
+gtfparse>=1.3.0
+serializable
+nose>=1.3.3
 pylint>=1.4.4
-gtfparse>=1.1.0
-serializable
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 4626a50..aa2605e 100644
--- a/setup.py
+++ b/setup.py
@@ -41,6 +41,9 @@
     raise RuntimeError('Cannot find version information')
 
 if __name__ == '__main__':
+    with open("requirements.txt") as f:
+        requirements = [l.strip() for l in f]
+    
     setup(
         name=package_name,
         version=version,
@@ -63,15 +66,7 @@
             'Programming Language :: Python',
             'Topic :: Scientific/Engineering :: Bio-Informatics',
         ],
-        install_requires=[
-            "typechecks>=0.0.2",
-            "pandas>=0.15",
-            "datacache>=1.1.4",
-            "memoized-property>=1.0.2",
-            "gtfparse>=1.1.0",
-            "serializable",
-            "tinytimer",
-        ],
+        install_requires=requirements,
         long_description=readme_markdown,
         long_description_content_type='text/markdown',
         packages=[package_name],
diff --git a/test/common.py b/test/common.py
index c457505..9b20c3b 100644
--- a/test/common.py
+++ b/test/common.py
@@ -1,5 +1,3 @@
-from __future__ import absolute_import
-
 import functools
 
 from pyensembl import (
diff --git a/test/test_download_cache.py b/test/test_download_cache.py
index 878e322..03c7da6 100644
--- a/test/test_download_cache.py
+++ b/test/test_download_cache.py
@@ -1,5 +1,3 @@
-from __future__ import absolute_import
-
 from nose.tools import assert_raises, ok_
 from pyensembl.download_cache import (
     DownloadCache,
diff --git a/test/test_gene_objects.py b/test/test_gene_objects.py
index 31bd759..2258f43 100644
--- a/test/test_gene_objects.py
+++ b/test/test_gene_objects.py
@@ -1,5 +1,3 @@
-from __future__ import absolute_import
-
 from nose.tools import eq_
 
 from .common import test_ensembl_releases
diff --git a/test/test_id_length.py b/test/test_id_length.py
index f6ae8ef..7371cd4 100644
--- a/test/test_id_length.py
+++ b/test/test_id_length.py
@@ -1,5 +1,3 @@
-from __future__ import absolute_import
-
 from .common import major_releases
 
 from nose.tools import nottest
diff --git a/test/test_locus.py b/test/test_locus.py
index 26e4840..a1af6fd 100644
--- a/test/test_locus.py
+++ b/test/test_locus.py
@@ -1,5 +1,3 @@
-from __future__ import absolute_import
-
 from pyensembl.locus import Locus
 from pyensembl.normalization import normalize_chromosome
 
diff --git a/test/test_memory_cache.py b/test/test_memory_cache.py
deleted file mode 100644
index ef253f4..0000000
--- a/test/test_memory_cache.py
+++ /dev/null
@@ -1,94 +0,0 @@
-from __future__ import absolute_import
-
-import tempfile
-
-from pyensembl import MemoryCache
-
-import pandas as pd
-from nose.tools import raises
-
-memory_cache = MemoryCache()
-
-class Counter(object):
-    """
-    Use this class to count how many times a function gets called by
-    cached_object and cached_dataframe.
-    """
-    def __init__(self):
-        self.count = 0
-
-    def increment(self):
-        self.count += 1
-        return self.count
-
-    def increment_dataframe(self):
-        value = self.increment()
-        return pd.DataFrame({'x': [value]})
-
-def test_cached_object_with_tempfile():
-    """
-    test_cached_object_with_tempfile : A temporary file exists before
-    calling into compute_cache.cached_object but is empty, should be treated
-    as if result has never been computed before (rather than trying to load
-    the empty file).
-    """
-    counter = Counter()
-    with tempfile.NamedTemporaryFile() as f:
-        # call repeatedly to test the hot and cold cache logic
-        result = memory_cache.cached_object(
-            f.name, compute_fn=counter.increment)
-        assert result == 1, "Expected result=1, got %s" % (result,)
-        assert counter.count == 1, \
-            "Expected compute_fn to be called once, got %s" % (counter.count,)
-
-
-def test_cached_dataframe_with_tempfile():
-    """
-    test_cached_dataframe_with_tempfile : A temporary file exists before
-    calling into compute_cache.cached_dataframe but is empty,
-    should be treated as if result has never been computed before
-    (rather than trying to load the empty file).
-    """
-    counter = Counter()
-    with tempfile.NamedTemporaryFile(suffix='.csv') as f:
-        # call repeatedly to test hot and cold cache logic
-        for _ in range(2):
-            df = memory_cache.cached_dataframe(
-                f.name, compute_fn=counter.increment_dataframe)
-            # get counter value from inside of dataframe
-            result = df['x'].iloc[0]
-            assert result == 1, \
-                "Expected result=1, got %s" % (result,)
-            assert counter.count == 1, \
-                "Expected compute_fn to be called once, got %s" % (
-                    counter.count,)
-
-def test_cached_dataframe_returns_correct_type():
-    def make_a_dataframe():
-        return pd.DataFrame({'x': [0, 1, 2]})
-    with tempfile.NamedTemporaryFile(suffix='.csv') as f:
-        # call repeatedly to test the cold and hot cache logic
-        for _ in range(2):
-            df = memory_cache.cached_dataframe(
-                f.name, compute_fn=make_a_dataframe)
-            assert isinstance(df, pd.DataFrame), \
-                "Expected DataFrame, got %s : %s" % (df, type(df))
-
-def test_cached_object_with_list_returns_correct_type():
-    def make_a_list():
-        return [1, 2, 3]
-    with tempfile.NamedTemporaryFile() as f:
-        # call repeatedly to test the cold and hot cache logic
-        for _ in range(2):
-            df = memory_cache.cached_object(
-                f.name, compute_fn=make_a_list)
-            assert isinstance(df, list), \
-                "Expected list, got %s : %s" % (df, type(df))
-
-@raises(Exception)
-def test_dataframe_path_must_be_csv():
-    # compute_cache should raise an exception when filename doesn't
-    # end with .csv extension
-    memory_cache.cached_dataframe(
-        csv_path="tempfile_not_csv",
-        compute_fn=lambda _: pd.DataFrame({'x': []}))
diff --git a/test/test_missing_genome_sources.py b/test/test_missing_genome_sources.py
index 6236e9b..35a4f41 100644
--- a/test/test_missing_genome_sources.py
+++ b/test/test_missing_genome_sources.py
@@ -1,5 +1,3 @@
-from __future__ import absolute_import
-
 from pyensembl import Genome
 from nose.tools import eq_, ok_, assert_raises
 
diff --git a/test/test_mouse.py b/test/test_mouse.py
index 88ee532..24a0b4a 100644
--- a/test/test_mouse.py
+++ b/test/test_mouse.py
@@ -1,5 +1,3 @@
-from __future__ import absolute_import
-
 from nose.tools import eq_, with_setup
 
 from .data import (
diff --git a/test/test_release_versions.py b/test/test_release_versions.py
index c9847b7..4eca06f 100644
--- a/test/test_release_versions.py
+++ b/test/test_release_versions.py
@@ -1,5 +1,3 @@
-from __future__ import absolute_import
-
 from pyensembl import EnsemblRelease, MAX_ENSEMBL_RELEASE
 
 from nose.tools import raises
diff --git a/test/test_search.py b/test/test_search.py
index 135550c..b04688e 100644
--- a/test/test_search.py
+++ b/test/test_search.py
@@ -1,7 +1,6 @@
-from __future__ import absolute_import
+from nose.tools import eq_
 
 from pyensembl import find_nearest_locus
-from nose.tools import eq_
 from .common import test_ensembl_releases
 
 @test_ensembl_releases()
diff --git a/test/test_serialization.py b/test/test_serialization.py
index 5072272..d90b6b7 100644
--- a/test/test_serialization.py
+++ b/test/test_serialization.py
@@ -1,5 +1,3 @@
-# Copyright (c) 2016. Mount Sinai School of Medicine
-#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -12,7 +10,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from __future__ import absolute_import
 import pickle
 from nose.tools import eq_, with_setup
 from pyensembl import Genome, Transcript, Gene, Exon
diff --git a/test/test_timings.py b/test/test_timings.py
index 708de75..a948886 100644
--- a/test/test_timings.py
+++ b/test/test_timings.py
@@ -1,5 +1,3 @@
-from __future__ import print_function, absolute_import
-
 from pyensembl import genome_for_reference_name
 
 from tinytimer import benchmark
diff --git a/test/test_transcript_objects.py b/test/test_transcript_objects.py
index 9e11e1c..79d08d5 100644
--- a/test/test_transcript_objects.py
+++ b/test/test_transcript_objects.py
@@ -1,5 +1,3 @@
-from __future__ import absolute_import
-
 from pyensembl import Locus, cached_release
 from nose.tools import eq_, assert_not_equal, assert_greater
 
diff --git a/test/test_ucsc_gtf.py b/test/test_ucsc_gtf.py
index 7a8ffd7..3e4a9e5 100644
--- a/test/test_ucsc_gtf.py
+++ b/test/test_ucsc_gtf.py
@@ -1,7 +1,6 @@
-from __future__ import absolute_import
+from nose.tools import eq_
 
 from pyensembl import Genome, Database
-from nose.tools import eq_
 
 from .common import TemporaryDirectory
 from .data import data_path