Skip to content
This repository has been archived by the owner on Sep 11, 2023. It is now read-only.

Commit

Permalink
Merge pull request #1242 from marscher/minor
Browse files Browse the repository at this point in the history
Bugfixes and export_to_hdf5
  • Loading branch information
marscher authored Feb 9, 2018
2 parents 4173ebe + 0f74323 commit 3f13a09
Show file tree
Hide file tree
Showing 44 changed files with 628 additions and 489 deletions.
7 changes: 7 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,13 @@ env:
- CONDA_PY=3.5
- CONDA_PY=3.6

matrix:
exclude: # test only 2.7 on osx.
- env: CONDA_PY=3.5
os: osx
- os: osx
env: CONDA_PY=3.6

before_install:
- source devtools/ci/travis/install_miniconda.sh

Expand Down
9 changes: 9 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,12 @@ def add_np(doctest_namespace):
np.set_printoptions(legacy='1.13')
except TypeError:
pass


@pytest.fixture(autouse=True)
def filter_warnings():
    """Auto-applied fixture: suppress the 'no features selected' warning.

    Snapshots the active warning filters, installs an 'ignore' rule for the
    featurizer's "plain coordinates" message, and restores the snapshot on
    teardown so the rule cannot leak into other tests.
    """
    import warnings
    old_filters = warnings.filters[:]
    warnings.filterwarnings(
        'ignore',
        message='You have not selected any features. Returning plain coordinates.')
    try:
        yield
    finally:
        # Restore even if an exception is thrown into the generator at the
        # yield point — the original code skipped restoration in that case,
        # leaking the filter into subsequent tests.
        warnings.filters = old_filters
5 changes: 3 additions & 2 deletions devtools/conda-recipe/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,12 @@ requirements:
- python
- scipy
- setuptools
- gcc # [ not win ]
- toolchain

run:
- bhmm >=0.6,<0.7
- decorator >=4.0.0
- h5py
- libgcc # [linux or osx]
- matplotlib
- mdtraj
- mock # TODO: remove when py3k only.
Expand All @@ -59,6 +59,7 @@ test:
requires:
- pytest
- pytest-cov
- coverage
# TODO: disabled on win64, until https://bugs.python.org/issue31701 is fixed.
- pytest-faulthandler # [not win]
- pytest-xdist
Expand Down
1 change: 1 addition & 0 deletions devtools/conda-recipe/run_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
"{njobs_args} "
"--junit-xml={junit_xml} "
"-c {pytest_cfg}"
#"--durations=20 "
.format(test_pkg=test_pkg, cover_pkg=cover_pkg,
junit_xml=junit_xml, pytest_cfg='setup.cfg',
dest_report=os.path.join(os.path.expanduser('~/'), 'coverage.xml'),
Expand Down
2 changes: 1 addition & 1 deletion doc/source/CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ your Python installation to at least version 3.5 to catch future updates.
extracted per iteration from a data source. This is invariant to the dimension of data sets. #1190
- datasets: added Prinz potential (quadwell). #1226
- coordinates: added VAMP estimator. #1237

- coordinates: added method 'write_to_hdf5' for easy exporting streams to HDF5. #1242

- References:

Expand Down
4 changes: 0 additions & 4 deletions pyemma/_base/loggable.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,10 +61,6 @@ def logger(self):
self.__create_logger()
return self._logger_instance

@property
def _logger(self):
return self.logger

def _logger_is_active(self, level):
""" @param level: int log level (debug=10, info=20, warn=30, error=40, critical=50)"""
return self.logger.level >= level
Expand Down
3 changes: 2 additions & 1 deletion pyemma/_base/serialization/serialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,8 @@ def _get_interpolation_map(cls):
return map

def save(self, file_name, model_name='default', overwrite=False, save_streaming_chain=False):
r"""
r""" saves the current state of this object to given file and name.
Parameters
-----------
file_name: str
Expand Down
11 changes: 7 additions & 4 deletions pyemma/coordinates/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,13 +80,16 @@ def _check_old_chunksize_arg(chunksize, chunk_size_default, **kw):
chosen_chunk_size = chunksize
else:
import warnings
from pyemma.util.annotators import get_culprit
filename, lineno = get_culprit(3)
if is_default: # case 2.
warnings.warn('Passed deprecated argument "chunk_size", please use "chunksize"',
category=_PyEMMA_DeprecationWarning)
warnings.warn_explicit('Passed deprecated argument "chunk_size", please use "chunksize"',
category=_PyEMMA_DeprecationWarning, filename=filename, lineno=lineno)
chosen_chunk_size = kw.pop('chunk_size') # remove this argument to avoid further passing to other funcs.
else: # case 3.
warnings.warn('Passed two values for chunk size: "chunk_size" and "chunksize", while the first one'
' is deprecated. Please use "chunksize" in the future.', category=_PyEMMA_DeprecationWarning)
warnings.warn_explicit('Passed two values for chunk size: "chunk_size" and "chunksize", while the first one'
' is deprecated. Please use "chunksize" in the future.',
category=_PyEMMA_DeprecationWarning, filename=filename, lineno=lineno)
chosen_chunk_size = chunksize
assert chosen_chunk_size is not NotImplemented
return chosen_chunk_size
Expand Down
3 changes: 2 additions & 1 deletion pyemma/coordinates/clustering/include/bits/kmeans_bits.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,10 +87,11 @@ KMeans<dtype>::cluster(const np_array &np_chunk, const np_array &np_centers, int
}
#else
{
std::mutex mutex;

std::vector<scoped_thread> threads;
threads.reserve(static_cast<std::size_t>(n_threads));

std::mutex mutex;
std::size_t grainSize = n_frames / n_threads;

auto worker = [&](std::size_t tid, std::size_t begin, std::size_t end, std::mutex& m) {
Expand Down
4 changes: 2 additions & 2 deletions pyemma/coordinates/clustering/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,11 +293,11 @@ def save_dtrajs(self, trajfiles=None, prefix='',

for filename, dtraj in zip(output_files, self.dtrajs):
dest = path.join(output_dir, filename)
self._logger.debug('writing dtraj to "%s"' % dest)
self.logger.debug('writing dtraj to "%s"' % dest)
try:
if path.exists(dest) and not self.overwrite_dtrajs:
raise EnvironmentError('Attempted to write dtraj "%s" which already existed. To automatically'
' overwrite existing files, set source.overwrite_dtrajs=True.' % dest)
write_dtraj(dest, dtraj)
except IOError:
self._logger.exception('Exception during writing dtraj to "%s"' % dest)
self.logger.exception('Exception during writing dtraj to "%s"' % dest)
14 changes: 7 additions & 7 deletions pyemma/coordinates/clustering/kmeans.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ def _estimate(self, iterable, **kw):
first_chunk = False
self.initial_centers_ = self.clustercenters[:]

self._logger.debug("Accumulated all data, running kmeans on %s", self._in_memory_chunks.shape)
self.logger.debug("Accumulated all data, running kmeans on %s", self._in_memory_chunks.shape)
self._in_memory_chunks_set = True
else:
if len(self.clustercenters) != self.n_clusters:
Expand All @@ -240,11 +240,11 @@ def _estimate(self, iterable, **kw):
callback)
if code == 0:
self._converged = True
self._logger.info("Cluster centers converged after %i steps.", iterations + 1)
self.logger.info("Cluster centers converged after %i steps.", iterations + 1)
else:
self._logger.info("Algorithm did not reach convergence criterion"
self.logger.info("Algorithm did not reach convergence criterion"
" of %g in %i iterations. Consider increasing max_iter.",
self.tolerance, self.max_iter)
self.tolerance, self.max_iter)
self._finish_estimate()

return self
Expand Down Expand Up @@ -272,7 +272,7 @@ def _init_estimate(self):
total_length = sum(traj_lengths)
if not self.n_clusters:
self.n_clusters = min(int(math.sqrt(total_length)), 5000)
self._logger.info("The number of cluster centers was not specified, "
self.logger.info("The number of cluster centers was not specified, "
"using min(sqrt(N), 5000)=%s as n_clusters." % self.n_clusters)
from pyemma.coordinates.data import DataInMemory
if not isinstance(self, MiniBatchKmeansClustering) and not isinstance(self.data_producer, DataInMemory):
Expand Down Expand Up @@ -431,7 +431,7 @@ def _estimate(self, iterable, **kw):

if rel_change <= self.tolerance:
self._converged = True
self._logger.info("Cluster centers converged after %i steps.", i_pass + 1)
self.logger.info("Cluster centers converged after %i steps.", i_pass + 1)
self._progress_force_finish(stage=1)
else:
self._progress_update(1, stage=1)
Expand All @@ -441,6 +441,6 @@ def _estimate(self, iterable, **kw):
self._finish_estimate()

if not self._converged:
self._logger.info("Algorithm did not reach convergence criterion"
self.logger.info("Algorithm did not reach convergence criterion"
" of %g in %i iterations. Consider increasing max_iter.", self.tolerance, self.max_iter)
return self
8 changes: 4 additions & 4 deletions pyemma/coordinates/clustering/tests/test_assign.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,8 +234,8 @@ def test_assignment_multithread(self):
# re-do assignment with multiple threads and compare results
chunksize = 1000

assignment_mp = coor.assign_to_centers(self.X, self.centers_big, n_jobs=2, chunk_size=chunksize)
assignment_sp = coor.assign_to_centers(self.X, self.centers_big, n_jobs=1, chunk_size=chunksize)
assignment_mp = coor.assign_to_centers(self.X, self.centers_big, n_jobs=2, chunksize=chunksize)
assignment_sp = coor.assign_to_centers(self.X, self.centers_big, n_jobs=1, chunksize=chunksize)

np.testing.assert_equal(assignment_mp, assignment_sp)

Expand All @@ -252,8 +252,8 @@ def test_assignment_multithread_minrsmd(self):
).reshape((N_centers, -1))
chunksize = 1000

assignment_mp = coor.assign_to_centers(reader, centers, n_jobs=2, chunk_size=chunksize, metric='minRMSD')
assignment_sp = coor.assign_to_centers(reader, centers, n_jobs=1, chunk_size=chunksize, metric='minRMSD')
assignment_mp = coor.assign_to_centers(reader, centers, n_jobs=2, chunksize=chunksize, metric='minRMSD')
assignment_sp = coor.assign_to_centers(reader, centers, n_jobs=1, chunksize=chunksize, metric='minRMSD')

np.testing.assert_equal(assignment_mp, assignment_sp)

Expand Down
6 changes: 3 additions & 3 deletions pyemma/coordinates/clustering/uniform_time.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,17 +68,17 @@ def _estimate(self, iterable, **kw):
traj_lengths = self.trajectory_lengths(stride=self.stride, skip=self.skip)
total_length = sum(traj_lengths)
self.n_clusters = min(int(math.sqrt(total_length)), 5000)
self._logger.info("The number of cluster centers was not specified, "
self.logger.info("The number of cluster centers was not specified, "
"using min(sqrt(N), 5000)=%s as n_clusters." % self.n_clusters)

# initialize time counters
T = iterable.n_frames_total(stride=self.stride, skip=self.skip)
if self.n_clusters > T:
self.n_clusters = T
self._logger.info('Requested more clusters (k = %i'
self.logger.info('Requested more clusters (k = %i'
' than there are total data points %i)'
'. Will do clustering with k = %i'
% (self.n_clusters, T, T))
% (self.n_clusters, T, T))

# first data point in the middle of the time segment
next_t = (T // self.n_clusters) // 2
Expand Down
47 changes: 47 additions & 0 deletions pyemma/coordinates/data/_base/_in_memory_mixin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@

class InMemoryMixin(object):
    """Adds an ``in_memory`` switch that caches a data source's output.

    Flipping :attr:`in_memory` to True pulls the complete output of the
    iterable/datasource into memory (stored in ``_Y``) and wraps it in a
    ``DataInMemory`` reader (``_Y_source``); flipping it back to False
    releases both again.
    """

    __serialize_version = 0
    __serialize_fields = ('_in_memory', '_Y', '_Y_source')

    def __init__(self):
        super(InMemoryMixin, self).__init__()
        self._in_memory = False               # is the output currently cached?
        self._mapping_to_mem_active = False   # guard flag set while get_output runs
        self._Y = None                        # cached output
        self._Y_source = None                 # DataInMemory reader built around _Y

    @property
    def in_memory(self):
        r"""are results stored in memory?"""
        return self._in_memory

    @in_memory.setter
    def in_memory(self, op_in_mem):
        r"""
        If set to True, the output will be stored in memory.
        """
        currently = self.in_memory
        # Act only on an actual state change; re-assigning the current
        # state is a no-op.
        if op_in_mem and not currently:
            self._map_to_memory()
        elif currently and not op_in_mem:
            self._clear_in_memory()

    def _clear_in_memory(self):
        # Drop the cached output and the reader wrapping it.
        self._Y = None
        self._Y_source = None
        self._in_memory = False

    def _map_to_memory(self, stride=1):
        r"""Maps results to memory. Will be stored in attribute :attr:`_Y`."""
        self._mapping_to_mem_active = True
        try:
            self._Y = self.get_output(stride=stride)
            from pyemma.coordinates.data import DataInMemory
            self._Y_source = DataInMemory(self._Y)
        finally:
            # Always reset the guard, even if get_output raised.
            self._mapping_to_mem_active = False

        self._in_memory = True
Loading

0 comments on commit 3f13a09

Please sign in to comment.