Merge pull request #231 from Jhsmit/cuda

Allow CUDA and float dtype
Jhsmit · Sep 27, 2021 · 64a63aa · 64a63aa
2 parents 24c813c + 6e16ad2
commit 64a63aa
Show file tree

Hide file tree

Showing 11 changed files with 113 additions and 36 deletions.
diff --git a/pyhdx/__init__.py b/pyhdx/__init__.py
@@ -1,9 +1,13 @@
 from .models import PeptideMasterTable, PeptideMeasurements, HDXMeasurement, Coverage, HDXMeasurementSet
 from .fileIO import read_dynamx
 from .fitting_torch import TorchSingleFitResult, TorchBatchFitResult
-from .output import Output, Report
 from ._version import get_versions
 
+try:
+    from .output import Output, Report
+except ModuleNotFoundError:
+    pass
+
 
 __version__ = get_versions()['version']
 

diff --git a/pyhdx/cli.py b/pyhdx/cli.py
@@ -1,7 +1,7 @@
 import argparse
 from ipaddress import ip_address
 from pyhdx.web import serve
-from pyhdx.config import ConfigurationSettings
+from pyhdx.config import cfg
 from pyhdx.local_cluster import verify_cluster, default_cluster
 
 
@@ -15,8 +15,6 @@ def main():
     parser.add_argument('--scheduler_address', help="Run with local cluster <ip>:<port>")
     args = parser.parse_args()
 
-    cfg = ConfigurationSettings()
-
     if args.scheduler_address:
         ip, port = args.scheduler_address.split(':')
         if not ip_address(ip):

diff --git a/pyhdx/config.ini b/pyhdx/config.ini
@@ -1,3 +1,7 @@
 [cluster]
 scheduler_address = 127.0.0.1:52123
 n_workers = 10
+
+[fitting]
+dtype = float64
+device = cpu
diff --git a/pyhdx/config.py b/pyhdx/config.py
@@ -1,10 +1,15 @@
 import configparser
 from pathlib import Path
-from pyhdx import __version__
+from pyhdx._version import get_versions
 from packaging import version
+import torch
 import warnings
 
 
+__version__ = get_versions()['version']
+del get_versions
+
+
 def read_config(path):
     """read .ini config file at path, return configparser.ConfigParser object"""
     config = configparser.ConfigParser()
@@ -86,6 +91,21 @@ def write_config(self, path=None):
         with open(pth, 'w') as config_file:
             self._config.write(config_file)
 
+    @property
+    def TORCH_DTYPE(self):
+        """torch dtype named by the ``dtype`` entry of the ``fitting`` section."""
+        name = self.get('fitting', 'dtype')
+        if name in ('float64', 'double'):
+            return torch.float64
+        if name in ('float32', 'float'):
+            return torch.float32
+        raise ValueError(f'Unsupported data type: {name}')
+
+    @property
+    def TORCH_DEVICE(self):
+        """``torch.device`` built from the ``device`` entry of the ``fitting`` section."""
+        return torch.device(self.get('fitting', 'device'))
+
 
 def valid_config():
     """Checks if the current config file in the user home directory is a valid config
@@ -111,4 +131,6 @@ def valid_config():
 
 config_file_path = config_dir / 'config.ini'
 if not valid_config():
-    reset_config()
+    reset_config()
+
+cfg = ConfigurationSettings()
diff --git a/pyhdx/fitting.py b/pyhdx/fitting.py
@@ -10,9 +10,10 @@
 from tqdm import trange
 
 from pyhdx.fit_models import SingleKineticModel, TwoComponentAssociationModel, TwoComponentDissociationModel
-from pyhdx.fitting_torch import DeltaGFit, TorchSingleFitResult, TorchBatchFitResult, TORCH_DTYPE, TORCH_DEVICE
-from pyhdx.models import Protein
+from pyhdx.fitting_torch import DeltaGFit, TorchSingleFitResult, TorchBatchFitResult
 from pyhdx.support import temporary_seed
+from pyhdx.models import Protein
+from pyhdx.config import cfg
 
 EmptyResult = namedtuple('EmptyResult', ['chi_squared', 'params'])
 er = EmptyResult(np.nan, {k: np.nan for k in ['tau1', 'tau2', 'r']})
@@ -451,7 +452,7 @@ def fit_gibbs_global(hdxm, initial_guess, r1=R1, epochs=EPOCHS, patience=PATIENC
     assert len(initial_guess) == hdxm.Nr, "Invalid length of initial guesses"
 
     dtype = torch.float64
-    deltaG_par = torch.nn.Parameter(torch.tensor(initial_guess, dtype=TORCH_DTYPE, device=TORCH_DEVICE).unsqueeze(-1))  #reshape (nr, 1)
+    deltaG_par = torch.nn.Parameter(torch.tensor(initial_guess, dtype=cfg.TORCH_DTYPE, device=cfg.TORCH_DEVICE).unsqueeze(-1))  #reshape (nr, 1)
 
     model = DeltaGFit(deltaG_par)
     criterion = torch.nn.MSELoss(reduction='mean')
@@ -580,7 +581,7 @@ def _batch_fit(hdx_set, initial_guess, reg_func, fit_kwargs, optimizer_kwargs):
 
     assert initial_guess.shape == (hdx_set.Ns, hdx_set.Nr), "Invalid shape of initial guesses"
 
-    deltaG_par = torch.nn.Parameter(torch.tensor(initial_guess, dtype=TORCH_DTYPE, device=TORCH_DEVICE).reshape(hdx_set.Ns, hdx_set.Nr, 1))
+    deltaG_par = torch.nn.Parameter(torch.tensor(initial_guess, dtype=cfg.TORCH_DTYPE, device=cfg.TORCH_DEVICE).reshape(hdx_set.Ns, hdx_set.Nr, 1))
 
     model = DeltaGFit(deltaG_par)
     criterion = torch.nn.MSELoss(reduction='mean')

diff --git a/pyhdx/fitting_torch.py b/pyhdx/fitting_torch.py
@@ -8,9 +8,10 @@
 
 from pyhdx.fileIO import dataframe_to_file
 from pyhdx.models import Protein
+from pyhdx.config import cfg
 
-TORCH_DTYPE = t.double
-TORCH_DEVICE = t.device('cpu')
+# TORCH_DTYPE = t.double
+# TORCH_DEVICE = t.device('cpu')
 
 class DeltaGFit(nn.Module):
     def __init__(self, deltaG):
@@ -46,11 +47,12 @@ def estimate_errors(hdxm, deltaG):
     -------
 
     """
+    dtype = t.float64
     joined = pd.concat([deltaG, hdxm.coverage['exchanges']], axis=1, keys=['dG', 'ex'])
     dG = joined.query('ex==True')['dG']
-    deltaG = t.tensor(dG.to_numpy(), dtype=TORCH_DTYPE)
+    deltaG = t.tensor(dG.to_numpy(), dtype=dtype)
 
-    tensors = {k: v.cpu() for k, v in hdxm.get_tensors(exchanges=True).items()}
+    tensors = {k: v.cpu() for k, v in hdxm.get_tensors(exchanges=True, dtype=dtype).items()}
 
     def hes_loss(deltaG_input):
         criterion = t.nn.MSELoss(reduction='sum')

diff --git a/pyhdx/local_cluster.py b/pyhdx/local_cluster.py
@@ -1,10 +1,8 @@
 from dask.distributed import LocalCluster, Client
 import time
-from pyhdx.config import ConfigurationSettings
+from pyhdx.config import cfg
 import argparse
 
-cfg = ConfigurationSettings()
-
 def default_client(timeout='2s'):
     """Return Dask client at scheduler adress as defined by the global config"""
     scheduler_address = cfg.get('cluster', 'scheduler_address')

diff --git a/pyhdx/models.py b/pyhdx/models.py
@@ -12,6 +12,7 @@
 from pyhdx.alignment import align_dataframes
 from pyhdx.fileIO import dataframe_to_file
 from pyhdx.support import reduce_inter, fields_view
+from pyhdx.config import cfg
 
 
 def protein_wrapper(func, *args, **kwargs):
@@ -748,7 +749,7 @@ def d_exp(self):
         df.columns.name = 'exposure'
         return df
 
-    def get_tensors(self, exchanges=False):
+    def get_tensors(self, exchanges=False, dtype=None):
         """
         Returns a dictionary of tensor variables for fitting to Linderstrøm-Lang kinetics.
 
@@ -784,8 +785,8 @@ def get_tensors(self, exchanges=False):
         else:
             bools = np.ones(self.Nr, dtype=bool)
 
-        dtype = pyhdx.fitting_torch.TORCH_DTYPE
-        device = pyhdx.fitting_torch.TORCH_DEVICE
+        dtype = dtype or cfg.TORCH_DTYPE
+        device = cfg.TORCH_DEVICE
 
         tensors = {
             'temperature': torch.tensor([self.temperature], dtype=dtype, device=device).unsqueeze(-1),
@@ -1130,7 +1131,7 @@ def add_alignment(self, alignment, first_r_numbers=None):
 
         self.aligned_indices = df.to_numpy(dtype=int).T
 
-    def get_tensors(self):
+    def get_tensors(self, dtype=None):
         #todo create correct shapes as per table X for all
         temperature = np.array([kf.temperature for kf in self.hdxm_list])
 
@@ -1142,8 +1143,8 @@ def get_tensors(self):
         k_int = np.zeros((self.Ns, self.Nr))
         k_int[self.masks['sr']] = k_int_values
 
-        dtype = pyhdx.fitting_torch.TORCH_DTYPE
-        device = pyhdx.fitting_torch.TORCH_DEVICE
+        dtype = dtype or cfg.TORCH_DTYPE
+        device = cfg.TORCH_DEVICE
 
         tensors = {
             'temperature': torch.tensor(temperature, dtype=dtype, device=device).reshape(self.Ns, 1, 1),

diff --git a/pyhdx/web/apps.py b/pyhdx/web/apps.py
@@ -15,7 +15,7 @@
 import logging
 import panel as pn
 from pyhdx.web.log import logger
-from pyhdx.config import ConfigurationSettings
+from pyhdx.config import cfg
 from pyhdx.local_cluster import default_client
 
 from pathlib import Path
@@ -27,7 +27,6 @@
 current_dir = Path(__file__).parent
 data_dir = current_dir.parent.parent / 'tests' / 'test_data'
 global_opts = {'show_grid': True}
-cfg = ConfigurationSettings()
 
 @logger('pyhdx')
 def main_app(client='default'):

diff --git a/pyhdx/web/serve.py b/pyhdx/web/serve.py
@@ -4,7 +4,7 @@
 import numpy as np
 import torch
 
-from pyhdx.config import ConfigurationSettings
+from pyhdx.config import cfg
 from pyhdx.local_cluster import verify_cluster
 
 import logging
@@ -24,7 +24,7 @@ def run_main():
     np.random.seed(43)
     torch.manual_seed(43)
 
-    scheduler_address = ConfigurationSettings().get('cluster', 'scheduler_address')
+    scheduler_address = cfg.get('cluster', 'scheduler_address')
     if not verify_cluster(scheduler_address):
         print(f"No valid Dask scheduler found at specified address: '{scheduler_address}'")
         return

diff --git a/tests/test_fitting.py b/tests/test_fitting.py
@@ -3,13 +3,15 @@
 from pyhdx.fileIO import read_dynamx, csv_to_protein, csv_to_dataframe, save_fitresult, load_fitresult
 from pyhdx.fitting import fit_rates_weighted_average, fit_gibbs_global, fit_gibbs_global_batch, fit_gibbs_global_batch_aligned
 from pyhdx.models import HDXMeasurementSet
+from pyhdx.config import cfg
 import numpy as np
 import torch
 import time
 from dask.distributed import LocalCluster
 from pathlib import Path
 
 import pandas as pd
+from pandas.testing import assert_series_equal
 
 cwd = Path(__file__).parent
 input_dir = cwd / 'test_data' / 'input'
@@ -52,6 +54,35 @@ def test_initial_guess(self):
         # todo additional tests:
         #  result = fit_rates_half_time_interpolate()
 
+    def test_dtype_cuda(self):
+        """Fit with device 'cuda', then with dtype float32; cpu/float64 is always restored."""
+        check_deltaG = csv_to_protein(output_dir / 'ecSecB_torch_fit.csv')
+        initial_rates = csv_to_dataframe(output_dir / 'ecSecB_guess.csv')
+
+        try:
+            cfg.set('fitting', 'device', 'cuda')
+            gibbs_guess = self.hdxm_apo.guess_deltaG(initial_rates['rate']).to_numpy()
+
+            if torch.cuda.is_available():
+                fr_global = fit_gibbs_global(self.hdxm_apo, gibbs_guess, epochs=1000, r1=2)
+                out_deltaG = fr_global.output
+                for field in ['deltaG', 'k_obs', 'covariance']:
+                    assert_series_equal(check_deltaG[field], out_deltaG[field], rtol=0.01, check_dtype=False)
+            else:
+                with pytest.raises(AssertionError, match=r".* CUDA .*"):
+                    fit_gibbs_global(self.hdxm_apo, gibbs_guess, epochs=1000, r1=2)
+
+            cfg.set('fitting', 'device', 'cpu')
+            cfg.set('fitting', 'dtype', 'float32')
+            fr_global = fit_gibbs_global(self.hdxm_apo, gibbs_guess, epochs=1000, r1=2)
+            assert fr_global.model.deltaG.dtype == torch.float32
+
+            out_deltaG = fr_global.output
+            for field in ['deltaG', 'k_obs']:
+                assert_series_equal(check_deltaG[field], out_deltaG[field], rtol=0.01, check_dtype=False)
+        finally:
+            cfg.set('fitting', 'device', 'cpu'); cfg.set('fitting', 'dtype', 'float64')
+
     def test_global_fit(self):
         initial_rates = csv_to_dataframe(output_dir / 'ecSecB_guess.csv')
 
@@ -64,33 +95,50 @@ def test_global_fit(self):
         out_deltaG = fr_global.output
         check_deltaG = csv_to_protein(output_dir / 'ecSecB_torch_fit.csv')
 
-        assert np.allclose(check_deltaG['deltaG'], out_deltaG['deltaG'], equal_nan=True, rtol=0.01)
-        assert np.allclose(check_deltaG['covariance'], out_deltaG['covariance'], equal_nan=True, rtol=0.01)
-        assert np.allclose(check_deltaG['k_obs'], out_deltaG['k_obs'], equal_nan=True, rtol=0.01)
+        for field in ['deltaG', 'covariance', 'k_obs']:
+            assert_series_equal(check_deltaG[field], out_deltaG[field], rtol=0.01)
 
         mse = fr_global.get_mse()
         assert mse.shape == (self.hdxm_apo.Np, self.hdxm_apo.Nt)
 
     @pytest.mark.skip(reason="Longer fit is not checked by default due to long computation times")
     def test_global_fit_extended(self):
+        check_deltaG = csv_to_protein(output_dir / 'ecSecB_torch_fit_epochs_20000.csv')
         initial_rates = csv_to_dataframe(output_dir / 'ecSecB_guess.csv')
+        gibbs_guess = self.hdxm_apo.guess_deltaG(initial_rates['rate']).to_numpy()
 
         t0 = time.time()  # Very crude benchmarks
-        gibbs_guess = self.hdxm_apo.guess_deltaG(initial_rates['rate']).to_numpy()
         fr_global = fit_gibbs_global(self.hdxm_apo, gibbs_guess, epochs=20000, r1=2)
         t1 = time.time()
 
         assert t1 - t0 < 20
         out_deltaG = fr_global.output
-        check_deltaG = csv_to_protein(output_dir / 'ecSecB_torch_fit_epochs_20000.csv')
-
-        assert np.allclose(check_deltaG['deltaG'], out_deltaG['deltaG'], equal_nan=True, rtol=0.01)
-        assert np.allclose(check_deltaG['covariance'], out_deltaG['covariance'], equal_nan=True, rtol=0.01)
-        assert np.allclose(check_deltaG['k_obs'], out_deltaG['k_obs'], equal_nan=True, rtol=0.01)
+        for field in ['deltaG', 'k_obs', 'covariance']:
+            assert_series_equal(check_deltaG[field], out_deltaG[field], rtol=0.01, check_dtype=False)
 
         mse = fr_global.get_mse()
         assert mse.shape == (self.hdxm_apo.Np, self.hdxm_apo.Nt)
 
+    @pytest.mark.skip(reason="Longer fit is not checked by default due to long computation times")
+    def test_global_fit_extended_cuda(self):
+        """20000-epoch fit with device 'cuda' and dtype float32; cpu/float64 is always restored."""
+        check_deltaG = csv_to_protein(output_dir / 'ecSecB_torch_fit_epochs_20000.csv')
+        initial_rates = csv_to_dataframe(output_dir / 'ecSecB_guess.csv')
+        gibbs_guess = self.hdxm_apo.guess_deltaG(initial_rates['rate']).to_numpy()
+
+        # todo: allow a context manager for temporary config overrides?
+        try:
+            cfg.set('fitting', 'device', 'cuda')
+            cfg.set('fitting', 'dtype', 'float32')
+            fr_global = fit_gibbs_global(self.hdxm_apo, gibbs_guess, epochs=20000, r1=2)
+            out_deltaG = fr_global.output
+            for field in ['deltaG', 'k_obs']:
+                assert_series_equal(check_deltaG[field], out_deltaG[field], rtol=0.01, check_dtype=False)
+        finally:
+            cfg.set('fitting', 'device', 'cpu')
+            cfg.set('fitting', 'dtype', 'float64')
+
+
     def test_batch_fit(self, tmp_path):
         hdx_set = HDXMeasurementSet([self.hdxm_apo, self.hdxm_dimer])
         guess = csv_to_dataframe(output_dir / 'ecSecB_guess.csv')