diff --git a/Tests/test_batchandrocorrection.py b/Tests/test_batchandrocorrection.py index 62a13a14..627c556c 100644 --- a/Tests/test_batchandrocorrection.py +++ b/Tests/test_batchandrocorrection.py @@ -2,17 +2,14 @@ Test that batch and run-order correction behaves sensibly with a combination of synthetic and model datasets. """ -import scipy -import pandas import numpy -import seaborn as sns import sys import unittest -import os sys.path.append("..") import nPYc from generateTestDataset import generateTestDataset +from nPYc.enumerations import SampleType class test_rocorrection(unittest.TestCase): @@ -20,25 +17,47 @@ class test_rocorrection(unittest.TestCase): Use stored and synthetic datasets to validate run-order correction. """ - def test_correctmsdataset_parallelisation(self): - """ - Check that parallel and single threaded code paths return the same result. - """ + def setUp(self): + self.noSamp = numpy.random.randint(500, high=2000, size=None) + self.noFeat = numpy.random.randint(50, high=100, size=None) + + self.msData = generateTestDataset(self.noSamp, self.noFeat, dtype='MSDataset') + self.msDataERCorrection = generateTestDataset(20, 2, dtype='MSDataset') + self.msDataERCorrection._intensityData[self.msDataERCorrection.sampleMetadata['SampleType'] == SampleType.StudyPool, :] = 0.5 + self.msDataERCorrection._intensityData[self.msDataERCorrection.sampleMetadata['SampleType'] == SampleType.StudySample, :] = 2 + self.msDataERCorrection._intensityData[self.msDataERCorrection.sampleMetadata['SampleType'] == SampleType.ExternalReference, :] = 1 + + def test_correctMSDataset(self): + with self.subTest(msg='Test correctMSdataset parallelisation'): + """ + Check that parallel and single threaded code paths return the same result. + """ + + correctedDataP = nPYc.batchAndROCorrection.correctMSdataset(self.msData, parallelise=True) + correctedDataS = nPYc.batchAndROCorrection.correctMSdataset(self.msData, parallelise=False) - noSamp = numpy.random.randint(500, high=2000, size=None) - noFeat = numpy.random.randint(50, high=100, size=None) + numpy.testing.assert_array_almost_equal(correctedDataP.fit, correctedDataS.fit, err_msg="Serial and parallel fits not equal.") - msData = generateTestDataset(noSamp, noFeat, dtype='MSDataset') + numpy.testing.assert_array_almost_equal(correctedDataP.intensityData, correctedDataS.intensityData, err_msg="Serial and parallel corrected data not equal.") - correctedDataP = nPYc.batchAndROCorrection.correctMSdataset(msData, parallelise=True) - correctedDataS = nPYc.batchAndROCorrection.correctMSdataset(msData, parallelise=False) + with self.subTest(msg='Test correctMSdataset correction sample selection'): + """ + Check that parallel and single threaded code paths return the same result. + """ + expectedCorrectedERDataFit = numpy.ones((20, 2)) - numpy.testing.assert_array_almost_equal(correctedDataP.fit, correctedDataS.fit, err_msg="Serial and parallel fits not equal.") + expectedCorrectedERDataIntensity = numpy.ones((20, 2)) + expectedCorrectedERDataIntensity[self.msDataERCorrection.sampleMetadata['SampleType'] == SampleType.StudyPool, :] = 0.5 + expectedCorrectedERDataIntensity[self.msDataERCorrection.sampleMetadata['SampleType'] == SampleType.StudySample, :] = 2 - numpy.testing.assert_array_almost_equal(correctedDataP.intensityData, correctedDataS.intensityData, err_msg="Serial and parallel corrected data not equal.") + correctedDataER = nPYc.batchAndROCorrection.correctMSdataset(self.msDataERCorrection, parallelise=False, correctionSampleType=SampleType.ExternalReference) + numpy.testing.assert_array_almost_equal(correctedDataER.fit, expectedCorrectedERDataFit, err_msg="Correction trendlines are not equal.") -class test_rocorrection_sythetic(unittest.TestCase): + numpy.testing.assert_array_almost_equal(correctedDataER.intensityData, expectedCorrectedERDataIntensity, err_msg="Corrected intensities are not equal") + + +class test_rocorrection_synthetic(unittest.TestCase): def setUp(self): @@ -64,10 +83,12 @@ def test_runOrderCompensation_synthetic(self): (only testing LOWESS at present) """ - (corrected,fit) = nPYc.batchAndROCorrection._batchAndROCorrection.runOrderCompensation(self.testD, + corrected, fit = nPYc.batchAndROCorrection._batchAndROCorrection.runOrderCompensation(self.testD, self.testRO, self.testSRmask, - {'window':11, 'method':'LOWESS'}) + {'window': 11, + 'method': 'LOWESS', + 'align': 'median'}) numpy.testing.assert_array_almost_equal(self.testD, fit) numpy.testing.assert_array_almost_equal(numpy.std(corrected), 0.) @@ -77,7 +98,7 @@ def test_doLOESScorrection_synthetic(self): Correction of a monotonically increasing trend should be essentially perfect. """ - (corrected,fit) = nPYc.batchAndROCorrection._batchAndROCorrection.doLOESScorrection(self.testD[self.testSRmask], + corrected, fit = nPYc.batchAndROCorrection._batchAndROCorrection.doLOESScorrection(self.testD[self.testSRmask], self.testRO[self.testSRmask], self.testD, self.testRO, @@ -86,7 +107,7 @@ def test_doLOESScorrection_synthetic(self): numpy.testing.assert_array_almost_equal(self.testD, fit) numpy.testing.assert_array_almost_equal(numpy.std(corrected), 0.) - def test_batchCorrection_sythetic(self): + def test_batchCorrection_synthetic(self): # We need at least two features - pick which randomly testD2 = numpy.array([self.testD, self.testD]).T @@ -97,7 +118,7 @@ def test_batchCorrection_sythetic(self): self.testSRmask, numpy.ones_like(self.testRO), [featureNo], - {'align':'mean', 'window':11, 'method':'LOWESS'}, + {'align': 'mean', 'window': 11, 'method': 'LOWESS'}, 0) featureNo_out = output[featureNo][0] @@ -147,7 +168,7 @@ def setUp(self): sigma = numpy.random.randn(1) self.testD[batchMask] = sigma * numpy.random.randn(noSamples) + batchMean - def test_batchCorrection_sythetic(self): + def test_batchCorrection_synthetic(self): """ Check we can correct the offset in averages of two normal distributions """ @@ -163,7 +184,7 @@ def test_batchCorrection_sythetic(self): self.testSRmask, self.batch, [featureNo], - {'align':'mean', 'window':11, 'method':None}, + {'align': 'mean', 'window': 11, 'method': None}, 0) means = list() @@ -182,7 +203,7 @@ def test_batchCorrection_sythetic(self): self.testSRmask, self.batch, [featureNo], - {'align':'median', 'window':11, 'method':None}, + {'align': 'median', 'window': 11, 'method': None}, 0) medians = list() @@ -193,6 +214,50 @@ def test_batchCorrection_sythetic(self): numpy.testing.assert_allclose(medians, overallMedian) + with self.subTest(msg='Checking alignment=\'none\', with method == None'): + + output = nPYc.batchAndROCorrection._batchAndROCorrection._batchCorrection(testD2, + self.testRO, + self.testSRmask, + self.batch, + [featureNo], + {'align': 'no', 'window': 11, 'method': None}, + 0) + # Check if means are unchanged before and after correction (with method == None) + means = list() + batchWiseMeans = list() + batches = list(set(self.batch)) + for batch in batches: + feature = output[featureNo][1] + batchWiseMeans.append(numpy.mean(self.testD[(self.batch == batch) & self.testSRmask])) + means.append(numpy.mean(feature[(self.batch == batch) & self.testSRmask])) + + numpy.testing.assert_allclose(means, batchWiseMeans) + + with self.subTest(msg='Checking alignment=\'none\', with method == LOWESS'): + + output = nPYc.batchAndROCorrection._batchAndROCorrection._batchCorrection( + testD2, + self.testRO, + self.testSRmask, + self.batch, + [featureNo], + {'align': 'no', 'window': 11, 'method': 'LOWESS'}, + 0) + + # Check if means are unchanged before and after correction (with method == 'LOWESS') + # Without alignment, the mean of corrected data is expected to be close to 1 if batch + # contains positive values and -np.inf if batch contains negative values + batches = list(set(self.batch)) + expectedMeans = numpy.array([1 if numpy.mean(self.testD[(self.batch == x) & self.testSRmask]) > 0 else -numpy.inf for x in batches]) + means = list() + for batch in batches: + feature = output[featureNo][1] + means.append(numpy.mean( + feature[(self.batch == batch) & self.testSRmask])) + + numpy.testing.assert_allclose(means, expectedMeans, rtol=1e-02) + def test_correctMSdataset_raises(self): with self.subTest(msg='Object type'): diff --git a/nPYc/__init__.py b/nPYc/__init__.py index cc1ba02b..d4d78218 100644 --- a/nPYc/__init__.py +++ b/nPYc/__init__.py @@ -1,7 +1,7 @@ """ The `nPYc-Toolbox `_ defines objects for representing, and implements functions to manipulate and display, metabolic profiling datasets. """ -__version__ = '1.2.6' +__version__ = '1.2.7' from . import enumerations from .objects import Dataset, MSDataset, NMRDataset, TargetedDataset diff --git a/nPYc/batchAndROCorrection/_batchAndROCorrection.py b/nPYc/batchAndROCorrection/_batchAndROCorrection.py index 3375a9ce..00260616 100755 --- a/nPYc/batchAndROCorrection/_batchAndROCorrection.py +++ b/nPYc/batchAndROCorrection/_batchAndROCorrection.py @@ -18,7 +18,7 @@ from ..enumerations import AssayRole, SampleType -def correctMSdataset(data, window=11, method='LOWESS', align='median', parallelise=True, excludeFailures=True): +def correctMSdataset(data, window=11, method='LOWESS', align='median', parallelise=True, excludeFailures=True, correctionSampleType=SampleType.StudyPool): """ Conduct run-order correction and batch alignment on the :py:class:`~nPYc.objects.MSDataset` instance *data*, returning a new instance with corrected intensity values. @@ -31,6 +31,7 @@ def correctMSdataset(data, window=11, method='LOWESS', align='median', paralleli :param str align: Average calculation of batch and feature intensity for correction, one of 'median' (default) or 'mean' :param bool parallelise: If ``True``, use multiple cores :param bool excludeFailures: If ``True``, remove features where a correct fit could not be calculated from the dataset + :param enum correctionSampleType: Which SampleType to use for the correction, default SampleType.StudyPool :return: Duplicate of *data*, with run-order correction applied :rtype: MSDataset """ @@ -44,19 +45,21 @@ def correctMSdataset(data, window=11, method='LOWESS', align='median', paralleli if method is not None: if not isinstance(method, str) & (method in {'LOWESS', 'SavitzkyGolay'}): raise ValueError('method must be == LOWESS or SavitzkyGolay') - if not isinstance(align, str) & (align in {'mean', 'median'}): - raise ValueError('align must be == mean or median') + if not isinstance(align, str) & (align in {'mean', 'median', 'no'}): + raise ValueError('align must be == mean, median or no') if not isinstance(parallelise, bool): raise TypeError("parallelise must be a boolean") if not isinstance(excludeFailures, bool): raise TypeError("excludeFailures must be a boolean") + if not isinstance(correctionSampleType,SampleType): + raise TypeError("correctionType must be a SampleType") with warnings.catch_warnings(): warnings.simplefilter('ignore', category=RuntimeWarning) correctedP = _batchCorrectionHead(data.intensityData, data.sampleMetadata['Run Order'].values, - (data.sampleMetadata['SampleType'].values == SampleType.StudyPool) & (data.sampleMetadata['AssayRole'].values == AssayRole.PrecisionReference), + (data.sampleMetadata['SampleType'].values == correctionSampleType) & (data.sampleMetadata['AssayRole'].values == AssayRole.PrecisionReference), data.sampleMetadata['Correction Batch'].values, window=window, method=method, @@ -101,8 +104,8 @@ def _batchCorrectionHead(data, runOrder, referenceSamples, batchList, window=11, if method is not None: if not isinstance(method, str) & (method in {'LOWESS', 'SavitzkyGolay'}): raise ValueError('method must be == LOWESS or SavitzkyGolay') - if not isinstance(align, str) & (align in {'mean', 'median'}): - raise ValueError('align must be == mean or median') + if not isinstance(align, str) & (align in {'mean', 'median', 'no'}): + raise ValueError('align must be == mean, median or no') if not isinstance(parallelise, bool): raise TypeError('parallelise must be True or False') if not isinstance(savePlots, bool): @@ -205,8 +208,8 @@ def _batchCorrection(data, runOrder, QCsamples, batchList, featureIndex, paramet featureAverage = numpy.mean(feature[QCsamples]) elif parameters['align'] == 'median': featureAverage = numpy.median(feature[QCsamples]) - else: - return numpy.zeros_like(data) + #else: + #return numpy.zeros_like(data) # Iterate over batches. for batch in batches: @@ -230,11 +233,11 @@ def _batchCorrection(data, runOrder, QCsamples, batchList, featureIndex, paramet batchMean = numpy.mean(feature[batchMask & QCsamples]) elif parameters['align'] == 'median': batchMean = numpy.median(feature[batchMask & QCsamples]) - else: - batchMean = numpy.nan_like(feature[batchMask]) - - feature[batchMask] = numpy.divide(feature[batchMask], batchMean) - feature[batchMask] = numpy.multiply(feature[batchMask], featureAverage) + #else: + # batchMean = numpy.nan_like(feature[batchMask]) + if parameters['align'] != 'no': + feature[batchMask] = numpy.divide(feature[batchMask], batchMean) + feature[batchMask] = numpy.multiply(feature[batchMask], featureAverage) # # If negative data mark for exclusion (occurs when too many QCsamples have intensity==0) # if sum(feature[batchMask]<0) != 0: # CJS 240816 @@ -258,11 +261,13 @@ def runOrderCompensation(data, runOrder, referenceSamples, parameters): # Select model # Optimisation of window would happen here. window = parameters['window'] + align = parameters['align'] if parameters['method'] == 'LOWESS': (data, fit) = doLOESScorrection(QCdata, QCrunorder, data, - runOrder, + runOrder, + align=align, window=window) elif parameters['method'] == 'SavitzkyGolay': (data, fit) = doSavitzkyGolayCorrection(QCdata, @@ -276,7 +281,7 @@ def runOrderCompensation(data, runOrder, referenceSamples, parameters): return (data, fit) -def doLOESScorrection(QCdata, QCrunorder, data, runorder, window=11): +def doLOESScorrection(QCdata, QCrunorder, data, runorder, align='median', window=11): """ Fit a LOWESS regression to the data. """ @@ -301,9 +306,12 @@ def doLOESScorrection(QCdata, QCrunorder, data, runorder, window=11): fit[fit < 0] = 0 corrected = numpy.divide(data, fit) - corrected = numpy.multiply(corrected, numpy.median(QCdata)) + if align == 'median': + corrected = numpy.multiply(corrected, numpy.median(QCdata)) + elif align == 'mean': + corrected = numpy.multiply(corrected, numpy.mean(QCdata)) - return (corrected, fit) + return corrected, fit def doSavitzkyGolayCorrection(QCdata, QCrunorder, data, runorder, window=11, polyOrder=3): @@ -323,7 +331,7 @@ def doSavitzkyGolayCorrection(QCdata, QCrunorder, data, runorder, window=11, pol corrected = numpy.divide(data, fit) corrected = numpy.multiply(corrected, numpy.median(QCdata)) - return (corrected, fit) + return corrected, fit def optimiseCorrection(feature, optimise): diff --git a/requirements.txt b/requirements.txt index cdd528cd..10c74db8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ cycler iPython isaExplorer isatools +MarkupSafe==2.0.1 Jinja2 lmfit matplotlib diff --git a/setup.py b/setup.py index 3aee1de4..a928b015 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages setup(name='nPYc', - version='1.2.6', + version='1.2.7', description='National Phenome Centre toolbox', url='https://github.com/phenomecentre/npyc-toolbox', author='National Phenome Centre',