Skip to content

Commit

Permalink
Feature/batch correction updates (#89)
Browse files Browse the repository at this point in the history
* fix for 'nonposy' deprecation in ax.set_xscale/set_yscale (#65)
* fix for 'nonposy' deprecation in ax.set_xscale/set_yscale
* fix to pqn reference bug
* Feature/mz ml import (#67)
* New general batch inference function
* refactor extractParams to support more data types
* add mzMLparamRead
* small tweaks to unit test pandas imports
* changes to batch_inference and dilution series
* changes to reports
* seaborn swarmplot changed to stripplot
* - fix for duplicated samples in extractParams and inferBatches
* - updates to python version
* - bugfix in batch inference and amendBatch: Automated 'Correction Batch' suggestions always start at 1.
* added correctionSampleType to correctMSdataset to enable correction against any SampleType
* - fix for numpy.matlib deprecation
* - version bump
* - .github actions builds update
* - add support for BI-QUANT-PS 2.0
- modify unit tests
* - add support for BI-QUANT-PS 2.0
- modify unit tests
* - numpy.int -> int and pandas.util.testing : preventive future deprecation fix
* - add align option = 'no' to batch correction
* - test_rocorrection_synthetic - changes for new align parameter
* - tentative fix for #77
* - modification to batch correction unit tests for new parameters
* - add test for extractParams() with filetype='mzML'
* - merge changes from mzML import branch
* - extractParams test fix
* - fix for #37
* - rtol for test_batchandroccorrection increased for macos
* - update version
* add wheel to requirements
* Update requirements.txt
Co-authored-by: Gordon Haggart <[email protected]>
Co-authored-by: ghaggart <[email protected]>
  • Loading branch information
Gscorreia89 authored Aug 9, 2022
1 parent 614339d commit 2bb6b5b
Show file tree
Hide file tree
Showing 5 changed files with 118 additions and 44 deletions.
113 changes: 89 additions & 24 deletions Tests/test_batchandrocorrection.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,43 +2,62 @@
Test that batch and run-order correction behaves sensibly with a combination of synthetic and model datasets.
"""

import scipy
import pandas
import numpy
import seaborn as sns
import sys
import unittest
import os

sys.path.append("..")
import nPYc
from generateTestDataset import generateTestDataset
from nPYc.enumerations import SampleType


class test_rocorrection(unittest.TestCase):
"""
Use stored and synthetic datasets to validate run-order correction.
"""

def test_correctmsdataset_parallelisation(self):
"""
Check that parallel and single threaded code paths return the same result.
"""
def setUp(self):
self.noSamp = numpy.random.randint(500, high=2000, size=None)
self.noFeat = numpy.random.randint(50, high=100, size=None)

self.msData = generateTestDataset(self.noSamp, self.noFeat, dtype='MSDataset')
self.msDataERCorrection = generateTestDataset(20, 2, dtype='MSDataset')
self.msDataERCorrection._intensityData[self.msDataERCorrection.sampleMetadata['SampleType'] == SampleType.StudyPool, :] = 0.5
self.msDataERCorrection._intensityData[self.msDataERCorrection.sampleMetadata['SampleType'] == SampleType.StudySample, :] = 2
self.msDataERCorrection._intensityData[self.msDataERCorrection.sampleMetadata['SampleType'] == SampleType.ExternalReference, :] = 1

def test_correctMSDataset(self):
with self.subTest(msg='Test correctMSdataset parallelisation'):
"""
Check that parallel and single threaded code paths return the same result.
"""

correctedDataP = nPYc.batchAndROCorrection.correctMSdataset(self.msData, parallelise=True)
correctedDataS = nPYc.batchAndROCorrection.correctMSdataset(self.msData, parallelise=False)

noSamp = numpy.random.randint(500, high=2000, size=None)
noFeat = numpy.random.randint(50, high=100, size=None)
numpy.testing.assert_array_almost_equal(correctedDataP.fit, correctedDataS.fit, err_msg="Serial and parallel fits not equal.")

msData = generateTestDataset(noSamp, noFeat, dtype='MSDataset')
numpy.testing.assert_array_almost_equal(correctedDataP.intensityData, correctedDataS.intensityData, err_msg="Serial and parallel corrected data not equal.")

correctedDataP = nPYc.batchAndROCorrection.correctMSdataset(msData, parallelise=True)
correctedDataS = nPYc.batchAndROCorrection.correctMSdataset(msData, parallelise=False)
with self.subTest(msg='Test correctMSdataset correction sample selection'):
"""
Check that parallel and single threaded code paths return the same result.
"""
expectedCorrectedERDataFit = numpy.ones((20, 2))

numpy.testing.assert_array_almost_equal(correctedDataP.fit, correctedDataS.fit, err_msg="Serial and parallel fits not equal.")
expectedCorrectedERDataIntensity = numpy.ones((20, 2))
expectedCorrectedERDataIntensity[self.msDataERCorrection.sampleMetadata['SampleType'] == SampleType.StudyPool, :] = 0.5
expectedCorrectedERDataIntensity[self.msDataERCorrection.sampleMetadata['SampleType'] == SampleType.StudySample, :] = 2

numpy.testing.assert_array_almost_equal(correctedDataP.intensityData, correctedDataS.intensityData, err_msg="Serial and parallel corrected data not equal.")
correctedDataER = nPYc.batchAndROCorrection.correctMSdataset(self.msDataERCorrection, parallelise=False, correctionSampleType=SampleType.ExternalReference)

numpy.testing.assert_array_almost_equal(correctedDataER.fit, expectedCorrectedERDataFit, err_msg="Correction trendlines are not equal.")

class test_rocorrection_sythetic(unittest.TestCase):
numpy.testing.assert_array_almost_equal(correctedDataER.intensityData, expectedCorrectedERDataIntensity, err_msg="Corrected intensities are not equal")


class test_rocorrection_synthetic(unittest.TestCase):

def setUp(self):

Expand All @@ -64,10 +83,12 @@ def test_runOrderCompensation_synthetic(self):
(only testing LOWESS at present)
"""

(corrected,fit) = nPYc.batchAndROCorrection._batchAndROCorrection.runOrderCompensation(self.testD,
corrected, fit = nPYc.batchAndROCorrection._batchAndROCorrection.runOrderCompensation(self.testD,
self.testRO,
self.testSRmask,
{'window':11, 'method':'LOWESS'})
{'window': 11,
'method': 'LOWESS',
'align': 'median'})

numpy.testing.assert_array_almost_equal(self.testD, fit)
numpy.testing.assert_array_almost_equal(numpy.std(corrected), 0.)
Expand All @@ -77,7 +98,7 @@ def test_doLOESScorrection_synthetic(self):
Correction of a monotonically increasing trend should be essentially perfect.
"""

(corrected,fit) = nPYc.batchAndROCorrection._batchAndROCorrection.doLOESScorrection(self.testD[self.testSRmask],
corrected, fit = nPYc.batchAndROCorrection._batchAndROCorrection.doLOESScorrection(self.testD[self.testSRmask],
self.testRO[self.testSRmask],
self.testD,
self.testRO,
Expand All @@ -86,7 +107,7 @@ def test_doLOESScorrection_synthetic(self):
numpy.testing.assert_array_almost_equal(self.testD, fit)
numpy.testing.assert_array_almost_equal(numpy.std(corrected), 0.)

def test_batchCorrection_sythetic(self):
def test_batchCorrection_synthetic(self):

# We need at least two features - pick which randomly
testD2 = numpy.array([self.testD, self.testD]).T
Expand All @@ -97,7 +118,7 @@ def test_batchCorrection_sythetic(self):
self.testSRmask,
numpy.ones_like(self.testRO),
[featureNo],
{'align':'mean', 'window':11, 'method':'LOWESS'},
{'align': 'mean', 'window': 11, 'method': 'LOWESS'},
0)

featureNo_out = output[featureNo][0]
Expand Down Expand Up @@ -147,7 +168,7 @@ def setUp(self):
sigma = numpy.random.randn(1)
self.testD[batchMask] = sigma * numpy.random.randn(noSamples) + batchMean

def test_batchCorrection_sythetic(self):
def test_batchCorrection_synthetic(self):
"""
Check we can correct the offset in averages of two normal distributions
"""
Expand All @@ -163,7 +184,7 @@ def test_batchCorrection_sythetic(self):
self.testSRmask,
self.batch,
[featureNo],
{'align':'mean', 'window':11, 'method':None},
{'align': 'mean', 'window': 11, 'method': None},
0)

means = list()
Expand All @@ -182,7 +203,7 @@ def test_batchCorrection_sythetic(self):
self.testSRmask,
self.batch,
[featureNo],
{'align':'median', 'window':11, 'method':None},
{'align': 'median', 'window': 11, 'method': None},
0)

medians = list()
Expand All @@ -193,6 +214,50 @@ def test_batchCorrection_sythetic(self):

numpy.testing.assert_allclose(medians, overallMedian)

with self.subTest(msg='Checking alignment=\'none\', with method == None'):

output = nPYc.batchAndROCorrection._batchAndROCorrection._batchCorrection(testD2,
self.testRO,
self.testSRmask,
self.batch,
[featureNo],
{'align': 'no', 'window': 11, 'method': None},
0)
# Check if means are unchanged before and after correction (with method == None)
means = list()
batchWiseMeans = list()
batches = list(set(self.batch))
for batch in batches:
feature = output[featureNo][1]
batchWiseMeans.append(numpy.mean(self.testD[(self.batch == batch) & self.testSRmask]))
means.append(numpy.mean(feature[(self.batch == batch) & self.testSRmask]))

numpy.testing.assert_allclose(means, batchWiseMeans)

with self.subTest(msg='Checking alignment=\'none\', with method == LOWESS'):

output = nPYc.batchAndROCorrection._batchAndROCorrection._batchCorrection(
testD2,
self.testRO,
self.testSRmask,
self.batch,
[featureNo],
{'align': 'no', 'window': 11, 'method': 'LOWESS'},
0)

# Check if means are unchanged before and after correction (with method == 'LOWESS')
# Without alignment, the mean of corrected data is expected to be close to 1 if batch
# contains positive values and -np.inf if batch contains negative values
batches = list(set(self.batch))
expectedMeans = numpy.array([1 if numpy.mean(self.testD[(self.batch == x) & self.testSRmask]) > 0 else -numpy.inf for x in batches])
means = list()
for batch in batches:
feature = output[featureNo][1]
means.append(numpy.mean(
feature[(self.batch == batch) & self.testSRmask]))

numpy.testing.assert_allclose(means, expectedMeans, rtol=1e-02)

def test_correctMSdataset_raises(self):

with self.subTest(msg='Object type'):
Expand Down
2 changes: 1 addition & 1 deletion nPYc/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""
The `nPYc-Toolbox <https://github.com/phenomecentre/nPYc-Toolbox>`_ defines objects for representing, and implements functions to manipulate and display, metabolic profiling datasets.
"""
__version__ = '1.2.6'
__version__ = '1.2.7'

from . import enumerations
from .objects import Dataset, MSDataset, NMRDataset, TargetedDataset
Expand Down
44 changes: 26 additions & 18 deletions nPYc/batchAndROCorrection/_batchAndROCorrection.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from ..enumerations import AssayRole, SampleType


def correctMSdataset(data, window=11, method='LOWESS', align='median', parallelise=True, excludeFailures=True):
def correctMSdataset(data, window=11, method='LOWESS', align='median', parallelise=True, excludeFailures=True, correctionSampleType=SampleType.StudyPool):
"""
Conduct run-order correction and batch alignment on the :py:class:`~nPYc.objects.MSDataset` instance *data*, returning a new instance with corrected intensity values.
Expand All @@ -31,6 +31,7 @@ def correctMSdataset(data, window=11, method='LOWESS', align='median', paralleli
:param str align: Average calculation of batch and feature intensity for correction, one of 'median' (default) or 'mean'
:param bool parallelise: If ``True``, use multiple cores
:param bool excludeFailures: If ``True``, remove features where a correct fit could not be calculated from the dataset
:param enum correctionSampleType: Which SampleType to use for the correction, default SampleType.StudyPool
:return: Duplicate of *data*, with run-order correction applied
:rtype: MSDataset
"""
Expand All @@ -44,19 +45,21 @@ def correctMSdataset(data, window=11, method='LOWESS', align='median', paralleli
if method is not None:
if not isinstance(method, str) & (method in {'LOWESS', 'SavitzkyGolay'}):
raise ValueError('method must be == LOWESS or SavitzkyGolay')
if not isinstance(align, str) & (align in {'mean', 'median'}):
raise ValueError('align must be == mean or median')
if not isinstance(align, str) & (align in {'mean', 'median', 'no'}):
raise ValueError('align must be == mean, median or no')
if not isinstance(parallelise, bool):
raise TypeError("parallelise must be a boolean")
if not isinstance(excludeFailures, bool):
raise TypeError("excludeFailures must be a boolean")
if not isinstance(correctionSampleType,SampleType):
raise TypeError("correctionType must be a SampleType")

with warnings.catch_warnings():
warnings.simplefilter('ignore', category=RuntimeWarning)

correctedP = _batchCorrectionHead(data.intensityData,
data.sampleMetadata['Run Order'].values,
(data.sampleMetadata['SampleType'].values == SampleType.StudyPool) & (data.sampleMetadata['AssayRole'].values == AssayRole.PrecisionReference),
(data.sampleMetadata['SampleType'].values == correctionSampleType) & (data.sampleMetadata['AssayRole'].values == AssayRole.PrecisionReference),
data.sampleMetadata['Correction Batch'].values,
window=window,
method=method,
Expand Down Expand Up @@ -101,8 +104,8 @@ def _batchCorrectionHead(data, runOrder, referenceSamples, batchList, window=11,
if method is not None:
if not isinstance(method, str) & (method in {'LOWESS', 'SavitzkyGolay'}):
raise ValueError('method must be == LOWESS or SavitzkyGolay')
if not isinstance(align, str) & (align in {'mean', 'median'}):
raise ValueError('align must be == mean or median')
if not isinstance(align, str) & (align in {'mean', 'median', 'no'}):
raise ValueError('align must be == mean, median or no')
if not isinstance(parallelise, bool):
raise TypeError('parallelise must be True or False')
if not isinstance(savePlots, bool):
Expand Down Expand Up @@ -205,8 +208,8 @@ def _batchCorrection(data, runOrder, QCsamples, batchList, featureIndex, paramet
featureAverage = numpy.mean(feature[QCsamples])
elif parameters['align'] == 'median':
featureAverage = numpy.median(feature[QCsamples])
else:
return numpy.zeros_like(data)
#else:
#return numpy.zeros_like(data)

# Iterate over batches.
for batch in batches:
Expand All @@ -230,11 +233,11 @@ def _batchCorrection(data, runOrder, QCsamples, batchList, featureIndex, paramet
batchMean = numpy.mean(feature[batchMask & QCsamples])
elif parameters['align'] == 'median':
batchMean = numpy.median(feature[batchMask & QCsamples])
else:
batchMean = numpy.nan_like(feature[batchMask])

feature[batchMask] = numpy.divide(feature[batchMask], batchMean)
feature[batchMask] = numpy.multiply(feature[batchMask], featureAverage)
#else:
# batchMean = numpy.nan_like(feature[batchMask])
if parameters['align'] != 'no':
feature[batchMask] = numpy.divide(feature[batchMask], batchMean)
feature[batchMask] = numpy.multiply(feature[batchMask], featureAverage)

# # If negative data mark for exclusion (occurs when too many QCsamples have intensity==0)
# if sum(feature[batchMask]<0) != 0: # CJS 240816
Expand All @@ -258,11 +261,13 @@ def runOrderCompensation(data, runOrder, referenceSamples, parameters):
# Select model
# Optimisation of window would happen here.
window = parameters['window']
align = parameters['align']
if parameters['method'] == 'LOWESS':
(data, fit) = doLOESScorrection(QCdata,
QCrunorder,
data,
runOrder,
runOrder,
align=align,
window=window)
elif parameters['method'] == 'SavitzkyGolay':
(data, fit) = doSavitzkyGolayCorrection(QCdata,
Expand All @@ -276,7 +281,7 @@ def runOrderCompensation(data, runOrder, referenceSamples, parameters):
return (data, fit)


def doLOESScorrection(QCdata, QCrunorder, data, runorder, window=11):
def doLOESScorrection(QCdata, QCrunorder, data, runorder, align='median', window=11):
"""
Fit a LOWESS regression to the data.
"""
Expand All @@ -301,9 +306,12 @@ def doLOESScorrection(QCdata, QCrunorder, data, runorder, window=11):
fit[fit < 0] = 0

corrected = numpy.divide(data, fit)
corrected = numpy.multiply(corrected, numpy.median(QCdata))
if align == 'median':
corrected = numpy.multiply(corrected, numpy.median(QCdata))
elif align == 'mean':
corrected = numpy.multiply(corrected, numpy.mean(QCdata))

return (corrected, fit)
return corrected, fit


def doSavitzkyGolayCorrection(QCdata, QCrunorder, data, runorder, window=11, polyOrder=3):
Expand All @@ -323,7 +331,7 @@ def doSavitzkyGolayCorrection(QCdata, QCrunorder, data, runorder, window=11, pol
corrected = numpy.divide(data, fit)
corrected = numpy.multiply(corrected, numpy.median(QCdata))

return (corrected, fit)
return corrected, fit


def optimiseCorrection(feature, optimise):
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ cycler
iPython
isaExplorer
isatools
MarkupSafe==2.0.1
Jinja2
lmfit
matplotlib
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from setuptools import setup, find_packages

setup(name='nPYc',
version='1.2.6',
version='1.2.7',
description='National Phenome Centre toolbox',
url='https://github.com/phenomecentre/npyc-toolbox',
author='National Phenome Centre',
Expand Down

0 comments on commit 2bb6b5b

Please sign in to comment.