RECETOX · hechth · Jul 25, 2024 · Dec 7, 2023 · Dec 7, 2023 · Dec 7, 2023
diff --git a/tools/ipapy2/ipapy2_MS1_annotation.py b/tools/ipapy2/ipapy2_MS1_annotation.py
@@ -0,0 +1,20 @@
+import click
+import pandas as pd
+from ipaPy2 import ipa
+
+
+@click.command()
+@click.option('--ipa_isotope_filename', 'ipa_isotope_filename', type=click.Path(exists=True), required=True)
+@click.option('--all_adducts_filename', 'all_adducts_filename', type=click.Path(exists=True), required=True)
+@click.option('--ppm', 'ppm', type=float, required=True)
+@click.option('--ncores', 'ncores', type=int, default=1, show_default=True)
+@click.option('--output_filename', 'output_filename', type=click.Path(writable=True), required=True)
+def cli(ipa_isotope_filename, all_adducts_filename, ppm, ncores, output_filename):
+    """Annotate an MS1 feature table with ipaPy2 and write the result as CSV.
+
+    Parameters
+    ----------
+    ipa_isotope_filename : path to the CSV holding the MS1 table.
+    all_adducts_filename : path to the CSV holding all possible adducts.
+    ppm : accuracy of the MS instrument, forwarded to ``ipa.MS1annotation``.
+    ncores : number of cores forwarded to ``ipa.MS1annotation``.
+    output_filename : path of the CSV file to write.
+    """
+    # The parameter names must match the option names declared above:
+    # click passes every option to the callback as a keyword argument.
+    ipa_isotope_table = pd.read_csv(ipa_isotope_filename)
+    all_adducts_table = pd.read_csv(all_adducts_filename)
+    # MS1annotation requires ``ppm`` and has no ``ionisation`` parameter.
+    annotations = ipa.MS1annotation(ipa_isotope_table, all_adducts_table, ppm=ppm, ncores=ncores)
+    # The ipaPy2 help describes the result as a dict mapping each feature id
+    # to a DataFrame; a dict has no ``to_csv``, so flatten it into a single
+    # table that records the feature id in its own column.
+    if isinstance(annotations, dict):
+        tables = [table.assign(feature_id=feature_id) for feature_id, table in annotations.items()]
+        annotations = pd.concat(tables, ignore_index=True) if tables else pd.DataFrame()
+    annotations.to_csv(output_filename, index=False)
+
+
+if __name__ == '__main__':
+    cli()
diff --git a/tools/ipapy2/ipapy2_MS1_annotation.xml b/tools/ipapy2/ipapy2_MS1_annotation.xml
@@ -0,0 +1,105 @@
+<tool id="ipapy2_MS1_annotation" name="4. IPA MS1 annotation" version="@TOOL_VERSION@+galaxy0" python_template_version="3.5" profile="21.05">
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+
+    <requirements>
+        <requirement type="package" version="@TOOL_VERSION@">ipapy2</requirement>
+    </requirements>
+
+    <command detect_errors="exit_code"><![CDATA[
+        python3 '${ipapy2_MS1_annotation_cli}'
+    ]]></command>
+
+<!-- Script rendered by Cheetah and executed by the command above: reads the
+     MS1 table and the adducts table, runs ipa.MS1annotation and writes the
+     annotations as CSV. -->
+<configfiles>
+<configfile name="ipapy2_MS1_annotation_cli">
+@init_logger@
+
+df = pd.read_csv('${MS1_table}')
+allAdds = pd.read_csv('${all_adducts}')
+ppm = $ppm
+## ppmunk and ppmthr are optional: accept both an empty rendering and the
+## literal "None" as "not provided" and fall back to the documented defaults.
+#if str($ppmunk).strip() in ("", "None")
+ppmunk = ppm
+#else
+ppmunk = $ppmunk
+#end if
+#if str($ppmthr).strip() in ("", "None")
+ppmthr = 2 * ppm
+#else
+ppmthr = $ppmthr
+#end if
+annotations = ipa.MS1annotation(df, allAdds, ppm=ppm, me=$me, ratiosd=$ratiosd, ppmunk=ppmunk, ratiounk=$ratiounk, ppmthr=ppmthr, pRTNone=$pRTNone, pRTout=$pRTout, ncores=$ncores)
+## The help below documents the result as a dict of per-feature dataframes;
+## flatten it into one table (feature id kept in its own column) before writing.
+if isinstance(annotations, dict):
+    tables = [table.assign(feature_id=feature_id) for feature_id, table in annotations.items()]
+    annotations = pd.concat(tables, ignore_index=True) if tables else pd.DataFrame()
+annotations.to_csv('${output}', index=False)
+</configfile>
+</configfiles>
+
+    <inputs>
+        <param label="MS1 table" name="MS1_table" type="data" format="csv" help="pandas dataframe containing the MS1 data." />
+        <param label="all possible adducts table" name="all_adducts" type="data" format="csv" help="pandas dataframe containing the information on all the possible adducts given the database." />
+        <param label="ppm" name="ppm" type="float" help="accuracy of the MS instrument used."/>
+        <param label="me" name="me" type="float" value="0.000548579909065" help="accurate mass of the electron. Default 5.48579909065e-04."/>
+        <param label="ratiosd" name="ratiosd" type="float" value="0.9" help="It represents the acceptable ratio between predicted intensity and observed intensity of isotopes. default 0.9."/>
+        <!-- Optional floats carry no default: "None" is not a valid float value. -->
+        <param label="ppmunk" name="ppmunk" type="float" optional="true" help="ppm associated to the 'unknown' annotation. If not provided equal to ppm."/>
+        <param label="ratiounk" name="ratiounk" type="float" value="0.5" help="isotope ratio associated to the 'unknown' annotation. If not provided equal to 0.5."/>
+        <param label="ppmthr" name="ppmthr" type="float" optional="true" help="Maximum ppm possible for the annotations. If not provided equal to 2*ppm."/>
+        <param label="pRTNone" name="pRTNone" type="float" value="0.8" help="Multiplicative factor for the RT if no RTrange present in the database. If not provided equal to 0.8."/>
+        <param label="pRTout" name="pRTout" type="float" value="0.4" help="Multiplicative factor for the RT if measured RT is outside the RTrange present in the database. If not provided equal to 0.4."/>
+        <expand macro="ncores"/>
+    </inputs>
+
+    <outputs>
+        <data label="${tool.name} on ${on_string}" name="output" format="csv"/>
+    </outputs>
+
+    <tests>
+        <test>
+            <param name="MS1_table" value="MS1_data.csv"/>
+            <param name="all_adducts" value="all_adducts.csv"/>
+            <param name="ppm" value="3"/>
+            <output name="output" file="annotations.csv"/>
+        </test>
+    </tests>
+
+    <help><![CDATA[
+    ::
+        MS1annotation(df, allAdds, ppm, me=0.000548579909065, ratiosd=0.9, ppmunk=None, ratiounk=None, ppmthr=None, pRTNone=None, pRTout=None, ncores=1)
+        Annotation of the dataset based on the MS1 information. Prior probabilities
+        are based on mass only, while post probabilities are based on mass, RT,
+        previous knowledge and isotope patterns.
+
+        Parameters
+        ----------
+        df: pandas dataframe containing the MS1 data. It should be the output of the
+            function ipa.map_isotope_patterns()
+        allAdds: pandas dataframe containing the information on all the possible
+                adducts given the database. It should be the output of either
+                ipa.compute_all_adducts() or ipa.compute_all_adducts_Parallel()
+        ppm: accuracy of the MS instrument used
+        me: accurate mass of the electron. Default 5.48579909065e-04
+        ratiosd: default 0.9. It represents the acceptable ratio between predicted
+                intensity and observed intensity of isotopes. It is used to compute
+                the shape parameters of the lognormal distribution used to
+                calculate the isotope pattern scores as sqrt(1/ratiosd)
+        ppmunk: ppm associated to the 'unknown' annotation. If not provided equal
+                to ppm.
+        ratiounk: isotope ratio associated to the 'unknown' annotation. If not
+                provided equal to 0.5
+        ppmthr: Maximum ppm possible for the annotations. If not provided equal to
+                2*ppm
+        pRTNone: Multiplicative factor for the RT if no RTrange present in the
+                database. If not provided equal to 0.8
+        pRTout: Multiplicative factor for the RT if measured RT is outside the
+                RTrange present in the database. If not provided equal to 0.4
+        ncores: default value 1. Number of cores used
+
+        Returns
+        -------
+        annotations: a dictionary containing all the possible annotations for the
+                    measured features. The keys of the dictionary are the unique
+                    ids for the features present in df. For each feature, the
+                    annotations are summarized in a pandas dataframe.
+    ]]></help>
+
+    <expand macro="citations"/>
+</tool>
diff --git a/tools/ipapy2/ipapy2_MS2_annotation.py b/tools/ipapy2/ipapy2_MS2_annotation.py
diff --git a/tools/ipapy2/ipapy2_MS2_annotation.xml b/tools/ipapy2/ipapy2_MS2_annotation.xml
@@ -0,0 +1,121 @@
+<tool id="ipapy2_MS2_annotation" name="5. IPA MS2 annotation" version="@TOOL_VERSION@+galaxy0" python_template_version="3.5" profile="21.05">
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+
+    <requirements>
+        <requirement type="package" version="@TOOL_VERSION@">ipapy2</requirement>
+    </requirements>
+
+    <command detect_errors="exit_code"><![CDATA[
+        python3 '${ipapy2_MS2_annotation_cli}'
+    ]]></command>
+
+<!-- Script rendered by Cheetah and executed by the command above: reads the
+     MS1/MS2 tables, the adducts table and the MS2 database, runs
+     ipa.MSMSannotation and writes the annotations as CSV. -->
+<configfiles>
+<configfile name="ipapy2_MS2_annotation_cli">
+@init_logger@
+
+df = pd.read_csv('${MS1_table}')
+dfMS2 = pd.read_csv('${MS2_table}')
+allAdds = pd.read_csv('${all_adducts}')
+DBMS2 = pd.read_csv('${DBMS2_table}')
+ppm = $ppm
+## ppmunk and ppmthr are optional: accept both an empty rendering and the
+## literal "None" as "not provided" and fall back to the documented defaults.
+#if str($ppmunk).strip() in ("", "None")
+ppmunk = ppm
+#else
+ppmunk = $ppmunk
+#end if
+#if str($ppmthr).strip() in ("", "None")
+ppmthr = 2 * ppm
+#else
+ppmthr = $ppmthr
+#end if
+annotations = ipa.MSMSannotation(df, dfMS2, allAdds, DBMS2, ppm=ppm, me=$me, ratiosd=$ratiosd, ppmunk=ppmunk, ratiounk=$ratiounk, ppmthr=ppmthr, pRTNone=$pRTNone, pRTout=$pRTout, mzdCS=$mzdCS, ppmCS=$ppmCS, CSunk=$CSunk, evfilt=$evfilt, ncores=$ncores)
+## The help below documents the result as a dict of per-feature dataframes;
+## flatten it into one table (feature id kept in its own column) before writing.
+if isinstance(annotations, dict):
+    tables = [table.assign(feature_id=feature_id) for feature_id, table in annotations.items()]
+    annotations = pd.concat(tables, ignore_index=True) if tables else pd.DataFrame()
+annotations.to_csv('${output}', index=False)
+</configfile>
+</configfiles>
+
+    <inputs>
+        <param label="MS1 table" name="MS1_table" type="data" format="csv" help="pandas dataframe containing the MS1 data." />
+        <param label="MS2 table" name="MS2_table" type="data" format="csv" help="pandas dataframe containing the MS2 data." />
+        <param label="all possible adducts table" name="all_adducts" type="data" format="csv" help="pandas dataframe containing the information on all the possible adducts given the database." />
+        <param label="DBMS2 table" name="DBMS2_table" type="data" format="csv" help="pandas dataframe containing the database containing the MS2 information." />
+        <param label="ppm" name="ppm" type="float" help="accuracy of the MS instrument used."/>
+        <param label="me" name="me" type="float" value="0.000548579909065" help="accurate mass of the electron. Default 5.48579909065e-04."/>
+        <param label="ratiosd" name="ratiosd" type="float" value="0.9" help="It represents the acceptable ratio between predicted intensity and observed intensity of isotopes. default 0.9."/>
+        <!-- A param default cannot reference another parameter ('$ppm' is not
+             evaluated here), so the ppm-derived defaults are computed in the
+             configfile when these optional values are left empty. -->
+        <param label="ppmunk" name="ppmunk" type="float" optional="true" help="ppm associated to the 'unknown' annotation. If not provided equal to ppm."/>
+        <param label="ratiounk" name="ratiounk" type="float" value="0.5" help="isotope ratio associated to the 'unknown' annotation. If not provided equal to 0.5."/>
+        <param label="ppmthr" name="ppmthr" type="float" optional="true" help="Maximum ppm possible for the annotations. If not provided equal to 2*ppm."/>
+        <param label="pRTNone" name="pRTNone" type="float" value="0.8" help="Multiplicative factor for the RT if no RTrange present in the database. If not provided equal to 0.8."/>
+        <param label="pRTout" name="pRTout" type="float" value="0.4" help="Multiplicative factor for the RT if measured RT is outside the RTrange present in the database. If not provided equal to 0.4."/>
+        <param label="mzdCS" name="mzdCS" type="float" value="0" help="maximum mz difference allowed when computing cosine similarity scores. If one wants to use this parameter instead of ppmCS, this must be set to 0. Default 0."/>
+        <param label="ppmCS" name="ppmCS" type="float" value="10" help="maximum ppm allowed when computing cosine similarity scores. If one wants to use this parameter instead of mzdCS, this must be set to 0. Default 10."/>
+        <param label="CSunk" name="CSunk" type="float" value="0.7" help="cosine similarity score associated with the 'unknown' annotation. Default 0.7."/>
+        <!-- truevalue/falsevalue make the parameter render as a Python literal
+             inside the generated script. -->
+        <param label="evfilt" name="evfilt" type="boolean" checked="false" truevalue="True" falsevalue="False" help="Default value False. If true, only spectrum acquired with the same collision energy are considered."/>
+        <expand macro="ncores"/>
+    </inputs>
+
+    <outputs>
+        <data label="${tool.name} on ${on_string}" name="output" format="csv"/>
+    </outputs>
+
+    <tests>
+        <test>
+            <param name="MS1_table" value="MS1_data.csv"/>
+            <param name="MS2_table" value="MS2_data.csv"/>
+            <param name="all_adducts" value="all_adducts.csv"/>
+            <param name="DBMS2_table" value="MS2_DB.csv"/>
+            <param name="ppm" value="3"/>
+            <output name="output" file="annotations.csv"/>
+        </test>
+    </tests>
+
+    <help><![CDATA[
+    ::
+        MSMSannotation(df, dfMS2, allAdds, DBMS2, ppm, me=0.000548579909065, ratiosd=0.9, ppmunk=None, ratiounk=None, ppmthr=None, pRTNone=None, pRTout=None, mzdCS=0, ppmCS=10, CSunk=0.7, evfilt=False, ncores=1)
+        Annotation of the dataset based on the MS1 and MS2 information. Prior
+        probabilities are based on mass only, while post probabilities are based
+        on mass, RT, previous knowledge and isotope patterns.
+
+        Parameters
+        ----------
+        df: pandas dataframe containing the MS1 data. It should be the output of the
+            function ipa.map_isotope_patterns()
+        dfMS2: pandas dataframe containing the MS2 data. It must contain 3 columns
+            -id: an unique id for each feature for which the MS2 spectrum was
+                acquired (same as in df)
+            -spectrum: string containing the spectrum information in the following
+                    format 'mz1:Int1 mz2:Int2 mz3:Int3 ...'
+            -ev: collision energy used to acquire the fragmentation spectrum
+        allAdds: pandas dataframe containing the information on all the possible
+                adducts given the database. It should be the output of either
+                ipa.compute_all_adducts() or ipa.compute_all_adducts_Parallel()
+        DBMS2: pandas dataframe containing the database containing the MS2
+            information
+        ppm: accuracy of the MS instrument used
+        me: accurate mass of the electron. Default 5.48579909065e-04
+        ratiosd: default 0.9. It represents the acceptable ratio between predicted
+                intensity and observed intensity of isotopes. It is used to compute
+                the shape parameters of the lognormal distribution used to
+                calculate the isotope pattern scores as sqrt(1/ratiosd)
+        ppmunk: ppm associated to the 'unknown' annotation. If not provided equal
+                to ppm.
+        ratiounk: isotope ratio associated to the 'unknown' annotation. If not
+                provided equal to 0.5
+        ppmthr: Maximum ppm possible for the annotations. If not provided equal to
+                2*ppm
+        pRTNone: Multiplicative factor for the RT if no RTrange present in the
+                database. If not provided equal to 0.8
+        pRTout: Multiplicative factor for the RT if measured RT is outside the
+                RTrange present in the database. If not provided equal to 0.4
+        mzdCS: maximum mz difference allowed when computing cosine similarity
+            scores. If one wants to use this parameter instead of ppmCS, this
+            must be set to 0. Default 0.
+        ppmCS: maximum ppm allowed when computing cosine similarity scores.
+            If one wants to use this parameter instead of mzdCS, this must be
+            set to 0. Default 10.
+        CSunk: cosine similarity score associated with the 'unknown' annotation.
+                Default 0.7
+        evfilt: Default value False. If true, only spectrum acquired with the same
+                collision energy are considered.
+        ncores: default value 1. Number of cores used
+
+        Returns
+        -------
+        annotations: a dictionary containing all the possible annotations for the
+                    measured features. The keys of the dictionary are the unique
+                    ids for the features present in df. For each feature, the
+                    annotations are summarized in a pandas dataframe.
+    ]]></help>
+
+    <expand macro="citations"/>
+</tool>
diff --git a/tools/ipapy2/ipapy2_all_adducts.py b/tools/ipapy2/ipapy2_all_adducts.py
@@ -0,0 +1,22 @@
+import argparse
+import sys
+import pandas as pd
+from ipaPy2 import ipa
+
+
+def main(argv):
+    """Compute all possible adducts for a database and write them as CSV.
+
+    Parameters
+    ----------
+    argv : list of str
+        Command-line arguments without the program name: the adducts CSV,
+        the database CSV and the output CSV path, plus the optional
+        ``--ionisation`` (1 or -1) and ``--ncores`` flags.
+    """
+    parser = argparse.ArgumentParser(description="compute all possible adducts.")
+    parser.add_argument("adducts_filename", type=str, help="a dataframe containing all possible adducts.")
+    parser.add_argument("DB_filename", type=str, help="a dataframe of database.")
+    # ``choices`` is checked after the ``type`` conversion, so it must hold ints.
+    parser.add_argument("--ionisation", type=int, default=1, choices=[1, -1], help="ionisation. +1 or -1.")
+    parser.add_argument("--ncores", type=int, default=1, help="number of cores.")
+    parser.add_argument("output_filename", type=str, help="a dataframe containing all possible adducts given the database.")
+    # Parse the arguments handed in by the caller rather than re-reading sys.argv.
+    args = parser.parse_args(argv)
+
+    adducts_table = pd.read_csv(args.adducts_filename)
+    DB_table = pd.read_csv(args.DB_filename)
+    all_adducts = ipa.compute_all_adducts(adducts_table, DB_table, ionisation=args.ionisation, ncores=args.ncores)
+    all_adducts.to_csv(args.output_filename, index=False)
+
+if __name__ == '__main__':
+    main(argv=sys.argv[1:])
diff --git a/tools/ipapy2/ipapy2_all_adducts.xml b/tools/ipapy2/ipapy2_all_adducts.xml
@@ -0,0 +1,94 @@
+<tool id="ipapy2_all_adducts" name="1. IPA compute all adducts" version="@TOOL_VERSION@+galaxy0" python_template_version="3.5" profile="21.05">
-<tool id="ipapy2_all_adducts" name="1. IPA compute all adducts" version="@TOOL_VERSION@+galaxy0" python_template_version="3.5" profile="21.05">
+<tool id="ipapy2_all_adducts" name="IPA compute database with adducts" version="@TOOL_VERSION@+galaxy0"  profile="21.05">
-<tool id="ipapy2_all_adducts" name="1. IPA compute all adducts" version="@TOOL_VERSION@+galaxy0" python_template_version="3.5" profile="21.05">
+<tool id="ipapy2_all_adducts" name="IPA compute database with adducts" version="@TOOL_VERSION@+galaxy0"  profile="21.05">
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+
+    <requirements>
+        <requirement type="package" version="@TOOL_VERSION@">ipapy2</requirement>
+    </requirements>
+
+    <command detect_errors="exit_code"><![CDATA[
+        python3 '${ipapy2_all_adducts_cli}'
+    ]]></command>
+
+<!-- Script rendered by Cheetah: reads the adducts and database tables, runs
+     ipa.compute_all_adducts with the selected polarity and writes a CSV.
+     NOTE(review): the tool test passes ionisation="1" while this template
+     compares against "positive"; confirm the option values declared by the
+     "ionisation" macro in macros.xml. -->
+<configfiles>
+<configfile name="ipapy2_all_adducts_cli">
+@init_logger@
+
+adducts_table = pd.read_csv('${adducts_table}')
+DB_table = pd.read_csv('${DB_table}')
+## Map the selected polarity to the +1 / -1 flag expected by ipaPy2.
+#if $ionisation == "positive"
+ionisation = 1
+#else
+ionisation = -1
+#end if
+all_adducts = ipa.compute_all_adducts(adducts_table, DB_table, ionisation=ionisation, ncores=$ncores)
+all_adducts.to_csv('${output}', index=False)
+</configfile>
+</configfiles>
+
+    <inputs>
+            <param label="Adducts table" name="adducts_table" type="data" format="csv" help="Dataframe containing information on all possible adducts."/>
+            <param label="DB table" name="DB_table" type="data" format="csv" help="Dataframe containing a database."/>
+            <expand macro="ionisation"/>
+            <expand macro="ncores"/>
+    </inputs>
+
+    <outputs>
+            <data label="${tool.name} on ${on_string}" name="output" format="csv"/>
+    </outputs>
+
+    <tests>
+        <test>
+            <param name="adducts_table" value="adducts.csv"/>
+            <param name="DB_table" value="DB.csv"/>
+            <param name="ionisation" value="1"/>
+            <output name="output" file="all_adducts.csv"/>
+        </test>
+    </tests>
+
+    <help><![CDATA[
+    ::
+        compute_all_adducts(adductsAll, DB, ionisation=1, ncores=1)
+        compute all adducts table based on the information present in the database
+
+        Parameters
+        ----------
+        adductsAll : pandas dataframe (necessary)
+                    Dataframe containing information on all possible
+                    adducts. The file must be in the same format as the example
+                    provided in the DB/adducts.csv
+        DB : pandas dataframe (necessary)
+            Dataframe containing the database against which the annotation is
+            performed. The DB must contain the following columns in this exact
+            order (optional fields can contain None):
+                - id: unique id of the database entry (e.g., 'C00031') - necessary
+                - name: compound name (e.g., 'D-Glucose') - necessary
+                - formula: chemical formula (e.g., 'C6H12O6') - necessary
+                - inchi: inchi string - optional
+                - smiles: smiles string - optional
+                - RT: if known, retention time range (in seconds) where this
+                    compound is expected to elute (e.g., '30;60') - optional
+                - adductsPos: list of adducts that should be considered in
+                            positive mode for this entry (e.g.,'M+Na;M+H;M+')
+                - adductsNeg: list of adducts that should be considered in
+                            negative mode for this entry (e.g.,'M-H;M-2H')
+                - description: comments on the entry - optional
+                - pk: previous knowledge on the likelihood of this compound to be
+                    present in the sample analysed. The value has to be between
+                    1 (compound likely to be present in the sample) and 0
+                    (compound cannot be present in the sample).
+                - MS2: id for the MS2 database entries related to this compound
+                        (optional)
+                - reactions: list of reactions ids involving this compound
+                            (e.g., 'R00010 R00015 R00028')-optional 
+        ionisation : Default value 1. positive = 1, negative = -1
+        ncores : default value 1. Number of cores used
+
+        Returns
+        -------
+        allAdds: pandas dataframe containing the information on all the possible
+        adducts given the database.
+    ]]></help>
+
+    <expand macro="citations"/>
+</tool>