Add collate.py #160

Merged
merged 23 commits on Jun 17, 2022
Changes from 19 commits
1 change: 1 addition & 0 deletions .github/workflows/codecov.yml
@@ -20,6 +20,7 @@ jobs:
pip install pytest
pip install pytest-cov
pip install -r requirements.txt
pip install .[collate]
pytest --cov=./ --cov-report=xml
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v1
1 change: 1 addition & 0 deletions .github/workflows/python-app.yml
@@ -31,6 +31,7 @@ jobs:
python -m pip install --upgrade pip
pip install pytest
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
pip install .[collate]
- name: Test with pytest
run: |
pytest
4 changes: 4 additions & 0 deletions .gitignore
@@ -6,3 +6,7 @@ build
.coverage*
.idea/*
/dist
*.sqlite
pycytominer/tests/test_data/collate/backend/**/*.csv
!pycytominer/tests/test_data/collate/backend/**/*master.csv

9 changes: 9 additions & 0 deletions README.md
@@ -46,6 +46,15 @@ Since the project is actively being developed, with new features added regularly
pip install git+git://github.com/cytomining/pycytominer@2aa8638d7e505ab510f1d5282098dd59bb2cb470
```

If you want to perform data collation inside pycytominer, you will need an extra package; this changes the installation command slightly:

```bash
# Example for the general case (latest commit):
pip install "pycytominer[collate] @ git+git://github.com/cytomining/pycytominer"
# Example for specific commit:
pip install "pycytominer[collate] @ git+git://github.com/cytomining/pycytominer@2aa8638d7e505ab510f1d5282098dd59bb2cb470"
```
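
With the extra installed, the collation helper can be imported before use; a minimal check, assuming the `[collate]` extra installed cleanly:

```python
# Minimal check that the collate helper is importable after installation.
# Collation itself also shells out to the cytominer-database command-line
# tool, which the [collate] extra is intended to provide.
from pycytominer.cyto_utils.collate import collate

help(collate)  # prints the full parameter documentation
```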

## Usage

Using pycytominer is simple and fun.
8 changes: 8 additions & 0 deletions docs/pycytominer.cyto_utils.rst
@@ -12,6 +12,14 @@ pycytominer.cyto\_utils.cells module
:undoc-members:
:show-inheritance:

pycytominer.cyto\_utils.collate module
----------------------------------------

.. automodule:: pycytominer.cyto_utils.collate
:members:
:undoc-members:
:show-inheritance:

pycytominer.cyto\_utils.features module
---------------------------------------

8 changes: 8 additions & 0 deletions docs/pycytominer.tests.test_cyto_utils.rst
@@ -20,6 +20,14 @@ pycytominer.tests.test\_cyto\_utils.test\_cells module
:undoc-members:
:show-inheritance:

pycytominer.tests.test\_cyto\_utils.test\_collate module
----------------------------------------------------------

.. automodule:: pycytominer.tests.test_cyto_utils.test_collate
:members:
:undoc-members:
:show-inheritance:

pycytominer.tests.test\_cyto\_utils.test\_feature\_blocklist module
-------------------------------------------------------------------

1 change: 1 addition & 0 deletions pycytominer/cyto_utils/__init__.py
@@ -34,3 +34,4 @@
aggregate_fields_count,
aggregate_image_features,
)
from .collate import collate
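
The re-export above makes the helper reachable from the `cyto_utils` namespace as well as from its own module; a short sketch:

```python
# Both import paths resolve to the same function once cyto_utils re-exports it.
from pycytominer.cyto_utils import collate as collate_from_package
from pycytominer.cyto_utils.collate import collate as collate_from_module

assert collate_from_package is collate_from_module
```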
220 changes: 220 additions & 0 deletions pycytominer/cyto_utils/collate.py
@@ -0,0 +1,220 @@
import os
import pathlib
import shutil
import subprocess
import sys


def run_check_errors(cmd):
"""Run a system command, and exit if an error occurred, otherwise continue"""
if type(cmd) == str:
cmd = cmd.split()
output = subprocess.run(cmd, capture_output=True, text=True)
if output.stderr != "":
print_cmd = " ".join(map(str, cmd))
sys.exit(
f"The error {output.stderr} was generated when running {print_cmd}. Exiting."
)
return


def collate(
batch,
config,
plate,
base_directory="../..",
column=None,
munge=False,
csv_dir="analysis",
aws_remote=None,
aggregate_only=False,
tmp_dir="/tmp",
overwrite=False,
add_image_features=True,
image_feature_categories=["Granularity", "Texture", "ImageQuality", "Threshold"],
printtoscreen=True,
):
"""Collate the CellProfiler-created CSVs into a single SQLite file by calling cytominer-database

Parameters
----------
batch : str
Batch name to process
config : str
Config file to pass to cytominer-database
plate : str
Plate name to process
base_directory : str, default "../.."
Base directory for subdirectories containing CSVs, backends, etc; in our preferred structure, this is the "workspace" directory
    column : str, optional, default None
        An existing column to copy into a new column called Metadata_Plate if no Metadata_Plate column already exists
    munge : bool, default False
        Whether munge should be passed to cytominer-database; if True, cytominer-database will expect a single all-object CSV and will split each object into its own table
    csv_dir : str, default 'analysis'
        The directory under the base directory where the analysis CSVs will be found. If running the analysis pipeline, this should nearly always be "analysis"
    aws_remote : str, optional, default None
        A remote AWS prefix; if set, CSV files will be synced down from it at the beginning of the run and SQLite files will be synced up to it at the end
    aggregate_only : bool, default False
        Whether to perform only the aggregation of existing SQLite files and bypass previous collation steps
    tmp_dir : str, default '/tmp'
        The temporary directory to be used by cytominer-database for output
    overwrite : bool, optional, default False
        Whether to overwrite an SQLite file that already exists in the temporary directory
    add_image_features : bool, optional, default True
        Whether to add the image features to the profiles
    image_feature_categories : list, optional, default ['Granularity', 'Texture', 'ImageQuality', 'Threshold']
        The list of image feature groups to be used by add_image_features during aggregation
    printtoscreen : bool, optional, default True
        Whether to print status output to the terminal
"""

from pycytominer.cyto_utils.cells import SingleCells

# Set up directories (these need to be abspaths to keep from confusing makedirs later)
input_dir = pathlib.Path(f"{base_directory}/analysis/{batch}/{plate}/{csv_dir}")
backend_dir = pathlib.Path(f"{base_directory}/backend/{batch}/{plate}")
cache_backend_dir = pathlib.Path(f"{tmp_dir}/backend/{batch}/{plate}")

aggregated_file = pathlib.Path(f"{backend_dir}/{plate}.csv")
backend_file = pathlib.Path(f"{backend_dir}/{plate}.sqlite")
cache_backend_file = pathlib.Path(f"{cache_backend_dir}/{plate}.sqlite")

if not aggregate_only:
if os.path.exists(cache_backend_file):
if not overwrite:
sys.exit(
f"An SQLite file for {plate} already exists at {cache_backend_file} and overwrite is set to False. Terminating."
)
else:
os.remove(cache_backend_file)

for eachdir in [input_dir, backend_dir, cache_backend_dir]:
if not os.path.exists(eachdir):
os.makedirs(eachdir, exist_ok=True)

if aws_remote:

remote_input_dir = f"{aws_remote}/analysis/{batch}/{plate}/{csv_dir}"

remote_backend_file = f"{aws_remote}/backend/{batch}/{plate}/{plate}.sqlite"

remote_aggregated_file = f"{aws_remote}/backend/{batch}/{plate}/{plate}.csv"

sync_cmd = f'aws s3 sync --exclude "*" --include "*/Cells.csv" --include "*/Nuclei.csv" --include "*/Cytoplasm.csv" --include "*/Image.csv" {remote_input_dir} {input_dir}'
if printtoscreen:
print(f"Downloading CSVs from {remote_input_dir} to {input_dir}")
run_check_errors(sync_cmd)

ingest_cmd = [
"cytominer-database",
"ingest",
input_dir,
f"sqlite:///{cache_backend_file}",
"-c",
config,
]
if not munge:
# munge is by default True in cytominer-database
ingest_cmd.append("--no-munge")
if printtoscreen:
print(f"Ingesting {input_dir}")
run_check_errors(ingest_cmd)

if column:
            if printtoscreen:
print(f"Adding a Metadata_Plate column based on column {column}")
alter_cmd = [
"sqlite3",
cache_backend_file,
"ALTER TABLE Image ADD COLUMN Metadata_Plate TEXT;",
]
run_check_errors(alter_cmd)
update_cmd = [
"sqlite3",
cache_backend_file,
f"UPDATE image SET Metadata_Plate ={column};",
]
run_check_errors(update_cmd)

if printtoscreen:
print(f"Indexing database {cache_backend_file}")
index_cmd_img = [
"sqlite3",
cache_backend_file,
"CREATE INDEX IF NOT EXISTS table_image_idx ON Image(TableNumber, ImageNumber);",
]
run_check_errors(index_cmd_img)
for eachcompartment in ["Cells", "Cytoplasm", "Nuclei"]:
index_cmd_compartment = [
"sqlite3",
cache_backend_file,
f"CREATE INDEX IF NOT EXISTS table_image_object_{eachcompartment.lower()}_idx ON {eachcompartment}(TableNumber, ImageNumber, ObjectNumber);",
]
run_check_errors(index_cmd_compartment)
index_cmd_metadata = [
"sqlite3",
cache_backend_file,
"CREATE INDEX IF NOT EXISTS plate_well_image_idx ON Image(Metadata_Plate, Metadata_Well);",
]
run_check_errors(index_cmd_metadata)

if aws_remote:

if printtoscreen:
print(f"Uploading {cache_backend_file} to {remote_backend_file}")
cp_cmd = ["aws", "s3", "cp", cache_backend_file, remote_backend_file]
run_check_errors(cp_cmd)

if printtoscreen:
print(
f"Removing analysis files from {input_dir} and {cache_backend_dir}"
)
shutil.rmtree(input_dir)

if printtoscreen:
print(f"Renaming {cache_backend_file} to {backend_file}")
os.rename(cache_backend_file, backend_file)

if printtoscreen:
print(f"Aggregating sqlite:///{backend_file}")

if aggregate_only and aws_remote:
remote_backend_file = f"{aws_remote}/backend/{batch}/{plate}/{plate}.sqlite"

remote_aggregated_file = f"{aws_remote}/backend/{batch}/{plate}/{plate}.csv"

cp_cmd = ["aws", "s3", "cp", remote_backend_file, backend_file]
if printtoscreen:
print(
f"Downloading SQLite files from {remote_backend_file} to {backend_file}"
)
run_check_errors(cp_cmd)

if not os.path.exists(backend_file):
sys.exit(f"{backend_file} does not exist. Exiting.")

    if not add_image_features:
        # Defensive: it is unclear how SingleCells handles a category list when
        # add_image_features is False, so drop the list entirely
        image_feature_categories = None

database = SingleCells(
f"sqlite:///{backend_file}",
aggregation_operation="mean",
add_image_features=add_image_features,
image_feature_categories=image_feature_categories,
)
database.aggregate_profiles(output_file=aggregated_file)

if aws_remote:
if printtoscreen:
print(f"Uploading {aggregated_file} to {remote_aggregated_file}")
csv_cp_cmd = ["aws", "s3", "cp", aggregated_file, remote_aggregated_file]
run_check_errors(csv_cp_cmd)

if printtoscreen:
print(f"Removing backend files from {backend_dir}")
shutil.rmtree(backend_dir)
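
A minimal sketch of driving the function above from Python; the batch, plate, config, and workspace values are hypothetical placeholders, and the directory layout follows the base_directory/analysis and base_directory/backend structure assumed in the code:

```python
from pycytominer.cyto_utils.collate import collate

# Hypothetical names and paths; adjust to your own workspace layout.
collate(
    batch="2021_06_14_Batch1",
    config="ingest_config.ini",
    plate="Plate1",
    base_directory="/home/user/workspace",  # contains analysis/ and backend/
    tmp_dir="/tmp",                         # SQLite file is built here, then moved
    munge=False,                            # CSVs are already split per object
    add_image_features=True,
    image_feature_categories=["Granularity", "Texture", "ImageQuality", "Threshold"],
    printtoscreen=True,
)
```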
97 changes: 97 additions & 0 deletions pycytominer/cyto_utils/collate_cmd.py
@@ -0,0 +1,97 @@
import argparse
from pycytominer.cyto_utils.collate import collate

if __name__ == "__main__":

parser = argparse.ArgumentParser(description="Collate CSVs")
parser.add_argument("batch", help="Batch name to process")
parser.add_argument("config", help="Config file to pass to cytominer-database")
parser.add_argument("plate", help="Plate name to process")
parser.add_argument(
"--base",
"--base-directory",
dest="base_directory",
default="../..",
help="Base directory for subdirectories containing CSVs, backends, etc; in our preferred structure, this is the 'workspace' directory",
)
parser.add_argument(
"--column",
default=None,
help="An existing column to be explicitly copied to a Metadata_Plate column if Metadata_Plate was not set",
)
parser.add_argument(
"--munge",
action="store_true",
default=False,
help="Whether munge should be passed to cytominer-database, if True cytominer-database will expect a single all-object CSV; it will split each object into its own table",
)
parser.add_argument(
"--csv-dir",
dest="csv_dir",
default="analysis",
help="The directory under the base directory where the analysis CSVs will be found. If running the analysis pipeline, this should nearly always be 'analysis'",
)
parser.add_argument(
"--aws-remote",
dest="aws_remote",
default=None,
help="A remote AWS prefix, if set CSV files will be synced down from at the beginning and to which SQLite files will be synced up at the end of the run",
)
parser.add_argument(
"--aggregate-only",
dest="aggregate_only",
action="store_true",
default=False,
help="Whether to perform only the aggregation of existant SQLite files and bypass previous collation steps",
)
parser.add_argument(
"--temp",
default="/tmp",
help="The temporary directory to be used by cytominer-databases for output",
)
parser.add_argument(
"--overwrite",
action="store_true",
default=False,
help="Whether or not to overwrite an sqlite that exists in the temporary directory if it already exists",
)
parser.add_argument(
"--dont-add-image-features",
dest="add_image_features",
action="store_false",
default=True,
help="Whether or not to add the image features to the profiles",
)
parser.add_argument(
"--image-feature-categories",
dest="image_feature_categories",
        type=lambda s: s.split(","),
default="Granularity,Texture,ImageQuality,Count,Threshold",
help="Which image feature categories should be added if adding image features to the aggregated profiles. Multiple values can be passed in if comma separated with no spaces between them",
)
parser.add_argument(
"--dont-print",
dest="printtoscreen",
action="store_false",
default=True,
help="Whether to print status updates",
)

args = parser.parse_args()

collate(
args.batch,
args.config,
args.plate,
base_directory=args.base_directory,
column=args.column,
munge=args.munge,
csv_dir=args.csv_dir,
aws_remote=args.aws_remote,
aggregate_only=args.aggregate_only,
        tmp_dir=args.temp,
overwrite=args.overwrite,
add_image_features=args.add_image_features,
image_feature_categories=args.image_feature_categories,
printtoscreen=args.printtoscreen,
)
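
The command-line wrapper can be driven the same way; a sketch using subprocess with hypothetical argument values (the positional batch, config, and plate arguments mirror the collate() parameters):

```python
# Invoke the collate command-line entry point as a module; all values below
# are hypothetical examples.
import subprocess

subprocess.run(
    [
        "python", "-m", "pycytominer.cyto_utils.collate_cmd",
        "2021_06_14_Batch1",   # batch
        "ingest_config.ini",   # config
        "Plate1",              # plate
        "--base", "/home/user/workspace",
        "--image-feature-categories", "Granularity,Texture,ImageQuality,Threshold",
    ],
    check=True,
)
```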