diff --git a/requirements.txt b/requirements.txt
index f48d423..b4c2452 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
 ruamel.yaml >= 0.16
 sevenbridges-python >= 2.0
 nf-core==2.1
+wrabbit==0.1.1
 cwlformat
 packaging
\ No newline at end of file
diff --git a/sbpack/noncwl/Readme.md b/sbpack/noncwl/Readme.md
index a8e89b4..78bd668 100644
--- a/sbpack/noncwl/Readme.md
+++ b/sbpack/noncwl/Readme.md
@@ -51,18 +51,21 @@
 $ sbpack_nf -h
 usage: sbpack_nf [-h] [--profile PROFILE]
-                 --appid APPID
+                 [--appid APPID]
                  --workflow-path WORKFLOW_PATH
                  [--entrypoint ENTRYPOINT]
                  [--sb-package-id SB_PACKAGE_ID]
                  [--sb-doc SB_DOC]
                  [--dump-sb-app]
-                 [--no-package]
                  [--executor-version EXECUTOR_VERSION]
                  [--execution-mode {single-instance,multi-instance}]
                  [--json]
                  [--sb-schema SB_SCHEMA]
                  [--revision-note REVISION_NOTE]
+                 [--app-name APP_NAME]
+                 [--exclude EXCLUDE [EXCLUDE ...]]
+                 [--sample-sheet-schema SAMPLE_SHEET_SCHEMA]
+                 [--auto]
 
 optional arguments:
   -h, --help            show this help message and exit
@@ -79,17 +82,16 @@ optional arguments:
                         than one '*.nf' script is detected, an error is
                         raised.
   --sb-package-id SB_PACKAGE_ID
-                        Id of an already uploaded package
+                        Id of an already uploaded package.
   --sb-doc SB_DOC       Path to a doc file for sb app. If not provided,
                         README.md will be used if available
   --dump-sb-app         Dump created sb app to file if true and exit
-  --no-package          Only provide a sb app schema and a git URL for
-                        entrypoint
   --executor-version EXECUTOR_VERSION
                         Version of the Nextflow executor to be used with the
                         app.
   --execution-mode {single-instance,multi-instance}
-                        Execution mode for your application.
+                        Execution mode for your application. Can be multi-
+                        instance or single-instance
   --json                Dump sb app schema in JSON format (YAML by default)
   --sb-schema SB_SCHEMA
                         Do not create new schema, use this schema file. It is
@@ -97,6 +99,24 @@ optional arguments:
   --revision-note REVISION_NOTE
                         Revision note to be placed in the CWL schema if the
                         app is uploaded to the sbg platform.
+  --app-name APP_NAME   Name of the app to be shown on the platform.
+  --exclude EXCLUDE [EXCLUDE ...]
+                        Glob patterns you want to exclude from the code
+                        package. By default the following patterns are
+                        excluded: ['*.git', '*.git*', '.git', '.git*', 'work',
+                        '.nextflow.log', '.DS_Store', '.devcontainer',
+                        '.editorconfig', '.gitattributes', '.nextflow', '.pre-
+                        commit-config.yaml', '.prettierignore',
+                        '.prettierrc.yml', '.idea', '.pytest_cache', '*.egg-
+                        info']
+  --sample-sheet-schema SAMPLE_SHEET_SCHEMA
+                        Path to the sample sheet schema yaml. The sample sheet
+                        schema should contain the following keys:
+                        'sample_sheet_input', 'sample_sheet_name', 'header',
+                        'rows', 'defaults', 'group_by', 'format_'
+  --auto                Automatically detect all possible inputs directly from
+                        the --workflow-path location
+
 ```
 
 ### Example
@@ -147,4 +167,60 @@ sbcopy --appid division-name/project-name/app-name --projectid division-name/des
 Copying an app from one division to another.
 ```
 sbcopy --profile source_division target_division --appid source-division-name/project-name/app-name --projectid target-division-name/destination-project-name
-```
\ No newline at end of file
+```
+
+# sbmanifest
+
+`sbmanifest` helps remap file paths contained within a manifest file to
+platform file locations. This is useful when using sample sheets to run
+Nextflow pipelines on SB-powered platforms.
+
+### Usage
+
+```
+usage: sbmanifest [-h] [--profile PROFILE] --projectid PROJECTID
+                  --sample-sheet SAMPLE_SHEET --columns string [string ...]
+                  [--output OUTPUT] [--upload] [--tags string [string ...]]
+                  [--validate]
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --profile PROFILE     SB platform profile as set in the SB API credentials
+                        file.
+  --projectid PROJECTID
+                        Takes the form {user or division}/{project}.
+  --sample-sheet SAMPLE_SHEET
+                        Path to the sample sheet.
+  --columns string [string ...]
+                        Specify columns that contain paths to files on the
+                        platform as a list of strings separated by spaces.
+  --output OUTPUT, -o OUTPUT
+                        Name of the output file.
+  --upload, -u          Upload the file to the project after making it.
+  --tags string [string ...]
+                        Specify tags that you want the sample sheet to have on
+                        the platform, after it is uploaded.
+  --validate            Validate that each file exists in the target project
+                        location.
+```
+
+### Examples
+
+Remap a sample sheet and upload it to the platform with the tag "SampleSheet":
+```
+sbmanifest --projectid user/project_id --sample-sheet /path/to/rnaseq_samplesheet.csv --columns fastq_1 fastq_2 -o rnaseq_samplesheet.csv -u --tags SampleSheet
+```
+Given a sample sheet with the following contents:
+
+| sample  | fastq_1                          | fastq_2                          | strandedness |
+|:--------|:---------------------------------|:---------------------------------|:-------------|
+| SAMPLE1 | RNAseq_inputs/SAMPLE1_1.fastq.gz | RNAseq_inputs/SAMPLE1_2.fastq.gz | reverse      |
+| SAMPLE2 | RNAseq_inputs/SAMPLE2_1.fastq.gz | RNAseq_inputs/SAMPLE2_2.fastq.gz | reverse      |
+
+
+the remapped file will be:
+
+| sample  | fastq_1                                                            | fastq_2                                                            | strandedness |
+|:--------|:-------------------------------------------------------------------|:-------------------------------------------------------------------|:-------------|
+| SAMPLE1 | vs:///Projects/project-root-uuid/RNAseq_inputs/SAMPLE1_1.fastq.gz  | vs:///Projects/project-root-uuid/RNAseq_inputs/SAMPLE1_2.fastq.gz  | reverse      |
+| SAMPLE2 | vs:///Projects/project-root-uuid/RNAseq_inputs/SAMPLE2_1.fastq.gz  | vs:///Projects/project-root-uuid/RNAseq_inputs/SAMPLE2_2.fastq.gz  | reverse      |
diff --git a/sbpack/noncwl/__init__.py b/sbpack/noncwl/__init__.py
index 86fe336..e69de29 100644
--- a/sbpack/noncwl/__init__.py
+++ b/sbpack/noncwl/__init__.py
@@ -1,9 +0,0 @@
-import os
-
-PACKAGE_PATH = os.path.abspath(os.path.dirname(__file__))
-JS_PATH = os.path.join(PACKAGE_PATH, 'js_templates')
-
-
-def read_js_template(file_name):
-    with open(os.path.join(JS_PATH, file_name), 'r') as f:
-        return f.read()
diff --git a/sbpack/noncwl/constants.py b/sbpack/noncwl/constants.py
index 6be4176..dffc187 100644
--- a/sbpack/noncwl/constants.py
+++ b/sbpack/noncwl/constants.py
@@ -1,7 +1,3 @@
-from enum import Enum
-from sbpack.noncwl import read_js_template
-
-
 # ############################## Generic Bits ############################### #
 PACKAGE_SIZE_LIMIT = 100 * 1024 * 1024  # 100 MB
 REMOVE_INPUT_KEY = "REMOVE_THIS_KEY"
@@ -22,8 +18,6 @@ class EXTENSIONS:
 # ############################ CWL Standard Bits ############################ #
 # A generic SB input array of files that should be available on the
 # instance but are not explicitly provided to the execution as wdl params.
-SAMPLE_SHEET_FUNCTION = read_js_template("sample_sheet_generator.js")
-SAMPLE_SHEET_SWITCH = read_js_template("sample_sheet_switch.js")
 
 GENERIC_FILE_ARRAY_INPUT = {
     "id": "auxiliary_files",
@@ -41,18 +35,6 @@ class EXTENSIONS:
     "that is required for workflow execution."
 }
 
-GENERIC_NF_OUTPUT_DIRECTORY = {
-    "id": "nf_workdir",
-    "type": "Directory?",
-    "label": "Work Directory",
-    "doc": "This is a template output. 
" - "Please change glob to directories specified in " - "publishDir in the workflow.", - "outputBinding": { - "glob": "work" - } -} - GENERIC_WDL_OUTPUT_DIRECTORY = { "id": "output_txt", "doc": "This is a template output. " @@ -84,62 +66,27 @@ class EXTENSIONS: AUX_FILES_REQUIREMENT ] - -def sample_sheet( - file_name, sample_sheet_input, format_, input_source, header, rows, - defaults, group_by): - basename = ".".join(file_name.split(".")[:-1]) - ext = file_name.split(".")[-1] - new_name = f"{basename}.new.{ext}" - - return { - "class": "InitialWorkDirRequirement", - "listing": [ - { - "entryname": f"${{ return {sample_sheet_input} ? {sample_sheet_input}.nameroot + '.new' + {sample_sheet_input}.nameext : '{file_name}' }}", - "entry": SAMPLE_SHEET_FUNCTION.format_map(locals()), - "writable": False - } - ] - } - - -# ############################## Nextflow Bits ############################## # -# Keys that should be skipped when parsing nextflow tower yaml file - -NF_SCHEMA_DEFAULT_NAME = 'nextflow_schema.json' -SB_SCHEMA_DEFAULT_NAME = 'sb_nextflow_schema' - -# Mappings of nextflow input fields to SB input fields -# nextflow_key: cwl_key mapping -NF_TO_CWL_PORT_MAP = { - 'default': 'sbg:toolDefaultValue', - 'description': 'label', - 'help_text': 'doc', - 'mimetype': 'format', - 'fa_icon': 'sbg:icon', - 'pattern': 'sbg:pattern', - 'hidden': 'sbg:hidden', -} - -# Mappings of nextflow definition fields to SB category fields -# nextflow_key: cwl_key mapping -NF_TO_CWL_CATEGORY_MAP = { - 'title': 'sbg:title', - 'description': 'sbg:doc', - 'fa_icon': 'sbg:icon', -} - -# What keys to skip from the tower.yml file -SKIP_NEXTFLOW_TOWER_KEYS = [ - 'tower', - 'mail', +# Nextflow +DEFAULT_EXCLUDE_PATTERNS = [ + "*.git", + "*.git*", + ".git", + ".git*", + # ".github", + # ".gitignore", + # ".gitpod.yml", + "work", + ".nextflow.log", + ".DS_Store", + ".devcontainer", + ".editorconfig", + ".gitattributes", + ".nextflow", + # ".nf-core.yml", + ".pre-commit-config.yaml", + ".prettierignore", + ".prettierrc.yml", + ".idea", + ".pytest_cache", + "*.egg-info", ] - - -class ExecMode(Enum): - single = 'single-instance' - multi = 'multi-instance' - - def __str__(self): - return self.value diff --git a/sbpack/noncwl/js_templates/sample_sheet_generator.js b/sbpack/noncwl/js_templates/sample_sheet_generator.js deleted file mode 100644 index 35d9518..0000000 --- a/sbpack/noncwl/js_templates/sample_sheet_generator.js +++ /dev/null @@ -1,127 +0,0 @@ -${{ - function compareFiles(file_a, file_b) {{ - if (file_a.path < file_b.path) {{ - return -1; - }} else if (file_a.path > file_b.path) {{ - return 1; - }} - return 0; - }} - - var input_source = [].concat(inputs.{input_source}).sort(compareFiles); - if (!input_source.length == 0){{ - // Return empty file if no input files are given. 
- // Ensures that sample sheet is generated only if there are files to - // either: - // - map onto an existing sample sheet, or - // - generate a sample sheet from input file info/metadata - return ""; - }}; - - var sample_sheet_input = inputs.{sample_sheet_input}; - - var sample_sheet = []; - - if (sample_sheet_input){{ - // If the sample sheet file is given, map inputs to the contents - var contents = sample_sheet_input.contents.split("\n"); - var format_ = sample_sheet_input.nameext.slice(1); - - var split_char = ""; - - switch (format_) {{ - case 'csv': - split_char = ","; - case 'tsv': - split_char = "\t"; - }}; - - for (var i=0; i < input_source.length; i++){{ - var file = input_source[i]; - for (var row=0; row < contents.length; row++){{ - var row_data = contents[row].split(split_char); - for (var column=0; column < row_data.length; column++){{ - var cell = row_data[column]; - if (cell == file.basename){{ - cell = file.path; - }} - row_data[column] = cell; - }} - contents[row] = row_data.join(split_char); - }} - }} - sample_sheet = contents; - }} else {{ - // If the samples are given, create the sample sheet from input data - var format_ = {format_}; - var header = {header}; - var row = {rows}; - var defaults = {defaults}; - var group_by = {group_by}; - - var split_char = ""; - switch (format_) {{ - case 'csv': - split_char = ","; - case 'tsv': - split_char = "\t"; - }} - - if (header){{ - sample_sheet.push(header.join(split_char)); - }}; - var groups = {{}}; - - for (var i = 0; i < input_source.length; i ++){{ - var file = input_source[i]; - var group_criteria = []; - for (var j = 0; j < group_by.length; j ++){{ - group_criteria.push(eval(group_by[j])); - }} - try {{ - groups[group_criteria.join(".")].push(file) - }} catch(ex) {{ - groups[group_criteria.join(".")] = [file] - }} - }} - - if (defaults.length < row.length){{ - for (var i = 0; i < row.length - defaults.length + 1; i++){{ - defaults.push(""); - }} - }}; - - for (k in groups){{ - var row_data = []; - var files = groups[k]; - - files.sort(function(a, b) {{ - var name_a = a.basename.toUpperCase(); - var name_b = b.basename.toUpperCase(); - if (name_a < name_b){{ - return -1; - }} else if (name_a > name_b){{ - return 1; - }} else {{ - return 0; - }} - }}); - - for (var j = 0; j < row.length; j ++){{ - var d = ""; - try {{ - var d = eval(row[j]); - if (d == undefined){{ - d = defaults[j]; - }} - }} catch(ex) {{ - var d = defaults[j]; - }} - row_data.push(d); - }} - - sample_sheet.push(row_data.join(split_char)); - }} - }} - return sample_sheet.join("\n"); -}} \ No newline at end of file diff --git a/sbpack/noncwl/js_templates/sample_sheet_switch.js b/sbpack/noncwl/js_templates/sample_sheet_switch.js deleted file mode 100644 index 430646a..0000000 --- a/sbpack/noncwl/js_templates/sample_sheet_switch.js +++ /dev/null @@ -1,9 +0,0 @@ -${{ - if ({file_input}) {{ - return '{sample_sheet_name}'; - }} else if (!{file_input} && {sample_sheet}){{ - return {sample_sheet}; - }} else {{ - return ""; - }} -}} \ No newline at end of file diff --git a/sbpack/noncwl/manifest.py b/sbpack/noncwl/manifest.py new file mode 100644 index 0000000..3dd1a96 --- /dev/null +++ b/sbpack/noncwl/manifest.py @@ -0,0 +1,327 @@ +from sevenbridges.models.project import Project + +import logging +import sbpack.lib as lib +import argparse +import os + + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +def paths_to_check(file_name: str) -> list: + """ + :param file_name: Contents of a single manifest file cell that contains + path(s) 
to files. + """ + chk = [] + rtrn = [] + + if ";" in file_name: + # This should handle the case when there are multiple files in the + # same cell, but they are separated by ";" + # For example: + # file.ext;folder/file.ext + chk.extend(file_name.split(";")) + else: + chk.append(file_name) + + for file_name in chk: + if ":" in file_name: + # If a file is in cloud storage, skip it + continue + + file_name = file_name.strip('/') + rtrn.append(file_name) + cur_path = file_name + while os.path.dirname(cur_path): + cur_path = os.path.dirname(cur_path) + rtrn.append(cur_path) + + return rtrn + + +def remap_cell(project_root: str, path: str) -> str: + """ + Remaps a file path to the 'vs:' file system. + + Supports multiple files separated with ';'. + + :param project_root: Name of the project root directory. + :param path: File path. + :return: File path(s) prefixed with 'vs:///Projects/' and project_root. + """ + # prefix it with the project root + if ";" in path: + return ";".join([remap_cell(project_root, f) for f in path.split(";")]) + + if path and ":" not in path: + while path.startswith('/'): + path = path[1:] + if path: + return f"vs:///Projects/{project_root}/{path}" + else: + return path + + +def validate_sheet( + api, + project: Project, + path_to_file: str, + remap_columns: list, +) -> None: + """ + Go through the sample sheet and validate if files contained within are + located in the project. + + :param api: SevenBridges API + :param project: Project on a SevenBridges powered platform where the files + are located. + :param path_to_file: Path to the sample sheet (manifest) file. + :param remap_columns: Names of the columns to remap. These columns must + contain paths to files. + """ + # Collect the extension of the file to determine the split character + # If the file is a CSV, use ","; or TSV, use "\t" + ext = path_to_file.split('.')[-1] + if ext.lower() == 'csv': + split_char = ',' + elif ext.lower() == 'tsv': + split_char = '\t' + else: + raise ValueError( + f"Invalid file type '{ext}'. Expected a .tsv or .csv file." + ) + + # Create a list of unique paths to files and directories that the files are + # contained in. + to_validate = list() + with open(path_to_file, 'r') as input_file: + # Assume first row is the header + header = input_file.readline().strip('\n').split(split_char) + + # Create a list of indices based on the column names. + indices = [] + for column in remap_columns: + try: + indices.append(header.index(column)) + except ValueError: + raise ValueError( + f"Header column '{column}' not found in the " + f"sample sheet header." + ) + + # Assume all lines below the first are the table contents. 
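+        # Note: a cell may hold one path or several separated by ";", and
+        # paths_to_check() also returns every parent directory of a path so
+        # that directories can be resolved level by level during validation.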
+        for line in input_file.readlines():
+            line = line.strip('\n')
+
+            # Skip empty lines
+            if not line:
+                continue
+
+            line = line.split(split_char)
+            for i in indices:
+                to_validate.extend(paths_to_check(line[i]))
+
+    # ### Check collected paths ### #
+    # Memoize checked paths
+    checked = {}
+    errors = []
+
+    for path in sorted(list(to_validate)):
+        if path in checked:
+            continue
+        else:
+            basename = os.path.basename(path)
+            parent = None
+            if os.path.dirname(path):
+                parent = checked[os.path.dirname(path)]
+
+            file = api.files.query(
+                names=[basename],
+                project=project if not parent else None,
+                parent=parent)
+            if file:
+                checked[path] = file[0]
+            else:
+                raise FileNotFoundError(
+                    f"File <{path}> does not exist within "
+                    f"project <{project}>")
+
+
+def remap(
+    project_root: str,
+    path_to_file: str,
+    remap_columns: list,
+) -> str:
+    """
+    Remap paths from a manifest file to vs:// paths.
+
+    Remapping is performed only on file paths that are not already in cloud
+    storage. Paths to project files in the manifest should point to their
+    relative location in the project root. For example, if a file ("file.ext")
+    is located in a directory named "directory", which resides in the project
+    root, then the correct path to that file would be "directory/file.ext".
+
+    The function assumes that the first row is always the header.
+
+    :param project_root: Name of the project root directory.
+    :param path_to_file: Path to the manifest file.
+    :param remap_columns: Names of manifest file columns that contain paths to
+        input files.
+    :return: Manifest file with remapped columns in string format.
+    """
+    ext = path_to_file.split('.')[-1]
+    if ext.lower() == 'csv':
+        split_char = ','
+    elif ext.lower() == 'tsv':
+        split_char = '\t'
+    else:
+        raise ValueError(
+            f"Invalid file type '{ext}'. Expected a .tsv or .csv file."
+        )
+
+    sheet = []
+
+    with open(path_to_file, 'r') as input_file:
+        header = input_file.readline().strip('\n').split(split_char)
+        sheet.append(split_char.join(header))
+
+        indices = []
+        for column in remap_columns:
+            try:
+                indices.append(header.index(column))
+            except ValueError:
+                raise ValueError(
+                    f"Header column '{column}' not found in the "
+                    f"sample sheet header."
+                )
+
+        for line in input_file.readlines():
+            if line:
+                line = line.strip('\n').split(split_char)
+                for i in indices:
+                    line[i] = remap_cell(project_root, line[i])
+                line = split_char.join(line)
+                sheet.append(line)
+
+    return "\n".join(sheet)
+
+
+def main():
+    # CLI parameters
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--profile", required=False,
+        default="default", type=str,
+        help="SB platform profile as set in the SB API credentials file.",
+    )
+    parser.add_argument(
+        "--projectid", required=True,
+        type=str,
+        help="Takes the form {user or division}/{project}.",
+    )
+    parser.add_argument(
+        "--sample-sheet", required=True,
+        type=str,
+        help="Path to the sample sheet."
+    )
+    parser.add_argument(
+        "--columns", required=True,
+        metavar='string', nargs='+', type=str,
+        help="Specify columns that contain paths to files on the platform "
+             "as a list of strings separated by spaces.",
+    )
+    parser.add_argument(
+        "--output", '-o', required=False,
+        type=str,
+        help="Name of the output file.",
+    )
+    parser.add_argument(
+        "--upload", '-u', action='store_true', required=False,
+        help="Upload the file to the project after making it.",
+    )
+    parser.add_argument(
+        "--tags", required=False,
+        metavar='string', nargs='+', type=str,
+        help="Specify tags that you want the sample sheet to have on the "
+             "platform, after it is uploaded.",
+    )
+    parser.add_argument(
+        "--validate", action='store_true', required=False,
+        help="Validate that each file exists in the target project location.",
+    )
+
+    args = parser.parse_args()
+
+    project = args.projectid
+    api = lib.get_profile(args.profile)
+
+    project = api.projects.get(project)
+    project_root = api.files.get(project.root_folder).name
+
+    logger.info('Remapping manifest files.')
+    sheet = remap(
+        project_root,
+        args.sample_sheet,
+        args.columns
+    )
+    logger.info('Remapping complete.')
+
+    if args.validate:
+        logger.info('Validating manifest.')
+        validate_sheet(
+            api,
+            project,
+            args.sample_sheet,
+            args.columns
+        )
+        logger.info('Validation complete.')
+
+    if not args.output:
+        name = os.path.basename(args.sample_sheet)
+
+        save_path = os.path.join(
+            os.path.dirname(args.sample_sheet),
+            name
+        )
+        i = 0
+        while os.path.exists(save_path):
+            i += 1
+            save_path = os.path.join(
+                os.path.dirname(args.sample_sheet),
+                f"_{i}_{name}"
+            )
+    else:
+        save_path = args.output
+
+    with open(save_path, 'w') as output:
+        logger.info(f'Saving remapped manifest file to <{save_path}>.')
+        output.write(sheet)
+
+    if args.upload:
+        name = os.path.basename(args.sample_sheet)
+        if args.output:
+            name = args.output
+
+        temp_name = name
+        i = 0
+
+        while api.files.query(project=project, names=[temp_name]):
+            i += 1
+            temp_name = f"_{i}_{name}"
+
+        logger.info(
+            f'Uploading remapped manifest file to project {project} '
+            f'under filename <{temp_name}>.')
+        file = api.files.upload(
+            save_path, project, file_name=temp_name
+        ).result()
+        if args.tags:
+            file.tags = args.tags
+            file.save()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/sbpack/noncwl/nextflow.py b/sbpack/noncwl/nextflow.py
index c4a8c65..59f4a6c 100644
--- a/sbpack/noncwl/nextflow.py
+++ b/sbpack/noncwl/nextflow.py
@@ -1,62 +1,45 @@
-import re
-import json
 import argparse
 import logging
-import os
 import yaml
+import os
 import sbpack.lib as lib
-from packaging import version
+
+from wrabbit.parser.nextflow import NextflowParser
 from nf_core.schema import PipelineSchema
 from sbpack.version import __version__
-from sbpack.pack import pack
+
+from sbpack.noncwl.constants import (
+    DEFAULT_EXCLUDE_PATTERNS,
+)
+
 from sbpack.noncwl.utils import (
-    get_dict_depth,
     zip_and_push_to_sb,
+    install_or_upgrade_app,
+)
+
+from wrabbit.parser.utils import (
     get_readme,
-    get_tower_yml,
-    get_entrypoint,
-    get_executor_version,
     get_latest_sb_schema,
     get_sample_sheet_schema,
-    get_config_files,
-    parse_config_file,
-    create_profile_enum,
-    install_or_upgrade_app,
-    nf_to_sb_input_mapper,
 )
-from sbpack.noncwl.constants import (
-    sample_sheet,
+
+from wrabbit.parser.constants import (
     ExecMode,
-    GENERIC_FILE_ARRAY_INPUT,
-    GENERIC_NF_OUTPUT_DIRECTORY,
-    INLINE_JS_REQUIREMENT,
-    LOAD_LISTING_REQUIREMENT,
-    AUX_FILES_REQUIREMENT,
-    SKIP_NEXTFLOW_TOWER_KEYS,
     EXTENSIONS,
-    
NF_TO_CWL_CATEGORY_MAP, - SAMPLE_SHEET_FILE_ARRAY_INPUT, - SAMPLE_SHEET_SWITCH, NF_SCHEMA_DEFAULT_NAME, SB_SCHEMA_DEFAULT_NAME, - REMOVE_INPUT_KEY, ) -from sbpack.noncwl.wrapper import Wrapper logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) -class SBNextflowWrapper: - def __init__(self, workflow_path, sb_doc=None): - self.sb_wrapper = Wrapper() +class SBNextflowWrapper(NextflowParser): + def __init__(self, workflow_path, *args, **kwargs): + super().__init__(workflow_path, *args, **kwargs) self.nf_ps = PipelineSchema() - self.workflow_path = workflow_path - self.nf_schema_path = None - self.nf_config_files = None - self.sb_doc = sb_doc def nf_schema_build(self): """ @@ -80,409 +63,6 @@ def nf_schema_build(self): ) self.nf_schema_path = nf_schema_path - def generate_sb_inputs(self): - """ - Generate SB inputs schema - """ - - # ## Add profiles to the input ## # - self.nf_config_files = get_config_files(self.workflow_path) - - profiles = dict() - - for path in self.nf_config_files: - profiles.update(parse_config_file(path)) - - profiles_choices = sorted(list(set(profiles.keys()))) - - if profiles: - self.sb_wrapper.safe_add_input( - create_profile_enum(profiles_choices) - ) - - # Optional inputs due to profiles - # optional_inputs = [] - # for profile_id, profile_contents in profiles.items(): - # for key in profile_contents.keys(): - # if 'params.' in key: - # input_ = key.rsplit('params.', 0) - # optional_inputs.extend(input_) - # optional_inputs = set(optional_inputs) - - # ## Add inputs ## # - if self.nf_schema_path: - with open(self.nf_schema_path, 'r') as f: - nf_schema = yaml.safe_load(f) - - for p_key, p_value in nf_schema.get('properties', {}).items(): - self.sb_wrapper.safe_add_input( - nf_to_sb_input_mapper(p_key, p_value)) - for def_name, definition in nf_schema.get( - 'definitions', {}).items(): - # Nextflow inputs schema contains multiple definitions where - # each definition contains multiple properties - category = dict() - - for nf_field, sb_field in NF_TO_CWL_CATEGORY_MAP.items(): - if nf_field in definition: - category[sb_field] = definition[nf_field] - - input_category = 'Inputs' - if 'title' in definition: - input_category = category['sbg:title'] - - for port_id, port_data in definition['properties'].items(): - req = False - # if port_id in definition.get('required', []) and \ - # port_id not in optional_inputs: - # req = True - - self.sb_wrapper.safe_add_input(nf_to_sb_input_mapper( - port_id, - port_data, - category=input_category, - required=req, - )) - - # Add the generic file array input - auxiliary files - self.sb_wrapper.safe_add_input(GENERIC_FILE_ARRAY_INPUT) - self.sb_wrapper.add_requirement(AUX_FILES_REQUIREMENT) - self.sb_wrapper.add_requirement(INLINE_JS_REQUIREMENT) - - def generate_sb_outputs(self): - """ - Generate SB output schema - """ - if get_tower_yml(self.workflow_path): - for output in self.parse_output_yml( - open(get_tower_yml(self.workflow_path)) - ): - self.sb_wrapper.safe_add_output(output) - - # if the only output is reports, or there are no outputs, add generic - if len(self.sb_wrapper.outputs) == 0 or \ - (len(self.sb_wrapper.outputs) == 1 and - self.sb_wrapper.outputs[0]['id'] == 'reports'): - self.sb_wrapper.safe_add_output(GENERIC_NF_OUTPUT_DIRECTORY) - - def parse_sample_sheet_schema(self, path): - """ - Example sample sheet: - sample_sheet_input: input_sample_sheet # taken from app wrapper - sample_sheet_name: samplesheet.csv - header: - - SampleID - - Fastq1 - - Fastq2 - rows: - - sample_id - - path - - path - 
defaults: - - NA - - NA - - NA - group_by: sample_id - format_: csv - - """ - schema = yaml.safe_load(path) - - sample_sheet_input = schema.get('sample_sheet_input') - sample_sheet_name = schema.get('sample_sheet_name', 'samplesheet') - header = schema.get('header', 'null') - - # fix rows - rows = schema.get('rows') - for i, r in enumerate(rows): - if "." not in r: - if r == 'path': - n = 0 - new_r = f'files[{n}].path' - while new_r in rows: - n += 1 - new_r = f'files[{n}].path' - rows[i] = new_r - else: - rows[i] = f'files[0].metadata.{r}' - - defaults = schema.get('defaults', 'null') - - # fix group by - group_by = schema.get('group_by') - if type(group_by) is str: - group_by = [group_by] - for i, gb in enumerate(group_by): - - if "." not in gb: - if gb in ['file', 'none']: - group_by[i] = 'file.path' - else: - group_by[i] = f'file.metadata.{gb}' - - format_ = schema.get('format_', None) - - if format_ and not sample_sheet_name.endswith(format_): - sample_sheet_name += f".{format_}".lower() - - if not format_ and not sample_sheet_name.endswith(['.tsv', '.csv']): - raise Exception('Sample sheet format could not be identified. ' - 'Please specify one of "tsv" or "csv" in the ' - 'sample sheet schema file.') - - if not format_ and sample_sheet_name.endswith(['.tsv', '.csv']): - format_ = sample_sheet_name.split('.').pop().lower() - - if format_.lower() not in ['tsv', 'csv']: - raise Exception(f'Unrecognized sample sheet format "{format_}".') - - # Step 1: - # add a new input to the pipeline - # - new input must not clash with other inputs by ID - # Ensure that the new input is unique - - # Create the sample sheet file array input - file_input = self.sb_wrapper.safe_add_input( - SAMPLE_SHEET_FILE_ARRAY_INPUT - ) - file_input_id = file_input.get('id') - - # Step 2: - # add argument for sample sheet - # - requires: sample sheet input (sample_sheet_input), - # file input (ss_file_input) - # - if the sample sheet is provided on input, - # do not generate a new ss - input_changes = { - 'id': sample_sheet_input, - 'loadContents': True, - 'inputBinding': REMOVE_INPUT_KEY - } - - prefix = self.sb_wrapper.get_input( - sample_sheet_input - )['inputBinding']['prefix'] - - self.sb_wrapper.update_input(input_changes) - self.sb_wrapper.add_argument( - { - "prefix": prefix, - "shellQuote": False, - "valueFrom": SAMPLE_SHEET_SWITCH.format( - file_input=f"inputs.{file_input_id}", - sample_sheet=f"inputs.{sample_sheet_input}", - sample_sheet_name=sample_sheet_name, - ) - } - ) - - # Step 3: - # add file requirement - # - requires: sample sheet schema - # - add InitialWorkDirRequirement if there are none - # - if there are, append the entry to listing - ss = sample_sheet( - file_name=sample_sheet_name, - sample_sheet_input=f"inputs.{sample_sheet_input}", - format_=format_, - input_source=f"inputs.{file_input_id}", - header=header, - rows=rows, - defaults=defaults, - group_by=group_by, - ) - - self.sb_wrapper.add_requirement(ss) - self.sb_wrapper.add_requirement(INLINE_JS_REQUIREMENT) - self.sb_wrapper.add_requirement(LOAD_LISTING_REQUIREMENT) - - def make_output_type(self, key, output_dict, is_record=False): - """ - This creates an output of specific type based on information provided - through output_dict. 
- - :param key: - :param output_dict: - :param is_record: - :return: - """ - - converted_cwl_output = dict() - - file_pattern = re.compile(r'.*\.(\w+)$') - folder_pattern = re.compile(r'[^.]+$') - id_key = 'id' - - if is_record: - id_key = 'name' - - name = key - if 'display' in output_dict: - name = output_dict['display'] - - clean_id = re.sub(r'[^a-zA-Z0-9_]', "", name.replace( - " ", "_")).lower() - - # Case 1: Output is a Record-type - if get_dict_depth(output_dict) > 0: - # this is a record, go through the dict_ recursively - fields = [self.make_output_type(key, val, is_record=True) - for key, val in output_dict.items()] - - used_field_ids = set() - - for field in fields: - base_field_id = field.get('name', 'Output') - - # Since name fields can be the same for multiple inputs, - # correct the name if it has already been used. - chk_id = base_field_id - i = 1 - if chk_id in used_field_ids: - chk_id = f"{base_field_id}_{i}" - i += 1 - used_field_ids.add(chk_id) - - field['name'] = chk_id - - converted_cwl_output = { - id_key: clean_id, - "label": name, - "type": [ - "null", - { - "type": "record", - "fields": fields, - "name": clean_id - } - ] - } - - # Case 2: Output is a File type - elif re.fullmatch(file_pattern, key): - # create a list of files output - converted_cwl_output = { - id_key: clean_id, - "label": name, - "type": "File[]?", - "outputBinding": { - "glob": key - } - } - - # Case 3: Output is a folder type - elif re.fullmatch(folder_pattern, key): - # create a list of directories output - converted_cwl_output = { - id_key: clean_id, - "label": name, - "type": "Directory[]?", - "outputBinding": { - "glob": key, - "loadListing": "deep_listing" - } - } - return converted_cwl_output - - def parse_output_yml(self, yml_file): - """ - Extracts output information from a YAML file, usually in tower.yml - format. - - :param yml_file: path to YAML file. - :return: list of outputs in CWL format. - """ - outputs = list() - yml_schema = yaml.safe_load(yml_file) - - for key, value in yml_schema.items(): - # Tower yml file can use "tower" key in the yml file to designate - # some configurations tower uses. Since these are not output - # definitions, we skip these. 
- if key in SKIP_NEXTFLOW_TOWER_KEYS: - continue - if key == "reports" and type(value) is dict: - temp = value.copy() - for k, v in temp.items(): - changed_key = f"work/**/{k}" - while "/**/**/" in changed_key: - changed_key = changed_key.replace("/**/**/", "/**/") - value[changed_key] = v - del value[k] - - outputs.append( - self.make_output_type(key, value) - ) - - return outputs - - def dump_sb_wrapper(self, out_format=EXTENSIONS.yaml): - """ - Dump SB wrapper for nextflow workflow to a file - """ - print('Writing sb nextflow schema file...') - basename = SB_SCHEMA_DEFAULT_NAME - counter = 0 - sb_wrapper_path = os.path.join( - self.workflow_path, - f'{basename}.{out_format}' - ) - - while os.path.exists(sb_wrapper_path): - counter += 1 - sb_wrapper_path = os.path.join( - self.workflow_path, - f'{basename}.{counter}.{out_format}' - ) - - print(f"Schema written to file <{sb_wrapper_path}>") - - if out_format in EXTENSIONS.yaml_all: - with open(sb_wrapper_path, 'w') as f: - yaml.dump(self.sb_wrapper.dump(), f, indent=4, sort_keys=True) - elif out_format in EXTENSIONS.json_all: - with open(sb_wrapper_path, 'w') as f: - json.dump(self.sb_wrapper.dump(), f, indent=4, sort_keys=True) - - def generate_sb_app( - self, sb_schema=None, sb_entrypoint='main.nf', - executor_version=None, sb_package_id=None, execution_mode=None, - sample_sheet_schema=None, - ): - """ - Generate an SB app for a nextflow workflow, OR edit the one created and - defined by the user - """ - - if sb_schema: - sb_schema_dict = pack(sb_schema) - self.sb_wrapper.load(sb_schema_dict) - - else: - self.sb_wrapper.cwl_version = 'None' - self.sb_wrapper.class_ = 'nextflow' - - self.generate_sb_inputs() - self.generate_sb_outputs() - - if sample_sheet_schema: - self.parse_sample_sheet_schema(open(sample_sheet_schema)) - - self.sb_wrapper.set_app_content( - code_package=sb_package_id, - entrypoint=sb_entrypoint, - executor_version=executor_version, - ) - - if execution_mode: - self.sb_wrapper.add_hint({ - 'class': 'sbg:NextflowExecutionMode', - 'value': execution_mode.value - }) - - if self.sb_doc: - self.sb_wrapper.add_docs(self.sb_doc) - def main(): # CLI parameters @@ -509,7 +89,7 @@ def main(): ) parser.add_argument( "--sb-package-id", required=False, - help="Id of an already uploaded package", + help="Id of an already uploaded package.", ) parser.add_argument( "--sb-doc", required=False, @@ -520,10 +100,10 @@ def main(): "--dump-sb-app", action="store_true", required=False, help="Dump created sb app to file if true and exit", ) - parser.add_argument( - "--no-package", action="store_true", required=False, - help="Only provide a sb app schema and a git URL for entrypoint", - ) + # parser.add_argument( + # "--no-package", action="store_true", required=False, + # help="Only provide an sb app schema and a git URL for entrypoint", + # ) parser.add_argument( "--executor-version", required=False, help="Version of the Nextflow executor to be used with the app.", @@ -531,7 +111,8 @@ def main(): parser.add_argument( "--execution-mode", type=ExecMode, choices=list(ExecMode), required=False, default=None, - help="Execution mode for your application.", + help="Execution mode for your application. 
Can be multi-instance or " + "single-instance", ) parser.add_argument( "--json", action="store_true", required=False, @@ -548,6 +129,18 @@ def main(): help="Revision note to be placed in the CWL schema if the app is " "uploaded to the sbg platform.", ) + parser.add_argument( + "--app-name", required=False, + default=None, type=str, + help="Name of the app to be shown on the platform.", + ) + parser.add_argument( + "--exclude", required=False, + default=None, type=str, nargs="+", + help=f"Glob patterns you want to exclude from the code package. " + f"By default the following patterns are excluded: " + f"{DEFAULT_EXCLUDE_PATTERNS}" + ) parser.add_argument( "--sample-sheet-schema", required=False, default=None, type=str, @@ -566,15 +159,43 @@ def main(): # Preprocess CLI parameter values # This stores them into variables that can be updated if --auto is used - entrypoint = args.entrypoint or \ - get_entrypoint(args.workflow_path) or 'main.nf' + entrypoint = args.entrypoint or None sb_schema = args.sb_schema or None executor_version = args.executor_version or None execution_mode = args.execution_mode or None revision_note = args.revision_note or \ f"Uploaded using sbpack v{__version__}" sample_sheet_schema = args.sample_sheet_schema or None + label = args.app_name or None dump_sb_app = args.dump_sb_app or False + sb_package_id = args.sb_package_id or None + + # Input validation + if not dump_sb_app and not args.appid and not args.auto: + raise Exception( + "The --appid argument is required if " + "--dump-sb-app and/or --auto are not used" + ) + + if sb_schema and execution_mode: + logger.warning( + "Using --sb-schema option overwrites --execution-mode" + ) + + if sb_schema and label: + logger.warning( + "Using --sb-schema option overwrites --app-name" + ) + + if sb_schema and executor_version: + logger.warning( + "Using --sb-schema option overwrites --executor-version" + ) + + if sb_schema and entrypoint: + logger.warning( + "Using --sb-schema option overwrites --entrypoint" + ) sb_doc = None if args.sb_doc: @@ -584,85 +205,53 @@ def main(): with open(get_readme(args.workflow_path), 'r') as f: sb_doc = f.read() - test_sign, test_executor_version = get_executor_version(sb_doc or "") - if test_sign and executor_version and "edge" not in executor_version: - if test_sign == "=" and version.parse(executor_version) != \ - version.parse(test_executor_version): - logger.warning( - f"Provided executor version {executor_version} does not" - f" match detected version {test_sign}{test_executor_version}" - ) - if test_sign == ">" and version.parse(executor_version) <= \ - version.parse(test_executor_version): - logger.warning( - f"Provided executor version {executor_version} does not" - f" match detected version {test_sign}{test_executor_version}" - ) - if test_sign == "<" and version.parse(executor_version) >= \ - version.parse(test_executor_version): - logger.warning( - f"Provided executor version {executor_version} does not" - f" match detected version {test_sign}{test_executor_version}" - ) - if test_sign == ">=" and version.parse(executor_version) < \ - version.parse(test_executor_version): - logger.warning( - f"Provided executor version {executor_version} does not" - f" match detected version {test_sign}{test_executor_version}" - ) - if test_sign == "<=" and version.parse(executor_version) > \ - version.parse(test_executor_version): - logger.warning( - f"Provided executor version {executor_version} does not" - f" match detected version {test_sign}{test_executor_version}" - ) - if args.auto: # This is 
where the magic happens if not sb_schema: sb_schema = get_latest_sb_schema(args.workflow_path) - # detect nextflow executor version from description - executor_version = test_executor_version + if sb_schema: + logger.info(f'Using sb schema <{sb_schema}>') # Set execution mode to multi-instance if not execution_mode: execution_mode = ExecMode.multi + logger.info(f'Using execution mode <{execution_mode}>') # locate sample sheet if not sample_sheet_schema: sample_sheet_schema = get_sample_sheet_schema(args.workflow_path) + if sample_sheet_schema: + logger.info( + f'Using sample sheet schema <{sample_sheet_schema}>' + ) # if appid is not provided, dump the app if not args.appid: dump_sb_app = True - - # Input validation - if not dump_sb_app: - # appid is required - if not args.appid: - raise Exception( - "The --appid argument is required if " - "--dump-sb-app is not used" + logger.info( + f'Appid not provided. App is not going to be uploaded.' ) nf_wrapper = SBNextflowWrapper( workflow_path=args.workflow_path, - sb_doc=sb_doc + sb_doc=sb_doc, + label=label, + entrypoint=entrypoint, + executor_version=executor_version, + sb_package_id=sb_package_id, ) if sb_schema: # parse input schema - nf_wrapper.generate_sb_app( - sb_schema=sb_schema - ) + with open(sb_schema, 'r') as s: + schema = yaml.safe_load(s) + nf_wrapper.sb_wrapper.load(schema) else: # build schema nf_wrapper.nf_schema_build() # Create app nf_wrapper.generate_sb_app( - sb_entrypoint=entrypoint, - executor_version=executor_version, execution_mode=execution_mode, sample_sheet_schema=sample_sheet_schema, ) @@ -672,19 +261,21 @@ def main(): # Dump app to local file out_format = EXTENSIONS.json if args.json else EXTENSIONS.yaml nf_wrapper.dump_sb_wrapper(out_format=out_format) + else: + # App should be installed on the platform api = lib.get_profile(args.profile) - sb_package_id = None - if args.sb_package_id: - sb_package_id = args.sb_package_id - elif not args.no_package: + # 1. 
if the code package is not provided on input, + # create and upload it + if not sb_package_id: project_id = '/'.join(args.appid.split('/')[:2]) sb_package_id = zip_and_push_to_sb( api=api, workflow_path=args.workflow_path, project_id=project_id, - folder_name='nextflow_workflows' + folder_name='nextflow_workflows', + exclude_patterns=args.exclude, ) nf_wrapper.sb_wrapper.set_app_content( diff --git a/sbpack/noncwl/utils.py b/sbpack/noncwl/utils.py index 29d6bbf..400413c 100644 --- a/sbpack/noncwl/utils.py +++ b/sbpack/noncwl/utils.py @@ -6,167 +6,31 @@ import yaml import re +from typing import Optional +import fnmatch from sbpack.pack import pack from sevenbridges.errors import NotFound from sbpack.noncwl.constants import ( PACKAGE_SIZE_LIMIT, + DEFAULT_EXCLUDE_PATTERNS, +) +from wrabbit.parser.constants import ( EXTENSIONS, - NF_TO_CWL_PORT_MAP, - SB_SCHEMA_DEFAULT_NAME ) logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) -def nf_schema_type_mapper(input_type_string): - """ - Convert nextflow schema input type to CWL - """ - type_ = input_type_string.get('type', 'string') - format_ = input_type_string.get('format', '') - - return type_mapper(type_, format_) - - -def nf_to_sb_input_mapper(port_id, port_data, category=None, required=False): - """ - Convert a single input from Nextflow schema to SB schema - """ - sb_input = dict() - sb_input['id'] = port_id - # # Do not convert outdir - # if port_id == 'outdir': - # port_data['format'] = '' - - enum_symbols = port_data.get('enum', []) - - if enum_symbols: - sb_input['type'] = enum_type( - id_=port_id, - symbols=enum_symbols, - ) - else: - sb_input['type'] = nf_schema_type_mapper(port_data) - - sb_input['inputBinding'] = { - 'prefix': f'--{port_id}', - } - - if not required: - sb_input['type'].append('null') - - if category: - sb_input['sbg:category'] = category - - for nf_field, sb_field in NF_TO_CWL_PORT_MAP.items(): - if nf_field in port_data: - value = port_data[nf_field] - if value == ":" and nf_field == 'default': - # Bug prevents running a task if an input's - # default value is exactly ":". This bug will likely be - # fixed at the time of release of this version. 
- value = " :" - sb_input[sb_field] = value - - return sb_input - - -def type_mapper(type_, format_) -> list: - if isinstance(type_, str): - if type_ == 'string' and 'path' in format_: - if format_ == 'file-path': - return ['File'] - if format_ == 'directory-path': - return ['Directory'] - if format_ == 'path': - return ['File'] - if type_ == 'string': - return ['string'] - if type_ == 'integer': - return ['int'] - if type_ == 'number': - return ['float'] - if type_ == 'boolean': - return ['boolean'] - if type_ == 'object': - # this should be a record type (dictionary) - # it is provided as '{"key1": "value1", "key2": "value2"}' - return ['string'] - return [type_] - elif isinstance(type_, list): - temp_type_list = [] - for m in type_: - temp_type_list.extend(type_mapper(m, format_)) - return temp_type_list - - -def enum_type(id_: str, symbols: list) -> list: - # This can be generalized so that it encompasses create_profile_enum - return [ - { - "type": "enum", - "name": id_, - "symbols": symbols - } - ] - - -def create_profile_enum(profiles: list): - """ - If profiles are defined in the config file, this input stores the profiles - They are added to the commandline as -profile foo,bar,foobar - :param profiles: list of profiles - :return: Profiles enum array input - """ - return { - "id": "profile", - "type": [ - "null", - { - "type": "array", - "items": { - "type": "enum", - "name": "profile", - "symbols": profiles - } - } - ], - "label": "Profiles", - "doc": "Select which profile(s) you want to use for task execution.", - "inputBinding": { - "prefix": "-profile", - "itemSeparator": ",", - "shellQuote": False, - } - } - - -def get_dict_depth(dict_, level=0): - """ - Find the depth of the dictionary. Example: - {'a': 1} - returns 0; - {'a': {'b': 2}} - returns 1... - - :param dict_: input dictionary - :param level: depth of the outer dict - :return: int - """ - n = level - for k, v in dict_.items(): - if type(v) is dict: - lv = get_dict_depth(v, level + 1) - if lv > n: - n = lv - return n - - -def zip_and_push_to_sb(api, workflow_path, project_id, folder_name): +def zip_and_push_to_sb( + api, workflow_path, project_id, folder_name, + exclude_patterns: Optional[list] = None +): """ Create .zip package file. Upload .zip file to the designated folder for packages on SevenBridges Platform. Delete local .zip file. """ - zip_path = zip_directory(workflow_path) + zip_path = zip_directory(workflow_path, exclude_patterns) return push_zip(api, zip_path, project_id, folder_name) @@ -176,35 +40,55 @@ def update_timestamp(file_name): ) + f'_{time.strftime("%Y%m%d-%H%M%S")}' -def zip_directory(workflow_path): +def zip_directory(workflow_path, exclude_patterns: Optional[list] = None): """ This will create a temporary directory that will store all files from the original directory, except for the .git hidden directory. This dir sometimes collects a large amount of files that will not be used by the tool, and can increase the size of the archive up to 10 times. 
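+
+    If ``exclude_patterns`` is given, anything matching one of its globs (in
+    addition to ``DEFAULT_EXCLUDE_PATTERNS``) is also skipped, e.g.
+    ``exclude_patterns=['docs', '*.log']``.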
""" + if not exclude_patterns: + exclude_patterns = [] intermediary_dir = update_timestamp(os.path.abspath(workflow_path)) os.mkdir(intermediary_dir) for root, dirs, files in os.walk(workflow_path): - pattern = re.compile(r'(?:^|.*/)\.git(?:$|/.*)') - if re.match(pattern, root): - continue - - dirs = [d for d in dirs if not re.match(pattern, d)] for d in dirs: source_file = os.path.join(root, d) directory_path = os.path.join(intermediary_dir, os.path.relpath( source_file, workflow_path)) - if not os.path.exists(directory_path): - os.mkdir(directory_path) + + if any([ + fnmatch.fnmatch( + directory_path, os.path.join(intermediary_dir, pattern) + ) for pattern in exclude_patterns + DEFAULT_EXCLUDE_PATTERNS + ]): + continue + + try: + if not os.path.exists(directory_path): + os.mkdir(directory_path) + except FileNotFoundError: + """Skip folders that cannot be created""" + pass for file in files: source_file = os.path.join(root, file) dest_file = os.path.join(intermediary_dir, os.path.relpath( source_file, workflow_path)) - shutil.copy2(source_file, dest_file) + + if any([ + fnmatch.fnmatch( + dest_file, os.path.join(intermediary_dir, pattern) + ) for pattern in exclude_patterns + DEFAULT_EXCLUDE_PATTERNS + ]): + continue + + try: + shutil.copy2(source_file, dest_file) + except FileNotFoundError: + pass shutil.make_archive( intermediary_dir, @@ -261,175 +145,7 @@ def push_zip(api, zip_path, project_id, folder_name=None): return uploaded_file_id -def get_readme(path): - """ - Find readme file is there is one in the path folder - """ - for file in os.listdir(path): - if file.lower() == 'readme.md': - return os.path.join(path, file) - return None - - -def get_tower_yml(path): - """ - Find tower.yml file is there is one in the path folder - """ - for file in os.listdir(path): - if file.lower() == 'tower.yml': - return os.path.join(path, file) - return None - - -# Nextflow -def get_entrypoint(path): - """ - Auto find main.nf or similar file is there is one in the path folder. - """ - possible_paths = [] - for file in os.listdir(path): - if file.lower() == 'main.nf': - return file - - if file.lower().endswith('.nf'): - possible_paths.append(file) - - if len(possible_paths) > 1: - raise Exception( - 'Detected more than 1 nextflow file in the root of the ' - 'workflow-path. Please use `--entrypoint` to specify which script ' - 'you want to use as the workflow entrypoint') - elif len(possible_paths) == 1: - return possible_paths.pop() - else: - return None - - -# Nextflow -def get_latest_sb_schema(path): - """ - Auto find sb_nextflow_schema file. 
- """ - possible_paths = [] - for file in os.listdir(path): - result = re.findall( - fr"(.*{SB_SCHEMA_DEFAULT_NAME}\.?(\d+)?\.(ya?ml|json))", file - ) - if result: - result = result.pop(0) - if not result[1]: - prep = 0, result[0] - else: - prep = int(result[1]), result[0] - possible_paths.append(prep) - - if possible_paths: - latest = sorted(possible_paths).pop()[1] - sb_schema_path = os.path.join(path, latest) - print(f"Located latest sb_nextflow_schema at <{sb_schema_path}>") - return sb_schema_path - else: - return None - - -def get_executor_version(string): - result = re.findall( - r"\[Nextflow]\([^(]+(%E2%89%A5|%E2%89%A4|=|>|<)(\d{2}\.\d+\.\d+)[^)]+\)", - string - ) - if result: - sign, version = result.pop(0) - if sign == "%E2%89%A5": - sign = ">=" - elif sign == "%E2%89%A4": - sign = "<=" - elif not sign: - sign = "=" - - print(f"Identified nextflow executor version requirement {sign} {version}") - return sign, version - else: - return None, None - - -def get_sample_sheet_schema(path): - ss_path = os.path.join(path, "samplesheet_schema.yaml") - - if os.path.exists(ss_path): - print(f"Located latest sample sheet schema at <{ss_path}>") - return ss_path - else: - return None - - -def get_config_files(path): - """ - Auto find config files. - """ - paths = [] - for file in os.listdir(path): - if file.lower().endswith('.config'): - paths.append(os.path.join(path, file)) - return paths or None - - -def find_config_section(file_path: str, section: str) -> str: - section_text = "" - found_section = False - brackets = 0 - - with open(file_path, 'r') as file: - for line in file.readlines(): - if found_section: - section_text += line - brackets += line.count("{") - line.count("}") - - if brackets < 0: - break - - if re.findall(section + r'\s+\{', line): - section_text += "{\n" - found_section = True - - return section_text - - -def parse_config_file(file_path: str) -> dict: - profiles_text = find_config_section(file_path, 'profiles') - - # Extract profiles using regex - profiles = {} - block_pattern = re.compile( - r'\s*(\w+)\s*{([^}]+)}', re.MULTILINE | re.DOTALL - ) - key_val_pattern = re.compile( - r'([a-zA-Z._]+)(?:\s+|)=(?:\s+|)([^\s]+)' - ) - include_pattern = re.compile( - r'includeConfig\s+[\'\"]([a-zA-Z_.\\/]+)[\'\"]' - ) - - blocks = re.findall(block_pattern, profiles_text) - for name, content in blocks: - settings = dict(re.findall(key_val_pattern, content)) - profiles[name] = settings - include_path = re.findall(include_pattern, content) - if include_path: - profiles[name]['includeConfig'] = include_path - include_path = include_path.pop() - additional_path = os.path.join( - os.path.dirname(file_path), include_path) - params_text = find_config_section(additional_path, 'params') - params = dict(re.findall(key_val_pattern, params_text)) - for param, val in params.items(): - profiles[name][f"params.{param}"] = val - - # return currently returns includeConfig and settings, which are not used - # but could be used in the future versions of sbpack - return profiles - - -# Deprecated +# Deprecated - used only in WDL def update_schema_code_package(sb_schema, schema_ext, new_code_package): """ Update the package in the sb_schema diff --git a/sbpack/noncwl/wdl.py b/sbpack/noncwl/wdl.py index 43bea6c..948c1b0 100644 --- a/sbpack/noncwl/wdl.py +++ b/sbpack/noncwl/wdl.py @@ -9,7 +9,6 @@ from subprocess import check_call from sbpack.noncwl.utils import ( zip_and_push_to_sb, - get_readme, install_or_upgrade_app, update_schema_code_package, ) @@ -19,6 +18,9 @@ WRAPPER_REQUIREMENTS, 
GENERIC_WDL_OUTPUT_DIRECTORY, ) +from wrabbit.parser.utils import ( + get_readme +) logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) diff --git a/sbpack/noncwl/wrapper.py b/sbpack/noncwl/wrapper.py deleted file mode 100644 index 660ded9..0000000 --- a/sbpack/noncwl/wrapper.py +++ /dev/null @@ -1,222 +0,0 @@ -import logging -from sbpack.noncwl.constants import REMOVE_INPUT_KEY - - -class Wrapper: - inputs = list() - outputs = list() - app_content = dict() - class_ = None - cwl_version = None - arguments = None - requirements = None - hints = None - doc = None - revision_note = None - - def __init__(self): - pass - - def get_input(self, id_): - for inp in self.inputs: - if inp.get('id', '') == id_: - return inp - else: - logging.warning(f'Input with id <{id_}> not found.') - - def add_input(self, inp): - for input_ in self.inputs: - id_ = inp.get('id') - if input_.get('id') == id_: - # raise an exception or warning - logging.warning(f'Input with id <{id_}> already exists. ' - f'Skipping...') - - self.inputs.append(inp) - - def safe_add_input(self, inp): - all_input_ids = [i['id'] for i in self.inputs if 'id' in i] - input_id = inp.get('id') - temp_id = input_id - i = 0 - while temp_id in all_input_ids: - i += 1 - temp_id = f"{input_id}_{i}" - - inp['id'] = temp_id - self.add_input(inp) - - return inp - - def update_input(self, inp): - id_ = inp.get('id') - for input_ in self.inputs: - if input_['id'] == id_: - input_.update(inp) - for key in input_.copy(): - if input_[key] == REMOVE_INPUT_KEY: - input_.pop(key) - break - else: - raise KeyError( - f'Input with id <{id_}> not found.' - ) - - def get_output(self, id_): - for out in self.outputs: - if out.get('id', '') == id_: - return out - else: - logging.warning(f'Output with id <{id_}> not found.') - - def add_output(self, out): - for output in self.outputs: - id_ = out.get('id') - if output.get('id') == id_: - # raise an exception or warning - logging.warning(f'Output with id <{id_}> already exists. ' - f'Skipping...') - self.outputs.append(out) - - def safe_add_output(self, out): - all_output_ids = [o['id'] for o in self.outputs if 'id' in o] - output_id = out.get('id') - temp_id = output_id - i = 0 - while temp_id in all_output_ids: - i += 1 - temp_id = f"{output_id}_{i}" - - out['id'] = temp_id - self.add_output(out) - - return out - - def update_output(self, out): - id_ = out.get('id') - for output in self.outputs: - if output['id'] == id_: - output.update(out) - for key in output: - if output[key] == REMOVE_INPUT_KEY: - output.pop(key) - break - else: - raise KeyError( - f'Output with id <{id_}> not found.' 
- ) - - def add_requirement(self, requirement): - if not self.requirements: - self.requirements = list() - - for req in self.requirements: - if req['class'] == requirement['class']: - # check listings -> add missing -> break - if requirement['class'] == 'InitialWorkDirRequirement' and \ - 'listing' in requirement: - if 'listing' not in req: - req['listing'] = [] - req['listing'].extend(requirement['listing']) - break - else: - # add new class - self.requirements.append(requirement) - - def set_app_content( - self, code_package=None, entrypoint=None, executor_version=None, - **kwargs - ): - payload = dict() - - if code_package: - payload['code_package'] = code_package - if entrypoint: - payload['entrypoint'] = entrypoint - if executor_version: - payload['executor_version'] = executor_version - - self.app_content.update(payload) - - def add_argument(self, arg): - if not self.arguments: - self.arguments = list() - self.arguments.append(arg) - - def add_hint(self, hint): - if not self.hints: - self.hints = list() - self.hints.append(hint) - - def add_docs(self, doc): - self.doc = doc - - def add_revision_note(self, note): - self.revision_note = note - - def load(self, schema): - s_inputs = schema.get('inputs', []) - for input_ in s_inputs: - self.add_input(input_) - - s_outputs = schema.get('outputs', []) - for output in s_outputs: - self.add_output(output) - - s_app_content = schema.get('app_content', dict()) - self.set_app_content(**s_app_content) - - self.class_ = schema.get('class', None) - self.cwl_version = schema.get('cwlVersion', None) - - s_arguments = schema.get('arguments', []) - for argument in s_arguments: - self.add_argument(argument) - - s_requirements = schema.get('requirements', []) - for requirement in s_requirements: - self.add_requirement(requirement) - - s_hints = schema.get('hints', []) - for hint in s_hints: - self.add_hint(hint) - - s_doc = schema.get('doc', None) - if s_doc: - self.add_docs(s_doc) - - s_revision_note = schema.get('sbg:revisionNote', None) - if s_revision_note: - self.add_revision_note(s_revision_note) - - def dump(self): - wrapper = dict() - - if self.app_content: - wrapper['app_content'] = self.app_content - - if self.doc: - wrapper['doc'] = self.doc - - wrapper['inputs'] = self.inputs - wrapper['outputs'] = self.outputs - - if self.arguments: - wrapper['arguments'] = self.arguments - - if self.class_: - wrapper['class'] = self.class_ - - if self.cwl_version: - wrapper['cwlVersion'] = self.cwl_version - - if self.requirements: - wrapper['requirements'] = self.requirements - - if self.hints: - wrapper['hints'] = self.hints - - if self.revision_note: - wrapper['sbg:revisionNotes'] = self.revision_note - - return wrapper diff --git a/sbpack/version.py b/sbpack/version.py index 52cc802..39ee4f9 100644 --- a/sbpack/version.py +++ b/sbpack/version.py @@ -1 +1 @@ -__version__ = "2024.2.2rc1" +__version__ = "2024.5.7rc1" diff --git a/setup.py b/setup.py index c5b3743..6d04ab6 100644 --- a/setup.py +++ b/setup.py @@ -29,6 +29,7 @@ 'sbpack_nf = sbpack.noncwl.nextflow:main', 'sbpack_wdl = sbpack.noncwl.wdl:main', 'sbcopy = sbpack.noncwl.copy_app:main', + 'sbmanifest = sbpack.noncwl.manifest:main', ], }, author='Seven Bridges',