Skip to content

Commit

Permalink
Merge pull request #67 from oicr-gsi/rjdev
Browse files Browse the repository at this point in the history
Rjdev
  • Loading branch information
rjovelin authored May 30, 2024
2 parents 8c8a867 + 3789f05 commit a4c1372
Show file tree
Hide file tree
Showing 2 changed files with 114 additions and 10 deletions.
18 changes: 16 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,17 @@ Path to the sample provenance

Tag released files in Nabu.

example usage: ```dare mark -u USER -st pass -rn DIRECTORY -c COMMENT -pr PROJECT```
There are different ways to mark files in Nabu.
For fastq files typically organized by run folder, one possibility is to point to the run directory containing the file links with the following command:

```dare mark -u USER -st pass -rn DIRECTORY -c TICKET -pr PROJECT```

For analysis pipelines, it is still possible to point to a directory containing links provided that links are not in sub-folders.
However, it is more convenient to mark files using the same json structure that was used to link the files (see above section).

```dare mark -u USER -st pass -c TICKET -pr PROJECT -a JSON_FILE```



Parameters

Expand All @@ -330,8 +340,12 @@ Parameters
| -w | Workflow used to generate the output files | optional |
| -px | Prefix to file paths if FPR contains relative paths | optional |
| --exclude_miseq | Exclude miseq runs | optional |
| -a | Nabu api | default |
| -n | Nabu api | default |
| -fpr | Path to FPR | default |
| -e | Files with libraries tagged for non-release | optional |
| -f | File with file names to be released | optional |
| -a | Files with hierarchical structure storing sample and workflow ids | optional |



## Generating a batch release report with dare.py ##
Expand Down
106 changes: 98 additions & 8 deletions dare.py
Original file line number Diff line number Diff line change
Expand Up @@ -3633,6 +3633,60 @@ def change_nabu_status(api, file_swid, qc_status, user_name, comment=None):
print('Could not update {0} status. Nabu response code: {1}'.format(file_swid, response.status_code))





def get_pipeline_swids(data_structure, files):
    '''
    (dict, dict) -> list

    Returns a list of unique file swids for each sample and workflow specified
    in the data_structure. The order of the swids in the list is not defined

    Parameters
    ----------
    - data_structure (dict): Dictionary with samples, workflows and workflow_run_id hierarchical structure.
                             First level keys are compared to the file sample_name,
                             second level keys are sample ids, third level keys are workflow names
                             and values are lists of dictionaries with keys workflow_version and workflow_id
                             and optionally one of extension, files or rename_files
    - files (dict): Dictionary with all file records for a given project extracted from FPR.
                    Keys are file swids. Each record is read for sample_name, workflow,
                    workflow_version, workflow_run_id and file_path
    '''

    swids = set()

    # go through the file records only once: the donor is looked up directly
    # in the data structure instead of scanning all files for each donor and sample
    for file_swid, record in files.items():
        donor = record['sample_name']
        if donor not in data_structure:
            continue

        workflow = record['workflow']
        version = record['workflow_version']
        wf_id = record['workflow_run_id']

        for sample_id, sample_workflows in data_structure[donor].items():
            # the sample id must contain the donor and list the workflow of the file
            if donor not in sample_id or workflow not in sample_workflows:
                continue
            for d in sample_workflows[workflow]:
                # only consider the workflow run that generated the file
                if version != d['workflow_version'] or wf_id != d['workflow_id']:
                    continue
                # check which files are collected
                if 'extension' in d:
                    # files are collected based on file extension
                    # NOTE(review): presumably d['extension'] is a list of extensions - confirm.
                    # a plain string would turn this into a substring test
                    collected = pathlib.Path(record['file_path']).suffix in d['extension']
                elif 'files' in d:
                    # files are collected based on file name
                    file_names = {os.path.basename(i) for i in d['files']}
                    collected = os.path.basename(record['file_path']) in file_names
                elif 'rename_files' in d:
                    # files are collected based on file name and renamed
                    file_names = {os.path.basename(i['file_path']) for i in d['rename_files']}
                    collected = os.path.basename(record['file_path']) in file_names
                else:
                    # no filter is specified, all files of the workflow run are collected
                    collected = True
                if collected:
                    swids.add(file_swid)

    return list(swids)



def mark_files_nabu(args):
'''
(str, str, str) -> None
Expand All @@ -3649,22 +3703,40 @@ def mark_files_nabu(args):
- user (str): User name to appear in Nabu for each released or whitheld file
- comment (str): A comment to used to tag the file. For instance the Jira ticket
- provenance (str): Path to File Provenance Report
- run_directory (str): Directory with links organized by project and run in gsi space
- runs (list): List of run IDs
- libraries (str): Path to file with libraries tagged for release
- workflow (str): Worflow used to generate the output files
- nomiseq (bool): Exclude MiSeq runs if True
- prefix (str): Use of prefix assumes that FPR containes relative paths.
Prefix is added to the relative paths in FPR to determine the full file paths
- exclude (str): Path to file with libraries tagged for non-release
- release_files (str): File with file names to be released
- nabu (str): URL of the Nabu API. Default is https://nabu-prod.gsi.oicr.on.ca
- provenance (str): Path to File Provenance Report. Default is /scratch2/groups/gsi/production/vidarr/vidarr_files_report_latest.tsv.gz
- analysis (str): Path to the file with hierarchical structure storing sample and workflow ids
'''

# check valid combinations of parameters
if args.runs and args.libraries:
sys.exit('-r and -l are exclusive parameters')
if args.run_directory and (args.workflow or args.runs or args.libraries or args.release_files or args.exclude or args.prefix or args.nomiseq):
sys.exit('-rn cannot be used with options -w, -r, -l, -f, -e, -px or --exclude_miseq')
if all(map(lambda x: x is None, [args.workflow, args.runs, args.libraries, args.release_files, args.exclude, args.prefix, args.nomiseq])) and args.run_directory is None:
sys.exit('Please provide the path to a run folders')
if args.workflow is None and args.run_directory is None:
sys.exit('Please provide the path to a run folder or a workflow')

if args.analysis is None:
if all(map(lambda x: x is None, [args.workflow, args.runs, args.libraries, args.release_files, args.exclude, args.prefix, args.nomiseq])) and args.run_directory is None:
sys.exit('Please provide the path to a run folders')
if args.workflow is None and args.run_directory is None:
sys.exit('Please provide the path to a run folder or a workflow')
else:
if args.runs or args.libraries or args.run_directory or args.workflow or args.release_files or args.exclude or args.prefix or args.nomiseq:
sys.exit('''--analysis cannot be used with options -r, -l, -rn, -w, -f, -e, -px or --exclude_miseq
You are attempting to mark analysis pipeline files. Provide the same json structure used to link the files''')

# dereference link to FPR
provenance = os.path.realpath(args.provenance)

if args.run_directory:
# get the swids of the files linked in run directory
# check directory
if os.path.isdir(args.run_directory) == False:
sys.exit('{0} is not a valid directory'.format(args.run_directory))
Expand All @@ -3683,16 +3755,33 @@ def mark_files_nabu(args):
if no_swids:
for file in no_swids:
print('File {0} in directory {1} does not have a swid'.format(os.path.basename(file), args.run_directory))
elif args.analysis:
# get the file swids from the json structure
infile = open(args.analysis)
data_structure = json.load(infile)
infile.close()

# parse FPR records
# make a list of workflows
workflows = []
for i in data_structure:
for j in data_structure[i]:
workflows.extend(list(data_structure[i][j].keys()))
workflows = list(set(workflows))
print('workflows', workflows)
files = parse_fpr_records(provenance, args.project, workflows, args.prefix)
print('Extracted files from File Provenance Report')
swids = get_pipeline_swids(data_structure, files)
else:
# collect relevant information from File Provenance Report about fastqs for project
files = parse_fpr_records(provenance, args.project, [args.workflow], args.prefix)
print('Information was extracted from FPR {0}'.format(provenance))
released_files, _ = collect_files_for_release(files, args.release_files, args.nomiseq, args.runs, args.libraries, args.exclude)
swids = list(released_files.keys())

# mark files il nabu
for i in swids:
change_nabu_status(args.api, i, args.status.upper(), args.user, comment=args.comment)
change_nabu_status(args.nabu, i, args.status.upper(), args.user, comment=args.comment)


if __name__ == '__main__':
Expand Down Expand Up @@ -3749,8 +3838,9 @@ def mark_files_nabu(args):
n_parser.add_argument('-e', '--exclude', dest='exclude', help='File with libraries tagged for non-release. The first column is always the library. The optional second column is the run id')
n_parser.add_argument('-f', '--files', dest='release_files', help='File with file names to be released')
n_parser.add_argument('-c', '--comment', dest='comment', help='Comment to be added to the released file')
n_parser.add_argument('-a', '--api', dest='api', default='https://nabu-prod.gsi.oicr.on.ca', help='URL of the Nabu API. Default is https://nabu-prod.gsi.oicr.on.ca')
n_parser.add_argument('-n', '--nabu', dest='nabu', default='https://nabu-prod.gsi.oicr.on.ca', help='URL of the Nabu API. Default is https://nabu-prod.gsi.oicr.on.ca')
n_parser.add_argument('-fpr', '--provenance', dest='provenance', default='/scratch2/groups/gsi/production/vidarr/vidarr_files_report_latest.tsv.gz', help='Path to File Provenance Report. Default is /scratch2/groups/gsi/production/vidarr/vidarr_files_report_latest.tsv.gz')
n_parser.add_argument('-a', '--analysis', dest='analysis', help='Path to the file with hierarchical structure storing sample and workflow ids')
n_parser.set_defaults(func=mark_files_nabu)

# write a report
Expand Down

0 comments on commit a4c1372

Please sign in to comment.