Skip to content

Commit

Permalink
Merge pull request #48 from broadinstitute/develop
Browse files Browse the repository at this point in the history
Releasing SS2 and Optimus for HCA update
  • Loading branch information
nikellepetrillo authored Aug 26, 2020
2 parents 9692363 + f221e75 commit 0281d46
Show file tree
Hide file tree
Showing 111 changed files with 1,707 additions and 424 deletions.
20 changes: 10 additions & 10 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -104,22 +104,22 @@ jobs:
./tests/skylab/trigger_test.sh bulk_rna_pipeline
no_output_timeout: 1.5h

test_snap_atac:
test_sc_atac:
machine: true
steps:
- checkout
- run:
command: |
./tests/skylab/trigger_test.sh snap_atac
./tests/skylab/trigger_test.sh scATAC
no_output_timeout: 1.5h

test_sc_atac:
test_atac:
machine: true
steps:
- checkout
- run:
command: |
./tests/skylab/trigger_test.sh sc_atac
./tests/skylab/trigger_test.sh ATAC
no_output_timeout: 3.0h

workflows:
Expand All @@ -130,13 +130,13 @@ workflows:
- test_optimus_snrna
# - test_emptyDropsWrapper
- test_optimus_mouse
# - test_smartseq2
# - test_smartseq2_single_end
- test_smartseq2
- test_smartseq2_single_end
# - test_npz2rds
# - test_snap_atac
# - test_sc_atac
- test_sc_atac
- test_atac
# - test_bulk_rna
# - test_optimus_v3
# - test_smartseq2_multisample
# - test_smartseq2_multisample_single_end
- test_smartseq2_multisample
- test_smartseq2_multisample_single_end

3 changes: 3 additions & 0 deletions .dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,6 @@ workflows:
- name: IlluminaGenotypingArray
subclass: WDL
primaryDescriptorPath: /pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.wdl
- name: scATAC
subclass: WDL
primaryDescriptorPath: /pipelines/skylab/scATAC/scATAC.wdl
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ workflow ATAC {

# Output prefix/base name for all intermediate files and pipeline outputs
String output_base_name

String bin_size_list = "10000"
}

parameter_meta {
Expand All @@ -50,6 +52,7 @@ workflow ATAC {
min_map_quality: "the minimum mapping quality to be filtered by samtools view and snap-pre (snaptools task)"
max_fragment_length: "the maximum fragment length for filtering out reads by gatk and snap-pre (snaptools task)"
output_base_name: "base name to be used for the pipelines output and intermediate files"
bin_size_list: "space separated list of bins to generate"
}

call TrimAdapters {
Expand Down Expand Up @@ -136,13 +139,14 @@ workflow ATAC {

call SnapCellByBin {
input:
snap_input=SnapPre.snap_file_output,
bin_size_list = "10000"
snap_input = SnapPre.snap_file_output,
bin_size_list = bin_size_list
}

call BreakoutSnap {
input:
snap_input = SnapCellByBin.snap_output
snap_input = SnapCellByBin.snap_output,
bin_size_list = bin_size_list
}

output {
Expand Down Expand Up @@ -703,6 +707,7 @@ task BreakoutSnap {
input {
File snap_input
String docker_image = "quay.io/humancellatlas/snap-breakout:0.0.1"
String bin_size_list
}
Int num_threads = 1
Float input_size = size(snap_input, "GiB")
Expand All @@ -715,9 +720,9 @@ task BreakoutSnap {
output {
File barcodes = 'output/barcodes.csv'
File fragments = 'output/fragments.csv'
File binCoordinates = 'output/binCoordinates_10000.csv'
File binCounts = 'output/binCounts_10000.csv'
File barcodesSection = 'output/barcodesSection.csv'
File binCoordinates = 'output/binCoordinates_~{bin_size_list}.csv'
File binCounts = 'output/binCounts_~{bin_size_list}.csv'
File barcodesSection = 'output/barcodesSection.csv'
}
runtime {
docker: docker_image
Expand Down
File renamed without changes.
2 changes: 1 addition & 1 deletion dockers/skylab/loom-output/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,5 @@ WORKDIR /tools

COPY create_loom_optimus.py .
COPY create_loom_ss2.py .
COPY ss2_loom_merge.py .
COPY loomCompare.py .
COPY ss2_loom_merge.py .
45 changes: 37 additions & 8 deletions dockers/skylab/loom-output/create_loom_optimus.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,7 @@
from scipy import sparse
import pandas as pd
import scipy as sc
import logging

logging.basicConfig(level=logging.INFO)

def create_gene_id_name_map(gtf_file):
""" Creates a map from gene_id to gene_name by reading in the GTF file
Expand Down Expand Up @@ -331,7 +329,7 @@ def generate_matrix(args):

def create_loom_files(args):
"""This function creates the loom file or folder structure in output_loom_path in format file_format,
with sample_id from the input folder analysis_output_path
with input_id from the input folder analysis_output_path
Args:
args (argparse.Namespace): input arguments for the run
Expand All @@ -352,8 +350,14 @@ def create_loom_files(args):
attrDict = dict()
attrDict['expression_data_type'] = args.expression_data_type
attrDict['optimus_output_schema_version'] = version
attrDict['sample_id'] = args.sample_id

attrDict['input_id'] = args.input_id
if args.input_name is not None:
attrDict['input_name'] = args.input_name
if args.input_id_metadata_field is not None:
attrDict['input_id_metadata_field'] = args.input_id_metadata_field
if args.input_name_metadata_field is not None:
attrDict['input_name_metadata_field'] = args.input_name_metadata_field
attrDict['pipeline_version'] = args.pipeline_version
#generate loom file
loompy.create(args.output_loom_path, expr_sp_t, row_attrs, col_attrs, file_attrs=attrDict)

Expand Down Expand Up @@ -430,12 +434,30 @@ def main():
)

parser.add_argument(
"--sample_id",
dest="sample_id",
default="Unknown sample",
"--input_id",
dest="input_id",
required=True,
help="the sample name in the bundle",
)

parser.add_argument(
"--input_name",
dest="input_name",
help= "sequencing_input.biomaterial_core.biomaterial_id in HCA metadata, defined by the user",
)

parser.add_argument(
"--input_id_metadata_field",
dest="input_id_metadata_field",
help= "sequencing_process.provenance.document_id: [UUID] defined by the user",
)

parser.add_argument(
"--input_name_metadata_field",
dest="input_name_metadata_field",
help= "sequencing_input.biomaterial_core.biomaterial_id defined by the user",
)

parser.add_argument(
"--verbose",
dest="verbose",
Expand All @@ -451,6 +473,13 @@ def main():
help="The expression data type",
)

parser.add_argument(
"--pipeline_version",
dest="pipeline_version",
required=True,
help="The version of Optimus that generated data",
)

args = parser.parse_args()

create_loom_files(args)
Expand Down
64 changes: 48 additions & 16 deletions dockers/skylab/loom-output/create_loom_ss2.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@
import scipy as sc
import loompy

def generate_col_attr(qc_paths):
def generate_col_attr(args):
"""Collects the Smart-seq2 QC values into the column attributes of the loom file
Args:
args (argparse.Namespace): parsed command-line arguments; args.qc_files must include a path ending in "_QCs.csv"
"""
# read the QC values
qc_path = [p for p in qc_paths if p.endswith("_QCs.csv")][0]
qc_path = [p for p in args.qc_files if p.endswith("_QCs.csv")][0]
with open(qc_path, 'r') as f:
qc_values = [row for row in csv.reader(f)]

Expand Down Expand Up @@ -49,7 +49,12 @@ def generate_col_attr(qc_paths):
# Column attributes
col_attrs = dict()
col_attrs["cell_names"] = [cell_id]


if args.input_id_metadata_field:
col_attrs["input_id_metadata_field"] = args.input_id_metadata_field
if args.input_name_metadata_field:
col_attrs["input_name_metadata_field"] = args.input_name_metadata_field

numeric_field_names = np.array(sorted_numeric_labels[:])
for i in range(0, numeric_field_names.shape[0]):
name = numeric_field_names[i]
Expand Down Expand Up @@ -107,30 +112,32 @@ def generate_row_attr_and_matrix(rsem_gene_results_path):
return row_attrs, expression_tpms,expected_counts


def create_loom_files(sample_id, qc_files, rsem_genes_results_file,
output_loom_path):
def create_loom_files(args):
"""This function creates the loom file or folder structure in output_loom_path in
format file_format, with sample_id from the input folder analysis_output_path
format file_format, with input_id from the input folder analysis_output_path
Args:
sample_id (str): sample or cell id
args (argparse.Namespace): parsed command-line arguments; the fields read here include
input_id (str): sample or cell id
qc_files (list of str): paths of the files in the QCGroup of the SS2 pipeline output
rsem_genes_results_file (str): the file for the expression count
output_loom_path (str): location of the output loom
"""
# generate a dictionarty of column attributes
col_attrs = generate_col_attr(qc_files)
# generate a dictionary of column attributes
col_attrs = generate_col_attr(args)

# add the expression count matrix data
# generate a dictionary of row attributes
row_attrs, expr_tpms, expr_counts = generate_row_attr_and_matrix(rsem_genes_results_file)
row_attrs, expr_tpms, expr_counts = generate_row_attr_and_matrix(args.rsem_genes_results_file)

attrDict = dict()
attrDict['sample_id'] = sample_id
attrDict['input_id'] = args.input_id
if args.input_name is not None:
attrDict['input_name'] = args.input_name
attrDict['pipeline_version'] = args.pipeline_version

#generate loom file
loompy.create(output_loom_path, expr_tpms, row_attrs, col_attrs, file_attrs=attrDict)
ds = loompy.connect(output_loom_path)
loompy.create(args.output_loom_path, expr_tpms, row_attrs, col_attrs, file_attrs=attrDict)
ds = loompy.connect(args.output_loom_path)
ds.layers['estimated_counts'] = expr_counts
ds.close()

Expand All @@ -142,23 +149,48 @@ def main():

parser = argparse.ArgumentParser(description=description)
parser.add_argument('--qc_files',
dest="qc_files",
nargs = "+",
help=('the grouped QC files from the GroupQCOutputs task of SS2 '
'Single Sample workflow'))

parser.add_argument('--rsem_genes_results',
dest="rsem_genes_results_file",
help='path to the folder containing the files to be added to the loom')

parser.add_argument('--output_loom_path',
dest="output_loom_path",
help='path where the loom file is to be created')

parser.add_argument('--sample_id',
default="Unknown sample",
parser.add_argument('--input_id',
dest="input_id",
help='the sample name in the bundle')

parser.add_argument(
"--input_name",
dest="input_name",
help= "sequencing_input.biomaterial_core.biomaterial_id in HCA metadata, defined by the user",
)

parser.add_argument(
"--input_id_metadata_field",
dest="input_id_metadata_field",
help= "sequencing_process.provenance.document_id: [UUID] defined by the user",
)

parser.add_argument(
"--input_name_metadata_field",
dest="input_name_metadata_field",
help= "sequencing_input.biomaterial_core.biomaterial_id defined by the user",
)

parser.add_argument('--pipeline_version',
default="Unknown sample",
help='the version of SS2 used to generate data')

args = parser.parse_args()

create_loom_files(args.sample_id, args.qc_files, args.rsem_genes_results, args.output_loom_path)
create_loom_files(args)


if __name__ == '__main__':
Expand Down
19 changes: 15 additions & 4 deletions dockers/skylab/loom-output/ss2_loom_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,29 @@ def main():
dest='output_loom_file',
required=True,
help="Path to output loom file")
parser.add_argument('--plate-sample-id',
dest='plate_sample_id',
parser.add_argument('--batch_id',
dest='batch_id',
required=True,
help="Plate sample id for output loom")
help="Batch id for output loom")
parser.add_argument('--batch_name',
dest='batch_name',
help='User provided plate id for output loom')
parser.add_argument('--pipeline_version',
dest='pipeline_version',
required=True,
help='Multisample SS2 version')
args = parser.parse_args()

# The list of Loom files that we need to merge

loom_file_list = args.input_loom_files

attrDict = dict()
attrDict['sample_id'] = args.plate_sample_id
attrDict['batch_id'] = args.batch_id
attrDict['pipeline_version'] = args.pipeline_version
if args.batch_name is not None:
attrDict['batch_name'] = args.batch_name

loompy.combine(loom_file_list,output_file=args.output_loom_file,file_attrs = attrDict)

if __name__ == '__main__':
Expand Down
Loading

0 comments on commit 0281d46

Please sign in to comment.