Merge pull request #42 from ctg-lund/add-docs

Add docs
ctg-lund · Sep 20, 2023 · 2c270d0 · 2c270d0
2 parents 251b90d + 5457d16
commit 2c270d0
Show file tree

Hide file tree

Showing 20 changed files with 269 additions and 148 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -4,7 +4,7 @@ on:
     branches:
     - master
 jobs:
-  example:
+  stub-run:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3

diff --git a/bin/countmetric2mqc.py b/bin/countmetric2mqc.py
@@ -2,17 +2,17 @@
 
 # Check if input file name was provided
 if len(sys.argv) < 3:
-    print('Usage: python countmetric2mqc.py file.csv sample_name output_mqc.yaml')
+    print("Usage: python countmetric2mqc.py file.csv sample_name output_mqc.yaml")
     sys.exit(1)
 
 # Get input file name from command-line argument
 input_file = sys.argv[1]
 sample_name = sys.argv[2]
 mqc_yaml = sys.argv[3]
 # Initialize dictionaries for each category
-with open(input_file, 'r') as file:
-    keys, values = file.readline().split(','), file.readline().split(',')
-    data = {k.strip():v.strip() for (k,v) in zip(keys,values)}
+with open(input_file, "r") as file:
+    keys, values = file.readline().split(","), file.readline().split(",")
+    data = {k.strip(): v.strip() for (k, v) in zip(keys, values)}
 # Appends to an already existing mqc.yaml file
-with open(mqc_yaml, 'a') as file:
-    file.write(f'  {sample_name}: {data}\n')
+with open(mqc_yaml, "a") as file:
+    file.write(f"  {sample_name}: {data}\n")
diff --git a/bin/multimetric2mqc.py b/bin/multimetric2mqc.py
@@ -4,7 +4,7 @@
 
 # Check if input file name was provided
 if len(sys.argv) < 3:
-    print('Usage: python convert.py file.csv sample_name')
+    print("Usage: python convert.py file.csv sample_name")
     sys.exit(1)
 
 # Get input file name from command-line argument
@@ -16,7 +16,7 @@
 other = {}
 
 # Open CSV file
-with open(input_file, 'r') as f:
+with open(input_file, "r") as f:
     # Create CSV reader
     reader = csv.reader(f)
 
@@ -29,15 +29,15 @@
         category, library_type, grouped_by, group_name, metric_name, metric_value = row
 
         # Add metric_name and metric_value to appropriate dictionary based on category
-        if category == 'Cells':
-            cells[library_type+'_'+metric_name] = metric_value
-        elif category == 'Library':
-            library[library_type+'_'+metric_name] = metric_value
+        if category == "Cells":
+            cells[library_type + "_" + metric_name] = metric_value
+        elif category == "Library":
+            library[library_type + "_" + metric_name] = metric_value
         else:
             other[metric_name] = metric_value
 
 # Write dictionaries to JSON files
-with open('{}_cells.json'.format(sample_name), 'w') as f:
+with open("{}_cells.json".format(sample_name), "w") as f:
     json.dump(cells, f)
-with open('{}_library.json'.format(sample_name), 'w') as f:
+with open("{}_library.json".format(sample_name), "w") as f:
     json.dump(library, f)
diff --git a/docs/10X-genomics/citeseq.md b/docs/10X-genomics/citeseq.md
@@ -0,0 +1,91 @@
+# Introduction
+This guide describes how to process data from the CiteSeq protocol. CiteSeq uses antibodies to tag your cells, which can help the researcher detect things such as cell surface proteins or pool multiple cell types together (Antibody Derived Tags (ADTs) or HashTag Oligonucleotides (HTOs), respectively).
+
+At its most complicated, it consists of 3 modalities (Gene Expression + ADTs + HTOs). This page aims to describe how we process it here at CTG.
+
+## Feature reference (ADT)
+
+The feature reference should be provided by the lab which did the experiments. How a feature reference is constructed is [described here.](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/feature-bc-analysis)
+
+The feature reference is the file which describes the following things for each ADT:
+* id : Unique ID used to track feature counts. May only include ASCII characters and must not use whitespace, slash, quote, or comma characters. Each ID must be unique and must not collide with a gene identifier from the transcriptome
+* name : Human-readable name for this feature. May only include ASCII characters and must not use whitespace, slash, quote, or comma characters. This name will be displayed in the Loupe Browser Active Feature list
+* read : Specifies which RNA sequencing read contains the Feature Barcode sequence. Must be R1 or R2. Note: in most cases R2 is the correct read
+* pattern : Specifies how to extract the Feature Barcode sequence from the read
+* sequence : Nucleotide barcode sequence associated with this feature. E.g., antibody barcode or sgRNA protospacer sequence. 
+* feature_type : Type of the feature. See the Library/Feature Types section for details on the allowed values for this field. FASTQ data specified in the Library CSV file with a library_type that matches the feature_type will be scanned for occurrences of this feature. Each feature type in the feature reference must match a library_type entry in the Libraries CSV file. This field is case-sensitive.
+
+Example:
+
+|id   |name |read|pattern|sequence       |feature_type        |
+|-----|-----|----|-------|---------------|--------------------|
+|A0001|Ms.CD4|R2  |5P(BC) |AACAAGACCCTTGAG|Antibody Capture    |
+|A0002|Ms.CD8a|R2  |5P(BC) |TACCCGTAATAGCGT|Antibody Capture    |
+|A0003|Ms.CD366|R2  |5P(BC) |ATTGGCACTCAGATG|Antibody Capture    |
+|A0004|Ms.CD279|R2  |5P(BC) |GAAAGTCAAAGCACT|Antibody Capture    |
+|A0013|Ms.Ly.6C|R2  |5P(BC) |AAGTCGTGAGGCATG|Antibody Capture    |
+|A0014|HuMs.CD11b|R2  |5P(BC) |TGAAGGCTCATTTGT|Antibody Capture    |
+|A0015|Ms.Ly.6G|R2  |5P(BC) |ACATTGACGCAACTA|Antibody Capture    |
+|A0070|HuMs.CD49f|R2  |5P(BC) |TTCCGAGGATGATCT|Antibody Capture    |
+|A0073|HuMs.CD44|R2  |5P(BC) |TGGCTTCAGGTCCTA|Antibody Capture    |
+
+## CMO-Set (HTOs)
+If you want to use cellranger multi to demultiplex the HTOs, write a CMO file like this. Otherwise they can be specified as ADTs and demultiplexed with Seurat's HTODemux.
+CMO_reference file:
+
+|id   |name |read|pattern|sequence       |feature_type        |
+|-----|-----|----|-------|---------------|--------------------|
+|HTO1 |HTO1 |R2  |5P(BC) |ACCCACCAGTAAGAC|Multiplexing Capture|
+|HTO2 |HTO2 |R2  |5P(BC) |GGTCGAGAGCATTCA|Multiplexing Capture|
+|HTO3 |HTO3 |R2  |5P(BC) |CTTGCCGCATGTCAT|Multiplexing Capture|
+|HTO6 |HTO6 |R2  |5P(BC) |TATGCTGCCACGGTA|Multiplexing Capture|
+|HTO11|HTO11|R2  |5P(BC) |GCTTACCGAATTAAC|Multiplexing Capture|
+|HTO12|HTO12|R2  |5P(BC) |CTGCAAATATAACGG|Multiplexing Capture|
+
+## Config 
+The csv file which cellranger multi takes as input
+```yaml
+[gene-expression]
+reference,/path/to/refdata-gex-mm10-2020-A
+cmo-set,/path/to/CMO_reference.csv
+
+[feature]
+reference,/path/to/HB_feature_ref.csv
+
+[libraries]
+fastq_id,fastqs,feature_types
+230316_HB_1,/path/to/0_fastq,Gene Expression
+230316_HB_1_ADT,/path/to/0_fastq,Antibody Capture
+230316_HB_1_HTO,/path/to/0_fastq,Multiplexing Capture
+
+[samples]
+sample_id,cmo_ids
+sample1,HTO1
+sample2,HTO2
+sample3,HTO3
+sample4,HTO6
+sample5,HTO11
+sample6,HTO12
+```
+How to create a CMO-set is described in more [detail here.](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/multi)
+## Cellranger multi for HTO + anything else
+`singularity run --bind /root/ /path/to/cellranger.simg cellranger multi --id=230316_BM --csv=/path/to/BM_config.csv`
+## ADT + GEX only
+libraries.csv:
+```
+fastqs,sample,library_type
+/path/to/fastq/GEX_Sample,GEX_Sample,Gene Expression
+/path/to/fastq/ADT_Sample,GEX_Sample,Antibody Capture
+```
+command for running:
+```
+singularity run --bind /projects/ /path/to/cellranger.simg cellranger count \
+	--id=Ram_014 \
+	--libraries=/path/to/libraries.csv \
+	--transcriptome=/path/to/refdata-gex-GRCh38-2020-A \
+	--feature-ref=/path/to/TotalSeq_A_Human_Universal_Cocktail_V1_399907_Antibody_reference_UMI_counting_CellRanger.csv \
+	--localmem=140 \
+	--jobmode=local \
+	--localcores=24
+```
+	
diff --git a/docs/10X-genomics/flex.md b/docs/10X-genomics/flex.md
@@ -0,0 +1,50 @@
+# Basic run instructions:
+The fixation uses cellranger multi. Example:
+```
+singularity run --bind /projects/ /path/to/cellranger.simg cellranger multi \
+--id=sample_id \
+--csv=/path/to/config.csv
+```
+
+# Config files:
+For single plexed fixation samples:
+config.csv
+```
+[gene-expression]
+reference,/path/to/refdata-gex-GRCh38-2020-A
+probe-set,/path/to/Chromium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A.csv
+
+[libraries]
+fastq_id,fastqs,feature_types
+sample_01_fix,/path/to/fastq,Gene Expression
+```
+For multiplexed samples:
+config.csv
+```
+[gene-expression]
+reference,/path/to/refdata-gex-GRCh38-2020-A
+probe-set,/path/to/Chromium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A.csv
+
+[libraries]
+fastq_id,fastqs,feature_types
+221117_Milladur,/path/to/fastq/sample_id/,Gene Expression
+
+[samples]
+sample_id,probe_barcode_ids
+sample1,BC001
+sample2,BC002
+sample3,BC003
+sample4,BC004
+sample5,BC005
+sample6,BC006
+sample7,BC007
+sample8,BC008
+sample9,BC009
+sample10,BC010
+sample11,BC011
+sample12,BC012
+sample13,BC013
+sample14,BC014
+sample15,BC015
+
+```
diff --git a/docs/10X-genomics/vdj.md b/docs/10X-genomics/vdj.md
@@ -0,0 +1,32 @@
+# VDJ only
+```
+cellranger vdj --id=sample345 \
+                 --reference=/opt/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.1.0 \
+                 --fastqs=/home/jdoe/runs/HAWT7ADXX/outs/fastq_path
+```
+
+
+# VDJ + GEX
+You will need to construct a config.csv file like
+```
+[gene-expression]
+reference,/path/to/transcriptome
+
+[vdj]
+reference,/path/to/vdj_reference
+
+[libraries]
+fastq_id,fastqs,feature_types
+GEX_fastqs_id,/path/to/GEX_fastqs,Gene Expression
+VDJ_B_fastqs_id,/path/to/vdj_B_fastqs,VDJ-B
+VDJ_T_fastqs_id,/path/to/vdj_T_fastqs,VDJ-T
+```
+Where VDJ-T/B is set manually as opposed to cellranger vdj where it autodetects Antigens.
+Then run with:
+```
+cellranger multi --id=<sample-id> --csv=/path/to/config.csv
+```
+# Reference names
+I haven't changed the names of the standard references provided by 10X. These are the names:
+* Human: refdata-cellranger-vdj-GRCh38-alts-ensembl-7.1.0
+* Mouse: refdata-cellranger-vdj-GRCm38-alts-ensembl-7.0.0
diff --git a/docs/10X-genomics/visium.md b/docs/10X-genomics/visium.md
@@ -0,0 +1,48 @@
+# Introduction
+This is a document describing how to run spaceranger. The document will be continuously updated while the pipeline matures.
+
+# Spaceranger Count
+**Required input files**
+* Fastq-files
+* Cytassist image
+* Microscope image
+* Slide area
+* Slide .gpr file (This will need to be downloaded)
+
+
+**Count sbatch script:**
+```bash
+#!/bin/bash -ue
+#SBATCH -c 24
+#SBATCH -t 12:00:00
+#SBATCH --mem 124G
+#SBATCH -J sample
+#SBATCH -o /path/to/out/sample.out
+#SBATCH -e /path/to/err/sample.err
+
+singularity run --bind /projects/ \
+	/path/to/spaceranger.simg \
+		spaceranger count \
+			--id="Visium_FFPE_Human" \
+			--transcriptome=/path/to/refdata-gex-GRCh38-2020-A \
+			--probe-set=/path/to/Visium_Human_Transcriptome_Probe_Set_v2.0_GRCh38-2020-A.csv \
+			--fastqs=/path/to/sample/fastq \
+			--sample=sample \
+			--image=/path/to/CytAssist_HighRes_Sample-A-index-D1.tif \
+			--slide=V42L13-392 \
+			--slidefile=/path/to/V42L13-392.gpr \
+			--area=A1 \
+			--cytaimage=/path/to/CytAssist_LowRes_Sample-A-Index-D1.tif
+```
+
+# Glossary
+**Visium CytAssist Instrument**: An instrument that mediates the tissue permeabilization to release the ligation products from tissues on standard glass slide for capture by spatially barcoded oligonucleotides within each Capture Area on the Visium slide. It also captures the image of the tissue section on the Visium slide.
+
+**CytAssist Captured Image (or CytAssist Image)**: A low resolution brightfield image in TIFF format that is captured by the CytAssist of the eosin stained tissue section on the CytAssist slide inside the instrument. This image contains the fiducial frame.
+
+**Microscope Image**: A high resolution brightfield or fluorescence image of the tissue section on the standard glass slide captured by a microscope. This image does not contain the fiducial frame.
+
+**CytAssist Spatial Gene Expression Slide, 6.5 mm**: Visium Spatial Gene Expression slide for use with CytAssist instrument with two capture areas each with dimensions of 6.5 mm by 6.5 mm. The spots within the capture area on these slides contain specialized oligos for capturing poly-adenylated mRNA tags. These slides have serial numbers starting with "V4".
+
+**CytAssist Spatial Gene Expression Slide, 11 mm**: Visium Spatial Gene Expression slide for use with CytAssist instrument with two capture areas each with dimensions 11 mm by 11 mm. The number of spots within the capture area on these slides are ~3x higher compared to the 6.5 mm capture areas and contain specialized oligos for capturing poly-adenylated mRNA tags. These slides have serial numbers starting with "V5".
+
diff --git a/docs/scarc-10x.md → docs/sub-workflows/scarc-10x.md b/docs/scarc-10x.md → docs/sub-workflows/scarc-10x.md
@@ -15,4 +15,9 @@
 Other than standard parameters, following parameters needs to be defined:
 * human_atac : Path to the human arc reference genome
 * mouse_atac : Path to the arc mouse reference genome
-* COUNT_ARC : The path to the container which cellranger-arc is called from
+* COUNT_ARC : The path to the container which cellranger-arc is called from
+
+# Workflow specific processing steps
+* Generate library.csv files as described here https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/using/count#library-csv
+* `cellranger-arc count` on library files as described here
+* Generate multiqc parseable files based on the metrics files generated by cellranger-arc
diff --git a/docs/scatac-10x.md → docs/sub-workflows/scatac-10x.md b/docs/scatac-10x.md → docs/sub-workflows/scatac-10x.md
@@ -16,4 +16,8 @@
 Other than standard parameters, following parameters needs to be defined:
 * human_atac : Path to the human arc reference genome
 * mouse_atac : Path to the arc mouse reference genome
-* COUNT_ATAC : The path to the container which cellranger-arc is called from
+* COUNT_ATAC : The path to the container which cellranger-atac is called from
+
+# Workflow specific processing steps
+* `cellranger-atac count` on fastq files as described here https://support.10xgenomics.com/single-cell-atac/software/pipelines/latest/what-is-cell-ranger-atac
+* Generate multiqc parseable files based on the metrics files generated by cellranger-atac
diff --git a/docs/scciteseq-10x.md → docs/sub-workflows/scciteseq-10x.md b/docs/scciteseq-10x.md → docs/sub-workflows/scciteseq-10x.md
@@ -28,12 +28,7 @@ Explanation of each column:
 ## 10X_FeatureReference
 Described [here](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/feature-bc-analysis) by 10X official documentation.
 
-# How to Run
-
-To execute the workflow, use the following command:
-
-```
-nextflow run main.nf --samplesheet </path/to/your/samplesheet.csv> 
-```
-
-Replace `</path/to/your/samplesheet.csv>` with the actual path to your SampleSheet.csv file. The `--analysis` option should be set to `scciteseq-10x` to indicate the pipeline to use for analysis.
+# Workflow specific processing steps
+* Generate library.csv files as described here https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/feature-bc-analysis#libraries-csv
+* `cellranger count` on library files as described here: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/feature-bc-analysis#overview
+* Generate multiqc parseable files based on the metrics files generated by cellranger
diff --git a/docs/scflex-10x.md → docs/sub-workflows/scflex-10x.md b/docs/scflex-10x.md → docs/sub-workflows/scflex-10x.md
@@ -28,18 +28,9 @@ Explanation of each column:
 Described [here](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/multi-frp#samples) by 10X official documentation.
 Special case is `Sample_Source` which need to match a sample from the 10X_Data section. If no match is found, pipeline assumes it's a singleplex sample.
 
-# How to Run
-
-To execute the workflow, use the following command:
-
-```
-nextflow run main.nf --samplesheet </path/to/your/samplesheet.csv>
-```
-Or if using custom probes:
-```
-nextflow run main.nf --samplesheet </path/to/your/samplesheet.csv> --custom_genome </path/to/custom_genome> --custom_probes </path/to/custom_probes>
-```
-Replace `</path/to/your/samplesheet.csv>` with the actual path to your SampleSheet.csv file. The `--analysis` option should be set to `scflex-10x` to indicate the pipeline to use for analysis.
-
 ## Note if using custom probes
 You will need to construct both a reference genome and a reference probe set. Until I have set up a guide for that, contact 10X for more information.
+
+# Workflow specific processing steps
+* Generation of config.csv files as described here: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/multi#examples
+* Running of cellranger as described here: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/multi#cellranger-multi
diff --git a/docs/scmulti-10x.md → docs/sub-workflows/scmulti-10x.md b/docs/scmulti-10x.md → docs/sub-workflows/scmulti-10x.md
@@ -18,4 +18,8 @@ Other than standard parameters, following parameters needs to be defined:
 * mouse : Path to the mouse reference transcriptome
 * human_vdj : Path to the human vdj reference
 * mouse_vdj : Path to the mouse vdj reference
-* COUNT_ARC : The path to the container which cellranger-arc is called from
+* COUNT_ARC : The path to the container which cellranger-arc is called from
+
+# Workflow specific processing steps
+* Generation of config.csv files as described here: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/multi#examples
+* Running of cellranger as described here: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/multi#cellranger-multi
diff --git a/docs/scrna-10x.md → docs/sub-workflows/scrna-10x.md b/docs/scrna-10x.md → docs/sub-workflows/scrna-10x.md
@@ -20,12 +20,6 @@ Explanation of each column:
 * **agg**: If you want to aggregate all the processed samples for visualization, set this column accordingly.
 * **pipeline**: This column specifies the pipeline that should be used for the sample. In this case, the value should be set to `scrna-10x`.
 
-# How to Run
-
-To execute the workflow, use the following command:
-
-```
-nextflow run main.nf --samplesheet </path/to/your/samplesheet.csv> 
-```
-
-Replace `</path/to/your/samplesheet.csv>` with the actual path to your SampleSheet.csv file. The `--analysis` option should be set to `scrna-10x` to indicate the pipeline to use for analysis.
+# Workflow specific processing steps
+* Generation of config.csv as described here: https://support.10xgenomics.com/single-cell-vdj/software/pipelines/latest/using/multi#examples
+* Running of cellranger multi on config files as described here https://support.10xgenomics.com/single-cell-vdj/software/pipelines/latest/using/multi#running-multi