Skip to content

Commit

Permalink
switch to nextflow
Browse files Browse the repository at this point in the history
  • Loading branch information
feserm committed Jan 8, 2024
1 parent 63e9e27 commit 193d62f
Show file tree
Hide file tree
Showing 21 changed files with 776 additions and 133 deletions.
6 changes: 0 additions & 6 deletions 01_split.sh

This file was deleted.

7 changes: 0 additions & 7 deletions 02_send_data.sh

This file was deleted.

9 changes: 0 additions & 9 deletions 03_singleFileImputation.sh

This file was deleted.

1 change: 0 additions & 1 deletion 03_start_job.sh

This file was deleted.

1 change: 0 additions & 1 deletion 03_start_job_gt.sh

This file was deleted.

3 changes: 0 additions & 3 deletions 04_receive_data.sh

This file was deleted.

4 changes: 0 additions & 4 deletions 05_get_log.sh

This file was deleted.

2 changes: 0 additions & 2 deletions 06_cleanup.sh

This file was deleted.

621 changes: 621 additions & 0 deletions LICENSE

Large diffs are not rendered by default.

68 changes: 2 additions & 66 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,66 +1,2 @@
# DivImpute

Scripts to start imputation jobs on the virtual machines of a project running on the de.NBI Cloud.
Three virtual machines are used in this particular setup: one head node, from which the jobs are started by running the scripts, and two computational nodes. All nodes run Ubuntu and have the software listed below installed. The nodes store their data on a Cinder volume mounted at /volumes/volume001.

## Required Tools

- [Beagle4.1](https://faculty.washington.edu/browning/beagle/b4_1.html) and bref
- [bcftools 1.10](http://www.htslib.org/download/)

## Usage

1. Install required tools
2. Run the numbered scripts from a head node that can access the computational nodes, by invoking *batch_impute.sh* or *imputeOnReferencePanel.sh*.
3. The single-node example can be run with the script *03_singleFileImputation.sh*.

## Parameters

### 01_split.sh
Parameter | Description | Example
--- | --- | ---
$1 | Input VCF | input.vcf.gz
$2 | Output Name | output

### 02_send_data.sh
Parameter | Description | Example
--- | --- | ---
$1 | List of Chromosome Names | chr_names.txt
$2 | IP Address Computational Node | 192.168.0.80
$3 | Private Key | .ssh/my-private-key

### 03_singleFileImputation.sh
Parameter | Description | Example
--- | --- | ---
$1 | IP Address Computational Node | 192.168.0.80
$2 | Private Key | .ssh/my-private-key

### 03_start_job_gt.sh
Parameter | Description | Example
--- | --- | ---
$1 | IP Address Computational Node | 192.168.0.80
$2 | Private Key | .ssh/my-private-key

### 03_start_job.sh
Parameter | Description | Example
--- | --- | ---
$1 | IP Address Computational Node | 192.168.0.80
$2 | Private Key | .ssh/my-private-key

### 04_receive_data.sh
Parameter | Description | Example
--- | --- | ---
$1 | IP Address Computational Node | 192.168.0.80
$2 | Private Key | .ssh/my-private-key

### 05_get_log.sh
Parameter | Description | Example
--- | --- | ---
$1 | IP Address Computational Node | 192.168.0.80
$2 | Private Key | .ssh/my-private-key

### 06_cleanup.sh
Parameter | Description | Example
--- | --- | ---
$1 | IP Address Computational Node | 192.168.0.80
$2 | Private Key | .ssh/my-private-key
# DivImpute

5 changes: 0 additions & 5 deletions batch_impute.sh

This file was deleted.

Binary file added data/bridge_core1000_renamed.vcf.gz
Binary file not shown.
Binary file added data/bridge_core1000_renamed.vcf.gz.csi
Binary file not shown.
Empty file added docs/output.md
Empty file.
Empty file added docs/usage.md
Empty file.
22 changes: 0 additions & 22 deletions examples/chr_names.txt

This file was deleted.

5 changes: 0 additions & 5 deletions imputeOnReferencePanel.sh

This file was deleted.

142 changes: 142 additions & 0 deletions main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
/*
 * indexVCF
 * Runs `bcftools index` on the input VCF and emits both the VCF itself
 * (`vcf`) and the `.csi` file written next to it (`index`). Both outputs
 * are copied to <outdir>/raw.
 */
process indexVCF {
    publishDir "${params.outdir}/raw", mode: 'copy'
    container 'quay.io/biocontainers/bcftools:1.18--h8b25389_0'

    input:
    path vcfFile

    output:
    path "${vcfFile}", emit: vcf
    path "${vcfFile}.csi", emit: index

    script:
    """
    bcftools index ${vcfFile}
    """
}

/*
 * listVariants
 * Dumps one "<CHROM><tab><POS>" line per record of the VCF into
 * variants.txt. The index is declared as an input so that it is staged
 * alongside the VCF; the command itself only names the VCF.
 */
process listVariants {
    container 'quay.io/biocontainers/bcftools:1.18--h8b25389_0'

    input:
    path vcfFile
    path vcfFileIndex

    output:
    path 'variants.txt'

    script:
    // Groovy expands the \t and \n escapes of this double-quoted script, so
    // the format string handed to bcftools holds a literal tab and newline.
    """
    bcftools query -f '%CHROM\t%POS\n' ${vcfFile} > variants.txt
    """
}

/*
 * writeWindows
 * Turns the "<CHROM><tab><POS>" variant list into overlapping genomic
 * windows, one "<CHROM>:<firstPos>-<lastPos>" region per line of
 * windows.txt (published to <outdir>).
 *
 * Inputs:
 *   variantFile - tab-separated file without header: chromosome, position
 *   windowSize  - number of variants per window (must be > 0)
 *   overlap     - number of variants shared by consecutive windows
 *                 (must satisfy 0 <= overlap < windowSize)
 *
 * Fix over the previous version: the number of windows was fixed at
 * ceil(n / windowSize) + 1 although each window only advances by
 * (windowSize - overlap) variants, so the tail of long chromosomes was
 * never covered, while short chromosomes received a redundant trailing
 * window. Windows are now generated until the last variant is reached.
 */
process writeWindows {
    container 'quay.io/biocontainers/pandas:1.5.2'
    publishDir params.outdir, mode: 'copy'

    input:
    path(variantFile)
    val(windowSize)
    val(overlap)

    output:
    path("windows.txt")

    script:
    """
    #!/usr/bin/env python3
    import pandas as pd

    windowSize = int($windowSize)
    overlap = int($overlap)
    # A non-positive step would never reach the end of a chromosome.
    if windowSize <= 0 or not 0 <= overlap < windowSize:
        raise ValueError('windowSize must be > 0 and overlap must satisfy 0 <= overlap < windowSize')

    # Chromosome names are read as strings so that numeric and non-numeric
    # names cannot end up as mixed types in the grouping key.
    df = pd.read_csv('${variantFile}', header=None, sep='\\t', dtype={0: str})

    with open('windows.txt', 'w') as f:
        # sort=False keeps the chromosomes in the order they appear in the file.
        for group_name, group_data in df.groupby(df[0], sort=False):
            positions = group_data[1].reset_index(drop=True)
            total = len(positions)
            start = 0
            while True:
                end = min(start + windowSize, total)
                f.write(str(group_name) + ':' + str(positions[start]) + '-' + str(positions[end - 1]) + '\\n')
                if end >= total:
                    break
                # The next window re-uses the last overlap variants of this one.
                start = end - overlap
    """
}

/*
 * splitVCFByWindow
 * Extracts the records of one region (`window`, passed to `bcftools view -r`)
 * from the indexed VCF and writes them bgzip-compressed to
 * "<window>_<vcfFile>", which is also copied to <outdir>/windows.
 */
process splitVCFByWindow {
    publishDir "${params.outdir}/windows", mode: 'copy'
    container 'quay.io/biocontainers/bcftools:1.18--h8b25389_0'

    input:
    val window
    path vcfFile
    path vcfFileIndex

    output:
    path "${window}_${vcfFile}"

    script:
    """
    bcftools view -r ${window} ${vcfFile} -Oz -o ${window}_${vcfFile}
    """
}

/*
 * imputeWindows
 * Runs Beagle on one window VCF. Beagle is given the output prefix
 * "imputed_<name without .vcf.gz>"; the declared output
 * "imputed_<vcfFile>" is the file expected from that prefix, and it is
 * copied to <outdir>/imputed/windows.
 *
 * Fix over the previous version: the suffix was removed with
 * replaceAll('.vcf.gz', ''), which treats the pattern as a regular
 * expression (each '.' matches any character) and is not anchored to the
 * end of the name. Only a literal trailing ".vcf.gz" is stripped now, so
 * the prefix always matches the declared output name.
 */
process imputeWindows {
    container 'quay.io/biocontainers/beagle:5.4_22Jul22.46e--hdfd78af_0'
    publishDir params.outdir+'/imputed/windows', mode: 'copy'

    input:
    path(vcfFile)

    output:
    path("imputed_${vcfFile}")

    script:
    // Strip only a literal ".vcf.gz" suffix from the staged file name.
    def outPrefix = vcfFile.name.replaceFirst(/\.vcf\.gz$/, '')
    """
    beagle gt=${vcfFile} out=imputed_${outPrefix}
    """
}

/*
 * indexImputedWindow
 * Runs `bcftools index` on a single VCF and emits the resulting
 * "<vcfFile>.csi" as `index`. Nothing is published.
 */
process indexImputedWindow {
    container 'quay.io/biocontainers/bcftools:1.18--h8b25389_0'

    input:
    path vcfFile

    output:
    path "${vcfFile}.csi", emit: index

    script:
    """
    bcftools index ${vcfFile}
    """
}

/*
 * mergeImputedWindows
 * Concatenates all imputed window VCFs into merged_imputed.vcf.gz, which
 * is copied to <outdir>/imputed/.
 *
 * Inputs:
 *   vcfFiles   - all imputed window VCFs (staged as imputed*.vcf.gz)
 *   vcfIndices - their .csi indices, staged next to them
 *
 * Fix over the previous version: plain `bcftools concat` requires its
 * inputs in genomic order and non-overlapping. The shell glob delivers the
 * files in lexicographic order (e.g. "...:10000-..." before "...:2000-...")
 * and the windows overlap. --allow-overlaps merges by position using the
 * staged indices, and --remove-duplicates writes records that occur in
 * several windows only once.
 */
process mergeImputedWindows {
    container 'quay.io/biocontainers/bcftools:1.18--h8b25389_0'
    publishDir params.outdir+'/imputed/', mode: 'copy'

    input:
    path(vcfFiles)
    path(vcfIndices)

    output:
    path("merged_imputed.vcf.gz")

    script:
    """
    bcftools concat --allow-overlaps --remove-duplicates imputed*.vcf.gz -Oz -o merged_imputed.vcf.gz
    """
}

/*
 * Entry workflow:
 *   1. index the input VCF and list its variant positions
 *   2. derive overlapping windows from that list
 *   3. cut the VCF into one file per window and impute each with Beagle
 *   4. index the imputed windows and merge them into a single VCF
 *
 * Changes over the previous version: params.vcf is resolved with file()
 * so that a missing input fails immediately and relative paths are
 * accepted, and blank lines in windows.txt are dropped instead of being
 * passed on as empty regions.
 */
workflow {
    // A plain file value keeps the indexVCF outputs reusable for every window.
    def inputVcf = file(params.vcf, checkIfExists: true)

    indexVCF(inputVcf)
    listVariants(indexVCF.out.vcf, indexVCF.out.index)

    writeWindows(listVariants.out, params.windowSize, params.overlap)

    // One channel item per region line of windows.txt.
    ch_windows = writeWindows.out
        .splitText()
        .map { it.trim() }
        .filter { it }

    splitVCFByWindow(ch_windows, indexVCF.out.vcf, indexVCF.out.index)
    imputeWindows(splitVCFByWindow.out)
    indexImputedWindow(imputeWindows.out)

    // The merge needs every imputed window and every index at once.
    ch_imputedWindows = imputeWindows.out.collect()
    ch_windowIndices = indexImputedWindow.out.index.collect()

    mergeImputedWindows(ch_imputedWindows, ch_windowIndices)
}
Empty file added nextflow-schema.json
Empty file.
11 changes: 11 additions & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@

// Pipeline parameters; each can be overridden on the command line (--name value).
params {
    // Input VCF. Defaults to the data file in this repository, resolved
    // relative to the pipeline directory instead of one machine's home directory.
    vcf = "${projectDir}/data/bridge_core1000_renamed.vcf.gz"
    // Directory that receives all published results.
    outdir = './results'
    // Number of variants per imputation window.
    windowSize = 5000
    // Number of variants shared by two consecutive windows.
    overlap = 1000
}

// Run every process inside its declared container using Docker.
docker.enabled = true
2 changes: 0 additions & 2 deletions singleFileImpute.sh

This file was deleted.

0 comments on commit 193d62f

Please sign in to comment.