Added Griffin Dockerfile and instructions #11

Open · wants to merge 2 commits into base: main
17 changes: 17 additions & 0 deletions DockerImage/Dockerfile
@@ -0,0 +1,17 @@
# Use base image with miniconda3 installed
FROM continuumio/miniconda3
LABEL maintainer="Yueyao Gao <tag@broadinstitute.org>"

# Specify Workdir
WORKDIR /BaseImage

# Create the environment
COPY Griffin_env_explicit.yml .
RUN conda env create -f Griffin_env_explicit.yml

# Copy Python scripts from Griffin
RUN mkdir -p /BaseImage/Griffin/scripts
COPY scripts/*.py /BaseImage/Griffin/scripts/

# Set the working directory to the Griffin directory
WORKDIR /BaseImage/Griffin
18 changes: 18 additions & 0 deletions DockerImage/Griffin_env_explicit.yml
@@ -0,0 +1,18 @@
name: griffin_env
channels:
  - bioconda
  - conda-forge
dependencies:
  - python=3.7.4
  - pandas=1.3.2
  - scipy=1.7.1
  - samtools=1.13
  - bedtools=2.29.2
  - pybedtools=0.8.0
  - pybigwig=0.3.17
  - statsmodels=0.9.0
  - pip
  - pip:
      - snakemake==5.19.2
      - matplotlib==3.4.1
      - scikit-learn==1.0.2
44 changes: 44 additions & 0 deletions DockerImage/README.md
@@ -0,0 +1,44 @@
# Griffin Docker
The Griffin Docker container streamlines the setup process for Griffin by encapsulating all dependencies and required configuration in a single image.

## Getting Started
To get started with the Griffin Docker container, please follow the instructions below:
1. Change your working directory
```
cd Griffin/DockerImage
```
2. Build the Docker image `griffin-docker:v0.2.0` using the Dockerfile
```
docker build --platform linux/amd64 -t griffin-docker:v0.2.0 .
```
3. Run the Griffin Docker container
```
docker run -it --rm griffin-docker:v0.2.0
```
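
The image creates a conda environment named `griffin_env` (from `Griffin_env_explicit.yml`) but does not activate it automatically, so once inside the container you will likely need to activate it before calling the Griffin scripts copied to `/BaseImage/Griffin/scripts`. A minimal sketch (if `conda activate` is not initialized in your shell, `conda run -n griffin_env ...` is a common fallback):
```
# inside the running container: activate the environment and check a Griffin script
conda activate griffin_env
python /BaseImage/Griffin/scripts/griffin_GC_counts.py --help
```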

## Run Griffin Demo
If you would like to run the [Griffin Demo](https://github.com/adoebley/Griffin/wiki), it is recommended to use the
Dockerfile located in `Griffin/DockerImage/demo_DockerImage`. This image clones the Griffin repository and includes
a bash script (`run-demo.sh`) that downloads the required reference files and sets up the demo workflows.

1. Change your working directory
```
cd Griffin/DockerImage/demo_DockerImage
```
2. Build the Docker image `griffin-docker:demo` using the demo Dockerfile
```
docker build --platform linux/amd64 -t griffin-docker:demo .
```
3. Run the Demo Griffin Docker container
```
docker run -it --rm griffin-docker:demo
```
4. Run Griffin Demo
Once inside the container, you can start using Griffin by following the Griffin wiki guide.
The demo user guide has been summarized in `run-demo.sh`. The script downloads the reference files, prepares the demo BAM,
and performs a dry run of the GC/mappability correction and nucleosome profiling snakefiles (see the example below for launching the actual runs).
```
bash run-demo.sh
```
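
After the dry run completes, the same workflows can be launched for real by dropping the `-np` (dry-run) flags used in `run-demo.sh`. A minimal sketch, assuming you are still in `/BaseImage/Griffin` inside the demo container, the `griffin_env` environment is active, and the core count from the script is kept:
```
# hypothetical follow-up to run-demo.sh: launch the actual runs
cd run_demo/griffin_GC_and_mappability_correction/
snakemake -s griffin_GC_and_mappability_correction.snakefile --cores 8
cd ../griffin_nucleosome_profiling/
snakemake -s griffin_nucleosome_profiling.snakefile --cores 8
```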

## Happy nucleosome profiling!
33 changes: 33 additions & 0 deletions DockerImage/demo_DockerImage/Dockerfile
@@ -0,0 +1,33 @@
# Use base image with miniconda3 installed
FROM continuumio/miniconda3
LABEL maintainer="Yueyao Gao <tag@broadinstitute.org>"

# Specify Workdir
WORKDIR /BaseImage

# Create the environment
COPY Griffin_env_explicit.yml .
RUN conda env create -f Griffin_env_explicit.yml

# Clone the git repository
RUN git clone https://github.com/adoebley/Griffin.git

# Set the working directory to the cloned repository
WORKDIR /BaseImage/Griffin

# Demo-specific commands:
# Create the run_demo directory and copy the snakemake workflow scripts into it
RUN mkdir run_demo
RUN cp -r snakemakes/griffin_GC_and_mappability_correction run_demo
RUN cp -r snakemakes/griffin_nucleosome_profiling run_demo

# Edit the config yaml files for demo run
COPY demo_samples.yaml ./run_demo/griffin_GC_and_mappability_correction/config/samples.yaml
COPY demo_sites.yaml ./run_demo/griffin_nucleosome_profiling/config/sites.yaml
COPY demo_samples.GC.yaml ./run_demo/griffin_nucleosome_profiling/config/samples.GC.yaml

# Use the config with the mappability task enabled
COPY mappabilty_enabled_config.yaml ./run_demo/griffin_GC_and_mappability_correction/config/config.yaml

# Copy the demo bash script
COPY run-demo.sh .
17 changes: 17 additions & 0 deletions DockerImage/demo_DockerImage/Griffin_env_explicit.yml
@@ -0,0 +1,17 @@
name: griffin_env
channels:
  - bioconda
  - conda-forge
dependencies:
  - python=3.7.4
  - pandas=1.3.2
  - scipy=1.7.1
  - samtools=1.13
  - bedtools=2.29.2
  - pybedtools=0.8.0
  - pybigwig=0.3.17
  - statsmodels=0.9.0
  - pip
  - pip:
      - snakemake==5.19.2
      - matplotlib==3.4.1
5 changes: 5 additions & 0 deletions DockerImage/demo_DockerImage/demo_samples.GC.yaml
@@ -0,0 +1,5 @@
samples:
  Healthy_demo:
    bam: ../../demo/bam/Healthy_GSM1833219_downsampled.sorted.mini.bam
    GC_bias: ../../demo/griffin_GC_correction_demo_files/expected_results/Healthy_demo.GC_bias.txt

2 changes: 2 additions & 0 deletions DockerImage/demo_DockerImage/demo_samples.yaml
@@ -0,0 +1,2 @@
samples:
  Healthy_demo: ../../demo/bam/Healthy_GSM1833219_downsampled.sorted.mini.bam
2 changes: 2 additions & 0 deletions DockerImage/demo_DockerImage/demo_sites.yaml
@@ -0,0 +1,2 @@
site_lists:
  CTCF_demo: ../../demo/griffin_nucleosome_profiling_demo_files/sites/CTCF.hg38.1000.txt
46 changes: 46 additions & 0 deletions DockerImage/demo_DockerImage/mappabilty_enabled_config.yaml
@@ -0,0 +1,46 @@
#griffin_GC_correction.snakefile
#Anna-Lisa Doebley
#Template made 2021-04-06
#Ha Lab
#Fred Hutchinson Cancer Research Center

griffin_scripts_dir: ../../scripts

#################
#mappability bias params##
#################
#SELECT CORRECT REFERENCE GENOME
#reference genome for alignment, with index files in same folder as .fa file
reference_genome: ../../Ref/hg38.fa

#chrom sizes for the selected reference genome
chrom_sizes: ../../Ref/hg38.standard.chrom.sizes

#file containing mappability value for each bp in the genome
mappability_bw: ../../Ref/k100.Umap.MultiTrackMappability.bw
mappability_correction: True #whether to run a mappability correction step; we found that this does not improve signals and we do not recommend it

#bed file containing regions to exclude
encode_exclude: ../../Ref/encode_unified_GRCh38_exclusion_list.bed
centromeres: ../../Ref/hg38_centromeres.bed
gaps: ../../Ref/hg38_gaps.bed
patches: ../../Ref/hg38_fix_patches.bed
alternative_haplotypes: ../../Ref/hg38_alternative_haplotypes.bed

#where the GC bias output will go
out_dir: results

#minimum mapping quality to keep a read
map_quality: 20


#################
#GC bias params##
#################
mappable_regions: ../../Ref/k100_minus_exclusion_lists.mappable_regions.hg38.bed

#folder with the GC frequencies for all fragment sizes in the mappable regions (must match the mappable_regions)
#For typical hg38 WGS the correct path is below
genome_GC_frequency: ../../Ref/genome_GC_frequency

GC_bias_size_range: 15 500
27 changes: 27 additions & 0 deletions DockerImage/demo_DockerImage/run-demo.sh
@@ -0,0 +1,27 @@
# Download the reference genome
wget http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz
# Unzip reference genome
gunzip hg38.fa.gz
# Move the reference genome to Ref directory
mv hg38.fa Ref/

# Download the mappability track
wget https://hgdownload.soe.ucsc.edu/gbdb/hg38/hoffmanMappability/k100.Umap.MultiTrackMappability.bw
# Move the mappability track to Ref directory
mv k100.Umap.MultiTrackMappability.bw Ref/

# Convert the demo cram file to bam
samtools view -b -T Ref/hg38.fa -o demo/bam/Healthy_GSM1833219_downsampled.sorted.mini.bam demo/bam/Healthy_GSM1833219_downsampled.sorted.mini.cram
# Index the bam file
samtools index demo/bam/Healthy_GSM1833219_downsampled.sorted.mini.bam

# Navigate to the folder with the griffin_GC_and_mappability_correction snakefile
cd run_demo/griffin_GC_and_mappability_correction/

# Dry run GC and mappability correction
snakemake -s griffin_GC_and_mappability_correction.snakefile --cores 8 -np

# Navigate to the folder with the griffin_nucleosome_profiling snakefile
cd ../griffin_nucleosome_profiling/
# Dry run Nucleosome profiling
snakemake -s griffin_nucleosome_profiling.snakefile --cores 8 -np
Binary file not shown.
441 changes: 441 additions & 0 deletions DockerImage/scripts/griffin_GC_bias.py

Large diffs are not rendered by default.

206 changes: 206 additions & 0 deletions DockerImage/scripts/griffin_GC_counts.py
@@ -0,0 +1,206 @@
#!/usr/bin/env python
# coding: utf-8

# In[ ]:


import pysam
import os

import pandas as pd
import numpy as np
import time
import argparse
import sys

from multiprocessing import Pool



# In[ ]:


parser = argparse.ArgumentParser()

parser.add_argument('--bam_file', help='sample_bam_file', required=True)
parser.add_argument('--bam_file_name', help='sample name (does not need to match actual file name)', required=True)
parser.add_argument('--mappable_regions_path', help='highly mappable regions to be used in GC correction, bedGraph or bed format', required=True)

parser.add_argument('--ref_seq',help='reference sequence (fasta format)',required=True)
parser.add_argument('--chrom_sizes',help='path to chromosome sizes for the reference seq',required=True)

parser.add_argument('--out_dir',help='folder for GC bias results',required=True)

parser.add_argument('--map_q',help='minimum mapping quality for reads to be considered',type=int,required=True)
parser.add_argument('--size_range',help='range of read sizes to be included',nargs=2, type=int, required=True)

parser.add_argument('--CPU',help='number of CPU for parallelizing', type=int, required=True)

args = parser.parse_args()

bam_file_path = args.bam_file
bam_file_name = args.bam_file_name
mappable_regions_path=args.mappable_regions_path

ref_seq_path = args.ref_seq
chrom_sizes_path = args.chrom_sizes
out_dir = args.out_dir

map_q = args.map_q
size_range = args.size_range
CPU = args.CPU


# In[ ]:


print('arguments provided:')

print('\tbam_file_path = "'+bam_file_path+'"')
print('\tbam_file_name = "'+bam_file_name+'"')
print('\tmappable_regions_path = "'+mappable_regions_path+'"')

print('\tref_seq_path = "'+ref_seq_path+'"')
print('\tchrom_sizes_path = "'+chrom_sizes_path+'"')
print('\tout_dir = "'+out_dir+'"')

print('\tmap_q = '+str(map_q))
print('\tsize_range = '+str(size_range))
print('\tCPU = '+str(CPU))


# In[ ]:


out_file = out_dir +'/GC_counts/'+ bam_file_name+'.GC_counts.txt'

print('out_file',out_file)

#create a directory for the GC data
if not os.path.exists(out_dir +'/GC_counts/'):
    os.mkdir(out_dir +'/GC_counts/')


# In[ ]:


#import filter
mappable_intervals = pd.read_csv(mappable_regions_path, sep='\t', header=None)

#remove non standard chromosomes and X and Y
chroms = ['chr'+str(m) for m in range(1,23)]
mappable_intervals = mappable_intervals[mappable_intervals[0].isin(chroms)]

print('chroms:', chroms)
print('number_of_intervals:',len(mappable_intervals))

sys.stdout.flush()


# In[ ]:


def collect_reads(sublist):
    #create a dict for holding the frequency of each read length and GC content
    GC_dict = {}
    for length in range(size_range[0],size_range[1]+1):
        GC_dict[length]={}
        for num_GC in range(0,length+1):
            GC_dict[length][num_GC]=0

    #import the bam file
    #this needs to be done within the loop otherwise it gives a truncated file warning
    bam_file = pysam.AlignmentFile(bam_file_path, "rb")
    print('sublist intervals:',len(sublist))

    #this might also need to be in the loop
    #import the ref_seq
    ref_seq=pysam.FastaFile(ref_seq_path)

    for i in range(len(sublist)):
        chrom = sublist.iloc[i][0]
        start = sublist.iloc[i][1]
        end = sublist.iloc[i][2]
        if i%5000==0:
            print('interval',i,':',chrom,start,end,'seconds:',np.round(time.time()-start_time))
            sys.stdout.flush()
        #fetch any read that overlaps the interval
        fetched = bam_file.fetch(chrom,start,end)
        for read in fetched:
            #use both fw (positive template length) and rv (negative template length) reads
            if (read.is_reverse==False and read.template_length>=size_range[0] and read.template_length<=size_range[1]) or (read.is_reverse==True and -read.template_length>=size_range[0] and -read.template_length<=size_range[1]):
                #qc filters, some longer fragments are considered 'improper pairs' but I would like to keep these
                if read.is_paired==True and read.mapping_quality>=map_q and read.is_duplicate==False and read.is_qcfail==False:
                    if read.is_reverse==False:
                        fragment_start = read.reference_start
                        fragment_end = read.reference_start+read.template_length
                    elif read.is_reverse==True:
                        fragment_end = read.reference_start + read.reference_length
                        fragment_start = fragment_end + read.template_length

                    #count the GC content
                    fragment_seq = ref_seq.fetch(read.reference_name,fragment_start,fragment_end)
                    fragment_seq = np.array(list(fragment_seq.upper()))
                    fragment_seq[np.isin(fragment_seq, ['A','T','W'])] = 0
                    fragment_seq[np.isin(fragment_seq, ['C','G','S'])] = 1
                    rng = np.random.default_rng(fragment_start)
                    fragment_seq[np.isin(fragment_seq, ['N','R','Y','K','M','B','D','H','V'])] = rng.integers(2, size=len(fragment_seq[np.isin(fragment_seq, ['N','R','Y','K','M','B','D','H','V'])])) #random integer in range(2) (i.e. 0 or 1)
                    fragment_seq = fragment_seq.astype(float)


                    num_GC = int(fragment_seq.sum())
                    GC_dict[abs(read.template_length)][num_GC]+=1

    print('done')
    return(GC_dict)


# In[ ]:


start_time = time.time()
p = Pool(processes=CPU) #use the available CPU
sublists = np.array_split(mappable_intervals,CPU) #split the list into sublists, one per CPU

GC_dict_list = p.map(collect_reads, sublists, 1)


# In[ ]:


all_GC_df = pd.DataFrame()
for i,GC_dict in enumerate(GC_dict_list):
    GC_df = pd.DataFrame()
    for length in GC_dict.keys():
        current = pd.Series(GC_dict[length]).reset_index()
        current = current.rename(columns={'index':'num_GC',0:'number_of_fragments'})
        current['length']=length
        current = current[['length','num_GC','number_of_fragments']]
        GC_df = GC_df.append(current, ignore_index=True)
    GC_df = GC_df.set_index(['length','num_GC'])
    all_GC_df[i] = GC_df['number_of_fragments']
    del(GC_df,GC_dict)

all_GC_df = all_GC_df.sum(axis=1)
all_GC_df = pd.DataFrame(all_GC_df).rename(columns = {0:'number_of_fragments'})
all_GC_df = all_GC_df.reset_index()
all_GC_df.to_csv(out_file,sep='\t',index=False)


# In[ ]:


print('done')


# In[ ]:





# In[ ]:




219 changes: 219 additions & 0 deletions DockerImage/scripts/griffin_calc_GC_frequency.py
@@ -0,0 +1,219 @@
#!/usr/bin/env python
# coding: utf-8

# In[ ]:


import pysam
import os
import pandas as pd
import numpy as np
import time
import argparse
import sys
import pybedtools


# In[ ]:


parser = argparse.ArgumentParser()

parser.add_argument('--mappable_regions_path', help='highly mappable regions to be used in GC correction, bed or bedGraph format', required=True)
parser.add_argument('--ref_seq',help='reference sequence (fasta format)',required=True)
parser.add_argument('--chrom_sizes',help='path to chromosome sizes for the reference seq',required=True)
parser.add_argument('--out_dir',help='folder for results',required=True)
parser.add_argument('--fragment_length',help='length of fragment (in bp) for which GC will be calculated',type=int, required=True)
parser.add_argument('--read_length',help='length of read (in bp)',type=int, required=True)

args = parser.parse_args()

mappable_regions_path=args.mappable_regions_path
ref_seq_path = args.ref_seq
chrom_sizes_path = args.chrom_sizes
out_dir = args.out_dir
fragment_length = args.fragment_length
read_length = args.read_length


# In[ ]:


mappable_name = mappable_regions_path.rsplit('/',1)[1].rsplit('.',1)[0]
out_file = out_dir+'/'+mappable_name+'.'+str(fragment_length)+'bp.GC_frequency.txt'


# In[ ]:


#keep autosomes only
chroms = ['chr'+str(m) for m in range(1,23)]


# In[ ]:


if read_length>fragment_length:
    read_length = fragment_length


# In[ ]:


print('mappable_regions_path:',mappable_regions_path)
print('out_file:',out_file)
print('fragment_length:',fragment_length)
print('read_length:',read_length)


# In[ ]:


#import filter
mappable_intervals = pd.read_csv(mappable_regions_path, sep='\t', header=None)

mappable_intervals = mappable_intervals[mappable_intervals[0].isin(chroms)]

print('chroms:', chroms)
print('number_of_intervals:',len(mappable_intervals))
sys.stdout.flush()


# In[ ]:


#get chrom sizes info
chrom_sizes = pd.read_csv(chrom_sizes_path, sep='\t', header=None)

#also keep as a dict
chrom_size_dict = chrom_sizes.set_index(0).to_dict()[1]


# In[ ]:


#import the ref_seq
ref_seq=pysam.FastaFile(ref_seq_path)


# In[ ]:


#count the GC content of all fragments where the forward read overlaps the specified regions
start_time = time.time()
print('Calculating forward read frequency')

#create the GC frequencies dict
fw_GC_dict = {}
for num_GC in range(0,fragment_length+1):
    fw_GC_dict[num_GC]=0

for i in range(len(mappable_intervals)):
    chrom = mappable_intervals.iloc[i][0]
    start = mappable_intervals.iloc[i][1]+1
    end = mappable_intervals.iloc[i][2]-1
    if i%5000==0:
        print('interval',i,':',chrom,start,end,'seconds:',np.round(time.time()-start_time))
        sys.stdout.flush()

    #count up all possible fw reads that overlap the interval
    #adjust the start and end so it includes all possible fragments that overlap the interval
    adjusted_start = start-read_length
    adjusted_end = end+fragment_length

    if adjusted_start<0:
        adjusted_start = 0
    if adjusted_end>chrom_size_dict[chrom]:
        adjusted_end = chrom_size_dict[chrom]
        print(chrom,chrom_size_dict[chrom],'modifying_end_to_end_of_chromosome')

    #count the GC content
    fetched = ref_seq.fetch(chrom,adjusted_start,adjusted_end)
    fetched = np.array(list(fetched.upper()))
    fetched[np.isin(fetched, ['A','T','W'])] = 0
    fetched[np.isin(fetched, ['C','G','S'])] = 1
    rng = np.random.default_rng(start)
    fetched[np.isin(fetched, ['N','R','Y','K','M','B','D','H','V'])] = rng.integers(2, size=len(fetched[np.isin(fetched, ['N','R','Y','K','M','B','D','H','V'])])) #random integer in range(2) (i.e. 0 or 1)
    fetched = fetched.astype(float)

    n=len(fetched)

    window_sum = int(sum(fetched[:fragment_length]))
    #print(k,len(fetched[:k]),window_sum)

    fw_GC_dict[window_sum]+=1
    for i in range(n-fragment_length):
        window_sum = int(window_sum - fetched[i] + fetched[i+fragment_length])
        fw_GC_dict[window_sum]+=1


# In[ ]:


#count the GC content of all fragments where the reverse read overlaps the specified regions
print('Calculating reverse read frequency')

#create the GC frequencies dict
rv_GC_dict = {}
for num_GC in range(0,fragment_length+1):
    rv_GC_dict[num_GC]=0

for i in range(len(mappable_intervals)):
    chrom = mappable_intervals.iloc[i][0]
    start = mappable_intervals.iloc[i][1]+1 #skip the first and last positions because these reads aren't fetched by pysam
    end = mappable_intervals.iloc[i][2]-1
    if i%5000==0:
        print('interval',i,':',chrom,start,end,'seconds:',np.round(time.time()-start_time))
        sys.stdout.flush()

    #count up all possible rv reads that overlap the interval
    #adjust the start and end so it includes all possible fragments that overlap the interval
    adjusted_start = start-fragment_length
    adjusted_end = end+read_length

    if adjusted_start<0:
        adjusted_start = 0
    if adjusted_end>chrom_size_dict[chrom]:
        adjusted_end = chrom_size_dict[chrom]
        print(chrom,chrom_size_dict[chrom],'modifying_end_to_end_of_chromosome')

    #count the GC content
    fetched = ref_seq.fetch(chrom,adjusted_start,adjusted_end)
    fetched = np.array(list(fetched.upper()))
    fetched[np.isin(fetched, ['A','T','W'])] = 0
    fetched[np.isin(fetched, ['C','G','S'])] = 1
    rng = np.random.default_rng(start)
    fetched[np.isin(fetched, ['N','R','Y','K','M','B','D','H','V'])] = rng.integers(2, size=len(fetched[np.isin(fetched, ['N','R','Y','K','M','B','D','H','V'])])) #random integer in range(2) (i.e. 0 or 1)
    fetched = fetched.astype(float)

    n=len(fetched)

    window_sum = int(sum(fetched[:fragment_length]))
    #print(k,len(fetched[:k]),window_sum)

    rv_GC_dict[window_sum]+=1
    for i in range(n-fragment_length):
        window_sum = int(window_sum - fetched[i] + fetched[i+fragment_length])
        rv_GC_dict[window_sum]+=1


# In[ ]:


#convert to df and export
GC_df = pd.DataFrame()
#save GC dict
current = (pd.Series(rv_GC_dict)+pd.Series(fw_GC_dict)).reset_index()
current = current.rename(columns={'index':'num_GC',0:'number_of_fragments'})
current['length']=fragment_length
current = current[['length','num_GC','number_of_fragments']]
GC_df = GC_df.append(current, ignore_index=True)
GC_df.to_csv(out_file,sep='\t',index=False)


# In[ ]:


print('done')


527 changes: 527 additions & 0 deletions DockerImage/scripts/griffin_coverage.py

Large diffs are not rendered by default.

117 changes: 117 additions & 0 deletions DockerImage/scripts/griffin_functions.py
@@ -0,0 +1,117 @@
#!/usr/bin/env python
# coding: utf-8

# In[ ]:


def import_and_filter_sites(site_name,site_file,strand_column,chrom_column,position_column,chroms,ascending,sort_by,number_of_sites):
    import pandas as pd
    current_sites = pd.read_csv(site_file,sep='\t')
    if strand_column not in current_sites.columns:
        current_sites[strand_column]=0

    #throw out sites that aren't on the selected chroms
    current_sites = current_sites[current_sites[chrom_column].isin(chroms)]

    #select the sites to use if specified
    if sort_by.lower()=='none': #if using all sites
        print(site_name,'processing all '+str(len(current_sites))+' sites')

    else: #otherwise sort by the specified column
        current_sites=current_sites.sort_values(by=sort_by,ascending=ascending).reset_index(drop=True)#sort and reset index
        current_sites=current_sites.iloc[0:int(number_of_sites)]
        print(site_name+'\tprocessing',len(current_sites),'sites\t('+
              str(sort_by),'range after sorting: ',min(current_sites[sort_by]),'to',
              str(max(current_sites[sort_by]))+')')

    current_sites = current_sites[[chrom_column,position_column,strand_column]]
    current_sites['site_name']=site_name
    return(current_sites)


# In[2]:


def define_fetch_interval(name_to_print,sites,chrom_column,position_column,chroms,chrom_sizes_path,upstream_bp,downstream_bp):
    import pandas as pd
    import numpy as np
    import sys
    #separate fw and reverse sites
    fw_markers = ['+',1,'1']
    rv_markers = ['-',-1,'-1']
    fw_sites = sites[sites['Strand'].isin(fw_markers)].copy()
    rv_sites = sites[sites['Strand'].isin(rv_markers)].copy()

    undirected_sites = sites[~(sites['Strand'].isin(fw_markers+rv_markers))].copy()

    if len(rv_sites)+len(fw_sites)+len(undirected_sites)==len(sites):
        print(name_to_print+' (fw/rv/undirected/total): '+
              str(len(fw_sites))+'/'+
              str(len(rv_sites))+'/'+
              str(len(undirected_sites))+'/'+
              str(len(sites)))
    else: #I don't think this should ever happen...
        print('total fw sites:\t\t'+str(len(fw_sites)))
        print('total rv sites:\t\t'+str(len(rv_sites)))
        print('total undirected sites:'+'\t'+str(len(undirected_sites)))
        print('total sites:\t\t'+str(len(sites)))
        sys.exit('Problem with strand column')

    #set up to fetch a window extending across the desired window
    fw_sites['fetch_start'] = fw_sites[position_column]+upstream_bp
    fw_sites['fetch_end'] = fw_sites[position_column]+downstream_bp

    undirected_sites['fetch_start'] = undirected_sites[position_column]+upstream_bp
    undirected_sites['fetch_end'] = undirected_sites[position_column]+downstream_bp

    #for reverse sites, flip the window
    rv_sites['fetch_start'] = rv_sites[position_column]-downstream_bp
    rv_sites['fetch_end'] = rv_sites[position_column]-upstream_bp

    #merge fw and reverse back together and sort them back into the original order
    sites = fw_sites.append(rv_sites).append(undirected_sites).sort_index()
    sites = sites.sort_values(by = [chrom_column,position_column]).reset_index(drop=True)

    chrom_sizes = pd.read_csv(chrom_sizes_path, sep='\t', header=None)
    chrom_sizes = chrom_sizes[chrom_sizes[0].isin(chroms)]
    chrom_sizes = chrom_sizes.set_index(0)

    adjusted_ends_df = pd.DataFrame()

    for chrom in chroms:
        length = chrom_sizes.loc[chrom][1]
        current = sites[sites[chrom_column]==chrom].copy()
        current['fetch_start'] = np.where(current['fetch_start']<0,0,current['fetch_start'])
        current['fetch_end'] = np.where(current['fetch_end']>length,length,current['fetch_end'])
        adjusted_ends_df = adjusted_ends_df.append(current)
    adjusted_ends_df = adjusted_ends_df.sort_values(by = [chrom_column,position_column]).reset_index(drop=True)
    adjusted_ends_df = adjusted_ends_df.copy()

    return(adjusted_ends_df)


# In[3]:


def progress_report(name,unit,start_time,current_time,item_index,total_items):
    import numpy as np
    time_elapsed = current_time-start_time
    elapsed_min = int(np.floor(time_elapsed/60))
    elapsed_sec = int(np.floor(time_elapsed%60))

    expected_time = (time_elapsed/(item_index+1))*total_items

    remaining_time = expected_time-time_elapsed
    remaining_min = int(np.floor(remaining_time/60))
    remaining_sec = int(np.floor(remaining_time%60))

    name = [str(m) for m in name]
    name_str = '_'.join(name)
    printout = name_str+': '+ str(item_index+1) +' of '+ str(total_items)+' '+str(unit)+' done in '+ str(elapsed_min)+' min '+str(elapsed_sec)+' sec, '+str(remaining_min)+' min '+str(remaining_sec)+' sec remaining'
    return(printout)


# In[ ]:




420 changes: 420 additions & 0 deletions DockerImage/scripts/griffin_mappability_correction.py

Large diffs are not rendered by default.

669 changes: 669 additions & 0 deletions DockerImage/scripts/griffin_merge_sites.py

Large diffs are not rendered by default.

162 changes: 162 additions & 0 deletions DockerImage/scripts/griffin_plot.py
@@ -0,0 +1,162 @@
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import os
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import time
import sys
import yaml
import argparse


# In[2]:


# #for ipynb
# %matplotlib inline

# in_dir = 'results/'
# samples_yaml = 'test_samples.yaml'
# mappability_correction = 'True'

# save_window = [-1000,1000]
# step = 15

# individual = 'False'
# out_dir = 'results'


# In[3]:


parser = argparse.ArgumentParser()

parser.add_argument('--in_dir', help='path/to/results/', required=True)
parser.add_argument('--samples_yaml', help='samples.GC.yaml', required=True)
parser.add_argument('--mappability_correction', help='True/False; whether to perform mappability correction', required=True)

parser.add_argument('--save_window',help='start and end of window to be plotted',nargs=2, type=int, default=(-1000,1000))
parser.add_argument('--step',help='step size when calculating coverage', type=int, default=5)

parser.add_argument('--individual',help='if individual sites were saved in previous steps. (True/False)',default='False')
parser.add_argument('--out_dir',help='folder for results',required=True)

args = parser.parse_args()

in_dir = args.in_dir
samples_yaml = args.samples_yaml
mappability_correction = args.mappability_correction

save_window=args.save_window
step = args.step

individual = args.individual
out_dir = args.out_dir


# In[4]:


save_window=[int(np.ceil(save_window[0]/step)*step),int(np.floor(save_window[1]/step)*step)] #round to the nearest step inside the window
save_columns = np.arange(save_window[0],save_window[1],step)
str_save_columns = [str(m) for m in save_columns]
print('save_window rounded to step:',save_window)


# In[5]:


with open(samples_yaml,'r') as f:
    samples = yaml.safe_load(f)

samples = samples['samples']
samples = list(samples.keys())


# In[6]:


#dict to hold results grouped by correction type
print('Importing data')
results_dict = {'uncorrected': pd.DataFrame(),
                'GC_corrected': pd.DataFrame()}

if mappability_correction.lower() == 'true':
    results_dict['GC_map_corrected'] = pd.DataFrame()

#import
for sample in samples:
    print(sample)
    for key in results_dict.keys():
        current_file = in_dir+'/'+sample+'/'+sample+'.'+key+'.coverage.tsv'
        current = pd.read_csv(current_file, sep='\t')
        if individual.lower()=='true':
            current = current.groupby('site_name')[str_save_columns].mean()
            current = current.reset_index() #retain site_name
        current['sample'] = sample
        results_dict[key] = results_dict[key].append(current, ignore_index=True)

site_names = results_dict['uncorrected']['site_name'].unique()


# In[7]:


#if info about individual sites was kept, the averaging process can take quite a while. Save for later use.
if individual.lower()=='true':
    for i,key in enumerate(results_dict.keys()):
        data = results_dict[key].copy()

        data.to_csv(out_dir+'/plots/'+key+'.mean_data.txt', sep='\t', index=False)



# In[8]:


#generate plots
num_plots = len(results_dict.keys())

for j,site_name in enumerate(site_names):
    fig,axes = plt.subplots(1,num_plots,figsize=(4*num_plots,3.5), sharey = 'row')
    for i,key in enumerate(results_dict.keys()):
        data = results_dict[key].copy()
        ax = axes[i]
        for sample in data['sample'].unique():
            current = data[(data['sample']==sample) & (data['site_name']==site_name)]
            ax.plot(save_columns, current[str_save_columns].T, label=sample)
        ax.tick_params(labelleft=True)
        ax.set_title(site_name+' '+key)
        ax.set_xlabel('distance from site')

    axes[0].set_ylabel('normalized coverage')

    if len(data['sample'].unique())<15:
        axes[-1].legend(bbox_to_anchor=[1,1],loc = 'upper left')
    else:
        axes[-1].legend(bbox_to_anchor=[1,1],loc = 'upper left',ncol=2)

    fig.tight_layout()
    plt.savefig(out_dir+'/plots/'+site_name+'.pdf')
    plt.close('all')
    if j%20==0:
        print(j,site_name)
        sys.stdout.flush()


# In[ ]:





# In[ ]: