diff --git a/.misc/build.sh b/.misc/build.sh new file mode 100644 index 0000000..4ff9a13 --- /dev/null +++ b/.misc/build.sh @@ -0,0 +1,22 @@ +#! /usr/bin/env bash + +# for installing conda package +mkdir -p $PREFIX/bin +mkdir -p $PREFIX/bin/lepmap3 +mkdir -p $PREFIX/bin/lepanchor +# LepWrap executable +chmod +x LepWrap +cp LepWrap $PREFIX/bin/ +# associated scripts +chmod +x scripts/* +cp scripts/* $PREFIX/bin/ +# LepMap3 modules and scripts +cp software/LepMap3/*.class $PREFIX/bin/lepmap3 +cp software/LepMap3/scripts/* $PREFIX/bin +# LepAnchor modules and scripts +cp software/LepAnchor/*.class $PREFIX/bin/lepanchor +cp software/LepAnchor/scripts/* $PREFIX/bin +cp software/LepAnchor/deps/ucsc_binaries/* $PREFIX/bin +cp software/LepAnchor/deps/*.pl software/LepAnchor/deps/Red software/LepAnchor/deps/all_lastz.ctl software/LepAnchor/deps/scoreMatrix.q software/LepAnchor/deps/step* $PREFIX/bin +# Snakemake rules +cp rules/LepAnchor/*.smk rules/LepMap3/*.smk $PREFIX/bin diff --git a/.misc/install.sh b/.misc/install.sh new file mode 100644 index 0000000..f791fdc --- /dev/null +++ b/.misc/install.sh @@ -0,0 +1,28 @@ +#! 
/usr/bin/env bash + +if [ -z "$CONDA_PREFIX" ]; then + echo "No active conda environment detected, will not install dependencies unless in an active environment" + exit 1 +fi + +# install LepWrap into conda PATH +mkdir -p $CONDA_PREFIX/bin +mkdir -p $CONDA_PREFIX/bin/lepmap3 +mkdir -p $CONDA_PREFIX/bin/lepanchor +# LepWrap executable +cp LepWrap $CONDA_PREFIX/bin/ +chmod +x $CONDA_PREFIX/bin/LepWrap +# associated scripts +chmod +x scripts/* +cp scripts/* $CONDA_PREFIX/bin/ +# LepMap3 modules and scripts +cp software/LepMap3/*.class $CONDA_PREFIX/bin/lepmap3 +cp software/LepMap3/scripts/* $CONDA_PREFIX/bin +# LepAnchor modules and scripts +cp software/LepAnchor/*.class $CONDA_PREFIX/bin/lepanchor +cp software/LepAnchor/scripts/* $CONDA_PREFIX/bin +ln -s $CONDA_PREFIX/bin/lepmap3/*.class $CONDA_PREFIX/bin/lepanchor/*.class $CONDA_PREFIX/bin/ +cp software/LepAnchor/deps/ucsc_binaries/* $CONDA_PREFIX/bin +cp software/LepAnchor/deps/*.pl software/LepAnchor/deps/Red software/LepAnchor/deps/all_lastz.ctl software/LepAnchor/deps/scoreMatrix.q software/LepAnchor/deps/step* $CONDA_PREFIX/bin +# Snakemake rules +cp rules/LepAnchor/*.smk rules/LepMap3/*.smk $CONDA_PREFIX/bin \ No newline at end of file diff --git a/.misc/meta.yml b/.misc/meta.yml new file mode 100644 index 0000000..15c625f --- /dev/null +++ b/.misc/meta.yml @@ -0,0 +1,49 @@ +{% set version = "3.6.2" %} +{% set sha256 = "0f0998ecd50b586d69b50e53d9a0b3d5aa3afa70bd96fe85c2fa3ba02e58278b" %} + +package: + name: lepwrap + version: '{{ version }}' + +source: + url: https://github.com/pdimens/LepWrap/archive/refs/tags/{{ version }}.tar.gz + sha256: {{ sha256 }} + +build: + number: 3 + noarch: generic + +requirements: + build: + host: + run: + - bzip2 + - font-ttf-dejavu-sans-mono + - font-ttf-ubuntu + - pygraphviz + - graphviz + - imagemagick + - openjdk + - pandoc + - python >=3.9 + - r-base >=4 + - r-dplyr + - r-tidyr + - r-stringr + - r-ggplot2 + - r-readr + - sed + - snakemake >=6.4 + +test: + commands: + - "java 
--version" + - "R --version" +about: + home: "https://github.com/pdimens/LepWrap/" + license: "The GNU General Public License v3.0 (GPL3)" + summary: "The Snakemake pipeline to use Lep-Map3 to create linkage maps and LepAnchor for anchoring+orienting genome assemblies." + +extra: + container: + extended-base: True diff --git a/LepWrap b/LepWrap index e612ca6..3c4f521 100755 --- a/LepWrap +++ b/LepWrap @@ -1,230 +1,120 @@ #! /usr/bin/env bash +# Help text +if [[ -z "$1" ]]; then + echo "Perform the modules of Lep-Map3 and/or Lep-Anchor" + echo -n "A config file (default: config.yml) must be configured" + if [ ! -f config.yml ]; then + echo " (one was created for you)" + generate_config.sh > config.yml + fi + echo -e "The second positional argument is optional if your config file is named config.yml\n" + printf "\033[01;32m" + printf "[usage]" + printf "\033[0m" + echo " LepWrap <# of threads> " + printf "\033[01;32m" + printf "[example]" + printf "\033[0m" + echo " LepWrap 16" + printf "\033[01;32m" + printf "[example]" + printf "\033[0m" + echo -e " LepWrap 30 kosambi.20iterations.yml" + exit 1 +fi + +if [ -z "$CONDA_PREFIX" ]; then + echo -e "No active conda environment detected and one is required for the pipeline to work. If you cloned the repository rather than installing the conda package, you may install the environment using:\n" + printf "\033[01;32m" + printf "conda env create -f conda_setup.yml\n\n" + printf "\033[0m" + echo -e "Then, activate it using:\n" + printf "\033[01;32m" + printf "conda activate lepwrap\n" + printf "\033[0m" + exit 1 +fi + +# Check for snakemake if which snakemake &>/dev/null; then foo=1 else echo -e "ERROR:\nSnakemake installation is required to run LepWrap, but not found in the current environment." - echo -e "If [ana|mini]conda are installed, created a pre-configred environment with:\n" + echo -e "It is likely LepWrap was not installed using [ana|mini]conda, which bundles all the dependencies." 
+ echo -e "Please install LepWrap using conda: " + printf "\033[01;32m" + printf "conda install -c bioconda lepwrap\n\n" + printf "\033[0m" + echo -e "Alternatively, if you cloned the repository, you may install the environment using:\n" printf "\033[01;32m" printf "conda env create -f conda_setup.yml\n" printf "\033[0m" exit 1 fi -if [[ -z "$2" ]]; then - if [ ! -f config.yml ]; then - echo -n "Error: " - printf "\033[01;32m" - printf "config.yml" - printf "\033[0m" - echo " not found" - echo "The file 'config.yml' was created for you, please edit it and run LepWrap again" - cat <config.yml -# Configuration file for LepWrap -#=======================================================# -# Lep-Map 3 # -#=======================================================# -# Change this to false if you want to skip Lep-Map3 -run_lepmap: true - - #----- ParentCall2 ------# -# The filtered VCF file with your genotype likelihoods: -vcf: "mydata.vcf" - -# Instructions to create pedigree file: https://sourceforge.net/p/lep-map3/wiki/software/LepMap3 Home/#parentcall2 -# the pedigree file associated with your data -pedigree: "pedigree.txt" - -# Additional parameters for ParentCall2 (e.g. halfSibs=1), if any -extra_params_ParentCall: "removeNonInformative=1" - - - #----- Filtering2 -----# -# Data tolerance value-- set this to 0 if you want to skip the Filtering2 module -data_tol: 0 - -# Additional parameters for Filtering2 (e.g. convert2Biallelic=1), if any -extra_params_Filtering: "" - - - #----- SeperateChromosomes2 -----# -# LepWrap will iteratively perform SeperateChromosomes2 for each -# LOD score in the range of lod_min to lod_max - -# The minimum LOD for SeperateChromosomes2 -lod_min: 10 - -# The maximum LOD for SeperateChromosomes2 -lod_max: 50 - -# Use only markers with informative father (1), mother(2), both parents(3) or neither parent(0) -informative: "informativeMask=3" - -# Additional parameters for SeparateChromosomes2 (e.g. 
distrotionLOD=1), if any -extra_params_SeparateChromosomes: "sizeLimit=5 distortionLod=1" - - - #----- JoinSingles2ALL -----# -# Set this to false if you want to skip joining singles (0) to linkage groups -run_joinsingles2all: false - -# These are the parameters for JoinSingles2ALL, and are highly data-dependent -# Start with lower values for lod_limit and increase as necessary -lod_limit: "lodLimit=2" - -# Start with lower values for lod_limit and increase as necessary -lod_difference: "lodDifference=2" - -# Additional parameters for JoinSingles2All (e.g. iterate=1), if any -extra_params_JoinSingles: "iterate=1 distortionLod=1" - - - #----- OrderMarkers2 -----# -# Set exp_lg to your expected number of chromosomes for iterative ordering -exp_lg: 24 - -# Additional parameters for OrderMarkers2 (e.g. hyperPhaser=1), if any -# I recommend setting numMergeIterations to ~100 (Lep-Map3 default is 6) -extra_params_OrderMarkers: "useKosambi=1 phasingIterations=2 numMergeIterations=100" - - - #----- Edge Trimming -----# -# Edge trimming will examine the first and last X% of markers in a linkage group -# and remove clusters that are N% centimorgans (of the total cM span) away from -# the next marker. You can "skip" trimming by setting trim_cutoff really high (e.g. 80-100). - -# Set edge_length to the percent number of markers you would like to examine from either end of the linkage group -# Value can be an integer or decimal, i.e. 
15 is the same as 0.15, which both mean "15%" (10-20 is reasonable) -edge_length: 20 +# make sure the first argument is a number (number of threads) +# function from https://stackoverflow.com/a/61835747 +is_int() { case ${1#[-+]} in '' | *[!0-9]* ) return 1;; esac ;} -# Set trim_cuttoff to the centiMorgan distance cutoff (5-10 is reasonable) -trim_cutoff: 100 - - - #----- Re-OrderMarkers2 -----# -# The second round of OrderMarkers will use the same basic parameters as the first round (but not the extra params) -# If there are additional parameters you would like to use, add them here: -extra_params_reOrderMarkers: "improveOrder=1 useKosambi=1 numMergeIterations=75" - - #----- Calculate Distances -----# -# If you used useKosambi=1 or useMorgan=1 for Ordering/reOrdering, add that same -# parameter to distance_method, otherwise leave it as a blank string -distance_method: "useKosambi=1" - - -#=======================================================# -# Lep-Anchor # -#=======================================================# - #---- global settings ----# -# Change this to false if you want to skip Lep-Anchor -run_lepanchor: true - -# The path to the genome assembly you are trying to anchor -assembly: "assembly.fasta" - -# The number of linkage groups you have -lg_count: 24 - -# If you have a PAF file of long reads mapped to your genome, add it here, otherwise leave the text as "/dev/null" -PAF_file: "/dev/null" - -# If you have a proximity file add it here, otherwise leave the text as "/dev/null". -# This isn't yet implemented in Lep-Anchor. -proximity_file: "/dev/null" - - - #----- CleanMap -----# -# Additional parameters for CleanMap (e.g. chimericDistance=500), if any -extra_params_CleanMap: "" - - - #----- Map2Bed -----# -# Additional parameters for Map2Bed (e.g. markerSupport=4), if any -extra_params_Map2Bed: "" - - - #----- PlaceAndOrientContigs -----# -# Choose which of the input types you want to generate by leaving it uncommented. 
Intervals are the default, but either works. -#lepanchor_input: "noIntervals=0" # data is intervals -lepanchor_input: "noIntervals=1" # data is distances - -# The size limit for detecting potential haplotype contigs (default: ~5000) -# Set this value really high (50000+) to ignore haplotype removal in between PlaceOrient iterations -haplotype_limit: 5000 - -# Additional parameters you would like to use for PlaceAndOrientContigs, if any (e.g. randomOrder=1) -extra_params_PlaceOrient: "keepEmptyIntervals=1 numRuns=10" - - - #----- Edge Trimming -----# -# Edge trimming will examine the first and last X% of markers in a linkage group -# and remove clusters that are N% centimorgans (of the total cM span) away from -# the next marker. You can "skip" trimming by setting LA_trim_cutoff really high (e.g. 80-100) - -# Set edge_length to the percent number of markers you would like to examine from either end of the linkage group -# Value can be an integer or decimal, i.e. 15 is the same as 0.15, which both mean "15%" (10-15 is reasonable) -LA_edge_length: 20 - -# Set trim_cuttoff to the centiMorgan distance cutoff (5 is reasonable) -LA_trim_cutoff: 5 -EOF - exit 1 - fi - CONF=config.yml +if $(is_int $1); then + foo=1 else - if [ ! -f $2 ]; then - echo -n "Error: Config file " - printf "\033[01;32m" - printf "$2" - printf "\033[0m" - echo " not found" - exit 1 - else - CONF=$2 - fi + echo "Error: the first argument must be the number of threads, as an integer." + exit 1 fi -if [[ -z "$1" ]]; then - echo "Perform the modules of Lep-Map3 and/or Lep-Anchor" - echo "Make sure the config file (default: config.yml) is properly configured for how you want to run things" - echo "The second positional argument is optional if your config file is named config.yml" - echo "" - echo "[usage] LepWrap <# of threads> " - echo "[example] LepWrap 16" - echo "[example] LepWrap 30 kosambi.20iterations.yml" - exit 1 +if [[ ! -z "$2" ]]; then + if [ ! 
-f $2 ]; then + echo -n "Error: Config file " + printf "\033[01;32m" + printf "$2" + printf "\033[0m" + echo " was not found, but it was created for you. Please edit it and run LepWrap again" + generate_config.sh > $2 + exit 1 + else + CONF=$2 + fi +else + if [ ! -f config.yml ]; then + echo -n "Error: " + printf "\033[01;32m" + printf "config.yml" + printf "\033[0m" + echo " not found" + echo "It was created for you. Please edit it and run LepWrap again" + generate_config.sh > config.yml + exit 1 + fi + CONF=config.yml fi lepmap(){ - echo "Running Lep-Map3" + printf "\033[01;32m" + printf "Running Lep-Map3\n" + printf "\033[0m" sleep 2s - snakemake --cores $1 --snakefile ./rules/LepMap3/LepMap3.smk --configfile $CONF --directory . + snakemake --cores $1 --snakefile $CONDA_PREFIX/bin/LepMap3.smk --configfile $CONF --directory . } lepanchor(){ - echo -e "\nRunning Lep-Anchor" + printf "\033[01;32m" + printf "Running Lep-Anchor\n" + printf "\033[0m" sleep 2s - snakemake --cores $1 --snakefile ./rules/LepAnchor/LepAnchor.smk --configfile $CONF --directory . + snakemake --cores $1 --snakefile $CONDA_PREFIX/bin/LepAnchor.smk --configfile $CONF --directory . 
} -# copy lastz binaries into conda path if they aren't already there -if [ -z "$CONDA_PREFIX" ]; then - echo "No active conda environment detected" -else - cp -n software/LepAnchor/deps/ucsc_binaries/* $CONDA_PREFIX/bin -fi - -LM=$(grep "run_lepmap" config.yml | cut -d":" -f2 | xargs | tr '[:upper:]' '[:lower:]') -LA=$(grep "run_lepanchor" config.yml | cut -d":" -f2 | xargs | tr '[:upper:]' '[:lower:]') +LM=$(grep "run_lepmap" $CONF | cut -d":" -f2 | xargs | tr '[:upper:]' '[:lower:]') +LA=$(grep "run_lepanchor" $CONF | cut -d":" -f2 | xargs | tr '[:upper:]' '[:lower:]') if [ $LM == "true" ]; then lepmap $1 $CONF else echo "Skipping Lep-Map3" -fi - -if [ $LA == "true" ]; then +fi && if [ $LA == "true" ]; then lepanchor $1 $CONF else - echo "Skipping LepAnchor" -fi \ No newline at end of file + echo "Skipping Lep-Anchor" +fi diff --git a/README.md b/README.md index f74c2d6..ecfec7a 100644 --- a/README.md +++ b/README.md @@ -2,24 +2,30 @@ _It's Lep-Map3 and Lep-Anchor, but with snakes 🐍🐍_ -[![documentation link](https://img.shields.io/badge/docs-wiki-75ae6c?style=for-the-badge&logo=Read%20The%20Docs)](https://github.com/pdimens/LepWrap/wiki) +[![alt text](https://img.shields.io/badge/docs-wiki-75ae6c?style=for-the-badge&logo=Read%20The%20Docs)](https://github.com/pdimens/LepWrap/wiki) [![Cite](https://img.shields.io/badge/Cite-10.5281/zenodo.6055566-e1e1e1?style=for-the-badge)](https://zenodo.org/badge/latestdoi/260516189) - - # LepWrap LepWrap is a reusable pipeline to use the linkage map software [Lep-Map3](https://sourceforge.net/projects/lep-map3/) and the genome assembly map-based anchoring and orienting software [Lep-Anchor](https://sourceforge.net/p/lep-anchor/wiki/Home/). Check out [the documentation/wiki](https://github.com/pdimens/LepWrap/wiki) for detailed installation, usage, and workflow information. 
### How to install -You will need a `conda` installation ([Anaconda](https://docs.anaconda.com/anaconda/install/) or [Miniconda](https://docs.conda.io/en/latest/miniconda.html), I recommend Miniconda), along with downloading the latest release or cloning this repository locally. Using the latest release is a more stable solution. +You will need a `conda` installation ([Anaconda](https://docs.anaconda.com/anaconda/install/) or [Miniconda](https://docs.conda.io/en/latest/miniconda.html), I recommend Miniconda). Alternatively, you can download latest release or clone this repository locally. -#### 1. Cloning LepWrap +#### 1. The Easy Way™️ +Create an environment called `lepwrap` and install `LepWrap` into it in a single command. +```bash +conda create -n lepwrap -c bioconda lepwrap +``` +Activate the environment with `conda activate lepwrap` + +#### 2. The Other Way +##### 2.1 Cloning LepWrap Download a zip of this repository using the "Code" button on the top-right and unzip it on your machine or: ```bash git clone https://github.com/pdimens/LepWrap.git ``` -#### 2. Installing other dependencies +##### 2.2 Installing everything Assuming you have `anaconda` or `miniconda` installed: ```bash cd LepWrap @@ -29,18 +35,22 @@ This will create an environment called `lepwrap` that can be activated with: ```bash conda activate lepwrap ``` +Then, install all the software dependencies into the environment's path with +```bash +.misc/install.sh +``` ### How to run You will need to modify `config.yml` to suit your needs, then you can simply run the pipeline with the wrapper: ```bash -./LepWrap +LepWrap ``` where `` is an integer of the maximum number of cores/threads you want the pipeline to use and `` (optional!) is the name of the config file, if it's different than `config.yml`. If no config file is found in the directory, LepWrap will generate a default one for you to edit. 
**Examples** ```bash -./LepWrap 15 # assumes config.yml -./LepWrap 32 nojoinsingles.yml # specific config file +LepWrap 15 # assumes config.yml +LepWrap 32 nojoinsingles.yml # specific config file ``` ### Something to keep in mind LepWrap does things a certain way, employing the most common/reasonable way of using Lep-Map3 (and LepAnchor more or less). Current versions are **a lot** more flexible that the predecessors, but might still lack something you need. Your study is unique, and I encourage you to clone/fork this repository and adapt LepWrap to it! All of the code in LepWrap is written in human-readable bash or aggressively annotated R, so give it a shot and adapt it to your workflow. PR's always welcome! diff --git a/conda_setup.yml b/conda_setup.yml index c81b970..8457238 100644 --- a/conda_setup.yml +++ b/conda_setup.yml @@ -207,7 +207,6 @@ dependencies: - r-cli=2.5.0=r40hc72bb7e_0 - r-clipr=0.7.1=r40h142f84f_0 - r-colorspace=2.0_1=r40hcfec24a_0 - - r-cowplot=1.1.1=r40hc72bb7e_0 - r-cpp11=0.2.7=r40hc72bb7e_0 - r-crayon=1.4.1=r40hc72bb7e_0 - r-curl=4.3.1=r40hcfec24a_0 diff --git a/config.yml b/config.yml index b65ff0f..afdc5be 100644 --- a/config.yml +++ b/config.yml @@ -126,9 +126,9 @@ extra_params_Map2Bed: "" #lepanchor_input: "noIntervals=0" # data is intervals lepanchor_input: "noIntervals=1" # data is distances -# The size limit for detecting potential haplotype contigs (default: ~5000) +# The size limit for detecting potential haplotype contigs (LepAnchor default: 2000) # Set this value really high (50000+) to ignore haplotype removal in between PlaceOrient iterations -haplotype_limit: 5000 +haplotype_limit: 2000 # Additional parameters you would like to use for PlaceAndOrientContigs, if any (e.g. 
randomOrder=1) extra_params_PlaceOrient: "keepEmptyIntervals=1 numRuns=10" diff --git a/rules/LepAnchor/LA_extra.smk b/rules/LepAnchor/LA_extra.smk deleted file mode 100644 index d98b15d..0000000 --- a/rules/LepAnchor/LA_extra.smk +++ /dev/null @@ -1,11 +0,0 @@ -rule remove_haplos: - input: - bedfile = "10_Anchoring/map_extra.bed", - haplotypes = "10_Anchoring/suspected.haplotypes.after" - output: "10_Anchoring/map.nohaplotypes.bed" - message: "Creating bedfile with suspected haplotypes removed" - shell: - """ - grep -w -v -f <(cut -f 2 {input.haplotypes}) {input.bedfile} > {output} - #awk -f software/LepAnchor/scripts/removeHaplotypes.awk {input.bedfile} {input.haplotypes} > {output}" - """ \ No newline at end of file diff --git a/rules/LepAnchor/LepAnchor.smk b/rules/LepAnchor/LepAnchor.smk index a746cef..c318868 100644 --- a/rules/LepAnchor/LepAnchor.smk +++ b/rules/LepAnchor/LepAnchor.smk @@ -21,9 +21,10 @@ lg_range = list(range(1,lg+1)) include: "generate_inputs.smk" include: "mask_and_chain.smk" -include: "place_orient.smk" -include: "place_orient_ii.smk" -include: "place_orient_iii.smk" +include: "place_orient1.smk" +include: "place_orient2.smk" +include: "place_orient3.smk" +#include: "place_orient4.smk" include: "build_agp.smk" include: "build_fasta.smk" include: "mareymaps_untrimmed.smk" diff --git a/rules/LepAnchor/build_agp.smk b/rules/LepAnchor/build_agp.smk index a3f19bc..983d57a 100644 --- a/rules/LepAnchor/build_agp.smk +++ b/rules/LepAnchor/build_agp.smk @@ -1,6 +1,5 @@ rule construct_agp: - input: - cleaned = "10_PlaceAndOrientContigs/overlaps_rm.la" + input: "10_PlaceAndOrientContigs/overlaps.removed.la" output: agp = report("11_AGP/contigs/chr.{lg_range}.agp", category = "Contig AGP Files"), scaff_agp = report("11_AGP/scaffolds/chr.{lg_range}.scaffolds.agp", category = "Scaffold AGP Files") @@ -9,27 +8,25 @@ rule construct_agp: chrom = "{lg_range}" shell: """ - awk -vn={params.chrom} '($5==n)' {input.cleaned} | awk -vprefix="LG" 
-vlg={params.chrom} -f software/LepAnchor/scripts/makeagp_full2.awk - > {output.agp} - awk -vn={params.chrom} '($5==n)' {input.cleaned} | awk -vprefix="LG" -vlg={params.chrom} -f software/LepAnchor/scripts/makeagp2.awk - > {output.scaff_agp} + awk -vn={params.chrom} '($5==n)' {input} | awk -vprefix="LG" -vlg={params.chrom} -f $CONDA_PREFIX/bin/makeagp_full2.awk - > {output.agp} + awk -vn={params.chrom} '($5==n)' {input} | awk -vprefix="LG" -vlg={params.chrom} -f $CONDA_PREFIX/bin/makeagp2.awk - > {output.scaff_agp} """ - rule unused: input: lengths = "10_PlaceAndOrientContigs/contigs.length", - haplos = "10_PlaceAndOrientContigs/suspected.haplotypes.before", - agp = expand("11_AGP/contigs/chr.{lgs}.agp", lgs = lg_range), + haplos = "10_PlaceAndOrientContigs/suspected.haplotypes.initial", + agp = expand("11_AGP/contigs/chr.{lgs}.agp", lgs = lg_range) output: - txt = "11_AGP/not_used_final.txt", + txt = "11_AGP/not_used.txt", agp = "11_AGP/not_used.agp" message: "Finding unused contigs" shell: """ - cut -f 1 {input.lengths} | grep -v -w -F -f <(cut -f 2 {input.haplos};awk '($5!="U"){{print $6}}' {input.agp}) > {output.txt} + cut -f 1 {input.lengths} | grep -v -w -F -f <(cut -f 2 {input.haplos}; awk '($5!="U"){{print $6}}' {input.agp}) > {output.txt} grep -F -w -f {output.txt} {input.lengths} | awk '{{print $1,1,$2,1,"W",$1,1,$2,"+"}}' > {output.agp} """ - rule build_final_agp: input: agp = expand("11_AGP/contigs/chr.{lgs}.agp", lgs = lg_range), diff --git a/rules/LepAnchor/build_fasta.smk b/rules/LepAnchor/build_fasta.smk index 8aefe0a..9d20133 100644 --- a/rules/LepAnchor/build_fasta.smk +++ b/rules/LepAnchor/build_fasta.smk @@ -2,49 +2,33 @@ rule build_scaffold_only_fasta: input: assembly = geno, agp = "11_AGP/lepanchor.contigs.only.agp" - output: - fasta = "12_Fasta/Anchored.scaffolds.only.fa.gz", - message: "Constructing final scaffold-only fasta file {output.fasta}" - shell: - """ - gunzip -fc {input.assembly} | awk -f software/LepAnchor/scripts/makefasta.awk - 
{input.agp} | gzip > {output.fasta} - """ + output: "12_Fasta/Anchored.scaffolds.only.fa.gz" + message: "Constructing final scaffold-only fasta file {output}" + shell: "gunzip -fc {input.assembly} | awk -f $CONDA_PREFIX/bin/makefasta.awk - {input.agp} | gzip > {output}" rule build_scaffold_contig_fasta: input: assembly = geno, agp = "11_AGP/lepanchor.contigs.all.agp" - output: - fasta = "12_Fasta/Anchored.scaffolds.fa.gz", - message: "Constructing final scaffold fasta file {output.fasta}" - shell: - """ - gunzip -fc {input.assembly} | awk -f software/LepAnchor/scripts/makefasta.awk - {input.agp} | gzip > {output.fasta} - """ + output: "12_Fasta/Anchored.scaffolds.fa.gz" + message: "Constructing final scaffold fasta file {output}" + shell: "gunzip -fc {input.assembly} | awk -f $CONDA_PREFIX/bin/makefasta.awk - {input.agp} | gzip > {output}" rule build_contig_only_fasta: input: assembly = geno, scaff_agp = "11_AGP/lepanchor.scaffolds.only.agp" - output: - fasta = "12_Fasta/Anchored.contigs.only.fa.gz" - message: "Constructing final contig-only fasta file {output.fasta}" - shell: - """ - gunzip -fc {input.assembly} | awk -f software/LepAnchor/scripts/makefasta.awk - {input.scaff_agp} | gzip > {output.fasta} - """ + output: "12_Fasta/Anchored.contigs.only.fa.gz" + message: "Constructing final contig-only fasta file {output}" + shell: "gunzip -fc {input.assembly} | awk -f $CONDA_PREFIX/bin/makefasta.awk - {input.scaff_agp} | gzip > {output}" rule build_contig_fasta: input: assembly = geno, scaff_agp = "11_AGP/lepanchor.scaffolds.all.agp" - output: - fasta = "12_Fasta/Anchored.contigs.fa.gz" - message: "Constructing final contig fasta file {output.fasta}" - shell: - """ - gunzip -fc {input.assembly} | awk -f software/LepAnchor/scripts/makefasta.awk - {input.scaff_agp} | gzip > {output.fasta} - """ \ No newline at end of file + output: "12_Fasta/Anchored.contigs.fa.gz" + message: "Constructing final contig fasta file {output}" + shell: "gunzip -fc {input.assembly} | awk 
-f $CONDA_PREFIX/bin/makefasta.awk - {input.scaff_agp} | gzip > {output}" \ No newline at end of file diff --git a/rules/LepAnchor/generate_inputs.smk b/rules/LepAnchor/generate_inputs.smk index 6c34c83..1151398 100644 --- a/rules/LepAnchor/generate_inputs.smk +++ b/rules/LepAnchor/generate_inputs.smk @@ -2,13 +2,13 @@ rule extract_markers: input: "2_Filtering/data.filtered.lepmap3.gz" output: report("snps.txt", category = "Data") message: "Extracting marker information from Lep-Map3 data file {input}" - shell: "scripts/extract_markers.sh {input}" + shell: "extract_markers.sh {input}" rule generate_input_data: input: markers = "snps.txt", - data = expand("7_Intervals/ordered.{x}.intervals", x = range(1, lg + 1)) if data_type == "noIntervals=0" else expand("7_Distances/ordered.{x}.distances", x = range(1, lg + 1)) + data = expand("7_Intervals/ordered.{x}.intervals", x = lg_range) if data_type == "noIntervals=0" else expand("7_Distances/ordered.{x}.distances", x = lg_range) output: data = report("10_PlaceAndOrientContigs/lepanchor.input", category = "Data") message: "Combining {params} Lep-Map3 files into single LepAnchor input {output}" @@ -31,31 +31,28 @@ rule contiglengths: input: geno output: report("10_PlaceAndOrientContigs/contigs.length", category = "Data") message: "Getting contig lengths" - shell: "gunzip -fc {input} | awk -f software/LepAnchor/scripts/contigLength.awk > {output}" + shell: "gunzip -fc {input} | awk -f $CONDA_PREFIX/bin/contigLength.awk > {output}" rule find_haplotypes: input: "9_Chain/chainfile.gz" - output: report("10_PlaceAndOrientContigs/suspected.haplotypes.before", category = "Logs") + output: report("10_PlaceAndOrientContigs/suspected.haplotypes.initial", category = "Logs") message: "Finding non-haplotype contigs not included in map.bed" - shell: - """ - gunzip -fc {input} | awk -f software/LepAnchor/scripts/findFullHaplotypes.awk > {output} - """ + shell: "gunzip -fc {input} | awk -f $CONDA_PREFIX/bin/findFullHaplotypes.awk > 
{output}" rule liftover: input: chain = "9_Chain/chainfile.gz", intervals = "10_PlaceAndOrientContigs/lepanchor.input", - haplos = "10_PlaceAndOrientContigs/suspected.haplotypes.before" + haplos = "10_PlaceAndOrientContigs/suspected.haplotypes.initial" output: lift = report("10_PlaceAndOrientContigs/liftover.la", category = "Lifted Intervals"), sortedlift = report("10_PlaceAndOrientContigs/liftover.sorted.la", category = "Lifted Intervals") message: "Running liftoverHaplotypes for the input maps" shell: """ - gunzip -fc {input.chain} | java -cp software/LepAnchor LiftoverHaplotypes map={input.intervals} haplotypes={input.haplos} chain=- > {output.lift} + gunzip -fc {input.chain} | java -cp $CONDA_PREFIX/bin/lepanchor LiftoverHaplotypes map={input.intervals} haplotypes={input.haplos} chain=- > {output.lift} cat {output.lift} | sort -V -k 1,1 -k 2,2n > {output.sortedlift} """ @@ -67,7 +64,8 @@ rule cleanmap: message: "Running CleanMap" params: extras = cleanmap_extra - shell: "java -cp software/LepAnchor CleanMap map={input} {params.extras} > {output} 2> {log}" + shell: "java -cp $CONDA_PREFIX/bin/lepanchor CleanMap map={input} {params.extras} > {output} 2> {log}" + rule map2bed: input: @@ -78,13 +76,13 @@ rule map2bed: message: "Running Map2Bed" params: extras = map2bed_extra - shell: "java -cp software/LepAnchor Map2Bed map={input.cleanmap} contigLength={input.lengths} {params.extras} > {output} 2> {log}" + shell: "java -cp $CONDA_PREFIX/bin/lepanchor Map2Bed map={input.cleanmap} contigLength={input.lengths} {params.extras} > {output} 2> {log}" rule ungrouped: input: lengths = "10_PlaceAndOrientContigs/contigs.length", - haplos = "10_PlaceAndOrientContigs/suspected.haplotypes.before", + haplos = "10_PlaceAndOrientContigs/suspected.haplotypes.initial", bedfile = "10_PlaceAndOrientContigs/map.bed" output: bedfile = "10_PlaceAndOrientContigs/map_extra.bed" diff --git a/rules/LepAnchor/mareymaps_untrimmed.smk b/rules/LepAnchor/mareymaps_untrimmed.smk index 
bf93a90..d285a5b 100644 --- a/rules/LepAnchor/mareymaps_untrimmed.smk +++ b/rules/LepAnchor/mareymaps_untrimmed.smk @@ -19,12 +19,12 @@ rule mareymap_data: """ for c in $(seq 1 {params.chrom}) do - awk -vn=$c '($3==n)' {input.lift} | awk -f software/LepAnchor/scripts/liftover.awk 11_AGP/contigs/chr.$c.agp - | awk -vm=1 '(/LG/ && NF>=4){{if (NF==4) $5=$4;print $1"\t"$2"\t"$3"\t"m"\t"$4"\t"$5}}' | gzip + awk -vn=$c '($3==n)' {input.lift} | awk -f $CONDA_PREFIX/bin/liftover.awk 11_AGP/contigs/chr.$c.agp - | awk -vm=1 '(/LG/ && NF>=4){{if (NF==4) $5=$4;print $1"\t"$2"\t"$3"\t"m"\t"$4"\t"$5}}' | gzip done > {output.mareydata} 2> {log} for c in $(seq 1 {params.chrom}) do - awk -vn=$c '($3==n)' {input.lift} | awk -f software/LepAnchor/scripts/liftover.awk 11_AGP/contigs/chr.$c.agp - | awk -vm=1 '(/LG/ && NR>=4){{if (NF>4) s=0.5; else s=1;print $1"\t"$2"\t"$3"\t"m"\t"s*($4+$5)}}' | gzip + awk -vn=$c '($3==n)' {input.lift} | awk -f $CONDA_PREFIX/bin/liftover.awk 11_AGP/contigs/chr.$c.agp - | awk -vm=1 '(/LG/ && NR>=4){{if (NF>4) s=0.5; else s=1;print $1"\t"$2"\t"$3"\t"m"\t"s*($4+$5)}}' | gzip done > {output.sexavg} 2> /dev/null """ @@ -43,7 +43,7 @@ rule mareymaps: message: "Creating Marey Maps" shell: """ - Rscript software/LepAnchor/scripts/plot_marey.R {input.data} 11_AGP/contigs - Rscript scripts/LASummary.r {input.data} true - Rscript scripts/LASummarySexAvg.r {input.sexavg} + Rscript $CONDA_PREFIX/bin/plot_marey.R {input.data} 11_AGP/contigs + LASummary.r {input.data} true + LASummarySexAvg.r {input.sexavg} """ \ No newline at end of file diff --git a/rules/LepAnchor/mask_and_chain.smk b/rules/LepAnchor/mask_and_chain.smk index 2e26d64..9e06d0e 100644 --- a/rules/LepAnchor/mask_and_chain.smk +++ b/rules/LepAnchor/mask_and_chain.smk @@ -1,10 +1,10 @@ rule prep_geno: input: geno - output: "8_Repeatmask/inputgenome/lepanchorinput.fa" + output: "8_RepeatMask/inputgenome/lepanchorinput.fa" message: "Preparing genome for repeat masking" shell: """ - mkdir -p 
8_Repeatmask/inputgenome + mkdir -p 8_RepeatMask/inputgenome if (file {input} | grep -q compressed); then echo "- Assembly is compressed, creating decompressed copy: {output}" gunzip --stdout {input} > {output} @@ -16,25 +16,34 @@ rule prep_geno: rule repeatmask: - input: "8_Repeatmask/inputgenome/lepanchorinput.fa" - output: "8_Repeatmask/repeatmasked.fa.gz" - log: "8_Repeatmask/Red.log" + input: "8_RepeatMask/inputgenome/lepanchorinput.fa" + output: "8_RepeatMask/repeatmasked.fa.gz" + log: "8_RepeatMask/Red.log" message: "Using Red to repeat-mask {input}" threads: 30 shell: """ - echo "- Running Red" - software/LepAnchor/deps/Red -gnm 8_Repeatmask/inputgenome -msk 8_Repeatmask -sco 8_Repeatmask -cnd 8_Repeatmask -rpt 8_Repeatmask > {log} 2>> {log} + Red -gnm 8_RepeatMask/inputgenome -msk 8_RepeatMask -sco 8_RepeatMask -cnd 8_RepeatMask -rpt 8_RepeatMask > {log} 2>> {log} echo "- Compressing repeat-masked genome from Red" - gzip --stdout 8_Repeatmask/*.msk > {output} && rm 8_Repeatmask/*.msk + gzip --stdout 8_RepeatMask/*.msk > {output} && rm 8_RepeatMask/*.msk + """ + +rule lastz_config: + output: + ctl = "9_Chain/all_lastz.ctl", + scoremtx = "9_Chain/scoreMatrix.q" + message: "Creating LASTZ configuration inputs" + shell: + """ + generate_lastzctl.sh > {output.ctl} + generate_qscoremtx.sh > {output.scoremtx} """ - rule chain_1: input: - geno = "8_Repeatmask/repeatmasked.fa.gz", - ctrl = "software/LepAnchor/deps/all_lastz.ctl", - scoremtx = "software/LepAnchor/deps/scoreMatrix.q" + geno = "8_RepeatMask/repeatmasked.fa.gz", + ctrl = "9_Chain/all_lastz.ctl", + scoremtx = "9_Chain/scoreMatrix.q" output: out1 = "9_Chain/repeatmaskedx.sizes", out2 = "9_Chain/repeatmasked.sizes" @@ -42,9 +51,9 @@ rule chain_1: threads: 30 shell: """ - ln -srf {input} 9_Chain/ + ln -srf {input.geno} 9_Chain/ cd 9_Chain - ../software/LepAnchor/deps/step1.HM2 repeatmasked {threads} + step1.HM2 repeatmasked {threads} """ @@ -60,6 +69,6 @@ rule chain_2: shell: """ cd 9_Chain - 
../software/LepAnchor/deps/step2.HM2 repeatmasked {threads} && rm -r repeatmasked.repeatmaskedx.result/raw.axt + step2.HM2 repeatmasked {threads} && rm -r repeatmasked.repeatmaskedx.result/raw.axt ln -sr ../{output.original} ../{output.slink} """ diff --git a/rules/LepAnchor/place_orient.smk b/rules/LepAnchor/place_orient.smk deleted file mode 100644 index dd3546c..0000000 --- a/rules/LepAnchor/place_orient.smk +++ /dev/null @@ -1,35 +0,0 @@ -rule place_orient: - input: - chain = "9_Chain/chainfile.gz", - bedfile = "10_PlaceAndOrientContigs/map_extra.bed", - paf = paf, - prox = proximity, - lift = "10_PlaceAndOrientContigs/liftover.la" - output: - chrom = "10_PlaceAndOrientContigs/orient_1/chr.{lg_range}.la", - haplos = "10_PlaceAndOrientContigs/orient_1/haplotypes/chr.{lg_range}.haplo.suspected" - log: - chrom = report("10_PlaceAndOrientContigs/orient_1/logs/chr.{lg_range}.la.log", category = "Anchoring I Logs"), - haplos = "10_PlaceAndOrientContigs/orient_1/haplotypes/chr.{lg_range}.haplo.all", - errors = "10_PlaceAndOrientContigs/orient_1/errors/chr.{lg_range}.errors" - params: - chrom = "{lg_range}", - extras = place_orient_extra, - datatype = data_type, - haplo = haplo_limit - threads: 3 - message: "Running the 1st round of PlaceAndOrientContigs for linkage group {params.chrom}" - shell: - """ - gunzip -fc {input.chain} | java -cp software/LepAnchor PlaceAndOrientContigs numThreads={threads} bed={input.bedfile} chromosome={params.chrom} map={input.lift} chain=- paf={input.paf} proximity={input.prox} {params.datatype} {params.extras} > {output.chrom} 2> {log.chrom} - sort -n -r {log.chrom} | awk '($NF=="haplotype" && (!(($5 SUBSEP $6 SUBSEP $7) in h))){{h[$2,$3,$4]; print}}' > {log.haplos} - sort -n -r {log.chrom} | awk -vlimit={params.haplo} '($NF=="haplotype" && ($1>=($4-$3+1-limit)/limit) && (!(($5 SUBSEP $6 SUBSEP $7) in h))){{h[$2,$3,$4]; print}}' > {output.haplos} - grep "error$" {log.chrom} > {log.errors} - """ - - -rule mergehaplos: - input: 
expand("10_PlaceAndOrientContigs/orient_1/haplotypes/chr.{lg}.haplo.suspected", lg = lg_range) - output: "10_PlaceAndOrientContigs/suspected.haplotypes.after" - message: "Merging suspected haplotype contig information from the linkage groups" - shell: "cat {input} | sort | uniq > {output}" diff --git a/rules/LepAnchor/place_orient1.smk b/rules/LepAnchor/place_orient1.smk new file mode 100644 index 0000000..5f024aa --- /dev/null +++ b/rules/LepAnchor/place_orient1.smk @@ -0,0 +1,33 @@ +rule place_orient: + input: + chain = "9_Chain/chainfile.gz", + bedfile = "10_PlaceAndOrientContigs/map_extra.bed", + paf = paf, + prox = proximity, + lift = "10_PlaceAndOrientContigs/liftover.la" + output: + chrom = "10_PlaceAndOrientContigs/1_orient/chr.{lg_range}.la", + chromerr = "10_PlaceAndOrientContigs/1_orient/logs/chr.{lg_range}.la.log" + params: + chrom = "{lg_range}", + extras = place_orient_extra, + datatype = data_type + threads: 2 + message: "Running the 1st round of PlaceAndOrientContigs for linkage group {params.chrom}" + shell: + """ + gunzip -fc {input.chain} | java -cp $CONDA_PREFIX/bin/lepanchor PlaceAndOrientContigs chromosome={params.chrom} numThreads={threads} bed={input.bedfile} map={input.lift} chain=- paf={input.paf} proximity={input.prox} {params.datatype} {params.extras} > {output.chrom} 2> {output.chromerr} + """ + +rule propogate1: + input: + placed = expand("10_PlaceAndOrientContigs/1_orient/chr.{lgs}.la", lgs = lg_range), + errors = expand("10_PlaceAndOrientContigs/1_orient/logs/chr.{lgs}.la.log", lgs = lg_range), + bedfile = "10_PlaceAndOrientContigs/map_extra.bed" + output: + propogated = "10_PlaceAndOrientContigs/map.propogated.bed", + message: "First round of propogation with propogate4.awk" + shell: + """ + awk -f $CONDA_PREFIX/bin/propagate4.awk pass=1 {input.placed} pass=2 {input.errors} | awk -f $CONDA_PREFIX/bin/pickbed.awk - {input.bedfile} > {output.propogated} + """ diff --git a/rules/LepAnchor/place_orient2.smk 
b/rules/LepAnchor/place_orient2.smk new file mode 100644 index 0000000..243cef8 --- /dev/null +++ b/rules/LepAnchor/place_orient2.smk @@ -0,0 +1,48 @@ +rule place_orient2: + input: + chain = "9_Chain/chainfile.gz", + bedfile = "10_PlaceAndOrientContigs/map.propogated.bed", + paf = paf, + prox = proximity, + lift = "10_PlaceAndOrientContigs/liftover.la", + chrom = "10_PlaceAndOrientContigs/1_orient/chr.{lg_range}.la" + output: + chrom = "10_PlaceAndOrientContigs/2_orient/chr.{lg_range}.la", + chromerr = "10_PlaceAndOrientContigs/2_orient/logs/chr.{lg_range}.err" + params: + chrom = "{lg_range}", + extras = place_orient_extra, + datatype = data_type + threads: 2 + message: "Running 2nd round of PlaceAndOrientContigs for linkage group {params.chrom}" + shell: + """ + gunzip -fc {input.chain} | java -cp $CONDA_PREFIX/bin/lepanchor PlaceAndOrientContigs chromosome={params.chrom} numThreads={threads} $(awk -f $CONDA_PREFIX/bin/pickorientation.awk {input.chrom}) bed={input.bedfile} map={input.lift} chain=- paf={input.paf} proximity={input.prox} {params.datatype} {params.extras} > {output.chrom} 2> {output.chromerr} + """ + +rule propogate2: + input: + placed = expand("10_PlaceAndOrientContigs/2_orient/chr.{lgs}.la", lgs = lg_range), + bedfile = "10_PlaceAndOrientContigs/map.propogated.bed" + output: + prop = expand("10_PlaceAndOrientContigs/propogate/propogated.{lgs}.la", lgs = lg_range), + propogated = "10_PlaceAndOrientContigs/map.propogated2.bed", + iter1 = temp("10_PlaceAndOrientContigs/tmp1.la"), + iter2 = temp("10_PlaceAndOrientContigs/tmp2.la"), + iter3 = temp("10_PlaceAndOrientContigs/tmp3.la") + message: "Second round of propogation" + shell: + """ + awk -f $CONDA_PREFIX/bin/propagate.awk {input.placed} > {output.iter1} + awk -f $CONDA_PREFIX/bin/propagate.awk {output.iter1} > {output.iter2} + i=2 + + while ! 
cmp -s "10_PlaceAndOrientContigs/tmp$i.la" "10_PlaceAndOrientContigs/tmp$(( $i-1 )).la" ;do + awk -f $CONDA_PREFIX/bin/propagate.awk 10_PlaceAndOrientContigs/tmp$i.la > 10_PlaceAndOrientContigs/tmp$[$i+1].la + i=$[$i+1] + done + + #create prop*.la + awk -f $CONDA_PREFIX/bin/propagate2.awk 10_PlaceAndOrientContigs/tmp$i.la | awk '(/^[^#]/ && NF>=8){{++d[$1"\t"($7+0)"\t"($8+0)]; data[++line]=$0}}END{{for (i=1; i<=line; ++i) {{$0=data[i];if (d[$1"\t"($7+0)"\t"($8+0)] == 1) {{fn="10_PlaceAndOrientContigs/propogate/propogated."$5".la";print $0>fn}}}}}}' + awk '{{print $1"\t"($7+0)"\t"($8+0)"\t?\t"$5}}' {output.prop} | awk -f $CONDA_PREFIX/bin/pickbed.awk - {input.bedfile} > {output.propogated} + """ \ No newline at end of file diff --git a/rules/LepAnchor/place_orient3.smk b/rules/LepAnchor/place_orient3.smk new file mode 100644 index 0000000..342eeb1 --- /dev/null +++ b/rules/LepAnchor/place_orient3.smk @@ -0,0 +1,49 @@ + +rule place_orient3: + input: + chain = "9_Chain/chainfile.gz", + bedfile = "10_PlaceAndOrientContigs/map.propogated2.bed", + paf = paf, + prox = proximity, + lift = "10_PlaceAndOrientContigs/liftover.la", + chrom = "10_PlaceAndOrientContigs/1_orient/chr.{lg_range}.la", + propogated = "10_PlaceAndOrientContigs/propogate/propogated.{lg_range}.la" + output: + chrom = "10_PlaceAndOrientContigs/3_orient/chr.{lg_range}.la", + errors = "10_PlaceAndOrientContigs/3_orient/errors/chr.{lg_range}.err" + params: + chrom = "{lg_range}", + extras = place_orient_extra, + datatype = data_type + message: "Running 3rd round of PlaceAndOrientContigs for linkage group {params.chrom}" + threads: 2 + shell: + """ + gunzip -fc {input.chain} | java -cp $CONDA_PREFIX/bin/lepanchor PlaceAndOrientContigs chromosome={params.chrom} numThreads={threads} $(awk -f $CONDA_PREFIX/bin/pickorientation.awk {input.chrom}) bed={input.bedfile} map={input.lift} chain=- paf={input.paf} proximity={input.prox} evaluateAnchoring={input.propogated} improveAnchoring=1 {params.datatype} 
{params.extras} > {output.chrom} 2> {output.errors} + """ + +rule prune_contigblocks: + input: "10_PlaceAndOrientContigs/3_orient/chr.{lg_range}.la" + output: + chrom = "10_PlaceAndOrientContigs/pruned/chr.{lg_range}.pruned.la", + err = "10_PlaceAndOrientContigs/pruned/err/chr.{lg_range}.pruned.err" + message: "Pruning contig blocks without map support and removing overlaps" + params: + chrom = lg + shell: "awk -f $CONDA_PREFIX/bin/prune.awk {input} > {output.chrom} 2> {output.err}" + +rule prune_post: + input: + bedfile = "10_PlaceAndOrientContigs/map.propogated2.bed", + prunedchrom = expand("10_PlaceAndOrientContigs/pruned/chr.{lgs}.pruned.la", lgs = lg_range), + prunederr = expand("10_PlaceAndOrientContigs/pruned/err/chr.{lgs}.pruned.err", lgs = lg_range) + output: + overlaps = "10_PlaceAndOrientContigs/overlaps.removed.la", + pruned = "10_PlaceAndOrientContigs/pruned.la" + message: "Removing overlaps" + threads: 1 + shell: + """ + cat {input.prunederr} > {output.pruned} + awk -f $CONDA_PREFIX/bin/removeOverlaps.awk {input.bedfile} {input.prunedchrom} > {output.overlaps} + """ \ No newline at end of file diff --git a/rules/LepAnchor/place_orient4.smk b/rules/LepAnchor/place_orient4.smk new file mode 100644 index 0000000..e86a093 --- /dev/null +++ b/rules/LepAnchor/place_orient4.smk @@ -0,0 +1,47 @@ +rule place_orient4: + input: + chain = "9_Chain/chainfile.gz", + bedfile = "10_PlaceAndOrientContigs/map.propogated2.nohaplo.bed", + paf = paf, + prox = proximity, + lift = "10_PlaceAndOrientContigs/liftover.la", + chrom = "10_PlaceAndOrientContigs/1_orient/chr.{lg_range}.la", + chromlast = "10_PlaceAndOrientContigs/3_orient/chr.{lg_range}.la" + output: + chrom = "10_PlaceAndOrientContigs/4_orient/chr.{lg_range}.la", + err = "10_PlaceAndOrientContigs/4_orient/errors/chr.{lg_range}.errors" + message: "Running 4th round of PlaceAndOrientContigs for linkage group {params.chrom}" + params: + chrom = "{lg_range}", + extras = place_orient_extra, + datatype = data_type + 
threads: 2 + shell: + """ + gunzip -fc {input.chain} | java -cp $CONDA_PREFIX/bin/lepanchor PlaceAndOrientContigs chromosome={params.chrom} numThreads={threads} $(awk -f $CONDA_PREFIX/bin/pickorientation.awk {input.chrom}) bed={input.bedfile} map={input.lift} chain=- paf={input.paf} proximity={input.prox} {params.datatype} {params.extras} evaluateAnchoring={input.chromlast} improveAnchoring=1 > {output.chrom} 2> {output.err} + """ + +rule prune_contigblocks: + input: "10_PlaceAndOrientContigs/4_orient/chr.{lg_range}.la" + output: + chrom = "10_PlaceAndOrientContigs/pruned/chr.{lg_range}.pruned.la", + err = "10_PlaceAndOrientContigs/pruned/err/chr.{lg_range}.pruned.err" + message: "Pruning contig blocks without map support and removing overlaps" + params: + chrom = lg + shell: "awk -f $CONDA_PREFIX/bin/prune.awk {input} > {output.chrom} 2> {output.err}" + +rule prune_post: + input: + bedfile = "10_PlaceAndOrientContigs/map.propogated2.bed", + prunedchrom = expand("10_PlaceAndOrientContigs/pruned/chr.{lgs}.pruned.la", lgs = lg_range), + prunederr = expand("10_PlaceAndOrientContigs/pruned/err/chr.{lgs}.pruned.err", lgs = lg_range) + output: + overlaps = "10_PlaceAndOrientContigs/overlaps.removed.la", + pruned = "10_PlaceAndOrientContigs/pruned.la" + message: "Removing overlaps" + threads: 1 + shell: + """ + cat {input.prunederr} > {output.pruned} + awk -f $CONDA_PREFIX/bin/removeOverlaps.awk {input.bedfile} {input.prunedchrom} > {output.overlaps} + """ \ No newline at end of file diff --git a/rules/LepAnchor/place_orient_ii.smk b/rules/LepAnchor/place_orient_ii.smk deleted file mode 100644 index 4bbb5f6..0000000 --- a/rules/LepAnchor/place_orient_ii.smk +++ /dev/null @@ -1,60 +0,0 @@ -rule liftover2: - input: - chain = "9_Chain/chainfile.gz", - intervals = "10_PlaceAndOrientContigs/lepanchor.input", - haplos = "10_PlaceAndOrientContigs/suspected.haplotypes.before", - haplos2 = "10_PlaceAndOrientContigs/suspected.haplotypes.after", - lengths = "10_PlaceAndOrientContigs/contigs.length" - output: - haplos = 
"10_PlaceAndOrientContigs/suspected.haplotypes.all", - lift = report("10_PlaceAndOrientContigs/liftover.nohaplotypes.la", category = "Lifted Intervals"), - sortedlift = report("10_PlaceAndOrientContigs/liftover.sorted.nohaplotypes.la", category = "Lifted Intervals"), - mapfile = "10_PlaceAndOrientContigs/map.nohaplotypes.clean", - bedfile = "10_PlaceAndOrientContigs/map.nohaplotypes.bed", - unused = "10_PlaceAndOrientContigs/not_used.nohaplotypes.txt", - chr0 = "10_PlaceAndOrientContigs/chr0.nohaplotypes.bed", - mapextra = "10_PlaceAndOrientContigs/map.nohaplotypes.extra.bed" - message: "Recreating bedfile omitting haplotypes discovered from PlaceAndOrientContigs" - params: - chrom = lg - shell: - """ - cat {input.haplos} {input.haplos2} > {output.haplos} - gunzip -fc {input.chain} | java -cp software/LepAnchor LiftoverHaplotypes map={input.intervals} haplotypes={output.haplos} chain=- > {output.lift} - cat {output.lift} | sort -V -k 1,1 -k 2,2n > {output.sortedlift} - java -cp software/LepAnchor CleanMap map={output.sortedlift} > {output.mapfile} - java -cp software/LepAnchor Map2Bed map={output.mapfile} contigLength={input.lengths} > {output.bedfile} - cut -f 1 {input.lengths} | grep -v -w -F -f <(cut -f 2 {output.haplos}; cut -f 1 {output.bedfile}) > {output.unused} - grep -w -F -f {output.unused} {input.lengths} | awk -vn={params.chrom} '{{s=$1"\t1\t"$2"\t?\t"; for (i=1;i<=n;++i) print s i}}' > {output.chr0} - cat {output.bedfile} {output.chr0} > {output.mapextra} - """ - - -rule place_orient2: - input: - chain = "9_Chain/chainfile.gz", - bedfile = "10_PlaceAndOrientContigs/map.nohaplotypes.extra.bed", - paf = paf, - prox = proximity, - lift = "10_PlaceAndOrientContigs/liftover.nohaplotypes.la" - output: - chrom = "10_PlaceAndOrientContigs/orient_2/chr.{lg_range}.la", - haplos = "10_PlaceAndOrientContigs/orient_2/haplotypes/chr.{lg_range}.haplo.suspected" - log: - chrom = report("10_PlaceAndOrientContigs/orient_2/logs/chr.{lg_range}.la.log", category = 
"Anchoring II Logs"), - haplos = "10_PlaceAndOrientContigs/orient_2/haplotypes/chr.{lg_range}.haplo.all", - errors = "10_PlaceAndOrientContigs/orient_2/errors/chr.{lg_range}.errors" - params: - chrom = "{lg_range}", - extras = place_orient_extra, - datatype = data_type, - haplo = haplo_limit - threads: 3 - message: "Running 2nd round of PlaceAndOrientContigs for linkage group {params.chrom}" - shell: - """ - gunzip -fc {input.chain} | java -cp software/LepAnchor PlaceAndOrientContigs numThreads={threads} bed={input.bedfile} chromosome={params.chrom} map={input.lift} chain=- paf={input.paf} proximity={input.prox} {params.datatype} {params.extras} > {output.chrom} 2> {log.chrom} - sort -n -r {log.chrom} | awk '($NF=="haplotype" && (!(($5 SUBSEP $6 SUBSEP $7) in h))){{h[$2,$3,$4]; print}}' > {log.haplos} - sort -n -r {log.chrom} | awk -vlimit={params.haplo} '($NF=="haplotype" && ($1>=($4-$3+1-limit)/limit) && (!(($5 SUBSEP $6 SUBSEP $7) in h))){{h[$2,$3,$4]; print}}' > {output.haplos} - grep "error$" {log.chrom} > {log.errors} - """ \ No newline at end of file diff --git a/rules/LepAnchor/place_orient_iii.smk b/rules/LepAnchor/place_orient_iii.smk deleted file mode 100644 index 5938a5e..0000000 --- a/rules/LepAnchor/place_orient_iii.smk +++ /dev/null @@ -1,74 +0,0 @@ -rule propogate: - input: - placed = expand("10_PlaceAndOrientContigs/orient_2/chr.{lgs}.la", lgs = lg_range), - bedfile = "10_PlaceAndOrientContigs/map.nohaplotypes.bed" - output: - propogated = "10_PlaceAndOrientContigs/map.propogated.bed", - tmp_prop = temp(expand("10_PlaceAndOrientContigs/propogate/propogated.{lgs}.la", lgs = range(lg + 1))), - message: "Propogating ...something" - shell: - """ - awk -f software/LepAnchor/scripts/propagate.awk {input.placed} > 10_PlaceAndOrientContigs/tmp1.la - awk -f software/LepAnchor/scripts/propagate.awk 10_PlaceAndOrientContigs/tmp1.la > 10_PlaceAndOrientContigs/tmp2.la - i=2 - - while ! 
cmp -s "10_PlaceAndOrientContigs/tmp$i.la" "10_PlaceAndOrientContigs/tmp$(( $i-1 )).la" ;do - awk -f software/LepAnchor/scripts/propagate.awk 10_PlaceAndOrientContigs/tmp$i.la > 10_PlaceAndOrientContigs/tmp$[$i+1].la - i=$[$i+1] - done - #create prop*.la - awk '/^[^#]/{{++d[$1 "\t" $7+0 "\t" $8+0]; data[++line]=$0}}END{{for (i = 1; i <= line; ++i) {{$0=data[i];if (d[$1 "\t" $7+0 "\t" $8+0] == 1) fn="10_PlaceAndOrientContigs/propogate/propogated."$5".la"; else if ($5==1) fn="10_PlaceAndOrientContigs/propogate/propogated.0.la"; else fn=""; if (fn != "") print $0>fn}}}}' 10_PlaceAndOrientContigs/tmp$i.la - - #create a new bed by combining propogated.[1-9]*.la and map.nohaplotypes.bed - awk '(NR==FNR){{print;c[$1]}}(NR!=FNR && !($1 in c)){{print $1 "\t" $7+0 "\t" $8+0"\t?\t"$5}}' {input.bedfile} {output.tmp_prop} > {output.propogated} - rm 10_PlaceAndOrientContigs/tmp*.la - """ - - -rule place_orient3: - input: - chain = "9_Chain/chainfile.gz", - bedfile = "10_PlaceAndOrientContigs/map.propogated.bed", - paf = paf, - prox = proximity, - lift = "10_PlaceAndOrientContigs/liftover.nohaplotypes.la" - output: - chrom = "10_PlaceAndOrientContigs/orient_3/ichr.{lg_range}.la", - haplos = "10_PlaceAndOrientContigs/orient_3/haplotypes/chr.{lg_range}.haplo.suspected" - log: - chrom = report("10_PlaceAndOrientContigs/orient_3/logs/ichr.{lg_range}.la.log", category = "Anchoring III Logs"), - haplos = "10_PlaceAndOrientContigs/orient_3/haplotypes/chr.{lg_range}.haplo.all", - errors = "10_PlaceAndOrientContigs/orient_3/errors/chr.{lg_range}.errors" - params: - chrom = "{lg_range}", - extras = place_orient_extra, - datatype = data_type, - haplo = haplo_limit - message: "Running 3rd round of PlaceAndOrientContigs for linkage group {params.chrom}" - shell: - """ - gunzip -fc {input.chain} | java -cp software/LepAnchor PlaceAndOrientContigs bed={input.bedfile} chromosome={params.chrom} map={input.lift} chain=- paf={input.paf} proximity={input.prox} {params.datatype} {params.extras} > 
{output.chrom} 2> {log.chrom} - sort -n -r {log.chrom} | awk '($NF=="haplotype" && (!(($5 SUBSEP $6 SUBSEP $7) in h))){{h[$2,$3,$4]; print}}' > {log.haplos} - sort -n -r {log.chrom} | awk -vlimit={params.haplo} '($NF=="haplotype" && ($1>=($4-$3+1-limit)/limit) && (!(($5 SUBSEP $6 SUBSEP $7) in h))){{h[$2,$3,$4]; print}}' > {output.haplos} - grep "error$" {log.chrom} > {log.errors} - """ - - -rule prune: - input: - oriented = expand("10_PlaceAndOrientContigs/orient_3/ichr.{lgs}.la", lgs = lg_range), - bedfile = "10_PlaceAndOrientContigs/map.propogated.bed" - output: - pruned = report("10_PlaceAndOrientContigs/orient_3/pruned.la", category = "Logs"), - cleaned = report("10_PlaceAndOrientContigs/overlaps_rm.la", category = "Logs") - message: "Pruning contig blocks without map support and removing overlaps" - params: - chrom = lg - shell: - """ - for i in $(seq {params.chrom}) - do - awk -f software/LepAnchor/scripts/prune.awk 10_PlaceAndOrientContigs/orient_3/ichr.$i.la > 10_PlaceAndOrientContigs/orient_3/ichr.${{i}}.pruned.la - done 2> {output.pruned} - awk -f software/LepAnchor/scripts/removeOverlaps.awk {input.bedfile} 10_PlaceAndOrientContigs/orient_3/ichr.*.pruned.la > {output.cleaned} - """ \ No newline at end of file diff --git a/rules/LepAnchor/trim_edges.smk b/rules/LepAnchor/trim_edges.smk index 627c025..5c41824 100644 --- a/rules/LepAnchor/trim_edges.smk +++ b/rules/LepAnchor/trim_edges.smk @@ -19,7 +19,7 @@ rule trim_newintervals: params: edge = edgelen, dist = trimdist - shell: "Rscript scripts/LATrim.r {input} {params.dist} {params.edge} 15_Trim" + shell: "LepWrapTrim.r {input} {params.dist} {params.edge} 15_Trim" rule merge_trimplots: @@ -39,4 +39,5 @@ rule merge_trimmedintervals: rule plot_trimmedintervals: input: "16_MareyMapsTrimmed/data.marey.trimmed.gz" output: report("16_MareyMapsTrimmed/LepAnchor.mareymaps.pdf", category = "Trimmed Marey Maps") - shell: "Rscript scripts/LASummary.r {input}" \ No newline at end of file + message: "Plotting results 
of edge trimming" + shell: "LASummary.r {input}" \ No newline at end of file diff --git a/rules/LepMap3/distances.smk b/rules/LepMap3/distances.smk index b03f166..d6585da 100644 --- a/rules/LepMap3/distances.smk +++ b/rules/LepMap3/distances.smk @@ -17,12 +17,10 @@ rule calculate_distances: threads: 2 shell: """ - cp {input.lg} {output.distance} - - zcat {input.data_call} | java -cp software/LepMap3 OrderMarkers2 evaluateOrder={input.lg} data=- {params.dist_method} numThreads={threads} improveOrder=0 sexAveraged=1 &> {output.sex_averagedtmp} + cp {input.lg} {output.distance} + zcat {input.data_call} | java -cp $CONDA_PREFIX/bin/lepmap3 OrderMarkers2 evaluateOrder={input.lg} data=- {params.dist_method} numThreads={threads} improveOrder=0 sexAveraged=1 &> {output.sex_averagedtmp} sed -i -e 's/LG \= 0/LG \= {params.lg}/g' {output.sex_averagedtmp} sed -n '/\*\*\* LG \=/,$p' {output.sex_averagedtmp} > {output.sex_averaged} awk '/#java/{{flag=1}} flag; /*** LG =/{{flag=0}}' {output.sex_averagedtmp} > {log.sex_averaged} - - zcat {input.data_call} | java -cp software/LepMap3 OrderMarkers2 evaluateOrder={input.lg} data=- {params.dist_method} numThreads={threads} calculateIntervals={output.intervals} > {log.intervals} 2>&1 + zcat {input.data_call} | java -cp $CONDA_PREFIX/bin/lepmap3 OrderMarkers2 evaluateOrder={input.lg} data=- {params.dist_method} numThreads={threads} calculateIntervals={output.intervals} > {log.intervals} 2>&1 """ \ No newline at end of file diff --git a/rules/LepMap3/generate_map.smk b/rules/LepMap3/generate_map.smk index 5612f06..3356fc8 100644 --- a/rules/LepMap3/generate_map.smk +++ b/rules/LepMap3/generate_map.smk @@ -9,7 +9,7 @@ rule separate_chromosomes: extra = sepchrom_extra, shell: """ - zcat {input} | java -cp software/LepMap3 SeparateChromosomes2 data=- {params.extra} {informative} lodLimit={params.lod} numThreads={threads} > {output} 2> {log} + zcat {input} | java -cp $CONDA_PREFIX/bin/lepmap3 SeparateChromosomes2 lodLimit={params.lod} data=- 
{params.extra} {informative} numThreads={threads} > {output} 2> {log} """ @@ -17,7 +17,7 @@ rule map_summary: input: expand("3_SeparateChromosomes/LOD.{LOD}", LOD = lod_range) output: "3_SeparateChromosomes/all.LOD.summary" message: "Summarizing SeperateChromosomes2 maps >> {output}" - shell: "scripts/MapSummary.r 3_SeparateChromosomes" + shell: "MapSummary.r 3_SeparateChromosomes" rule choose_map: @@ -51,7 +51,7 @@ rule join_singles: JS2A=$(echo {params.run_js2all} | tr '[:upper:]' '[:lower:]') THEMAP=$(tail -1 {input.map_choice}) if [ $JS2A == "true" ]; then - zcat {input.datacall} | java -cp software/LepMap3 JoinSingles2All map=$THEMAP data=- {params.extra} {params.lod_limit} {params.lod_diff} numThreads={threads} > {output} + zcat {input.datacall} | java -cp $CONDA_PREFIX/bin/lepmap3 JoinSingles2All map=$THEMAP data=- {params.extra} {params.lod_limit} {params.lod_diff} numThreads={threads} > {output} else echo -e "\nSkipping JoinSingles2All and creating a symlink to $THEMAP instead" ln -sr $THEMAP {output} diff --git a/rules/LepMap3/order.smk b/rules/LepMap3/order.smk index f820f5c..4643962 100644 --- a/rules/LepMap3/order.smk +++ b/rules/LepMap3/order.smk @@ -15,7 +15,7 @@ rule order_markers: threads: 2 shell: """ - zcat {input.datacall} | java -cp software/LepMap3 OrderMarkers2 chromosome={params.chrom} map={input.filt_map} {params.extra} data=- numThreads={threads} &> {output.runlog} + zcat {input.datacall} | java -cp $CONDA_PREFIX/bin/lepmap3 OrderMarkers2 chromosome={params.chrom} map={input.filt_map} {params.extra} data=- numThreads={threads} &> {output.runlog} sed -n '/\*\*\* LG \=/,$p' {output.runlog} > {output.lg} grep "recombin" {output.runlog} > {log.recomb} awk '/#java/{{flag=1}} flag; /logL/{{flag=0}}' {output.runlog} > {log.run} @@ -25,7 +25,4 @@ rule recomb_summary: input: expand("4_OrderMarkers/ordered.{lg}", lg = lg_range) output: "4_OrderMarkers/recombination/recombination.summary" message: "Recombination summary: {output}" - shell: - """ - 
Rscript scripts/RecombinationSummary.r 4_OrderMarkers/recombination > {output} - """ + shell: "RecombinationSummary.r 4_OrderMarkers/recombination > {output}" diff --git a/rules/LepMap3/prepare_data.smk b/rules/LepMap3/prepare_data.smk index fa7b5b9..c066a77 100644 --- a/rules/LepMap3/prepare_data.smk +++ b/rules/LepMap3/prepare_data.smk @@ -6,7 +6,7 @@ rule parent_call: message: "Creating Lep-Map3 data file from {input.vcf} and {input.pedigree}" params: extra = parentcall_extra - shell: "java -cp software/LepMap3 ParentCall2 data={input.pedigree} vcfFile={input.vcf} {params} | gzip > {output}" + shell: "java -cp $CONDA_PREFIX/bin/lepmap3 ParentCall2 data={input.pedigree} vcfFile={input.vcf} {params} | gzip > {output}" rule filtering: input: "1_ParentCall/data.lepmap3.gz" @@ -21,6 +21,6 @@ rule filtering: echo "Skipping Filtering2 and creating symlink {output} instead" ln -sr {input} {output} else - zcat {input} | java -cp software/LepMap3 Filtering2 data=- dataTolerance={params.data_tolerance} {params.extra} | gzip > {output} + zcat {input} | java -cp $CONDA_PREFIX/bin/lepmap3 Filtering2 data=- dataTolerance={params.data_tolerance} {params.extra} | gzip > {output} fi """ diff --git a/rules/LepMap3/reorder.smk b/rules/LepMap3/reorder.smk index c798918..154ade2 100644 --- a/rules/LepMap3/reorder.smk +++ b/rules/LepMap3/reorder.smk @@ -16,7 +16,7 @@ rule reorder_markers: threads: 2 shell: """ - zcat {input.datacall} | java -cp software/LepMap3 OrderMarkers2 evaluateOrder={input.lg_order} {params.extra} map={input.filt_map} data=- numThreads={threads} &> {output.runlog} + zcat {input.datacall} | java -cp $CONDA_PREFIX/bin/lepmap3 OrderMarkers2 evaluateOrder={input.lg_order} {params.extra} map={input.filt_map} data=- numThreads={threads} &> {output.runlog} sed -n '/\*\*\* LG \=/,$p' {output.runlog} > {output.lg} grep "recombin" {output.runlog} > {log.recomb} awk '/#java/{{flag=1}} flag; /logL/{{flag=0}}' {output.runlog} > {log.run} @@ -28,5 +28,5 @@ rule 
reorder_summary: message: "Recombination summary of reordering: {output}" shell: """ - Rscript scripts/RecombinationSummary.r 6_OrderMarkers/recombination > {output} + RecombinationSummary.r 6_OrderMarkers/recombination > {output} """ diff --git a/rules/LepMap3/trim.smk b/rules/LepMap3/trim.smk index 7f122c1..2021782 100644 --- a/rules/LepMap3/trim.smk +++ b/rules/LepMap3/trim.smk @@ -10,7 +10,7 @@ rule trim_edge_clusters: message: "Removing edge clusters >{params.trim_threshold}%cM apart from the other markers at the ends of {input}" shell: """ - Rscript scripts/LepWrapTrim.r {input} {params.trim_threshold} {params.edge_length} 5_Trim + LepWrapTrim.r {input} {params.trim_threshold} {params.edge_length} 5_Trim """ rule trim_summary: @@ -33,8 +33,8 @@ rule trim_summary: BASE=$(basename $each | cut -d "." -f1,2) sed -e "s/^/$BASE /" $each done | sort -V > {output.detailed} - scripts/TrimCounts.r {output.detailed} {params.lg} > {output.summary} - scripts/TrimSummaryPlot.r {output.summary} + TrimCounts.r {output.detailed} {params.lg} > {output.summary} + TrimSummaryPlot.r {output.summary} echo "Merging QC plots for all linkage groups" convert -density 300 {input.plots} {output.mergeplots} """ diff --git a/scripts/FilterLinkageMap.r b/scripts/FilterLinkageMap.r index 31fa7ff..c7c995b 100755 --- a/scripts/FilterLinkageMap.r +++ b/scripts/FilterLinkageMap.r @@ -1,3 +1,5 @@ +#! /usr/bin/env Rscript + # This R file performs an adaptive method of filtering a linkage map # It works by creating a spline on the linkage map, then performing # a sliding window analysis on the residuals of the spline, calculating diff --git a/scripts/LATrim.r b/scripts/LATrim.r deleted file mode 100755 index c4581d8..0000000 --- a/scripts/LATrim.r +++ /dev/null @@ -1,162 +0,0 @@ -#! 
/usr/bin/env Rscript - -suppressMessages(if (!require("tidyverse")) install.packages("tidyverse")) -suppressMessages(library("tidyverse")) -suppressMessages(if (!require("cowplot")) install.packages("cowplot")) -library(cowplot) - -args <- commandArgs(trailingOnly = TRUE) -# args[1] is the OrderMarkers2 output file -# args[2] is the centimorgan cutoff distance -# args[3] is the % of edge markers to scan -# args[4] is the name of the output folder - -lgfile <- read.delim( - args[1], - header = FALSE, - sep = "\t", - comment.char="#" -) %>% - mutate(Mpass = T, Fpass = T) - -## setup output file names ## -# split the filename by path -filename <- unlist(strsplit(args[1], "/")) -# pop out just the filename -filename <- filename[length(filename)] -lg <- (strsplit(lgfile[1,1], "LG") %>% unlist())[2] %>% as.numeric() - -#========= output instantiation ========# - -dir.create(args[4], showWarnings = FALSE) -dir.create(paste0(args[4],"/plots"), showWarnings = FALSE) -dir.create(paste0(args[4],"/logs"), showWarnings = FALSE) -dir.create(paste0(args[4],"/QC_raw"), showWarnings = FALSE) -outfile_base <- paste(args[4], filename, sep = "/") -outfile_log_base <- paste(args[4], "logs", filename, sep = "/") -plotfile_base <- paste(args[4], "plots", filename, sep = "/") -plotfile <- paste(plotfile_base, "trim.pdf", sep = ".") -rawfile_base <- paste(args[4], "QC_raw", filename, sep = "/") - -##### Pruning the ends ##### -dist_thresh <- as.numeric(args[2]) -if(dist_thresh >= 1){ - dist_thresh <- dist_thresh * .01 -} - -dist_thresh_all <- c( - abs(max(lgfile[, 5]) - min(lgfile[, 5])) * dist_thresh, # male - abs(max(lgfile[, 6]) - min(lgfile[, 6])) * dist_thresh # female -) - -# if the percent threshold is given as an integer, convert it to a decimal -edge_length <- as.numeric(args[3]) -if(edge_length >= 1){ - edge_length <- edge_length * .01 -} -n_markers <- nrow(lgfile) -forward_start <- round(n_markers * edge_length, digits = 0) -reverse_start <- round(n_markers - forward_start, 
digits = 0) - -for (j in 5:6){ # iterate over male (5) and female (6) - # sort on column - lgfile <- arrange(lgfile, j) - dist_thresh <- dist_thresh_all[j-4] - # trim beginning - # the loop goes towards the edges to be effecient with the break() call, removing the - # entire cluster once one bad marker is found - for(a in forward_start:2){ #first n% of total markers starting from the forward edge, going out - diff <- abs(lgfile[a,j]-lgfile[a-1,j]) # difference between two points - if( diff > dist_thresh ){ # is the difference between the two points > distance argument? - lgfile[(a-1):1, j+2] <- FALSE # mark that marker and all markers BEFORE it as FAIL - break() - } - } - # trim end - for(z in reverse_start:(n_markers-1)){ #last n% total markers starting from the reverse edge going out - diff <- abs(lgfile[z+1,j]-lgfile[z,j]) # difference between two points - if( diff > dist_thresh ){ # is the difference between the two points > distance argument? - lgfile[(z+1):n_markers,j+2] <- FALSE # mark that marker and all markers AFTER it as FAIL - break() - } - } -} - -# create new table of markers passing QC -cleaned_markers <- (lgfile %>% filter(Mpass & Fpass))[,1:6] -# re-scale cleaned markers to 0 by subtracting the minimum genetic position -cleaned_markers <- cleaned_markers %>% - mutate(V5 = V5 - min(V5), V6 = V6 - min(V6)) - -# isolate bad markers -removed_markers <- (lgfile %>% filter(!Mpass | !Fpass))[,1:6] - -# get simple counts -rm_male <- lgfile %>% filter(!Mpass & Fpass) %>% nrow() -rm_female <- lgfile %>% filter(!Fpass & Mpass) %>% nrow() -rm_both <- lgfile %>% filter(!Mpass & !Fpass) %>% nrow() - -pdf(NULL) - -plot_male <- lgfile %>% arrange(V5) %>% - ggplot(aes(x = seq_along(V5), y = V5, color = Mpass)) + - geom_point(shape = 19) + - scale_color_manual(values = c("dodgerblue", "indianred2"), limits = c(T, F)) + - geom_vline(xintercept = forward_start, linetype = "dashed", size = 0.2) + - geom_vline(xintercept = reverse_start, linetype = "dashed", size = 0.2) 
+ - labs( - title = "", - subtitle = paste0(rm_male, " male markers >", dist_thresh_all[1], "cM trimmed"), - caption = paste0(edge_length, "% edge markers, ", dist_thresh, "% cM"), - x = "Marker Number", - y = "Position (cM)", - color = "Pass Filtering" - ) - -plot_female <- lgfile %>% arrange(V6) %>% - ggplot(aes(x = seq_along(V6), y = V6, color = Fpass)) + - geom_point(shape = 19) + - scale_color_manual(values = c("dodgerblue", "indianred2"), limits = c(T, F)) + - geom_vline(xintercept = forward_start, linetype = "dashed", size = 0.2) + - geom_vline(xintercept = reverse_start, linetype = "dashed", size = 0.2) + - labs( - title = paste("Edge Cluster Trimming for LG:", lg), - subtitle = paste0(rm_female, " female markers >", dist_thresh_all[2], "cM trimmed"), - caption = paste0("Markers failing both M+F: ", rm_both), - x = "Marker Number", - y = "Position (cM)", - color = "Pass Filtering", - legend.position = "none" - ) - -plot_grid(plot_female, plot_male, ncol = 2, nrow = 1) - -suppressMessages(ggsave(plotfile, width = 7, height = 3, units = "in")) - -write.table( - cleaned_markers, - file = paste(outfile_base, "trimmed", sep = "."), - sep = "\t", - quote = FALSE, - row.names = FALSE, - col.names = FALSE, -) - -write.table( - lgfile, - file = paste(rawfile_base, "filtered.raw", sep = "."), - sep = "\t", - quote = FALSE, - row.names = FALSE, - col.names = FALSE, -) - -write.table( - removed_markers, - file=paste(outfile_log_base, "removed", sep = "."), - append=FALSE, - sep = "\t", - quote = FALSE, - row.names = FALSE, - col.names = FALSE -) \ No newline at end of file diff --git a/scripts/LGcutoff.sh b/scripts/LGcutoff.sh index 5224c44..5f189c8 100755 --- a/scripts/LGcutoff.sh +++ b/scripts/LGcutoff.sh @@ -8,7 +8,7 @@ Essentially, set an LG maximum, and markers above it will be assigned to group 0 [usage]: LGcutoff.sh # unassign all markers in LG's 25+ -[example]: scripts/LGcutoff.sh 3_SeparateChromosomes/map.13 24 +[example]: LGcutoff.sh 
3_SeparateChromosomes/map.13 24 EOF exit 1 diff --git a/scripts/LepWrapTrim.r b/scripts/LepWrapTrim.r index 6a6e17a..d4c6290 100755 --- a/scripts/LepWrapTrim.r +++ b/scripts/LepWrapTrim.r @@ -4,7 +4,7 @@ suppressMessages(if (!require("tidyverse")) install.packages("tidyverse")) suppressMessages(library("tidyverse")) args <- commandArgs(trailingOnly = TRUE) -# args[1] is the OrderMarkers2 output file +# args[1] is the OrderMarkers2/LepAnchor output file # args[2] is the centiMorgan cutoff threshold # args[3] is the % of edge markers to scan # args[4] is the output directory @@ -44,7 +44,23 @@ lgfile <- read.delim( filename <- unlist(strsplit(args[1], "/")) # pop out just the filename filename <- filename[length(filename)] -lg <- unlist(strsplit(filename, "\\."))[2] + +# Modify which columns to look at based on input format +# LM3 input has 5 columns + 2 from M|Fpass +# LA input has 6 columns + 2 from M|Fpass +if(ncol(lgfile) == 7) { + # LepMap3 input file + idxcol <- c(2,3) + keepcol <- 1:5 + lg <- unlist(strsplit(filename, "\\."))[2] + colshift <- 4 +} else { + # LepAnchor input file + idxcol <- c(5, 6) + keepcol <- 1:6 + lg <- unlist(strsplit(filename, "\\."))[3] + colshift <- 2 +} #========= instantiate output ========# dir.create(args[4], showWarnings = FALSE) @@ -66,8 +82,8 @@ if(dist_thresh >= 1){ } dist_thresh_all <- c( - abs(max(lgfile[, 2]) - min(lgfile[, 2])) * dist_thresh, # male - abs(max(lgfile[, 3]) - min(lgfile[, 3])) * dist_thresh # female + abs(max(lgfile[, idxcol[1]]) - min(lgfile[, idxcol[1]])) * dist_thresh, # male + abs(max(lgfile[, idxcol[2]]) - min(lgfile[, idxcol[2]])) * dist_thresh # female ) edge_length <- as.numeric(args[3]) @@ -80,15 +96,16 @@ forward_start <- round(n_markers * edge_length, digits = 0) reverse_start <- round(n_markers - forward_start, digits = 0) # iterate over male (2) and female (3) -for (j in 2:3){ +threshidx <- 1 +for (j in idxcol){ # sort on column lgfile <- arrange(lgfile, j) - dist_thresh <- dist_thresh_all[j-1] + 
dist_thresh <- dist_thresh_all[threshidx] # trim beginning for(a in forward_start:2){ #first n% of total markers from the beginning diff <- abs(lgfile[a,j]-lgfile[a-1,j]) # difference between two points if( diff > dist_thresh ){ # is the difference between the two points > distance argument? - lgfile[(a-1):1, j+4] <- FALSE # all markers BEFORE it as FAIL + lgfile[(a-1):1, j + colshift] <- FALSE # all markers BEFORE it as FAIL break() } } @@ -96,10 +113,11 @@ for (j in 2:3){ for(z in reverse_start:(n_markers-1)){ #last n% total markers starting from the back edge going out diff <- abs(lgfile[z+1,j]-lgfile[z,j]) # difference between two points if( diff > dist_thresh ){ # is the difference between the two points > distance argument? - lgfile[(z+1):n_markers,j+4] <- FALSE # all markers AFTER it as FAIL + lgfile[(z+1):n_markers, j + colshift] <- FALSE # all markers AFTER it as FAIL break() } } + threshidx <- threshidx + 1 } # isolate bad markers @@ -115,7 +133,7 @@ rm_both <- lgfile %>% filter(!Mpass & !Fpass) %>% nrow() pdf(NULL) plot_df <- lgfile %>% - rename(Male = V2, Female = V3) %>% + rename(Male = idxcol[1], Female = idxcol[2]) %>% arrange(Male) %>% mutate(Marker = seq_along(Mpass)) %>% rowwise() %>% @@ -126,15 +144,17 @@ plot_df <- lgfile %>% mutate(Fail = QAfix(Fail, Sex)) plot_df %>% - ggplot(aes(Marker, Position, color = Fail)) + - geom_point(shape = 19, alpha = 0.3) + + ggplot(aes(Marker, Position, color = Fail, shape = Fail)) + + geom_point(alpha = 0.5) + scale_color_manual(values = c("dodgerblue", "indianred2"), limits = c(T, F)) + + scale_shape_manual(values = c(19, 17), limits = c(T, F), guide = "none") + geom_vline(xintercept = forward_start, linetype = "dashed", size = 0.2) + geom_vline(xintercept = reverse_start, linetype = "dashed", size = 0.2) + + guides(color = guide_legend(override.aes = list(size = c(2,2), shape = c(19, 17)))) + labs( title = paste("Edge Cluster Trimming Results for Linkage Group", lg), subtitle = paste0("Markers Failing QC: ", 
rm_female, " female, ", rm_male, " male, ", rm_both, " both (", rm_female+rm_male+rm_both, " total)" ), - caption = paste0(edge_length*100, "% of edge markers, ", args[2], "% cM threshold: ", dist_thresh_all[2], "(F) & ", dist_thresh_all[1], "(M)"), + caption = paste0(edge_length*100, "% of edge markers, ", args[2], "% cM threshold: ", round(dist_thresh_all[2], digits = 2), "(F) & ", round(dist_thresh_all[1], digits = 2), "(M)"), x = "Marker Number", y = "Position (cM)", color = "Pass QA" @@ -151,14 +171,14 @@ fail_idx <- which(!lgfile$QC) # prepend a comment to the flagged markers so LepMap3 ignores them lgfile[fail_idx, 1] <- paste0("#", lgfile[fail_idx, 1]) # re-scale remaining markers to 0 by subtracting the minimum genetic position for each sex -lgfile[,2] <- round(lgfile[,2] - (min(lgfile[lgfile$QC,2])), digits = 3) -lgfile[,3] <- round(lgfile[,3] - (min(lgfile[lgfile$QC,3])), digits = 3) +lgfile[,idxcol[1]] <- round(lgfile[,idxcol[1]] - (min(lgfile[lgfile$QC,idxcol[1]])), digits = 3) +lgfile[,idxcol[2]] <- round(lgfile[,idxcol[2]] - (min(lgfile[lgfile$QC,idxcol[2]])), digits = 3) # write header to a new file writeLines(readLines(args[1], n=2), con = paste(outfile_base, "trimmed", sep = ".")) # write the remainder to that file write.table( - lgfile[,1:5], + lgfile[, keepcol], file = paste(outfile_base, "trimmed", sep = "."), sep = "\t", quote = FALSE, diff --git a/scripts/RecombinationSummary.r b/scripts/RecombinationSummary.r index e792bf4..fb37bbd 100755 --- a/scripts/RecombinationSummary.r +++ b/scripts/RecombinationSummary.r @@ -1,4 +1,5 @@ #! 
/usr/bin/env Rscript + # This script will parse all the recombination logs of LepWrap suppressMessages(library(tidyverse)) suppressMessages(library("stringr")) diff --git a/scripts/extract_markers.sh b/scripts/extract_markers.sh index acafdaa..4848635 100755 --- a/scripts/extract_markers.sh +++ b/scripts/extract_markers.sh @@ -6,7 +6,7 @@ cat < -[example]: scripts/extract_markers.sh data.filtered.lepmap3.gz +[example]: extract_markers.sh data.filtered.lepmap3.gz EOF exit 1 diff --git a/scripts/generate_config.sh b/scripts/generate_config.sh new file mode 100755 index 0000000..232b018 --- /dev/null +++ b/scripts/generate_config.sh @@ -0,0 +1,151 @@ +#! /usr/bin/env bash + +cat <], # inferred score output to this file. + +##dynamic masking +# equivalent to the BLASTZ_M, default is 0, when required, set to 50 normally, some set to 254 +--masking=254 +# --notrivial is required for haploMerger, should not changed! +--notrivial + +##scoring parameters for tuning +#--scores= # should not specify when using scoreMatrix inference procedure, i.e. --infer +#--inner=2000 # equivalent to the BLASTZ_H, normally H=2000 +--hspthresh=3000 # equivalent to the BLASTZ_K, 3000 by default +--ydrop=3400 # equivalent to the BLASTZ_Y, O+300E by default +--gappedthresh=3000 # equivalent to the BLASTZ_L, L=K by default + +################################################## +##scoring parameters usually don't need change + +# equivalent to BLASTZ_O and BLASTZ_E, 400 and 30, by default +--gap=400,30 +#--nogapped + +#Offset between the starting positions of successive target words considered for potential seeds. +#(But this does not apply to the query words, which always use a step size of 1.) 
+# equivalent to the BLASTZ_Z, z=1 by default +# set to 20 can reduce the memory consumption by a factor of 3 +--step=20 + +# by default --seed=12of19 , or --seed=14of22 +--seed=12of19 + +# default setting=--transition, alternative --notransition +# set to --notransition can shorten the time by a factor of 10 +--notransition + +################################################## +##non scoring parameters usually don't need change +--format=axt # lav (default), axt, maf, and so on +--markend # Just before normal completion, write "# lastz end-of-file" to the output file. +--verbosity=1 # logfile details +EOF \ No newline at end of file diff --git a/scripts/generate_qscoremtx.sh b/scripts/generate_qscoremtx.sh new file mode 100755 index 0000000..7385953 --- /dev/null +++ b/scripts/generate_qscoremtx.sh @@ -0,0 +1,9 @@ +#! /usr/bin/env bash + +cat < -[example]: scripts/iterate_js2all.sh 3_SeparateChromosomes/map.31 22 31 5 123 +[example]: iterate_js2all.sh 3_SeparateChromosomes/map.31 22 31 5 123 EOF exit 1 fi @@ -32,12 +32,12 @@ else fi for i in $(seq $LODMIN $LODMAX); do - zcat 2_Filtering/data.filtered.lepmap3.gz | java -cp software/LepMap3 JoinSingles2All map=$TARGETMAP data=- lodLimit=$i lodDifference=4 iterate=1 distortionLod=1 numThreads=10 informativeMask=$INFMASK > JoinSingles2All_iter/logs/map.$i.$4.js2all + zcat 2_Filtering/data.filtered.lepmap3.gz | java -cp $CONDA_PREFIX/bin/lepmap3 JoinSingles2All map=$TARGETMAP data=- lodLimit=$i lodDifference=4 iterate=1 distortionLod=1 numThreads=10 informativeMask=$INFMASK > JoinSingles2All_iter/logs/map.$i.$4.js2all cut -f1 JoinSingles2All_iter/logs/map.$i.$4.js2all > JoinSingles2All_iter/LOD.$i.$4.js2all done echo "The generated maps are named LOD.LODlim.LODdiff.js2all" # generate a summary of the results -scripts/MapSummary.r JoinSingles2All_iter +MapSummary.r JoinSingles2All_iter diff --git a/scripts/lepmap2anchor b/scripts/lepmap2anchor index d666ab3..a5c46f3 100755 --- a/scripts/lepmap2anchor +++
b/scripts/lepmap2anchor @@ -6,7 +6,7 @@ cat < -[example]: scripts/lepmap2anchor snps.txt +[example]: lepmap2anchor snps.txt EOF exit 1 @@ -15,7 +15,7 @@ fi LG=$(ls 7_Intervals/*.intervals | wc -l) if [ ! -f $1 ]; then - echo "Error: marker file $(echo $1) not found. It may be generated using scripts/extract_markers.sh" + echo "Error: marker file $(echo $1) not found. It may be generated using extract_markers.sh" exit 1 fi diff --git a/scripts/refinemap.sh b/scripts/refinemap.sh index 092f0b2..d1fe0e4 100755 --- a/scripts/refinemap.sh +++ b/scripts/refinemap.sh @@ -9,7 +9,7 @@ Attempt to refine a map from SeparateChromosomes2 by splitting out markers from Uses 4 threads and distortionLod=1. Requires an input map, which linkage group to modify, LOD start:end values to iterate over, and a minimum size for new clusters. [usage]: refinemap.sh -[example]: scripts/refinemap.sh 3_SeparateChromosomes/map.31 1 22 70 30 +[example]: refinemap.sh 3_SeparateChromosomes/map.31 1 22 70 30 EOF exit 1 fi @@ -31,8 +31,8 @@ LODMAX=$4 SIZELIM=$5 for i in $(seq $LODMIN $LODMAX); do - zcat 2_Filtering/data.filtered.lepmap3.gz | java -cp software/LepMap3 SeparateChromosomes2 data=- map=$TARGETMAP lg=$TARGETLG sizeLimit=$SIZELIM lodLimit=$i distortionLod=1 numThreads=4 > $1.refine/map.$i + zcat 2_Filtering/data.filtered.lepmap3.gz | java -cp $CONDA_PREFIX/bin/lepmap3 SeparateChromosomes2 data=- map=$TARGETMAP lg=$TARGETLG sizeLimit=$SIZELIM lodLimit=$i distortionLod=1 numThreads=4 > $1.refine/map.$i done # generate a summary of the results -scripts/MapSummary.r $1.refine +MapSummary.r $1.refine diff --git a/scripts/usage b/scripts/usage index ab95740..8e3daec 100755 --- a/scripts/usage +++ b/scripts/usage @@ -8,7 +8,7 @@ Should be used in main project directory. Module names are case-sensitive. 
[usage]: params -[example]: scripts/usage OrderMarkers2 +[example]: usage OrderMarkers2 LepMap3 modules: - ParentCall2 @@ -27,7 +27,7 @@ EOF fi if [[ $1 == "Map2Bed" || $1 == "CleanMap" || $1 == "PlaceAndOrientContigs" ]]; then - java -cp software/LepAnchor $1 2>&1 + java -cp $CONDA_PREFIX/bin/ $1 2>&1 else - java -cp software/LepMap3 $1 2>&1 | tail -n +2 + java -cp $CONDA_PREFIX/bin/ $1 2>&1 | tail -n +2 fi diff --git a/software/LepAnchor/AltLinks.class b/software/LepAnchor/AltLinks.class new file mode 100644 index 0000000..c52127a Binary files /dev/null and b/software/LepAnchor/AltLinks.class differ diff --git a/software/LepAnchor/ChainPaf.class b/software/LepAnchor/ChainPaf.class new file mode 100644 index 0000000..44b003d Binary files /dev/null and b/software/LepAnchor/ChainPaf.class differ diff --git a/software/LepAnchor/CleanMap$CleanMarker.class b/software/LepAnchor/CleanMap$CleanMarker.class index f79445d..51cf7be 100644 Binary files a/software/LepAnchor/CleanMap$CleanMarker.class and b/software/LepAnchor/CleanMap$CleanMarker.class differ diff --git a/software/LepAnchor/CleanMap.class b/software/LepAnchor/CleanMap.class index b762a91..dfb7ff9 100644 Binary files a/software/LepAnchor/CleanMap.class and b/software/LepAnchor/CleanMap.class differ diff --git a/software/LepAnchor/ContigInterval.class b/software/LepAnchor/ContigInterval.class index 321a475..c830972 100644 Binary files a/software/LepAnchor/ContigInterval.class and b/software/LepAnchor/ContigInterval.class differ diff --git a/software/LepAnchor/CoverageAnalyser$Gaussian.class b/software/LepAnchor/CoverageAnalyser$Gaussian.class index b96bf65..6b4fa13 100644 Binary files a/software/LepAnchor/CoverageAnalyser$Gaussian.class and b/software/LepAnchor/CoverageAnalyser$Gaussian.class differ diff --git a/software/LepAnchor/CoverageAnalyser$Zeta.class b/software/LepAnchor/CoverageAnalyser$Zeta.class index 66d3862..927c8b6 100644 Binary files a/software/LepAnchor/CoverageAnalyser$Zeta.class and 
b/software/LepAnchor/CoverageAnalyser$Zeta.class differ diff --git a/software/LepAnchor/CoverageAnalyser.class b/software/LepAnchor/CoverageAnalyser.class index 212e049..88c6d10 100644 Binary files a/software/LepAnchor/CoverageAnalyser.class and b/software/LepAnchor/CoverageAnalyser.class differ diff --git a/software/LepAnchor/CoverageHMM.class b/software/LepAnchor/CoverageHMM.class index a10b002..8af8ddf 100644 Binary files a/software/LepAnchor/CoverageHMM.class and b/software/LepAnchor/CoverageHMM.class differ diff --git a/software/LepAnchor/FindContigErrors.class b/software/LepAnchor/FindContigErrors.class index a6b8712..09aff0f 100644 Binary files a/software/LepAnchor/FindContigErrors.class and b/software/LepAnchor/FindContigErrors.class differ diff --git a/software/LepAnchor/Input.class b/software/LepAnchor/Input.class index e7ce976..6b7f545 100644 Binary files a/software/LepAnchor/Input.class and b/software/LepAnchor/Input.class differ diff --git a/software/LepAnchor/InputData.class b/software/LepAnchor/InputData.class index 8a21bd6..309b7b7 100644 Binary files a/software/LepAnchor/InputData.class and b/software/LepAnchor/InputData.class differ diff --git a/software/LepAnchor/Map2Bed.class b/software/LepAnchor/Map2Bed.class index 4f12cd1..0bf99a7 100644 Binary files a/software/LepAnchor/Map2Bed.class and b/software/LepAnchor/Map2Bed.class differ diff --git a/software/LepAnchor/Misc$ArrayIndexComparator.class b/software/LepAnchor/Misc$ArrayIndexComparator.class index 3226bdc..339c345 100644 Binary files a/software/LepAnchor/Misc$ArrayIndexComparator.class and b/software/LepAnchor/Misc$ArrayIndexComparator.class differ diff --git a/software/LepAnchor/Misc$ArrayIndexComparator2.class b/software/LepAnchor/Misc$ArrayIndexComparator2.class index 6b0b8b6..66a8bd3 100644 Binary files a/software/LepAnchor/Misc$ArrayIndexComparator2.class and b/software/LepAnchor/Misc$ArrayIndexComparator2.class differ diff --git a/software/LepAnchor/Misc.class 
b/software/LepAnchor/Misc.class index ada0e21..040fbbd 100644 Binary files a/software/LepAnchor/Misc.class and b/software/LepAnchor/Misc.class differ diff --git a/software/LepAnchor/ParameterParser.class b/software/LepAnchor/ParameterParser.class index 2dd9463..8250bae 100644 Binary files a/software/LepAnchor/ParameterParser.class and b/software/LepAnchor/ParameterParser.class differ diff --git a/software/LepAnchor/PlaceAndOrientContigs$CalculateBest.class b/software/LepAnchor/PlaceAndOrientContigs$CalculateBest.class index c644c04..8960178 100644 Binary files a/software/LepAnchor/PlaceAndOrientContigs$CalculateBest.class and b/software/LepAnchor/PlaceAndOrientContigs$CalculateBest.class differ diff --git a/software/LepAnchor/PlaceAndOrientContigs$LongComparator0.class b/software/LepAnchor/PlaceAndOrientContigs$LongComparator0.class index 3b69197..13be9af 100644 Binary files a/software/LepAnchor/PlaceAndOrientContigs$LongComparator0.class and b/software/LepAnchor/PlaceAndOrientContigs$LongComparator0.class differ diff --git a/software/LepAnchor/PlaceAndOrientContigs$LongComparator1.class b/software/LepAnchor/PlaceAndOrientContigs$LongComparator1.class index 3281509..b6925bf 100644 Binary files a/software/LepAnchor/PlaceAndOrientContigs$LongComparator1.class and b/software/LepAnchor/PlaceAndOrientContigs$LongComparator1.class differ diff --git a/software/LepAnchor/PlaceAndOrientContigs$PossibleError.class b/software/LepAnchor/PlaceAndOrientContigs$PossibleError.class index 7afb903..c48918a 100644 Binary files a/software/LepAnchor/PlaceAndOrientContigs$PossibleError.class and b/software/LepAnchor/PlaceAndOrientContigs$PossibleError.class differ diff --git a/software/LepAnchor/PlaceAndOrientContigs.class b/software/LepAnchor/PlaceAndOrientContigs.class index cde0f6e..8551faf 100644 Binary files a/software/LepAnchor/PlaceAndOrientContigs.class and b/software/LepAnchor/PlaceAndOrientContigs.class differ diff --git a/software/LepAnchor/Proximity.class 
b/software/LepAnchor/Proximity.class index 01baebf..d7b2460 100644 Binary files a/software/LepAnchor/Proximity.class and b/software/LepAnchor/Proximity.class differ diff --git a/software/LepAnchor/deps/step1.HM2 b/software/LepAnchor/deps/step1.HM2 index 0849458..29fd6c1 100755 --- a/software/LepAnchor/deps/step1.HM2 +++ b/software/LepAnchor/deps/step1.HM2 @@ -38,7 +38,7 @@ querySize=1600000000 # split query fasta file by size N bp (default=1600000000) #### =========================================================== rm -f -r $name.seq ${name}x.seq $name.sizes ${name}x.sizes ${name}x.fa.gz -perl ../software/LepAnchor/deps/initiation.pl \ +perl $CONDA_PREFIX/bin/initiation.pl \ --faSplit \ --faToNib \ --faSize \ @@ -58,7 +58,7 @@ ln -s $name.seq ${name}x.seq rm -f -r $name.${name}x.result/raw.axt -perl ../software/LepAnchor/deps/HM_all_lastz_mThreads.pl \ +perl $CONDA_PREFIX/bin/HM_all_lastz_mThreads.pl \ --Species $name ${name}x \ --noself \ --threads=$threads \ diff --git a/software/LepAnchor/deps/step2.HM2 b/software/LepAnchor/deps/step2.HM2 index 1598e4b..bd764c8 100755 --- a/software/LepAnchor/deps/step2.HM2 +++ b/software/LepAnchor/deps/step2.HM2 @@ -33,7 +33,7 @@ threads=$2 # the number of cpu cores to use (default=1) #### run axtChainRecipBestNet #### =========================================================== -perl ../software/LepAnchor/deps/HM_axtChainRecipBestNet.pl \ +perl $CONDA_PREFIX/bin/HM_axtChainRecipBestNet.pl \ --rbestNet \ --axtChain \ --tbest \ diff --git a/software/LepAnchor/scripts/COPYING b/software/LepAnchor/scripts/COPYING new file mode 100644 index 0000000..94a9ed0 --- /dev/null +++ b/software/LepAnchor/scripts/COPYING @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. 
+ + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. 
For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. 
The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. 
+ + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. 
The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. 
+ + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. 
This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. 
For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. 
Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. 
+ + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. 
Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. 
+ + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + Copyright (C) + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +. 
diff --git a/software/LepAnchor/scripts/chainpaf.awk b/software/LepAnchor/scripts/chainpaf.awk new file mode 100644 index 0000000..144820c --- /dev/null +++ b/software/LepAnchor/scripts/chainpaf.awk @@ -0,0 +1,313 @@ +#chains paf files +#assumes paf is sorted on first column (query string) as in output of minimap2 +#awk -f chainpaf.awk aln20DP.paf +BEGIN{ +#not implemented yet +# if (maxDistance == "") +# maxDistance=2000 + + if (gapPenalty == "") + gapPenalty=-1 + if (gapOpen == "") + gapOpen=-12 + OFS="\t" + FS="\t" +} +{ + if (prev == "" || $1 == prev) + data[++lines] = $0 + else { + tmp = $0 + processData() + lines=1 + data[lines] = tmp + $0 = tmp + } + prev = $1 +} +END{ + processData() +} +function gapScore(end, start) { + if (start >= end) { + #print "error" + #exit + return 0 + } + return gapOpen + gapPenalty * (end - start) +} + +function max(a, b) { + if (a >= b) + return a + return b +} +function min(a, b) { + if (a < b) + return a + return b +} + + +function processData(i,j,k,n,gl1,gl2,gp1,gp2,start1,start2,end1,end2,bj,starts,cstarts,startsi,other,cother,best,prev,deleted,ret,mydata,s,cigar,score,trim) +{ + for (i = 1; i <= lines; ++i) { + $0=data[i] + starts[i] = $3 + startsi[$3, ++cstarts[$3]]=i + } + n = asort(starts) + + #sort data based on query start + #and separate by target + for (i = 1; i <= lines; ++i) { + start = starts[i] + $0 = data[startsi[start, cstarts[start]]] + s = 0 + for (j = 13; j <= NF; ++j) { + if ($j ~ /^AS:i:/) { + s = substr($j, 6)+0 + break + } + } + if (s <= 0) #do not store chains with score <= 0 + continue + --cstarts[start] + other[$6,$5] + mydata[$6,$5,++cother[$6,$5]] = $0 + } + + for (o in other) { + if (cother[o] == 1) + print mydata[o,1] + else { # find max score non-overlapping chain... 
+ first = 1 + while (cother[o] > 1) { + for (j = 1; j <= cother[o]; ++j) { + $0 = mydata[o,j] + if (first) + print + for (i = 13; i <= NF; ++i) { + if ($i ~ /^AS:i:/) + score[j] = substr($i, 6)+0 + if ($i ~ /^cg:Z:/) + cigar[j] = substr($i, 6) + } + best[j] = score[j] + prev[j] = j + trim[j] = 0 #trim cigar if needed + } + first = 0 + + for (j = 1; j < cother[o]; ++j) { + $0 = mydata[o,j] + end1 = $4 + if ($5 == "+") { + end2 = $9 + for (k = j + 1; k <= cother[o]; ++k) { + $0 = mydata[o,k] + #($5 == "+"), both alignments are in the same orientation + start1 = $3 + start2 = $8 + #if (start1 >= end1 && start2 >= end2) { #non-overlapping, increasing + ov = max(end1 - start1, end2 - start2) #overlap + if (ov <= 0 || ov <= maxTrim1(cigar[k])) { + if (ov <= 0) { + gp1 = gapScore(start1, end1) + gp2 = gapScore(start2, end2) + } + else { + gp1 = gapScore(start1 + ov, end1) + gp2 = gapScore(start2 + ov, end2) - score[k] * ov / min($9 - $8, $4 - $3) + } + if (best[j] + gp1 <= 0) + break + nb = best[j] + score[k] + gp1 + gp2 + if (nb > best[k]) { + best[k] = nb + prev[k] = j + trim[k] = ov + } + } + } + } else { #"-" orientation + end2 = $8 + for (k = j + 1; k <= cother[o]; ++k) { + $0 = mydata[o,k] + #($5 == "-"), both alignments are in the same orientation + start1 = $3 + start2 = $9 + #if (start1 >= end1 && start2 <= end2) { #non-overlapping, increasing (decreasing orientation -) + ov = max(end1 - start1, start2 - end2) #overlap + if (ov <= 0 || ov <= maxTrim1(cigar[k])) { + if (ov <= 0) { + gp1 = gapScore(start1, end1) + gp2 = gapScore(end2, start2) + } + else { + gp1 = gapScore(start1 + ov, end1) + gp2 = gapScore(end2, start2 - ov) - score[k] * ov / min($9 - $8, $4 - $3) + } + if (best[j] + gp1 <= 0) + break + nb = best[j] + score[k] + gp1 + gp2 + if (nb > best[k]) { + best[k] = nb + prev[k] = j + trim[k] = ov + } + } + } + } + } + while (1) { + bj = 1 + for (j = 2; j <= cother[o]; ++j) { + if ((best[j] > best[bj] && !(j in deleted)) || (bj in deleted)) + bj = j + } + n 
= 1 + ret[1] = bj; + while (prev[bj] != bj) { + bj = prev[bj] + ret[++n] = bj + } + s = 0 + for (i = 1; i <= n; ++i) + if (ret[i] in deleted) { + s = 1 + break + } + if (s) + break + + for (i = 1; i <= n; ++i) + deleted[ret[i]] + + if (n == 1) + continue + + $0 = mydata[o,ret[n]] + start1 = $3 + if ($5 == "+") { + start2 = $8 + } else { + end2 = $9 + } + + $0 = mydata[o,ret[1]] + end1 = $4 + if ($5 == "+") { + end2 = $9 + } else { + start2 = $8 + } + s = $1 "\t" $2 "\t" start1 "\t" end1 "\t" $5 "\t" $6 "\t" $7 "\t" start2 "\t" end2 "\t0\t0\t0\tAS:i:" int(best[ret[1]]) "\t" "CH:i:" n "\tcg:Z:" + + + $0 = mydata[o,ret[n]] + end1 = $4 + if ($5 == "+") + end2 = $9 + else + end2 = $8 + + #construct combined cigar + c = cigar[ret[n]] + + for (i = n - 1; i >= 1; --i) { + j = ret[i] + $0 = mydata[o,j] + + gl1 = $3 - end1 + end1 = $4 + if ($5 == "+") { + gl2 = $8 - end2 + end2 = $9 + } + else { + gl2 = end2 - $9 + end2 = $8 + } + if (trim[j] > 0) { + gl1 += trim[j] + gl2 += trim[j] + } + + if (gl1 < 0 || gl2 < 0) { + print "\nerror:negative gap" gl1 gl2 + exit + } + if (gl1 > 0) + c = c gl1 "I" + if (gl2 > 0) + c = c gl2 "D" + if (trim[j] > 0) + c = c "" trim1(cigar[j], trim[j]) + else + c = c "" cigar[j] + + } + if (n > 1) + print s c + } #while (1) + + k = 1 + for (j = 1; j <= cother[o]; ++j) { + mydata[o, k] = mydata[o, j] + if (!(j in deleted)) + ++k; + } + cother[o] = k - 1 + delete(deleted) + } + } + } +} +#how much you can trim cigar aligment from the first (match) +function maxTrim1(cigar) +{ + return cigar+0 +} +#and remove overlap from first match +function trim1(cigar,ov){ + return (cigar - ov) substr(cigar, index(cigar, "M")) +} + +#how much you can trim cigar aligment from the last (match) +function maxTrim2(cigar ,i) +{ + i = length(cigar) - 1 + while (substr(cigar, i, 1) ~ /[0-9]/) + --i + + return substr(cigar, i + 1) + 0 +} +#function trim1(cigar, ov ,s1,s2,le,op){ +# s1 = ov +# s2 = ov +# while (1) { +# le = (cigar + 0) +# op = substr(cigar, length(le) + 
1, 1) +# if (op == "M") { +# if (s1 - le <= 0 && s2 - le <= 0) { +# if (s1 <= 0 && s2 <= 0) +# return cigar +# return (cigar - max(s1, s2)) substr(cigar, length(le) + 1) +# } +# s1 -= le +# s2 -= le +# } else if (op == "I") { +# s1 -= le +# } +# else if (op == "D") { +# s2 -= le +# } +# else { +# print "error parsing cigar string" +# exit +# } +# cigar = substr(cigar, length(le) + 2) +# } +#} + diff --git a/software/LepAnchor/scripts/cutBed.awk b/software/LepAnchor/scripts/cutBed.awk new file mode 100644 index 0000000..0ab51a0 --- /dev/null +++ b/software/LepAnchor/scripts/cutBed.awk @@ -0,0 +1,152 @@ +#cuts bed based on extra cut sites +#usage: awk -f cutBed.awk map.bed cut_sites.txt >cut.bed +# awk -f cutBed_fixN.awk contigs.length cut.bed >cut_fix.bed +#map.bed is from Map2Bed +#cut_sites is contig, start [,stop] +#both input files with non-overlapping intervals and sorted by start +BEGIN{ +# FS="\t" + OFS="\t" +} +#store bed +(NR==FNR && /^[^#]/){ + end[$1] = substr($3, index($3, "-") + 1) + 0 + if (!($1 in c)) + start[$1] = $2 + 0 + ++c[$1] + bed[$1,c[$1]]=$0 + beds[$1,c[$1]] = substr($2, index($2, "-") + 1) + 0 + bede[$1,c[$1]] = $3 + 0 + bedc[$1,c[$1]] = $5 + contigs[$1] + if ($5 > maxChr) + maxChr = $5 +} +#store cutsites +(NR!=FNR){ + if (NF < 2) + next + #gap can be in ($3-$2+2) positions, start-1...end are the possible positions + if (NF >= 3) + $2 = $2 - 1 + else + $3 = $2 + + #remove cut if it intersects several contig boundaries + #trim if only one... 
+ trim = 0 + for (i = 1; i <= c[$1]; ++i) { + if ($2 < beds[$1,i] && $3 >= beds[$1,i]) { + if (trim > 0) { + print "cutBed: " $0 " removed" >"/dev/stderr" + next + } + trim = i + } + if ($2 <= bede[$1,i] && $3 > bede[$1,i]) { + if (trim > 0) { + print "cutBed: " $0 " removed" >"/dev/stderr" + next + } + trim = i + } + } + if (trim > 0) + for (i = 1; i <= c[$1]; ++i) { + if ($2 < beds[$1,i] && $3 >= beds[$1,i]) { + printf "cutBed: " $0 " trimmed => " >"/dev/stderr" + $3 = beds[$1,i] - 1 + print $0 >"/dev/stderr" + + } + if ($2 <= bede[$1,i] && $3 > bede[$1,i]) { + printf "cutBed: " $0 " trimmed => " >"/dev/stderr" + $2 = bede[$1,i] + 1 + print $0 >"/dev/stderr" + } + } + cut1[$1,++d[$1]] = $2 + cut2[$1, d[$1]] = $3 +} +END{ + for (contig in contigs) { + if (d[contig] + 0 == 0) { #no cuts + for (i = 1; i <= c[contig]; ++i) + print bed[contig, i] + } else { #at least one cut for contig + j = 1 + i = 1 + st = start[contig] + + #before bed i, split zero chr + printed = 0 + while (j <= d[contig] && cut2[contig, j] < beds[contig, i]) { + c1 = cut1[contig, j] + c2 = cut2[contig, j] + for (chr = 1; chr <= maxChr; ++chr) ## put to all chromosomes + print contig, st, c1"-"c2, "?", chr + st = (c1+1)"-"(c2+1) + ++j + printed = 1 + } + if (printed == 0) + st = st "-" beds[contig, i] + + while (i <= c[contig]) { + #split bed i, chr in bedc + #printed = 0 + while (j <= d[contig] && cut1[contig, j] >= beds[contig, i] && cut2[contig, j] <= bede[contig, i]) { + c1 = cut1[contig, j] + c2 = cut2[contig, j] + print contig, st, c1"-"c2, "?", bedc[contig, i] + st = (c1+1)"-"(c2+1) + ++j + #printed = 1 + } + + #after bed i, chr in bedc for first + printed = 0 + while (j <= d[contig] && cut1[contig, j] > bede[contig, i] && (i == c[contig] || cut2[contig, j] < beds[contig, i + 1])) { + c1 = cut1[contig, j] + c2 = cut2[contig, j] + if (printed == 0) { + print contig, st, c1"-"c2, "?", bedc[contig, i] + printed = 1 + } else + for (chr = 1; chr <= maxChr; ++chr) ## put to all chromosomes + 
print contig, st, c1"-"c2, "?", chr + st = (c1+1)"-"(c2+1) + ++j + } + if (printed == 0) { + if (i == c[contig]) + e2 = end[contig]"*" + else + e2 = beds[contig, i + 1] - 1 + e1 = bede[contig, i] + print contig, st, e1"-"e2, "?", bedc[contig, i] + st = (e1+1)"-"(e2+1) + } else if (i == c[contig]) { + for (chr = 1; chr <= maxChr; ++chr) ## put to all chromosomes + print contig, st, end[contig]"*", "?", chr + + } + + ++i + } + } + } + for (contig in d) + if (!(contig in contigs)){ + st = 1 + for (j = 1; j <= d[contig]; ++j) { + c1 = cut1[contig, j] + c2 = cut2[contig, j] + for (chr = 1; chr <= maxChr; ++chr) ## put to all chromosomes + print contig, st, c1"-"c2,"?",chr + st = (c1+1)"-"(c2+1) + } + for (chr = 1; chr <= maxChr; ++chr) ## put to all chromosomes + print contig, st, "N*","?",chr + } +} diff --git a/software/LepAnchor/scripts/cutBed_fixN.awk b/software/LepAnchor/scripts/cutBed_fixN.awk new file mode 100644 index 0000000..a484163 --- /dev/null +++ b/software/LepAnchor/scripts/cutBed_fixN.awk @@ -0,0 +1,23 @@ +#awk -f cutBed_fixN.awk contigs.length cut.bed +BEGIN{ +# FS="\t" + OFS="\t" +} +(NR==FNR){ + l[$1]=$2 +} +(NR!=FNR){ + if ($3 == "N*") { + if ($1 in l) + $3 = l[$1]"*" + else { + print "error: length for contig " $1 " not found!" 
+ exit + } + } + d[++line] = $0 +} +END{ + for (i = 1; i <= line; ++i) + print d[i] +} diff --git a/software/LepAnchor/scripts/hic.awk b/software/LepAnchor/scripts/hic.awk new file mode 100644 index 0000000..48bbf0a --- /dev/null +++ b/software/LepAnchor/scripts/hic.awk @@ -0,0 +1,33 @@ +#samtools view hic.bam|awk -f hic.awk +#or samtools view -h hic.bam|samblaster/samblaster -r|awk -f hic.awk +#hic.bam: bwa mem -t16 -5SPM REF R1.fq.gz R2.fq.gz|samtools view -b - >hic.bam +#process hic data for Lep-Anchor +BEGIN{ + #mapping quality limit + if (qLimit == "") + qLimit=20 + FS="\t" +} +(NF>=7 && $5>=qLimit){ + if ($1 == prevR) { + for (i = 1; i <= n; ++i) { #print both ways + print $3"\t"$4"\t"c[i]"\t"p[i]"\t?\t"min(q[i], $5) + print c[i]"\t"p[i]"\t"$3"\t"$4"\t?\t"min(q[i], $5) + } + ++n + } else + n = 1 + c[n]=$3 + p[n]=$4 + q[n]=$5 + prevR = $1 + +} +function min(a,b){ + if (a <= b) + return a + else + return b +} +END{ +} diff --git a/software/LepAnchor/scripts/ld.awk b/software/LepAnchor/scripts/ld.awk new file mode 100644 index 0000000..113eabc --- /dev/null +++ b/software/LepAnchor/scripts/ld.awk @@ -0,0 +1,22 @@ +#zcat sorted_ld.gz|awk -f perm|awk -f maxmatch3 +#awk -vOFS="\t" '(!(($1"\t"$2"\t"$3) in d) && !(($3"\t"$4"\t"$1) in d) ){d[$1"\t"$2"\t"$3];d[$3"\t"$4"\t"$1]; print;t=$1;$1=$3;$3=t;t=$2;$2=$4;$4=t;print}' +BEGIN{ + if (window == "") + window = 10000 +} +function round(p) +{ + return 1 + window * int((p-1)/window) +} +{ + r1 = round($2) + r2 = round($4) + i1 = $1"\t"$2"\t"$3"\t"r2"\t"r1 + i2 = $3"\t"$4"\t"$1"\t"r1"\t"r2 + if (d[i1] == "" && d[i2] == "") { + ++d[i1] + ++d[i2] + print $1"\t"r1"\t"$3"\t"r2"\t"$5"\t"$6"\t"$2"\t"$4 + print $3"\t"r2"\t"$1"\t"r1"\t"$5"\t"$6"\t"$4"\t"$2 + } +} diff --git a/software/LepAnchor/lepanchor_wrapper.sh b/software/LepAnchor/scripts/lepanchor_wrapper.sh old mode 100644 new mode 100755 similarity index 54% rename from software/LepAnchor/lepanchor_wrapper.sh rename to software/LepAnchor/scripts/lepanchor_wrapper.sh index 
d80a0ab..67c4b37 --- a/software/LepAnchor/lepanchor_wrapper.sh +++ b/software/LepAnchor/scripts/lepanchor_wrapper.sh @@ -3,7 +3,7 @@ # # Lep-Anchor wrapper # -# usage: lepanchor_wrapper.sh -f ref.fasta -n num_chr -c chain_file -p paf_file -r prox_file -m map_file1 -m map_file2, ... +# usage: lepanchor_wrapper.sh -f ref.fasta -n num_chr -e extra_cut_site_file -c chain_file -p paf_file -r prox_file -m map_file1 -m map_file2, ... # # output: LA_REF.fa.gz anchored reference genome # LA_REF.agp agp file describing the genome @@ -16,6 +16,12 @@ # ########################################################## +#if [ "$#" -ne 3 ]; then +# echo "At least three input parameters must be provided" +# exit 1 +#fi + + #parse parameters function print_usage() @@ -24,7 +30,7 @@ echo "##########################################################" echo "#" echo "# Lep-Anchor wrapper" echo "#" -echo "# usage: lepanchor_wrapper.sh -t threads -f ref.fasta -n num_chr -c chain_file -r prox_file -p paf_file -m map_file1 -m map_file2, ... " +echo "# usage: lepanchor_wrapper.sh -t threads -f ref.fasta -n num_chr -e extra_cut_site_file -c chain_file -r prox_file -p paf_file -m map_file1 -m map_file2, ... " echo "#" echo "# download Lep-Anchor by" echo "# wget https://sourceforge.net/projects/lep-anchor/files/binary%2Bcode.zip/download -O la.zip;unzip la.zip" @@ -34,7 +40,7 @@ echo "#" echo "##########################################################" } -while getopts ":n:c:p:m:f:t:r:" OPTION; do +while getopts ":e:n:c:p:m:f:t:r:" OPTION; do case ${OPTION} in t) THREADS=$OPTARG;; @@ -44,6 +50,8 @@ while getopts ":n:c:p:m:f:t:r:" OPTION; do CHAIN=$OPTARG;; p) PAF=$OPTARG;; + e) + CUT=$OPTARG;; r) PROX=$OPTARG;; m) @@ -120,20 +128,20 @@ fi if [[ ! $REF =~ ^$ ]];then echo "calculating contigs.length file..." - echo "gunzip -fc $REF | awk -f scripts/contigLength.awk > contigs.length" | bash + echo "gunzip -fc $REF|awk -f contigLength.awk >contigs.length"|bash fi echo "finding full haplotypes..." 
-echo "gunzip -fc $CHAIN | awk -f scripts/findFullHaplotypes.awk > fullHaplotypes50.txt" | bash +echo "gunzip -fc $CHAIN|awk -f findFullHaplotypes.awk >fullHaplotypes50.txt"|bash wc -l fullHaplotypes50.txt echo "running liftoverHaplotypes for all input maps..." for i in $MAP do -echo "gunzip -fc $CHAIN | $JAVA -cp $LABIN LiftoverHaplotypes map=$i haplotypes=fullHaplotypes50.txt chain=- > $i.liftover" -done | $PARALLEL +echo "gunzip -fc $CHAIN|$JAVA -cp $LABIN LiftoverHaplotypes map=$i haplotypes=fullHaplotypes50.txt chain=- >$i.liftover" +done|$PARALLEL #store lift overed maps to $MAPL for i in $MAP @@ -145,90 +153,100 @@ done #make input for CleanMap for i in $MAP do - cat $i.liftover -done | sort -V -k 1,1 -k 2,2n > map_all_sorted.liftover +cat $i.liftover +done|sort -V -k 1,1 -k 2,2n >map_all_sorted.liftover #CleanMap echo "running CleanMap..." -$JAVA -cp $LABIN CleanMap map=map_all_sorted.liftover > map_all.clean +$JAVA -cp $LABIN CleanMap map=map_all_sorted.liftover >map_all.clean #Map2Bed echo "running Map2Bed..." -$JAVA -cp $LABIN Map2Bed map=map_all.clean contigLength=contigs.length > map.bed +$JAVA -cp $LABIN Map2Bed map=map_all.clean contigLength=contigs.length >map.bed + +if [[ ! $CUT =~ ^$ ]];then +echo "adding extra cuts to map.bed..." 
+awk '{print $1"\t"$2"\t"$3}' $CUT|sort -V -k 1,1 -k 2,2n|awk -f cutBed.awk map.bed - >map.bed.tmp +awk -f cutBed_fixN.awk contigs.length map.bed.tmp >map.bed +fi + #find contigs not put into chromosomes -cut -f 1 contigs.length | grep -v -w -F -f <(cut -f 2 fullHaplotypes50.txt; cut -f 1 map.bed) > not_used.txt +cut -f 1 contigs.length|grep -v -w -F -f <(cut -f 2 fullHaplotypes50.txt; cut -f 1 map.bed) >not_used.txt -grep -w -F -f not_used.txt contigs.length | awk -vn=$CHR '{s=$1"\t1\t"$2"\t?\t"; for (i=1;i<=n;++i) print s i}' > chr0.bed -cat map.bed chr0.bed > map_extra.bed +grep -w -F -f not_used.txt contigs.length|awk -vn=$CHR '{s=$1"\t1\t"$2"\t?\t"; for (i=1;i<=n;++i) print s i}' >chr0.bed +cat map.bed chr0.bed >map_extra.bed #PlaceAndOrientContigs echo "running PlaceAndOrientContigs (first iteration)..." for i in $(seq $CHR) do -echo "gunzip -fc $CHAIN | $JAVA -cp $LABIN PlaceAndOrientContigs bed=map_extra.bed chromosome=$i map=$MAPL chain=- paf=$PAF proximity=$PROX keepEmptyIntervals=1 > chr$i.la 2> chr$i.la.err" -done | $PARALLEL +echo "gunzip -fc $CHAIN|$JAVA -cp $LABIN PlaceAndOrientContigs bed=map_extra.bed chromosome=$i map=$MAPL chain=- paf=$PAF proximity=$PROX keepEmptyIntervals=1 >chr$i.la 2>chr$i.la.err" +done|$PARALLEL #propagate echo "running propagate..." -awk -f scripts/propagate.awk chr*.la > tmp1.la -awk -f scripts/propagate.awk tmp1.la > tmp2.la +awk -f propagate.awk chr*.la >tmp1.la +awk -f propagate.awk tmp1.la >tmp2.la i=2 while ! cmp -s "tmp$i.la" "tmp$(( $i-1 )).la" ;do - awk -f scripts/propagate.awk tmp$i.la > tmp$[$i+1].la + awk -f propagate.awk tmp$i.la >tmp$[$i+1].la i=$[$i+1] done #create prop*.la awk '/^[^#]/{++d[$1 "\t" $7+0 "\t" $8+0]; data[++line]=$0}END{for (i = 1; i <= line; ++i) {$0=data[i];if (d[$1 "\t" $7+0 "\t" $8+0] == 1) fn="prop"$5".la"; else if ($5==1) fn="prop0.la"; else fn=""; if (fn != "") print $0>fn}}' tmp$i.la +#use propagate2 to include possible bridge contigs as well... 
+#awk -f propagate2.awk tmp$i.la|awk '(/^[^#]/ && NF>=8){++d[$1 "\t" $7+0 "\t" $8+0]; data[++line]=$0}END{for (i = 1; i <= line; ++i) {$0=data[i];if (d[$1 "\t" $7+0 "\t" $8+0] == 1) fn="prop"$5".la"; else if ($5==1) fn="prop0.la"; else fn=""; if (fn != "") print $0>fn}}' + #create a new bed by combining prop[1-9]*.la and map.bed -awk '(NR==FNR){print;c[$1]}(NR!=FNR && !($1 in c)){print $1 "\t" $7+0 "\t" $8+0"\t?\t"$5}' map.bed prop[1-9]*.la > map_prop.bed +awk '(NR==FNR){print;c[$1]}(NR!=FNR && !($1 in c)){print $1 "\t" $7+0 "\t" $8+0"\t?\t"$5}' map.bed prop[1-9]*.la >map_prop.bed #PlaceAndOrientContigs echo "running PlaceAndOrientContigs (second iteration)..." for i in $(seq $CHR) do -echo "gunzip -fc $CHAIN | $JAVA -cp $LABIN PlaceAndOrientContigs bed=map_prop.bed chromosome=$i map=$MAPL chain=- paf=$PAF proximity=$PROX keepEmptyIntervals=1 > ichr$i.la 2> ichr$i.la.err" -done | $PARALLEL +echo "gunzip -fc $CHAIN|$JAVA -cp $LABIN PlaceAndOrientContigs bed=map_prop.bed chromosome=$i map=$MAPL chain=- paf=$PAF proximity=$PROX keepEmptyIntervals=1 >ichr$i.la 2>ichr$i.la.err" +done|$PARALLEL #pruning contig blocks without map support for i in $(seq $CHR) do - awk -f scripts/prune.awk ichr$i.la > ichr${i}_pruned.la -done 2> pruned.la + awk -f prune.awk ichr$i.la >ichr${i}_pruned.la +done 2>pruned.la #remove overlap(s) -awk -f scripts/removeOverlaps.awk map_prop.bed ichr*_pruned.la > iall.la +awk -f removeOverlaps.awk map_prop.bed ichr*_pruned.la >iall.la #construct agp files for i in $(seq $CHR) do -awk -vn=$i '($5==n)' iall.la | awk -vprefix="LG" -vlg=$i -f scripts/makeagp_full2.awk - > chr$i.agp -awk -vn=$i '($5==n)' iall.la | awk -vprefix="LG" -vlg=$i -f scripts/makeagp2.awk - > scaffolds_chr$i.agp +awk -vn=$i '($5==n)' iall.la|awk -vprefix="LG" -vlg=$i -f makeagp_full2.awk - >chr$i.agp +awk -vn=$i '($5==n)' iall.la|awk -vprefix="LG" -vlg=$i -f makeagp2.awk - >scaffolds_chr$i.agp done #find contigs not used -cut -f 1 contigs.length | grep -v -w -F -f <(cut -f 2 
fullHaplotypes50.txt;awk '($5!="U"){print $6}' chr*.agp) > not_used_final.txt +cut -f 1 contigs.length|grep -v -w -F -f <(cut -f 2 fullHaplotypes50.txt;awk '($5!="U"){print $6}' chr*.agp) >not_used_final.txt -grep -F -w -f not_used_final.txt contigs.length | awk '{print $1,1,$2,1,"W",$1,1,$2,"+"}' > not_used.agp +grep -F -w -f not_used_final.txt contigs.length|awk '{print $1,1,$2,1,"W",$1,1,$2,"+"}' >not_used.agp -cat chr*.agp not_used.agp > REF_LA.agp +cat chr*.agp not_used.agp >REF_LA.agp #one could use scaffolds_chr*.agp as well instead of chr*.agp -cat scaffolds_chr*.agp not_used.agp > REF_LA_scaffolds.agp +cat scaffolds_chr*.agp not_used.agp >REF_LA_scaffolds.agp #make final fasta if [[ ! $REF =~ ^$ ]];then echo "constructing final fasta (REF_LA.fa.gz)..." - echo "gunzip -fc $REF | awk -f scripts/makefasta.awk - REF_LA.agp | gzip > REF_LA.fa.gz" | bash + echo "gunzip -fc $REF|awk -f makefasta.awk - REF_LA.agp|gzip >REF_LA.fa.gz"|bash echo "constructing final fasta (REF_LA_scaffolds.fa.gz)..." 
- echo "gunzip -fc $REF | awk -f scripts/makefasta.awk - REF_LA_scaffolds.agp | gzip > REF_LA_scaffolds.fa.gz" | bash + echo "gunzip -fc $REF|awk -f makefasta.awk - REF_LA_scaffolds.agp|gzip >REF_LA_scaffolds.fa.gz"|bash fi #construct Marey map @@ -238,9 +256,13 @@ for m in $MAP do for c in $(seq $CHR) do - awk -vn=$c '($3==n)' $m.liftover | awk -f scripts/liftover.awk chr$c.agp - |awk -vm=$j '(/LG/ && NF>=4){if (NF==4) $5=$4;print $1"\t"$2"\t"$3"\t"m"\t"$4"\t"$5}' +# awk -vn=$c '($3==n)' $m.liftover|awk -f liftover.awk chr$c.agp -|awk -vm=$j '(/LG/ && NF>=4){if (NF==4) $5=$4;print $1"\t"$2"\t"$3"\t"m"\t"$4"\t"$5}' + awk -vn=$c '($3==n)' $m.liftover|awk -f liftover.awk chr$c.agp -|awk -vm=$j '(/LG/ && NF>=4){if (NF==4) {print $1"\t"$2"\t"$3"\t"m"\t"$4"\t"$4} else for (i=4;i marey.data.gz +done|gzip >marey.data.gz + +Rscript plot_marey.R + +#TODO: lepanchor_wrapper_step2.sh -Rscript scripts/plot_marey.R \ No newline at end of file diff --git a/software/LepAnchor/scripts/lepanchor_wrapper2.sh b/software/LepAnchor/scripts/lepanchor_wrapper2.sh new file mode 100755 index 0000000..a5f7991 --- /dev/null +++ b/software/LepAnchor/scripts/lepanchor_wrapper2.sh @@ -0,0 +1,287 @@ +#!/bin/bash +########################################################## +# +# Lep-Anchor wrapper, version 2 +# +# usage: lepanchor_wrapper2.sh -f ref.fasta -e extra_cut_site_file -n num_chr -c chain_file -p paf_file -r prox_file -m map_file1 -m map_file2, ... 
+# +# output: REF_LA.fa.gz anchored reference genome +# REF_LA.agp agp file describing the genome +# REF_LA_scaffolds.fa.gz anchored reference genome in scaffolds +# REF_LA_scaffolds.agp anchored reference genome in scaffolds agp file +# marey*.png Marey maps for visual verification +# +# chr*.agp agp files for each chromosome +# scaffolds_chr*.agp agp files for each chromosome in scaffolds (each block of linked contigs as a scaffold) +# +# +# Pasi Rastas, (c) 2021, pasi.rastas@gmail.com +# +########################################################## + +#if [ "$#" -ne 3 ]; then +# echo "At least three input parameters must be provided" +# exit 1 +#fi + + +#parse parameters + +function print_usage() +{ +echo "##########################################################" +echo "#" +echo "# Lep-Anchor wrapper2" +echo "#" +echo "# usage: lepanchor_wrapper2.sh -T thread_per_run -t threads -f ref.fasta -n num_chr -e extra_cut_site_file -c chain_file -r prox_file -p paf_file -m map_file1 -m map_file2, ... " +echo "#" +echo "# download Lep-Anchor by" +echo "# wget https://sourceforge.net/projects/lep-anchor/files/binary%2Bcode.zip/download -O la.zip;unzip la.zip" +echo "#" +echo "# Pasi Rastas, (c) 2021, pasi.rastas@gmail.com" +echo "#" +echo "##########################################################" +} + +while getopts ":e:n:c:p:m:f:t:r:T:" OPTION; do + case ${OPTION} in + T) + THREADS2=$OPTARG;; + t) + THREADS=$OPTARG;; + n) + CHR=$OPTARG;; + c) + CHAIN=$OPTARG;; + p) + PAF=$OPTARG;; + e) + CUT=$OPTARG;; + r) + PROX=$OPTARG;; + m) + MAP="$MAP $OPTARG";; + f) + REF=$OPTARG;; + *) + echo "Incorrect options provided" + exit 1;; + esac +done + +if [[ $MAP =~ ^$ ]];then + print_usage + echo "Please provide at least one map" + exit 1 +fi + +if [[ $REF =~ ^$ ]];then + if [[ ! 
-e "contigs.length" ]];then + print_usage + echo "Please provide either reference fasta or contigs.length file" + exit 1 + fi +fi + +if [[ $CHR =~ ^[0-9]+$ ]];then + echo "Number of chromosomes = $CHR" +else + print_usage + echo "Please provide parameter n (number of chromosomes)" + exit 1 +fi + +if [[ $CHAIN =~ ^$ ]];then + echo "No chain provided" + CHAIN="/dev/null" +fi + +if [[ $PROX =~ ^$ ]];then + echo "No proximity data provided" + PROX="/dev/null" +fi + + +if [[ $PAF =~ ^$ ]];then + echo "No paf provided" + PAF="/dev/null" +fi + +#number of threads +if [[ $THREADS =~ ^$ ]];then + THREADS=8 +fi + +if [[ $THREADS2 =~ ^$ ]];then + THREADS2=1 +fi + +#get Lep-Anchor +#wget https://sourceforge.net/projects/lep-anchor/files/binary%2Bcode.zip/download -O la.zip +#unzip la.zip + +#where Lep-Anchor binaries are located +LABIN=bin/ + +#java runtime located here +JAVA=java + +#parallel +if ! command -v "parallel" +then + echo "command parallel not found, using only one thread" + PARALLEL=bash +else + PARALLEL="parallel --jobs $THREADS" +fi + +if [[ ! $REF =~ ^$ ]];then + echo "calculating contigs.length file..." + echo "gunzip -fc $REF|awk -f contigLength.awk >contigs.length"|bash +fi + + +echo "finding full haplotypes..." +echo "gunzip -fc $CHAIN|awk -f findFullHaplotypes.awk >fullHaplotypes50.txt"|bash +wc -l fullHaplotypes50.txt + + +echo "running liftoverHaplotypes for all input maps..." +for i in $MAP +do +echo "gunzip -fc $CHAIN|$JAVA -cp $LABIN LiftoverHaplotypes map=$i haplotypes=fullHaplotypes50.txt chain=- >$i.liftover" +done|$PARALLEL + +#store lift overed maps to $MAPL +for i in $MAP +do + MAPL="$MAPL $i.liftover" +done + +#make input for CleanMap +for i in $MAPL +do +cat $i +done|sort -V -k 1,1 -k 2,2n >map_all_sorted.liftover + + +#CleanMap +echo "running CleanMap..." +$JAVA -cp $LABIN CleanMap map=map_all_sorted.liftover >map_all.clean + +#Map2Bed +echo "running Map2Bed..." 
+$JAVA -cp $LABIN Map2Bed map=map_all.clean contigLength=contigs.length >map.bed + +if [[ ! $CUT =~ ^$ ]];then +echo "adding extra cuts to map.bed..." +awk '{print $1"\t"$2"\t"$3}' $CUT|sort -V -k 1,1 -k 2,2n|awk -f cutBed.awk map.bed - >map.bed.tmp +awk -f cutBed_fixN.awk contigs.length map.bed.tmp >map.bed +fi + + +#find contigs not put into chromosomes +cut -f 1 contigs.length|grep -v -w -F -f <(cut -f 2 fullHaplotypes50.txt; cut -f 1 map.bed) >not_used.txt +#cut -f 1 contigs.length|grep -v -w -F -f <(cut -f 1 map.bed) >not_used.txt + + +grep -w -F -f not_used.txt contigs.length|awk -vn=$CHR '{s=$1"\t1\t"$2"\t?\t"; for (i=1;i<=n;++i) print s i}' >chr0.bed +cat map.bed chr0.bed >map_extra.bed + +#PlaceAndOrientContigs +echo "running PlaceAndOrientContigs (first iteration)..." +for i in $(seq $CHR) +do +echo "gunzip -fc $CHAIN|$JAVA -cp $LABIN PlaceAndOrientContigs numThreads=$THREADS2 bed=map_extra.bed chromosome=$i map=$MAPL chain=- paf=$PAF proximity=$PROX keepEmptyIntervals=0 >chr$i.la 2>chr$i.la.err" +done|$PARALLEL + +#propagate4 +echo "running propagate4..." +awk -f propagate4.awk pass=1 chr*[0-9].la pass=2 chr*[0-9].la.err|awk -f pickbed.awk - map_extra.bed >map_extra_prop.bed + +#PlaceAndOrientContigs +echo "running PlaceAndOrientContigs (second iteration)..." +for i in $(seq $CHR) +do +echo "gunzip -fc $CHAIN|$JAVA -cp $LABIN PlaceAndOrientContigs numThreads=$THREADS2 $(awk -f pickorientation.awk chr$i.la) bed=map_extra_prop.bed chromosome=$i map=$MAPL chain=- paf=$PAF proximity=$PROX keepEmptyIntervals=1 >ichr$i.la 2>ichr$i.la.err" +done|$PARALLEL + +#propagate +echo "running propagate..." + +awk -f propagate.awk ichr*.la >tmp1.la +awk -f propagate.awk tmp1.la >tmp2.la +i=2 + +while ! cmp -s "tmp$i.la" "tmp$(( $i-1 )).la" ;do + awk -f propagate.awk tmp$i.la >tmp$[$i+1].la + i=$[$i+1] +done + +#create prop*.la, take only contig put to uniquely to a chromosome +#use propagate2 to include possible bridge contigs as well... 
+awk -f propagate2.awk tmp$i.la|awk '(/^[^#]/ && NF>=8){++d[$1"\t"($7+0)"\t"($8+0)]; data[++line]=$0}END{for (i=1; i<=line; ++i) {$0=data[i];if (d[$1"\t"($7+0)"\t"($8+0)] == 1) {fn="prop"$5".la";print $0>fn}}}' + +#create a new bed by combining prop[1-9]*.la and map_extra_prop.bed +awk '{print $1"\t"($7+0)"\t"($8+0)"\t?\t"$5}' prop[1-9]*.la|awk -f pickbed.awk - map_extra_prop.bed >map_extra_prop2.bed + +#third iteration could be just improveOrder + +#PlaceAndOrientContigs +echo "running PlaceAndOrientContigs (third iteration)..." +for i in $(seq $CHR) +do +echo "gunzip -fc $CHAIN|$JAVA -cp $LABIN PlaceAndOrientContigs numThreads=$THREADS2 $(awk -f pickorientation.awk chr$i.la) bed=map_extra_prop2.bed chromosome=$i map=$MAPL chain=- paf=$PAF proximity=$PROX keepEmptyIntervals=1 evaluateAnchoring=prop$i.la improveAnchoring=1 numRuns=1 >iichr$i.la 2>iichr$i.la.err" +done|$PARALLEL + +#pruning contig blocks without map support +for i in $(seq $CHR) +do + awk -f prune.awk iichr$i.la >iichr${i}_pruned.la +done 2>pruned.la + +#remove overlap(s) +awk -f removeOverlaps.awk map_extra_prop2.bed iichr*_pruned.la >iiall.la + +#construct agp files +for i in $(seq $CHR) +do +awk -vn=$i '($5==n)' iiall.la|awk -vprefix="LG" -vlg=$i -f makeagp_full2.awk - >chr$i.agp +awk -vn=$i '($5==n)' iiall.la|awk -vprefix="LG" -vlg=$i -f makeagp2.awk - >scaffolds_chr$i.agp +done + +#find contigs not used +cut -f 1 contigs.length|grep -v -w -F -f <(cut -f 2 fullHaplotypes50.txt;awk '($5!="U"){print $6}' chr*.agp) >not_used_final.txt + +grep -F -w -f not_used_final.txt contigs.length|awk '{print $1,1,$2,1,"W",$1,1,$2,"+"}' >not_used.agp + +cat chr*.agp not_used.agp >REF_LA.agp +#one could use scaffolds_chr*.agp as well instead of chr*.agp +cat scaffolds_chr*.agp not_used.agp >REF_LA_scaffolds.agp + +#make final fasta +if [[ ! $REF =~ ^$ ]];then + echo "constructing final fasta (REF_LA.fa.gz)..." 
+ echo "gunzip -fc $REF|awk -f makefasta.awk - REF_LA.agp|gzip >REF_LA.fa.gz"|bash + + echo "constructing final fasta (REF_LA_scaffolds.fa.gz)..." + echo "gunzip -fc $REF|awk -f makefasta.awk - REF_LA_scaffolds.agp|gzip >REF_LA_scaffolds.fa.gz"|bash +fi + +#construct Marey map +echo "constructing Marey maps... (marey*.png)" +j=1 +for m in $MAP +do + for c in $(seq $CHR) + do +# awk -vn=$c '($3==n)' $m.liftover|awk -f liftover.awk chr$c.agp -|awk -vm=$j '(/LG/ && NF>=4){if (NF==4) $5=$4;print $1"\t"$2"\t"$3"\t"m"\t"$4"\t"$5}' + awk -vn=$c '($3==n)' $m.liftover|awk -f liftover.awk chr$c.agp -|awk -vm=$j '(/LG/ && NF>=4){if (NF==4) {print $1"\t"$2"\t"$3"\t"m"\t"$4"\t"$4} else for (i=4;imarey.data.gz + +Rscript plot_marey.R + +#TODO: lepanchor_wrapper_step2.sh + diff --git a/software/LepAnchor/scripts/lepanchor_wrapper3.sh b/software/LepAnchor/scripts/lepanchor_wrapper3.sh new file mode 100755 index 0000000..3df50a4 --- /dev/null +++ b/software/LepAnchor/scripts/lepanchor_wrapper3.sh @@ -0,0 +1,292 @@ +#!/bin/bash +########################################################## +# +# Lep-Anchor wrapper, version 3 (allopolyploid version) +# +# usage: lepanchor_wrapper3.sh -f ref.fasta -e extra_cut_site_file -n num_chr -c chain_file -p paf_file -r prox_file -m map_file1 -m map_file2, ... 
+# +# output: REF_LA.fa.gz anchored reference genome +# REF_LA.agp agp file describing the genome +# REF_LA_scaffolds.fa.gz anchored reference genome in scaffolds +# REF_LA_scaffolds.agp anchored reference genome in scaffolds agp file +# marey*.png Marey maps for visual verification +# +# chr*.agp agp files for each chromosome +# scaffolds_chr*.agp agp files for each chromosome in scaffolds (each block of linked contigs as a scaffold) +# +# +# Pasi Rastas, (c) 2021, pasi.rastas@gmail.com +# +########################################################## + +#if [ "$#" -ne 3 ]; then +# echo "At least three input parameters must be provided" +# exit 1 +#fi + + +#parse parameters + +function print_usage() +{ +echo "##########################################################" +echo "#" +echo "# Lep-Anchor wrapper3 (allopolyploid)" +echo "#" +echo "# usage: lepanchor_wrapper3.sh -T thread_per_run -t threads -f ref.fasta -e extra_cut_site_file -n num_chr -c chain_file -r prox_file -p paf_file -m map_file1 -m map_file2, ... " +echo "#" +echo "# download Lep-Anchor by" +echo "# wget https://sourceforge.net/projects/lep-anchor/files/binary%2Bcode.zip/download -O la.zip;unzip la.zip" +echo "#" +echo "# Pasi Rastas, (c) 2021, pasi.rastas@gmail.com" +echo "#" +echo "##########################################################" +} + +while getopts ":e:n:c:p:m:f:t:r:T:" OPTION; do + case ${OPTION} in + T) + THREADS2=$OPTARG;; + t) + THREADS=$OPTARG;; + n) + CHR=$OPTARG;; + c) + CHAIN=$OPTARG;; + p) + PAF=$OPTARG;; + e) + CUT=$OPTARG;; + r) + PROX=$OPTARG;; + m) + MAP="$MAP $OPTARG";; + f) + REF=$OPTARG;; + *) + echo "Incorrect options provided" + exit 1;; + esac +done + +if [[ $MAP =~ ^$ ]];then + print_usage + echo "Please provide at least one map" + exit 1 +fi + +if [[ $REF =~ ^$ ]];then + if [[ ! 
-e "contigs.length" ]];then + print_usage + echo "Please provide either reference fasta or contigs.length file" + exit 1 + fi +fi + +if [[ $CHR =~ ^[0-9]+$ ]];then + echo "Number of chromosomes = $CHR" +else + print_usage + echo "Please provide parameter n (number of chromosomes)" + exit 1 +fi + +if [[ $CHAIN =~ ^$ ]];then + echo "No chain provided" + CHAIN="/dev/null" +fi + +if [[ $PROX =~ ^$ ]];then + echo "No proximity data provided" + PROX="/dev/null" +fi + + +if [[ $PAF =~ ^$ ]];then + echo "No paf provided" + PAF="/dev/null" +fi + +#number of threads +if [[ $THREADS =~ ^$ ]];then + THREADS=8 +fi + +if [[ $THREADS2 =~ ^$ ]];then + THREADS2=1 +fi + +#get Lep-Anchor +#wget https://sourceforge.net/projects/lep-anchor/files/binary%2Bcode.zip/download -O la.zip +#unzip la.zip + +#where Lep-Anchor binaries are located +LABIN=bin/ + +#java runtime located here +JAVA=java + +#parallel +if ! command -v "parallel" +then + echo "command parallel not found, using only one thread" + PARALLEL=bash +else + PARALLEL="parallel --tmpdir . --jobs $THREADS" +fi + +if [[ ! $REF =~ ^$ ]];then + echo "calculating contigs.length file..." + echo "gunzip -fc $REF|awk -f contigLength.awk >contigs.length"|bash +fi + +#make input for CleanMap +for i in $MAP +do +cat $i +done|sort -V -k 1,1 -k 2,2n >map_all_sorted + + +#CleanMap +echo "running CleanMap..." +$JAVA -cp $LABIN CleanMap map=map_all_sorted >map_all.clean + +#Map2Bed +echo "running Map2Bed..." +$JAVA -cp $LABIN Map2Bed map=map_all.clean contigLength=contigs.length >map.bed + + +if [[ ! $CUT =~ ^$ ]];then +echo "adding extra cuts to map.bed..." 
+awk '{print $1"\t"$2"\t"$3}' $CUT|sort -V -k 1,1 -k 2,2n|awk -f cutBed.awk map.bed - >map.bed.tmp +awk -f cutBed_fixN.awk contigs.length map.bed.tmp >map.bed +fi + +#find contigs not put into chromosomes +cut -f 1 contigs.length|grep -v -w -F -f <(cut -f 1 map.bed) >not_used.txt + +grep -w -F -f not_used.txt contigs.length|awk -vn=$CHR '{s=$1"\t1\t"$2"\t?\t"; for (i=1;i<=n;++i) print s i}' >chr0.bed +cat map.bed chr0.bed >map_extra.bed + +#PlaceAndOrientContigs +echo "running PlaceAndOrientContigs (first iteration)..." +for i in $(seq $CHR) +do +echo "gunzip -fc $CHAIN|$JAVA -cp $LABIN PlaceAndOrientContigs numThreads=$THREADS2 bed=map_extra.bed chromosome=$i map=$MAP chain=- paf=$PAF proximity=$PROX keepEmptyIntervals=0 >chr$i.la 2>chr$i.la.err" +done|$PARALLEL + +#propagate4 +echo "running propagate4..." +awk -f propagate4.awk pass=1 chr*[0-9].la pass=2 chr*[0-9].la.err|awk -f pickbed.awk - map_extra.bed >map_extra_prop.bed + +#PlaceAndOrientContigs +echo "running PlaceAndOrientContigs (second iteration)..." +for i in $(seq $CHR) +do +echo "gunzip -fc $CHAIN|$JAVA -cp $LABIN PlaceAndOrientContigs numThreads=$THREADS2 $(awk -f pickorientation.awk chr$i.la) bed=map_extra_prop.bed chromosome=$i map=$MAP chain=- paf=$PAF proximity=$PROX keepEmptyIntervals=1 >ichr$i.la 2>ichr$i.la.err" +done|$PARALLEL + +#propagate +echo "running propagate..." + +awk -f propagate.awk ichr*.la >tmp1.la +awk -f propagate.awk tmp1.la >tmp2.la +i=2 + +while ! cmp -s "tmp$i.la" "tmp$(( $i-1 )).la" ;do + awk -f propagate.awk tmp$i.la >tmp$[$i+1].la + i=$[$i+1] +done + +#create prop*.la, take only contig put to uniquely to a chromosome +#use propagate2 to include possible bridge contigs as well... 
+awk -f propagate2.awk tmp$i.la|awk '(/^[^#]/ && NF>=8){++d[$1"\t"($7+0)"\t"($8+0)]; data[++line]=$0}END{for (i=1; i<=line; ++i) {$0=data[i];if (d[$1"\t"($7+0)"\t"($8+0)] == 1) {fn="prop"$5".la";print $0>fn}}}' + +#create a new bed by combining prop[1-9]*.la and map_extra_prop.bed +awk '{print $1"\t"($7+0)"\t"($8+0)"\t?\t"$5}' prop[1-9]*.la|awk -f pickbed.awk - map_extra_prop.bed >map_extra_prop2.bed + +#third iteration could be just improveOrder + +#PlaceAndOrientContigs +echo "running PlaceAndOrientContigs (third iteration)..." +for i in $(seq $CHR) +do +echo "gunzip -fc $CHAIN|$JAVA -cp $LABIN PlaceAndOrientContigs numThreads=$THREADS2 $(awk -f pickorientation.awk chr$i.la) bed=map_extra_prop2.bed chromosome=$i map=$MAP chain=- paf=$PAF proximity=$PROX keepEmptyIntervals=1 evaluateAnchoring=prop$i.la improveAnchoring=1 numRuns=1 >iichr$i.la 2>iichr$i.la.err" +done|$PARALLEL + +echo "finding full haplotypes..." +awk '($NF=="haplotype")' iichr*.err|sort -n -r|awk -vlimit=2000 '($NF=="haplotype" && ($1>=($4-$3+1-limit)/limit) && (!(($5 SUBSEP $6 SUBSEP $7) in h))){h[$2,$3,$4]; print}' >fullHaplotypes50.txt +wc -l fullHaplotypes50.txt + +echo "running liftoverHaplotypes for all input maps..." +for i in $MAP +do +echo "gunzip -fc $CHAIN|$JAVA -cp $LABIN LiftoverHaplotypes map=$i haplotypes=fullHaplotypes50.txt chain=- >$i.liftover" +done|$PARALLEL + +#store lift overed maps to $MAPL +for i in $MAP +do + MAPL="$MAPL $i.liftover" +done + +awk -f removeHaplotypes.awk map_extra_prop2.bed fullHaplotypes50.txt > map_extra_prop2_nh.bed + +#PlaceAndOrientContigs +echo "running PlaceAndOrientContigs (fourth iteration)..." 
+for i in $(seq $CHR) +do +echo "gunzip -fc $CHAIN|$JAVA -cp $LABIN PlaceAndOrientContigs numThreads=$THREADS2 $(awk -f pickorientation.awk chr$i.la) bed=map_extra_prop2_nh.bed chromosome=$i map=$MAPL chain=- paf=$PAF proximity=$PROX keepEmptyIntervals=1 evaluateAnchoring=iichr$i.la improveAnchoring=1 numRuns=1 >iiichr$i.la 2>iiichr$i.la.err" +done|$PARALLEL + + +#pruning contig blocks without map support +for i in $(seq $CHR) +do + awk -f prune.awk iiichr$i.la >iiichr${i}_pruned.la +done 2>pruned.la + +#remove overlap(s) +awk -f removeOverlaps.awk map_extra_prop2.bed iiichr*_pruned.la >iiall.la + +#construct agp files +for i in $(seq $CHR) +do +awk -vn=$i '($5==n)' iiall.la|awk -vprefix="LG" -vlg=$i -f makeagp_full2.awk - >chr$i.agp +awk -vn=$i '($5==n)' iiall.la|awk -vprefix="LG" -vlg=$i -f makeagp2.awk - >scaffolds_chr$i.agp +done + +#find contigs not used +cut -f 1 contigs.length|grep -v -w -F -f <(cut -f 2 fullHaplotypes50.txt;cut -f 1 iiall.la) >not_used_final.txt + +grep -F -w -f not_used_final.txt contigs.length|awk '{print $1,1,$2,1,"W",$1,1,$2,"+"}' >not_used.agp + +cat chr*.agp not_used.agp >REF_LA.agp +#one could use scaffolds_chr*.agp as well instead of chr*.agp +cat scaffolds_chr*.agp not_used.agp >REF_LA_scaffolds.agp + +#make final fasta +if [[ ! $REF =~ ^$ ]];then + echo "constructing final fasta (REF_LA.fa.gz)..." + echo "gunzip -fc $REF|awk -f makefasta.awk - REF_LA.agp|gzip >REF_LA.fa.gz"|bash + + echo "constructing final fasta (REF_LA_scaffolds.fa.gz)..." + echo "gunzip -fc $REF|awk -f makefasta.awk - REF_LA_scaffolds.agp|gzip >REF_LA_scaffolds.fa.gz"|bash +fi + +#construct Marey map +echo "constructing Marey maps... 
(marey*.png)" +j=1 +for m in $MAPL +do + for c in $(seq $CHR) + do + awk -vn=$c '($3==n)' $m|awk -f liftover.awk REF_LA.agp -|awk -vm=$j '(/LG/ && NF>=4){if (NF==4) {print $1"\t"$2"\t"$3"\t"m"\t"$4"\t"$4} else for (i=4;imarey.data.gz + +Rscript plot_marey.R + +#TODO: lepanchor_wrapper_step2.sh + diff --git a/software/LepAnchor/scripts/longread.awk b/software/LepAnchor/scripts/longread.awk new file mode 100644 index 0000000..8a97b97 --- /dev/null +++ b/software/LepAnchor/scripts/longread.awk @@ -0,0 +1,72 @@ +#awk -f longread.awk aln.paf +#aln.paf: minimap2 +#process long read alignment data for Lep-Anchor +BEGIN{ + #mapping quality limit + if (qLimit == "") + qLimit=1 + FS="\t" + rev["+"] = "-" + rev["-"] = "+" +} +(NF>=12 && $12>=qLimit){ + if ($1 == prevR) { + for (i = 1; i <= n; ++i) { + processOnePair($6,$8,$9,$3,$4,$5,c[i],start[i],end[i],rstart[i],rend[i],o[i],min(q[i], $12)) + } + ++n + } else + n = 1 + #print within contig part + print $6"\t"$8"\t"$6"\t"$9"\t++\t"$12 + print $6"\t"$9"\t"$6"\t"$8"\t--\t"$12 + o[n]=$5 #orientation + c[n]=$6 #contig + + start[n]=$8 #start + end[n]=$9 #end + + rstart[n]=$3 #read start + rend[n]=$4 #read end + + q[n]=$12 #quality + + prevR = $1 + +} +function min(a,b){ + if (a <= b) + return a + else + return b +} +function processOnePair(c1, start1, end1, rstart1, rend1, o1, c2, start2, end2, rstart2, rend2, o2, quality ,lp1,lp2,overlap) +{ + if (rstart1 > rstart2) { + processOnePair(c2, start2, end2, rstart2, rend2, o2, c1, start1, end1, rstart1, rend1, o1, quality) + return; + } + #now rstart1 <= rstart2 +# print "process" + overlap = 0 + if (rend1 > rstart2) + overlap = rend1 - rstart2 + overlap = int(overlap * 0.5) + + if (o1 == "+") + lp1 = end1 + 1 - overlap + else + lp1 = start1 + overlap + + if (o2 == "+") + lp2 = start2 + overlap + else + lp2 = end2 + 1 - overlap + + #print both ways + print c1"\t"lp1"\t"c2"\t"lp2"\t"o1 o2"\t"quality + print c2"\t"lp2"\t"c1"\t"lp1"\t"rev[o2] rev[o1]"\t"quality +# print "end" +} +END{ 
+} diff --git a/software/LepAnchor/scripts/makeagp2.awk b/software/LepAnchor/scripts/makeagp2.awk index 6b460f1..668df04 100644 --- a/software/LepAnchor/scripts/makeagp2.awk +++ b/software/LepAnchor/scripts/makeagp2.awk @@ -26,12 +26,16 @@ BEGIN{ pos = 0 n = "" } else if (gapLength > 0 && ($9=="null" || ($10!="null" && substr($10, 1, 5) != "chain"))) { #do not add gap for partial haplotypes - print prefix lg "." suffix "\t" pos + 1 "\t" pos + gapLength "\t" ++n "\tU\t" gapLength "\tcontig\tno\tna" - pos += gapLength + if ($9 != $1 || prevO != $4 || ((($4 ~ /+/) || $4=="?") && $2 != prevE + 1) || (($4 ~ /-/) && $3 != prevS - 1)) { #do not add gap if we have cut the same contig... + print prefix lg "." suffix "\t" pos + 1 "\t" pos + gapLength "\t" ++n "\tU\t" gapLength "\tcontig\tno\tna" + pos += gapLength + } } } print prefix lg "." suffix "\t" pos + 1 "\t" pos + ($3-$2+1) "\t" ++n "\tW\t" $1 "\t" $2 "\t" $3 "\t" map[$4] pos += ($3-$2+1) prevO = $4 + prevS = $2 + prevE = $3 } diff --git a/software/LepAnchor/scripts/makeagp_full2.awk b/software/LepAnchor/scripts/makeagp_full2.awk index 25e39de..3a8ddea 100644 --- a/software/LepAnchor/scripts/makeagp_full2.awk +++ b/software/LepAnchor/scripts/makeagp_full2.awk @@ -19,12 +19,16 @@ BEGIN{ (/^[^#]/ && /^[^$]/ && $3-$2>=0){ if (n != "") { if (gapLength > 0 && ($9=="null" || ($10!="null" && substr($10, 1, 5) != "chain"))) { #do not add gap for partial haplotypes - print prefix lg "\t" pos + 1 "\t" pos + gapLength "\t" ++n "\tU\t" gapLength "\tcontig\tno\tna" - pos += gapLength + if ($9 != $1 || prevO != $4 || ((($4 ~ /+/) || $4=="?") && $2 != prevE + 1) || (($4 ~ /-/) && $3 != prevS - 1)) { #do not add gap if we have cut the same contig... 
+ print prefix lg "\t" pos + 1 "\t" pos + gapLength "\t" ++n "\tU\t" gapLength "\tcontig\tno\tna" + pos += gapLength + } } } print prefix lg "\t" pos + 1 "\t" pos + ($3-$2+1) "\t" ++n "\tW\t" $1 "\t" $2 "\t" $3 "\t" map[$4] pos += ($3-$2+1) prevO = $4 + prevS = $2 + prevE = $3 } diff --git a/software/LepAnchor/scripts/normalise.awk b/software/LepAnchor/scripts/normalise.awk new file mode 100644 index 0000000..3192958 --- /dev/null +++ b/software/LepAnchor/scripts/normalise.awk @@ -0,0 +1,62 @@ +#normalise proximity data based on within bin proximity +#awk -f normalise prox.data prox.data +BEGIN{ + OFS="\t" +} +(NR==FNR && $1==$3 && $2==$4){ #same position + sum2[$1,$2]+=$5 +} +(NR!=FNR) { + if (!first) { + for (i in sum2) { + s = sum2[i] + if (max == "" || max <= s) + max = s + if (min == "" || min >= s) + min = s + sum += s + sumS += s**2 + ++n + } + + avg = sum / n + iavg = 100.0 / avg + sd = sqrt((sumS + n * avg**2 - 2 * avg * sum)/(n - 1)) + print avg "\tsd=\t" sd "\tmin=\t" min "\tmax=\t" max "\tn=" n > "/dev/stderr" + first = 1 + } + s1 = sum2[$1,$2] + 0 + s2 = sum2[$3,$4] + 0 + $5 *= normalise(s1, s2) + print +} +function maxf(s1, s2) +{ + if (s1 >= s2) + return s1 + return s2 +} +function minf(s1, s2) +{ + if (s1 < s2) + return s1 + return s2 + +} +#maxf +function normalise(s1, s2) +{ + if (s1 <= 0.5 * avg && s2 <= 0.5 * avg) + return 2 * iavg + else + return 100.0 / maxf(s1, s2) +} +#minf +function normalise2(s1, s2) +{ + if (s1 <= avg || s2 <= avg) + return iavg + else + return 100.0 / minf(s1, s2) +} + diff --git a/software/LepAnchor/scripts/paf2chain.awk b/software/LepAnchor/scripts/paf2chain.awk new file mode 100644 index 0000000..66f6ee6 --- /dev/null +++ b/software/LepAnchor/scripts/paf2chain.awk @@ -0,0 +1,52 @@ +#makes a chain file from minimap2 (-c) paf +#minimap2 --no-long-join -c -PD -g 20000 -r 1000,40000 -k15 -w8 -A3 -B6 -O12 -E1 -s200 -z600 -N50 --min-occ-floor=100 -t 20 ref.fa ref.fa >ref-self.paf +#awk -f chainpaf.awk ref-self.paf|awk -f 
sortpaf.awk| sort -T . -n -r -k 1,1 | cut -f 2- > ref-self_sorted.paf +#awk -f paf2chain.awk rf-self_sorted.paf | gzip > paf.chain.gz +BEGIN{ + OFS="\t" + if (scale == "") + scale = 33 +} +($1!=$6){ + score = -1 + cigar = "" + for (i=13; i<=NF; ++i) { + if ($i ~ /^AS:i:/) + score = substr($i, 6)+0 + if ($i ~ /^cg:Z:/) + cigar = substr($i, 6) + } + if ($5 == "+") + print "chain\t" scale * score, $1, $2, "+", $3, $4, $6, $7, "+" ,$8, $9, NR + else + print "chain\t" scale * score, $1, $2, "+", $3, $4, $6, $7, "-" ,$7-$9, $7-$8, NR + + c = cigar + + d = 0 + i = 0 + prev = "" + do { + n = c+0 + o = substr(c, length(n)+1, 1) + c = substr(c, length(n)+2) +# print n " " o + if (o == "M") { + if (prev != "") + print prev, i, d + prev = n + i = 0 + d = 0 + } else if (o == "I") { + i+=n + } else if (o == "D") { + d+=n + } else { + print "error: cannot parse cigar operation " n " " o + exit + } + } while (c != "") + print prev +# print cigar +#"\t" cigar +} diff --git a/software/LepAnchor/scripts/prox10x.awk b/software/LepAnchor/scripts/prox10x.awk new file mode 100644 index 0000000..e39c363 --- /dev/null +++ b/software/LepAnchor/scripts/prox10x.awk @@ -0,0 +1,50 @@ +#samtools view -f 0x40 -q 20 5148.bam|awk -f umi.awk|awk -vbin=10000 -f prox10x.awk +#low memory version +BEGIN{ + if (bin == "") + bin = 10000 + ibin = 1.0 / bin + if (minD == "") + minD = 2 + +} +function toBin(p) { + return 1 + bin * int((p - 1) * ibin) +} +{ + p = toBin($2) + i = $1 "\t" p "\t" $3 + if (!(i in used)) { + snpUmi[$3,++numUmi[$3]] = $1"\t"p + used[i] + } + if (!($1 in contigs)) { + print "processing contig " $1 > "/dev/stderr" + contigs[$1]=$2 + } + else if ($2 > contigs[$1]) + contigs[$1]=$2 + +} +END{ + for (u in numUmi) + for (i = 1; i <= numUmi[u]; ++i) + umi[snpUmi[u,i], ++count[snpUmi[u,i]]] = u + + for (c in contigs) { + for (p = 1; p <= contigs[c]; p+=bin) { + delete dist + s1 = c "\t" p + for (i = 1; i <= count[s1]; ++i) { + u = umi[s1, i] + for (j = 1; j <= numUmi[u]; ++j) { + s2 = 
snpUmi[u,j] + ++dist[s1"\t"s2] + } + } + for (d in dist) + if (dist[d] >= minD) + print d "\t" dist[d] + } + } +} diff --git a/software/LepAnchor/scripts/removeHaplotypes.awk b/software/LepAnchor/scripts/removeHaplotypes.awk index a3d6510..7867095 100644 --- a/software/LepAnchor/scripts/removeHaplotypes.awk +++ b/software/LepAnchor/scripts/removeHaplotypes.awk @@ -7,7 +7,7 @@ #chr*.err are the output from Lep-Anchor error stream, e.g. java PlaceAndOrientContigs ... chromosome=X ... 2>chrX.err #haplotypes are put to maxChr+1 BEGIN{ - FS="\t" +# FS="\t" OFS="\t" } @@ -74,9 +74,10 @@ BEGIN{ bed[contig, jmax + 1] = $0 } } + ++numHaplotypes } END{ - if (jmax != "") + if (jmax != "" || numHaplotypes == 0) for (i in contigs) { for (j = 1; j <= c[i]; ++j) print bed[i, j] diff --git a/software/LepAnchor/scripts/sortpaf.awk b/software/LepAnchor/scripts/sortpaf.awk new file mode 100644 index 0000000..ee2c38a --- /dev/null +++ b/software/LepAnchor/scripts/sortpaf.awk @@ -0,0 +1,7 @@ +{ + for (j = 13; j <= NF; ++j) + if ($j ~ /^AS:i:/) { + print (substr($j, 6)+0) "\t" $0 + break + } +} diff --git a/software/LepAnchor/scripts/umi.awk b/software/LepAnchor/scripts/umi.awk new file mode 100644 index 0000000..45d0b15 --- /dev/null +++ b/software/LepAnchor/scripts/umi.awk @@ -0,0 +1,64 @@ +#samtools view -f 0x40 10x.bam|awk -f umi.awk +#10x.bam: bwa mem -t 16 REF R1.fq.gz R2.fq.gz|samtools view -b -|samtools sort - -@4 -T tmp1 -o 10x.bam +#get 10x barcodes from bam aligning raw reads (without cutting barcode off) +#process 10x data for Lep-Anchor +BEGIN{ + #store possible cigar codes for soft clipping of barcode (16bp + 7bp), 16..24 should work + for (i = 16; i<=24; ++i) + s[i "S"] +} +function reverseComplement(s ,i,n,map,ret,nc) { + n = length(s) + map["A"]="T" + map["C"]="G" + map["G"]="C" + map["T"]="A" + map["Y"]="R" + map["R"]="Y" + map["S"]="S" + map["W"]="W" + map["K"]="M" + map["M"]="K" + map["B"]="V" + map["V"]="B" + map["D"]="H" + map["H"]="D" + map["N"]="N" + + 
map["a"]="t" + map["c"]="g" + map["g"]="c" + map["t"]="a" + map["y"]="r" + map["r"]="y" + map["s"]="s" + map["w"]="w" + map["k"]="m" + map["m"]="k" + map["b"]="v" + map["v"]="b" + map["d"]="h" + map["h"]="d" + map["n"]="n" + + ret = "" + for (i = n; i >= 1; --i) { + nc = map[substr(s, i, 1)] + if (nc == "") { + print "Error, letter " substr(s, i, 1) " in fasta" + exit + } + ret = ret nc + } + return ret +} + + +(and($2, 0x40)!=0){ + if (and($2, 0x10)==0) { #+ orientation + if (substr($6,1,3) in s) + print $3"\t"$4"\t"substr($10,1,16) "\t+" + } + else if (substr($6,length($6)-2) in s) #-orientation + print $3"\t"$4"\t"reverseComplement(substr($10,length($10)-15)) "\t-" +} diff --git a/software/LepAnchor/src/CleanMap.java b/software/LepAnchor/src/CleanMap.java deleted file mode 100644 index 887c28f..0000000 --- a/software/LepAnchor/src/CleanMap.java +++ /dev/null @@ -1,559 +0,0 @@ -/** - This file is part of Lep-Anchor. - - Lep-Anchor is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - Lep-Anchor is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with Lep-Anchor. If not, see . 
- - Copyright (C) 2019 Pasi Rastas, pasi.rastas@gmail.com, University of Helsinki - -*/ -//User interface and implementation of ScaffoldHMM -import java.io.BufferedReader; -import java.io.FileReader; -import java.text.DecimalFormat; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; - -public class CleanMap { - - int markerDistance = 50; - int chimericDistance = 1000; - int numChromosomes; - - int numMarkerTypes = 1; // how many different marker types, used for having different error parameters for repeats or different maps... - - double P_CHIMERA = 0.01; - //double P_ERROR = 0.01;//new double[]{0.01}; - double P_ERROR[] = new double[]{0.01}; - - int MAX_ITERATIONS = 20; - - public void setMarkerDistance(int value) - { - markerDistance = value; - } - - public void setChimericDistance(int value) - { - chimericDistance = value; - } - - public void setInitialValues(double p_c, double p_e) - { - for (int type = 0; type < numMarkerTypes; ++type) - P_ERROR[type] = p_e; - //P_ERROR = p_e; - P_CHIMERA = p_c; - } - private class CleanMarker { - public CleanMarker(long position, int chr, int density, int type){ - this.position = position; - this.chr = chr; - this.density = density; - this.type = type; - } - - public long getPosition() - { - return position; - } - public int getChr() - { - return chr; - } - public int getDensity() - { - return density; - } - public int getType() - { - return type; - } - public void setDensity(int value) - { - density = value; - } - public void setType(int value) - { - type = value; - } - - long position; - int chr; - int density; - int type; - } - - - public void cleanMap(String mapFile, int annotation) - { - - HashMap> scaffoldMap = new HashMap>(); - - HashMap numAnnotations = new HashMap(); - - numChromosomes = 0; - - int noAnnotation = 0; - - try { - BufferedReader br = new BufferedReader(new FileReader(mapFile)); - - ArrayList line = Input.loadTableRow(br, " \t"); - while (line != null) { - if 
(!line.get(0).equals("CHR") && !line.get(0).equals("CHROM")) { - String scaffold = line.get(0); - long position = InputData.myParseLong(line.get(1)); - - int chr = Integer.parseInt(line.get(2)); - if (chr > 0) { - if (!scaffoldMap.containsKey(scaffold)) - scaffoldMap.put(scaffold, new ArrayList()); - ArrayList list = scaffoldMap.get(scaffold); - - //load possible annotation... - int annot = 0; - if (annotation > 0 && line.size() >= annotation) { - annot = Integer.parseInt(line.get(annotation - 1)); - if (annot >= 0) { // no negative annotations... - if (!numAnnotations.containsKey(annot)) - numAnnotations.put(annot, 0); - numAnnotations.put(annot, numAnnotations.get(annot) + 1); - } else - ++noAnnotation; - } else - ++noAnnotation; - - list.add(new CleanMarker(position, chr - 1, 1, annot)); - // code chr as 0..(c-1), negative missing - // density = 1, for calculating marker density... - numChromosomes = Math.max(numChromosomes, chr); - } - } - line = Input.loadTableRow(br, " \t"); - } - } catch (Exception e) { - e.printStackTrace(); - System.exit(-1); - } - - - int numScaffolds = scaffoldMap.keySet().size(); -// double markerDensityScales[][] = new double[numScaffolds][]; - ArrayList> positionsAndChrs = new ArrayList>(); - ArrayList scaffolds = new ArrayList(); - for (String scaffold : scaffoldMap.keySet()) - scaffolds.add(scaffold); - - int totalPads = 0; - for (String scaffold : scaffolds) { - ArrayList positionsAndChr = scaffoldMap.get(scaffold); - positionsAndChrs.add(positionsAndChr); - int numM = positionsAndChr.size(); -// markerDensityScales[si] = new double[numM]; - for (int i = 1; i < numM; ++i) { - if (positionsAndChr.get(i - 1).getPosition() > positionsAndChr.get(i).getPosition()) { - System.err.println("physical positions must be increasing..."); - System.exit(0); - } - } - int j = 0; - int k = 1; - for (int i = 0; i < numM; ++i) { - while (positionsAndChr.get(i).getPosition() - positionsAndChr.get(j).getPosition() > markerDistance) - ++j; - while (k < 
numM && positionsAndChr.get(k).getPosition() - positionsAndChr.get(i).getPosition() <= markerDistance) - ++k; - positionsAndChr.get(i).setDensity(k - j); - //markerDensityScales[si][i] = 1.0 / (k - j); - } - - ArrayList positionsAndChr_padded = new ArrayList(); - - for (int i = 0; i < numM; ++i) { - if (i > 0) { - long prevP = positionsAndChr.get(i - 1).getPosition(); - long pos = positionsAndChr.get(i).getPosition(); - - long numPads = (pos - prevP) / chimericDistance; - if (numPads >= 2) - totalPads += numPads - 1; - for (j = 1; j < numPads; ++j) { - double p = ((double) j) / numPads; - // chr missing - // density is 1 - positionsAndChr_padded.add(new CleanMarker((long)(pos + p * (prevP - pos)), -1, 1, 0)); - } - } - positionsAndChr_padded.add(positionsAndChr.get(i)); - } - positionsAndChr.clear(); - positionsAndChr.addAll(positionsAndChr_padded); - - } - System.err.println("Number of scaffolds = " + numScaffolds); - System.err.println("Number of chromosomes = " + numChromosomes); - - System.err.println("Number of padding markers = " + totalPads); - - System.err.println("Number of markers without annotation = " + noAnnotation); - if (numAnnotations.size() > 0) { - for (int key : numAnnotations.keySet()) { - numMarkerTypes = Math.max(numMarkerTypes, key + 1); - System.err.println("Number of markers with annotation " + key + " = " + numAnnotations.get(key)); - } - } - - DecimalFormat df = new DecimalFormat("#0.000"); - - for (int iterations = 0; iterations <= MAX_ITERATIONS; ++iterations) { - initParameters(); - - double QE[][] = new double[numMarkerTypes][2]; -// double QE[] = new double[2]; - double QT[] = new double[2]; - double logL = 0.0; - double l = 1.0; - for (int scaffold=0; scaffold < numScaffolds; ++scaffold) { - ArrayList positionsAndChr = positionsAndChrs.get(scaffold); -// double markerDensityScale[] = markerDensityScales[scaffold]; - int numM = positionsAndChr.size(); - double forward[][] = new double[numM + 1][numChromosomes]; - - for (int j = 0; j 
< numChromosomes; ++j) - forward[0][j] = 1.0 / numChromosomes; - - long prevPos = 0; - double scale[] = new double[numM + 1]; - scale[0] = 1.0; - - for (int i = 0; i < numM; ++i) { - CleanMarker m = positionsAndChr.get(i); - long pos = m.getPosition(); - int chr = m.getChr(); - int density = m.getDensity(); - int type = m.getType(); - - double p1 = 1.0; - if (i > 0) - p1 = transition((int)(pos - prevPos)); - - double p2 = (1.0 - p1) / numChromosomes; - - double sum = 0.0; - for (int j = 0; j < numChromosomes; ++j) - sum += forward[i][j]; - double sump2 = p2 * sum; - - for (int j = 0; j < numChromosomes; ++j) - forward[i + 1][j] = (sump2 + p1 * forward[i][j]) * emission(chr, j, density, type); - - sum = 0; - for (int j = 0; j < numChromosomes; ++j) - sum += forward[i + 1][j]; - double isum = 1.0 / sum; - - for (int j = 0; j < numChromosomes; ++j) - forward[i + 1][j] *= isum; - scale[i + 1] = isum; - - l *= sum; - if (l < 1e-100) { - logL += Math.log10(l); - l = 1.0; - } - - prevPos = pos; - } - - //backward - double backward[][] = new double[numM + 1][numChromosomes]; - for (int j = 0; j < numChromosomes; ++j) - backward[numM][j] = 1.0; - - for (int i = numM - 1; i >= 0; --i) { - CleanMarker m = positionsAndChr.get(i); - long pos = m.getPosition(); - int chr = m.getChr(); - int density = m.getDensity(); - int type = m.getType(); - double p1 = 1.0; - if (i > 0) { - prevPos = positionsAndChr.get(i - 1).getPosition(); - p1 = transition(pos - prevPos); - } - double p2 = (1.0 - p1) / numChromosomes; - - double sum = 0.0; - for (int j = 0; j < numChromosomes; ++j) - sum += backward[i + 1][j]; - - double sump2 = sum * p2; - - for (int j = 0; j < numChromosomes; ++j) - backward[i][j] = (sump2 + p1 * backward[i + 1][j]) * scale[i + 1] * emission(chr, j, density, type); - } - //for (int j = 0; j < numChromosomes; ++j) { - // System.err.println(backward[0][j]); - //} - - //QE - for (int i = 0; i < numM; ++i) { - CleanMarker m = positionsAndChr.get(i); - int chr = m.getChr(); - 
int density = m.getDensity(); - int type = m.type; - for (int j = 0; j < numChromosomes; ++j) { - if (chr < 0) { - //QE[0] += ? - //QE[1] += ? - } else { - if (chr == j) - QE[type][0] += backward[i + 1][j] * forward[i + 1][j] / density; - else - QE[type][1] += backward[i + 1][j] * forward[i + 1][j] / density; - } - } - } - - //QT - for (int i = 0; i < numM; ++i) { - CleanMarker m = positionsAndChr.get(i); - long pos = m.getPosition(); - if (i > 0) { - int chr = m.getChr(); - int density = m.getDensity(); - int type = m.getType(); - - double sum = 0.0; - for (int j = 0; j < numChromosomes; ++j) - sum += backward[i + 1][j]; - - double p1 = transition(pos - prevPos); - double p2 = (1.0 - p1) / numChromosomes; - //if (ts != 1.0) - // System.err.println(ts); - for (int j = 0; j < numChromosomes; ++j) { - - double se = scale[i + 1] * emission(chr, j, density, type); - QT[0] += p1 * forward[i][j] * backward[i + 1][j] * se; - QT[1] += p2 * forward[i][j] * sum * se; - } - - } - prevPos = pos; - } - - if (iterations == MAX_ITERATIONS) { - // Viterbi - double viterbi[][] = new double[numM + 1][numChromosomes]; - int path[][] = new int[numM + 1][numChromosomes]; - - for (int j = 0; j < numChromosomes; ++j) - viterbi[0][j] = 1.0 / numChromosomes; - - prevPos = 0; - - for (int i = 0; i < numM; ++i) { - CleanMarker m = positionsAndChr.get(i); - long pos = m.getPosition(); - int chr = m.getChr(); - int density = m.getDensity(); - int type = m.getType(); - - double p1 = 1.0; - if (i > 0) - p1 = transition((int)(pos - prevPos)); - - double p2 = (1.0 - p1) / numChromosomes; - - int maxj = 0; - for (int j = 1; j < numChromosomes; ++j) - if (viterbi[i][j] > viterbi[i][maxj]) - maxj = j; - - double sump2 = p2 * viterbi[i][maxj]; - - for (int j = 0; j < numChromosomes; ++j) { - double sump1 = p1 * viterbi[i][j]; - if (sump2 > sump1) { - viterbi[i + 1][j] = sump2 * emission(chr, j, density, type); - path[i + 1][j] = maxj; - } - else { - viterbi[i + 1][j] = sump1 * emission(chr, j, 
density, type); - path[i + 1][j] = j; - } - } - - double max = 0.0; - for (int j = 0; j < numChromosomes; ++j) - max = Math.max(max, viterbi[i + 1][j]); - - double imax = 1.0 / max; - for (int j = 0; j < numChromosomes; ++j) - viterbi[i + 1][j] *= imax; - - prevPos = pos; - } - //backtrack path - int finalPath[] = new int[numM]; - - int maxj = 0; - for (int j = 1; j < numChromosomes; ++j) - if (viterbi[numM][j] > viterbi[numM][maxj]) - maxj = j; - - for (int i = numM; i >= 1; --i) { - finalPath[i - 1] = maxj; - maxj = path[i][maxj]; - } - - StringBuilder sb = new StringBuilder(); - for (int i = 0; i < numM; ++i) { - /*int maxJ = 0; - for (int j = 1; j < numChromosomes; ++j) { - if (backward[i + 1][j] * forward[i + 1][j] > backward[i + 1][maxJ] * forward[i + 1][maxJ]) - maxJ = j; - }*/ - int maxJ = finalPath[i]; // Viterbi path - double max2 = Double.NEGATIVE_INFINITY; - for (int j = 0; j < numChromosomes; ++j) { - if (j != maxJ) { - max2 = Math.max(max2, backward[i + 1][j] * forward[i + 1][j]); - } - } - max2 = Math.log10(backward[i + 1][maxJ] * forward[i + 1][maxJ] / max2); - CleanMarker m = positionsAndChr.get(i); - long pos = m.getPosition(); - int chr = m.getChr(); - if (chr >= 0) {// do not print padding markers - sb.append(scaffolds.get(scaffold)); - sb.append('\t'); - sb.append(pos); - sb.append('\t'); - sb.append((maxJ + 1)); - sb.append('\t'); - sb.append((chr + 1)); - sb.append('\t'); - sb.append(df.format(max2)); - sb.append('\n'); - if (sb.length() >= 10000) { - System.out.print(sb.toString()); - sb.setLength(0); - } - //System.out.println(scaffolds.get(scaffold) + "\t" + pos + "\t" + (maxJ + 1) + "\t" + (chr + 1) + "\t" + max2); - } - } - if (sb.length() > 0) - System.out.print(sb.toString()); - } - //System.err.println(scaffolds.get(scaffold) + "\t" + logScale + "\t" + QE[0] + "\t" + QE[1]); - } - logL = logL + Math.log10(l); - System.err.println("logL = " + logL); - for (int type = 0; type < numMarkerTypes; ++type) - System.err.print(P_ERROR[type] + 
"\t"); - System.err.println(P_CHIMERA); - for (int type = 0; type < numMarkerTypes; ++type) - P_ERROR[type] = QE[type][1] / (QE[type][0] + QE[type][1]); - P_CHIMERA = QT[1] / (QT[0] + QT[1]); - } - } - - private double transition(long distance) - { - return (1 - P_CHIMERA); - } - - private double emissionTable1[][] = null; - private double emissionTable2[][] = null; - - private void initParameters() - { - if (P_ERROR.length == 1 && numMarkerTypes > 1) { - double oldValue = P_ERROR[0]; - P_ERROR = new double[numMarkerTypes]; - Arrays.fill(P_ERROR, oldValue); - } - if (emissionTable1 == null) { - emissionTable1 = new double[numMarkerTypes][1000]; - emissionTable2 = new double[numMarkerTypes][1000]; - } - - for (int type = 0; type < numMarkerTypes; ++type) { - double e = P_ERROR[type] / (double) (numChromosomes - 1); // P_ERROR/0 does not matter! - for (int i = 0; i < emissionTable1[type].length; ++i) { - emissionTable1[type][i] = Math.pow(e, 1.0 / i); - emissionTable2[type][i] = Math.pow(1.0 - P_ERROR[type], 1.0 / i); - } - } - - } - - private double emission(int chr, int state, int density, int type){ - if (chr < 0) - return 1; - - assert(density >= 1); - - if (density < emissionTable1[type].length) { - if (chr != state) - return emissionTable1[type][density]; - else - return emissionTable2[type][density]; - - } - return Math.pow(emission(chr, state, 1, type), 1.0 / density); - } - - private static void usage() - { - System.err.println("Usage: java CleanMap map=contig_pos_map.txt [options]"); - System.err.println(" map=file a map file containing columns contig, position and chromosome"); - System.err.println(" markerDistance=NUM Minimum effective distance of markers [50]"); - System.err.println(" chimericDistance=NUM The default distance of markers for transition (change of chromosome) [1000]"); - System.err.println(" initialValues=NUM NUM Initial values for theta and epsilon [0.01 0.01]"); - - System.err.println(" markerAnnotation=NUM Load marker annotation from 
input map column NUM"); - } - - - public static void main(String args[]) - { - ParameterParser pp = new ParameterParser(); - - String extraParameters = ""; - for (int i = 0; i < args.length; ++i) { - extraParameters += " " + args[i]; - } - - if (args.length == 0 || !pp.init(extraParameters)) { - usage(); - System.exit(0); - } - pp.warning(new String[]{"map", "markerDistance", "chimericDistance", "initValues", "markerAnnotation"}); - - - System.out.println("#java CleanMap" + extraParameters); - - CleanMap cm = new CleanMap(); - cm.setChimericDistance(Integer.parseInt(pp.getValueAsString("chimericDistance", "1000"))); - cm.setMarkerDistance(Integer.parseInt(pp.getValueAsString("markerDistance", "50"))); - cm.setInitialValues(Double.parseDouble(pp.getValueAsString("initialValues",1,"0.01")), Double.parseDouble(pp.getValueAsString("initialValues", 2,"0.01"))); - - cm.cleanMap(pp.getValueAsString("map", null), Integer.parseInt(pp.getValueAsString("markerAnnotation", "-1"))); - } - -} diff --git a/software/LepAnchor/src/ContigInterval.java b/software/LepAnchor/src/ContigInterval.java deleted file mode 100644 index 2a84d45..0000000 --- a/software/LepAnchor/src/ContigInterval.java +++ /dev/null @@ -1,382 +0,0 @@ -/** - This file is part of Lep-Anchor. - - Lep-Anchor is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - Lep-Anchor is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with Lep-Anchor. If not, see . 
- - Copyright (C) 2019 Pasi Rastas, pasi.rastas@gmail.com, University of Helsinki - -*/ -import java.util.*; - -public class ContigInterval implements Comparable{ - private String name; - private long start; - private long end; - - private long startI[]; - private long endI[]; - - private int chr; - - private int currentOrientation = 0; - private int rank = 0; - - public ArrayList> markers = new ArrayList>(); - - public void setRank(int rank) { - this.rank = rank; - } - - public String getContig() - { - return name; - } - - public int getChromosome() - { - return chr; - } - public void setChromosome(int value) - { - chr = value; - } - - public long getMinStart() { - if (startI != null) - return startI[0]; - return start; - } - public long getMaxStart() { - if (startI != null) - return startI[1]; - return start; - } - - public long getMaxEnd() { - if (endI != null) - return endI[1]; - return end; - } - public long getMinEnd() { - if (endI != null) - return endI[0]; - return end; - } - - public long getStart() { - return start; - } - - public long[] getStartI() - { - long[] ret = new long[2]; - ret[0] = getMinStart(); - ret[1] = getMaxStart(); - return ret; - } - - public long[] getEndI() - { - if (endI == null) - return new long[]{end, end}; - - long[] ret = new long[endI.length]; - ret[0] = getMinEnd(); - ret[1] = getMaxEnd(); - return ret; - } - - - public void setStart(long value) { - start = value; - } - - public long getEnd() { - return end; - } - - public void setEnd(long value) { - end = value; - } - - public int getRank(){ - return rank; - } - public void setOrientation(int o) { - this.currentOrientation = o; - } - public int getOrientation(){ - return currentOrientation; - } - public void flipOrientation() - { - if (currentOrientation == 0) - currentOrientation = -1; - else - currentOrientation = -currentOrientation; - } - //copy ContigInterval - public ContigInterval(ContigInterval c) { - this.name = c.name; - this.start = c.start; - this.end = c.end; - - 
//TODO: consider Arrays.copyOf - this.startI = c.startI; - this.endI = c.endI; - - this.chr = c.chr; - this.currentOrientation = c.currentOrientation; - this.rank = c.rank; - - this.markers = new ArrayList>(); - for (ArrayList mm: c.markers) { - ArrayList mm_new = new ArrayList(); - this.markers.add(mm_new); - for (Marker m : mm) - mm_new.add(new Marker(m, this)); - } - } - - public ContigInterval(String name, long start, long end) - { - this.name = name; - this.start = start; - this.end = end; - } - - public ContigInterval(String name, long start[], long end[]) - { - this.name = name; - if (start[0] == 1) - this.start = 1; - else - this.start = (start[0] + start[1]) / 2; - - if (end.length > 2) - this.end = end[1]; - else - this.end = (end[0] + end[1]) / 2; - startI = start; - endI = end; - } - - public boolean inside(long p) - { - if (p >= start && p <= end) - return true; - return false; - } - @Override - public int hashCode() { - return name.hashCode() + Long.hashCode(start); - } - @Override - public boolean equals(Object o) { - if (this == o) - return true; - if (o == null) - return false; - if (this.getClass() != o.getClass()) - return false; - - ContigInterval cio = (ContigInterval) o; - return (name.equals(cio.name) && start == cio.start); - } - @Override - public int compareTo(ContigInterval o) { - int strcmp = name.compareTo(o.name); - if (strcmp != 0) - return strcmp; - - if (start < o.start) - return -1; - if (start > o.start) - return 1; - return 0; - } - @Override - public String toString() - { - return name + "\t" + start + "\t" + end; - } - public void addMap() - { - markers.add(new ArrayList()); - } - public void addMarker(Marker m, int map) - { - while (markers.size() <= map) - addMap(); - markers.get(map).add(m); - - if (chr > 0 && chr != m.getChromosome()) { - System.err.println(this); - System.err.println("Error: Multiple chromosomes in one interval in bed"); - System.exit(-1); - } - chr = m.getChromosome(); - m.ci = this; - } - public int 
getNumMarkers() - { - int ret = 0; - for (ArrayList m : markers) - ret += m.size(); - return ret; - } - - public ArrayList getMarkers(int map) { - if (markers.size() > map) { - if (currentOrientation >= 0) { - ArrayList ret = new ArrayList(); - ret.addAll(markers.get(map)); - return ret; - } - else { - ArrayList ret = new ArrayList(); - ret.addAll(markers.get(map)); - Collections.reverse(ret); - return ret; - } - } else - return new ArrayList(); - } - public ArrayList getMarkersReverse(int map) { - return getMarkers(map, (currentOrientation >= 0) ? -1 : 0); - } - - - public ArrayList getMarkers(int map, int orientation) { - if (markers.size() > map) { - if (orientation == 0) { - ArrayList ret = new ArrayList(); - ret.addAll(markers.get(map)); - return ret; - } - else { - ArrayList ret = new ArrayList(); - ret.addAll(markers.get(map)); - Collections.reverse(ret); - return ret; - } - } else - return new ArrayList(); - } - - public Marker getMarker(int map, int index) { - return markers.get(map).get(index); - } - - //calculate gap length for - // ci in orientation -> c2 with alignment [astart to aend] - public long calculateGapLength(int orientation, long astart, long aend){ - long l = 0; - if (orientation == 0) { // + orientation - if (astart > getEnd()) - l = astart - getEnd() - 1; - else if (aend < getEnd()) - l = getEnd() - aend; - } else { // - orientation - if (aend < getStart()) - l = getStart() - aend - 1; - else if (astart > getStart()) - l = astart - getStart(); - } - return l; - } - - //ci is the haplotype, calculate how much of it does not align... 
- public long calculateGapLengthHaplotype(long astart, long aend){ - - long g1 = Math.max(0, astart - getStart()); // getStart() - long g2 = Math.max(0, getEnd() - aend); // getEnd() - - return g1 + g2; - } - - //split ContigInterval (this) to start..position and position+1,...end - public ContigInterval[] splitContigInterval(long position[]) - { - if (position[0] < start || position[1] >= end) // no need to split - return new ContigInterval[]{this}; - - long pos = (position[0] + position[1]) / 2; - - ContigInterval c1 = new ContigInterval(name, new long[]{getMinStart(), getMaxStart()}, position); - ContigInterval c2 = new ContigInterval(name, new long[]{position[0] + 1, position[1] + 1}, new long[]{getMinEnd(), getMaxEnd()}); - int numMaps = markers.size(); - - for (int map = 0; map < numMaps; ++map) { - ArrayList list = markers.get(map); - for (Marker m : list) { - if (m.getPosition() > pos) { - Marker mNew = new Marker(m); - mNew.ci = c2; - c2.addMarker(mNew, map); - } else { - Marker mNew = new Marker(m); - mNew.ci = c1; - c1.addMarker(mNew, map); - } - } - } - if (c1.getNumMarkers() == 0 || c2.getNumMarkers() == 0) - return new ContigInterval[]{this}; - - return new ContigInterval[]{c1, c2}; - } - public ContigInterval[] splitContigInterval(long position1[], long position2[], long position3[]) - { - if (position1[0] > position2[0]) { - long tmp[] = position1; - position1 = position2; - position2 = tmp; - } - if (position1[0] > position3[0]) { - long tmp[] = position1; - position1 = position3; - position3 = tmp; - } - ContigInterval sp1[] = splitContigInterval(position2, position3); - ContigInterval sp2[] = sp1[0].splitContigInterval(position1); - ArrayList list = new ArrayList(); - for (int i = 0; i < sp2.length; ++i) - list.add(sp2[i]); - for (int i = 1; i < sp1.length; ++i) - list.add(sp1[i]); - ContigInterval s[] = new ContigInterval[list.size()]; - return list.toArray(s); - - } - - public ContigInterval[] splitContigInterval(long position1[], long 
position2[]) - { - if (position1[0] > position2[0]) { - long tmp[] = position1; - position1 = position2; - position2 = tmp; - } - ContigInterval sp1[] = splitContigInterval(position1); - if (sp1.length == 2) { - ContigInterval sp2[] = sp1[1].splitContigInterval(position2); - if (sp2.length == 2) - return new ContigInterval[]{sp1[0], sp2[0], sp2[1]}; - else - return sp1; - } else - return splitContigInterval(position2); - } - -} diff --git a/software/LepAnchor/src/CoverageAnalyser.java b/software/LepAnchor/src/CoverageAnalyser.java deleted file mode 100644 index 40368a0..0000000 --- a/software/LepAnchor/src/CoverageAnalyser.java +++ /dev/null @@ -1,420 +0,0 @@ -import java.io.BufferedReader; -import java.io.FileReader; -import java.io.InputStreamReader; -import java.util.ArrayList; -import java.util.HashMap; - -public class CoverageAnalyser { - - // private HashMap> posDepth = new HashMap>(); - private int column = 2; - private int numCategories = 2; - - private double normalDepth = 100.0; - - private int maxDepth = 1000; - private double zetaK = 1.5; - - private int MAX_ITERATIONS = 100; - - private long[] count = null; - - private double[] freq = new double[3]; - - boolean findNormal = false; - - // private ArrayList variance = new ArrayList(); - - /* - * public void loadDepth(String fn) { try { BufferedReader br = null; if - * (fn.equals("-")) br = new BufferedReader(new - * InputStreamReader(System.in)); else br = new BufferedReader(new - * FileReader(fn)); - * - * do { ArrayList row = Input.loadTableRow(br, "[\t ]"); if (row == - * null) break; if (row.size() <= column) { - * System.err.println("Warning: skipping " + row); continue; } - * - * String contig = row.get(0); ArrayList list = posDepth.get(contig); - * if (list == null) { list = new ArrayList(); posDepth.put(contig, - * list); } //only store last position with same depth... 
int lsize = - * list.size(); long cov = 0; if (lsize == 0) { cov = - * Long.parseLong(row.get(1)) list.add(cov); // genomic position - * list.add(Long.parseLong(row.get(column))); // depth } else { long lastcov - * = list.get(lsize - 1); long cov = Long.parseLong(row.get(column)); if - * (lastcov == cov) { list.set(lsize - 2, Long.parseLong(row.get(1))); // - * update position } else { list.add(Long.parseLong(row.get(1))); // genomic - * position list.add(cov); // depth } } } while (true); br.close(); } catch - * (Exception e) { e.printStackTrace(); System.err.println("Error in file " - * + fn); System.exit(-1); } } - */ - - public void loadDepth(String fn) { - count = new long[maxDepth + 2]; - try { - BufferedReader br = null; - if (fn.equals("-")) - br = new BufferedReader(new InputStreamReader(System.in)); - else - br = new BufferedReader(new FileReader(fn)); - - do { - ArrayList row = Input.loadTableRow(br, "[\t ]"); - if (row == null) - break; - if (row.size() <= column) { - System.err.println("Warning: skipping " + row); - continue; - } - int depth = Integer.parseInt(row.get(column)); - if (depth > maxDepth) - depth = maxDepth + 1; - ++count[depth]; - } while (true); - br.close(); - } catch (Exception e) { - e.printStackTrace(); - System.err.println("Error in file " + fn); - System.exit(-1); - } - } - - public static final double isqrtPI = 1.0 / Math.sqrt(Math.PI); - public static final double isqrt2 = 1.0 / Math.sqrt(2.0); - - private class Gaussian { - private double mean; - private double stdev; - private double istdev; - private double istdev2sp; - private double logIstdev2sp; - - // Misc.i2sqrtPI = 1.0 / Math.sqrt(2.0 * Math.PI); - - public Gaussian(double mean, double stdev) { - istdev = isqrt2 / stdev; - istdev2sp = istdev * isqrtPI; - logIstdev2sp = Math.log(istdev2sp); - this.mean = mean; - this.stdev = stdev; - } - public double getMean() - { - return mean; - } - - public double getStdev() - { - return stdev; - } - - public double density(double x) 
{ - double tmp = (x - mean) * istdev; - return istdev2sp * Math.exp(-tmp * tmp); - } - - public double logDensity(double x) { - double tmp = (x - mean) * istdev; - return logIstdev2sp - tmp * tmp; - } - } - - //distribution zeta - 1 - private class Zeta { - //double k; - int max; - ArrayList d = new ArrayList(); - //ArrayList ld = new ArrayList(); - public Zeta(double k, int max) { - //this.k = k; - this.max = max; - double sum = 0.0; - for (int i = 0; i <= max; ++i) { - double p = Math.pow(i + 1, -k); - sum += p; - d.add(p); - } - double iSum = 1.0 / sum; - for (int i = 0; i <= max; ++i) - d.set(i, d.get(i) * iSum); - } - public double density(int x) { - if (x > max) - return 0.0; - return d.get(x); - } - //public double logDensity(int x) { - // return ld.get(x); - //} - } - - private double likelihood(boolean verbose) { - Gaussian g[] = new Gaussian[numCategories]; - - /*double rate1 = count[0] / (double) count[1]; - double rate2 = count[0] / (double) count[2]; - double rate3 = count[0] / (double) count[3]; - double k1 = Math.log(rate1) / Math.log(2.0); - double k2 = Math.log(rate2) / Math.log(3.0); - double k3 = Math.log(rate3) / Math.log(4.0); - k = Math.min(k1, k2, k3); - if (k > 1.0) - ; - else k = 1.0;*/ - //double k = 1.2; - - Zeta z = new Zeta(zetaK, maxDepth); - if (verbose) - System.err.println("Zeta parameter " + zetaK); - - { // Initialisation - for (int j = 0; j < numCategories; ++j) { - double mean = (normalDepth * (j + 1)) / (double) numCategories; - g[j] = new Gaussian(mean, Math.sqrt(normalDepth) * Math.exp(2.0 * Math.random() - 1.0)); - } - double sum = 0.0; - for (int j = 0; j <= numCategories; ++j) { - double r = Math.random(); - freq[j] = r; - sum += r; - } - double iSum = 1.0 / sum; - for (int j = 0; j <= numCategories; ++j) - freq[j] *= iSum; - } - - double d[] = new double[numCategories + 1]; - - double likelihood = 0.0; - for (int iteration = 0; iteration < MAX_ITERATIONS; ++iteration) { - double qFreq[] = new double[numCategories + 1]; - 
double qVar[] = new double[numCategories]; - double qSum[] = new double[numCategories]; - - //System.err.println(freq[0] + "\t" + freq[1] + "\t" + freq[2]); - likelihood = 0.0; - for (int i = 0; i <= maxDepth; ++i) { - - double sum = 0.0; - for (int j = 0; j <= numCategories; ++j) { - double p = 0.0; - if (j < numCategories) - p = g[j].density(i) * freq[j]; - else - p = z.density(i) * freq[j]; - d[j] = p; - sum += p; - } - //System.err.println("Sum " + sum); - double iSum = 1.0 / sum; - - for (int j = 0; j <= numCategories; ++j) { - double q = d[j] * iSum * count[i]; - qFreq[j] += q; - if (j < numCategories) { - double diff = (i - g[j].mean); - qVar[j] += q * diff * diff; - qSum[j] += q; - } - } - likelihood += Math.log(sum) * count[i]; - } - double sum = 0.0; - for (int j = 0; j <= numCategories; ++j) - sum += qFreq[j]; - //for (int j = 0; j <= numCategories; ++j) - // if (qFreq[j] < 0.1 * sum) - // qFreq[j] = 0.1 * sum; - //sum = 0.0; - //for (int j = 0; j <= numCategories; ++j) - // sum += qFreq[j]; - - double iSum = 1.0 / sum; - for (int j = 0; j <= numCategories; ++j) - freq[j] = qFreq[j] * iSum; - - //variance in gaussians must be non-decreasing - for (int j = numCategories - 1; j >= 1; --j) { - if (qVar[j] * qSum[j - 1] < qVar[j - 1] * qSum[j]) { - double qs = qSum[j] + qSum[j - 1]; - double qv = qVar[j] + qVar[j - 1]; - qSum[j] = qs; - qSum[j - 1] = qs; - qVar[j] = qv; - qVar[j - 1] = qv; - } - } - for (int j = 0; j < numCategories; ++j) { - double sdev = Math.sqrt(qVar[j] / qSum[j]); - if (sdev < 1.0) - sdev = 1.0; - //System.err.println(freq[j] + "\t" + sdev); - g[j] = new Gaussian((normalDepth * (j + 1)) / (double) numCategories, sdev); - //System.err.println("mean=" + g[j].mean); - } - //System.err.println(freq[numCategories]); - // System.err.println(Math.sqrt(iSum * qVar[0]) + "\t" + - // Math.sqrt(iSum * qVar[1]) + "\t" + Math.sqrt(iSum * qVar[2])); - if (verbose) - System.err.println("logL=" + likelihood); - } - long totalCount = 0; - for (int i = 
0; i <= maxDepth; ++i) { - totalCount += count[i]; - } - if (verbose) - System.err.println(freq[0] + "\t" + freq[1] + "\t" + g[0].getStdev() + "\t" + g[1].getStdev()); - - for (int i = 0; i <= maxDepth; ++i) { - - double tmp[] = new double[numCategories + 2]; - for (int j = 0; j < numCategories; ++j) - tmp[j] = g[j].density(i); - - //split zeta distribution to 0 (<=normalDepht/numCategories) and repeat (>=normalDepht) - double zf = z.density(i); - if (i <= g[0].mean) - tmp[numCategories] = zf; - else if (i >= g[numCategories - 1].mean) - tmp[numCategories + 1] = zf; - else { - double t = (i - g[0].mean) / (g[numCategories - 1].mean - g[0].mean); - tmp[numCategories] = zf * (1.0 - t); - tmp[numCategories + 1] = zf * t; - } - - double sum = 0.0; - double max = 0.0; - for (int j = 0; j <= numCategories + 1; ++j) { - double p = tmp[j] * freq[Math.min(j, numCategories)]; - sum += p; - max = Math.max(max, tmp[j]); - } - if (verbose) { - System.out.print(i + "\t" + count[i] + "\t" + (totalCount * sum) ); - - double iMax = 1.0 / max; - System.out.print("\t" + tmp[numCategories] * iMax); - for (int j = 0; j <= numCategories + 1; ++j) - if (j != numCategories) - System.out.print("\t" + tmp[j] * iMax); - - System.out.println(); - } - } - - - //System.err.println(num[0] + "\t" + num[1] + "\t" + num[2] + "\t" + num[3]); - return likelihood; - } - - - public void analyse() { - if (findNormal) { - // Find normalDepth and other parameters... 
- System.err.println("Estimating normal depth parameter..."); - double maxL = Double.NEGATIVE_INFINITY; - int maxNd = 1; - for (int nd = 1; nd + nd <= maxDepth; ++nd) { - normalDepth = nd; - double ll = likelihood(false); - if (ll > maxL) { - System.err.println(nd + "\t" + ll + "\t*"); - maxL = ll; - maxNd = nd; - } else - System.err.println(nd + "\t" + ll); - } - normalDepth = maxNd; - System.err.println("Value " + maxNd + " chosen for normal depth"); - } - likelihood(true); - } - - public void setNumCategories(int value) { - numCategories = value; - freq = new double[numCategories + 1]; - - - for (int i = 0; i <= numCategories; ++i) - freq[i] = Math.random(); - } - - public void setNormalDepth(double value) { - normalDepth = value; - maxDepth = (int)(3 * value + 0.99); - setNumCategories(numCategories); - } - - public void setColumn(int value) { - column = value; - } - - public void setMaxDepth(int value) { - maxDepth = value; - findNormal = true; - } - - public void setZeta(double value) { - zetaK = value; - } - - public static void usageInfo() { -// pp.warning(new String[] { "depth", "numCategories", "normalDepth", -// "column", "sample", "maxDepth", "zeta"}); - System.err.println("Usage: java CoverageAnalyser depth=depth.txt [parameters] "); - System.err.println(" depth=file output from samtools -a depth"); - System.err.println(" numCategories=NUM the ploidy [2]"); - System.err.println(" normalDepth=NUM the normal depth [not set, ML value searched]"); - System.err.println(" maxDepthDepth=NUM search normalDepth between 1..NUM, applies only when normalDepth is not set [1000]"); - System.err.println(" column=NUM Take column NUM from depth file [3]"); - System.err.println(" zeta=NUM Zeta distribution parameter [1.5]"); - - System.exit(0); - } - - public static void main(String args[]) { - ParameterParser pp = new ParameterParser(); - - String extraParameters = ""; - for (int i = 0; i < args.length; ++i) { - extraParameters += " " + args[i]; - } - if (args.length 
== 0 || !pp.init(extraParameters)) { - usageInfo(); - } - System.out.println("#java CoverageAnalyser " + extraParameters); - - pp.warning(new String[] { "depth", "numCategories", "normalDepth", - "column", "maxDepth", "zeta"}); - - String depthFile = pp.getValueAsString("depth", null); - if (depthFile == null) - usageInfo(); - - CoverageAnalyser ca = new CoverageAnalyser(); - int column = Integer.parseInt(pp.getValueAsString("column", "3")); - ca.setColumn(column - 1); - ca.setNumCategories(Integer.parseInt(pp.getValueAsString( - "numCategories", "2"))); - - String normalD = pp.getValueAsString("normalDepth", null); - if (normalD != null) { - ca.setNormalDepth(Double.parseDouble(normalD)); - //ca.setMaxDepth(); - String md = pp.getValueAsString("maxDepth", null); - if (md != null) - ca.setMaxDepth(Integer.parseInt(md)); - } - else - ca.setMaxDepth(Integer.parseInt(pp.getValueAsString("maxDepth", - "1000"))); - - ca.setZeta(Double.parseDouble(pp.getValueAsString("zeta", "1.5"))); - - ca.loadDepth(depthFile); - ca.analyse(); - - } -} diff --git a/software/LepAnchor/src/CoverageHMM.java b/software/LepAnchor/src/CoverageHMM.java deleted file mode 100644 index a03e111..0000000 --- a/software/LepAnchor/src/CoverageHMM.java +++ /dev/null @@ -1,571 +0,0 @@ -/** - This file is part of Lep-Anchor. - - Lep-Anchor is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - Lep-Anchor is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with Lep-Anchor. If not, see . 
- - Copyright (C) 2019 Pasi Rastas, pasi.rastas@gmail.com, University of Helsinki - -*/ -//User interface and implementation of ScaffoldHMM -import java.io.BufferedReader; -import java.io.FileReader; -import java.io.InputStreamReader; -import java.text.DecimalFormat; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashMap; - -public class CoverageHMM { - - private int column = 2; - private int numMixture = 0; - private int MAX_ITERATIONS = 1000; - private int numScaffolds; - - private double minProb = 0.001; - private double scale = 0.001; - private double sample = 0.1; - - private ArrayList scaffolds = new ArrayList(); - - public void setColumn(int value) - { - column = value; - } - - public void setMinProb(double value) - { - minProb = value; - } - - public void setSample(double value) - { - sample = value; - } - - public void setScale(double value) - { - scale = value; - } - - private ArrayList mixture = new ArrayList(); - private HashMap> scaffoldMap = new HashMap>(); - - public void loadMixture(String mixtureFile) - { - numMixture = 0; - try { - BufferedReader br = new BufferedReader(new FileReader(mixtureFile)); - - ArrayList line = Input.loadTableRow(br, " \t"); - while (line != null) { - if (numMixture == 0) - numMixture = line.size() - 3; - else - if (numMixture != line.size() - 3) { - System.err.println("Error: Different number of columns in the input"); - System.exit(-1); - } - int value = Integer.parseInt(line.get(0)); - if (mixture.size() != value) { - System.err.println("Error: Incorrect mixture file"); - System.exit(-1); - } - double m[] = new double[numMixture]; - for (int i = 0; i < numMixture; ++i) - m[i] = Double.parseDouble(line.get(i + 3)); - mixture.add(m); - line = Input.loadTableRow(br, " \t"); - } - } catch (Exception e) { - e.printStackTrace(); - System.exit(-1); - } - } - - - public void loadDepth(String depthFile) - { - try { - BufferedReader br = null; - if (depthFile.equals("-")) 
- br = new BufferedReader(new InputStreamReader(System.in)); - else - br = new BufferedReader(new FileReader(depthFile)); - - ArrayList line = Input.loadTableRow(br, " \t"); - double pick = 0.0; - while (line != null) { - if (line.size() <= column) { - System.err.println("Warning: skipping " + line); - continue; - } - pick += sample; - if (pick >= 1.0) { - pick -= 1.0; - String scaffold = line.get(0); - long position = Long.parseLong(line.get(1)); - if (position > 0xffffffffl) { - System.err.println("Error: Too long scaffold"); - System.exit(-1); - } - int cov = Integer.parseInt(line.get(column)); - - if (!scaffoldMap.containsKey(scaffold)) - scaffoldMap.put(scaffold, new ArrayList()); - ArrayList list = scaffoldMap.get(scaffold); - list.add((int) position); - list.add(cov); - } - line = Input.loadTableRow(br, " \t"); - } - } catch (Exception e) { - e.printStackTrace(); - System.exit(-1); - } - - numScaffolds = scaffoldMap.keySet().size(); -// double markerDensityScales[][] = new double[numScaffolds][]; - scaffolds = new ArrayList(); - for (String scaffold : scaffoldMap.keySet()) - scaffolds.add(scaffold); - - for (String scaffold : scaffolds) { - ArrayList positionsAndCov = scaffoldMap.get(scaffold); - int numM = positionsAndCov.size(); - for (int i = 2; i < numM; i+=2) { - if (positionsAndCov.get(i - 2) > positionsAndCov.get(i)) { - System.err.println("physical positions must be increasing..."); - System.exit(0); - } - } - } - System.err.println("Number of scaffolds = " + numScaffolds); - } - - public void hmm() - { - DecimalFormat df = new DecimalFormat("#0.000"); - - for (double m[] : mixture) - for (int i = 0; i < numMixture; ++i) { - double v = m[i]; - if (v < minProb) - m[i] = minProb; - m[i] = Math.pow(m[i], scale); - } - - //Init transition - double t[][] = new double[numMixture][numMixture]; - - for (double tmp[] : t) - Arrays.fill(tmp, 1.0 / numMixture); - - for (int i = 0; i < numMixture; ++i) { - double sum = 0.0; - for (int j = 0; j < numMixture; ++j) 
{ - double r = (i == j) ? 1.0 : 0.1 * scale * Math.random(); - sum += r; - t[i][j] = r; - } - double iSum = 1.0 / sum; - for (int j = 0; j < numMixture; ++j) - t[i][j] *= iSum; - } - - boolean converge = false; - double oldLogL = Double.NEGATIVE_INFINITY; - for (int iterations = 0; iterations <= MAX_ITERATIONS; ++iterations) { - int maxCount = mixture.size() - 1; - double QT[][] = new double[numMixture][numMixture]; - double logL = 0.0; - double l = 1.0; - for (int scaffold=0; scaffold < numScaffolds; ++scaffold) { - String scaffoldS = scaffolds.get(scaffold); - ArrayList positionsAndCov = scaffoldMap.get(scaffoldS); - - int numM = positionsAndCov.size() / 2; - double forward[][] = new double[numM + 1][numMixture]; - double scale[] = new double[numM + 1]; - scale[0] = 1.0; - - double iM = 1.0 / numMixture; - for (int i = 0; i < numMixture; ++i) - forward[0][i] = iM; - - for (int p = 0; p < numM; ++p) { - int cov = positionsAndCov.get(p + p + 1); - if (cov >= maxCount) - cov = maxCount; - - double m[] = mixture.get(cov); - - for (int j = 0; j < numMixture; ++j) { - double fp1j = 0.0; - double f_[] = forward[p]; - for (int i = 0; i < numMixture; ++i) - fp1j += f_[i] * t[i][j]; - forward[p + 1][j] = fp1j * m[j]; - } - - - double sum = 0.0; - for (int j = 0; j < numMixture; ++j) - sum += forward[p + 1][j]; - double isum = 1.0 / sum; - - for (int j = 0; j < numMixture; ++j) - forward[p + 1][j] *= isum; - scale[p + 1] = isum; - - l *= sum; - if (l < 1e-100) { - logL += Math.log10(l); - l = 1.0; - } - } - double backward[][] = new double[numM + 1][numMixture]; - for (int j = 0; j < numMixture; ++j) - backward[numM][j] = 1.0; - - for (int p = numM - 1; p >= 0; --p) { - int cov = positionsAndCov.get(p + p + 1); - if (cov >= maxCount) - cov = maxCount; - - double m[] = mixture.get(cov); - - - for (int j = 0; j < numMixture; ++j) { - double tmp = backward[p + 1][j] * m[j] * scale[p + 1]; - for (int i = 0; i < numMixture; ++i) - backward[p][i] += t[i][j] * tmp; - } - } - 
//double sum = 0.0; - //for (int j = 0; j < numMixture; ++j) - // sum += backward[1][j] * forward[1][j]; - //System.err.println(sum); - - if (iterations == MAX_ITERATIONS || converge) { - for (int p = 0; p < numM; ++p) { - long pos = positionsAndCov.get(p + p) & 0xffffffffl; - int cov = positionsAndCov.get(p + p + 1); - double max = 0.0; - double max2 = 0.0; - int maxj = 0; - for (int j = 0; j < numMixture; ++j) { - double pj = forward[p][j] * backward[p][j]; - if (pj > max) { - max2 = max; - max = pj; - maxj = j; - } else if (pj > max2) - max2 = pj; - } - StringBuilder sb = new StringBuilder(); - sb.append(scaffoldS); - sb.append('\t'); - sb.append(pos); - sb.append('\t'); - sb.append(cov); - sb.append('\t'); - sb.append(maxj); - sb.append('\t'); - sb.append(df.format(Math.log10(max) - Math.log10(max2))); - System.out.println(sb.toString()); - } - - } else - for (int p = 1; p < numM; ++p) { - int cov = positionsAndCov.get(p + p + 1); - if (cov >= maxCount) - cov = maxCount; - double m[] = mixture.get(cov); - for (int j = 0; j < numMixture; ++j) { - double tmp = scale[p + 1] * backward[p + 1][j] * m[j]; - for (int i = 0; i < numMixture; ++i) - QT[i][j] += forward[p][i] * t[i][j] * tmp; - } - } - - } - if (iterations == MAX_ITERATIONS || converge) - break; - - for (int i = 0; i < numMixture; ++i) { - double sum = 0.0; - for (double q : QT[i]) - sum += q; - double iSum = 1.0 / sum; - for (int j = 0; j < numMixture; ++j) - t[i][j] = QT[i][j] * iSum; - } - - //for (int i = 0; i < numMixture; ++i) { - // for (int j = 0; j < numMixture; ++j) - // System.err.println(i + "\t" + j + "\t" + t[i][j]); - //} - - logL = logL + Math.log10(l); - System.err.println("logL = " + logL); - if (oldLogL > logL - 0.01) - converge = true; - - oldLogL = logL; - } - - -/* - //backward - double backward[][] = new double[numM + 1][numChromosomes]; - for (int j = 0; j < numChromosomes; ++j) - backward[numM][j] = 1.0; - - for (int i = numM - 1; i >= 0; --i) { - long pos = positionsAndChr.get(3 
* i); - int chr = positionsAndChr.get(3 * i + 1).intValue(); - int density = positionsAndChr.get(3 * i + 2).intValue(); - double p1 = 1.0; - if (i > 0) { - prevPos = positionsAndChr.get(3 * i - 3); - p1 = transition(pos - prevPos); - } - double p2 = (1.0 - p1) / numChromosomes; - - double sum = 0.0; - for (int j = 0; j < numChromosomes; ++j) - sum += backward[i + 1][j]; - - double sump2 = sum * p2; - - for (int j = 0; j < numChromosomes; ++j) - backward[i][j] = (sump2 + p1 * backward[i + 1][j]) * scale[i + 1] * emission(chr, j, density); - } - //for (int j = 0; j < numChromosomes; ++j) { - // System.err.println(backward[0][j]); - //} - - //QE - for (int i = 0; i < numM; ++i) { - //int pos = positionsAndChr.get(3 * i); - int chr = positionsAndChr.get(3 * i + 1).intValue(); - int density = positionsAndChr.get(3 * i + 2).intValue(); - for (int j = 0; j < numChromosomes; ++j) { - if (chr < 0) { - //QE[0] += ? - //QE[1] += ? - } else { - if (chr == j) - QE[0] += backward[i + 1][j] * forward[i + 1][j] / density; - else - QE[1] += backward[i + 1][j] * forward[i + 1][j] / density; - } - } - } - - //QT - for (int i = 0; i < numM; ++i) { - long pos = positionsAndChr.get(3 * i); - if (i > 0) { - int chr = positionsAndChr.get(3 * i + 1).intValue(); - int density = positionsAndChr.get(3 * i + 2).intValue(); - - double sum = 0.0; - for (int j = 0; j < numChromosomes; ++j) - sum += backward[i + 1][j]; - - double p1 = transition(pos - prevPos); - double p2 = (1.0 - p1) / numChromosomes; - //if (ts != 1.0) - // System.err.println(ts); - for (int j = 0; j < numChromosomes; ++j) { - - double se = scale[i + 1] * emission(chr, j, density); - QT[0] += p1 * forward[i][j] * backward[i + 1][j] * se; - QT[1] += p2 * forward[i][j] * sum * se; - } - - } - prevPos = pos; - } - - if (iterations == MAX_ITERATIONS) { - // Viterbi - double viterbi[][] = new double[numM + 1][numChromosomes]; - int path[][] = new int[numM + 1][numChromosomes]; - - for (int j = 0; j < numChromosomes; ++j) - 
viterbi[0][j] = 1.0 / numChromosomes; - - prevPos = 0; - - for (int i = 0; i < numM; ++i) { - long pos = positionsAndChr.get(3 * i); - int chr = positionsAndChr.get(3 * i + 1).intValue(); - int density = positionsAndChr.get(3 * i + 2).intValue(); - - double p1 = 1.0; - if (i > 0) - p1 = transition((int)(pos - prevPos)); - - double p2 = (1.0 - p1) / numChromosomes; - - int maxj = 0; - for (int j = 1; j < numChromosomes; ++j) - if (viterbi[i][j] > viterbi[i][maxj]) - maxj = j; - - double sump2 = p2 * viterbi[i][maxj]; - - for (int j = 0; j < numChromosomes; ++j) { - double sump1 = p1 * viterbi[i][j]; - if (sump2 > sump1) { - viterbi[i + 1][j] = sump2 * emission(chr, j, density); - path[i + 1][j] = maxj; - } - else { - viterbi[i + 1][j] = sump1 * emission(chr, j, density);; - path[i + 1][j] = j; - } - } - - double max = 0.0; - for (int j = 0; j < numChromosomes; ++j) - max = Math.max(max, viterbi[i + 1][j]); - - double imax = 1.0 / max; - for (int j = 0; j < numChromosomes; ++j) - viterbi[i + 1][j] *= imax; - - prevPos = pos; - } - //backtrack path - int finalPath[] = new int[numM]; - - int maxj = 0; - for (int j = 1; j < numChromosomes; ++j) - if (viterbi[numM][j] > viterbi[numM][maxj]) - maxj = j; - - for (int i = numM; i >= 1; --i) { - finalPath[i - 1] = maxj; - maxj = path[i][maxj]; - } - - for (int i = 0; i < numM; ++i) { - int maxJ = finalPath[i]; // Viterbi path - double max2 = Double.NEGATIVE_INFINITY; - for (int j = 0; j < numChromosomes; ++j) { - if (j != maxJ) { - max2 = Math.max(max2, backward[i + 1][j] * forward[i + 1][j]); - } - } - max2 = Math.log10(backward[i + 1][maxJ] * forward[i + 1][maxJ]) - Math.log10(max2); - long pos = positionsAndChr.get(3 * i); - int chr = positionsAndChr.get(3 * i + 1).intValue(); - if (chr >= 0) // do not print padding markers - System.out.println(scaffolds.get(scaffold) + "\t" + pos + "\t" + (maxJ + 1) + "\t" + (chr + 1) + "\t" + max2); - } - } - - //System.err.println(scaffolds.get(scaffold) + "\t" + logScale + "\t" + 
QE[0] + "\t" + QE[1]); - } - logL = logL + Math.log10(l); - System.err.println("logL = " + logL); - System.err.println(P_ERROR + "\t" + P_CHIMERA); - P_ERROR = QE[1] / (QE[0] + QE[1]); - P_CHIMERA = QT[1] / (QT[0] + QT[1]);*/ - } - - private double emissionTable1[] = new double[1000]; - private double emissionTable2[] = new double[1000]; - - private void initParameters() - { - //double e = P_ERROR / (double) (numChromosomes - 1); // P_ERROR/0 does not matter! - for (int i = 0; i < emissionTable1.length; ++i) { - //emissionTable1[i] = Math.pow(e, 1.0 / i); - //emissionTable2[i] = Math.pow(1.0 - P_ERROR, 1.0 / i); - } - - } - - private double emission(int chr, int state, int density){ - if (chr < 0) - return 1; - - assert(density >= 1); - - if (density < emissionTable1.length) { - if (chr != state) - return emissionTable1[density]; - else - return emissionTable2[density]; - - } - return Math.pow(emission(chr, state, 1), 1.0 / density); - } - - private static void usageInfo() - { - System.err.println("Usage: java CoverageHMM depth=depth.txt mixture=mixture.txt [options]"); - System.err.println("options:"); - System.err.println(" depth=file output from samtools -a depth"); - System.err.println(" mixture=file output from CoverageAnalyser"); - System.err.println(" column=NUM Take column NUM from depth file [3]"); - System.err.println(" scale=NUM One position counts this much [0.01]"); - System.err.println(" sample=NUM Take only NUM fraction of data [0.1]"); - System.err.println(" (default scale*sample=0.001, 1 per 1kb)"); - System.err.println(" minProb=NUM One position can be this sure [0.001] (1:1000)"); - } - - - public static void main(String args[]) - { - ParameterParser pp = new ParameterParser(); - - String extraParameters = ""; - for (int i = 0; i < args.length; ++i) { - extraParameters += " " + args[i]; - } - - if (args.length == 0 || !pp.init(extraParameters)) { - usageInfo(); - System.exit(0); - } - pp.warning(new String[]{"depth", "mixture", "column", "scale", 
"minProb", "sample"}); - - - System.out.println("#java CoverageHMM" + extraParameters); - - int column = Integer.parseInt(pp.getValueAsString("column", "3")); - CoverageHMM ch = new CoverageHMM(); - ch.setColumn(column - 1); - String df = pp.getValueAsString("depth", null); - String mf = pp.getValueAsString("mixture", null); - if (df == null || mf == null) { - usageInfo(); - System.exit(0); - } - ch.setScale(Double.parseDouble(pp.getValueAsString("scale", "0.001"))); - ch.setMinProb(Double.parseDouble(pp.getValueAsString("minProb", "0.001"))); - ch.setSample(Double.parseDouble(pp.getValueAsString("sample", "0.1"))); - - ch.loadMixture(mf); - ch.loadDepth(df); - ch.hmm(); - } -} diff --git a/software/LepAnchor/src/FindContigErrors.java b/software/LepAnchor/src/FindContigErrors.java deleted file mode 100644 index ad1ebdf..0000000 --- a/software/LepAnchor/src/FindContigErrors.java +++ /dev/null @@ -1,557 +0,0 @@ -/** - This file is part of Lep-Anchor. - - Lep-Anchor is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - Lep-Anchor is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with Lep-Anchor. If not, see . - - Copyright (C) 2019 Pasi Rastas, pasi.rastas@gmail.com, University of Helsinki - -*/ - -import java.io.BufferedReader; - -import java.io.FileReader; -import java.io.InputStreamReader; -import java.util.*; - -//Calculate score over positions of each contig (paf + ?) -//...and to different position... - -public class FindContigErrors { - //calculate coverage for read mapping intervals... 
- //could be used to find (about) exact cut positions - - //prox, use borderScore - //map, - - private void addPositions(String contig, HashMap> positionHash, long start, long end, long weight) - { - ArrayList list = positionHash.get(contig); - if (list == null) { - list = new ArrayList(); - positionHash.put(contig, list); - } - list.add(new long[]{start, end, weight}); - } - - ArrayList contigs = new ArrayList(); - ArrayList contigLengths = new ArrayList(); - - HashMap contigHash = new HashMap(); - - private void addContig(String name, long length){ - if (!contigHash.containsKey(name)) { - contigHash.put(name, contigs.size()); - contigs.add(name); - contigLengths.add(length); - } - } - - //TODO: is this needed? Could we use cov instead? - private void mergeCovs(String contig, ArrayList cov, ArrayList cov2) { - System.out.println(contig); - int covi = 0; - int cov2i = 0; - //merge in cov and cov2 - - long pos = 0; - long cp = 0; - long cp2 = 0; - while (covi < cov.size() || cov2i < cov2.size()) { - if (covi >= cov.size() ) { - pos = cov2.get(cov2i); - cp2 = cov2.get(cov2i + 1); - cov2i+=2; - } - else if (cov2i >= cov2.size() ) { - pos = cov.get(covi); - cp = cov.get(covi + 1); - covi+=2; - } - else { - if (cov.get(covi) <= cov2.get(cov2i)) { - pos = cov.get(covi); - cp = cov.get(covi + 1); - covi+=2; - } - else { - pos = cov2.get(cov2i); - cp2 = cov2.get(cov2i + 1); - cov2i+=2; - } - } - System.out.println(pos + "\t" + cp + "\t" + cp2); - } - } - - HashMap> pafPositions = new HashMap>(); - HashMap> pafPositions2 = new HashMap>(); - public void processErrorsFromPaf(String fn) - { - System.err.println("loading paf..."); - ArrayList> rows = new ArrayList>(); - try { - BufferedReader br = null; - if (fn.equals("-")) - br = new BufferedReader(new InputStreamReader(System.in)); - else - br = new BufferedReader(new FileReader(fn)); - - do { - ArrayList row = Input.loadTableRow(br, "\t "); - if (row == null) - break; - if (rows.size() == 0 || 
rows.get(0).get(0).equals(row.get(0))) - rows.add(row); - else { - processPAF(rows); - rows.clear(); - rows.add(row); - } - - } while (true); - processPAF(rows); - - for (String contig : contigs) { - ArrayList list = pafPositions.get(contig); - ArrayList cov = null; - if (list != null) - cov = Misc.cov(list); - else { - cov = new ArrayList(); - cov.add(0L); - cov.add(0L); - } - - ArrayList list2 = pafPositions2.get(contig); - ArrayList cov2 = null; - if (list2 != null) - cov2 = Misc.cov(list2); - else { - cov2 = new ArrayList(); - cov2.add(0L); - cov2.add(0L); - } - - mergeCovs(contig, cov, cov2); - //for (int i = 0; i < cov.size(); i+=2) { - // System.out.println(cov.get(i) + "\t" + cov.get(i + 1)); - //} - } - - } - catch (Exception e) { - e.printStackTrace(); - System.err.println("Error in file " + fn); - } - } - - private void processPAF(ArrayList> rows) { - for (ArrayList row : rows) { - String contig = row.get(5); - long length = Long.parseLong(row.get(6)); - addContig(contig, length); - long start = Long.parseLong(row.get(7)) + 1; - long end = Long.parseLong(row.get(8)); - addPositions(contig, pafPositions, start, end, 1); // calculate coverage... - } - //take two longest alignments to two different contigs... - long maxL = 0; - ArrayList maxRow = null; - for (ArrayList row : rows) { - long aLen = Long.parseLong(row.get(8)) - Long.parseLong(row.get(7)); - if (aLen > maxL) { - maxL = aLen; - maxRow = row; - } - } - long maxL2 = 0; - ArrayList maxRow2 = null; - for (ArrayList row : rows) - if (!row.get(5).equals(maxRow.get(5))) { - long aLen = Long.parseLong(row.get(8)) - Long.parseLong(row.get(7)); - if (aLen > maxL2) { - maxL2 = aLen; - maxRow2 = row; - } - } - if (maxRow2 != null) { // two aligments found to two contigs - long rstart1 = Long.parseLong(maxRow.get(2)); - long rstart2 = Long.parseLong(maxRow2.get(2)); - if (rstart1 > rstart2) { // make sure maxRow starts first... 
- long tmp = rstart2; - rstart2 = rstart1; - rstart1 = tmp; - - tmp = maxL; - maxL = maxL2; - maxL2 = tmp; - - ArrayList tmp2 = maxRow; - maxRow = maxRow2; - maxRow2 = tmp2; - } - long rend1 = Long.parseLong(maxRow.get(3)); - //long rend2 = Long.parseLong(maxRow2.get(3)); - - long distance = rstart2 - rend1 + 2; // - - long minL = Math.min(maxL, maxL2); - - if (distance < -minL || distance > 2 * minL) // no overlap distance 2 x the length of the shorter alignment - return; - if (distance < 0) - distance = 0; -// TODO: figure out what happens if distance < 0, maybe distance can be -2 minL - - String o1 = maxRow.get(4); - if ("+".equals(o1)) { - long end1 = Long.parseLong(maxRow.get(8)); - long length1 = Long.parseLong(maxRow.get(6)); - addPositions(maxRow.get(5), pafPositions2, end1 + 1, Math.min(length1 + 1, end1 + 1 + distance), 1); // calculate coverage... - } else { - long start1 = Long.parseLong(maxRow.get(7)); - addPositions(maxRow.get(5), pafPositions2, Math.max(0, start1 - distance), start1, 1); // calculate coverage... - } - String o2 = maxRow2.get(4); - if ("+".equals(o2)) { - long start2 = Long.parseLong(maxRow2.get(7)); - addPositions(maxRow2.get(5), pafPositions2, Math.max(0, start2 - distance), start2, 1); // calculate coverage... - } else { - long end2 = Long.parseLong(maxRow2.get(8)); - long length2 = Long.parseLong(maxRow2.get(6)); - addPositions(maxRow2.get(5), pafPositions2, end2 + 1, Math.min(length2 + 1, end2 + 1 + distance), 1); // calculate coverage... 
- } - - } - } - - HashMap> chainPositions = new HashMap>(); - public void processErrorsFromChain(String fn) - { - System.err.println("loading chain..."); - try { - BufferedReader br = null; - if (fn.equals("-")) - br = new BufferedReader(new InputStreamReader(System.in)); - else - br = new BufferedReader(new FileReader(fn)); - - do { - ArrayList row = Input.loadTableRow(br, "\t "); - if (row == null) - break; - if (row.get(4).equals("+")) { - String contig1 = row.get(2); - long start1 = Long.parseLong(row.get(5)) + 1; - long end1 = Long.parseLong(row.get(6)); - long aScore = Long.parseLong(row.get(1)); - - long length1 = Long.parseLong(row.get(3)); - - addContig(contig1, length1); - addPositions(contig1, chainPositions, start1, end1, aScore / length1); - - String contig2 = row.get(7); - long length2 = Long.parseLong(row.get(8)); - addContig(contig2, length2); - long start2 = Long.parseLong(row.get(10)); - long end2 = Long.parseLong(row.get(11)); - if (row.get(9).equals("+")) - addPositions(contig2, chainPositions, start2 + 1, end2, aScore / length2); - else { - addPositions(contig2, chainPositions, length2 - end2 + 1, length2 - start2, aScore / length2); - } - } - } while (true); - - } - catch (Exception e) { - e.printStackTrace(); - System.err.println("Error in file " + fn); - } - } - -// 1 (n starts) -// 2,3 (n/2) -// 4,5,6,7 (n/4) -// 8,9,10,11,12,13,14,15 (n/8) -// 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 (n/16) -// ... 
- - private int getPosition(Marker m, boolean plusOrientation) { - if (plusOrientation) - return m.pPlus; - else - return m.pMinus; - } - - HashMap> proximityPositions = new HashMap>(); - HashMap> proximityPositions2 = new HashMap>(); - public void processErrorsFromProximity(String fn, int bin, int maxD, double scale) - { - System.err.println("loading proximity data..."); - long maxDistance = bin * maxD; - try { - BufferedReader br = null; - if (fn.equals("-")) - br = new BufferedReader(new InputStreamReader(System.in)); - else - br = new BufferedReader(new FileReader(fn)); - - do { - ArrayList row = Input.loadTableRow(br, "\t "); - if (row == null) - break; - if (row.size() < 5) { - System.err.println("Warning: skipping " + row); - continue; - } - - String contig1 = row.get(0); - String contig2 = row.get(2); - long pos1 = Long.parseLong(row.get(1)); - long pos2 = Long.parseLong(row.get(3)); - long weight = (long) Double.parseDouble(row.get(4)); - if (contig1.equals(contig2)) { - if (pos1 < pos2 && pos2 - pos1 < maxDistance) { - addPositions(contig1, proximityPositions, pos1, pos2, weight); - } - } else { - addPositions(contig1, proximityPositions2, pos1, pos1 + bin, weight); - addPositions(contig2, proximityPositions2, pos2, pos2 + bin, weight); - } - } while (true); - - for (String key : proximityPositions.keySet()) - if (!contigHash.containsKey(key)) - addContig(key, 0); - - for (String key : proximityPositions2.keySet()) - if (!contigHash.containsKey(key)) - addContig(key, 0); - - - for (String contig : contigs) { - ArrayList list = proximityPositions.get(contig); - ArrayList cov = null; - if (list != null) - cov = Misc.cov(list); - else { - cov = new ArrayList(); - cov.add(0L); - cov.add(0L); - } - - ArrayList list2 = proximityPositions2.get(contig); - ArrayList cov2 = null; - if (list2 != null) - cov2 = Misc.cov(list2); - else { - cov2 = new ArrayList(); - cov2.add(0L); - cov2.add(0L); - } - - mergeCovs(contig, cov, cov2); - //for (int i = 0; i < cov.size(); 
i+=2) { - // System.out.println(cov.get(i) + "\t" + cov.get(i + 1)); - //} - } - - } - catch (Exception e) { - e.printStackTrace(); - System.err.println("Error in file " + fn); - } - - - } - - public void findErrors() { - int numContigs = contigs.size(); - for (int ci = 0; ci < numContigs; ++ci) { - String contig = contigs.get(ci); - long length = contigLengths.get(ci); - ArrayList list = pafPositions.get(contig); - } - - /*ArrayList t = new ArrayList(); - t.add(-2l); - t.add(-2l); - t.add(-1l); - t.add(-1l); - t.add(-1l); - t.add(10l); - t.add(10l); - t.add(10l); - t.add(11l); - t.add(12l); - cov(t);*/ - //paf - //for (String contig: contigs) { - // ArrayList list = pafCoverage.get(contig); - // if (list != null) { - // Collections.sort(list); - //System.err.println(list); - //cov(list); - //ArrayList new_list = new ArrayList(); - // } - //} - //chain - - //prox - - //map - - } - /* - if (processErrors) { - for (int c1 = 0; c1 < numContigs; ++c1) { - String contig1 = contigs.get(c1); - ArrayList list = pafCoverage.get(contig1); - if (list == null) { - list = new ArrayList(); - pafCoverage.put(contig1, list); - } - ArrayList> rows1 = alignmentHash.get(contig1); - - for (ArrayList row1 : rows1) { - long start1 = Long.parseLong(row1.get(7)) + 1; - long end1 = Long.parseLong(row1.get(8)); - list.add(-start1); // add -start - list.add(end1); // and end position - } - - //TODO: Try joining adjacent aligment blocks if the gap <= min{maxBridge, length1, length2} - /*ArrayList comp = new ArrayList(); // sort row by start position... 
- /for (ArrayList row1 : rows1) - comp.add(Long.parseLong(row1.get(7))); // start - Misc.ArrayIndexComparator comparator = new Misc.ArrayIndexComparator(comp); - Integer[] indexes = comparator.createIndexArray(); - Arrays.sort(indexes, comparator); - - ArrayList list = pafCoverage.get(contig1); - if (list == null) { - list = new ArrayList(); - pafCoverage.put(contig1, list); - } - - for (int orientation = 0; orientation < 2; ++orientation) { - long minStart = Long.MAX_VALUE; - long maxEnd = 0; - - long minRStart = Long.MAX_VALUE; - long maxREnd = 0; - for (int i : indexes) { - ArrayList row1 = rows1.get(i); - - if ((orientation != 0) ^ ("+".equals(row1.get(4)))) { // take only + or - orientation... - long start1 = Long.parseLong(row1.get(7)) + 1; - long end1 = Long.parseLong(row1.get(8)); - - long oldStart = minStart; - long oldEnd = maxEnd; - - minStart = Math.min(minStart, start1); - maxEnd = Math.max(maxEnd, end1); - - long rstart1 = Long.parseLong(row1.get(2)); - long rend1 = Long.parseLong(row1.get(3)); - - minRStart = Math.min(minRStart, rstart1); - maxREnd = Math.max(maxREnd, rend1); - - long RL = maxREnd - minRStart; // read length - long L = maxEnd - minStart; // contig length - - if (oldEnd == 0 || RL <= 1.5 * L && L <= 1.5 * RL) { - - } else { - list.add(-oldStart); // add -start - list.add(oldEnd); // and end position - } - - } - } - }*/ - - - private static void usageInfo() - { - System.err.println("usage: java FindContigErrors [options]"); - System.err.println(" chain=file chain file "); - - System.err.println(" paf=file load alignment file in paf (minimap2) format"); - - System.err.println(" proximity=file NUM1 NUM2 NUM3 load proximity data, NUM1=bin size [10000]"); - System.err.println(" NUM2=max distance in bins[25], NUM3=scale score [1.0]"); - } - - private static void test() - { - ArrayList list = new ArrayList(); - list.add(new long[]{0, 9, 1}); - list.add(new long[]{0, 10, 1}); - list.add(new long[]{0, 11, 1}); - list.add(new long[]{5, 10, 1}); 
- FindContigErrors fce = new FindContigErrors(); - System.err.println(Misc.cov(list)); - } - - public static void main(String[] args) - { - //FindContigErrors.test(); - - if (args.length == 0) { - usageInfo(); - System.exit(0); - } - String extraParameters = ""; - for (int i = 0; i < args.length; ++i) { - extraParameters += " " + args[i]; - } - ParameterParser pp = new ParameterParser(); - if (!pp.init(extraParameters)) { - usageInfo(); - System.exit(0); - } - pp.warning(new String[]{"paf", "chain", "proximity"}); - - FindContigErrors fe = new FindContigErrors(); - System.out.println("#java FindContigErrors" + extraParameters); - - //for (int i = 0; i < pp.getNumberOfValues("map"); ++i) { - // fe.processErrorsFromMap(pp.getValueAsString("map", i, null), pp.getValueAsString("noIntervals", "0").equals("1")); - //} - - String chain = pp.getValueAsString("chain", null); - if (chain != null) { - fe.processErrorsFromChain(chain); - } - - String paf = pp.getValueAsString("paf", null); - if (paf != null) - fe.processErrorsFromPaf(paf); - - String prox = pp.getValueAsString("proximity", 0, null); - if (prox != null) { - int bin = Integer.parseInt(pp.getValueAsString("proximity", 1, "10000")); - int maxD = Integer.parseInt(pp.getValueAsString("proximity", 2, "25")); - double scale = Double.parseDouble(pp.getValueAsString("proximity", 3, "1.0")); - fe.processErrorsFromProximity(prox, bin, maxD, scale); - } - fe.findErrors(); - } - -} diff --git a/software/LepAnchor/src/Input.java b/software/LepAnchor/src/Input.java deleted file mode 100644 index 0d57214..0000000 --- a/software/LepAnchor/src/Input.java +++ /dev/null @@ -1,209 +0,0 @@ -/** - This file is part of Lep-Anchor. - - Lep-Anchor is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. 
- - Lep-Anchor is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with Lep-Anchor. If not, see . - - Copyright (C) 2019 Pasi Rastas, pasi.rastas@gmail.com, University of Helsinki - -*/ -/** - This file is part of Lep-MAP. - - Lep-MAP is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - Lep-MAP is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with Lep-MAP. If not, see . 
- - Copyright (C) 2013 Pasi Rastas, pasi.rastas@gmail.com, University of Helsinki -*/ - -//Reads files to ArrayLists of Strings -import java.io.*; -import java.util.*; - -public class Input { - private Input() { }; - - private static boolean keepComments = false; - private static StringBuilder comments = new StringBuilder(); - public static String getComments() - { - return comments.toString(); - } - public static void setKeepComments(boolean value) - { - keepComments = value; - } - - public static String loadRow(BufferedReader br) throws Exception - { - String s = ""; - do { - s = br.readLine(); - //System.err.println(s); - if (s != null) { - int index = s.indexOf('#'); - if (index >= 0) { - s = s.substring(0, index); - if (keepComments) { - comments.append(s.substring(index + 1)); - comments.append('\n'); - } - } - } - } while ( s != null && (s.length() == 0) ); - return s; - } - - public static ArrayList splitRow(String s, String delim) - { - StringTokenizer st = new StringTokenizer(s, delim, false); - ArrayList row = new ArrayList(); - while (st.hasMoreTokens()) { - String nt = st.nextToken(); - row.add(nt); - } - return row; - } - - - public static ArrayList loadTableRow(BufferedReader br, String delim) throws Exception - { - String s = loadRow(br); - if (s == null) - return null; - - /*String sa[] = s.split(delim); - ArrayList row = new ArrayList(); - for (String tmp : sa) - row.add(tmp);*/ - return splitRow(s, delim); - } - - public static ArrayList> loadTable(BufferedReader br, String delim) throws Exception { - ArrayList> ret = new ArrayList>(); - //ArrayList comments = new ArrayList(); - while (true) { - ArrayList row = loadTableRow(br, delim); - if (row == null) - break; - else - ret.add(row); - } - return ret; - } - - public static ArrayList> loadTable(BufferedReader br, String delim, String returnDelim) throws Exception { - ArrayList> ret = new ArrayList>(); - - //ArrayList comments = new ArrayList(); - while (true) { - StringTokenizer st = null; - 
String s = null; - do { - s = loadRow(br); - if (s != null) - st = new StringTokenizer(s, delim, true); - } while ( s != null && !st.hasMoreTokens()); - if (s == null) - break; - ArrayList row = new ArrayList(); - while (st.hasMoreTokens()) { - String nt = st.nextToken(); - if (delim.indexOf(nt) >= 0) { - if (returnDelim.indexOf(nt) >= 0) - row.add(nt); - } else - row.add(nt); - } - ret.add(row); - } - return ret; - } - - public static ArrayList> loadTable(String filename, String delim, String returnDelim) { - try { - BufferedReader br = new BufferedReader(new FileReader(filename)); - ArrayList> t = loadTable(br, delim, returnDelim); - br.close(); - return t; - - } catch (Exception e) { - System.err.println(e); - return null; - } - } - - - public static ArrayList> loadTable(String filename, String delim) { - try { - BufferedReader br = new BufferedReader(new FileReader(filename)); - ArrayList> t = loadTable(br, delim); - br.close(); - return t; - - } catch (Exception e) { - System.err.println(e); - return null; - } - } - - public static int[][] loadIntTable(String filename, String delim) { - - ArrayList ret = new ArrayList(); - try { - BufferedReader br = new BufferedReader(new FileReader(filename)); - while (true) { - ArrayList t = loadTableRow(br, delim); - if (t == null) - break; - else { - int it[] = new int[t.size()]; - int i = 0; - for (String s : t) - it[i++] = Integer.parseInt(s); - ret.add(it); - } - - } - br.close(); - return ret.toArray(new int[ret.size()][]); - - } catch (Exception e) { - System.err.print(e); - return null; - } - } - - public static void main(String args[]) - { - ArrayList> test = Input.loadTable("pedigree.txt", " \t"); - for (ArrayList row : test) { - for (String item : row) - System.err.print(item + " "); - System.err.println(); - } - - - } - -} diff --git a/software/LepAnchor/src/InputData.java b/software/LepAnchor/src/InputData.java deleted file mode 100644 index 0e257fc..0000000 --- a/software/LepAnchor/src/InputData.java +++ 
/dev/null @@ -1,452 +0,0 @@ -/** - This file is part of Lep-Anchor. - - Lep-Anchor is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - Lep-Anchor is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with Lep-Anchor. If not, see . - - Copyright (C) 2019 Pasi Rastas, pasi.rastas@gmail.com, University of Helsinki - -*/ -import java.io.BufferedReader; -import java.io.FileReader; -import java.io.InputStreamReader; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; - - - -public class InputData { - - public static long myParseLong(String s) - { - if (s.charAt(s.length() - 1) == '*') - return Long.parseLong(s.substring(0, s.length() - 1)); - else - return Long.parseLong(s); - - } - private static long[] myParseLongInterval(String s) - { - long ret[] = null; - int pos = s.indexOf('-'); - if (pos < 0) { - long p = myParseLong(s); - return new long[]{p, p}; - } - if (s.charAt(s.length() - 1) == '*') { - ret = new long[3]; - ret[1] = Long.parseLong(s.substring(pos + 1, s.length() - 1)); - } - else { - ret = new long[2]; - ret[1] = Long.parseLong(s.substring(pos + 1)); - } - ret[0] = Long.parseLong(s.substring(0, pos)); - return ret; - } - - public static HashMap>> loadRaw(String fn, HashMap> haplotypeHash) - { - HashMap>> ret = new HashMap>>(); - try { - - int numMarkers = 0; - - BufferedReader br = null; - if (fn.equals("-")) - br = new BufferedReader(new InputStreamReader(System.in)); - else - br = new BufferedReader(new FileReader(fn)); - - do { - String row = Input.loadRow(br); - if (row == null) - break; - 
ArrayList srow = Input.splitRow(row, "[\t ]"); - if (srow.size() >= 3) { - String contig = srow.get(0); - boolean added = false; - if (haplotypeHash != null && haplotypeHash.containsKey(contig)) { - long pos = myParseLong(srow.get(1)); - for (ContigInterval ci : haplotypeHash.get(contig)) { - if (ci.inside(pos)) { - ArrayList> list = ret.get(ci); - if (list == null) { - list = new ArrayList>(); - ret.put(ci, list); - } - list.add(srow); - ++numMarkers; - added = true; - break; - } - } - } - if (!added) - System.out.println(row); - } else - System.err.println("Warning: skipping " + row); - } while (true); - - System.err.println("Liftover for " + numMarkers + " markers over " + ret.size() + " regions"); - return ret; - } catch (Exception e) { - e.printStackTrace(); - System.err.println("Error in file " + fn); - return null; - } - } - - public static ArrayList loadMap(String fn, boolean nochromosome, boolean nointervals, boolean compressMap) - { - ArrayList pos = new ArrayList(); - ArrayList chr = new ArrayList(); - - ArrayList ret = new ArrayList(); - try { - BufferedReader br = null; - if (fn.equals("-")) - br = new BufferedReader(new InputStreamReader(System.in)); - else - br = new BufferedReader(new FileReader(fn)); - - do { - ArrayList row = Input.loadTableRow(br, "[\t ]"); - if (row == null) - break; - if (row.size() > 0) { - if (nochromosome) - row.add(2, "1"); - - if (row.size() >= 4 && (nointervals || (row.size() & 1) == 1) || row.size() == 4) { - int intervals[] = new int[Math.max(2, row.size() - 3)]; - if (row.size() == 4 || nointervals) { - if (nointervals) { - intervals = new int[2]; - double p = Double.parseDouble(row.get(3)); - if (row.size() >= 5) - p += Double.parseDouble(row.get(4)); - pos.add(p); - chr.add(Integer.parseInt(row.get(2))); - } - else - try { - intervals[0] = intervals[1] = Integer.parseInt(row.get(3)); - } catch (NumberFormatException e) { - System.err.println("Error: non-integer value for interval (check parameter noIntervals=1)" + 
row); - System.exit(-1); - } - } - else - for (int i = 3; i < row.size(); i+=2) { - try { - intervals[i-3] = Integer.parseInt(row.get(i)); - intervals[i-2] = Integer.parseInt(row.get(i + 1)); - } catch (NumberFormatException e) { - System.err.println("Error: non-integer value for interval (check parameter noIntervals=1) " + row); - System.exit(-1); - } - } - ret.add(new Marker(row.get(0), myParseLong(row.get(1)), Integer.parseInt(row.get(2)), intervals)); - } else - System.err.println("Warning: skipping " + row); - } - } while (true); - br.close(); - if (nointervals) { - //put markers in each chr to separate list... and sort... - HashMap> chrMarkers = new HashMap>(); // store chromosome info - HashMap> indexMarkers = new HashMap>(); // store original index in ret... - - for (int mi = 0; mi < chr.size(); ++mi) { - int c = chr.get(mi); - ArrayList list = chrMarkers.get(c); - ArrayList list2 = indexMarkers.get(c); - if (list == null) { - list = new ArrayList(); - chrMarkers.put(c, list); - list2 = new ArrayList(); - indexMarkers.put(c, list2); - } - list.add(pos.get(mi)); - list2.add(mi); - } - for (int c : chrMarkers.keySet()) { - ArrayList list = chrMarkers.get(c); - ArrayList list2 = indexMarkers.get(c); - Misc.ArrayIndexComparator comparator = new Misc.ArrayIndexComparator(list); - Integer[] indexes = comparator.createIndexArray(); - java.util.Arrays.sort(indexes, comparator); - - double prev = Double.NEGATIVE_INFINITY; - int ip = -1; - for (int i : indexes) { - double p = list.get(i); - if (p > prev) - ++ip; - Marker m = ret.get(list2.get(i)); - m.intervals[0] = ip; - m.intervals[1] = ip; - prev = p; - } - - } - } else if (compressMap){ // no noIntervals && compress map positions - - // compressing map speeds up computation... 
- HashMap> chrMarkers = new HashMap>(); - - for (Marker m : ret) { - int c = m.getChromosome(); - ArrayList list = chrMarkers.get(c); - if (list == null) { - list = new ArrayList(); - chrMarkers.put(c, list); - } - list.add(m); - } - for (ArrayList markers: chrMarkers.values()) { - ArrayList positions = new ArrayList(); - for (Marker m : markers) { - int[] interval = m.getIntervals(); - for (int i : interval) - positions.add(i); - } - Collections.sort(positions); - HashMap compressHash = new HashMap(); - int numPositions = 0; - for (int i : positions) { - if (!compressHash.containsKey(i)) - compressHash.put(i, numPositions++); - } - for (Marker m : markers) { - int[] interval = m.getIntervals(); - for (int ii = 0; ii < interval.length; ++ii) - interval[ii] = compressHash.get(interval[ii]); - } - } - } - return ret; - } catch (Exception e) { - e.printStackTrace(); - System.err.println("Error in file " + fn); - return null; - } - } - public static ArrayList loadBedAndComments(String fn, StringBuilder comments) - { - Input.setKeepComments(true); - ArrayList ret = loadBed(fn, -1); - Input.setKeepComments(false); - comments.append(Input.getComments()); - return ret; - } - - public static ArrayList loadBed(String fn) - { - return loadBed(fn, -1); - } - public static ArrayList loadBed(String fn, int chr) - { - ArrayList ret = new ArrayList(); - try { - BufferedReader br = null; - if (fn.equals("-")) - br = new BufferedReader(new InputStreamReader(System.in)); - else - br = new BufferedReader(new FileReader(fn)); - - do { - ArrayList row = Input.loadTableRow(br, "[\t ]"); - if (row == null) - break; - if (row.size() >= 3) { - String start = row.get(1); - String end = row.get(2); - ContigInterval ci = null; - if (start.indexOf('-') >= 0 || end.indexOf('-') >= 0) - ci = new ContigInterval(row.get(0), myParseLongInterval(start), myParseLongInterval(end)); - else - ci = new ContigInterval(row.get(0), myParseLong(start), myParseLong(end)); - if (row.size() >= 4 && 
(row.get(3).equals("-") || row.get(3).equals("--") || row.get(3).equals("---"))) - ci.flipOrientation(); - - if (row.size() >= 5) { - int bChr = Integer.parseInt(row.get(4)); - ci.setChromosome(bChr); - } - if (chr < 0 || ci.getChromosome() == chr) - ret.add(ci); - } - else - System.err.println("Warning: skipping " + row); - } while (true); - - int prevC = 0; - for (ContigInterval ci : ret) { - if (prevC != 0 && ci.getChromosome() != prevC) { - System.err.println("Error: multiple cromosomes in the bed"); - System.err.println("Try providing parameter chromosome or split bed into chromosomes"); - System.exit(-1); - } - } - br.close(); - return ret; - } catch (Exception e) { - e.printStackTrace(); - System.err.println("Error in file " + fn); - return null; - } - } - - public static ArrayList loadLa(String fn) - { - ArrayList ret = new ArrayList(); - try { - BufferedReader br = null; - if (fn.equals("-")) - br = new BufferedReader(new InputStreamReader(System.in)); - else - br = new BufferedReader(new FileReader(fn)); - - do { - ArrayList row = Input.loadTableRow(br, "[\t ]"); - if (row == null) - break; - if (row.size() >= 3) { - String start = row.get(1); - String end = row.get(2); - if (row.size() >= 8) { - start = row.get(6); - int sis = start.indexOf('/'); - if (sis >= 0) - start = start.substring(0, sis); - end = row.get(7); - int sie = end.indexOf('/'); - if (sie >= 0) - end = end.substring(0, sie); - } - ContigInterval ci = null; - if (start.indexOf('-') >= 0 || end.indexOf('-') >= 0) - ci = new ContigInterval(row.get(0), myParseLongInterval(start), myParseLongInterval(end)); - else - ci = new ContigInterval(row.get(0), myParseLong(start), myParseLong(end)); - if (row.size() >= 4 && (row.get(3).equals("-") || row.get(3).equals("--") || row.get(3).equals("---"))) - ci.flipOrientation(); - - if (row.size() >= 5) { - int bChr = Integer.parseInt(row.get(4)); - ci.setChromosome(bChr); - } - ret.add(ci); - } - else - System.err.println("Warning: skipping " + row); - 
} while (true); - - int prevC = 0; - for (ContigInterval ci : ret) { - if (prevC != 0 && ci.getChromosome() != prevC) { - System.err.println("Error: multiple cromosomes in the bed"); - System.err.println("Try providing parameter chromosome or split bed into chromosomes"); - System.exit(-1); - } - } - br.close(); - return ret; - } catch (Exception e) { - e.printStackTrace(); - System.err.println("Error in file " + fn); - return null; - } - } - - - public static ArrayList loadHaplotypes(String fn) - { - ArrayList ret = new ArrayList(); - try { - BufferedReader br = null; - if (fn.equals("-")) - br = new BufferedReader(new InputStreamReader(System.in)); - else - br = new BufferedReader(new FileReader(fn)); - - do { - ArrayList row = Input.loadTableRow(br, "[\t ]"); - if (row == null) - break; - if (row.size() >= 4) { - ContigInterval ci = new ContigInterval(row.get(1), myParseLong(row.get(2)), myParseLong(row.get(3))); - ret.add(ci); - } - else - System.err.println("Warning: skipping haplotype " + row); - } while (true); - br.close(); - return ret; - } catch (Exception e) { - e.printStackTrace(); - System.err.println("Error in file " + fn); - return null; - } - } - - - public ArrayList loadLGMap(String fn) - { - ArrayList ret = new ArrayList(); - try { - BufferedReader br = null; - if (fn.equals("-")) - br = new BufferedReader(new InputStreamReader(System.in)); - else - br = new BufferedReader(new FileReader(fn)); - String line = null; - do { - line = br.readLine(); - if (line == null) - break; - - int ic = line.indexOf('#'); - if (ic >= 0) - line = line.substring(0, ic); - - if (line.length() > 0) { - String[] row = line.split("\t"); - if (row.length > 2 || row.length == 3 || (row.length & 1) == 0) { - int intervals[] = new int[Math.max(2, row.length - 2)]; - if (row.length == 3) - intervals[0] = intervals[1] = Integer.parseInt(row[2]); - else - for (int i = 2; i < row.length; i+=2) { - intervals[i-2] = Integer.parseInt(row[i]); - intervals[i-1] = 
Integer.parseInt(row[i + 1]); - } - //ret.add(new Marker(row[0], myParseInt(row[1]), intervals)); - } else - System.err.println("Warning: skipping " + line); - } - } while (true); - br.close(); - return ret; - } catch (Exception e) { - e.printStackTrace(); - System.err.println("Error in file " + fn); - return null; - } - } - - -} diff --git a/software/LepAnchor/src/LiftoverHaplotypes.java b/software/LepAnchor/src/LiftoverHaplotypes.java deleted file mode 100644 index 7edd03c..0000000 --- a/software/LepAnchor/src/LiftoverHaplotypes.java +++ /dev/null @@ -1,44 +0,0 @@ - -public class LiftoverHaplotypes { - private static void usageInfo() - { - System.err.println("usage: java LiftoverHaplotypes haplotypes=haplo.bed map=mapX.txt chain=all.chain >mapX_liftover.txt"); - System.err.println(" haplotypes=FILE try liftover for markers in removed haplotypes listed in FILE (from findFullHaplotypes or from PlaceAndOrientContigs)"); - System.err.println(" map=FILE2 any file with contig and pos in the first columns, typically the map file"); - System.err.println(" chain=FILE3 the chain file"); - } - - public static void main(String[] args) - { - - if (args.length == 0) { - usageInfo(); - System.exit(0); - } - String extraParameters = ""; - for (int i = 0; i < args.length; ++i) { - extraParameters += " " + args[i]; - } - ParameterParser pp = new ParameterParser(); - if (!pp.init(extraParameters)) { - usageInfo(); - System.exit(0); - } - pp.warning(new String[]{"map", "chain", "haplotypes"}); - - String haplotypes = pp.getValueAsString("haplotypes", null); - String chain = pp.getValueAsString("chain", null); - String map = pp.getValueAsString("map", null); - if (map == null || chain == null || haplotypes == null){ - usageInfo(); - System.exit(0); - } - - - System.out.println("#java LiftoverHaplotypes" + extraParameters); - - PlaceAndOrientContigs poc = new PlaceAndOrientContigs(); - poc.liftover(haplotypes, chain, map); - } - -} diff --git a/software/LepAnchor/src/Map2Bed.java 
b/software/LepAnchor/src/Map2Bed.java deleted file mode 100644 index 6759df9..0000000 --- a/software/LepAnchor/src/Map2Bed.java +++ /dev/null @@ -1,334 +0,0 @@ -import java.io.BufferedReader; -import java.io.FileReader; -import java.util.ArrayList; -import java.util.HashMap; - -/** - This file is part of Lep-Anchor. - - Lep-Anchor is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - Lep-Anchor is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with Lep-Anchor. If not, see . - - Copyright (C) 2019 Pasi Rastas, pasi.rastas@gmail.com, University of Helsinki - -*/ -public class Map2Bed { - private int markerSupport = 2; - private int splitLength = 10000; - private double minQuality = 2; - - public static void usage() - { - System.err.println("Usage: java Map2Bed map=contig_pos_map.txt contigLengths=contig.sizes [options] >map_clean.bed"); - System.err.println(" map=file a map file containing columns contig, position and chromosome "); - System.err.println(" in sorted order (typically from CleanMap)"); - - System.err.println(" contigLength=file a file containing columns contig name and its length"); - - System.err.println(" markerSupport=NUM at least NUM markers are needed for splitting a contig [2]"); - System.err.println(" minSplitLength=NUM do not split shorter regions than NUM [10000]"); - System.err.println(" minQuality=NUM only consider markers with quality >= NUM [2]"); - System.err.println(" (column 5 from CleanMap)"); - } - private void makeBed(String mapFile, String lengthFile) - { - - HashMap scaffoldLength = new HashMap(); - try 
{ - BufferedReader br = new BufferedReader(new FileReader(lengthFile)); - - ArrayList line = Input.loadTableRow(br, " \t"); - while (line != null) { - String scaffold = line.get(0); - long length = Long.parseLong(line.get(1)); - scaffoldLength.put(scaffold, length); - line = Input.loadTableRow(br, " \t"); - } - } catch (Exception e) { - System.err.println("Error loading contig lengths"); - e.printStackTrace(); - System.exit(-1); - } - - int numChromosomes = 0; - String prevScaffold = ""; - ArrayList list = new ArrayList(); - - HashMap scaffoldMap = new HashMap(); - try { - BufferedReader br = new BufferedReader(new FileReader(mapFile)); - - ArrayList line = Input.loadTableRow(br, " \t"); - while (line != null) { - if (!line.get(0).equals("CHR") && !line.get(0).equals("CHROM")) { - String scaffold = line.get(0); - long position = Long.parseLong(line.get(1)); - int chr = Integer.parseInt(line.get(2)); - double quality = minQuality; - if (line.size() >= 5) - quality = Double.parseDouble(line.get(4)); - if (chr > 0 && quality >= minQuality) { - if (scaffoldMap.containsKey(scaffold) && scaffoldMap.get(scaffold) > position) { - System.err.println("Error: map file must be sorted"); - System.exit(-1); - } - if (!scaffoldLength.containsKey(scaffold)) { - System.err.println("Error: contig " + scaffold + " does not have length in " + lengthFile); - System.exit(-1); - } - - if (!prevScaffold.equals("") && !prevScaffold.equals(scaffold)) { - processScaffold(prevScaffold, scaffoldLength.get(prevScaffold), list); - list.clear(); - } - scaffoldMap.put(scaffold, position); - prevScaffold = scaffold; - - list.add(position); - list.add((long)chr); - list.add((long)(100 * quality)); - numChromosomes = Math.max(numChromosomes, chr); - } - } - line = Input.loadTableRow(br, " \t"); - } - processScaffold(prevScaffold, scaffoldLength.get(prevScaffold), list); - } catch (Exception e) { - System.err.println("Error loading map file"); - e.printStackTrace(); - System.exit(-1); - } - } - 
private class Interval{ - int startIndex; - int endIndex; - long quality; - long position; - long startPosition; - long chr; - int markers; - public Interval(int startIndex, long position, long quality, long chr){ - this.startIndex = startIndex; - this.endIndex = startIndex; - this.quality = quality; - this.chr = chr; - this.position = position; - startPosition = position; - markers = 1; - } - public void addMarker(int index, long position, long quality) { - endIndex = index; - this.quality += quality; - this.position = position; - ++markers; - } - - public void addMarkerLeft(int index, long position, long quality) { - startIndex = index; - this.quality += quality; - this.startPosition = position; - ++markers; - } - - public void addInterval(Interval i) { - endIndex = i.endIndex; - quality += i.quality; - position = i.position; - markers += i.markers; - } - public int getMarkers() { - return markers; - } - public long getLength() { - return position - startPosition + 1; - } - public long getQuality() { - return quality; - } - public long getChr() { - return chr; - } - public int getStartIndex() { - return startIndex; - } - public int getEndIndex() { - return endIndex; - } - public long getStartPosition() { - return startPosition; - } - - public long getPosition() { - return position; - } - public String toString() - { - //return chr + "\t" + markers + "\t" + quality; - return startPosition + "\t" + position + "\t" + chr; - } - } - - private void processScaffold(String scaffold, long scaffoldLength, ArrayList list) - { - - //create intervals of adjacent markers in the same chromosome - ArrayList intervals = new ArrayList(); - long prevChr = 0; - for (int i = 0; i < list.size(); i+=3) { - long position = list.get(i); - long chr = list.get(i + 1); - long quality = list.get(i + 2); - if (i == 0 || prevChr != chr) - intervals.add(new Interval(i, position, quality, chr)); - else - intervals.get(intervals.size() - 1).addMarker(i, position, quality); - prevChr = chr; - } - 
- if (intervals.size() > 1) { //more than one interval, prune - do { - int removed = -1; - for (int ii = 0; ii < intervals.size(); ++ii) { - Interval i = intervals.get(ii); - if (i.getMarkers() < markerSupport || i.getLength() < splitLength) { // < markerSupport markers and/or length < splitLength - if (removed < 0) - removed = ii; - else { - long l = i.getLength(); - long lr = intervals.get(removed).getLength(); - if (l < lr || (l == lr && i.getQuality() < intervals.get(removed).getQuality())) - removed = ii; - } - } - } - - if (removed >= 0) { //remove removed (too short and/or too few markers) - if (removed > 0 && removed + 1 < intervals.size() && intervals.get(removed - 1).getChr() == intervals.get(removed + 1).getChr()) { - Interval prev = intervals.get(removed - 1); - Interval next = intervals.get(removed + 1); - for (int li = prev.getEndIndex() + 3; li < next.getStartIndex(); li+=3) { - long chr = list.get(li + 1); - if (chr == prev.getChr()) { - long position = list.get(li); - long quality = list.get(li + 2); - prev.addMarker(li, position, quality); - } - } - prev.addInterval(next); - intervals.remove(removed + 1); - intervals.remove(removed); - } else - intervals.remove(removed); - } - else - break; - } while (true); - - //try to extend intervals - for (int ii = 0; ii + 1 < intervals.size(); ++ii) { - Interval prev = intervals.get(ii); - Interval next = intervals.get(ii + 1); - - for (int li = prev.getEndIndex() + 3; li < next.getStartIndex(); li+=3) { - long chr = list.get(li + 1); - if (chr == next.getChr()) // conflict - break; - if (chr == prev.getChr()) { - //System.err.println("Extend"); - long position = list.get(li); - long quality = list.get(li + 2); - prev.addMarker(li, position, quality); - } - } - - for (int li = next.getStartIndex() - 3; li > prev.getEndIndex(); li-=3) { - long chr = list.get(li + 1); - if (chr == prev.getChr()) // conflict - break; - if (chr == next.getChr()) { - //System.err.println("Extend"); - long position = list.get(li); 
- long quality = list.get(li + 2); - next.addMarkerLeft(li, position, quality); - } - } - } - - } - - for (int ii = 0; ii < intervals.size(); ++ii) { - Interval i = intervals.get(ii); - long start = 1; - if (ii > 0) - start = intervals.get(ii - 1).getPosition() + 1; - - String end = scaffoldLength + "*"; - if (ii + 1 < intervals.size()) - end = "" + (intervals.get(ii + 1).getStartPosition() - 1); - System.out.println(scaffold + "\t" + start + "-" + i.getStartPosition() + "\t" + i.getPosition() + "-" + end + "\t?\t" + i.getChr()); - } - - //System.err.println(scaffold + "\t" + intervals); - } - - private void setMarkerSupport(int parseInt) { - markerSupport = parseInt; - } - private void setSplitLength(int parseInt) { - splitLength = parseInt; - } - private void setMinQuality(double parseDouble) { - minQuality = parseDouble; - } - - public static void main(String args[]) - { - ParameterParser pp = new ParameterParser(); - - String extraParameters = ""; - for (int i = 0; i < args.length; ++i) { - extraParameters += " " + args[i]; - } - if (args.length == 0 || !pp.init(extraParameters)) { - usage(); - System.exit(0); - } - - - String mapFile = pp.getValueAsString("map", null); - String lengthFile = pp.getValueAsString("contigLength", null); - if (mapFile == null) { - System.err.println("Please specify a map file"); - usage(); - System.exit(0); - - } - if (lengthFile == null) { - System.err.println("Please specify a contig length file"); - usage(); - System.exit(0); - - } - - pp.warning(new String[]{"map", "contigLength", "markerSupport", "minSplitLength", "minQuality"}); - - - Map2Bed m2p = new Map2Bed(); - m2p.setMarkerSupport(Integer.parseInt(pp.getValueAsString("markerSupport", "2"))); - m2p.setSplitLength(Integer.parseInt(pp.getValueAsString("minSplitLength", "10000"))); - m2p.setMinQuality(Double.parseDouble(pp.getValueAsString("minQuality", "2.0"))); - - System.out.println("#java Map2Bed" + extraParameters); - m2p.makeBed(mapFile, lengthFile); - } -} diff --git 
a/software/LepAnchor/src/Marker.java b/software/LepAnchor/src/Marker.java deleted file mode 100644 index 181da59..0000000 --- a/software/LepAnchor/src/Marker.java +++ /dev/null @@ -1,139 +0,0 @@ -/** - This file is part of Lep-Anchor. - - Lep-Anchor is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - Lep-Anchor is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with Lep-Anchor. If not, see . - - Copyright (C) 2019 Pasi Rastas, pasi.rastas@gmail.com, University of Helsinki - -*/ -public class Marker implements Comparable{ - String contig; - int chromosome; - long position; - int intervals[]; - ContigInterval ci; - - int pPlus; - int pMinus; - - - public Marker(String contig, long position, int chromosome, int intervals[]) - { - this.contig = contig; - this.position = position; - this.chromosome = chromosome; - //TODO: consider cloning intervals... 
- this.intervals = intervals; - } - - public Marker(Marker m, long position) - { - this(m.contig, position, m.chromosome, m.intervals); - } - - public Marker(Marker m) - { - this(m.contig, m.position, m.chromosome, m.intervals); - } - - public Marker(Marker m, ContigInterval ci) - { - this(m.contig, m.position, m.chromosome, m.intervals); - this.ci = ci; - } - - //return next position inside from p or p + 1 if p is inside - public int nextInside(int p) { - int min = Integer.MAX_VALUE; - for (int j = 0; j < intervals.length; j+=2) - if (p <= intervals[j + 1] && min > intervals[j]) - min = intervals[j]; - if (min <= p) - return p + 1; - return min; - } - - public int inside(int p) - { - for (int j = 0; j < intervals.length; j+=2) - if (p >= intervals[j] && p <= intervals[j + 1]) - return 1; - return 0; - } - @Override - public String toString() - { - return contig + "\t" + position + "\t" + chromosome; - } - //orders first by contig and then by position... - @Override - public int compareTo(Marker o) { - int ret = getContig().compareTo(o.getContig()); - if (ret != 0) - return ret; - if (position < o.position) - return -1; - if (position > o.position) - return 1; - return 0; - } - public String getName(){ - return contig + '\t' + position; - } - public String getContig(){ - return contig; - } - public long getPosition(){ - return position; - } - - public int getChromosome(){ - return chromosome; - } - public int[] getIntervals() { - return intervals; - } - - public int maxBin() // get maximum bin value... - { - int ret = 0; - for (int j = 1; j < intervals.length; j+=2) - ret = Math.max(ret, intervals[j]); - return ret; - } - public int minBin() // get minimum bin value... 
- { - int ret = Integer.MAX_VALUE; - for (int j = 0; j < intervals.length; j+=2) - ret = Math.min(ret, intervals[j]); - return ret; - } - public void flip(int maxBin) - { - int i[] = new int[intervals.length]; - for (int j = 0; j < intervals.length; ++j) - i[j] = maxBin - intervals[j ^ 1]; - intervals = i; - } - - public ContigInterval getContigInterval() { - return ci; - } - public static void main(String args[]){ - Marker m = new Marker("Contig", 1, 1, new int[]{10,20,40,60}); - for (int i = 0; i <= 61; ++i) - System.out.println(i + ":" + m.nextInside(i)); - } - -} diff --git a/software/LepAnchor/src/Misc.java b/software/LepAnchor/src/Misc.java deleted file mode 100644 index 2c7ffe0..0000000 --- a/software/LepAnchor/src/Misc.java +++ /dev/null @@ -1,224 +0,0 @@ -/** - This file is part of Lep-Anchor. - - Lep-Anchor is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - Lep-Anchor is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with Lep-Anchor. If not, see . - - Copyright (C) 2019 Pasi Rastas, pasi.rastas@gmail.com, University of Helsinki - -*/ -/** - This file is part of Lep-MAP. - - Lep-MAP is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - Lep-MAP is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with Lep-MAP. If not, see . - - Copyright (C) 2013 Pasi Rastas, pasi.rastas@helsinki.fi, University of Helsinki -*/ - -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; - -public class Misc { - - //compare based on first element of long[] - private static class LongComparator0 implements Comparator - { - @Override - public int compare(long[] l1, long[] l2){ - return Long.compare(l1[0], l2[0]); - } - } - - //compare based on second element of long[] - private static class LongComparator1 implements Comparator - { - @Override - public int compare(long[] l1, long[] l2){ - return Long.compare(l1[1], l2[1]); - } - } - //list elements contains 3 values, start, end and weight as long[3] - //dynamic programming for finding coverage of multiple weighted intervals - public static ArrayList cov(ArrayList list_) - { - ArrayList listStop = new ArrayList(); - listStop.addAll(list_); - ArrayList list = new ArrayList(); - list.addAll(list_); - Collections.sort(list, new LongComparator0()); - Collections.sort(listStop, new LongComparator1()); - - ArrayList ret = new ArrayList(); - int n = list.size(); - int end = 0; - int start = 0; - long pos = 0; - - long cov = 0; - long ps = list.get(start)[0]; // start - long pe = listStop.get(end)[1]; // end - - long prevP = 0; - long prevC = 0; - while (end < n) { - if (start < n && ps <= pe) { - pos = ps; - while (start < n && pos == list.get(start)[0]) { // take all intervals with same start - cov += list.get(start)[2]; - ++start; - if (start < n) - ps = list.get(start)[0]; - } - } else { - pos = pe; - while (end < n && pos == listStop.get(end)[1]) { // teka all intervals with same end - cov -= listStop.get(end)[2]; - ++end; - if (end < n) - pe = listStop.get(end)[1]; - } - ++pos; - } - if (pos != prevP) { - //System.err.println(prevP + "\t" + prevC); - ret.add(prevP); - 
ret.add(prevC); - } - prevP = pos; - prevC = cov; - } - ret.add(pos); - ret.add(0L); - //System.err.println(pos + "\t0"); - return ret; - } - - - public static boolean intersectIntervals(long b1, long e1, long b2, long e2){ - if (e1 < b2 || e2 < b1) - return false; - return true; - } - - public static long intersectIntervalsLength(long b1, long e1, long b2, long e2){ - if (e1 <= b2 || e2 <= b1) - return 0; - return Math.min(e1 - b2, e2 - b1); - } - - private static final double LN10 = Math.log(10.0); - - public static double exp10(double a) - { - return Math.exp(a * LN10); - } - - public static class ArrayIndexComparator> implements Comparator - { - private final T[] array; - - public ArrayIndexComparator(T[] array) - { - this.array = array; - } - @SuppressWarnings("unchecked") - public ArrayIndexComparator(ArrayList alArray) - { - this.array = (T[]) alArray.toArray(new Comparable[0]); - } - - public Integer[] createIndexArray() - { - Integer[] indexes = new Integer[array.length]; - for (int i = 0; i < array.length; i++) - { - indexes[i] = i; - } - return indexes; - } - - public int compare(Integer index1, Integer index2) - { - return array[index1].compareTo(array[index2]); - } - } - //Misc.ArrayIndexComparator comparator = new Misc.ArrayIndexComparator(table); - //Integer[] indexes = comparator.createIndexArray(); - //Arrays.sort(indexes, comparator); - - public static class ArrayIndexComparator2 implements Comparator - { - private final T[] array; - private Comparator C; - - public ArrayIndexComparator2(T[] array, Comparator C) - { - this.array = array; - this.C = C; - } - @SuppressWarnings("unchecked") - public ArrayIndexComparator2(ArrayList alArray, Comparator C) - { - this.array = (T[]) alArray.toArray(new Comparable[0]); - this.C = C; - } - - public Integer[] createIndexArray() - { - Integer[] indexes = new Integer[array.length]; - for (int i = 0; i < array.length; i++) - { - indexes[i] = i; - } - return indexes; - } - - public int compare(Integer index1, 
Integer index2) - { - return C.compare(array[index1], array[index2]); - } - } - //Misc.ArrayIndexComparator2 comparator = new Misc.ArrayIndexComparator2(table, customDoubleComparator); - //Integer[] indexes = comparator.createIndexArray(); - //Arrays.sort(indexes, comparator); - - //returns random merge of two lists... - public static ArrayList randomMerge(ArrayList l1, ArrayList l2) - { - ArrayList ret = new ArrayList(); - int n = l1.size(); - int m = l2.size(); - int j = 0; - int k = 0; - for (int i = 0; i < n + m; ++i) { - if (j < n && (k == m || Math.random() * (n - j) > (m - k) * 0.5)) - ret.add(l1.get(j++)); - else - ret.add(l2.get(k++)); - } - return ret; - } - -} diff --git a/software/LepAnchor/src/ParameterParser.java b/software/LepAnchor/src/ParameterParser.java deleted file mode 100644 index 555b64f..0000000 --- a/software/LepAnchor/src/ParameterParser.java +++ /dev/null @@ -1,231 +0,0 @@ -/** - This file is part of Lep-Anchor. - - Lep-Anchor is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - Lep-Anchor is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with Lep-Anchor. If not, see . - - Copyright (C) 2019 Pasi Rastas, pasi.rastas@gmail.com, University of Helsinki - -*/ -/** - This file is part of Lep-MAP. - - Lep-MAP is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. 
- - Lep-MAP is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with Lep-MAP. If not, see . - - Copyright (C) 2013 Pasi Rastas, pasi.rastas@helsinki.fi, University of Helsinki -*/ -//Parses parameters given as "x=a y=b c d z = e" -import java.util.StringTokenizer; -import java.util.ArrayList; -import java.util.HashMap; - -public class ParameterParser { - private HashMap> hm; - - public ParameterParser() - { - - } - public ParameterParser(String parameters) - { - if (!init(parameters)) - hm = null; - } - - public boolean init(ArrayList tokens) - { - hm = new HashMap>(); - int n = tokens.size(); - int i = 0; - while (i < n) { - String key = tokens.get(i); - if (key.equals("=")) - return false; - ++i; - if (i < n - 1) { - if (!tokens.get(i).equals("=")) - return false; - ++i; - } else - return false; - - ArrayList value = new ArrayList(); - while (i == n - 1 || (i < n - 1 && !tokens.get(i + 1).equals("="))) { - String nv = tokens.get(i); - if (nv.equals("=")) - return false; - value.add(nv); - ++i; - } - if (hm.containsKey(key)) - return false; - hm.put(key, value); - } - return true; - } - public boolean init(String parameters) - { - String delims = "\t =;"; - StringTokenizer st = new StringTokenizer(parameters, delims, true); - - ArrayList tokens = new ArrayList (); - while (st.hasMoreTokens()) { - String nt = st.nextToken(); - if (delims.indexOf(nt) >= 0) { - if (nt.equals("=") || nt.equals(";")) - tokens.add(nt); - } else - tokens.add(nt); - - } - return init(tokens); - } - public ArrayList getValue(String variable) - { - return hm.get(variable); - } - public String getValueAsString(String variable, String defaultValue) - { - ArrayList ret = getValue(variable); - if (ret != null) { - String rets = ""; - for (String s 
: ret) - rets += " " + s; - return rets.substring(Math.min(1, rets.length())); // remove first space - } - return defaultValue; - } - public int getNumberOfValues(String variable) - { - ArrayList ret = getValue(variable); - if (ret != null) - return ret.size(); - else - return 0; - } - public String getValueAsString(String variable, int index, String defaultValue) - { - ArrayList ret = getValue(variable); - if (ret != null && index < ret.size()) - return ret.get(index); - return defaultValue; - } - public ArrayList getValuesAsList(String variable) - { - return getValuesAsList(variable, 1, null); - } - - public ArrayList getValuesAsList(String variable, int size, String defaultValue) - { - ArrayList ret = new ArrayList(); - if (getNumberOfValues(variable) == 1 && getValueAsString(variable, null).startsWith("file:")) { - ArrayList> matrix = Input.loadTable(getValueAsString(variable, null).substring(5), "[\t ]"); - if (matrix.size() == 1) - ret.addAll(matrix.get(0)); - if (matrix.size() > 1) { // transpose matrix - ArrayList tp = new ArrayList(); - boolean ok = true; -// System.err.println("Reading matrix"); - for (ArrayList als : matrix) { - if (als.size() != 1) - ok = false; - else - tp.add(als.get(0)); - } - if (ok) - ret.addAll(tp); -// System.err.println("Reading matrix ok" + ok); - } - } else - for (int i = 0; i < getNumberOfValues(variable); ++i) - ret.add(getValueAsString(variable, i, null)); - if (ret.size() == 0) - ret.add(defaultValue); - if (ret.size() == 1) - for (int i = 0; i < size - 1; ++i) - ret.add(new String(ret.get(0))); - return ret; - } - - - public ArrayList> getValuesAsMatrix(String variable) - { - if (getNumberOfValues(variable) == 1 && getValueAsString(variable, null).startsWith("file:")) { -// System.err.println("loading matrix " + getValueAsString(variable, null).substring(5)); - ArrayList> matrix = Input.loadTable(getValueAsString(variable, null).substring(5), "[\t ]"); - return matrix; - } - ArrayList> ret = new ArrayList>(); - if 
(getNumberOfValues(variable) > 0) - ret.add(new ArrayList()); - int line = 0; - - for (int i = 0; i < getNumberOfValues(variable); ++i) { - if (getValueAsString(variable, i, null).equals(";")) { - ++line; - ret.add(new ArrayList()); - } else - ret.get(line).add(getValueAsString(variable, i, null)); - } - return ret; - } - public boolean warning(String keyWords[]) - { - HashMap hm2 = new HashMap(); - - for (String s : hm.keySet()) - hm2.put(s, 1); - - for (String s : keyWords) - hm2.remove(s); - - if (hm2.size() > 0) { - System.err.println("Error! Unknown parameters: "); - for (String s : hm2.keySet()) - System.err.print(s + " "); - System.err.println(); - System.exit(-1); - } - - return false; - } - - public static void main(String args[]) - { - ParameterParser pp = new ParameterParser("a=40 10 30 b=10 d = 1 2;3 4 6 e=file:ped1"); - System.err.println("a = " + pp.getValueAsString("a", 0, "null")); - - System.err.println("a = " + pp.getValue("a")); - System.err.println("b = " + pp.getValue("b")); - System.err.println("c = " + pp.getValue("c")); - System.err.println("d = " + pp.getValue("d")); - System.err.println("number of values for a = " + pp.getNumberOfValues("a")); - System.err.println("number of values for b = " + pp.getNumberOfValues("b")); - System.err.println("number of values for c = " + pp.getNumberOfValues("c")); - System.err.println(pp.getValuesAsList("d")); - System.err.println(pp.getValuesAsMatrix("d")); - System.err.println(pp.getValuesAsMatrix("e")); - } - -} diff --git a/software/LepAnchor/src/PlaceAndOrientContigs.java b/software/LepAnchor/src/PlaceAndOrientContigs.java deleted file mode 100644 index 551edd3..0000000 --- a/software/LepAnchor/src/PlaceAndOrientContigs.java +++ /dev/null @@ -1,4821 +0,0 @@ -/** - This file is part of Lep-Anchor. 
- - Lep-Anchor is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - Lep-Anchor is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with Lep-Anchor. If not, see . - - Copyright (C) 2019 Pasi Rastas, pasi.rastas@gmail.com, University of Helsinki - -*/ - -import java.io.BufferedReader; -import java.io.FileOutputStream; -import java.io.FileReader; -import java.io.InputStreamReader; -import java.io.PrintStream; -import java.util.*; -import java.util.concurrent.atomic.AtomicBoolean; -import java.util.concurrent.atomic.AtomicInteger; - -public class PlaceAndOrientContigs { - - private ArrayList bed = null; // store bed file - private HashMap> bedHash = new HashMap>(); // get intervals for each contig - - private HashMap> haplotypeHash = new HashMap>(); // get intervals for each haplotype contig - - private HashMap chainLinkHash = new HashMap(); // get score for each chain link... - private HashMap chainLinkHashCut1 = new HashMap(); // get cut point for each link - private HashMap chainLinkHashCut2 = new HashMap(); // get cut point for each link - - private HashMap chainLinkHashCut1a = new HashMap(); // get alternative cut point for each link - private HashMap chainLinkHashCut2a = new HashMap(); // get alternative cut point for each link - - private HashMap chainHaplotypeHash = new HashMap(); // get score for each contig pair for second being haplotype of the first... 
- private HashMap chainHaplotypeCut = new HashMap(); // get haplotype alignment end points - - private HashMap scaffoldingLink_tmp = new HashMap(); // get scaffolding score linking contigs... - private HashMap scaffoldingLink = new HashMap(); // get scaffolding score linking contigs... - - private HashMap scaffoldingLinkInfo = new HashMap(); // get scaffolding score linking contigs... - - private ArrayList contigs = new ArrayList(); // store contigs - - //private HashMap> pafCoverage = new HashMap>(); - - private int numErrorsPerContig = 3; - private int numErrors = 40; - - private int numMaps = 0; - private ArrayList intervalsInChr = null; - - private double scaleScore = 0.00001; - private double orientationPenalty = 0.5; - private double cutPenalty = 0.001; - - private double liftoverScoreDiff = 0.5; - - private boolean useChainAndPaf = false; - - private String printAnimation = ""; - - private int maxBridge = 50000; - private int maxIntersect = 2000; - - private int numRuns = 5; - - private int numThreads = 8; - - private int minHaplotypeAlignmentScore = -10; - private int minLinkAlignmentScore = -10; - private boolean keepEmptyIntervals; - - private boolean commentOutput; - private boolean compressMap; - - private Proximity prox = null; - - - private void setCommentOutput(boolean commentOutput) { - this.commentOutput = commentOutput; - } - public void setCompressMap(boolean value) { - compressMap = value; - } - public void setPrintAnimation(String fn) { - printAnimation = fn; - } - - public void setUseChainAndPaf(boolean value) { - useChainAndPaf = value; - } - - public void setNumRuns(int runs) { - numRuns = runs; - } - - public void setNumErrors(int errors) { - numErrors = errors; - } - - public void setNumErrorsPerContig(int errors) { - numErrorsPerContig = errors; - } - - public void setKeepEmptyIntervals(boolean value) { - keepEmptyIntervals = value; - } - - public ArrayList> getMarkers(ContigInterval ci_result){ - ArrayList> ret = new ArrayList>(); - for 
(int map = 0; map < numMaps; ++map) - ret.add(new ArrayList()); - - ArrayList cis = bedHash.get(ci_result.getContig()); - if (cis != null) { - for (ContigInterval c2 : cis) { - if (Misc.intersectIntervals(c2.getStart(), c2.getEnd(), ci_result.getStart(), ci_result.getEnd())) - for (int map = 0; map < numMaps; ++map) - ret.get(map).addAll(c2.getMarkers(map, ci_result.getOrientation())); - } - } - return ret; - } - - // forward calculation, increasing map positions - private void forward1(ArrayList markers, int S[][], int P[][], int startBin, int endBin) { - forward1(markers, S, P, startBin, endBin, 0, markers.size()); - } - - private void forward1(ArrayList markers, int S[][], int P[][], int startBin, int endBin, int start, int end) - { - for (int mi = start; mi < end; ++mi) { - Marker m = markers.get(mi); - - //substract startBin - int maxI = 0; - int Si[] = S[mi]; - int Si1[] = S[mi + 1]; - int Pi1[] = P[mi + 1]; - for (int b = 0; b <= endBin - startBin; ++b) { - if (Si[b] > Si[maxI]) - maxI = b; - Si1[b] = Si[maxI];// + m.inside(b + startBin); - Pi1[b] = maxI + startBin; - } - int b = startBin; - while (b <= endBin) { - Si1[b - startBin] += m.inside(b); - b = m.nextInside(b); - } - } - } - - private void forwardFast1(ArrayList markers, int S[][], int startBin, int endBin) { - forwardFast1(markers, S, startBin, endBin, 0, markers.size()); - } - - private void forwardFast1(ArrayList markers, int S[][], int startBin, int endBin, int start, int end) - { - for (int mi = start; mi < end; ++mi) { - Marker m = markers.get(mi); - - //substract startBin - int maxS = 0; - int Si[] = S[mi]; - int Si1[] = S[mi + 1]; - for (int b = 0; b <= endBin - startBin; ++b) { - int s = Si[b]; - if (s > maxS) - maxS = s; - Si1[b] = maxS;// + m.inside(b + startBin); - } - int b = startBin; - while (b <= endBin) { - Si1[b - startBin] += m.inside(b); - b = m.nextInside(b); - } - - } - } - - private void forward2(ArrayList markers, int S[][], int P[][], int startBin, int endBin) { - 
forward2(markers, S, P, startBin, endBin, 0, markers.size()); - } - // forward calculation, decreasing map positions - private void forward2(ArrayList markers, int S2[][], int P2[][], int startBin, int endBin, int start, int end) - { - for (int mi = start; mi < end; ++mi) { - Marker m = markers.get(mi); - int maxI2 = endBin - startBin; - int S2i[] = S2[mi]; - int S2i1[] = S2[mi + 1]; - int P2i1[] = P2[mi + 1]; - - for (int b = endBin - startBin; b >= 0; --b) { - - if (S2i[b] > S2i[maxI2]) - maxI2 = b; - - S2i1[b] = S2i[maxI2];// + m.inside(b + startBin); // add startBin as we subtract it from b - P2i1[b] = maxI2 + startBin; - } - int b = startBin; - while (b <= endBin) { - S2i1[b - startBin] += m.inside(b); - b = m.nextInside(b); - } - - } - } - - private void forwardFast2(ArrayList markers, int S[][], int startBin, int endBin) { - forwardFast2(markers, S, startBin, endBin, 0, markers.size()); - } - // forward calculation, decreasing map positions - private void forwardFast2(ArrayList markers, int S2[][], int startBin, int endBin, int start, int end) - { - for (int mi = start; mi < end; ++mi) { - Marker m = markers.get(mi); - int maxS = 0; - int S2i[] = S2[mi]; - int S2i1[] = S2[mi + 1]; - - for (int b = endBin - startBin; b >= 0; --b) { - int s = S2i[b]; - if (s > maxS) - maxS = s; - S2i1[b] = maxS;// + m.inside(b + startBin); // add startBin as we subtract it from b - } - int b = startBin; - while (b <= endBin) { - S2i1[b - startBin] += m.inside(b); - b = m.nextInside(b); - } - - } - } - - //positions increasing - private int solveForward(ArrayList markers) { - return solveForward(markers, getMinBin(markers), getMaxBin(markers)); - } - private int solveForward(ArrayList markers, int startBin, int endBin) { - //System.err.println(startBin); - //System.err.println(endBin); - - int numMarkers = markers.size(); - int S[][] = new int[numMarkers + 1][endBin - startBin + 1]; - int P[][] = new int[numMarkers + 1][endBin - startBin + 1]; - - forward1(markers, S, P, 
startBin, endBin); - - int maxI = startBin; - for (int i = startBin; i <= endBin; ++i) { - if (S[numMarkers][i - startBin] > S[numMarkers][maxI - startBin]) - maxI = i; - } - - int ret = S[numMarkers][maxI - startBin]; - - // + orientation - for (int mi = numMarkers; mi > 0; --mi) { - markers.get(mi - 1).pPlus = maxI; - //System.err.println(markers.get(mi - 1) + "\t" + maxI); - maxI = P[mi][maxI - startBin]; - } - return ret; - } - private int solveForwardFast(ArrayList markers) { - return solveForwardFast(markers, getMinBin(markers), getMaxBin(markers)); - } - private int solveForwardFast(ArrayList markers, int startBin, int endBin) { - //System.err.println(startBin); - //System.err.println(endBin); - - int numMarkers = markers.size(); - int S[][] = new int[numMarkers + 1][endBin - startBin + 1]; - - forwardFast1(markers, S, startBin, endBin); - - int ret = 0; - int SnM[] = S[numMarkers]; - for (int b = startBin; b <= endBin; ++b) { - int s = SnM[b - startBin]; - if (s > ret) - ret = s; - } - - return ret; - } - - - //positions decreasing - private int solveBackward(ArrayList markers) { - return solveBackward(markers, getMinBin(markers), getMaxBin(markers)); - } - private int solveBackward(ArrayList markers, int startBin, int endBin) { - //System.err.println(startBin); - //System.err.println(endBin); - - int numMarkers = markers.size(); - - int S2[][] = new int[numMarkers + 1][endBin - startBin + 1]; - int P2[][] = new int[numMarkers + 1][endBin - startBin + 1]; - - forward2(markers, S2, P2, startBin, endBin); - - int maxI2 = endBin; - for (int i = endBin; i >= startBin; --i) { - if (S2[numMarkers][i - startBin] > S2[numMarkers][maxI2 - startBin]) - maxI2 = i; - } - - int ret = S2[numMarkers][maxI2 - startBin]; - - // - orientation - for (int mi = numMarkers; mi > 0; --mi) { - markers.get(mi - 1).pMinus = maxI2; - //System.err.println(markers.get(mi - 1) + "\t" + maxI2); - maxI2 = P2[mi][maxI2 - startBin]; - } - return ret; - } - - public static int[] 
markerScore(ArrayList markers) - { - PlaceAndOrientContigs poc = new PlaceAndOrientContigs(); - return poc.solve(markers); - } - - private int[] solve(ArrayList markers) - { - int scorePlus = solveForward(markers); - int scoreMinus = solveBackward(markers); - int ret[] = new int[]{scorePlus, scoreMinus}; - return ret; - } - - public void loadBed(String fn, int chromosome) - { - bed = InputData.loadBed(fn, chromosome); - for (ContigInterval ci : bed) { - if (!bedHash.containsKey(ci.getContig())) { - bedHash.put(ci.getContig(), new ArrayList()); - contigs.add(ci.getContig()); - } - bedHash.get(ci.getContig()).add(ci); - } - for (String c: contigs) { - ArrayList aci = bedHash.get(c); - Collections.sort(aci); // Put positions into physical order - long prev = 0; - for (ContigInterval ci: aci) { - if (ci.getStart() > 0 && ci.getStart() > ci.getEnd() || ci.getStart() <= prev) { // check the consistency of the bed... - System.err.println(c + "\t" + ci.getStart() + "\t" + ci.getEnd()); - System.err.println("Error: erroneous or overlapping intervals in the bed file"); - System.exit(-1); - } - prev = ci.getEnd(); - } - } - System.err.println(contigs.size() + " contigs loaded from the bed file"); - //TODO: - } - - //reverse map - private void flip(ArrayList markers) { - int max = getMaxBin(markers); - for (Marker m : markers) - m.flip(max); - } - - private long[] chain2OneBase(String length, String orientation, String start, String stop) - { - if ("+".equals(orientation)) { - return new long[]{Long.parseLong(start) + 1, Long.parseLong(stop)}; - } - else { - long l = Long.parseLong(length); - return new long[]{l - Long.parseLong(stop) + 1, l - Long.parseLong(start)}; - } - } - - // first coordinate system to second - private long mapPosition12(ArrayList alignment, long pos1, boolean sameOrientation){ - int low = 0; - int high = alignment.size() - 1; - - while (low <= high) { - int mid = (low + high) / 2; - long a[] = alignment.get(mid); - long mpos = a[0]; - long len = a[2]; - 
- if (mpos > pos1) - high = mid - 1; - else if (mpos <= pos1 && mpos + len > pos1) - return a[1] + (sameOrientation ? (pos1 - mpos) : (mpos - pos1)); - else if (mpos < pos1) - low = mid + 1; - else - return -1; - } - return -1; - } - - // second coordinate system to first - private long mapPosition21(ArrayList alignment, long pos2, boolean sameOrientation){ - int low = 0; - int high = alignment.size() - 1; - - while (low <= high) { - int mid = (low + high) / 2; - long a[] = alignment.get(mid); - long mpos = a[1]; - long len = a[2]; - if (sameOrientation) { - if (mpos > pos2) - high = mid - 1; - else if (mpos <= pos2 && mpos + len > pos2) - return a[0] + (pos2 - mpos); - else if (mpos < pos2) - low = mid + 1; - else - return -1; - } else { - if (mpos < pos2) - high = mid - 1; - else if (mpos >= pos2 && mpos - len < pos2) - return a[0] + (pos2 - mpos); - else if (mpos > pos2) - low = mid + 1; - else - return -1; - } - } - return -1; - } - - //c2 is full haplotype of ofHaplotype? - private int calculateLiftOverHaplotype(ContigInterval ofHaplotype, int ofHaplotypeOrientation1, ContigInterval c2, ArrayList alignment, boolean sameOrientation) - { - int score = 0; - for (int map = 0; map < numMaps; ++map) { - int numM2 = c2.getMarkers(map).size(); - - if (numM2 > 0) { - ArrayList markers = new ArrayList(); - markers.addAll(ofHaplotype.getMarkers(map, ofHaplotypeOrientation1)); - - int score1 = solveForward(markers); - - ArrayList markers2 = c2.getMarkers(map); - for (Marker m: markers2) { - long p21 = mapPosition21(alignment, m.getPosition(), sameOrientation); - if (p21 > 0) - markers.add(new Marker(m, p21)); - } - Collections.sort(markers); - if (ofHaplotypeOrientation1 != 0) - Collections.reverse(markers); - - score += solveForward(markers) - score1; - } - } - return score; - } - - private int[] calculateLiftOver(ContigInterval c1, int orientation1, ContigInterval c2, int orientation2, ArrayList alignment, boolean sameOrientation, long interval1[], long interval2[]) - { 
- int score1 = 0; - int score2 = 0; - for (int map = 0; map < numMaps; ++map) { - int numM1 = c1.getMarkers(map).size(); - int numM2 = c2.getMarkers(map).size(); - if (numM2 > 0 || numM1 > 0){ // markers in one or both contigs... - int start1 = 0; - int end1 = numM1 - 1; - if (orientation1 != 0) { - end1 = 0; - start1 = numM1 - 1; - } - - int start2 = 0; - int end2 = numM2 - 1; - if (orientation2 != 0) { - end2 = 0; - start2 = numM2 - 1; - } - - // c1|----------------| - // ||||| (interval 1&2 markers put to am1 and am2) - // c2|-------------| - - ArrayList am1 = new ArrayList(); - ArrayList am2 = new ArrayList(); - - while (orientation1 == 0 && end1 >= start1 && c1.getMarker(map, end1).position >= interval1[0]) { - if (c1.getMarker(map, end1).position <= interval1[1]) - am1.add(c1.getMarker(map, end1)); - --end1; - } - - while (orientation1 != 0 && end1 <= start1 && c1.getMarker(map, end1).position <= interval1[1]) { - if (c1.getMarker(map, end1).position >= interval1[0]) - am1.add(c1.getMarker(map, end1)); - ++end1; - } - - while (orientation2 == 0 && end2 >= start2 && c2.getMarker(map, start2).position <= interval2[1]) { - if (c2.getMarker(map, start2).position >= interval2[0]) - am2.add(c2.getMarker(map, start2)); - ++start2; - } - - while (orientation2 != 0 && end2 <= start2 && c2.getMarker(map, start2).position >= interval2[0]) { - if (c2.getMarker(map, start2).position <= interval2[1]) - am2.add(c2.getMarker(map, start2)); - --start2; - } - - ArrayList test1 = new ArrayList(); - if (orientation1 == 0) - for (int i = start1; i <= end1; ++i) - test1.add(c1.getMarker(map, i)); - else - for (int i = start1; i >= end1; --i) - test1.add(c1.getMarker(map, i)); - - //DO MAGIC HERE... 
- Collections.reverse(am1); - - ArrayList mid1 = new ArrayList(); - ArrayList mid2 = new ArrayList(); - ArrayList merge1 = new ArrayList(); - ArrayList merge2 = new ArrayList(); - - //merge1 - for (Marker m : am1) { - long p2 = mapPosition12(alignment, m.getPosition(), sameOrientation); - if (p2 > 0) { // liftover possible - Marker m2 = new Marker(m, p2); - mid2.add(m2); - } - } - merge1.addAll(mid2); - merge1.addAll(am2); - Collections.sort(merge1); - if (orientation2 != 0) - Collections.reverse(merge1); - - //merge2 - for (Marker m : am2) { - long p1 = mapPosition21(alignment, m.getPosition(), sameOrientation); - if (p1 > 0) { // liftover possible - Marker m2 = new Marker(m, p1); - mid1.add(m2); - } - } - - merge2.addAll(mid1); - merge2.addAll(am1); - Collections.sort(merge2); - - if (orientation1 != 0) - Collections.reverse(merge2); - - //System.err.println(numM2 + "\t" + start2 + "\t" + end2 + "\t" + numM1 + "\t" + start1 + "\t" + end1 ); - - ArrayList test3 = new ArrayList(); - if (orientation2 == 0) - for (int i = start2; i <= end2; ++i) - test3.add(c2.getMarker(map, i)); - else - for (int i = start2; i >= end2; --i) - test3.add(c2.getMarker(map, i)); - - ArrayList test = new ArrayList(); - - // c1 + c2 - test.addAll(c1.getMarkers(map, orientation1)); - test.addAll(c2.getMarkers(map, orientation2)); - int s0 = solveForward(test); - - score1 -= s0; - score2 -= s0; - - //System.err.println(solve(test)[0] + "\t(" + test.size() + ")\t"); - - test.clear(); - test.addAll(test1); - test.addAll(merge1); - test.addAll(test3); - - // c1 + merge1 + c2 - int s1 = solveForward(test); - score1 += s1; - - //System.err.print(c1.getContig() + "_" + orientation1 + "\t" + c2.getContig() + "_" + orientation2); - - //System.err.print("\tscores\t" + solve(test)[0] + "\t(" + test.size() + ")\t"); - - test.clear(); - test.addAll(test1); - test.addAll(merge2); - test.addAll(test3); - //System.err.print(solve(test)[0] + "\t(" + test.size() + ")\t"); - - // c1 + merge2 + c2 - int s2 = 
solveForward(test); - score2 += s2; - - //System.out.println(s0 + "\t" + s1 + "\t" + s2); - - } - } - return new int[]{score1, score2}; - } - - // change alignment from contig1=>contig2 to contig2=>contig1 - private ArrayList reverseAlignment(ArrayList a, boolean sameOrientation){ - ArrayList ret = new ArrayList(); - if (sameOrientation) { - for (long[] aa : a) - ret.add(new long[]{aa[1],aa[0], aa[2]}); - } else { - int n = a.size(); - for (int i = n - 1; i>=0; --i) { // reverse and add length for each block - long[] aa = a.get(i); - ret.add(new long[]{aa[1] - aa[2] + 1, aa[0] + aa[2] - 1, aa[2]}); - } - } - return ret; - } - - public void loadProximity(String fn, int bin, int maxDistance, double scale){ - prox = new Proximity(bin, maxDistance, scale); - boolean nempty = prox.loadData(fn, bedHash); - if (!nempty) - prox = null; - } - - public void loadPaf(String fn, int maxScore, double scalePaf){ - System.err.println("loading paf..."); - try { - BufferedReader br = null; - if (fn.equals("-")) - br = new BufferedReader(new InputStreamReader(System.in)); - else - br = new BufferedReader(new FileReader(fn)); - - ArrayList> rows = new ArrayList>(); - - do { - ArrayList row = Input.loadTableRow(br, "\t "); - if (row == null) - break; - - if (row.size() < 9) { - System.err.println("Warning: skipping row " + row); - continue; - } - - if (!bedHash.containsKey(row.get(0)) && bedHash.containsKey(row.get(5))) { // find "extra" contigs or single reads for scaffolding... 
- if (rows.size() == 0 || rows.get(0).get(0).equals(row.get(0))) - rows.add(row); - else { - processPAF(rows); - rows.clear(); - rows.add(row); - } - } - - } while (true); - processPAF(rows); - System.err.println("Scaffolding links:"); - for (String key: scaffoldingLink.keySet()) { - int value = (int)(0.5 + scalePaf * scaffoldingLink.get(key)); - if (value > maxScore) { - value = maxScore; - } - scaffoldingLink.put(key, value); - System.err.println(key + "\t" + value); - } - - br.close(); - } catch (Exception e) { - e.printStackTrace(); - System.err.println("Error in file " + fn); - } - } - - //int tmps = 0; - - private void helperPAF1(ContigInterval ci1, ContigInterval ci2, ArrayList row1, ArrayList row2) { - - //check that alignments are in right order in the read... - - long rstart1 = Long.parseLong(row1.get(2)); - long rend1 = Long.parseLong(row1.get(3)); - - long rstart2 = Long.parseLong(row2.get(2)); - long rend2 = Long.parseLong(row2.get(3)); - - // require non-intersecting intervals... 
- if (Misc.intersectIntervalsLength(rstart1, rend1, rstart2, rend2) > maxIntersect) - return; - - boolean plus = row1.get(4).equals("+"); - boolean r1first = rstart1 < rstart2; - - long start1 = Long.parseLong(row1.get(7)); - long end1 = Long.parseLong(row1.get(8)); - - long start2 = Long.parseLong(row2.get(7)); - long end2 = Long.parseLong(row2.get(8)); - - //++ orientation - long gap1 = ci1.calculateGapLength(0, start1 + 1, end1); - long gap2 = ci2.calculateGapLength(1, start2 + 1, end2); - - long length1 = end1 - start1; - long length2 = end2 - start2; - - if (length1 - gap1 > 0 && length2 - gap2 > 0 ) { - if (plus ^ !r1first) { - //System.err.println(ci1.getContig() + "->" + ci2.getContig()); - String key = ci1.toString() + '+' + ci2.toString() + '+'; - scaffoldingLink_tmp.put(key, 1); - scaffoldingLinkInfo.put(key, row1.get(0)); - } else { // read is not consistent - //System.err.println("*"); - //System.err.println(row1); - //System.err.println(ci1); - //System.err.println(row2); - //System.err.println(ci2); - //System.err.println("*"); - } - } - //-- orientation - gap1 = ci1.calculateGapLength(1, start1 + 1, end1); - gap2 = ci2.calculateGapLength(0, start2 + 1, end2); - - if (length1 - gap1 > 0 && length2 - gap2 > 0) { - if (plus ^ r1first) { - //System.err.println(ci1.getContig() + "->" + ci2.getContig()); - String key = ci1.toString() + '-' + ci2.toString() + '-'; - scaffoldingLink_tmp.put(key, 1); - scaffoldingLinkInfo.put(key, row1.get(0)); - - } else { // read is not consistent - //System.err.println("*"); - //System.err.println(row1); - //System.err.println(ci1); - //System.err.println(row2); - //qSystem.err.println(ci2); - //System.err.println("*"); - } - } - } - - private void helperPAF2(ContigInterval ci1, ContigInterval ci2, ArrayList row1, ArrayList row2) { - //check that alignments are in right order in the read... 
- long rstart1 = Long.parseLong(row1.get(2)); - long rend1 = Long.parseLong(row1.get(3)); - long rstart2 = Long.parseLong(row2.get(2)); - long rend2 = Long.parseLong(row2.get(3)); - - // require non-intersecting intervals... - if (Misc.intersectIntervalsLength(rstart1, rend1, rstart2, rend2) > maxIntersect) - return; - - boolean plus = row1.get(4).equals("+"); - boolean r1first = rstart1 < rstart2; - - long start1 = Long.parseLong(row1.get(7)); - long end1 = Long.parseLong(row1.get(8)); - - long start2 = Long.parseLong(row2.get(7)); - long end2 = Long.parseLong(row2.get(8)); - - //+- orientation - long gap1 = ci1.calculateGapLength(0, start1 + 1, end1); - long gap2 = ci2.calculateGapLength(0, start2 + 1, end2); - - long length1 = end1 - start1; - long length2 = end2 - start2; - - if (length1 - gap1 > 0 && length2 - gap2 > 0) { - if (plus ^ !r1first) { - //System.err.println(ci1.getContig() + "->" + ci2.getContig()); - String key = ci1.toString() + '+' + ci2.toString() + '-'; - scaffoldingLink_tmp.put(key, 1); - scaffoldingLinkInfo.put(key, row1.get(0)); - } else { // read is not consistent - //System.err.println("*"); - //System.err.println(row1); - //System.err.println(ci1); - //System.err.println(row2); - //System.err.println(ci2); - //System.err.println("*"); - } - } - //-+ orientation - gap1 = ci1.calculateGapLength(1, start1 + 1, end1); - gap2 = ci2.calculateGapLength(1, start2 + 1, end2); - - if (length1 - gap1 > 0 && length2 - gap2 > 0) { - if (plus ^ r1first) { - //System.err.println(ci1.getContig() + "->" + ci2.getContig()); - String key = ci1.toString() + '-' + ci2.toString() + '+'; - scaffoldingLink_tmp.put(key, 1); - scaffoldingLinkInfo.put(key, row1.get(0)); - - } else { // read is not consistent - //System.err.println("*"); - //System.err.println(row1); - //System.err.println(ci1); - //System.err.println(row2); - //System.err.println(ci2); - //System.err.println("*"); - } - } - } - - private void processPAF(ArrayList> rows) - { - HashMap>> 
alignmentHash = new HashMap>>(); - for (ArrayList row : rows) { - ArrayList> a = alignmentHash.get(row.get(5)); - if (a == null) { - a = new ArrayList>(); - alignmentHash.put(row.get(5), a); - } - a.add(row); - } - ArrayList contigs = new ArrayList(); - - contigs.addAll(alignmentHash.keySet()); - - int numContigs = contigs.size(); - - for (int c1 = 0; c1 < numContigs; ++c1) { - String contig1 = contigs.get(c1); - //for (int c2 = c1 + 1; c2 < numContigs; ++c2) { - for (int c2 = c1; c2 < numContigs; ++c2) { //allow links between same contig... - - if (c1 == c2 && bedHash.get(contig1).size() <= 1) // no need to go further if only one possible ContigInterval and contig... - continue; - - String contig2 = contigs.get(c2); - - - ArrayList> rows1 = alignmentHash.get(contig1); - ArrayList> rows2 = alignmentHash.get(contig2); - - - for (ArrayList row1 : rows1) - for (ArrayList row2 : rows2) { - long rstart1 = Long.parseLong(row1.get(2)); - long rend1 = Long.parseLong(row1.get(3)); - long rstart2 = Long.parseLong(row2.get(2)); - long rend2 = Long.parseLong(row2.get(3)); - - // require non-intersecting intervals... - long il = Misc.intersectIntervalsLength(rstart1, rend1, rstart2, rend2); - - if (il > maxIntersect || il > 0.5 * Math.min(rend1 - rstart1, rend2 - rstart2)) - continue; - - //TODO: Take into account useChainAndPaf in the calculation of bridgeLength and other... - //TODO: Have to take into account the contig overlap... 
- - //read [rs1 re1] [rs2 re2], bridgeLength = rs2-re1 - //read [rs2 re2] [rs1 re1], bridgeLength = rs1-re2 - long bridgeLength = Math.max(rstart2 - rend1, rstart1 - rend2); - if (bridgeLength > maxBridge) - continue; - - if (row1.get(4).equals(row2.get(4))) { // same orientation - for (ContigInterval ci1 : bedHash.get(contig1)) { - for (ContigInterval ci2 : bedHash.get(contig2)) { - if (ci1 != ci2) { - helperPAF1(ci1, ci2, row1, row2); - helperPAF1(ci2, ci1, row2, row1); - } - } - } - - } else { // different orientation - for (ContigInterval ci1 : bedHash.get(contig1)) { - for (ContigInterval ci2 : bedHash.get(contig2)) { - if (ci1 != ci2) { - helperPAF2(ci1, ci2, row1, row2); - helperPAF2(ci2, ci1, row2, row1); - } - } - } - } - } - } - } - - for (String key : scaffoldingLink_tmp.keySet()) { - Integer os = scaffoldingLink.get(key); - if (os == null) - scaffoldingLink.put(key, scaffoldingLink_tmp.get(key)); - else - scaffoldingLink.put(key, os + scaffoldingLink_tmp.get(key)); - } - scaffoldingLink_tmp.clear(); - - //if (numContigs > 1) - // System.err.println(numContigs + "\t" + (++tmps)); - - //calculate coverage for each contig (reads spanning each position...) - } - - private void addHaplotype(ContigInterval ofHaplotype, int ofHaplotypeOrientation, ContigInterval haplotype, long ha[], double ascore, ArrayList alignment, boolean sameOrientation) - { - - int haplotypeScore = (int) (ascore - haplotype.calculateGapLengthHaplotype(ha[0], ha[1]) * cutPenalty + 0.5); //+0.5 = rounding - if (haplotypeScore >= minHaplotypeAlignmentScore) { - haplotypeScore += calculateLiftOverHaplotype(ofHaplotype, ofHaplotypeOrientation, haplotype, alignment, sameOrientation); - if (haplotypeScore > 0) { - String key = ofHaplotype.toString() + ((ofHaplotypeOrientation == 0) ? 
'+':'-') + haplotype.toString(); - Integer oscore = chainHaplotypeHash.get(key); - if (oscore == null || oscore < haplotypeScore) { - chainHaplotypeHash.put(key, (int) haplotypeScore); - chainHaplotypeCut.put(key, ha); - } - - //System.err.println(key + "\t" + (ascore * scaleScore - calculateGapLengthHaplotype(ci2, p2[0], p2[1]) * cutPenalty) + "\t" + haplotypeScore1 + "\thaplotype"); - } - } - } - - // add score between contigs ci1 and ci2 in these orientation based on alignment chain... - private void addChainLink(ContigInterval ci1, int orientation1, ContigInterval ci2, int orientation2, long ai1[], long ai2[], double ascore, ArrayList alignment, boolean sameOrientation) - { - long l1 = ci1.calculateGapLength(orientation1, ai1[0], ai1[1]); - long l2 = ci2.calculateGapLength(1 - orientation2, ai2[0], ai2[1]); - - double score = ascore; - if ((orientation1 == orientation2) != sameOrientation) - score *= orientationPenalty; - - score -= (l1 + l2) * cutPenalty; - - if (score >= minLinkAlignmentScore) { // is this needed? - int los[] = calculateLiftOver(ci1, orientation1, ci2, orientation2, alignment, sameOrientation, ai1, ai2); - int max1 = 0; - if (los[1] > los[0]) - max1 = 1; - - score += los[max1]; - - if (score >= 0.5) { - String key = ci1.toString() + ((orientation1 == 0) ? '+':'-') + ci2.toString() + ((orientation2 == 0) ? '+':'-'); - //System.err.println(key + "\t" + score1); - Integer oldScore = chainLinkHash.get(key); - score = (int)(score + 0.5); // round up... - if (oldScore == null || oldScore < score) { - chainLinkHash.put(key, (int) score); - - //if ((orientation1 == orientation2) != sameOrientation) { // orientation do not match... make a palindrome... - // long start1 = (orientation1 == 0) ? ai1[1] : ai1[0]; - // chainLinkHashCut1.put(key, start1); - // chainLinkHashCut1a.put(key, start1); - // long end2 = (orientation2 == 0) ? 
ai2[0] : ai2[1]; - // chainLinkHashCut2.put(key, end2); - // chainLinkHashCut2a.put(key, end2); - //} else - { - if (orientation1 == 0) { // + orientation - if (max1 == 1) { - chainLinkHashCut1.put(key, ai1[1]); // take whole alignment - chainLinkHashCut1a.put(key, ai1[0] - 1); - } - else { - chainLinkHashCut1.put(key, ai1[0] - 1); // cut just before alignment - chainLinkHashCut1a.put(key, ai1[1]); - } - } - else { // - orientation - if (max1 == 1) { - chainLinkHashCut1.put(key, ai1[0]); // whole - chainLinkHashCut1a.put(key, ai1[1] + 1); - } - else { - chainLinkHashCut1.put(key, ai1[1] + 1); // after - chainLinkHashCut1a.put(key, ai1[0]); - } - } - if (orientation2 == 0) { // + orientation - if (max1 == 1) { - chainLinkHashCut2.put(key, ai2[1] + 1); // after - chainLinkHashCut2a.put(key, ai2[0]); - } - else { - chainLinkHashCut2.put(key, ai2[0]); // whole - chainLinkHashCut2a.put(key, ai2[1] + 1); - } - } - else { // - orientation - if (max1 == 1) { - chainLinkHashCut2.put(key, ai2[0] - 1); // before - chainLinkHashCut2a.put(key, ai2[1]); - } - else { - chainLinkHashCut2.put(key, ai2[1]); // whole - chainLinkHashCut2a.put(key, ai2[0] - 1); - } - } - } - } - } - } - } - // Find largest index i, where array[i] <= value, array sorted ascending order - private int binarySearchR(ArrayList array, int value){ - int right = array.size() - 1; - if (right < 0) - return -1; - int left = 0; - - while (right > left + 1) { - int mid = (left + right) / 2; - int am = array.get(mid); - if (am > value) - right = mid - 1; - else - left = mid + ((am == value) ? 
0 : 1); - } - if (array.get(right) <= value) - return right; - if (array.get(left) <= value) - return left; - return left - 1; - } - - // Find smallest index i, where array[i] >= value, array sorted ascending order - private int binarySearchL(ArrayList array, int value){ - int right = array.size() - 1; - if (right < 0) - return 0; - int left = 0; - - while (right > left + 1) { - int mid = (left + right) / 2; - int am = array.get(mid); - if (am < value) - left = mid + 1; - else - right = mid - ((am == value) ? 0 : 1); - } - if (array.get(left) >= value) - return left; - if (array.get(right) >= value) - return right; - return right + 1; - } - - //calculate how many markers in array between start and stop - // array sorted - private int calculateNumMarkers(ArrayList array, int start, int end) - { - return (binarySearchR(array, end) - binarySearchL(array, start) + 1); - } - - private long[] getStartEnd1(ArrayList alignment) - { - long first[] = alignment.get(0); - long last[] = alignment.get(alignment.size() - 1); - return new long[]{first[0], last[0] + last[2] - 1}; - } - - private long[] getStartEnd2(ArrayList alignment, boolean sameOrientation) - { - long first[] = alignment.get(0); - long last[] = alignment.get(alignment.size() - 1); - if (sameOrientation) - return new long[]{first[1], last[1] + last[2] - 1}; - else - return new long[]{last[1] - last[2] + 1, first[1]}; - } - - // - private double trimChain(ArrayList alignment, boolean sameOrientation, ContigInterval ci1, ContigInterval ci2, ArrayList ret) { - if (alignment.size() == 0) - return 0.0; - - long ap1[] = getStartEnd1(alignment); - long ap2[] = getStartEnd2(alignment, sameOrientation); - - // if contig intervals and alignment do not intersect, we are done here... 
- if (!Misc.intersectIntervals(ap1[0], ap1[1], ci1.getMinStart(), ci1.getMaxEnd()) || !Misc.intersectIntervals(ap2[0], ap2[1], ci2.getMinStart(), ci2.getMaxEnd())) - return 0.0; - long trimLength1[] = new long[2]; - - if (ap1[0] < ci1.getMinStart()) // alignment spans too far - trimLength1[0] = ci1.getMinStart() - ap1[0]; - if (ap1[1] > ci1.getMaxEnd()) // alignment spans too far - trimLength1[1] = ap1[1] - ci1.getMaxEnd(); - - long trimLength2[] = new long[2]; - - - if (ap2[0] < ci2.getMinStart()) // alignment spans too far - trimLength2[0] = ci2.getMinStart() - ap2[0]; - if (ap2[1] > ci2.getMaxEnd()) // alignment spans too far - trimLength2[1] = ap2[1] - ci2.getMaxEnd(); - - if (!sameOrientation) { - long tmp = trimLength2[0]; - trimLength2[0] = trimLength2[1]; - trimLength2[1] = tmp; - } - - long tl1 = trimLength1[0] + trimLength2[0]; - long tl2 = trimLength1[1] + trimLength2[1]; - if (tl1 + tl2 > 0) { - //System.err.println("Alignment of " + ci1.getContig() + " and " + ci2.getContig() + " needs trimming of " + (tl1 + tl2)); - - long first[] = alignment.get(0); - long last[] = alignment.get(alignment.size() - 1); - //System.err.println(first[0] + "\t" + first[1] + "\t" + last[0] + "\t" + last[1] + "\t" + last[2] + "\t" + ci1 + "\t" + ci2); - - long alignL = 0; - for (long[] a : alignment) - alignL += a[2]; - - int t1 = 0; - long wt1 = 0; - long totalW1 = 0; - if (tl1 > 0) { - for (long[] a : alignment) { - long w = a[2]; - long dt1 = a[0] - ap1[0]; - long dt2 = ((sameOrientation) ? 
a[1] - ap2[0] : ap2[1] - a[1]); - if (dt1 + w - 1 >= trimLength1[0] && dt2 + w - 1 >= trimLength2[0]) { - if (dt1 >= trimLength1[0] && dt2 >= trimLength2[0]) { - wt1 = 0; - //System.err.print("*full "); - } else { - wt1 = Math.max(trimLength1[0] - dt1, trimLength2[0] - dt2); - //System.err.print("*"+ wt1 + " of " + w + " "); - } - totalW1 += wt1; - //System.err.println("trimming " + t1 + " blocks (of " + alignment.size() + ") with length " + dt1 + "\t" + dt2 + "\t" + ((alignL - totalW1) / (double) alignL)); - - break; - } else - totalW1 += w; - ++t1; - } - } - - int t2 = 0; - long wt2 = 0; - long totalW2 = 0; - if (tl2 > 0) { - for (int ai = alignment.size() - 1; ai >= 0; --ai) { - long[] a = alignment.get(ai); - long w = a[2]; - - long p1 = a[0] + w - 1; - long p2 = ((sameOrientation) ? a[1] + w - 1 : a[1] - w + 1); - - long dt1 = ap1[1] - p1; - long dt2 = ((sameOrientation) ? ap2[1] - p2 : p2 - ap2[0]); - - if (dt1 + w - 1 >= trimLength1[1] && dt2 + w - 1 >= trimLength2[1]) { - if (dt1 >= trimLength1[1] && dt2 >= trimLength2[1]) { - wt2 = 0; - //System.err.print("full "); - } else { - wt2 = Math.max(trimLength1[1] - dt1, trimLength2[1] - dt2); - //System.err.print(wt2 + " of " + w + " "); - } - totalW2 += wt2; - //System.err.println("trimming " + t2 + " blocks (of " + alignment.size() + ") with length " + dt1 + "\t" + dt2 + "\t" + ((alignL - totalW2) / (double) alignL)); - - break; - } else - totalW2 += w; - ++t2; - } - } - if (totalW1 + totalW2 >= alignL) { - //System.err.println("all trimmed of a chain"); - return 0.0; // all trimmed off - } else { - ret.clear(); - int n = alignment.size(); - for (int i = t1; i < n - t2; ++i) { - long a[] = alignment.get(i); - if (i == t1 && wt1 > 0) { - ret.add(new long[]{a[0] + wt1, a[1] + ((sameOrientation) ? 
+ wt1 : -wt1), a[2] - wt1}); - } else if (i == n - t2 - 1 && wt2 > 0) { - ret.add(new long[]{a[0], a[1], a[2] - wt2}); - } else - ret.add(a); - } - return (alignL - totalW1 - totalW2) / (double) alignL; - } - - } else { - //System.err.println("Alignment of " + ci1.getContig() + " and " + ci2.getContig() + " does not need trimming"); - return 1.0; - } - //total length of aligning bases... - } - - public void loadChain(String fn){ - int numAlignments = 0; - /* - //calculate data structure to get the number of marker in each interval in O(log(n)) time - - HashMap> markerHash = new HashMap>(); - for (ContigInterval ci : intervalsInChr) { - ArrayList markers = new ArrayList(); - markerHash.put(ci.getContig(), markers); - for (int map = 0; map < numMaps; ++map) - for (Marker m : ci.getMarkers(map)) - markers.add(m.getPosition()); - Collections.sort(markers); - }*/ - - System.err.println("loading alignment chain..."); - try { - BufferedReader br = null; - if (fn.equals("-")) - br = new BufferedReader(new InputStreamReader(System.in)); - else - br = new BufferedReader(new FileReader(fn)); - do { - ArrayList row = Input.loadTableRow(br, "\t "); - if (row == null) - break; - boolean skip = true; - if (row.size() >= 12 && "chain".equals(row.get(0))) { - String contig1 = row.get(2); - String contig2 = row.get(7); - - if (bedHash.containsKey(contig1) && bedHash.containsKey(contig2)) { - skip = false; - long p1_[] = chain2OneBase(row.get(3), row.get(4), row.get(5), row.get(6)); - long p2_[] = chain2OneBase(row.get(8), row.get(9), row.get(10), row.get(11)); - if ("-".equals(row.get(4))) { - System.err.println("Error: only ++, and +- orientation allowed in the chain file"); - continue; - } - double ascore_ = scaleScore * Double.parseDouble(row.get(1)); // alignment score - - //int numMarkers1 = calculateNumMarkers(markerHash.get(contig1), p1[0], p1[1]); - //int numMarkers2 = calculateNumMarkers(markerHash.get(contig2), p2[0], p2[1]); - //System.err.println(contig1 + "\t" + p1[0] + 
" " + p1[1] + " " + numMarkers1); - //System.err.println(contig2 + "\t" + p2[0] + " " + p2[1] + " " + numMarkers2); - - boolean sameOrientation = row.get(4).equals(row.get(9)); - - // load alignment for liftover... - ArrayList alignment_ = new ArrayList(); - ArrayList row2 = null; - long v1 = p1_[0]; - long v2 = ((sameOrientation) ? p2_[0] : p2_[1]); - int v3 = 0; - do { - row2 = Input.loadTableRow(br, "\t "); - - v3 = Integer.parseInt(row2.get(0)); - alignment_.add(new long[]{v1, v2, v3}); - v1 += v3; - v2 += ((sameOrientation) ? v3 : -v3); - if (row2.size() >= 3) { - v1 += Integer.parseInt(row2.get(1)); - v2 += ((sameOrientation) ? Integer.parseInt(row2.get(2)) : -Integer.parseInt(row2.get(2))); - } - } while (row2 != null && row2.size() >= 3); - - ArrayList revAlignment_ = reverseAlignment(alignment_, sameOrientation); - - //System.err.println(row); - //for (int[] a : alignment) - // System.err.println(a[0] + "\t" + a[1] + "\t" + a[2]); - - //for (int i = 1; i<=100; i+=1) - // System.err.println(mapPosition12(alignment, i, sameOrientation)); - //System.err.println("*"); - //for (int i = 1; i<=100; i+=1) - // System.err.println(mapPosition21(alignment, i, sameOrientation)); - - //if (true) - // System.exit(-1); - - //check here - - for (ContigInterval ci1 : bedHash.get(contig1)) - for (ContigInterval ci2 : bedHash.get(contig2)) { - - ArrayList alignment = new ArrayList(); - - double t = trimChain(alignment_, sameOrientation, ci1, ci2, alignment); - if (t == 0.0) - continue; - - ArrayList revAlignment = new ArrayList(); - long p1[]; - long p2[]; - double ascore = t * ascore_; - - if (t < 1.0) { - double t2 = trimChain(revAlignment_, sameOrientation, ci2, ci1, revAlignment); - if (t != t2) { - System.err.println("Error in parsing alignment chain:" + t +"!="+ t2); - System.exit(-1); - } - - p1 = getStartEnd1(alignment); - p2 = getStartEnd2(alignment, sameOrientation); - if (p2[0] > p2[1]) { - long tmp = p2[0]; - p2[0] = p2[1]; - p2[1] = tmp; - } - - } else { - 
alignment = alignment_; - revAlignment = revAlignment_; - p1 = p1_; - p2 = p2_; - } - for (int orientation1 = 0; orientation1 < 2; ++orientation1) { - //calculate haplotype scores... - //ci2 is the haplotype of ci1? - addHaplotype(ci1, orientation1, ci2, p2, ascore, alignment, sameOrientation); - //ci1 is the haplotype of ci2? - //use orientation1 for ci2 as well... (1 - orientation is not really needed) - addHaplotype(ci2, 1 - orientation1, ci1, p1, ascore, revAlignment, sameOrientation); - - for (int orientation2 = 0; orientation2 < 2; ++orientation2) { // four combinations, ++/+-/-+/-- - //add partial haplotype link scores - addChainLink(ci1, orientation1, ci2, orientation2, p1, p2, ascore, alignment, sameOrientation); - //add partial haplotype link scores (1 - orientation is not really needed) - addChainLink(ci2, 1 - orientation2, ci1, 1 - orientation1, p2, p1, ascore, revAlignment, sameOrientation); - //System.err.println("hiphei"); - } - } - } - - ++numAlignments; - } - } else { - if (skip && (row.size() == 3 || row.size() == 1)) - ; - else - System.err.println("Warning: skipping " + row); - } - } while (true); - br.close(); - } catch (Exception e) { - e.printStackTrace(); - System.err.println("Error in file " + fn); - } - System.err.println("loading " + numAlignments + " alignments"); - - System.err.println("Alignment links:"); - for (String key:chainLinkHash.keySet()) - System.err.println(key + "\t" + chainLinkHash.get(key)); - } - - public void addMap(String fn, boolean flip, boolean nochromosome, boolean nointervals, int chromosome) - { - - ArrayList markers = InputData.loadMap(fn, nochromosome, nointervals, compressMap); - - if (flip) { - flip(markers); - } - - for (Marker m : markers) - if (chromosome <= 0 || m.getChromosome() == chromosome) { - boolean inside = false; - if (bedHash.containsKey(m.contig)) - for (ContigInterval ci : bedHash.get(m.getContig())) { - if (ci.inside(m.position)) { - inside = true; - ci.addMarker(m, numMaps); - break; - } - } 
- if (!inside) { -// System.err.println(m); -// System.err.println("Warning: marker position not listed in the bed file"); - //System.exit(-1); - } - } - System.err.println("#map = " + fn); - System.err.println("contig\tstart_pos\tend_pos\tscore+orientation\tscore-orientation"); - for (ContigInterval ci : bed) { - if (ci.markers.size() > numMaps) { - Collections.sort(ci.markers.get(numMaps)); // Put positions into physical order - - int scores[] = solve(ci.markers.get(numMaps)); // Solve marker positions by dynamic programming... - //ci.scores.set(numMaps, scores); - - System.err.println(ci + "\t" + scores[0] + "\t" + scores[1]); - } - } - ++numMaps; - - int maxChromosome = 0; - for (ContigInterval ci : bed) - maxChromosome = Math.max(maxChromosome, ci.getChromosome()); - - int minChromosome = maxChromosome; - for (ContigInterval ci : bed) - if (ci.getChromosome() > 0) - minChromosome = Math.min(minChromosome, ci.getChromosome()); - - if (minChromosome != maxChromosome) { - System.err.println("Error: Only one chromosome allowed in the input maps!"); - System.exit(-1); - } - - intervalsInChr = new ArrayList(); - - for (ContigInterval ci : bed) { - if (ci.getChromosome() == maxChromosome || ci.getChromosome() == 0) - intervalsInChr.add(ci); - } - - } - - public void combineMaps(boolean findOrientation, boolean randomOrder, boolean finalImprove) - { - //(remove) check intervals without any markers... - ArrayList contigsWithMarkers = new ArrayList(); - - ArrayList contigsWithoutMarkers = new ArrayList(); - - for (ContigInterval ci : intervalsInChr) { - if (ci.markers.size() > 0) - contigsWithMarkers.add(ci); - else { - contigsWithoutMarkers.add(ci); - System.err.println("Contig " + ci + " does not have any markers"); - } - } - intervalsInChr = contigsWithMarkers; - if (keepEmptyIntervals) - for (ContigInterval ci : contigsWithoutMarkers) { - intervalsInChr.add(ci); - } - - - - if (randomOrder) { // destroy initial solution... 
- Collections.shuffle(intervalsInChr); - for (ContigInterval c : intervalsInChr) - if (Math.random() < 0.5) - c.flipOrientation(); - } else { // find contig order by ordering contigs on the most abundant marker - - ArrayList allMarkers = getMarkers(intervalsInChr, null, 0); - - int maxBin = getMaxBin(allMarkers); - int minBin = getMinBin(allMarkers); - - //int n = intervalsInChr.size(); - ArrayList sortValues = new ArrayList(); - - for (ContigInterval ci : intervalsInChr) { - ArrayList markersInFirstMap = ((ci.markers.size() > 0) ? ci.markers.get(0) : new ArrayList()); - int s[] = solve(markersInFirstMap); - if (s[1] > s[0] || (s[1] == s[0] && Math.random() < 0.5)) - ci.flipOrientation(); - - HashMap hm = new HashMap(); - - for (Marker m : markersInFirstMap) { - int numI = m.intervals.length; - for (int i = 0; i < numI; i+=2) - for (int p = m.intervals[i]; p <= m.intervals[i + 1]; ++p) { - Integer oldValue = hm.get(p); - if (oldValue == null) - hm.put(p, 1); - else - hm.put(p, oldValue + 1); - } - } - - int maxCount = -1; - int bin = 0; - int numMaxCount = 0; - for (int p : hm.keySet()) { - int count = hm.get(p); - if (count > maxCount) { - numMaxCount = 1; - bin = p; - maxCount = count; - } else if (count == maxCount) { - ++numMaxCount; - if (Math.random() * numMaxCount < 1.0) // randomly pick one of multiple bins with maximum count - bin = p; - } - } - if (maxCount > 0) // there was at least one marker - sortValues.add(bin + Math.random()); //bin + rand[0,1] - else // no markers - sortValues.add(Math.random() * (maxBin - minBin + 1) + minBin); - } - Misc.ArrayIndexComparator comparator = new Misc.ArrayIndexComparator(sortValues); - Integer[] indexes = comparator.createIndexArray(); - Arrays.sort(indexes, comparator); - - ArrayList cis_new = new ArrayList(); - for (int i : indexes) - cis_new.add(intervalsInChr.get(i)); - intervalsInChr = cis_new; - } - if (findOrientation && numMaps > 1) { - String orientation = "+"; - String scores = ""; - - //disable chains - 
HashMap oldChainLinkHash = chainLinkHash; - chainLinkHash = new HashMap(); - - //use only first map - int oldNumMaps = numMaps; - numMaps = 1; - - ArrayList cis_new = new ArrayList(); - - for (ContigInterval ci : intervalsInChr) { - if (ci.markers.size() > 0 && ci.markers.get(0).size() > 0) - cis_new.add(ci); - } - - //Collections.shuffle(cis_new); // random order as init (could be based on most common marker in each contig...) - - for (int map = 1; map < oldNumMaps; ++map) { - int oldNumRuns = numRuns; - if (map == 1) - numRuns = Math.min(2, oldNumRuns); - else - numRuns = 1; // this is probably enough... - improveAnchoring(cis_new, false, true); - numRuns = oldNumRuns; - - numMaps = map + 1; - - int score1 = calculateScore(cis_new, numMaps, false); - - flip(getMarkers(intervalsInChr, null, map)); - - int score2 = calculateScore(cis_new, numMaps, false); - - if (score1 >= score2) { - flip(getMarkers(intervalsInChr, null, map)); - orientation += " +"; - } else - orientation += " -"; - - scores = scores + " " + (score1 - score2); - - - //add new markers to cis_new to random places... - - ArrayList cis_new2 = new ArrayList(); - out: for (ContigInterval ci : intervalsInChr) { - if (ci.markers.size() > map && ci.markers.get(map).size() > 0) { - for (int m = 0; m < map; ++m) - if (ci.markers.get(m).size() > 0) - continue out; - cis_new2.add(ci); - } - } - //Collections.shuffle(cis_new2); no need to shuffle... 
- cis_new = Misc.randomMerge(cis_new, cis_new2); - } - //System.out.println(intervalsInChr.size() + "\t" + cis_new.size() ); - - //intervalsInChr.clear(); - //intervalsInChr.addAll(cis_new); - - //finally add contigs without markers - for (ContigInterval ci : intervalsInChr) - if (ci.markers.size() == 0) - cis_new.add(ci); - - assert(intervalsInChr.size() == cis_new.size()); - - - intervalsInChr = cis_new; - - //enable chains - chainLinkHash = oldChainLinkHash; - - System.out.println("#found orientation=" + orientation + " ( support " + scores + " )"); - } - if (finalImprove) - improveAnchoring(); - } - - private ArrayList getMarkers(ArrayList cis, ContigInterval excluded, int map) - { - ArrayList ret = new ArrayList(); - for (ContigInterval ci : cis) - if (ci != excluded) - ret.addAll(ci.getMarkers(map)); - return ret; - } - - private String calculateKey(ContigInterval prev, boolean orientationPrev, ContigInterval ci, boolean orientationCi){ - return prev.toString() + ((orientationPrev) ? '+':'-') + ci.toString() + ((orientationCi) ? 
'+':'-'); - } - - private String calculateKey(ContigInterval prev, ContigInterval ci){ - return calculateKey(prev, (prev.getOrientation() >= 0), ci, (ci.getOrientation() >= 0)); - } - - //chainScore with possible flips for prev and ci - private int calculateChainScore(ContigInterval prev, boolean flipPrev, ContigInterval ci, boolean flipCi) { - String key = calculateKey(prev, (prev.getOrientation() >= 0) ^ flipPrev, ci, (ci.getOrientation() >= 0) ^ flipCi); - return calculateChainScore(key); - } - private int calculateChainScore(ContigInterval prev, ContigInterval ci) { - String key = calculateKey(prev, ci); - return calculateChainScore(key); - } - - private int calculateChainScore(String key) { - if (chainLinkHash.containsKey(key)) { - int ret = chainLinkHash.get(key); - if (useChainAndPaf && scaffoldingLink.containsKey(key)) - ret += scaffoldingLink.get(key); - return ret; - } - if (scaffoldingLink.containsKey(key)) - return scaffoldingLink.get(key); - return 0; - } - - // haplotype is haplotype of ofHaplotype? 
- private int calculateChainScoreHaplotype(ContigInterval ofHaplotype, ContigInterval haplotype) { - String key = calculateKey(ofHaplotype, haplotype); - key = key.substring(0, key.length() - 1); // last orientation - - if (chainHaplotypeHash.containsKey(key)) - return chainHaplotypeHash.get(key); - return 0; - } - - private long[] calculateChainScoreHaplotypeCut(ContigInterval ofHaplotype, ContigInterval haplotype) { - String key = calculateKey(ofHaplotype, haplotype); - key = key.substring(0, key.length() - 1); // last orientation - - return chainHaplotypeCut.get(key); - } - - private int calculateChainScore(ArrayList cis) - { - int score = 0; - ContigInterval prev = null; - for (ContigInterval ci : cis) { - if (prev != null) - score += calculateChainScore(prev, ci); - prev = ci; - } - return score; - } - - private int calculateProximityScore(ArrayList cis) - { - if (prox != null) - return prox.score(cis); - return 0; - } - - private int evaluateScore(ArrayList eval, boolean improve) - { - ArrayList eval2 = new ArrayList(); - for (ContigInterval e : eval) { - boolean found = false; - for (ContigInterval ci : intervalsInChr) { - if (ci.equals(e)) { - eval2.add(ci); - ci.setOrientation(e.getOrientation()); - found = true; - break; - } - } - if (!found) { - System.err.println("Contig " + e + " not found"); - } - } - - if (improve) { - int score = calculateScore(eval2, false) + calculateNonMarkerScore(eval2); - //System.out.print("#initial score " + score + " "); - improveAnchoring(eval2, true, true); - } else { - ArrayList orderSupport = calculateSupport(eval2); - int score = calculateScore(eval2, false) + calculateNonMarkerScore(eval2); - System.out.println("#final score " + score); - printAnchoring(eval2, orderSupport); - findLikelyAssemblyErrors(eval2); - } - //change bed to the evaluated order... 
- intervalsInChr.clear(); - intervalsInChr.addAll(eval2); - - return calculateScore(eval2, false) + calculateNonMarkerScore(eval2); -/* - int score1 = calculateScore(eval2, true); - for (int map = 0; map < numMaps; ++map) - flip(getMarkers(intervalsInChr, null, map)); - int score2 = calculateScore(eval2, true); - - return Math.max(score1, score2);*/ - } - - - private int calculateScoreFast(ArrayList cis) - { - int score = 0; - for (int map = 0; map < numMaps; ++map) { - int s = solveForwardFast(getMarkers(cis, null, map)); - score += s; - } - return score; - } - - - private int calculateScore(ArrayList cis, boolean verbose) - { - return calculateScore(cis, numMaps, verbose); - } - - private int calculateScore(ArrayList cis, int numMaps, boolean verbose) - { - int score = 0; - for (int map = 0; map < numMaps; ++map) { - int s = solveForward(getMarkers(cis, null, map)); - if (verbose) - System.err.println("map " + map + " score " + s); - score += s; - } - return score; - } - - private int getMaxBin(ArrayList markers) { - int maxBin = Integer.MIN_VALUE; - for (Marker m : markers) - maxBin = Math.max(maxBin, m.maxBin()); - if (maxBin != Integer.MIN_VALUE) - return maxBin; - else - return 0; - } - - private int getMinBin(ArrayList markers) { - int minBin = Integer.MAX_VALUE; - for (Marker m : markers) - minBin = Math.min(minBin, m.minBin()); - if (minBin != Integer.MAX_VALUE) - return minBin; - else - return 0; - } - - //TODO: Remove calculateChainScore(cis) and calculateChainScore(c1, c2) add calculateNonMarkerScore(...) - private ArrayList calculateSupport(ArrayList cis) { - - //for (int iteration = 0; iteration < 1; ++iteration) { // add here "iteration < 2" if possible orientation errors should be corrected... 
- - - - //ArrayList score1 = new ArrayList(); - ArrayList score2 = new ArrayList(); - - ArrayList order1 = new ArrayList(); - ArrayList order2 = new ArrayList(); - - int rank = 0; - for (ContigInterval ci : cis) { - ci.setRank(rank++); - score2.add(0); - order1.add(0); - order2.add(0); - } - - for (int map = 0; map < numMaps; ++map) { - ArrayList markers = getMarkers(cis, null, map); - - int endBin = getMaxBin(markers); - int startBin = getMinBin(markers); - - int numMarkers = markers.size(); - int S[][] = new int[numMarkers + 1][endBin - startBin + 1]; - int P[][] = new int[numMarkers + 1][endBin - startBin + 1]; - - int Sb[][] = new int[numMarkers + 1][endBin - startBin + 1]; - //int Pb[][] = new int[numMarkers + 1][endBin - startBin + 1]; - - int Sc[][] = new int[numMarkers + 1][endBin - startBin + 1]; - int Pc[][] = new int[numMarkers + 1][endBin - startBin + 1]; - - //simple way to calculate (about correct) backward table... - Collections.reverse(markers); - forwardFast2(markers, Sb, startBin, endBin); - Collections.reverse(markers); - - forward1(markers, S, P, startBin, endBin); - int maxScore = -1; - { - int maxI = startBin; - for (int i = startBin; i <= endBin; ++i) - if (S[numMarkers][i - startBin] > maxScore) { - maxScore = S[numMarkers][i - startBin]; - maxI = i; - } - - for (int mi = numMarkers; mi > 0; --mi) { - markers.get(mi - 1).pPlus = maxI; - //System.err.println(markers.get(mi - 1) + "\t" + maxI); - maxI = P[mi][maxI - startBin]; - } - } - - ArrayList> markerIndex = new ArrayList>(); - for (ContigInterval ci : cis) - markerIndex.add(new ArrayList()); - - for (int mi = 0; mi < numMarkers; ++mi) { - Marker m = markers.get(mi); - rank = m.ci.getRank(); - markerIndex.get(rank).add(mi); - } - for (ContigInterval ci : cis) { - rank = ci.getRank(); - - ArrayList cMarkers = new ArrayList(); - - for (int i : markerIndex.get(rank)) - cMarkers.add(markers.get(i)); - - int maxB = Integer.MIN_VALUE; - int minB = Integer.MAX_VALUE; - - int numCMarkers = 
cMarkers.size(); - - if (numCMarkers > 0) { - for (Marker m : cMarkers) { - maxB = Math.max(maxB, m.pPlus); - minB = Math.min(minB, m.pPlus); - } - if (maxB > minB) - order1.set(rank, order1.get(rank) + 1); - - Collections.reverse(cMarkers); - int start = markerIndex.get(rank).get(0); - int end = markerIndex.get(rank).get(numCMarkers - 1) + 1; - - for (int b = startBin; b <= endBin; ++b) { - Sc[0][b - startBin] = S[start][b - startBin]; - Pc[0][b - startBin] = P[start][b - startBin]; - } - - forward1(cMarkers, Sc, Pc, startBin, endBin); - - - int Scc[] = Sc[cMarkers.size()]; - for (int b = startBin + 1; b <= endBin; ++b) { // make table non-decreasing - if (Scc[b - startBin] < Scc[b - startBin - 1]) - Scc[b - startBin] = Scc[b - startBin - 1]; - } - int maxI = startBin; - int max = -1; - for (int b = startBin; b <= endBin; ++b) { - int s = Scc[b - startBin] + Sb[numMarkers - end][b - startBin]; - if (s > max) { - max = s; - maxI = b; - } - } - score2.set(rank, score2.get(rank) + max); - { - for (int mi = numCMarkers; mi > 0; --mi) { - cMarkers.get(mi - 1).pPlus = maxI; - //System.err.println(markers.get(mi - 1) + "\t" + maxI); - maxI = Pc[mi][maxI - startBin]; - } - } - maxB = Integer.MIN_VALUE; - minB = Integer.MAX_VALUE; - for (Marker m : cMarkers) { - maxB = Math.max(maxB, m.pPlus); - minB = Math.min(minB, m.pPlus); - } - if (maxB > minB) - order2.set(rank, order2.get(rank) + 1); - } else { - score2.set(rank, score2.get(rank) + maxScore); - } - } - } - int oldNonMarkerScore = calculateNonMarkerScore(cis); - int oldScore = calculateScore(cis, false) + oldNonMarkerScore; - for (ContigInterval ci : cis) { - rank = ci.getRank(); - int newNonMarkerScore = oldNonMarkerScore + nonMarkerScoreChange(cis, rank, rank, 0, true, 0); - score2.set(rank, score2.get(rank) + newNonMarkerScore); - } - - ArrayList ret = new ArrayList(); - //boolean changes = false; - for (ContigInterval c : cis) { - rank = c.getRank(); - int support = oldScore - score2.get(rank); - int order = 0; - 
if (support == 0 && order1.get(rank) > order2.get(rank)) { - if (c.getOrientation() >= 0) - order = 1; - else - order = -1; - } - else if (support == 0 && order1.get(rank) < order2.get(rank)) { - if (c.getOrientation() >= 0) - order = -1; - else - order = 1; - } - //if (support < 0) { - // c.flipOrientation(); - // changes = true; - //} - //System.err.println("order_support\t" + c +"\t" + ((support == 0) ? 0.5 * order: support)); - ret.add(((support == 0) ? (0.5 * order): support)); - } - //if (!changes) - // break; - //} - return ret; - } - - //flip(i..cr) - //move(i...cr) to new_pos w/wo flipping - //move cr to new position - - private int calculateNonMarkerScore(ArrayList cis) - { - return calculateChainScore(cis) + calculateProximityScore(cis); - } - - - //start...end => start+moveDirection...end+moveDirection - public static void changeOrder(ArrayList cis, int start, int end, int moveDirection, boolean flip) - { - int n = cis.size(); - if (end + moveDirection >= n || start + moveDirection < 0) { - System.err.println("change too large"); - return; - } - if (moveDirection == 0 && !flip) // nothing to do - return; - - ArrayList cis_new = new ArrayList(); - if (moveDirection <= 0) { - for (int i = 0; i < start + moveDirection; ++i) - cis_new.add(cis.get(i)); - - if (!flip) - for (int i = start; i <= end; ++i) - cis_new.add(cis.get(i)); - else - for (int i = end; i >= start; --i) { //reverse and flip - ContigInterval ci = cis.get(i); - ci.flipOrientation(); - cis_new.add(ci); - } - - for (int i = start + moveDirection; i < start; ++i) - cis_new.add(cis.get(i)); - for (int i = end + 1; i < n; ++i) - cis_new.add(cis.get(i)); - } else { - for (int i = 0; i < start; ++i) - cis_new.add(cis.get(i)); - - for (int i = 1; i <= moveDirection; ++i) - cis_new.add(cis.get(i + end)); - - if (!flip) - for (int i = start; i <= end; ++i) - cis_new.add(cis.get(i)); - else - for (int i = end; i >= start; --i) { //reverse and flip - ContigInterval ci = cis.get(i); - 
ci.flipOrientation(); - cis_new.add(ci); - } - - for (int i = end + moveDirection + 1; i < n; ++i) - cis_new.add(cis.get(i)); - } - cis.clear(); - cis.addAll(cis_new); - } - - //reverse of changeOrder - public static void changeOrderReverse(ArrayList cis, int start, int end, int moveDirection, boolean flip) - { - changeOrder(cis, start + moveDirection, end + moveDirection, -moveDirection, flip); - } - public int nonMarkerScoreChange(ArrayList cis, int start, int end, int moveDirection, boolean flip, int flipScore) - { - int ret = ((flip) ? flipScore : 0); - if (moveDirection == 0) { // flip - if (flip) { - if (start > 0) { - ret -= calculateChainScore(cis.get(start - 1), cis.get(start)); - ret += calculateChainScore(cis.get(start - 1), false, cis.get(end), true); - } - if (end + 1 < cis.size()) { - ret -= calculateChainScore(cis.get(end), cis.get(end + 1)); - ret += calculateChainScore(cis.get(start), true, cis.get(end + 1), false); - } - } // else nothing to do... - } else { // move one or multiple contigs... 
- ContigInterval sc = cis.get(start); - ContigInterval ec = cis.get(end); - - if (start > 0) { - ret -= calculateChainScore(cis.get(start - 1), sc); - if (end + 1 < cis.size()) - ret += calculateChainScore(cis.get(start - 1), cis.get(end + 1)); - } - if (end + 1 < cis.size()) - ret -= calculateChainScore(ec, cis.get(end + 1)); - - if (moveDirection <= 0) { - if (flip) { - if (start + moveDirection > 0) { - ret -= calculateChainScore(cis.get(start + moveDirection - 1), cis.get(start + moveDirection)); - ret += calculateChainScore(cis.get(start + moveDirection - 1), false, ec, true); - } - ret += calculateChainScore(sc, true, cis.get(start + moveDirection), false); - } else { - if (start + moveDirection > 0) { - ret -= calculateChainScore(cis.get(start + moveDirection - 1), cis.get(start + moveDirection)); - ret += calculateChainScore(cis.get(start + moveDirection - 1), sc); - } - ret += calculateChainScore(ec, cis.get(start + moveDirection)); - } - } else { // moveDirection > 0 - if (flip) { - if (end + moveDirection + 1 < cis.size()) { - ret -= calculateChainScore(cis.get(end + moveDirection), cis.get(end + moveDirection + 1)); - ret += calculateChainScore(sc, true, cis.get(end + moveDirection + 1), false); - } - ret += calculateChainScore(cis.get(end + moveDirection), false, ec, true); - } else { - if (end + moveDirection + 1 < cis.size()) { - ret -= calculateChainScore(cis.get(end + moveDirection), cis.get(end + moveDirection + 1)); - ret += calculateChainScore(ec, cis.get(end + moveDirection + 1)); - } - ret += calculateChainScore(cis.get(end + moveDirection), sc); - } - } - } - if (prox != null) - ret += prox.scoreChange(cis, start, end, moveDirection, flip); - return ret; - } - - private boolean linkedIntervals(ArrayList cis, int c1, int c2) - { - return (calculateChainScore(cis.get(c1), cis.get(c2)) > 0 || (prox != null && prox.linkedIntervals(cis, c1, c2))); - } - - //join contig to linked contigs next to it and try to improve... - //not thread safe... 
fix?: separate scaffoldingLink initialisation - private int calculateBestLinkedContig(ArrayList cis, int cr, boolean verbose){ - - int start = cr; - if (start > 0 && linkedIntervals(cis, start - 1, start)) - return 0; // only start from the first contig of each linked chain of contigs - - int numCis = cis.size(); - - int end = cr; - while (end + 1 < numCis && linkedIntervals(cis, end, end + 1)) - ++end; - - boolean mapMarkers = false; - for (int i = start; i <= end; ++i) - if (cis.get(i).markers.size() > 0) { - mapMarkers = true; - break; - } - if (!mapMarkers) // no need to go further without markers... - return 0; - - int numLinked = end - start + 1; - - if (start < cr || start == end || numLinked == numCis) // starts at c, at least two ContigIntervals and not connected to all - return 0; - - ContigInterval ci_new = new ContigInterval("\t", 1, 1); // dummy name, this cannot be loaded from a file... - - int flipScore = 0; // calculate flipsScore to take into account possible decreasing score when flipping start..end - for (int i = start; i < end; ++i) - flipScore += calculateChainScore(cis.get(i + 1), true, cis.get(i), true) - calculateChainScore(cis.get(i), cis.get(i + 1)); - - //add scaffolding links, use scaffoldingLink_tmp not to mess other links... - scaffoldingLink_tmp.clear(); - for (int i = -1; i < numCis; ++i) - if (i < start || i > end) { - int moveDirection = ((i < start) ? i - start + 1 : i - end); - int nmss = nonMarkerScoreChange(cis, start, end, moveDirection, false, flipScore); - int nmss2 = nonMarkerScoreChange(cis, start, end, moveDirection, true, flipScore); - if (i >= 0) { - ContigInterval ci = cis.get(i); - if (moveDirection != 0) // with links for moveDirection==0, we end up updating start-1...end as well - scaffoldingLink_tmp.put(calculateKey(ci, ci_new), nmss); - scaffoldingLink_tmp.put(calculateKey(ci, ci.getOrientation() >= 0, ci_new, false), nmss2); - } else { // move to the very first position... - int firstMarker = ((start > 0) ? 
0 : end + 1); - ContigInterval ci = cis.get(firstMarker); - if (moveDirection != 0) // with links for moveDirection==0, we end up updating start...end+1 as well - scaffoldingLink_tmp.put(calculateKey(ci_new, ci), nmss); - scaffoldingLink_tmp.put(calculateKey(ci_new, false, ci, ci.getOrientation() >= 0), nmss2); - } - } - - //and markers - for (int map = 0; map < numMaps; ++map) { - for (int i = start; i <= end; ++i) { - for (Marker m : cis.get(i).getMarkers(map)) - ci_new.addMarker(new Marker(m), map); - } - } - - ArrayList cis_new = new ArrayList(); - for (int i = 0; i < start; ++i) - cis_new.add(cis.get(i)); - cis_new.add(ci_new); - for (int i = end + 1; i < numCis; ++i) - cis_new.add(cis.get(i)); - - //int os = this.calculateScore(cis, false) + calculateNonMarkerScore(cis); - //int ns2 = this.calculateScore(cis_new, false) + calculateNonMarkerScore(cis_new); - //int ns = this.calculateScore(cis_new, false) + calculateNonMarkerScore(cis_new); - //System.err.println("linked " + ret + " " + start + "," + end + " " +ns2 + " " + os + " " + ns); - - Proximity tmpProx = prox; - prox = null; // do not use proximity - HashMap tmpScaffoldingLink = scaffoldingLink; - scaffoldingLink = scaffoldingLink_tmp; - HashMap tmpChainLink = chainLinkHash; - chainLinkHash = new HashMap(); //nor chainLinks - - int ret = calculateBest(cis_new, ci_new, true); - - prox = tmpProx; - scaffoldingLink = tmpScaffoldingLink; - chainLinkHash = tmpChainLink; - - if (ret > 0) { // update cis - //System.err.println("linked improvement " + ret); - int crn = 0; - for (ContigInterval ci : cis_new) - if (ci == ci_new) - break; - else - ++crn; - - cis_new.clear(); - //int os = calculateScore(cis, false) + calculateNonMarkerScore(cis); - - for (int i = 0; i < crn; ++i) - if (i >= start) - cis_new.add(cis.get(i + numLinked)); - else - cis_new.add(cis.get(i)); - int orientation = ci_new.getOrientation(); - if (orientation >= 0) - for (int i = start; i <= end; ++i) - cis_new.add(cis.get(i)); - else - for 
(int i = end; i >= start; --i) { - cis.get(i).flipOrientation(); - cis_new.add(cis.get(i)); - } - for (int i = crn; i < numCis - numLinked; ++i) - if (i >= start) - cis_new.add(cis.get(i + numLinked)); - else - cis_new.add(cis.get(i)); - int ns = calculateScore(cis_new, false) + calculateNonMarkerScore(cis_new); - //if (os >= ns) { - // if (orientation < 0) - // for (int i = start; i <= end; ++i) - // cis.get(i).flipOrientation(); - // System.err.println("Error: Score not improving! " + start + "-" + end + "\t" + crn + "\t" + orientation); - // return 0; - //} - System.err.println("LINKED\t" + cis.get(start) + "\t" + (end - start + 1) + "\t" + ns); - cis.clear(); - cis.addAll(cis_new); - } - return ret; - } - //join contig to linked contigs next to it and try to improve... - //thread safe version... - private int calculateBestLinkedContig_ts(ArrayList cis, int cr, boolean verbose){ - - int start = cr; - if (start > 0 && linkedIntervals(cis, start - 1, start)) - return 0; // only start from the first contig of each linked chain of contigs - - int numCis = cis.size(); - - int end = cr; - while (end + 1 < numCis && linkedIntervals(cis, end, end + 1)) - ++end; - - int numLinked = end - start + 1; - - if (start < cr || start == end || numLinked == numCis) // starts at c, at least two ContigIntervals and not connected to all - return 0; - - boolean mapMarkers = false; - for (int i = start; i <= end; ++i) - if (cis.get(i).markers.size() > 0) { - mapMarkers = true; - break; - } - - if (!mapMarkers) // no need to go further without markers... 
- return 0; - - //update rank - int rank = 0; - for (ContigInterval ci : cis) - ci.setRank(rank++); - - int bestRes[] = new int[numCis + 1]; - int bestRes2[] = new int[numCis + 1]; - - for (int map = 0; map < numMaps; ++map) { - - ArrayList cMarkers = new ArrayList(); - for (int i = start; i <= end; ++i) - cMarkers.addAll(cis.get(i).getMarkers(map)); - - ArrayList markers = new ArrayList(); - for (int i = 0; i < start; ++i) - markers.addAll(cis.get(i).getMarkers(map)); - for (int i = end + 1; i < numCis; ++i) - markers.addAll(cis.get(i).getMarkers(map)); - - int endBin = getMaxBin(markers); - int startBin = getMinBin(markers); - - for (Marker m : cMarkers) { - endBin = Math.max(endBin, m.maxBin()); - startBin = Math.min(startBin, m.minBin()); - } - //System.err.println(startBin); - //System.err.println(endBin); - - int numMarkers = markers.size(); - int S[][] = new int[numMarkers + 1][endBin - startBin + 1]; - - int Sb[][] = new int[numMarkers + 1][endBin - startBin + 1]; - - int Sc[][] = new int[1 + cMarkers.size()][endBin - startBin + 1]; - - forwardFast1(markers, S, startBin, endBin); - //simple way to calculate (about correct) backward table... 
- Collections.reverse(markers); - forwardFast2(markers, Sb, startBin, endBin); - Collections.reverse(markers); - - int old = -1; - for (int mi = 0; mi <= numMarkers; ++mi) { - - if (mi < numMarkers) { - Marker m = markers.get(mi); // this is the next marker - numCis = m.ci.getRank(); - } else - numCis = cis.size(); - - if (numCis != old) { - for (int b = 0; b <= endBin - startBin; ++b) - Sc[0][b] = S[mi][b]; - - forwardFast1(cMarkers, Sc, startBin, endBin); - - int Scc[] = Sc[cMarkers.size()]; - for (int b = 1; b <= endBin - startBin; ++b) { // make table non-decreasing - if (Scc[b] < Scc[b - 1]) - Scc[b] = Scc[b - 1]; - } - int Sbb[] = Sb[numMarkers - mi]; - int max = -1; - for (int b = 0; b <= endBin - startBin; ++b) { - int s = Scc[b] + Sbb[b]; - if (s > max) - max = s; - } - //System.out.println(max); - for (int i = old + 1; i <= numCis; ++i) - bestRes[i] += max; - - Collections.reverse(cMarkers); - forwardFast1(cMarkers, Sc, startBin, endBin); - Collections.reverse(cMarkers); - - for (int b = 1; b <= endBin - startBin; ++b) { // make table non-decreasing - if (Scc[b] < Scc[b - 1]) - Scc[b] = Scc[b - 1]; - } - max = -1; - for (int b = 0; b <= endBin - startBin; ++b) { - int s = Scc[b] + Sbb[b]; - if (s > max) - max = s; - } - //System.out.println(max); - for (int i = old + 1; i <= numCis; ++i) - bestRes2[i] += max; - } - old = numCis; - } - } - - numCis = cis.size(); - int flipScore = 0; // calculate flipsScore to take into account possible decreasing score when flipping start..end - for (int i = start; i < end; ++i) - flipScore += calculateChainScore(cis.get(i + 1), true, cis.get(i), true) - calculateChainScore(cis.get(i), cis.get(i + 1)); - - int oldNonMarkerScore = calculateNonMarkerScore(cis); - int oldScore = calculateScore(cis, false) + oldNonMarkerScore; - for (int ci = 0; ci <= numCis; ++ci) { - int mD = ci - start; - if (ci > start) - if (ci > end) - mD = ci - end - 1; - else - mD = 0; - bestRes[ci] += oldNonMarkerScore + nonMarkerScoreChange(cis, 
start, end, mD, false, flipScore); - bestRes2[ci]+= oldNonMarkerScore + nonMarkerScoreChange(cis, start, end, mD, true, flipScore); - } - //for (int ci = 0; ci <= cis.size(); ++ci) { - // System.err.print(bestRes[ci] + " "); - //} - //System.err.println(); - - int max = 0; - for (int ci = 0; ci <= cis.size(); ++ci) - if (bestRes[ci] > bestRes[max]) - max = ci; - - int max2 = 0; - for (int ci = 0; ci <= cis.size(); ++ci) - if (bestRes2[ci] > bestRes2[max2]) - max2 = ci; - - if (bestRes[max] > oldScore || bestRes2[max2] > oldScore) { - int pos = max2; - boolean flip = true; - if (bestRes[max] >= bestRes2[max2]) { - flip = false; - pos = max; - if (verbose) - System.err.println("LINKED_MOVE\t" + cis.get(start) + "\t" + numLinked + "\t" + bestRes[max]);// + "\t" + oldScore); - - } else { - if (verbose) - System.err.println(((max2 == start || max2 == start + 1) ? "LINKED_FLIP\t":"LINKED_MOVE+FLIP\t") + cis.get(start) + "\t" + numLinked + "\t" + bestRes2[max2]);// + "\t" + oldScore); - } - - //table index is the new rank for the contig start... without flipping... - int mD = pos - start; - if (pos > start) - if (pos > end) - mD = pos - end - 1; - else - mD = 0; - changeOrder(cis, start, end, mD, flip); - } - //int ns = (calculateNonMarkerScore(cis) + calculateScore(cis, false)); - //int ns2 = Math.max(bestRes[max], bestRes2[max2]); - //if (ns != ns2) - // System.err.println("LINKED_DIFF " + oldScore + " " + ns + " " + ns2); - return Math.max(bestRes[max], bestRes2[max2]) - oldScore; - } - - - private int calculateBest(ArrayList cis, ContigInterval c, boolean verbose) { - - //update rank - int numCis = 0; - for (ContigInterval ci : cis) - ci.setRank(numCis++); - - boolean mapMarkers = (c.markers.size() > 0); - - int oldNonMarkerScore = calculateNonMarkerScore(cis); - int oldScore = oldNonMarkerScore + ((mapMarkers) ? 
calculateScore(cis, false) : 0); - - int cr = c.getRank(); - - //try flipping and/or moving multiple Contigs [i...cr] - if (cr > 0) { - int minFlipStart = cr - 1; - while (minFlipStart >= 0 && linkedIntervals(cis, minFlipStart, minFlipStart + 1)) { - --minFlipStart; - } - - int flipS = 0; //calculate this to fix the nonsymmetrical pair-wise scores... - for (int flipStart = cr - 1; flipStart > minFlipStart; --flipStart) { - - if (!mapMarkers && cis.get(flipStart).markers.size() > 0) { - mapMarkers = true; - oldScore += calculateScore(cis, false); - } - - flipS -= calculateChainScore(cis.get(flipStart), cis.get(flipStart + 1)); //handles the nonsymmetrical pair-wise scores... - flipS += calculateChainScore(cis.get(flipStart + 1), true, cis.get(flipStart), true); //handles the nonsymmetrical pair-wise scores... - - int flipScore = nonMarkerScoreChange(cis, flipStart, cr, 0, true, flipS); - - if (flipScore >= 0) { //chainScore must not get worse, now calculate the marker score as well - - changeOrder(cis, flipStart, cr, 0, true); - int nms = calculateNonMarkerScore(cis); - if (nms - oldNonMarkerScore != flipScore) { - System.err.println("Error:scores differ " + (nms - oldNonMarkerScore) + " " + flipScore); - } - int newScore = ((mapMarkers) ? calculateScoreFast(cis) + nms : nms); - if (newScore > oldScore) {// and the total score improves - if (verbose) - System.err.println("FLIP\t" + c + "\t" + (cr - flipStart + 1) + "\t" + (newScore)); - return newScore - oldScore; - } else - changeOrderReverse(cis, flipStart, cr, 0, true); - } - - //try moving multiple Contigs [i...cr] with or without flipping... 
- out: for (int direction = -1; direction < 2;direction+=2){ - - ArrayList scores = new ArrayList(); - for (int movePosition = direction; cr + movePosition < numCis && flipStart + movePosition >= 0; movePosition+=direction) { - int moveScore1 = nonMarkerScoreChange(cis, flipStart, cr, movePosition, false, 0); - int moveScore2 = nonMarkerScoreChange(cis, flipStart, cr, movePosition, true, flipS); - scores.add(-Math.max(moveScore1, moveScore2)); - } - //sort possible positions based on the chain (nonMarker) score... - Misc.ArrayIndexComparator comparator = new Misc.ArrayIndexComparator(scores); - Integer[] indexes = comparator.createIndexArray(); - Arrays.sort(indexes, comparator); - - int maxEvaluations = 4; - - for (int i : indexes) { - int movePosition = (i + 1) * direction; - int moveScore1 = nonMarkerScoreChange(cis, flipStart, cr, movePosition, false, 0); - if (moveScore1 >= 0) { - changeOrder(cis, flipStart, cr, movePosition, false); - int score = ((mapMarkers) ? calculateScoreFast(cis) + calculateNonMarkerScore(cis) : calculateNonMarkerScore(cis)); - if (score > oldScore) { - if (verbose) - System.err.println("MOVE\t" + c + "\t" + (cr - flipStart + 1) + "\t" + score); - return score - oldScore; - } else - changeOrderReverse(cis, flipStart, cr, movePosition, false); - } - int moveScore2 = nonMarkerScoreChange(cis, flipStart, cr, movePosition, true, flipS); - if (moveScore2 >= 0) { - changeOrder(cis, flipStart, cr, movePosition, true); - int score = ((mapMarkers) ? calculateScoreFast(cis) + calculateNonMarkerScore(cis) : calculateNonMarkerScore(cis)); - if (score > oldScore) { - if (verbose) - System.err.println("MOVE+FLIP\t" + c + "\t" + (cr - flipStart + 1) + "\t" + score); - return score - oldScore; - } else - changeOrderReverse(cis, flipStart, cr, movePosition, true); - } - //if (--maxEvaluations <= 0 || (moveScore1 < 0 && moveScore2 < 0)) // maximum number of evaluations reached... - if (--maxEvaluations <= 0) // maximum number of evaluations reached... 
- continue out; - } - } - } - } - //Move only contig c into new position - int bestRes[] = new int[cis.size() + 1]; - int bestRes2[] = new int[cis.size() + 1]; - - //System.err.println(oldScore + "\t" + oldChainScore); - if (c.markers.size() > 0) { // no need without markers in c... - for (int map = 0; map < numMaps; ++map) { - ArrayList markers = getMarkers(cis, c, map); - ArrayList cMarkers = c.getMarkers(map); - - int endBin = getMaxBin(markers); - int startBin = getMinBin(markers); - - for (Marker m : cMarkers) { - endBin = Math.max(endBin, m.maxBin()); - startBin = Math.min(startBin, m.minBin()); - } - //System.err.println(startBin); - //System.err.println(endBin); - - int numMarkers = markers.size(); - int S[][] = new int[numMarkers + 1][endBin - startBin + 1]; - - int Sb[][] = new int[numMarkers + 1][endBin - startBin + 1]; - - int Sc[][] = new int[1 + cMarkers.size()][endBin - startBin + 1]; - - forwardFast1(markers, S, startBin, endBin); - //simple way to calculate (about correct) backward table... 
- Collections.reverse(markers); - forwardFast2(markers, Sb, startBin, endBin); - Collections.reverse(markers); - - int old = -1; - for (int mi = 0; mi <= numMarkers; ++mi) { - - if (mi < numMarkers) { - Marker m = markers.get(mi); // this is the next marker - numCis = m.ci.getRank(); - } else - numCis = cis.size(); - - if (numCis != old) { - for (int b = 0; b <= endBin - startBin; ++b) - Sc[0][b] = S[mi][b]; - - forwardFast1(cMarkers, Sc, startBin, endBin); - - int Scc[] = Sc[cMarkers.size()]; - for (int b = 1; b <= endBin - startBin; ++b) { // make table non-decreasing - if (Scc[b] < Scc[b - 1]) - Scc[b] = Scc[b - 1]; - } - int Sbb[] = Sb[numMarkers - mi]; - int max = -1; - for (int b = 0; b <= endBin - startBin; ++b) { - int s = Scc[b] + Sbb[b]; - if (s > max) - max = s; - } - //System.out.println(max); - for (int i = old + 1; i <= numCis; ++i) - bestRes[i] += max; - - Collections.reverse(cMarkers); - forwardFast1(cMarkers, Sc, startBin, endBin); - Collections.reverse(cMarkers); - - for (int b = 1; b <= endBin - startBin; ++b) { // make table non-decreasing - if (Scc[b] < Scc[b - 1]) - Scc[b] = Scc[b - 1]; - } - max = -1; - for (int b = 0; b <= endBin - startBin; ++b) { - int s = Scc[b] + Sbb[b]; - if (s > max) - max = s; - } - //System.out.println(max); - for (int i = old + 1; i <= numCis; ++i) - bestRes2[i] += max; - } - old = numCis; - } - } - } else { - if (mapMarkers) // if there are no markers in c, then we can only consider nonMarkerScore after this... 
	/**
	 * Deep-copies a list of contig intervals (each element cloned via the
	 * ContigInterval copy constructor) so a worker can mutate its own copy.
	 */
	private ArrayList<ContigInterval> cloneContigs(ArrayList<ContigInterval> cis)
	{
		ArrayList<ContigInterval> ret = new ArrayList<ContigInterval>();
		for (ContigInterval c : cis)
			ret.add(new ContigInterval(c));
		return ret;
	}

	/** Convenience entry point: improve the anchoring of all intervals in the chromosome, final round, verbose. */
	private void improveAnchoring()
	{
		improveAnchoring(intervalsInChr, true, true);
	}

	//parallel implementation of calculateBest
	// Each worker owns a private clone of the interval list and pulls permutation
	// indices from a shared AtomicInteger; the first worker to find an improving
	// move (score > 0) sets the shared stop flag so all workers wind down.
	// Workers form a chain: each one lazily spawns the next after ~1 s of work.
	private class CalculateBest implements Runnable {
		private AtomicInteger index;          // shared cursor into perm
		private AtomicBoolean stop;           // shared early-termination flag
		private ArrayList<ContigInterval> cis;     // this worker's private copy
		private ArrayList<ContigInterval> cisOrig; // snapshot of original order (for perm lookup)
		private ArrayList<Integer> perm;      // shared evaluation order
		private boolean verbose;
		private boolean linked;               // true => use calculateBestLinkedContig_ts

		private CalculateBest next;           // next worker in the spawn chain
		private Thread thread;                // thread running 'next', if started

		private int score = 0;                // best improving score found by this worker
		private int scoreIndex = 0;           // perm index that produced 'score'

		CalculateBest(ArrayList<ContigInterval> cis, ArrayList<Integer> perm, AtomicInteger index, AtomicBoolean stop, boolean verbose, boolean linked) {
			this.index = index;
			this.stop = stop;
			//clone cis
			this.cis = cloneContigs(cis);
			this.cisOrig = new ArrayList<ContigInterval>();
			cisOrig.addAll(this.cis);
			this.perm = perm;
			this.verbose = verbose;
			this.linked = linked;
		}
		public void setNext(CalculateBest next)
		{
			this.next = next;
		}
		public void setScore(int score)
		{
			this.score = score;
		}
		@Override
		public void run() {
			long time = System.currentTimeMillis();
			score = 0;
			thread = null;
			while (!stop.get()) {
				if (thread == null && next != null && System.currentTimeMillis() - time >= 1000) { // start a new thread every 1 secs
					next.setCis(this.cis);
					if (stop.get()) //it is worth breaking here
						break;
					thread = new Thread(next);
					if (stop.get()) //and maybe here as well even when we don't use thread...
						break;
					thread.start();
				}
				int i = index.getAndIncrement();
				if (i >= perm.size())
					break;
				if (linked)
					score = calculateBestLinkedContig_ts(cis, perm.get(i), verbose); // TODO: fix perm to cisOrig order
				else
					score = calculateBest(cis, cisOrig.get(perm.get(i)), verbose);
				if (score > 0) {
					scoreIndex = i;
					stop.set(true); // signal all workers: an improvement was found
				}
			}
			// always reap the spawned child before returning
			if (thread != null && thread.isAlive())
				try {
					thread.join();
				} catch (Exception e) {
					e.printStackTrace();
					System.exit(-1);
				}
		}
		/** Re-evaluates a single permutation index on this worker's list (used to replay other workers' finds). */
		public void directRun(int i) {
			if (linked)
				score = calculateBestLinkedContig_ts(cis, perm.get(i), verbose);
			else
				score = calculateBest(cis, cisOrig.get(perm.get(i)), verbose);
		}
		/** Synchronizes this worker's private list to match 'cis' in placement and orientation. */
		public void setCis(ArrayList<ContigInterval> cis)
		{
			// avoid clone
			//this.cis = cloneContigs(cis);
			makeCisIdentical(this.cis, cis);
		}
		public int getScore()
		{
			return score;
		}
		public int getScoreIndex()
		{
			return scoreIndex;
		}
	}
	// make two lists of ContigIntervals indentical by placement and orientation, source=>target
	// lists must contain exactly the same contigs...
	// Exits the whole program on duplicate or missing contigs (treated as internal errors).
	private void makeCisIdentical(ArrayList<ContigInterval> target, ArrayList<ContigInterval> source)
	{
		HashMap<ContigInterval, ContigInterval> map1to1 = new HashMap<ContigInterval, ContigInterval>();
		for (ContigInterval c : target) {
			if (map1to1.containsKey(c)) {
				System.err.println("Error: same contig multiple times...");
				for (ContigInterval c2 : target) {
					System.err.println(c2);
				}
				System.exit(-1);
			}
			map1to1.put(c, c);
		}
		target.clear();
		for (ContigInterval c : source) {
			ContigInterval c2 = map1to1.get(c);
			if (c2 == null) {
				System.err.println("Error: some contig(s) not found...");
				System.exit(-1);
			}
			target.add(c2);
			c2.setOrientation(c.getOrientation()); // copy orientation, keep target's own object
		}
	}

	/**
	 * Hill-climbing anchoring optimizer with random restarts.
	 * Repeatedly tries improving moves for every contig (in random order) until no
	 * improvement is found; runs numRuns restarts, each perturbing the best-so-far
	 * order by random flips and adjacent swaps, and keeps the highest-scoring layout.
	 * When finalRound is set, prints the initial/final score, the anchoring table
	 * and likely assembly errors.
	 */
	private void improveAnchoring(ArrayList<ContigInterval> cis, boolean finalRound, boolean verbose)
	{
		if (cis.size() == 0)
			return;

		if (finalRound)
			System.out.print("#initial score " + (calculateScore(cis, false) + calculateNonMarkerScore(cis)));

		int numCis = cis.size();

		ArrayList<Integer> perm = new ArrayList<Integer>();
		for (int i = 0; i < numCis; ++i)
			perm.add(i);

		boolean foundBetter = false;

		int iteration = 0;

		int bestScore = -1;
		ArrayList<ContigInterval> cis_best = new ArrayList<ContigInterval>();
		ArrayList<Integer> cis_bestO = new ArrayList<Integer>();

		for (int run = 0; run < numRuns; ++run) {
			if (run > 0) {
				// random perturbation: ~5% orientation flips plus n adjacent swaps
				int n = cis.size();
				for (ContigInterval ci : cis)
					if (Math.random() < 0.05)
						ci.flipOrientation();
				if (n > 1)
					for (int flips = 0; flips < n; ++flips) {
						int i = (int) (Math.random() * (n - 1));
						ContigInterval tmp = cis.get(i);
						cis.set(i, cis.get(i + 1));
						cis.set(i + 1, tmp);
					}
			}

			do {
				if (finalRound) {
					System.err.println("iteration " + (++iteration) + " score=" + (calculateScore(cis, false) + calculateNonMarkerScore(cis)));
				}
				foundBetter = false;
				Collections.shuffle(perm); // random order

				if (numThreads <= 1) {
					// sequential path: single-contig moves first, then linked moves
					ArrayList<ContigInterval> cisOrig = new ArrayList<ContigInterval>();
					cisOrig.addAll(cis);
					for (int i = 0; i < numCis; ++i)
						if (calculateBest(cis, cisOrig.get(perm.get(i)), verbose) > 0)
							foundBetter = true;

					//try linked version...
					if (!foundBetter)
						for (int i = 0; i < numCis; ++i) {
							if (calculateBestLinkedContig_ts(cis, perm.get(i), verbose) > 0)
								foundBetter = true;
						}
				} else {
					// parallel path: linked == 1 pass only runs when the plain pass found nothing
					for (int linked = 0; linked < 2; ++linked) {
						AtomicInteger index = new AtomicInteger();
						AtomicBoolean stop = new AtomicBoolean();

						CalculateBest[] cbs = new CalculateBest[numThreads];
						for (int t = 0; t < numThreads; ++t) {
							cbs[t] = new CalculateBest(cis, perm, index, stop, verbose, linked == 1);
							if (t > 0)
								cbs[t - 1].setNext(cbs[t]);
						}
						while (true) {
							cbs[0].run();
							int best = 0;
							for (int t = 1; t < numThreads; ++t) {
								if (cbs[t].getScore() > cbs[best].getScore())
									best = t;
							}
							if (cbs[best].getScore() > 0) {
								if (best > 0) {
									CalculateBest tmp = cbs[0]; //swap cbs[0] and cbs[best]
									cbs[0] = cbs[best];
									cbs[best] = tmp;
									for (int t = 1; t < numThreads; ++t)
										cbs[t - 1].setNext(cbs[t]);
									cbs[numThreads - 1].setNext(null);
								}
								for (int t = 1; t < numThreads; ++t) { //run other contigs that improved the score...
									if (cbs[t].getScore() > 0) {
										cbs[0].directRun(cbs[t].getScoreIndex());
									}
								}
								stop.set(false);
								foundBetter = true;
								for (int t = 0; t < numThreads; ++t)
									cbs[t].setScore(0);
							} else
								break;
						}
						if (foundBetter) {
							makeCisIdentical(cis, cbs[0].cis); // adopt the winning worker's layout
							break; // linked only if !foundBetter
						}
					}
					//if (!foundBetter) //calculateBestLinkedContig is not thread safe...
					//	for (int i = 0; i < numCis; ++i)
					//		if (calculateBestLinkedContig(cis, perm.get(i), verbose) > 0)
					//			foundBetter = true;
				}
			} while (foundBetter);
			// keep the best layout across restarts, restore it otherwise
			int score = (calculateScore(cis, false) + calculateNonMarkerScore(cis));
			if (score > bestScore) {
				bestScore = score;
				cis_best.clear();
				cis_bestO.clear();
				for (ContigInterval ci : cis) {
					cis_best.add(ci);
					cis_bestO.add(ci.getOrientation());
				}
			} else {
				cis.clear();
				int i = 0;
				for (ContigInterval ci : cis_best) {
					cis.add(ci);
					ci.setOrientation(cis_bestO.get(i++));
				}
			}
		}

		if (finalRound) {
			ArrayList<Integer> orderSupport = calculateSupport(cis);
			System.out.println(" final score " + (calculateScore(cis, false) + calculateNonMarkerScore(cis)));
			printAnchoring(cis, orderSupport);
			findLikelyAssemblyErrors(cis);
		}
	}
	/** Prints the anchoring table to stdout. */
	private void printAnchoring(ArrayList<ContigInterval> cis, ArrayList<Integer> orderSupport) {
		printAnchoring(System.out, cis, orderSupport);
	}

	/**
	 * Prints one row per contig interval: placement, orientation, order support,
	 * original/alternative cut coordinates, chain/scaffolding links and per-map
	 * position ranges. Orientation codes: "+"/"-" = globally known; "++"/"--" and
	 * "+++"/"---" = only known within a linked run (the symbol alternates between
	 * consecutive unanchored runs); "?" = unknown.
	 */
	private void printAnchoring(PrintStream stream, ArrayList<ContigInterval> cis, ArrayList<Integer> orderSupport) {
		stream.print("#contig\tstart\tend\torientation\tchr\torder_support\torig_start/alt_start\torig_end/alt_end\tlinked1\tvia_1\tlink_score1\tlinked2\tvia_2\tlink_score2\tpos_support");
		for (int map = 0; map < numMaps; ++map)
			stream.print("\tmap" + (map + 1));
		stream.println();

		boolean knownOrientation[] = new boolean[cis.size()]; // calculate whether global orientation is known...

		//calculate map info and follow linked2 links (fill forwards in knownOrientation)
		// A run of chain-linked contigs has known orientation iff some map spans
		// more than one position (max > min) within the run.
		int prevC = 0;
		for (int cii = 0; cii < cis.size(); ++cii) {
			ContigInterval ci = cis.get(cii);
			boolean linked2 = false; // from chain file...
			if (cii + 1 < cis.size() && linkedIntervals(cis, cii, cii + 1))
				linked2 = true;
			if (!linked2) {
				for (int map = 0; map < numMaps; ++map) {
					int max = Integer.MIN_VALUE;
					int min = Integer.MAX_VALUE;
					for (int c = prevC; c <= cii; ++c)
						for (Marker m : cis.get(c).getMarkers(map)) {
							max = Math.max(max, m.pPlus);
							min = Math.min(min, m.pPlus);
						}
					if (max > min)
						for (int c = prevC; c <= cii; ++c)
							knownOrientation[c] = true;
				}
				prevC = cii + 1;
			}
		}

		long starts[] = new long[cis.size()];  // calculate start of each contig
		long ends[] = new long[cis.size()];    // calculate end of each contig

		long starts2[] = new long[cis.size()]; // alternative start (other cut of the chain link)
		long ends2[] = new long[cis.size()];   // alternative end

		boolean prevSwapped = false; // fix for contigs cut out more than their length (a haplotype of two other contigs)
		for (int cii = 0; cii < cis.size(); ++cii) {
			ContigInterval ci = cis.get(cii);

			boolean linked1 = false; // from chain file...
			boolean linked2 = false; // from chain file...
			if (cii > 0 && linkedIntervals(cis, cii - 1, cii))
				linked1 = true;
			if (cii < cis.size() - 1 && linkedIntervals(cis, cii, cii + 1))
				linked2 = true;

			ContigInterval prev = ((linked1) ? cis.get(cii - 1) : null);
			ContigInterval next = ((linked2) ? cis.get(cii + 1) : null);

			long start = ci.getStart();
			long end = ci.getEnd();

			long start2 = ci.getStart();
			long end2 = ci.getEnd();

			// adjust this contig's near edge by the cut position stored for the link to prev
			if (prev != null) {
				String key = this.calculateKey(prev, ci);
				if (chainLinkHash.containsKey(key)) {
					if (ci.getOrientation() >= 0) {
						start = chainLinkHashCut2.get(key);
						start2 = chainLinkHashCut2a.get(key);
						if (prevSwapped) {
							long tmp = start;
							start = start2;
							start2 = tmp;
						}
					} else {
						end = chainLinkHashCut2.get(key);
						end2 = chainLinkHashCut2a.get(key);
						if (prevSwapped) {
							long tmp = end;
							end = end2;
							end2 = tmp;
						}
					}
				}
			}
			// adjust the far edge by the cut position stored for the link to next;
			// if the cut would make the contig negative length, swap to the alternative cut
			if (next != null) {
				String key = this.calculateKey(ci, next);
				if (chainLinkHash.containsKey(key)) {
					if (ci.getOrientation() >= 0) {
						end = chainLinkHashCut1.get(key);
						end2 = chainLinkHashCut1a.get(key);
						if (end < start - 1 && end2 > end) { // more than full length aligned...
							long tmp = end;
							end = end2;
							end2 = tmp;
							prevSwapped = true;
						} else
							prevSwapped = false;
					} else {
						start = chainLinkHashCut1.get(key);
						start2 = chainLinkHashCut1a.get(key);
						if (end < start - 1 && start2 < start) { // more than full length aligned...
							long tmp = start;
							start = start2;
							start2 = tmp;
							prevSwapped = true;
						} else
							prevSwapped = false;
					}
				} else
					prevSwapped = false;
			}

			starts[cii] = start;
			starts2[cii] = start2;

			ends[cii] = end;
			ends2[cii] = end2;
		}

		//finally try skipping negative length contigs and try to find new starts and ends...
		for (int cii = 2; cii < cis.size(); ++cii) {
			if (ends[cii] >= starts[cii] - 1) { // non negative length
				int prevNN = cii - 1;
				while (prevNN >= 0 && ends[prevNN] < starts[prevNN] - 1) // negative length
					--prevNN;
				if (prevNN < cii - 1 && prevNN >= 0 && linkedIntervals(cis, prevNN, cii)) {
					ContigInterval ci = cis.get(cii);
					ContigInterval ci2 = cis.get(prevNN);
					String key = calculateKey(ci2, ci);
					if (chainLinkHash.containsKey(key)) {
						System.err.println("skipping contig " + cis.get(cii - 1));
						if (ci.getOrientation() >= 0) {
							starts[cii] = chainLinkHashCut2.get(key);
							starts2[cii] = chainLinkHashCut2a.get(key);
						} else {
							ends[cii] = chainLinkHashCut2.get(key);
							ends2[cii] = chainLinkHashCut2a.get(key);
						}
						if (ci2.getOrientation() >= 0) {
							ends[prevNN] = chainLinkHashCut1.get(key);
							ends2[prevNN] = chainLinkHashCut1a.get(key);
						} else {
							starts[prevNN] = chainLinkHashCut1.get(key);
							starts2[prevNN] = chainLinkHashCut1a.get(key);
						}
					}
				}
			}
		}

		boolean toggleScaffolding = true; // alternates per unanchored linked run to pick ++/+++ symbol
		for (int cii = 0; cii < cis.size(); ++cii) {
			ContigInterval ci = cis.get(cii);

			boolean linked1 = false; // from chain file...
			boolean linked2 = false; // from chain file...
			if (cii > 0 && linkedIntervals(cis, cii - 1, cii))
				linked1 = true;
			if (cii < cis.size() - 1 && linkedIntervals(cis, cii, cii + 1))
				linked2 = true;

			ContigInterval prev = ((linked1) ? cis.get(cii - 1) : null);
			ContigInterval next = ((linked2) ? cis.get(cii + 1) : null);

			String map_info = "";
			String orientation = "?";

			int posSupport = 0;

			for (int map = 0; map < numMaps; ++map) {
				int max = Integer.MIN_VALUE;
				int min = Integer.MAX_VALUE;
				for (Marker m : ci.getMarkers(map)) {
					max = Math.max(max, m.pPlus);
					min = Math.min(min, m.pPlus);
					posSupport += m.inside(m.pPlus);
				}
				if (max >= min)
					map_info = map_info + "\t" + min + "-" + max;
				else
					map_info = map_info + "\t-";
			}
			if (knownOrientation[cii])
				orientation = ((ci.getOrientation() >= 0) ? "+" : "-");
			else if (linked1 || linked2) {
				if (!linked1)
					toggleScaffolding = !toggleScaffolding;
				if (!toggleScaffolding)
					orientation = ((ci.getOrientation() >= 0) ? "++" : "--");
				else
					orientation = ((ci.getOrientation() >= 0) ? "+++" : "---");
			}

			int scorePrev = 0;
			int scoreNext = 0;

			String prevScaffoldInfo = "null";
			String nextScaffoldInfo = "null";

			// link score and provenance towards prev: chain and/or paf scaffolding, plus proximity
			if (prev != null) {
				String key = this.calculateKey(prev, ci);
				if (chainLinkHash.containsKey(key)) {
					scorePrev = chainLinkHash.get(key);
					if (useChainAndPaf && scaffoldingLink.containsKey(key)) {
						scorePrev += scaffoldingLink.get(key);
						prevScaffoldInfo = "chain+" + scaffoldingLinkInfo.get(key);
					} else
						prevScaffoldInfo = "chain";
				} else if (scaffoldingLink.containsKey(key)) {
					scorePrev = scaffoldingLink.get(key);
					prevScaffoldInfo = scaffoldingLinkInfo.get(key);
				}
				int sp = ((prox == null) ? 0 : prox.linkScore(cis, cii - 1, cii));
				if (sp > 0) {
					scorePrev += sp;
					prevScaffoldInfo = prevScaffoldInfo + "+proximity";
				}
			}
			// same towards next
			if (next != null) {
				String key = this.calculateKey(ci, next);
				if (chainLinkHash.containsKey(key)) {
					scoreNext = chainLinkHash.get(key);
					if (useChainAndPaf && scaffoldingLink.containsKey(key)) {
						scoreNext += scaffoldingLink.get(key);
						nextScaffoldInfo = "chain+" + scaffoldingLinkInfo.get(key);
					}
					else
						nextScaffoldInfo = "chain";
				} else if (scaffoldingLink.containsKey(key)) {
					scoreNext = scaffoldingLink.get(key);
					nextScaffoldInfo = scaffoldingLinkInfo.get(key);
				}
				int sp = ((prox == null) ? 0 : prox.linkScore(cis, cii, cii + 1));
				if (sp > 0) {
					scoreNext += prox.linkScore(cis, cii, cii + 1); // NOTE(review): recomputes linkScore instead of reusing sp (prev branch uses sp) — same value, just a duplicate call
					nextScaffoldInfo = nextScaffoldInfo + "+proximity";
				}
			} // next != null

			//just get the start and end from the previously computed arrays...
			long start = starts[cii];
			long end = ends[cii];
			long start2 = starts2[cii];
			long end2 = ends2[cii];

			if (commentOutput)
				stream.print("#");
			stream.println(ci.getContig() + "\t" + start + "\t" + end + "\t" + orientation + "\t"
				+ ci.getChromosome() + "\t" + orderSupport.get(cii) + "\t" + ci.getStart() + (start2 == ci.getStart() ? "" : "/" + start2) + "\t" + ci.getEnd() + (end2 == ci.getEnd() ? "" : "/" + end2)
				+ "\t" + (prev == null ? "null" : prev.getContig()) + "\t" + prevScaffoldInfo + "\t" + scorePrev
				+ "\t" + (next == null ? "null" : next.getContig()) + "\t" + nextScaffoldInfo + "\t" + scoreNext + "\t" + posSupport + map_info);
		}
	}
"null": next.getContig()) + "\t" + nextScaffoldInfo + "\t" + scoreNext + "\t" + posSupport + map_info); - } - } - - - ArrayList logTable = new ArrayList(); - - private double myLog(int x) { - if (logTable.size() <= x) { - for (int i = logTable.size(); i <= x; ++i) - logTable.add(Math.log(i)); - } - return logTable.get(x); - } - - - - //markers is sorted within contig so no need to check orientation - private String getOneInterval(ArrayList markers, int m1, int m2) - { - Marker mm1 = markers.get(m1); - Marker mm2 = markers.get(m2); - ContigInterval ci = mm1.getContigInterval(); - long start = 0; - long end = 0; - - if (m1 == 0 || !ci.equals(markers.get(m1 - 1).getContigInterval())) - start = ci.getStart(); - else - start = (mm1.getPosition() + markers.get(m1 - 1).getPosition()) / 2; - - if (m2 + 1 == markers.size() || !ci.equals(markers.get(m2 + 1).getContigInterval())) - end = ci.getEnd(); - else - end = (mm2.getPosition() + markers.get(m1 + 1).getPosition()) / 2; - - return ci.getContig() + "\t" + start + "\t" + end; - } - - //markers is sorted - private String getIntervals(ArrayList markers, int m1, int m2) - { - String ret = ""; - ContigInterval prev = null; - int start = -1; - int end = -1; - for (int i = m1; i <= m2; ++i) { - Marker m = markers.get(i); - ContigInterval c = m.getContigInterval(); - if (c.equals(prev)) - end = i; - else { - if (start >= 0) - ret += '\t' + getOneInterval(markers, start, end); - start = i; - end = i; - } - prev = c; - } - return ret + '\t' + getOneInterval(markers, start, end); - } - - private class PossibleError implements Comparable{ - private double ll; - private String info; - private int ci; - private long pos1[], pos2[]; - public PossibleError(double ll, int ci, String info) { - this.ll = ll; - this.info = info; - this.ci = ci; - } - public PossibleError(double ll, int ci, String info, long pos1[], long pos2[]) { - this.ll = ll; - this.info = info; - this.ci = ci; - this.pos1 = pos1; - this.pos2 = pos2; - } - public String 
getInfo(){return info;} - @Override - public int compareTo(PossibleError other) { - if (ll < other.ll) - return 1; - if (ll > other.ll) - return -1; - return 0; - } - public int getCi() { - return ci; - } - public long[] getPos1() { - return pos1; - } - public long[] getPos2() { - return pos2; - } - } - - // log( (a/(a+b))^a * (b/(a+b))^b) - private double logLike(int a, int b) - { - double ret = 0; - if (a > 0) - ret += a * (myLog(a) - myLog(a + b)); - if (b > 0) - ret += b * (myLog(b) - myLog(a + b)); - return ret; - } - - // log( (a/(a+b))^c * (b/(a+b))^d) - private double logLike(int a, int b, int c, int d) - { - double ret = 0; - if (c > 0) - ret += c * (myLog(a) - myLog(a + b)); - if (d > 0) - ret += d * (myLog(b) - myLog(a + b)); - return ret; - } - - // find max maxNumErrors regions with more errors than expected, error rate is e0/(e0 + e1) - private void calculateErrors2(ArrayList markers_, ArrayList errors, int e0, int e1, int maxNumErrors) - { - boolean inside[] = new boolean[markers_.size()]; - int cumulative[] = new int[markers_.size() + 1]; - - int sum = 0; - for (int mi = 0; mi < markers_.size(); ++mi) { - Marker m = markers_.get(mi); - if (m.inside(m.pPlus) > 0) - inside[mi] = true; - else - ++sum; - cumulative[mi + 1] = sum; - } - if (sum == 0) // all inside or no markers - return; - - ArrayList markers = new ArrayList(); - - for (int mi = 0; mi < markers_.size(); ++mi) { // take only first and last marker of multiple adjacent markers !inside - if ((mi == 0 || inside[mi - 1]) && !inside[mi]) - markers.add(mi); - else - if ((mi + 1 == markers_.size() || inside[mi + 1]) && !inside[mi]) - markers.add(mi); - } - //System.err.println(markers.size() + "\t" + markers_.size()); - - int numMarkers = markers.size(); - - ArrayList found = new ArrayList(); - for (int iteration = 0; iteration < maxNumErrors; ++iteration) { - double max = 0; - int max1 = -1; - int max2 = -1; - int maxo0 = 0; - int maxo1 = 0; -out2: for (int m1i = 0; m1i < numMarkers; ++m1i) { - 
	// find max maxNumErrors regions with more errors than expected, error rate is e0/(e0 + e1)
	// Scans marker intervals [m1..m2] whose fraction of out-of-place markers exceeds the
	// global rate, ranks them by likelihood ratio, and greedily extracts up to
	// maxNumErrors non-overlapping regions, appending each as a PossibleError.
	private void calculateErrors2(ArrayList<Marker> markers_, ArrayList<PossibleError> errors, int e0, int e1, int maxNumErrors)
	{
		boolean inside[] = new boolean[markers_.size()];
		int cumulative[] = new int[markers_.size() + 1]; // cumulative count of "outside" markers

		int sum = 0;
		for (int mi = 0; mi < markers_.size(); ++mi) {
			Marker m = markers_.get(mi);
			if (m.inside(m.pPlus) > 0)
				inside[mi] = true;
			else
				++sum;
			cumulative[mi + 1] = sum;
		}
		if (sum == 0) // all inside or no markers
			return;

		ArrayList<Integer> markers = new ArrayList<Integer>();

		for (int mi = 0; mi < markers_.size(); ++mi) { // take only first and last marker of multiple adjacent markers !inside
			if ((mi == 0 || inside[mi - 1]) && !inside[mi])
				markers.add(mi);
			else
				if ((mi + 1 == markers_.size() || inside[mi + 1]) && !inside[mi])
					markers.add(mi);
		}
		//System.err.println(markers.size() + "\t" + markers_.size());

		int numMarkers = markers.size();

		ArrayList<int[]> found = new ArrayList<int[]>(); // already-extracted [m1, m2] regions
		for (int iteration = 0; iteration < maxNumErrors; ++iteration) {
			double max = 0;
			int max1 = -1;
			int max2 = -1;
			int maxo0 = 0;
			int maxo1 = 0;
out2:		for (int m1i = 0; m1i < numMarkers; ++m1i) {
				int m1 = markers.get(m1i);
				for (int[] f : found)
					if (f[1] >= m1 && f[0] <= m1) // found intervals contain m1
						continue out2;
				for (int m2i = m1i; m2i < numMarkers; ++m2i) {
					int m2 = markers.get(m2i);
					for (int[] f : found)
						if (f[1] >= m2 && f[0] <= m2) // found intervals intersect m2 ([m1...m2])
							continue out2;
					int o0 = cumulative[m2 + 1] - cumulative[m1]; // number of markers outside interval
					int o1 = (m2 - m1 + 1) - o0; // number of markers within interval
					//o0 * (e0 + e1) > e0 * (o0 + o1) (positive e and o) <=> o0 / (o0 + o1) > e0 / (e0 + e1)
					if (o0 * (long) (e0 + e1) > e0 * (long) (o0 + o1)) { //more errors than expected
						double l = logLike(o0, o1) - logLike(e0, e1, o0, o1); //likelihood ratio
						if (l > max) {
							max = l;
							max1 = m1;
							max2 = m2;
							maxo0 = o0;
							maxo1 = o1;
						}
					} else
						continue out2; // no need to continue if we fall short
				}
			}
			if (max > 0) {
				found.add(new int[]{max1, max2});
				// report boundaries between the extreme markers and their neighbours
				// (or the contig edge when there is no neighbour)
				ContigInterval ci = markers_.get(max1).getContigInterval();
				long pos1 = markers_.get(max1).getPosition();
				long pos1n = ((max1 > 0) ? markers_.get(max1 - 1).getPosition() + 1 : ci.getStart());

				long pos2 = markers_.get(max2).getPosition();
				long pos2n = ((max2 + 1 < markers_.size()) ? markers_.get(max2 + 1).getPosition() - 1 : ci.getEnd());

				//if (pos1 < pos1n || pos2 > pos2n) {
				//	System.exit(-1);
				//}

				String info = max + "\t" + maxo0 + "\t" + maxo1 + "\t" + ci.getContig() + "\t" + pos1n + "-" + pos1 + "\t" + pos2 + "-" + pos2n;
				errors.add(new PossibleError(max, ci.getRank(), info, new long[]{pos1n, pos1}, new long[]{pos2, pos2n}));
			}
		}
	}
markers_.get(max2 + 1).getPosition() - 1 : ci.getEnd()); - - //if (pos1 < pos1n || pos2 > pos2n) { - // System.exit(-1); - //} - - String info = max + "\t" + maxo0 + "\t" + maxo1 + "\t" + ci.getContig() + "\t" + pos1n + "-" + pos1 + "\t" + pos2 + "-" + pos2n; - errors.add(new PossibleError(max, ci.getRank(), info , new long[]{pos1n, pos1}, new long[]{pos2, pos2n})); - } - } - } - - - - private ArrayList calculateBinSum(ArrayList markers, int endBin, boolean plusOrientation){ - //calculate map position distribution - ArrayList binSum = new ArrayList(); - for (int i = 0; i <= endBin; ++i) - binSum.add(0); - - //int plus = 0; - //int minus = 0; - for (Marker m : markers) { - if (plusOrientation) - binSum.set(m.pPlus, binSum.get(m.pPlus) + 1); - else - binSum.set(m.pMinus, binSum.get(m.pMinus) + 1); - } - return binSum; - } - - private class LongComparator0 implements Comparator - { - @Override - public int compare(long[] l1, long[] l2){ - return Long.compare(l1[0], l2[0]); - } - } - - private class LongComparator1 implements Comparator - { - @Override - public int compare(long[] l1, long[] l2){ - return Long.compare(l1[1], l2[1]); - } - } - - //calculate how much score could be increasing if we split a contig... - //make a gap in map positions L..R, and see how many points we could add there - private ArrayList increaseScore(ArrayList markers, ArrayList binSum){ - - ArrayList ret = new ArrayList(); - - int endBin = getMaxBin(markers); - if (endBin < binSum.size() - 1) - endBin = binSum.size() - 1; - int startBin = getMinBin(markers); - if (startBin > 0) - startBin = 0; - - int numMarkers = markers.size(); - - int S[][] = new int[numMarkers + 1][endBin - startBin + 1]; - - int Sp[] = new int[endBin - startBin + 1]; - - int Sb[][] = new int[numMarkers + 1][endBin - startBin + 1]; - - //simple way to calculate (about correct) backward table... 
	//calculate how much score could be increasing if we split a contig...
	//make a gap in map positions L..R, and see how many points we could add there
	// For each split point i (0..numMarkers) returns the best achievable combined
	// score of: forward DP over markers[0..i) extended with the binSum histogram
	// (markers that could be inserted into the gap) plus the backward DP over
	// markers[i..numMarkers).
	private ArrayList<Integer> increaseScore(ArrayList<Marker> markers, ArrayList<Integer> binSum) {

		ArrayList<Integer> ret = new ArrayList<Integer>();

		int endBin = getMaxBin(markers);
		if (endBin < binSum.size() - 1)
			endBin = binSum.size() - 1;
		int startBin = getMinBin(markers);
		if (startBin > 0)
			startBin = 0;

		int numMarkers = markers.size();

		int S[][] = new int[numMarkers + 1][endBin - startBin + 1];  // forward DP table

		int Sp[] = new int[endBin - startBin + 1];                   // forward DP extended by binSum

		int Sb[][] = new int[numMarkers + 1][endBin - startBin + 1]; // backward DP table

		//simple way to calculate (about correct) backward table...
		Collections.reverse(markers);
		forwardFast2(markers, Sb, startBin, endBin);
		Collections.reverse(markers);
		forwardFast1(markers, S, startBin, endBin);
		//Should we make S and Sb monotonic (+/- 1) ?

		for (int i = 0; i <= numMarkers; ++i) {
			int max = 0;
			//int maxL = 0;
			//int maxR = 0;

			//reuse P as PSp
			// NOTE(review): this loop fills Sp[0 .. endBin-startBin-1] only; the second
			// loop below also reads Sp[endBin-startBin], which keeps its value from the
			// previous i iteration (0 on the first pass) — confirm this is intended.
			for (int b = 0; b < endBin - startBin; ++b) { // add binSum to S
				int bs = 0;
				if (b + startBin < binSum.size())
					bs = binSum.get(b + startBin);

				//either we take score S[i][b]+bs or skip b and get bs + score Sp[b-1]
				if (b == 0) {
					Sp[b] = S[i][b] + bs;
					//P[i][b] = b;
				} else {
					int s1 = S[i][b] + bs;
					int s2 = Sp[b - 1] + bs;
					if (s1 >= s2) {
						Sp[b] = s1;
						//P[i][b] = b;
					} else {
						Sp[b] = s2;
						//P[i][b] = P[i][b - 1];
					}
				}
			}

			// best total = extended-forward score up to bin b + backward score from bin b
			for (int b = 0; b <= endBin - startBin; ++b) {
				int s = Sp[b] + Sb[numMarkers - i][b];
				if (s > max) {
					max = s;
					//maxR = P[i][b] + startBin;
					//maxL = b + startBin;
				}
			}
			ret.add(max);
		}
		return ret;
	}
-/* - long start = ci.getStart(); - long end = ci.getEnd(); - int orientation = ci.getOrientation(); - if (cii > 0) { - ContigInterval prev = cis.get(cii - 1); - String key = calculateKey(prev, ci); - Long cut = chainLinkHashCut2.get(key); //Cut2a - if (cut != null) - if (orientation >= 0) - start = cut; - else - end = cut; - } - if (cii + 1 < cis.size()) { - ContigInterval next = cis.get(cii + 1); - String key = calculateKeyInverse(ci, next); - Long cut = chainLinkHashCut2.get(key); //Cut2a - if (cut != null) - if (orientation >= 0) - end = cut; - else - start = cut; - }*/ - - if (iteration == 0) { - for (int map = 0; map < numMaps; ++map) - for (Marker m : ci.getMarkers(map)) - //if (m.getPosition() >= start && m.getPosition() <= end) - if (m.inside(m.pPlus) > 0) // TODO: allow any scores, now 0 vs >0 - ++e1; - else - ++e0; - - } else { - //System.err.println(e1 + "\t" + e0); - ArrayList markersCi = new ArrayList(); - for (int map = 0; map < numMaps; ++map) - for (Marker m : ci.getMarkers(map)) - markersCi.add(m); - Collections.sort(markersCi); - //calculateErrors(markersCi, errors, e0, e1); - calculateErrors2(markersCi, errors, e0, e1, numErrorsPerContig); - } - } - } - - if (verbose) { - System.err.println("#*** possible errors ***"); - System.err.println("#total outside/inside:\t" + e0 + "\t" + e1); - System.err.println("#X2\t0\t>0\tcontig\tstart\tend\tcontig2\tpos"); - } - - Collections.sort(errors); - if (errors != null) - for (int i = 0; i < numErrors && i < errors.size(); ++i) { - ret.add(errors.get(i)); - PossibleError ff = findFix(cis, errors.get(i), false); - ret.add(ff); - if (verbose) { - System.err.println(errors.get(i).getInfo() + ((ff == null) ? 
"" : "\t" + ff.getInfo()) + "\terror"); - } - } - return ret; - } - - //TODO: Check that calculateChainScore is ok - //note: assumes that score has been evaluated for the markers in cis and in the corresponding order and orientation - private void findLikelyAssemblyErrors(ArrayList cis) - { - System.err.println("*** possible haplotypes ***"); - System.err.println("score\tcontig\tstart\tend\tof_contig\tstart\tend\tcontig_aligment_start\tcontig_aligment_end"); - - ArrayList haplotypes = new ArrayList(); - - for (int cii = 0; cii < cis.size(); ++cii) { - ContigInterval ci = cis.get(cii); - int score = 0; - for (int map = 0; map < numMaps; ++map) - for (Marker m : ci.getMarkers(map)) - score += m.inside(m.pPlus); - if (cii > 0) - score += calculateChainScore(cis.get(cii - 1), ci); - if (cii + 1 < cis.size()) - score += calculateChainScore(ci, cis.get(cii + 1)); - - //int scoreH = 0; - //ContigInterval haplotypeOf = null; - for (ContigInterval ci2 : cis) { - Integer s = calculateChainScoreHaplotype(ci2, ci); - if (s != null && s > score) { - long ha[] = calculateChainScoreHaplotypeCut(ci2, ci); - haplotypes.add(new PossibleError((s - score), 0, (s - score) + "\t" + ci + "\t" + ci2 + "\t" + ha[0] + "\t" + ha[1])); -// scoreH = s; -// haplotypeOf = ci2; - } - } -// if (scoreH > score) { -// haplotypes.add(new PossibleError((scoreH - score), (scoreH - score) + "\t" + ci + "\t" + haplotypeOf)); -// //System.err.println((scoreH - score) + "\t" + ci + "\t" + haplotypeOf); -// } - - } - Collections.sort(haplotypes); - for (PossibleError pe: haplotypes) - System.err.println(pe.getInfo() + "\thaplotype"); - - findNonHaplotypeAssemblyErrors(cis, true); - - } - void setMaxIntersect(int parseInt) { - maxIntersect = parseInt; - } - - void setMinLinkAlignmentScore(int parseInt) { - minLinkAlignmentScore = parseInt; - - } - - private void setMinHaplotypeAlignmentScore(int parseInt) { - minHaplotypeAlignmentScore = parseInt; - } - - //autogenerated stubs... 
- void setMaxBridge(int parseInt) { - maxBridge = parseInt; - } - - void setCutPenalty(double parseDouble) { - cutPenalty = parseDouble; - } - - void setOrientationPenalty(double parseDouble) { - orientationPenalty = parseDouble; - } - - void setScaleScore(double parseDouble) { - scaleScore = parseDouble; - } - void setNumThreads(int nt) { - numThreads = nt; - } - - private ArrayList> myLiftover(ArrayList> markers, ArrayList alignment, boolean sameOrientation, String newContig) - { - ArrayList> ret = new ArrayList>(); - for (ArrayList row: markers) { - //System.err.println(row); - long position = InputData.myParseLong(row.get(1)); - long position_new = mapPosition12(alignment, position, sameOrientation); - //System.err.println(position + "->" + position_new); - if (position_new > 0) { - //System.err.println("HIPHEI"); - ArrayList nrow = new ArrayList(); - for (int i = 0; i < row.size(); ++i) - if (i == 0) - nrow.add(newContig); - else if (i == 1) - nrow.add("" + position_new); - else - nrow.add(row.get(i)); - ret.add(nrow); - } - } - return ret; - } - - private void liftover__(ContigInterval hap, ContigInterval c2, ArrayList alignment, boolean sameOrientation, long ascore, HashMap bestScore, HashMap bestScoreContig, HashMap>> map, HashMap>> liftoverMap) { - Long prevScore = bestScore.get(hap); - if (prevScore != null) { - ContigInterval c3 = bestScoreContig.get(hap); - if (c3.getContig().equals(c2.getContig()) && Misc.intersectIntervals(c2.getStart(), c2.getEnd(), c3.getStart(), c3.getEnd())) { // same alignment... 
- if (ascore > prevScore) { - liftoverMap.put(hap, myLiftover(map.get(hap), alignment, sameOrientation, c2.getContig())); // do liftover - bestScoreContig.put(hap, c2); - bestScore.put(hap, ascore); - } - } else { - if (ascore > prevScore) { - bestScoreContig.put(hap, c2); - bestScore.put(hap, ascore); - } - if (ascore > 2 * prevScore) { - liftoverMap.put(hap, myLiftover(map.get(hap), alignment, sameOrientation, c2.getContig())); // do liftover - } - else if (ascore > liftoverScoreDiff * prevScore) {// too little score difference... - liftoverMap.get(hap).clear(); // remove markers... - //System.err.println("remove"); - } - } - - } else { - //System.err.println("liftover"); - liftoverMap.put(hap, myLiftover(map.get(hap), alignment, sameOrientation, c2.getContig())); // do liftover - bestScore.put(hap, ascore); - bestScoreContig.put(hap, c2); - } - } - - private void liftover_(String fn, HashMap>> map) { - System.err.println("loading alignment chain..."); - - HashMap bestScore = new HashMap(); // score of used chain - HashMap bestScoreContig = new HashMap(); // contigInterval of used chain - - HashMap>> liftoverMap = new HashMap>>(); // store liftover markers... 
- - for (ContigInterval ci : map.keySet()) { - liftoverMap.put(ci, new ArrayList>()); - //System.err.println(ci); - } - - try { - BufferedReader br = null; - if (fn.equals("-")) - br = new BufferedReader(new InputStreamReader(System.in)); - else - br = new BufferedReader(new FileReader(fn)); - do { - ArrayList row = Input.loadTableRow(br, "\t "); - if (row == null) - break; - boolean skip = true; - if (row.size() >= 12 && "chain".equals(row.get(0))) { - String contig1 = row.get(2); - String contig2 = row.get(7); - - boolean someMarkers1 = false; - if (haplotypeHash.containsKey(contig1)) - for (ContigInterval hap : haplotypeHash.get(contig1)) - if (map.containsKey(hap)) { - someMarkers1 = true; - break; - } - boolean someMarkers2 = false; - if (haplotypeHash.containsKey(contig2)) - for (ContigInterval hap : haplotypeHash.get(contig2)) { - if (map.containsKey(hap)) { - someMarkers2 = true; - break; - } - } - if (!someMarkers1 && !someMarkers2) // no markers to do liftover... - continue; - - skip = false; - long p1_[] = chain2OneBase(row.get(3), row.get(4), row.get(5), row.get(6)); - long p2_[] = chain2OneBase(row.get(8), row.get(9), row.get(10), row.get(11)); - if ("-".equals(row.get(4))) { - System.err.println("Error: only ++, and +- orientation allowed in the chain file"); - continue; - } - - //System.err.println("running"); - - boolean sameOrientation = row.get(4).equals(row.get(9)); - - // load alignment for liftover... - ArrayList alignment_ = new ArrayList(); - ArrayList row2 = null; - long v1 = p1_[0]; - long v2 = ((sameOrientation) ? p2_[0] : p2_[1]); - int v3 = 0; - do { - row2 = Input.loadTableRow(br, "\t "); - - v3 = Integer.parseInt(row2.get(0)); - alignment_.add(new long[]{v1, v2, v3}); - v1 += v3; - v2 += ((sameOrientation) ? v3 : -v3); - if (row2.size() >= 3) { - v1 += Integer.parseInt(row2.get(1)); - v2 += ((sameOrientation) ? 
Integer.parseInt(row2.get(2)) : -Integer.parseInt(row2.get(2))); - } - } while (row2 != null && row2.size() >= 3); - - long ascore_ = Long.parseLong(row.get(1)); - - if (someMarkers1) // && haplotypeHash.containsKey(contig1) - for (ContigInterval hap : haplotypeHash.get(contig1)) - if (map.containsKey(hap)) { - ContigInterval c2 = new ContigInterval(contig2, p2_[0], p2_[1]); - - ArrayList alignment = new ArrayList(); - double t = trimChain(alignment_, sameOrientation, hap, c2, alignment); - if (t == 0.0) // all trimmed - continue; - - if (t < 1.0) { // update c2 start and end - long p[] = getStartEnd2(alignment, sameOrientation); - c2 = new ContigInterval(contig2, p[0], p[1]); - } else { - alignment = alignment_; - } - - long ascore = (long)(t * ascore_); - - boolean doLift = true; - if (haplotypeHash.containsKey(contig2)) - for (ContigInterval hap2 : haplotypeHash.get(contig2)) // no liftover between haplotypes... - if (Misc.intersectIntervals(hap2.getStart(), hap2.getEnd(), c2.getStart(), c2.getEnd())) { - doLift = false; - break; - } - if (doLift) - liftover__(hap, c2, alignment, sameOrientation, ascore, bestScore, bestScoreContig, map, liftoverMap); - } - //otherway round... 
- if (someMarkers2) { // && haplotypeHash.containsKey(contig2) - ArrayList revAlignment_ = reverseAlignment(alignment_, sameOrientation); - for (ContigInterval hap : haplotypeHash.get(contig2)) - if (map.containsKey(hap)) { - ContigInterval c2 = new ContigInterval(contig1, p1_[0], p1_[1]); - - ArrayList alignment = new ArrayList(); - double t = trimChain(revAlignment_, sameOrientation, hap, c2, alignment); - if (t == 0.0) // all trimmed - continue; - if (t < 1.0) { // update c2 start and end - long p[] = getStartEnd2(alignment, sameOrientation); - c2 = new ContigInterval(contig1, p[0], p[1]); - } else - alignment = revAlignment_; - long ascore = (long)(t * ascore_); - - boolean doLift = true; - if (haplotypeHash.containsKey(contig1)) - for (ContigInterval hap2 : haplotypeHash.get(contig1)) // no liftover between haplotypes... - if (Misc.intersectIntervals(hap2.getStart(), hap2.getEnd(), c2.getStart(), c2.getEnd())) { - doLift = false; - break; - } - if (doLift) - liftover__(hap, c2, alignment, sameOrientation, ascore, bestScore, bestScoreContig, map, liftoverMap); - } - } - } else { - if (skip && (row.size() == 3 || row.size() == 1)) - ; - else - System.err.println("Warning: skipping " + row); - } - } while (true); - br.close(); - int numLifoverMarkers = 0; - for (ContigInterval ci : liftoverMap.keySet()) { - ArrayList> lom = liftoverMap.get(ci); - numLifoverMarkers += lom.size(); - StringBuilder sb = new StringBuilder(); - for (ArrayList row : lom) { - sb.append(row.get(0)); - for (int i = 1; i < row.size(); ++i) { - sb.append('\t'); - sb.append(row.get(i)); - } - sb.append('\n'); - } - System.out.print(sb); - } - System.err.println("Lifting over " + numLifoverMarkers + " markers"); - } catch (Exception e) { - e.printStackTrace(); - System.err.println("Error in file " + fn); - } - } - - public void liftover(String haplotypeFile, String chainFile, String mapFile) { - ArrayList haplotypes = InputData.loadHaplotypes(haplotypeFile); - - for (ContigInterval hapci: 
haplotypes) { - String key = hapci.getContig(); - ArrayList list = haplotypeHash.get(key); - if (list == null) { - list = new ArrayList(); - haplotypeHash.put(key, list); - } - list.add(hapci); - } - - HashMap>> map = InputData.loadRaw(mapFile, haplotypeHash); - //System.out.println(map); - - liftover_(chainFile, map); - - } - - private void loadCutSites(String fn) { - try { - HashMap> bedHash2 = new HashMap>(); - for (ContigInterval ci : bed) { - if (ci.getStart() != ci.getMinStart() || ci.getEnd() != ci.getMaxEnd()) { // ContigInterval with uncertainty in its start or end - String contig = ci.getContig(); - if (!bedHash2.containsKey(contig)) - bedHash2.put(contig, new ArrayList()); - bedHash2.get(contig).add(ci); - } - } - int numCutSites = 0; - BufferedReader br = new BufferedReader(new FileReader(fn)); - do { - ArrayList row = Input.loadTableRow(br, "\t "); - if (row == null) - break; - if (row.size() >= 2) { - String contig = row.get(0); - long position = 0; - long position2 = 0; - if (bedHash2.containsKey(contig)) - for (ContigInterval ci : bedHash2.get(contig)) { - if (position == 0) { - position = Long.parseLong(row.get(1)); - if (row.size() >= 3) { - position2 = Long.parseLong(row.get(2)); - --position; //N gap - ++position2; - } else - position2 = position + 1; - } - - - if (position2 >= ci.getMinStart() && position2 <= ci.getMaxStart()) { - ++numCutSites; - ci.setStart(position2); - break; - } - if (position >= ci.getMinEnd() && position <= ci.getMaxEnd()) { - ++numCutSites; - ci.setEnd(position); - break; - } - } - } - } while (true); - br.close(); - System.err.println("Trimming " + numCutSites + " contig ends based on cutSites file"); - } catch (Exception e) { - e.printStackTrace(); - System.err.println("Error in file " + fn); - } - } - - //find how to fix likely erroneous genomic interval (e) - //assumes solve has been called on the cis - private PossibleError findFix(ArrayList cis, PossibleError e, boolean verbose){ - //update rank and 
numContigIntervals - int numContigIntervals = 0; // update rank (might be corrupted) - for (ContigInterval c : cis) - c.setRank(numContigIntervals++); - - //calculate full anchoring score - ArrayList oldPos = new ArrayList(); - int oldFullScore = 0; - for (int map = 0; map < numMaps; ++map) - for (Marker m : getMarkers(cis, null, map)) { - oldFullScore += m.inside(m.pPlus); - oldPos.add(m.pPlus); // store mPlus, as this is changed... - } - - //get error region (region inside one contigInterval) - ContigInterval errorContig = cis.get(e.getCi()); - long pos1[] = e.getPos1(); - long pos2[] = e.getPos2(); - - int s0 = 0; // old score for region e - int s1 = 0; // score for e in + orient - int s2 = 0; // score for e in - orient - - ArrayList> iss = new ArrayList>(); //store increaseScores... - - for (int map = 0; map < numMaps; ++map) { - //get markers spanning error region for each map separately - - ArrayList errorMarkers = new ArrayList(); - ArrayList nonErrorMarkers = new ArrayList(); - - for (ContigInterval c : cis) - for (Marker m : c.getMarkers(map)) { - if ((c == errorContig) && m.getPosition() >= pos1[0] && m.getPosition() <= pos2[1]) - errorMarkers.add(m); - else - nonErrorMarkers.add(m); - } - - int numMarkers = errorMarkers.size(); - //calculate old score - //int oldScore[] = new int[numMarkers]; - for (int i = 0; i < numMarkers; ++i) { - Marker m = errorMarkers.get(i); - int s = m.inside(m.pPlus); - s0 += s; - } - s1 += solveForward(errorMarkers); - s2 += solveBackward(errorMarkers); - - ArrayList mm2 = new ArrayList(); - for (int i = 0; i < numMarkers; ++i) { - Marker m = errorMarkers.get(i); - if (m.inside(m.pPlus) > 0) // marker is now contributing to the score - mm2.add(m); - } - int maxBin = getMaxBin(errorContig.getMarkers(map)); - - iss.add(increaseScore(nonErrorMarkers, calculateBinSum(mm2, maxBin, true))); // mm2 in + orientation - - mm2.clear(); - for (int i = 0; i < numMarkers; ++i) { - Marker m = errorMarkers.get(i); - if (m.inside(m.pMinus) > 
0) // marker is now contributing to the score - mm2.add(m); - } - - iss.add(increaseScore(nonErrorMarkers, calculateBinSum(mm2, maxBin, false))); // mm2 in - orientation - - //System.err.println("size=" + nonErrorMarkers.size()); - - } - - int pi = 0; - for (int map = 0; map < numMaps; ++map) - for (Marker m : getMarkers(cis, null, map)) - m.pPlus = oldPos.get(pi++); // restore mPlus, this is changed by increaseScore - - if ((s0 >= s1 && s0 >= s2)) - return null; // no fix found... - - //System.err.println("max score delta = " + (s1 - s0) + " or " + (s2 - s0) + "\t" + s0 + "\t" + s1 + "\t" + s2 + "\t" + e.getInfo()); - //merge results from each increaseScore - - long maxScore = oldFullScore; - int maxC = 0; - long maxP1 = 0; - long maxP2 = 0; - - for (int orient = 0; orient < 2; ++orient) { - int startMarker[] = new int[numMaps]; - for (ContigInterval c : cis) { - //store cut intervals - //if interval [a,b] is in cov, then prefix can be [1..a], [1..a+1],...,[1..b] - ArrayList cov = new ArrayList(); - - for (int map = 0; map < numMaps; ++map) { - ArrayList is = iss.get(map + map + orient); - - //System.err.println("size=" + is.size()); - - ArrayList markers = new ArrayList(); - for (Marker m : c.getMarkers(map)) - if ((errorContig == c) && m.getPosition() >= pos1[0] && m.getPosition() <= pos2[1]) // errorMarker - ; - else - markers.add(m); - - //System.err.println("size=" + markers.size()); - - int numMarkers = markers.size(); - if (numMarkers > 0) { - long isi = is.get(startMarker[map]); - if (c.getOrientation() >= 0) { // + orientation - cov.add(new long[]{c.getStart() - 1, markers.get(0).getPosition() - 1, isi}); - for (int mi = 1; mi < numMarkers; ++mi) { - //Marker m = markers.get(mi); - isi = is.get(startMarker[map] + mi); - cov.add(new long[]{markers.get(mi - 1).getPosition(), markers.get(mi).getPosition() - 1, isi}); - } - isi = is.get(startMarker[map] + numMarkers); - cov.add(new long[]{markers.get(numMarkers - 1).getPosition(), c.getEnd(), isi}); - } else 
{ - cov.add(new long[]{markers.get(0).getPosition(), c.getEnd(), isi}); - for (int mi = 1; mi < numMarkers; ++mi) { - //Marker m = markers.get(mi); - isi = is.get(startMarker[map] + mi); - cov.add(new long[]{markers.get(mi).getPosition(), markers.get(mi - 1).getPosition() - 1, isi}); - } - isi = is.get(startMarker[map] + numMarkers); - cov.add(new long[]{c.getStart() - 1, markers.get(numMarkers - 1).getPosition() - 1, isi}); - } - } else { // 0 markers - int isi = is.get(startMarker[map]); - cov.add(new long[]{c.getStart() - 1, c.getEnd(), isi}); - } - startMarker[map] += numMarkers; - } - ArrayList cov_ret = Misc.cov(cov); - //System.err.println(cov_ret); - - for (int i = 0; i < cov_ret.size(); i+=2) { - long pos = cov_ret.get(i); - long cr = cov_ret.get(i + 1); - //System.err.println(pos + "\t" + cr); - if (cr > maxScore) { - maxScore = cr; - maxC = c.getRank(); - maxP1 = pos; - maxP2 = ((i + 2 < cov_ret.size()) ? cov_ret.get(i + 2) - 1 : maxP1); - } - } - } - } - - if (verbose) - System.err.println(cis.get(maxC).getContig() + "\t" + maxP1 + "-" + maxP2 + "\t" + errorContig.getContig() + "\t" + pos1[0] + "-" + pos1[1] + "\t" + pos2[0] + "-" + pos2[1] + "\t" + (maxScore - oldFullScore) + "\tfix"); - - String info = cis.get(maxC).getContig() + "\t" + maxP1 + "-" + maxP2; - - if (maxScore <= oldFullScore) { - return null; - } - return new PossibleError((maxScore - oldFullScore), maxC, info , new long[]{maxP1, maxP2}, null); - } - - // find a region inside an error with even higher error rate - private PossibleError splitError(ContigInterval c, PossibleError e) - { - //PossibleError ret = new PossibleError(max, ci.getRank(), info , new long[]{pos1n, pos1}, new long[]{pos2, pos2n} - long pos1[] = e.getPos1(); - long pos2[] = e.getPos2(); - ArrayList markersC = new ArrayList(); - int e0 = 0; - int e1 = 0; - for (int map = 0; map < numMaps; ++map) - for (Marker m : c.getMarkers(map)) - if (m.getPosition() >= pos1[0] && m.getPosition() <= pos2[1]) { - if 
(m.inside(m.pPlus) > 0) - ++e1; - else - ++e0; - markersC.add(m); - } - Collections.sort(markersC); - //System.err.println("e0=" + e0 + "\te1=" + e1); - - //calculateErrors(markersCi, errors, e0, e1); - ArrayList pes = new ArrayList(); - calculateErrors2(markersC, pes, e0, e1, 1); - if (pes.size() > 0) { - PossibleError ret = pes.get(0); - long p1[] = ret.getPos1(); // adjust region so that it is within the original error... - if (p1[0] < pos1[0]) - p1[0] = pos1[0]; - long p2[] = ret.getPos2(); - if (p2[1] > pos2[1]) - p2[1] = pos2[1]; - ret.ci = e.ci; // rank might be corrupted so calculateErrors2 could fail to get this right - return ret; - } - - return null; - } - //iterative version of findLikelyAssemblyErrors - private void findContigErrors(int minImprovement) { - System.err.println("Finding contig errors..."); - ArrayList newcis = new ArrayList(); - - int animation = 0; - boolean foundFix = true; - - HashMap missedErrors = new HashMap(); - - while (foundFix) { - foundFix = false; - int scoreOld = calculateScore(intervalsInChr, false) + calculateNonMarkerScore(intervalsInChr); - - int orientation[] = new int[intervalsInChr.size()]; // store orientaions - for (int ci = 0; ci < orientation.length; ++ci) - orientation[ci] = intervalsInChr.get(ci).getOrientation(); - - //ArrayList pe = findNonHaplotypeAssemblyErrors(intervalsInChr, false); - ArrayList pe = findNonHaplotypeAssemblyErrors(intervalsInChr, false); - - System.err.println("score = " + scoreOld); - - out:for (int e = 0; e + e < pe.size(); ++e) { - PossibleError pe1 = pe.get(e + e); - ContigInterval c1 = intervalsInChr.get(pe1.getCi()); - - PossibleError pe2 = pe.get(e + e + 1); - String errorId = pe1.getInfo() + ((pe2 == null) ? 
"" : "\t" + pe2.getInfo()); - - // see if we have already tried to fix this error, skip with decreasing prob - String errorId2 = errorId.substring(errorId.indexOf('\t')); - if (missedErrors.containsKey(errorId2)) { - double newValue = missedErrors.get(errorId2) * 0.5; - if (Math.random() >= newValue) - continue; - missedErrors.put(errorId2, newValue); - } else - missedErrors.put(errorId2, 2.0); // try twice, then with pr 1/2, 1/4, 1/8, ... - - System.err.println(errorId); - for (int fixIteration = 0; fixIteration < 2; ++fixIteration) { // try to split each region in fixIteration==1 - if (fixIteration == 1) { - calculateScore(intervalsInChr, false); // update pPlus... - pe1 = splitError(c1, pe1); - if (pe1 == null) { - //System.err.println("no split"); - break; - } - pe2 = findFix(intervalsInChr, pe1, false); - errorId = pe1.getInfo() + ((pe2 == null) ? "" : "\t" + pe2.getInfo()); - System.err.println(errorId); - } - ContigInterval sp1[] = null; - ContigInterval sp2[] = null; - ContigInterval c2 = null; - - long p11[] = pe1.getPos1(); - p11[0] -= 1; //-1 - p11[1] -= 1; //-1 - if (pe2 != null) { - c2 = intervalsInChr.get(pe2.getCi()); - if (c1 == c2) { - sp1 = c1.splitContigInterval(p11, pe1.getPos2(), pe2.getPos1()); - } else { - sp1 = c1.splitContigInterval(p11, pe1.getPos2()); - sp2 = c2.splitContigInterval(pe2.getPos1()); - } - } else { - sp1 = c1.splitContigInterval(p11, pe1.getPos2()); - } - - newcis.clear(); - for (ContigInterval ci : intervalsInChr) { - if (ci != c1 && ci != c2) - newcis.add(ci); - else if (ci == c1) - for (ContigInterval c : sp1) - newcis.add(c); - else - for (ContigInterval c : sp2) - newcis.add(c); - } - //System.err.println(newcis); - int splitSize1 = sp1.length; - int splitSize2 = ((sp2 == null) ? 1 : sp2.length); - if (splitSize1 == 1 && splitSize2 == 1) // no split... 
- continue; - - - //add scaffolding links - for (int ci = 1; ci < sp1.length; ++ci) { - String key1 = calculateKey(sp1[ci - 1], true, sp1[ci], true); - String key2 = calculateKey(sp1[ci], false, sp1[ci - 1], false); - scaffoldingLink.put(key1, 1); - scaffoldingLink.put(key2, 1); - } - if (sp2 != null) - for (int ci = 1; ci < sp2.length; ++ci) { - String key1 = calculateKey(sp2[ci - 1], true, sp2[ci], true); - String key2 = calculateKey(sp2[ci], false, sp2[ci - 1], false); - scaffoldingLink.put(key1, 1); - scaffoldingLink.put(key2, 1); - } - //keep old scaffoldingLinks - for (ContigInterval c : intervalsInChr) { - String k1 = calculateKey(c1, true, c, true); - if (scaffoldingLink.containsKey(k1)) { - scaffoldingLink.put(calculateKey(sp1[sp1.length - 1], true, c,true), 1); - scaffoldingLink.put(calculateKey(c, false, sp1[sp1.length - 1], false), 1); - } - String k2 = calculateKey(c, true, c1, true); - if (scaffoldingLink.containsKey(k2)) { - scaffoldingLink.put(calculateKey(c, true, sp1[0], true), 1); - scaffoldingLink.put(calculateKey(sp1[0], false, c, false), 1); - } - if (sp2 != null) { - k1 = calculateKey(c2, true, c, true); - if (scaffoldingLink.containsKey(k1)) { - scaffoldingLink.put(calculateKey(sp2[sp2.length - 1], true, c,true), 1); - scaffoldingLink.put(calculateKey(c, false, sp2[sp2.length - 1], false), 1); - } - k2 = calculateKey(c, true, c2, true); - if (scaffoldingLink.containsKey(k2)) { - scaffoldingLink.put(calculateKey(c, true, sp2[0], true), 1); - scaffoldingLink.put(calculateKey(sp2[0], false, c, false), 1); - } - } - } - //keep old chainLinks - for (ContigInterval c : intervalsInChr) { - for (int o = 0; o < 2; ++o) - for (int o1 = 0; o1 < 2; ++o1) { - String k1 = calculateKey(c, o == 0, c1, o1 == 0); - if (chainLinkHash.containsKey(k1)) { - String knew = calculateKey(c, o == 0, (o1==0) ? 
sp1[0] : sp1[sp1.length - 1], o1 == 0); - chainLinkHash.put(knew, chainLinkHash.get(k1)); - } - String k2 = calculateKey(c1, o1 == 0, c, o == 0); - if (chainLinkHash.containsKey(k2)) { - String knew = calculateKey((o1==0) ? sp1[sp1.length - 1]: sp1[0], o1 == 0, c, o == 0); - chainLinkHash.put(knew, chainLinkHash.get(k2)); - } - if (sp2 != null) { - String k3 = calculateKey(c, o == 0, c2, o1 == 0); - if (chainLinkHash.containsKey(k3)) { - String knew = calculateKey(c, o == 0, (o1==0) ? sp2[0] : sp2[sp2.length - 1], o1 == 0); - chainLinkHash.put(knew, chainLinkHash.get(k3)); - } - String k4 = calculateKey(c2, o1 == 0, c, o == 0); - if (chainLinkHash.containsKey(k4)) { - String knew = calculateKey((o1==0) ? sp2[sp2.length - 1]: sp2[0], o1 == 0, c, o == 0); - chainLinkHash.put(knew, chainLinkHash.get(k4)); - } - - } - - } - } - if (sp2 != null) // keep chain links between c1 to c2 - for (int o1 = 0; o1 < 2; ++o1) - for (int o2 = 0; o2 < 2; ++o2) { - String k1 = calculateKey(c1, o1 == 0, c2, o2 == 0); - if (chainLinkHash.containsKey(k1)) { - String knew = calculateKey((o1==0) ? sp1[sp1.length - 1]: sp1[0], o1 == 0, (o2==0) ? sp2[0]: sp2[sp2.length - 1], o2 == 0); - chainLinkHash.put(knew, chainLinkHash.get(k1)); - } - String k2 = calculateKey(c2, o2 == 0, c1, o1 == 0); - if (chainLinkHash.containsKey(k2)) { - String knew = calculateKey((o2==0) ? sp2[sp2.length - 1]: sp2[0], o2 == 0, (o1==0) ? 
sp1[0]: sp1[sp1.length - 1], o1 == 0); - chainLinkHash.put(knew, chainLinkHash.get(k2)); - } - } - - - - //improveAnchoring(newcis, false, false); - - //faster version of improveAnchoring - { - int bestScore = -1; - ArrayList newcontigs = new ArrayList(); - for (ContigInterval c : sp1) - newcontigs.add(c); - if (sp2 != null) - for (ContigInterval c : sp2) - newcontigs.add(c); - ArrayList newcis2 = new ArrayList(); - newcis2.addAll(newcis); - - ArrayList bestOrientation = new ArrayList(); - - for (int run = 0; run < numRuns; ++run) { - Collections.shuffle(newcontigs); - for (ContigInterval c : newcontigs) { - c.setOrientation((int)(2.0 * Math.random()) - 1); - } - boolean foundBetter = true; - - while (foundBetter) { - foundBetter = false; - for (ContigInterval c : newcontigs) { - if (calculateBest(newcis2, c, false) > 0) - foundBetter = true; - } - } - int score = calculateScore(newcis2, false) + calculateNonMarkerScore(newcis2); - if (score > bestScore) { - bestScore = score; - - newcis.clear(); - newcis.addAll(newcis2); - - bestOrientation.clear(); - for (ContigInterval c : newcis) - bestOrientation.add(c.getOrientation()); - } - } - int oi = 0; - for (ContigInterval c : newcis) - c.setOrientation(bestOrientation.get(oi++)); - } - - int score = calculateScore(newcis, false) + calculateNonMarkerScore(newcis); - //System.err.println(score); - if (score >= minImprovement + scoreOld + 2 * (splitSize1 + splitSize2 - 2)) { // at least +minImprovement in score and +2 for each cut... - System.err.println("split into " + splitSize1 + (splitSize2 <= 1 ? 
"" : "+" + splitSize2)); - System.err.print("number of markers"); - for (ContigInterval c : sp1) - System.err.print("\t" + c.getNumMarkers()); - if (sp2 != null) { - System.err.print("\t|"); - for (ContigInterval c : sp2) - System.err.print("\t" + c.getNumMarkers()); - } - System.err.println(); - - if (!printAnimation.equals("")) { - if (animation == 0) { - String fn = printAnimation + animation + ".la"; - - ArrayList orderSupport = calculateSupport(intervalsInChr); - try { - PrintStream ps = new PrintStream(fn); - printAnchoring(ps, intervalsInChr, orderSupport); - ps.close(); - } - catch (Exception ex){ - ex.printStackTrace(); - } - } - ++animation; - String fn = printAnimation + animation + ".la"; - ArrayList orderSupport = calculateSupport(newcis); - try { - PrintStream ps = new PrintStream(fn); - printAnchoring(ps, newcis, orderSupport); - ps.close(); - } - catch (Exception ex){ - ex.printStackTrace(); - } - } - - System.err.println("Score improvement = " + (score - scoreOld) + " score = " + score); - //System.err.println("score = " + score); - intervalsInChr.clear(); - intervalsInChr.addAll(newcis); - foundFix = true; - - break out; // found fix... - } - // set orientations back to stored state - for (int ci = 0; ci < orientation.length; ++ci) - intervalsInChr.get(ci).setOrientation((orientation[ci])); - } // fixIteration - } // for (int inr e=0; ... - } // while foundFix - int score = calculateScore(intervalsInChr, false); - System.out.println("#final score with corrected contigs " + score + " " + calculateNonMarkerScore(intervalsInChr)); - ArrayList orderSupport = calculateSupport(intervalsInChr); - - printAnchoring(intervalsInChr, orderSupport); - - //print bed for another run of Lep-Anchor - ContigInterval start = null; - for (int ci = 0; ci < intervalsInChr.size(); ++ci) { - ContigInterval c = intervalsInChr.get(ci); - ContigInterval next = ((ci + 1 < intervalsInChr.size()) ? 
intervalsInChr.get(ci + 1) : null); - boolean linked = false; - if (next != null) { - String key = calculateKey(c, next); - if (scaffoldingLink.containsKey(key) && scaffoldingLink.get(key) > 0) - linked = true; - } - if (start == null) - start = c; - - if (!linked) { - long pos1[] = c.getStartI(); - long pos2[] = c.getEndI(); - if (c != start) // more than one linked contigsIntervals... - if (c.getOrientation() >= 0) { - pos1 = start.getStartI(); - } else { - pos2 = start.getEndI(); - } - System.err.println(c.getContig() + "\t" + pos1[0] + "-" + pos1[1] + "\t" + pos2[0]+ "-" + pos2[1] + ((pos2.length==2) ? "" : "*") + "\t?\t" + c.getChromosome() + "\tbed"); - - start = null; - } - } - } - - private static void usageInfo() - { - System.err.println("usage: java PlaceAndOrientContigs bed=bed.file map=map1.txt [map2 [map3 ...]] options"); - System.err.println(" bed=file a file containing (contig start stop) intervals in 1-based coordinates"); - System.err.println(" map=file1 [file2 [...]] linkage map file(s)"); - System.err.println(" columns contig, pos, chromosome, map_bin_start [map_bin_stop [map_bin_start2 map_bin_stop2] [...]]"); - System.err.println(" orientation=+/- [+/- [...]] manual orientation for each map file (found automatically by default)"); - System.err.println(" chain=file chain file "); - System.err.println(" noChromosome=1 input file does not have chromosome column "); - System.err.println(" noIntervals=1 input file does not have intervals but map positions"); - System.err.println(" numRuns=NUM run this many runs to find better anchoring [5]"); - System.err.println(" chromosome=NUM take only chromosome NUM from the bed (chr=column 5) [not set]"); - System.err.println(" and from the map(s)"); - System.err.println(" numThreads=NUM number of threads [1]"); - - System.err.println(" compressMap=0 Do not compress map positions [1]"); - - System.err.println(" randomOrder=1 Start with a random anchoring [not set]"); - - System.err.println(" 
keepEmptyIntervals=1 Keep (contig)intervals without any markers [not set]"); - - System.err.println(" numErrors=NUM List at most this many potential errors [40]"); - System.err.println(" numErrorsPerContig=NUM List at most this many potential errors from one contig [3]\n"); - - System.err.println(" paf=file load alignment file in paf (minimap2) format"); - System.err.println(" maxBridge=NUM maximum scaffolding bridge length (for paf input) [50000]"); - System.err.println(" scalePaf=NUM multiply scaffolding links by NUM [1]"); - System.err.println(" maxIntersect=NUM maximum alignment intersection from paf [2000]"); - System.err.println(" maxPafScore=NUM maximum link score from the paf [not set]\n"); - - System.err.println(" scaleScore=NUM scale aligment scores to markers [0.00001] (0.00001 = 1kb 100% identity = 1)"); - System.err.println(" orientationPenalty=NUM if an aligment is in wrong orientation, multiply score by this [0.5]"); - System.err.println(" cutPenalty=NUM alignment cut penalty [0.001] (0.001 = 1kb gap = -1"); - - System.err.println(" useChainAndPaf=0 do not use both chain and paf score between contigs when available [not set]"); - - System.err.println(" proximity=file NUM1 NUM2 NUM3 load proximity data, NUM1=bin size [10000]"); - System.err.println(" NUM2=max distance in bins[25], NUM3=scale score [1.0]"); - - - System.err.println(" minHaplotypeAlignmentScore=NUM min alignment score required to consider haplotype [-10]"); - System.err.println(" minLinkAlignmentScore=NUM min alignment score required to consider contig link [-10]\n"); - - System.err.println(" evaluateAnchoring=FILE load initial anchring from a FILE (experimental)"); - System.err.println(" improveAnchoring=1 improve loaded initial anchring (experimental)"); - - System.err.println(" cutSites=FILE list possible contig cut sites for contigs"); - System.err.println(" the first cut site within each cut region is taken"); - - System.err.println(" findContigErrors=1 Iteratively find possible 
contig errors based on the map only"); - System.err.println(" paf, proximity and keepEmptyIntervals not allowed"); - - System.err.println(" minImprovement=NUM minimum improvement for findContigErrors [1]"); - System.err.println(" improvement of (NUM + 2*number_of_cuts) required"); - - System.err.println(" printAnimation=file print iterative solutions of findContigErrors to file0.la,...,fileN.la"); - - System.err.println(" alternativeJoins=1 prints alternative scaffolding joins, does not consider map information (yet)"); - System.err.println(" linksWithin=FILE only keep links between contig pairs listed in FILE"); - - } - - public static void main(String[] args) - { - - if (args.length == 0) { - usageInfo(); - System.exit(0); - } - String extraParameters = ""; - for (int i = 0; i < args.length; ++i) { - extraParameters += " " + args[i]; - } - ParameterParser pp = new ParameterParser(); - if (!pp.init(extraParameters)) { - usageInfo(); - System.exit(0); - } - pp.warning(new String[]{"findContigErrors","maxPafScore", "map", "bed", "orientation", "chain", "noIntervals", "noChromosome", "randomOrder", "paf", "evaluateAnchoring", "numRuns", "numErrors", "numErrorsPerContig", "scaleScore", "orientationPenalty", "cutPenalty", "maxBridge", "scalePaf", "minLinkAlignmentScore","minHaplotypeAlignmentScore", "maxIntersect", "chromosome", "keepEmptyIntervals", "cutSites", "useChainAndPaf", "proximity", "printAnimation", "compressMap", "minImprovement", "improveAnchoring", "alternativeJoins", "numThreads", "linksWithin"}); - -// ArrayList m = InputData.loadMap(pp.getValueAsString("map", null)); - //PlaceAndOrientContigs.solve(m); - - PlaceAndOrientContigs poc = new PlaceAndOrientContigs(); - - System.out.println("#java PlaceAndOrientContigs" + extraParameters); - - boolean findContigErrors = pp.getValueAsString("findContigErrors", "0").equals("1"); - - String chain = pp.getValueAsString("chain", null); - String paf = pp.getValueAsString("paf", null); - String prox = 
pp.getValueAsString("proximity", 0, null); - - if (findContigErrors && (paf != null || prox != null || pp.getValueAsString("keepEmptyIntervals", "0").equals("1"))) { - System.err.println("parameters paf, proximity and keepEmptyIntervals not allowed with findContigErrors!"); - System.exit(-1); - } - - poc.setNumThreads(Integer.parseInt(pp.getValueAsString("numThreads", "1"))); - poc.setScaleScore(Double.parseDouble(pp.getValueAsString("scaleScore", "0.00001"))); - - poc.setMaxIntersect(Integer.parseInt(pp.getValueAsString("maxIntersect", "2000"))); - - poc.setOrientationPenalty(Double.parseDouble(pp.getValueAsString("orientationPenalty", "0.5"))); - poc.setCutPenalty(Double.parseDouble(pp.getValueAsString("cutPenalty", "0.001"))); - poc.setMaxBridge(Integer.parseInt(pp.getValueAsString("maxBridge", "50000"))); - - int numMaps = pp.getNumberOfValues("map"); - - int chromosome = Integer.parseInt(pp.getValueAsString("chromosome", "-1")); - String bed = pp.getValueAsString("bed", null); - if (bed != null && numMaps > 0) - poc.loadBed(bed, chromosome); - else { - System.err.println("You have to provide bed and one or more map file(s)!"); - System.exit(-1); - } - - String cutSites = pp.getValueAsString("cutSites", null); - if (cutSites != null) { - poc.loadCutSites(cutSites); - } - - boolean findOrientation = false; - for (int i = 0; i < pp.getNumberOfValues("map"); ++i) { - String o = pp.getValueAsString("orientation", i, "?"); - if (!o.equals("+") && !o.equals("-")) - findOrientation = true; - } - if (pp.getNumberOfValues("map") != pp.getNumberOfValues("orientation") && pp.getNumberOfValues("orientation") > 0) { - System.err.println("You have to provide orientation for all maps or none!"); - System.exit(-1); - } - - poc.setCompressMap(pp.getValueAsString("compressMap", "1").equals("1")); - - for (int i = 0; i < pp.getNumberOfValues("map"); ++i) { - String o = pp.getValueAsString("orientation", i, "?"); - poc.addMap(pp.getValueAsString("map", i, null), !findOrientation 
&& o.equals("-"), pp.getValueAsString("noChromosome", "0").equals("1"), pp.getValueAsString("noIntervals", "0").equals("1"), chromosome); - } - - if (chain != null) { - poc.loadChain(chain); - } - - int maxPafScore = Integer.parseInt(pp.getValueAsString("maxPafScore", "" + Integer.MAX_VALUE)); - - if (paf != null) - poc.loadPaf(paf, maxPafScore, Double.parseDouble(pp.getValueAsString("scalePaf", "1"))); - - if (prox != null) { - int bin = Integer.parseInt(pp.getValueAsString("proximity", 1, "10000")); - int maxD = Integer.parseInt(pp.getValueAsString("proximity", 2, "25")); - double scale = Double.parseDouble(pp.getValueAsString("proximity", 3, "1.0")); - poc.loadProximity(prox, bin, maxD, scale); - } - - poc.setNumRuns(Integer.parseInt(pp.getValueAsString("numRuns", "5"))); - - poc.setNumErrors(Integer.parseInt(pp.getValueAsString("numErrors", "40"))); - poc.setNumErrorsPerContig(Integer.parseInt(pp.getValueAsString("numErrorsPerContig", "3"))); - - poc.setMinHaplotypeAlignmentScore(Integer.parseInt(pp.getValueAsString("minHaplotypeAlignmentScore", "-10"))); - poc.setMinLinkAlignmentScore(Integer.parseInt(pp.getValueAsString("minLinkAlignmentScore", "-10"))); - - poc.setKeepEmptyIntervals(pp.getValueAsString("keepEmptyIntervals", "0").equals("1")); - poc.setUseChainAndPaf(pp.getValueAsString("useChainAndPaf", "1").equals("1")); - poc.setPrintAnimation(pp.getValueAsString("printAnimation", "")); - - String lwf = pp.getValueAsString("linksWithin", null); - if (lwf != null) - poc.linksWithin(lwf); - - poc.setCommentOutput(findContigErrors); - - String eval = pp.getValueAsString("evaluateAnchoring", null); - if (eval != null) { - poc.combineMaps(findOrientation, false, false); - ArrayList eval_result = InputData.loadLa(eval); - poc.evaluateScore(eval_result, pp.getValueAsString("improveAnchoring", "0").equals("1")); - } else { - poc.combineMaps(findOrientation, pp.getValueAsString("randomOrder", "0").equals("1"), true); - } - - if (findContigErrors) { - 
poc.setCommentOutput(false); - poc.findContigErrors(Integer.parseInt(pp.getValueAsString("minImprovement", "1"))); - } - - if (pp.getValueAsString("alternativeJoins", "0").equals("1")) { - poc.printAlternativeJoins(); - } - - - //System.out.println(m); - } - - private String[] contigs(String key) - { - String split[] = key.split("\t"); - int ip = split[2].indexOf('+'); - int im = split[2].indexOf('-'); - int p = Math.max(ip, im); - if (ip > 0 && im > 0) - p = Math.min(ip, im); - return new String[]{split[0], split[3].substring(p + 1)}; - - } - - private void linksWithin(String lwf) { - // TODO Clear proximity as well... - HashMap pairs = new HashMap(); - ArrayList> table = Input.loadTable(lwf, "\t "); - for (ArrayList row : table) { - if (row.size() >= 2) - pairs.put(row.get(0) + "\t" + row.get(1), 1); - } - - for (String key : chainLinkHash.keySet()) { - String cs[] = contigs(key); - String p1 = cs[0] + "\t" + cs[1]; - if (!pairs.containsKey(p1)) { - String p2 = cs[1] + "\t" + cs[0]; - if (!pairs.containsKey(p2)) - chainLinkHash.put(key, 0); - } - } - for (String key : scaffoldingLink.keySet()) { - String cs[] = contigs(key); - String p1 = cs[0] + "\t" + cs[1]; - if (!pairs.containsKey(p1)) { - String p2 = cs[1] + "\t" + cs[0]; - if (!pairs.containsKey(p2)) - scaffoldingLink.put(key, 0); - } - } - } - private int findLastIndex(ArrayList cis, int start, int maxDistance, boolean right) { - if (prox == null) - return start; - int n = cis.size(); - int end = start; - int d = prox.binLength(cis.get(start)); - if (right) - while (end + 1 < n && d < maxDistance) { - ++end; - d += prox.binLength(cis.get(end)); - } - else - while (end > 0 && d < maxDistance) { - --end; - d += prox.binLength(cis.get(end)); - } - - return end; - } - - //score between c and c + 1, c in 0,1,...,cis.size() - 2 - private int linkScore(ArrayList cis, int c) { - int ret = calculateChainScore(cis.get(c), cis.get(c + 1)); - if (prox != null) - ret += prox.linkScore(cis, c, c + 1); - return ret; - } 
- - //score of [start,end] => [start2,end2] in all orientations (++, -+, +-, --) - private int[] scores(ArrayList cis, int start, int end, int start2, int end2){ - ArrayList tmp1 = new ArrayList(); - for (int i = start; i <= end; ++i) - tmp1.add(cis.get(i)); - - ArrayList tmp2 = new ArrayList(); - for (int i = start2; i <= end2; ++i) - tmp2.add(cis.get(i)); - - int s[] = new int[4]; - for (int o2 = 0; o2 < 2; ++o2) { - for (int o1 = 0; o1 < 2; ++o1) { - ArrayList tmp = new ArrayList(); - tmp.addAll(tmp1); - tmp.addAll(tmp2); - s[o1 + 2 * o2] = linkScore(tmp, end - start); - - Collections.reverse(tmp1); - for (ContigInterval c : tmp1) - c.flipOrientation(); - } - Collections.reverse(tmp2); - for (ContigInterval c : tmp2) - c.flipOrientation(); - } - return s; - } - - private void printAlternativeJoins() { - int maxD = 0; - if (prox != null) - maxD = prox.getMaxDistance(); - int n = intervalsInChr.size(); - if (n <= 1) - return; - - int jTable[][] = new int[2 * n][2 * n]; - int start = 0; - while (start < n) { - int end_ = findLastIndex(intervalsInChr, start, maxD, true); // this could be faster... - for (int end = ((start == 0) ? 0 : end_); end <= end_; ++end) - for (int start2 = end + 1; start2 < n; ++start2) { - int end2 = findLastIndex(intervalsInChr, start2, maxD, true); // and this - int s[] = scores(intervalsInChr, start, end, start2, end2); - - //end2 is always maximal - boolean endMaximal = (end == end_); - boolean startMaximal = (findLastIndex(intervalsInChr, end, maxD, false) == start); - boolean start2Maximal = (findLastIndex(intervalsInChr, end2, maxD, false) == start2); - //remove non-maximal scores, otherwise you count each score multiple times for same endpoint with other end not maximal... 
- - if (startMaximal) - jTable[2 * end] [2 * start2] += s[0]; // += is not needed, = is enough - if (endMaximal) - jTable[2 * start + 1][2 * start2] += s[1]; - if (startMaximal && start2Maximal) - jTable[2 * end] [2 * end2 + 1] += s[2]; - if (endMaximal && start2Maximal) - jTable[2 * start + 1][2 * end2 + 1] += s[3]; - - int s2[] = scores(intervalsInChr, start2, end2, start, end); - - if (start2Maximal && endMaximal) - jTable[2 * end2] [2 * start] += s2[0]; - if (endMaximal) - jTable[2 * start2 + 1][2 * start] += s2[1]; - if (start2Maximal && startMaximal) - jTable[2 * end2] [2 * end + 1] += s2[2]; - if (startMaximal) - jTable[2 * start2 + 1][2 * end + 1] += s2[3]; - } - ++start; - } -/* for (int i = 0; i < n + n; ++i) { - StringBuilder sb = new StringBuilder(); - for (int j = 0; j < n + n; ++j) { - if (j != 0) - sb.append('\t'); - if (i == j) - sb.append('X'); - else - sb.append(jTable[i][j]); - } - System.err.println(sb); - }*/ - System.err.println("*** alternativeJoins ***"); - for (int ci = 0; ci < n; ++ci) { - //ArrayList list = new ArrayList(); - for (int i = 0; i < n + n; ++i) { - int ip2 = i >> 1; - if (jTable[2 * ci][i] > 0) { - System.err.println(intervalsInChr.get(ci) + "\t+\t" + intervalsInChr.get(ip2) + "\t" + ("+-".substring(i & 1, (i & 1) + 1)) + "\t" + jTable[2 * ci][i] + "\t" + ci + "\t" + ip2); - } - if (jTable[2 * ci + 1][i] > 0) { - System.err.println(intervalsInChr.get(ci) + "\t-\t" + intervalsInChr.get(ip2) + "\t" + ("+-".substring(i & 1, (i & 1) + 1)) + "\t" + jTable[2 * ci + 1][i] + "\t" + ci + "\t" + ip2); - } - } - - } - } -} diff --git a/software/LepAnchor/src/Proximity.java b/software/LepAnchor/src/Proximity.java deleted file mode 100644 index f1fa1e0..0000000 --- a/software/LepAnchor/src/Proximity.java +++ /dev/null @@ -1,657 +0,0 @@ -import java.io.BufferedReader; -import java.io.FileReader; -import java.io.InputStreamReader; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import 
java.util.HashMap; - - -public class Proximity { - private int binSize; - private int maxDistance; - private double scale; - //eg. binSize=10000 and maxDistance=25 =>10kb bins and max of 250kb - - //TODO: find possible assembly errors... - - public Proximity(int binSize, int maxDistance, double scale){ - this.binSize = binSize; - this.maxDistance = maxDistance; - this.scale = scale; - } - HashMap distanceHash = new HashMap(); - - public int getMaxDistance(){ - return maxDistance; - } - - public boolean loadData(String fn, HashMap> bedHash) - { - boolean nonEmpty = false; - System.err.println("loading proximity data..."); - HashMap symmetryHash = new HashMap(); - try { - HashMap distanceHash_tmp = new HashMap(); - - BufferedReader br = null; - if (fn.equals("-")) - br = new BufferedReader(new InputStreamReader(System.in)); - else - br = new BufferedReader(new FileReader(fn)); - do { - ArrayList row = Input.loadTableRow(br, "\t "); - if (row == null) - break; - String contig1 = row.get(0); - long pos1 = Long.parseLong(row.get(1)); - if (bedHash.containsKey(contig1)) - for (ContigInterval ci1 : bedHash.get(contig1)) - if (ci1.inside(pos1)) { - - int p1 = bin(pos1); - - int de1 = bin(ci1.getEnd()) - p1; - int ds1 = p1 - bin(ci1.getStart()); - - if (de1 >= maxDistance && ds1 >= maxDistance) - continue; - - String contig2 = row.get(2); - long pos2 = Long.parseLong(row.get(3)); - if (bedHash.containsKey(contig2)) - for (ContigInterval ci2 : bedHash.get(contig2)) - if (ci1 != ci2 && ci2.inside(pos2)) { - //DO MAGIC HERE - - int p2 = bin(pos2); - int de2 = bin(ci2.getEnd()) - p2; - int ds2 = p2 - bin(ci2.getStart()); - - if (de2 >= maxDistance && ds2 >= maxDistance || Math.min(de1, ds1) + Math.min(de2, ds2) >= maxDistance) - continue; - - nonEmpty = true; - - double value = Double.parseDouble(row.get(4)) * scale; - - if (de1 + ds2 < maxDistance) { // ++ - String key = calculateKey(ci1, true, ci2, true); - symmetryHash.put(key, calculateKey(ci2, false, ci1, false)); - - 
double list[] = distanceHash_tmp.get(key); - if (list == null) { - list = new double[maxDistance]; - distanceHash_tmp.put(key, list); - } - list[de1 + ds2] += value; - } - if (de1 + de2 < maxDistance) { // +- - String key = calculateKey(ci1, true, ci2, false); - symmetryHash.put(key, calculateKey(ci2, true, ci1, false)); - - double list[] = distanceHash_tmp.get(key); - if (list == null) { - list = new double[maxDistance]; - distanceHash_tmp.put(key, list); - } - list[de1 + de2] += value; - } - if (ds1 + ds2 < maxDistance) { // -+ - String key = calculateKey(ci1, false, ci2, true); - symmetryHash.put(key, calculateKey(ci2, false, ci1, true)); - - double list[] = distanceHash_tmp.get(key); - if (list == null) { - list = new double[maxDistance]; - distanceHash_tmp.put(key, list); - } - list[ds1 + ds2] += value; - } - if (ds1 + de2 < maxDistance) { // -- - String key = calculateKey(ci1, false, ci2, false); - symmetryHash.put(key, calculateKey(ci2, true, ci1, true)); - - double list[] = distanceHash_tmp.get(key); - if (list == null) { - list = new double[maxDistance]; - distanceHash_tmp.put(key, list); - } - list[ds1 + de2] += value; - } - } - } - } while (true); - br.close(); - - //transfer and truncate data from distanceHash_tmp (double) to distanceHash (int) - //make scores symmetric and do rounding - ArrayList missingKeys = new ArrayList(); - for (String key : distanceHash_tmp.keySet()) { - String sk = symmetryHash.get(key); - if (!distanceHash_tmp.containsKey(sk)) - missingKeys.add(key); - } - for (String key : missingKeys) { // put missing symmetric keys to the hashes - String sk = symmetryHash.get(key); - distanceHash_tmp.put(sk, distanceHash_tmp.get(key)); - symmetryHash.put(sk, key); - } - - for (String key : distanceHash_tmp.keySet()) { - double list[] = distanceHash_tmp.get(key); - double list2[] = distanceHash_tmp.get(symmetryHash.get(key)); // symmetric key - - int new_list[] = new int[maxDistance]; - - double sum = 0.0; - for (int i = 0; i < maxDistance; 
++i) { - sum += list[i] + list2[i]; - new_list[maxDistance - i - 1] = (int)(0.5 * sum + 0.5); - } - int maxLength = maxDistance; - while (maxLength > 0 && new_list[maxLength - 1] == 0) - --maxLength; - if (maxLength > 0) - distanceHash.put(key, Arrays.copyOf(new_list, maxLength)); - -/* int maxIndex = maxDistance - 1; - double sum = 0.0; - while (maxIndex >= 0 && sum + list[maxIndex] + list2[maxIndex] < 1.0) { - sum += list[maxIndex] + list2[maxIndex]; - --maxIndex; - } - if (maxIndex >= 0) { - int new_list[] = new int[maxIndex + 1]; - while (maxIndex >= 0) { - sum += list[maxIndex] + list2[maxIndex]; - new_list[maxIndex] = (int)(0.5 * sum + 0.5); - --maxIndex; - } - distanceHash.put(key, new_list); - }*/ - } - - } catch (Exception e) { - e.printStackTrace(); - System.err.println("Error in file " + fn); - } - - System.err.println("Proximity links:"); - for (String key: distanceHash.keySet()) - System.err.println(key + "\t" + distanceHash.get(key)[0]); - return nonEmpty; - } - - - - //"bin" a contig position - private int bin(long position) - { - return (int) ((position - 1) / binSize); - } - - //how many bins a contig spans... - public int binLength(ContigInterval ci) - { - int s = bin(ci.getStart()); - int e = bin(ci.getEnd()); - return (e - s + 1); - } - - private String calculateKey(ContigInterval c1, ContigInterval c2) - { - return calculateKey(c1, c1.getOrientation() >= 0, c2, c2.getOrientation() >= 0); - } - - private String calculateKey(ContigInterval c1, boolean orientation1, ContigInterval c2, boolean orientation2) - { - return c1.toString() + (orientation1 ? '+':'-') + c2.toString() + (orientation2 ? 
'+':'-'); - } - - // calculate a list of contigs that should be evaluate against cis.get(ci) - private ArrayList next(ArrayList cis, int ci, int last) { - ArrayList ret = new ArrayList(); - //int n = cis.size(); - int i = ci + 1; - int d = 0; - while (i <= last && d < maxDistance) { - ContigInterval c = cis.get(i); - ret.add(i); - d += binLength(c); - ++i; - } - return ret; - } - - // calculate a list of contigs that should be evaluate against cis.get(ci) - private ArrayList prev(ArrayList cis, int ci, int first) { - ArrayList ret = new ArrayList(); - int i = ci - 1; - int d = 0; - while (i >= first && d < maxDistance) { - ContigInterval c = cis.get(i); - ret.add(i); - d += binLength(c); - --i; - } - return ret; - } - - private int borderScore(ArrayList cis, ArrayList prev, ArrayList next){ - int ret = 0; - int dp = 0; - for (int pci : prev) { - ContigInterval pc = ((pci < 0) ? cis.get(-pci - 1) : cis.get(pci)); - int dnp = dp; - dp += binLength(pc); - for (int nci : next) { - if (dnp >= maxDistance) - break; - ContigInterval nc = ((nci < 0) ? 
cis.get(-nci - 1) : cis.get(nci)); - - ret += score(pc, pci < 0, nc, nci < 0, dnp); - dnp += binLength(nc); - } - } - return ret; - } - // difference in score if cis[start...end] are moved to new_position and possibly flipped - public int scoreChange_slow(ArrayList cis, int start, int end, int moveDirection, boolean flip) - { - int scoreOld = score(cis); - PlaceAndOrientContigs.changeOrder(cis, start, end, moveDirection, flip); - int scoreNew = score(cis); - PlaceAndOrientContigs.changeOrderReverse(cis, start, end, moveDirection, flip); - //System.err.println(scoreNew - scoreOld); - return scoreNew - scoreOld; - } - - // difference in score if cis[start...end] are moved to new_position and possibly flipped - public int scoreChange(ArrayList cis, int start, int end, int moveDirection, boolean flip) - { - int ret = 0; - int n = cis.size(); - if (moveDirection == 0) { // flip - if (flip) { - //first part - ArrayList next = next(cis, start - 1, end); // start, start+1, ..., end - ArrayList prev = prev(cis, start, 0); // start-1, start-2, ..., 0 - - ArrayList new_next = new ArrayList(); - int d = 0; - for (int i = end; d < maxDistance && i >= start; --i) { - ContigInterval nc = cis.get(i); - new_next.add(-i - 1); - d += binLength(nc); - } - //subtract old score for start-1 -> start boundary, - ret -= borderScore(cis, prev, next); - //...and add new score for this boundary - ret += borderScore(cis, prev, new_next); - - //second part - - next = next(cis, end, n - 1); // end+1, end+2, ..., n-1 - prev = prev(cis, end + 1, start); // end, end-1, ..., start - - ArrayList new_prev = new ArrayList(); - d = 0; - for (int i = start; d < maxDistance && i <= end; ++i) { - ContigInterval nc = cis.get(i); - new_prev.add(-i - 1); - d += binLength(nc); - } - //subtract old score for end -> end+1 boundary, - ret -= borderScore(cis, prev, next); - //..and add new score for this boundary - ret += borderScore(cis, new_prev, next); - - } // else nothing to do... 
- } else { // move one or multiple contigs... - if (moveDirection >= 0) { // actually moveDirection > 0 - //int first = start - //int second = end; - //int third = end + moveDirection; - //first part - ArrayList next = next(cis, start - 1, n - 1); // start, start+1, ..., n-1 - ArrayList prev = prev(cis, start, 0); // start-1, start-2, ..., 0 - ret -= borderScore(cis, prev, next); - - ArrayList new_next = new ArrayList(); - int d = 0; - - for (int i = end + 1; d < maxDistance && i <= end + moveDirection; ++i) { - ContigInterval nc = cis.get(i); - new_next.add(i); - d += binLength(nc); - } - - if (flip) - for (int i = end; d < maxDistance && i >= start; --i) { - ContigInterval nc = cis.get(i); - new_next.add(-i - 1); - d += binLength(nc); - } - else - for (int i = start; d < maxDistance && i <= end; ++i) { - ContigInterval nc = cis.get(i); - new_next.add(i); - d += binLength(nc); - } - - for (int i = end + moveDirection + 1; d < maxDistance && i < n; ++i) { - ContigInterval nc = cis.get(i); - new_next.add(i); - d += binLength(nc); - } - //flip: new_next = end + 1, ..., end + moveDirection,end...start, end + moveDirection + 1 ... n - 1 - //!flip: new_next = end + 1, ..., end + moveDirection,start...end, end + moveDirection + 1 ... 
n - 1 - //prev = start - 1, start - 2, ..., 0 - - ret += borderScore(cis, prev, new_next); - - //second part - next = next(cis, end, n - 1); // second, second+1, ..., n-1 - prev = prev(cis, end + 1, start); // second-1, second-2, ..., first - - ret -= borderScore(cis, prev, next); - - new_next = new ArrayList(); - d = 0; - if (flip) - for (int i = end; d < maxDistance && i >= start; --i) { - ContigInterval nc = cis.get(i); - new_next.add(-i - 1); - d += binLength(nc); - } - else - for (int i = start; d < maxDistance && i <= end; ++i) { - ContigInterval nc = cis.get(i); - new_next.add(i); - d += binLength(nc); - } - for (int i = end + moveDirection + 1; d < maxDistance && i < n; ++i) { - ContigInterval nc = cis.get(i); - new_next.add(i); - d += binLength(nc); - } - //flip: new_next = end...start, end + moveDirection + 1 ... n - 1 - //!flip: new_next = start..end, ..., end + moveDirection + 1 ... n - 1 - - ArrayList new_prev = new ArrayList(); - d = 0; - - for (int i = end + moveDirection; d < maxDistance && i > end; --i) { //TODO: check if (>= end or) > is proper - ContigInterval nc = cis.get(i); - new_prev.add(i); - d += binLength(nc); - } - //new_prev = end + moveDirection...end - ret += borderScore(cis, new_prev, new_next); - - //third part - next = next(cis, end + moveDirection, n - 1); // third, third+1, ..., n-1 - prev = prev(cis, end + moveDirection + 1, end); // third-1, third-2, ..., second - - ret -= borderScore(cis, prev, next); - new_prev = new ArrayList(); - - //TODO: calculate new_prev - d = 0; - if (flip) - for (int i = start; d < maxDistance && i <= end; ++i) { - ContigInterval nc = cis.get(i); - new_prev.add(-i - 1); - d += binLength(nc); - } - else - for (int i = end; d < maxDistance && i >= start; --i) { - ContigInterval nc = cis.get(i); - new_prev.add(i); - d += binLength(nc); - } - ret += borderScore(cis, new_prev, next); - //System.err.println(borderScore(cis, prev, next)); - //System.err.println(prev); - //System.err.println(next); - - } else 
{ //moveDirection < 0 - //int first = start + moveDirection; - //int second = start; - //int third = end; - //first part - ArrayList next = next(cis, start + moveDirection - 1, n - 1); // first, first+1, ..., n-1 - ArrayList prev = prev(cis, start + moveDirection, 0); // first-1, first-2, ..., 0 - ret -= borderScore(cis, prev, next); - - ArrayList new_next = new ArrayList(); - int d = 0; - if (flip) - for (int i = end; d < maxDistance && i >= start; --i) { - ContigInterval nc = cis.get(i); - new_next.add(-i - 1); - d += binLength(nc); - } - else - for (int i = start; d < maxDistance && i <= end; ++i) { - ContigInterval nc = cis.get(i); - new_next.add(i); - d += binLength(nc); - } - - for (int i = start + moveDirection; d < maxDistance && i < start; ++i) { - ContigInterval nc = cis.get(i); - new_next.add(i); - d += binLength(nc); - } - - for (int i = end + 1; d < maxDistance && i < n; ++i) { - ContigInterval nc = cis.get(i); - new_next.add(i); - d += binLength(nc); - } - - ret += borderScore(cis, prev, new_next); - //System.err.println(""); - //System.err.println(prev); - //System.err.println(next); - //System.err.println("->"); - //System.err.println(prev); - //System.err.println(new_next); - - //second part - next = next(cis, start - 1, n - 1); // second, second+1, ..., n-1 - prev = prev(cis, start, start + moveDirection); // second-1, second-2, ..., first - - ret -= borderScore(cis, prev, next); - - new_next = new ArrayList(); - - d = 0; - for (int i = start + moveDirection; d < maxDistance && i < start; ++i) { - ContigInterval nc = cis.get(i); - new_next.add(i); - d += binLength(nc); - } - - for (int i = end + 1; d < maxDistance && i < n; ++i) { - ContigInterval nc = cis.get(i); - new_next.add(i); - d += binLength(nc); - } - - ArrayList new_prev = new ArrayList(); - - d = 0; - if (flip) - for (int i = start; d < maxDistance && i <= end; ++i) { - ContigInterval nc = cis.get(i); - new_prev.add(-i - 1); - d += binLength(nc); - } - else - for (int i = end; d < 
maxDistance && i >= start; --i) { - ContigInterval nc = cis.get(i); - new_prev.add(i); - d += binLength(nc); - } - - ret += borderScore(cis, new_prev, new_next); - //System.err.println(""); - //System.err.println(prev); - //System.err.println(next); - //System.err.println("->"); - //System.err.println(new_prev); - //System.err.println(new_next); - - //third part - next = next(cis, end, n - 1); // third, third+1, ..., n-1 - prev = prev(cis, end + 1, start); // third-1, third-2, ..., second - - ret -= borderScore(cis, prev, next); - new_prev = new ArrayList(); - //TODO: calculate new_prev - d = 0; - for (int i = start - 1; d < maxDistance && i >= start + moveDirection; --i) { - ContigInterval nc = cis.get(i); - new_prev.add(i); - d += binLength(nc); - } - ret += borderScore(cis, new_prev, next); - //System.err.println(""); - //System.err.println(prev); - //System.err.println(next); - //System.err.println("->"); - //System.err.println(new_prev); - //System.err.println(next); - } - - } - //if (true) { - // int value = scoreChange_slow(cis, start, end, moveDirection, flip); - // if (value != ret) - // System.err.println(ret + "!=" + value + " " + start + " " + end + " " + moveDirection + " " + flip); - // return value; - //} - - return ret; - } - - public int score(ArrayList cis) - { - int ret = 0; - for (int i = 1; i < cis.size(); ++i) { - int j = i - 1; - int d = 0; - while (j >= 0 && d < maxDistance) { - ret += score(cis.get(j), cis.get(i), d); - d += binLength(cis.get(j)); - --j; - } - } - return ret; - } - - private int score(ContigInterval c1, ContigInterval c2, int distance) - { - int d[] = distanceHash.get(calculateKey(c1, c2)); - if (d != null && d.length > distance) - return d[distance]; - return 0; - } - - public int score(String key, int distance) - { - int d[] = distanceHash.get(key); - if (d != null && d.length > distance) - return d[distance]; - return 0; - } - - private int score(ContigInterval c1, boolean flip1, ContigInterval c2, boolean flip2, int 
distance) - { - int d[] = distanceHash.get(calculateKey(c1, (c1.getOrientation() >= 0) ^ flip1, c2, (c2.getOrientation() >= 0) ^ flip2)); - if (d != null && d.length > distance) - return d[distance]; - return 0; - } - - public static void main(String args[]) - { - //Test Proximity class - ArrayList bed = InputData.loadBed("proximity.bed"); - - HashMap> bedHash2 = new HashMap>(); - for (ContigInterval ci : bed) { - String contig = ci.getContig(); - if (!bedHash2.containsKey(contig)) - bedHash2.put(contig, new ArrayList()); - bedHash2.get(contig).add(ci); - } - Proximity p = new Proximity(10000, 2000, 0.01); - p.loadData("ld.txt", bedHash2); - ContigInterval c1 = bedHash2.get("000604F|quiver|pilon").get(0); - ContigInterval c2 = bedHash2.get("000000F|quiver|pilon").get(0); - ContigInterval c3 = bedHash2.get("000310F|quiver|pilon").get(0); - ArrayList cis = new ArrayList(); - cis.add(c1); - cis.add(c2); - cis.add(c3); - Collections.reverse(cis); - for (int orient = 0; orient < 8; ++orient) { - c1.setOrientation((orient & 4) - 1); - c2.setOrientation((orient & 2) - 1); - c3.setOrientation((orient & 1) - 1); - System.err.println(Integer.toBinaryString(orient + 8).substring(1) + ":\t" + p.score(cis)); - } - System.err.println(); - Collections.reverse(cis); - for (int orient = 0; orient < 8; ++orient) { - c1.setOrientation((orient & 4) - 1); - c2.setOrientation((orient & 2) - 1); - c3.setOrientation((orient & 1) - 1); - System.err.println(Integer.toBinaryString(orient + 8).substring(1) + ":\t" + p.score(cis)); - } - - c1.setOrientation(1); - c2.setOrientation(-1); - c3.setOrientation(1); - System.err.println(p.score(cis)); - System.err.println(); - for (int i = 0; i < 3; ++i) { - System.err.println(p.scoreChange(cis, i, i, 0, true)); - System.err.println(p.scoreChange_slow(cis, i, i, 0, true)); - System.err.println(); - } - - for (int i = 0; i < 2; ++i) { - System.err.println(p.scoreChange(cis, i, i, 1, true)); - System.err.println(p.scoreChange_slow(cis, i, i, 1, true)); 
- System.err.println(); - } - System.err.println(p.scoreChange(cis, 0, 0, 2, true)); - System.err.println(p.scoreChange_slow(cis, 0, 0, 2, true)); - System.err.println(); - - System.err.println(p.scoreChange(cis, 1, 2, -1, true)); - System.err.println(p.scoreChange_slow(cis, 1, 2, -1, true)); - System.err.println(); - - - System.err.println(p.scoreChange(cis, 0, 1, 1, true)); - System.err.println(p.scoreChange_slow(cis, 0, 1, 1, true)); - System.err.println(); - - } - // linkScore between ... ,prevC-1, prevC and nextC, nextC + 1, ... - public int linkScore(ArrayList cis, int prevC, int nextC) { - ArrayList next = next(cis, nextC - 1, cis.size() - 1); - ArrayList prev = prev(cis, prevC + 1, 0); - return borderScore(cis, prev, next); - } - - public boolean linkedIntervals(ArrayList cis, int c1, int c2) { - return (linkScore(cis, c1, c2) > 0); - } -} diff --git a/software/LepMap3/DataParser$PosteriorParser.class b/software/LepMap3/DataParser$PosteriorParser.class index af24204..4970103 100644 Binary files a/software/LepMap3/DataParser$PosteriorParser.class and b/software/LepMap3/DataParser$PosteriorParser.class differ diff --git a/software/LepMap3/DataParser$VCFParser.class b/software/LepMap3/DataParser$VCFParser.class index a9c9a5b..d65d91a 100644 Binary files a/software/LepMap3/DataParser$VCFParser.class and b/software/LepMap3/DataParser$VCFParser.class differ diff --git a/software/LepMap3/DataParser.class b/software/LepMap3/DataParser.class index 0dfdb77..35d5dd6 100644 Binary files a/software/LepMap3/DataParser.class and b/software/LepMap3/DataParser.class differ diff --git a/software/LepMap3/Error.class b/software/LepMap3/Error.class index 4a3c416..94ab6f1 100644 Binary files a/software/LepMap3/Error.class and b/software/LepMap3/Error.class differ diff --git a/software/LepMap3/Family2.class b/software/LepMap3/Family2.class index 9517aa3..f7a7dad 100644 Binary files a/software/LepMap3/Family2.class and b/software/LepMap3/Family2.class differ diff --git 
a/software/LepMap3/Filtering2.class b/software/LepMap3/Filtering2.class index aaa172b..c2aabb4 100644 Binary files a/software/LepMap3/Filtering2.class and b/software/LepMap3/Filtering2.class differ diff --git a/software/LepMap3/GammaFunction.class b/software/LepMap3/GammaFunction.class index dc5b8e1..bd257e5 100644 Binary files a/software/LepMap3/GammaFunction.class and b/software/LepMap3/GammaFunction.class differ diff --git a/software/LepMap3/IBD.class b/software/LepMap3/IBD.class index c7cfb65..f64f796 100644 Binary files a/software/LepMap3/IBD.class and b/software/LepMap3/IBD.class differ diff --git a/software/LepMap3/Input.class b/software/LepMap3/Input.class index 0c03486..198306c 100644 Binary files a/software/LepMap3/Input.class and b/software/LepMap3/Input.class differ diff --git a/software/LepMap3/JoinSingles2All.class b/software/LepMap3/JoinSingles2All.class index eea92b9..773553b 100644 Binary files a/software/LepMap3/JoinSingles2All.class and b/software/LepMap3/JoinSingles2All.class differ diff --git a/software/LepMap3/LMPlot.class b/software/LepMap3/LMPlot.class index 0702bbf..d91ed8c 100644 Binary files a/software/LepMap3/LMPlot.class and b/software/LepMap3/LMPlot.class differ diff --git a/software/LepMap3/Misc$ArrayIndexComparator.class b/software/LepMap3/Misc$ArrayIndexComparator.class index b651069..87a8c03 100644 Binary files a/software/LepMap3/Misc$ArrayIndexComparator.class and b/software/LepMap3/Misc$ArrayIndexComparator.class differ diff --git a/software/LepMap3/Misc$KthSmallest.class b/software/LepMap3/Misc$KthSmallest.class index 7779ff5..300a279 100644 Binary files a/software/LepMap3/Misc$KthSmallest.class and b/software/LepMap3/Misc$KthSmallest.class differ diff --git a/software/LepMap3/Misc.class b/software/LepMap3/Misc.class index 0e6c4a0..165162c 100644 Binary files a/software/LepMap3/Misc.class and b/software/LepMap3/Misc.class differ diff --git a/software/LepMap3/Order.class b/software/LepMap3/Order.class index d8bc83a..f3a136b 
100644 Binary files a/software/LepMap3/Order.class and b/software/LepMap3/Order.class differ diff --git a/software/LepMap3/OrderFinder$MergeRunner.class b/software/LepMap3/OrderFinder$MergeRunner.class new file mode 100644 index 0000000..6aab1bb Binary files /dev/null and b/software/LepMap3/OrderFinder$MergeRunner.class differ diff --git a/software/LepMap3/OrderFinder$PhysicalFamily.class b/software/LepMap3/OrderFinder$PhysicalFamily.class index 0e56c8e..7bce76d 100644 Binary files a/software/LepMap3/OrderFinder$PhysicalFamily.class and b/software/LepMap3/OrderFinder$PhysicalFamily.class differ diff --git a/software/LepMap3/OrderFinder$PolishRunner.class b/software/LepMap3/OrderFinder$PolishRunner.class new file mode 100644 index 0000000..3a981ff Binary files /dev/null and b/software/LepMap3/OrderFinder$PolishRunner.class differ diff --git a/software/LepMap3/OrderFinder$SingleFamily$RecombinationScale.class b/software/LepMap3/OrderFinder$SingleFamily$RecombinationScale.class index 939ca88..31f95a2 100644 Binary files a/software/LepMap3/OrderFinder$SingleFamily$RecombinationScale.class and b/software/LepMap3/OrderFinder$SingleFamily$RecombinationScale.class differ diff --git a/software/LepMap3/OrderFinder$SingleFamily$RecombinationScale1.class b/software/LepMap3/OrderFinder$SingleFamily$RecombinationScale1.class index 7e200af..45cbf52 100644 Binary files a/software/LepMap3/OrderFinder$SingleFamily$RecombinationScale1.class and b/software/LepMap3/OrderFinder$SingleFamily$RecombinationScale1.class differ diff --git a/software/LepMap3/OrderFinder$SingleFamily$RecombinationScale2.class b/software/LepMap3/OrderFinder$SingleFamily$RecombinationScale2.class index 36c807e..33c3937 100644 Binary files a/software/LepMap3/OrderFinder$SingleFamily$RecombinationScale2.class and b/software/LepMap3/OrderFinder$SingleFamily$RecombinationScale2.class differ diff --git a/software/LepMap3/OrderFinder$SingleFamily.class b/software/LepMap3/OrderFinder$SingleFamily.class index 
5344fea..e79be2a 100644 Binary files a/software/LepMap3/OrderFinder$SingleFamily.class and b/software/LepMap3/OrderFinder$SingleFamily.class differ diff --git a/software/LepMap3/OrderFinder.class b/software/LepMap3/OrderFinder.class index 620b72c..96c3922 100644 Binary files a/software/LepMap3/OrderFinder.class and b/software/LepMap3/OrderFinder.class differ diff --git a/software/LepMap3/OrderMarkers2.class b/software/LepMap3/OrderMarkers2.class index 1a6f8fa..43a89c8 100644 Binary files a/software/LepMap3/OrderMarkers2.class and b/software/LepMap3/OrderMarkers2.class differ diff --git a/software/LepMap3/ParameterParser.class b/software/LepMap3/ParameterParser.class index 8924c5c..1f8dc92 100644 Binary files a/software/LepMap3/ParameterParser.class and b/software/LepMap3/ParameterParser.class differ diff --git a/software/LepMap3/ParentCall2.class b/software/LepMap3/ParentCall2.class index 0bb8a36..cd7f580 100644 Binary files a/software/LepMap3/ParentCall2.class and b/software/LepMap3/ParentCall2.class differ diff --git a/software/LepMap3/Pileup2Likelihoods.class b/software/LepMap3/Pileup2Likelihoods.class index 144e448..3dbf7ad 100644 Binary files a/software/LepMap3/Pileup2Likelihoods.class and b/software/LepMap3/Pileup2Likelihoods.class differ diff --git a/software/LepMap3/Pileup2Likelihoods2.class b/software/LepMap3/Pileup2Likelihoods2.class new file mode 100644 index 0000000..25e9209 Binary files /dev/null and b/software/LepMap3/Pileup2Likelihoods2.class differ diff --git a/software/LepMap3/QTL.class b/software/LepMap3/QTL.class index 6dd6e7f..ea939a1 100644 Binary files a/software/LepMap3/QTL.class and b/software/LepMap3/QTL.class differ diff --git a/software/LepMap3/Separate2$JoinSinglesThread.class b/software/LepMap3/Separate2$JoinSinglesThread.class index 93baa37..20ff226 100644 Binary files a/software/LepMap3/Separate2$JoinSinglesThread.class and b/software/LepMap3/Separate2$JoinSinglesThread.class differ diff --git 
a/software/LepMap3/Separate2$JoinSinglesThreadMaxDistance.class b/software/LepMap3/Separate2$JoinSinglesThreadMaxDistance.class index 4372a16..15fab55 100644 Binary files a/software/LepMap3/Separate2$JoinSinglesThreadMaxDistance.class and b/software/LepMap3/Separate2$JoinSinglesThreadMaxDistance.class differ diff --git a/software/LepMap3/Separate2$SeparateIdenticalThread.class b/software/LepMap3/Separate2$SeparateIdenticalThread.class index f3be6ae..c491b22 100644 Binary files a/software/LepMap3/Separate2$SeparateIdenticalThread.class and b/software/LepMap3/Separate2$SeparateIdenticalThread.class differ diff --git a/software/LepMap3/Separate2$SeparateThread.class b/software/LepMap3/Separate2$SeparateThread.class index 283a9d5..3f798c2 100644 Binary files a/software/LepMap3/Separate2$SeparateThread.class and b/software/LepMap3/Separate2$SeparateThread.class differ diff --git a/software/LepMap3/Separate2.class b/software/LepMap3/Separate2.class index 1c9670c..28e70f4 100644 Binary files a/software/LepMap3/Separate2.class and b/software/LepMap3/Separate2.class differ diff --git a/software/LepMap3/SeparateChromosomes2.class b/software/LepMap3/SeparateChromosomes2.class index 98367c0..e3289f2 100644 Binary files a/software/LepMap3/SeparateChromosomes2.class and b/software/LepMap3/SeparateChromosomes2.class differ diff --git a/software/LepMap3/ShortPath.class b/software/LepMap3/ShortPath.class index fccee92..0bf46d3 100644 Binary files a/software/LepMap3/ShortPath.class and b/software/LepMap3/ShortPath.class differ diff --git a/software/LepMap3/UnionFind.class b/software/LepMap3/UnionFind.class index 2bac911..e375b1b 100644 Binary files a/software/LepMap3/UnionFind.class and b/software/LepMap3/UnionFind.class differ diff --git a/software/LepMap3/scripts/affx2post.awk b/software/LepMap3/scripts/affx2post.awk deleted file mode 100644 index 9ca5d4a..0000000 --- a/software/LepMap3/scripts/affx2post.awk +++ /dev/null @@ -1,83 +0,0 @@ -#converts a genotype file (SNP_name + 
AA/AB/BB for each individual) to posterior -#awk [-verror=0.001] -f affx2post.awk genotypes.txt >genotypes.post -function alleles2Index(a, b) -{ - if (b == 1) - return a - if (b == 2) - return 3 + a - if (b == 3) - return 5 + a - return 10 -} -function allele1(code) -{ - if (code <= 4) - return 1 - if (code <= 7) - return 2 - if (code <= 9) - return 3 - return 4 -} -function allele2(code) -{ - if (code <= 4) - return code - if (code <= 7) - return code - 3 - if (code <= 9) - return code - 5 - return 4 -} - -function distance(code1, code2 ,a1,a2,b1,b2) -{ - if (code1 == code2) - return 0 - a1 = allele1(code1) - a2 = allele2(code1) - - b1 = allele1(code2) - b2 = allele2(code2) - - if (a1 == b1 || a1 == b2 || a2 == b1 || a2 == b2) - return 1 - return 2 -} - - - -BEGIN{ - FS="\t" - OFS="\t" - if (error == "") - error = 0.001 - - code = 1 - for (i = 1; i <= 4; ++i) - for (j = i; j <= 4; ++j) { - s = "" - for (k = 1; k <= 10; ++k) - s = s " " error ^ distance(code, k) - map[i " " j] = substr(s, 2) - map[j " " i] = map[i " " j] - ++code - } - - map["0 0"] = "1 1 1 1 1 1 1 1 1 1" - map2["AA"]="1 1" - map2["AB"]="1 2" - map2["BB"]="2 2" - map2["NoCall"]="0 0" -} - -($1 !~ /^#/) { - $1=$1"\t"$1 - if (++line > 1) - for (i = 2; i <= NF; ++i) - $i = map[map2[$i]] - print -} - - diff --git a/software/LepMap3/scripts/allPaternal.awk b/software/LepMap3/scripts/allPaternal.awk deleted file mode 100644 index 5226eaf..0000000 --- a/software/LepMap3/scripts/allPaternal.awk +++ /dev/null @@ -1,45 +0,0 @@ -#flips maternally informative markers to paternally inf -BEGIN{ - FS="\t" - OFS="\t" - inf["0 1.0 0 0 0 0 0 0 0 0"]=1 - inf["0 0 1.0 0 0 0 0 0 0 0"]=1 - inf["0 0 0 1.0 0 0 0 0 0 0"]=1 - inf["0 0 0 0 0 1.0 0 0 0 0"]=1 - inf["0 0 0 0 0 0 1.0 0 0 0"]=1 - inf["0 0 0 0 0 0 0 0 1.0 0"]=1 - - inf["0 1 0 0 0 0 0 0 0 0"]=1 - inf["0 0 1 0 0 0 0 0 0 0"]=1 - inf["0 0 0 1 0 0 0 0 0 0"]=1 - inf["0 0 0 0 0 1 0 0 0 0"]=1 - inf["0 0 0 0 0 0 1 0 0 0"]=1 - inf["0 0 0 0 0 0 0 0 1 0"]=1 - -} - 
-(NR<=7){print} - -(NR==2){for (i = 3; i<=NF; ++i) {if (!($i in d)) d[$i] = ++p; f[i]=d[$i]}} - -(NR==4){for (i = 3; i<=NF; ++i) if ($i==0) pa[f[i], ++count[f[i]]]=i} - -(NR==6){for (i = 3; i<=NF; ++i) sex[i]=$i} - -#(NR==7){print "#"} - -(NR>7){ - for (j = 1; j <= p; ++j) { - i = 0 - if (inf[$(pa[j, 1])]==1) - i += sex[pa[j, 1]] - if (inf[$(pa[j, 2])]==1) - i += sex[pa[j, 2]] - if (i == 2) { - tmp = $(pa[j, 1]) - $(pa[j, 1]) = $(pa[j, 2]) - $(pa[j, 2]) = tmp - } - } - print -} diff --git a/software/LepMap3/scripts/genotypes2post.awk b/software/LepMap3/scripts/genotypes2post.awk deleted file mode 100644 index e0baf69..0000000 --- a/software/LepMap3/scripts/genotypes2post.awk +++ /dev/null @@ -1,81 +0,0 @@ -#converts a genotype file (transpose of linkage file with marker names) to posterior -#awk [-verror=0.001] -f genotypes2post.awk genotypes >genotypes.post -function alleles2Index(a, b) -{ - if (b == 1) - return a - if (b == 2) - return 3 + a - if (b == 3) - return 5 + a - return 10 -} -function allele1(code) -{ - if (code <= 4) - return 1 - if (code <= 7) - return 2 - if (code <= 9) - return 3 - return 4 -} -function allele2(code) -{ - if (code <= 4) - return code - if (code <= 7) - return code - 3 - if (code <= 9) - return code - 5 - return 4 -} - -function distance(code1, code2 ,a1,a2,b1,b2) -{ - if (code1 == code2) - return 0 - a1 = allele1(code1) - a2 = allele2(code1) - - b1 = allele1(code2) - b2 = allele2(code2) - - if (a1 == b1 || a1 == b2 || a2 == b1 || a2 == b2) - return 1 - return 2 -} - - - -BEGIN{ - FS="\t" - OFS="\t" - if (error == "") - error = 0.001 - - code = 1 - for (i = 1; i <= 4; ++i) - for (j = i; j <= 4; ++j) { - s = "" - for (k = 1; k <= 10; ++k) - s = s " " error ^ distance(code, k) - map[i " " j] = substr(s, 2) - map[j " " i] = map[i " " j] - map[j " " i] = map[i " " j] - map[i " " j] = map[i " " j] - ++code - } - - map["0 0"] = "1 1 1 1 1 1 1 1 1 1" - map["0 0"] = "1 1 1 1 1 1 1 1 1 1" -} - -($1 !~ /^#/) { - if (++line > 6) - for (i = 3; i 
<= NF; ++i) - $i = map[$i] - print -} - - diff --git a/software/LepMap3/scripts/loc2genotypes.awk b/software/LepMap3/scripts/loc2genotypes.awk deleted file mode 100644 index feca503..0000000 --- a/software/LepMap3/scripts/loc2genotypes.awk +++ /dev/null @@ -1,102 +0,0 @@ -#converts a loc file (JoinMap) to genotypes, that can be converted back post file -#should handle windows end-of-line characters as well -#awk -f locsingle.awk file.loc|awk -f loc2genotypes.awk|awk -f genotypes2post.awk |java -cp Lep-MAP3/bin ... data=- ... -BEGIN{ - map2["-"] = 0 - # - map2["l"] = 1 - map2["m"] = 2 - # - map2["n"] = 1 - map2["p"] = 2 - # - map2["h"] = 1 - map2["k"] = 2 - # - map2["e"] = 1 - map2["f"] = 2 - map2["g"] = 3 - # - map2["a"] = 1 - map2["b"] = 2 - map2["c"] = 3 - map2["d"] = 4 - for (i in map2) { - map[i] = map2[i] - map[toupper(i)] = map2[i] - } - delete map2 - markers[""] - markers[""] - markers[""] - markers[""] - markers[""] - markers[""] - markers[""] - markers[""] -} -($2 in markers) { - if ($NF == "\r") #windows end of line - --NF - ++line - start = 3 - if ($3 ~ /{.*}/ || $3 ~ /\(.*\)/) - start = 4 -# if (start == 3 && !($3 ~ /{.*}/) { -# print "Error: missing phase on data" >/dev/stderr -# exit(-1) -# } - - if (line == 1) { ## print pedigree - s = "CHR\tPOS" - for (i = start; i <= NF + 4; ++i) - s = s "\tF" - print s - s = "CHR\tPOS\tGP1\tGP2\tP1\tP2" - for (i = start; i <= NF; ++i) - s = s "\t" (i - start + 1) - print s - s = "CHR\tPOS\t0\t0\tGP1\tGP2" - for (i = start; i <= NF; ++i) - s = s "\tP1" - print s - s = "CHR\tPOS\t0\t0\t0\t0" - for (i = start; i <= NF; ++i) - s = s "\tP2" - print s - s = "CHR\tPOS\t1\t1\t1\t2" - for (i = start; i <= NF; ++i) - s = s "\t0" - print s - s = "CHR\tPOS" - for (i = start; i <= NF + 4; ++i) - s = s "\t0" - print s - } - s = $1 "\t" line - if (start == 4) { - if ($3 ~ /{0/) - s = s "\t" map[substr($2, 2, 1)] " " map[substr($2, 2, 1)] - else if ($3 ~ /{1/) - s = s "\t" map[substr($2, 3, 1)] " " map[substr($2, 3, 1)] - else - 
s = s "\t0 0" - - if ($3 ~ /{.0/) - s = s "\t" map[substr($2, 5, 1)] " " map[substr($2, 5, 1)] - else if ($3 ~ /{.1/) - s = s "\t" map[substr($2, 6, 1)] " " map[substr($2, 6, 1)] - else - s = s "\t0 0" - - } else - s = s "\t0 0\t0 0" - s = s "\t" map[substr($2, 2, 1)] " " map[substr($2, 3, 1)] "\t" map[substr($2, 5, 1)] " " map[substr($2, 6, 1)] - - for (i = start; i <= NF; ++i) - s = s "\t" map[substr($i, 1, 1)] " " map[substr($i, 2, 1)] - print s -} -END{ - -} diff --git a/software/LepMap3/scripts/locsingle.awk b/software/LepMap3/scripts/locsingle.awk deleted file mode 100644 index ac87d4f..0000000 --- a/software/LepMap3/scripts/locsingle.awk +++ /dev/null @@ -1,8 +0,0 @@ -#puts loc files on single line -{ - i = index($0, ";") - if (i > 0) - printf(substr($0, 1, i-1)) - else - print -} diff --git a/software/LepMap3/scripts/map2genotypes.awk b/software/LepMap3/scripts/map2genotypes.awk deleted file mode 100644 index 64cb5f3..0000000 --- a/software/LepMap3/scripts/map2genotypes.awk +++ /dev/null @@ -1,66 +0,0 @@ -#converts phased data to "genotypes" -#usage: -#java ... OrderMarkers2 ... outputPhasedData=1 > order_with_phase_software/LepMap3.txt -#awk [-vchr=X] [-vfullData=1] -f map2genotypes.awk order_with_phase_software/LepMap3.txt -#output columns marker name, chr, male postion, female postion, genotypes coded as "1 1", "1 2", "2 2" and 0 as missing -#providing fullData ouputs parents and pedigree... 
-BEGIN{ - map["00"]="1 1" - map["01"]="1 2" - map["10"]="2 1" - map["11"]="2 2" - map["0-"]="1 0" - map["-0"]="0 1" - map["-1"]="0 2" - map["1-"]="2 0" - map["--"]="0 0" - if (chr == "") - chr = 0 -} -(/^[^#]/){ - if (!notFirst && fullData){ - notFirst = 1 - s1 = "MARKER\tCHR\tMALE_POS\tFEMALE_POS" - s2 = "MARKER\tCHR\tMALE_POS\tFEMALE_POS" - s3 = "MARKER\tCHR\tMALE_POS\tFEMALE_POS" - s4 = "MARKER\tCHR\tMALE_POS\tFEMALE_POS" - s5 = "MARKER\tCHR\tMALE_POS\tFEMALE_POS" - s6 = "MARKER\tCHR\tMALE_POS\tFEMALE_POS" - for (i = 7; i<=NF; i+=3) { - n = length($i) / 2 - p1 = "P" (++numParents) - p2 = "P" (++numParents) - s1 = s1 "\t" p1 "x" p2 "\t" p1 "x" p2 - s2 = s2 "\t" p1 "\t" p2 - s3 = s3 "\t" 0 "\t" 0 - s4 = s4 "\t" 0 "\t" 0 - s5 = s5 "\t" 1 "\t" 2 - s6 = s6 "\t" 0 "\t" 0 - for (j = 1; j <= n; ++j) { - s1 = s1 "\t" p1 "x" p2 - s2 = s2 "\tC" (++numOffspring) - s3 = s3 "\t" p1 - s4 = s4 "\t" p2 - s5 = s5 "\t0" - s6 = s6 "\t0" - } - } - print s1 - print s2 - print s3 - print s4 - print s5 - print s6 - } - s = $1 "\t" chr "\t" $2 "\t" $3 - for (i = 7; i<=NF; i+=3) { - if (fullData) #parental data - s = s "\t1 2\t1 2" - n = length($i) / 2 - p1 = substr($i,1,n) - p2 = substr($i,n+1) - for (j = 1; j <= n; ++j) - s = s "\t" map[substr(p1, j, 1) substr(p2, j, 1)] - } - print s -} diff --git a/software/LepMap3/scripts/order2data.awk b/software/LepMap3/scripts/order2data.awk deleted file mode 100644 index 19e9e0f..0000000 --- a/software/LepMap3/scripts/order2data.awk +++ /dev/null @@ -1,65 +0,0 @@ -#script for marker binning... 
-BEGIN{ -#ACxAG=AA,AC,AG,CG - map["AA"] = "1 0 0 0 0 0 0"#00 - map["AC"] = "0 1 0 0 0 0 0"#01 - map["AG"] = "0 0 1 0 0 0 0"#10 - map["CG"] = "0 0 0 0 0 1 0"#11 - - if (chr == "") - chr = 1 -} -/^[^#]/{ - for (j = 7; j <= NF; ++j) - if ($j ~ /#$/) { - $j = substr($j, 1, length($j) - 1) - oldNF = j - break - } - if (oldNF == NF) - next - if (prev == "" && pedigree) { - s1 = "CHR\tPOS" - s2 = "CHR\tPOS" - s3 = "CHR\tPOS" - s4 = "CHR\tPOS" - s5 = "CHR\tPOS" - s6 = "CHR\tPOS" - f = 1 - nt = 0 - for (j = 7; j <= oldNF; j+=3) { - n = length($j) / 2 - s1 = s1 "\tF" f "\tF" f - s2 = s2 "\t" (nt + 1) "\t" (nt + 2) - s3 = s3 "\t0\t0" - s4 = s4 "\t0\t0" - s5 = s5 "\t1\t2" - s6 = s6 "\t0\t0" - for (i = 1; i <= n; ++i) { - s1 = s1 "\tF" f - s2 = s2 "\t" (nt + i + 2) - s3 = s3 "\t" (nt + 1) - s4 = s4 "\t" (nt + 2) - s5 = s5 "\t0" - s6 = s6 "\t0" - } - nt += n + 2 - ++f - } - print s1 "\n" s2 "\n" s3 "\n" s4 "\n" s5 "\n" s6 - } - - s = "" - nt = 0 - for (j = 7; j <= oldNF; j+=3) { - s = s "\t" map["AC"] "\t" map["AG"] - n = length($j) / 2 - for (i = oldNF + nt + 1; i <= oldNF + nt + 4 *n; i+=4) - s = s "\t" $i " " $(i+1) " " $(i+2) " 0 0 " $(i+3) " 0" - nt += 4 * n - } - if (prev != s || FILENAME != prevFN) - print $1 "\t" chr s - prev = s - prevFN = FILENAME -} diff --git a/software/LepMap3/scripts/phasematch.awk b/software/LepMap3/scripts/phasematch.awk deleted file mode 100644 index 100935a..0000000 --- a/software/LepMap3/scripts/phasematch.awk +++ /dev/null @@ -1,133 +0,0 @@ -#awk -f phasematch.awk order_reference.txt order_mapped.txt >order_mapped_in_reference_phase.txt -# -BEGIN{ -} -(NR==FNR && /^[^#]/){ - for (f = 7; f < NF; f+=3) - refdata[$1, f] = $f - if (numF != "" && numF != NF) { - print "Error: different number of columns in the input orders" > "/dev/stderr" - exit 1 - } - numF = NF -} - -(NR!=FNR){ - if (/^[#]/) - ; - else { - if (numF != NF) { - print "Error: different number of columns in the input orders" > "/dev/stderr" - exit 1 - } - } - data[FNR]=$0 -} - 
-END{ - for (i = 1; i <= FNR; ++i) { - $0 = data[i] - if (/^[#]/) - ; - else { - for (f = 7; f < NF; f+=3) { - if (($1 SUBSEP f) in refdata) { - ham1[f] += hamming1($f, refdata[$1, f]) - maxham1[f] += maxh - ham2[f] += hamming2($f, refdata[$1, f]) - maxham2[f] += maxh - } - } - } - - } -for (f = 7; f < NF; f+=3) { - print "***" > "/dev/stderr" - print "hamming distance1 is " ham1[f] " of " maxham1[f] " (" abs(ham1[f])/(maxham1[f]+0.000000000000000001) ") for family " ++family > "/dev/stderr" - print "hamming distance2 is " ham2[f] " of " maxham2[f] " (" abs(ham2[f])/(maxham2[f]+0.000000000000000001) ") for family " family > "/dev/stderr" - } - - for (i = 1; i <= FNR; ++i) { - $0 = data[i] - if (/^[#]/) - print - else { - for (f = 7; f < NF; f+=3) { - n = length($f) / 2 - p1 = substr($f, 1, n) - p2 = substr($f, n + 1) - if (ham1[f] < 0) - p1 = flip(p1) - if (ham2[f] < 0) - p2 = flip(p2) - $f = p1 p2 - } - - - s = $1 "\t" $2 "\t" $3 "\t" $4 " " $5 " " $6 - for (f = 7; f < NF; f+=3) - s = s "\t" $f " " $(f+1) " " $(f+2) - print s - } - - } -} -function abs(x) { - if (x < 0) - return -x - return x -} - -function flip(x) { - gsub(/0/, "x", x) - gsub(/1/, "0", x) - gsub(/x/, "1", x) - return x -} - -function hamming1(x, y ,i,xi,yi,ret, n) -{ - n = length(y) - if (length(x) < n) - n = length(x) -# print x " " y - ret = 0 - maxh = 0 - n = n / 2 - for (i = 1; i <= n; ++i) { - xi = substr(x, i, 1) - yi = substr(y, i, 1) - if (yi != "-" && xi != "-") { - ++maxh - if (xi == yi) { - ++ret - } - else - --ret - } - } - return ret -} -function hamming2(x, y ,i,xi,yi,ret, n) -{ - n = length(y) - if (length(x) < n) - n = length(x) -# print x " " y - ret = 0 - maxh = 0 - for (i = n / 2 + 1; i <= n; ++i) { - xi = substr(x, i, 1) - yi = substr(y, i, 1) - if (yi != "-" && xi != "-") { - ++maxh - if (xi == yi) { - ++ret - } - else - --ret - } - } - return ret -} - diff --git a/software/LepMap3/scripts/pileup2posterior.awk b/software/LepMap3/scripts/pileup2posterior.awk deleted file mode 
100644 index 84c6873..0000000 --- a/software/LepMap3/scripts/pileup2posterior.awk +++ /dev/null @@ -1,156 +0,0 @@ -#samtools mpileup -q 10 -Q 10 -s `cat sorted_bams`|awk -f pileupParser2.awk|awk -f pileup2posterior.awk -#needs mapping.txt -#Part of Lep-MAP3 -BEGIN{ - if (limit7 == "") # maximum quality of a read base - limit7 = 0.001 - - map["a"]=1 - map["A"]=1 - - map["c"]=2 - map["C"]=2 - - map["g"]=3 - map["G"]=3 - - map["t"]=4 - map["T"]=4 - - "cat mapping.txt"|getline - if (NF == 0) { - print "Error: file mapping.txt not found!" > "/dev/stderr" - exit - } - s = "CHR\tPOS" - - numIndividuals = 0 - for (i = 1; i <= NF; ++i) { - mapping[i] = $i - if (!($i in imapping)) - listOrder[++numIndividuals] = $i - ++imapping[$i] - } - print "Number of bams = " NF > "/dev/stderr" - print "Number of individuals = " numIndividuals > "/dev/stderr" - close("cat mapping.txt") - - - for (mi = 1; mi <= numIndividuals; ++mi) { - s = s "\t" listOrder[mi] - } - print s - FS="\t" - - #ascii characters mapping to their quality - for (i = 33; i <= 127; ++i) { - if (i == 33) # q == 0 - q = 0.75 # all bases equal probable - else if (i == 34) # q == 1 - q = 0.5 * (0.75 + 10 ^ (-0.15)) # One base must be more probable than others, e.g. 
p \in ]10^-0.15, 0.75[ - else - q = 10 ^ (-0.1 * (i - 33)) - quality[sprintf("%c", i)] = q - } - - for (i = 33; i <= 127; ++i) - for (j = 33; j <= 127; ++j) { - qs1 = sprintf("%c", i) - qs2 = sprintf("%c", j) - p = combineQ(qs1, qs2) - if (p < limit7) - p = limit7 - - logP[qs1 qs2] = log(p / 3) - logNP[qs1 qs2] = log(1 - p) - logNP2[qs1 qs2] = log(p / 3 + (1 - p)) + log(0.5) - } - - - logHalf = log(0.5) - logTwo = log(2.0) -} - -#function phred2p(q , p) -#{ -# p = ord[q] - 33 -# if (p == 0) -# return 0.75; -# if (p == 1) -# return 0.725; -# else -# return 10^(-0.1 * p) -#} - - -function combineQ(q1, q2 ,ret,p1,p2) -{ - p1 = quality[q1] - p2 = quality[q2] - ret = p1 + p2 - p1 * p2 - if (ret > 0.75) - ret = 0.75 - return ret -} - -function myexp(x) -{ - if (x < -100) - return 0 - else - return exp(x) -} - -{ - delete prob - for (i = 5; i <= NF; i+=4) - for (j = 1; j <= length($i); ++j) { - a = substr($i, j, 1) - if (a in map) { - am = map[a] - individual = mapping[int(i / 4)] - - #p = combineQ(substr($(i+1), j, 1), substr($(i+2), j, 1)) - #if (p < limit7) - # p = limit7 - #logP = log(p / 3) - #logNP = log(1 - p) - #logNP2 = log(p / 3 + (1 - p)) + logHalf - #faster by using a table... 
- qs = substr($(i+1), j, 1) substr($(i+2), j, 1) - - for (k = 1; k <= 4; ++k) - if (k == am) - prob[individual, k, k] += logNP[qs] - else - prob[individual, k, k] += logP[qs] - - for (k = 1; k < 4; ++k) - for (l = k + 1; l <= 4; ++l) - if (k == am || l == am) - prob[individual, k, l] += logNP2[qs] - else - prob[individual, k, l] += logP[qs] - } - } - s = $1 "\t" $2 - for (mi = 1; mi <= numIndividuals; ++mi) { - m = listOrder[mi] - - maxp = prob[m,1,1] + 0 - for (k = 1; k <= 4; ++k) - for (l = k; l <= 4; ++l) - if (prob[m, k, l] + 0 > maxp) - maxp = prob[m, k, l] + 0 - - for (k = 1; k <= 4; ++k) - for (l = k; l <= 4; ++l) { - if (k == 1 && l == 1) - s = s "\t" - else - s = s " " - s = s myexp(prob[m, k, l] - maxp) - } - } - print s -} diff --git a/software/LepMap3/scripts/pileupParser2.awk b/software/LepMap3/scripts/pileupParser2.awk deleted file mode 100644 index 4f382c0..0000000 --- a/software/LepMap3/scripts/pileupParser2.awk +++ /dev/null @@ -1,90 +0,0 @@ -#samtools mpileup -q 10 -Q 10 -s `cat sorted_bams`|awk -f pileupParser2.awk|awk -f pileup2posterior.awk -#needs mapping.txt -#Part of Lep-MAP3 -BEGIN{ - "cat mapping.txt"|getline - if (NF == 0) { - print "Error: file mapping.txt not found!" 
> "/dev/stderr" - exit - } - numIndividuals = 0 - for (i = 1; i <= NF; ++i) { - mapping[i] = $i - if (!($i in ind)) - ++numIndividuals - ind[$i] - } - close("cat mapping.txt") - - - if (limit1 == "") - limit1 = 3 # coverage per individual - if (limit2 == "") - limit2 = 0.3 * numIndividuals # number of allowed individuals with lower coverage - if (limit3 == "") - limit3 = 0.1 * numIndividuals # minimum allele coverage - if (limit4 == "") - limit4 = limit1 * (numIndividuals - limit2) # minimum total counts - - print "PileupParser2 parameters: limit1=" limit1 " limit2=" limit2 " limit3=" limit3 " limit4=" limit4 > "/dev/stderr" - - FS="\t" - OFS="\t" -} - -function abs(a) -{ - if (a < 0) - return -a - else - return a -} - -{ - sum = 0 - for (i = 4; i <= NF; i+=4) - sum += $i; - - if (sum < limit4) - next - - delete count - for (i = 4; i <= NF; i+=4) - count[mapping[int(i / 4)]]+=$i - - missing = 0 - for (ci in count) - if (count[ci] < limit1) - ++missing - - if (missing > limit2) - next - - delete c - for (i = 5; i <= NF; i+=4) { - if ($(i-1) == 0) - $i = "" - gsub(/\$/,"",$i) #remove end of reads - gsub(/\^./,"",$i) #remove quality - while (match($i, /[+-][1-9][0-9]*/) > 0) { #remove indels - $i = substr($i, 1, RSTART - 1) substr($i, RSTART + RLENGTH + substr($i, RSTART + 1, RLENGTH - 1)) - } - tmp = $i - c[0] += gsub(/[Aa]/, "", tmp) - c[1] += gsub(/[Cc]/, "", tmp) - c[2] += gsub(/[Gg]/, "", tmp) - c[3] += gsub(/[Tt]/, "", tmp) - } - alleles = 0 - if (c[0] >= limit3) - ++alleles; - if (c[1] >= limit3) - ++alleles; - if (c[2] >= limit3) - ++alleles; - if (c[3] >= limit3) - ++alleles; - - if (alleles >= 2) - print -}