Merge pull request #24 from NCI-CGR/development

resolve conflicting Makefile.config includes
NCI-CGR · Feb 3, 2021 · 75c63af · 75c63af
2 parents 99c3598 + fd525e6
commit 75c63af
Show file tree

Hide file tree

Showing 43 changed files with 224 additions and 172 deletions.
diff --git a/1KG_files/Makefile b/1KG_files/Makefile
@@ -1,4 +1,4 @@
-include $(PROJECT_BASE_DIR)/Makefile.config
+include $(MAKEFILE_CONFIG_LOCATION)
 
 .PHONY: all clean
 all: $(addsuffix $(TRACKING_SUCCESS_SUFFIX),$(KG_MANIFEST) $(patsubst %,$(KG_GENOTYPES_PREFIX)%$(KG_GENOTYPES_SUFFIX),$(CHRS)) $(patsubst %,$(KG_GENOTYPES_PREFIX)%$(KG_GENOTYPES_SUFFIX).tbi,$(CHRS)) $(patsubst %,$(KG_GENOTYPES_PREFIX)%$(KG_GENOTYPES_SUFFIX).md5sum,$(CHRS)) $(patsubst %,$(KG_GENOTYPES_PREFIX)%$(KG_GENOTYPES_SUFFIX).tbi.md5sum,$(CHRS)))

diff --git a/Makefile b/Makefile
@@ -2,6 +2,7 @@
 ## Primary entry point for PLCO analysis pipeline
 ## 
 include Makefile.config
+MAKEFILE_CONFIG_LOCATION := $(shell pwd)/Makefile.config
 export
 .SECONDEXPANSION:
 .PHONY: all $(SUPPORTED_METHODS) bgen meta metal meta-analysis metaanalysis cleaned-chips-by-ancestry ancestry relatedness ldsc 1KG_files fastgwa-grm ldscores plotting flat-dosages globus

diff --git a/ancestry/Makefile b/ancestry/Makefile
@@ -1,7 +1,7 @@
 ## Cameron Palmer, 21 May 2020
 ## Compute GRAF ancestry estimates for each chip
 
-include $(PROJECT_BASE_DIR)/Makefile.config
+include $(MAKEFILE_CONFIG_LOCATION)
 
 ## location of GRAF relatedness estimation output
 ## NB: this assumes the relatedness pipeline has already been run. Dependency controlled at top level directory.

diff --git a/bgen/Makefile b/bgen/Makefile
@@ -2,7 +2,7 @@
 ## convert post-imputation-QC MIS files to nonredundant subject bgen 1.2 for downstream analysis
 
 export
-include $(PROJECT_BASE_DIR)/Makefile.config
+include $(MAKEFILE_CONFIG_LOCATION)
 
 UNIQUE_PLATFORMS := $(subst _,/,$(filter-out Omni5,$(PLATFORMS)))
 

diff --git a/bgen/Makefile.bgen_format b/bgen/Makefile.bgen_format
@@ -2,7 +2,7 @@
 ## Shared pipeline for each platform in turn, pulled in through an include from a dummy makefile controller.
 ## Note that this means that this is executed from within the {CHIP} directory, and output paths don't include that
 ## Ancestry is stored in "PROJECT_CODE"
-include $(PROJECT_BASE_DIR)/Makefile.config
+include $(MAKEFILE_CONFIG_LOCATION)
 
 RESOLVED_SUBJECT_LIST := $(UNIQUE_SUBJECT_LIST)
 

diff --git a/cleaned-chips-by-ancestry/Makefile b/cleaned-chips-by-ancestry/Makefile
@@ -1,7 +1,7 @@
 ## Cameron Palmer, 21 May 2020
 ## Secondary entry point for chip cleaning; dispatches each ancestry separately
 
-include $(PROJECT_BASE_DIR)/Makefile.config
+include $(MAKEFILE_CONFIG_LOCATION)
 export
 STUDIES := $(sort $(foreach platform,$(PLATFORMS),$(firstword $(subst _, ,$(platform)))))
 

diff --git a/cleaned-chips-by-ancestry/Makefile.include b/cleaned-chips-by-ancestry/Makefile.include
@@ -3,7 +3,7 @@
 ## Note that this means that this is executed from within the {ANCESTRY} directory, and output paths don't include that
 ## Ancestry is stored in "POPULATION_NAME"
 
-include $(PROJECT_BASE_DIR)/Makefile.config
+include $(MAKEFILE_CONFIG_LOCATION)
 
 .PHONY: all all_chips
 .SECONDARY:

diff --git a/config/CA125.female.config.yaml b/config/CA125.female.config.yaml
diff --git a/config/CA125.male.config.yaml b/config/CA125.male.config.yaml
diff --git a/config/Makefile b/config/Makefile
@@ -2,7 +2,7 @@
 ## dispatch config check
 
 export
-include $(PROJECT_BASE_DIR)/Makefile.config
+include $(MAKEFILE_CONFIG_LOCATION)
 
 .DELETE_ON_ERROR:
 .SECONDEXPANSION:

diff --git a/docs/Advanced Dev Notes.rst b/docs/Advanced Dev Notes.rst
@@ -50,7 +50,7 @@ Integration with Clusters
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 
 ``make`` has no integrated support for cluster job submission. These pipelines have their calls wrapped
-in two simple utility functions which either dispatch jobs via a ``qsub`` interface (for cgems), or run
+in two simple utility functions which either dispatch jobs via a ``qsub`` interface (for cgems/ccad), or run
 the job in the main process but log the results with tracking files. This is a total mess. Other languages
 obviously offer actual support, so further development should make use of that. For extension of the current
 pipelines, support for ``slurm`` in particular needs to get patched into an appropriate interface/monitor program.
@@ -273,7 +273,7 @@ quit.
 Desync is annoying but again, the qsub monitoring software has a series of retries to attempt
 to allow for some amount of desync. The waiting times for this behavior are configurable, so
 if you have issues, you can make the monitor (controlled in a macro in ``Makefile.config``)
-wait longer or retry more times to adapt. As it's configured, I've not had any issues with cgems
+wait longer or retry more times to adapt. As it's configured, I've not had any issues with cgems/ccad
 in months.
 
 Zombie jobs are obnoxious because it's difficult to be certain when it's happening. I am aware

diff --git a/docs/Installation.rst b/docs/Installation.rst
@@ -23,10 +23,10 @@ Short Version (for experts)
 *  If needed, install git and git-lfs, and activate git-lfs
 *  Clone the `analysis pipeline repository`_
 *  Navigate into the repository directory
-*  Add the `CGR conda channel`_ to your **.condarc**
-*  Create the conda_ environments specified by **environment.yaml** and **environment-ldsc.yaml**
+*  Add the `CGR conda channel`_ to your ``.condarc``
+*  Create the conda_ environments specified by ``environment.yaml`` and ``environment-ldsc.yaml``
 *  Activate the environments (ldsc for ``ldsc`` and ``ldscores`` pipeline; the other for everything else)
-*  Update **Makefile.config** to point to your copies of the following:
+*  Update ``Makefile.config`` to point to your copies of the following:
 
    *  PLCO chip freeze
    *  PLCO imputed data freeze

diff --git a/docs/Methods Summary.rst b/docs/Methods Summary.rst
@@ -18,15 +18,15 @@ Relatedness Estimation
 Genotype data from the five PLCO platforms were updated to match
 the variant IDs present in the `graf`_ reference dataset ``G1000GpGeno``.
 Each chip dataset in turn was converted to `graf`_ fpg format and used
-to estimate within-platform subject relatedness with `graf -geno`.
+to estimate within-platform subject relatedness with ``graf -geno``.
 
 .. _graf: https://github.com/ncbi/graf
 
 Ancestry Estimation
 ~~~~~~~~~~~~~~~~~~~
 
 Genotype data from relatedness estimation were used to estimate
-subject ancestry with `graf -pop` and `graf`_ `PlotPopulations.pl`.
+subject ancestry with ``graf -pop`` and `graf`_ ``PlotPopulations.pl``.
 As ancestry estimation was conducted separately for each platform,
 several subjects with borderline ancestry calls had discordant ancestry
 calls between platforms. In these instances, the ancestry call was resolved
@@ -82,14 +82,17 @@ then the resulting pgen files were reformatted to `bgen v1.2`_ with ``plink2 --r
 .. _`plink 2`: https://www.cog-genomics.org/plink/2.0/
 
 
+Primary Analysis
+----------------
+
 Phenotype Modeling
 ~~~~~~~~~~~~~~~~~~
 
 Phenotype and covariate data from IMS v10, along with indicator variables reporting
 genotyping platform batch and ``Other Asian`` raw ancestry calls from `graf`_,
 were processed and formatted into model matrix files. Continuous traits were
-inverse normal transformed within ancestry group, stratified by sex. Categorical
-traits were processed into individual binary contrasts between a single reference
+inverse normal transformed within ancestry group, stratified by sex, with random resolution of ties.
+Categorical traits were processed into individual binary contrasts between a single reference
 group (category 0, with the largest number of subjects); any non-reference group
 with fewer than 10 subjects was combined into a single meta-group based on
 the PLCO analysis plan document guidelines. All categorical covariates were similarly
@@ -168,7 +171,7 @@ of 1000 Genomes subjects from each supercontinent versus the TOPMed 5b reference
 
 
 Meta-Analysis
-~~~~~~~~~~~~~
+-------------
 
 For each continuous and binary phenotype, platform subsets of the same `graf`_ ancestry group
 were meta-analyzed together with `metal`_ with heterogeneity analysis.
@@ -183,7 +186,7 @@ and should be replaced in future iterations of this analysis.
 
 
 LD Score Regression
-~~~~~~~~~~~~~~~~~~~
+-------------------
 
 Results files from each analysis were processed to contain
 signed summary statistics. These files were then processed with the `ldsc`_
@@ -200,3 +203,7 @@ helper script ``munge_sumstats.py`` using the following parameters:
 * ``--sumstats {filename}``
 * ``--p P``
 
+
+Finally, the resulting processed files were used to estimate LD score regression
+intercepts with the `ldsc`_ script ``ldsc.py`` against reference LD scores from the
+matched supercontinent.
diff --git a/docs/Preprocessing.rst b/docs/Preprocessing.rst
@@ -13,10 +13,10 @@ Relatedness
 ~~~~~~~~~~~
 
 Relatedness estimation is primarily required as an intermediate in the ancestry estimation process using graf_. The files are stored
-and can be used for other QC reasons of course; see **relatedness/PLCO_{chip_name}.relatedness.txt** for relevant output files in standard
+and can be used for other QC reasons of course; see ``relatedness/PLCO_{chip_name}.relatedness.txt`` for relevant output files in standard
 graf format.
 
-*  Usage: **make relatedness**
+*  Usage: ``make relatedness``
 *  Dependencies:
 
    *  chip freeze
@@ -38,11 +38,11 @@ graf format.
 .. warning::
 
    graf_ is mildly frustrating to use: it has some non-compliant behaviors. Notably, its exit codes are not standard,
-   so it doesn't exit `0` on success. Always check the output log from graf_ before proceeding! And note that the overall
-   `make` run will have some "exit code ignored" warnings due to this behavior.
+   so it doesn't exit ``0`` on success. Always check the output log from graf_ before proceeding! And note that the overall
+   ``make`` run will have some "exit code ignored" warnings due to this behavior.
 
    Additionally, graf_ complains when the output files it tries to create already exist. So, if you're running graf_ in an
-   existing directory, you will likely need to purge intermediates (or just kill the "relatedness/" directory and check it out again)
+   existing directory, you will likely need to purge intermediates (or just kill the ``relatedness/`` directory and check it out again)
    before rerunning. However, this is an early step that doesn't expect to be rerun frequently. It can definitely be patched to
    work better; or you can use the version in the upstream QC pipeline that's much better.
 
@@ -52,14 +52,14 @@ Ancestry Estimation
 ~~~~~~~~~~~~~~~~~~~
 
 Ancestry estimation is required for chip processing and various sanity checks. As with relatedness, this is computed with graf_. The files
-are stored and can be used for various QC purposes; see **ancestry/PLCO_{chip_name}.graf_estimates.txt**.
+are stored and can be used for various QC purposes; see ``ancestry/PLCO_{chip_name}.graf_estimates.txt``.
 
 Note that the final ancestry calls listed above are modified according to the consensus instructions of the "Atlas" analysis group.
-Subjects from the default "African" graf_ ancestry are merged into the "African American" label to more consistently represent
-the sampling distribution of the PLCO project. Subjects from the default "Other Asian or Pacific Islander" graf_ ancestry are merged
-into the "East Asian" label according to the instructions of collaborators.
+Subjects from the default ``African`` graf_ ancestry are merged into the ``African American`` label to more consistently represent
+the sampling distribution of the PLCO project. Subjects from the default ``Other Asian or Pacific Islander`` graf_ ancestry are merged
+into the ``East Asian`` label according to the instructions of collaborators.
 
-*  Usage: **make ancestry**
+*  Usage: ``make ancestry``
 *  Dependencies:
 
    *  `relatedness pipeline`_
@@ -106,7 +106,7 @@ used by the pipeline in its current form but were useful in the processing of th
 
 .. _`IBS/IBD estimates`: https://www.cog-genomics.org/plink/1.9/ibd
 
-*  Usage: **make cleaned-chips-by-ancestry**
+*  Usage: ``make cleaned-chips-by-ancestry``
 *  Dependencies:
 
    *  `ancestry pipeline`_
@@ -130,14 +130,14 @@ used by the pipeline in its current form but were useful in the processing of th
 .. topic:: Debugging
 
    This pipeline extensively uses plink_ for filtering and QC operations. plink_'s memory allocation is limited to 16G
-   in **Makefile.config**. That's a completely *ad hoc* bit of nonsense that may need to be changed depending on your
+   in ``Makefile.config``. That's a completely *ad hoc* bit of nonsense that may need to be changed depending on your
    individual project's parameters.
 
    The pipeline is designed to allow different combinations of platform/ancestry to not exist. That seems to work well,
    but some issues may pop up if plink_ finds something it doesn't like in a small dataset.
 
-   The IBS/IBD calculation with plink_ **--genome** is somewhat quirkly set up. For datasets above a fixed (configurable)
-   threshold of number of subjects, the IBS/IBD calculation is split into chunks with **--parallel** and then glued back
+   The IBS/IBD calculation with plink_ ``--genome`` is somewhat quirkily set up. For datasets above a fixed (configurable)
+   threshold of number of subjects, the IBS/IBD calculation is split into chunks with ``--parallel`` and then glued back
    together in a separate rule. These various thresholds were selected to make PLCO/GSA/Europeans run reasonably efficiently.
    For much larger chips, you may need to fiddle with the thresholds and number of quasiparallelized jobs to make things
    go ok.
@@ -174,7 +174,7 @@ and then used as fixed input for all association testing.
 
 .. _bolt-lmm: https://alkesgroup.broadinstitute.org/BOLT-LMM/BOLT-LMM_manual.html
 
-* Usage: **make bgen**
+* Usage: ``make bgen``
 * Dependencies:
 
   *  imputed data freeze
@@ -186,14 +186,14 @@ and then used as fixed input for all association testing.
      bgen_ input, then there's no reason to run this pipeline and waste a ton of hard drive space
   *  the bgen_ format in use here is v1.2, based on bolt-lmm_ documentation and other software support suggesting that
      that's the most efficient version accepted by all current tools. this may need to be changed in the future
-  *  the bgen_ reformatting process is conducted using plink_. the resultant ***.sample** files are slightly malformatted,
-     and so an additional step fixes the included **NA** values by setting them to **0**. this could well be changed in
+  *  the bgen_ reformatting process is conducted using plink_. the resultant ``*.sample`` files are slightly malformed,
+     and so an additional step fixes the included ``NA`` values by setting them to ``0``. this could well be changed in
      a future version depending on upstream behavior
 
 .. topic:: Debugging
 
    bgen_ support in conversion tools is pretty limited. I've ended up using plink_ for VCF->BGEN conversion in two steps,
-   even bearing in mind the apparent bug in output ***.sample** format files created with it. But I could very much see
+   even bearing in mind the apparent bug in output ``*.sample`` format files created with it. But I could very much see
    the possibility of needing a different adapter program in the future depending on one's needs and any format discrepancies
    I've not found, and it would have the benefit of potentially removing an extra rule/intermediate file from this pipeline.
 
@@ -208,15 +208,15 @@ by supercontinent.
 
 .. _`1000 Genomes Project`: https://www.internationalgenome.org
 
-* Usage: **make 1KG_files**
+* Usage: ``make 1KG_files``
 * Dependencies:
 
   * a functional internet connection
 
 * Assumptions:
 
   *  the 1000 Genomes files downloaded are frozen at a particular latest release according to the configuration information
-     in **Makefile.config**. that can obviously be changed if you want
+     in ``Makefile.config``. that can obviously be changed if you want
   *  most target installations should actually have some sort of copy of the 1000 Genomes data present already somewhere
      on their filesystem; however, this pipeline is not designed to support that as-is. it should be pretty easy to modify
      if you really want
@@ -237,7 +237,7 @@ covered European subjects, and more generalized data were/are needed.
 
 .. _ldsc: https://github.com/bulik/ldsc
 
-*  Usage: **make ldsc**
+*  Usage: ``make ldsc``
 
 *  Dependencies:
 
@@ -251,6 +251,6 @@ covered European subjects, and more generalized data were/are needed.
       some sort of weird versioning issue. regardless, this pipeline just hacks the result into submission. that results
       in some discrepancies from the stock reference files, but there's no indication of exactly which subjects/variants
       were used for those files, so that's not unexpected. basically: ymmv
-   *  default built-in files include an African American (**AFRAMR**) meta-group for appropriate subjects. note however
-      that "African American" as a human genetics group label is a very heterogeneous group, so there's no guarantee
+   *  default built-in files include an African American (``AFRAMR``) meta-group for appropriate subjects. note however
+      that ``African American`` as a human genetics group label is a very heterogeneous group, so there's no guarantee
       that this reference group will be appropriate for a given set of African American study subjects
diff --git a/docs/Trait Configuration.rst b/docs/Trait Configuration.rst
@@ -49,7 +49,9 @@ Mandatory Settings
 * ``algorithm``: at least one software tool that should be run for this trait. This is how
   linear or logistic or polytomous regression is selected for a trait, so be sure to choose
   correctly! As of v2.0.0, supported methods are: ``boltlmm`` (continuous traits), ``fastgwa``
-  (continuous traits), and ``saige`` (binary or categorical/ordinal traits)
+  (continuous traits), and ``saige`` (binary or categorical/ordinal traits); note that ``fastgwa``
+  analyses were removed from the primary "Atlas" analysis some months ago, and the pipeline has not
+  been exhaustively tested since
 
 
 .. note::