CGATOxford · sebastian-luna-valero · May 17, 2017 · May 16, 2017 · May 17, 2017
diff --git a/CGATPipelines/pipeline_annotations.py b/CGATPipelines/pipeline_annotations.py
@@ -61,7 +61,10 @@
     files (``filename_gtf``, filename_pep``, ``filename_cdna``)
 
 2 section ``[general]`` with the location of the indexed genomic
-    fasta files to use and the name of the genome (default=``hg19``),
+    fasta files to use and the name of the genome, as well as the
+    genome assembly report obtained from NCBI for mapping between
+    UCSC and ENSEMBL contigs. This can be obtained from:
+    https://www.ncbi.nlm.nih.gov/assembly
     see :doc:`../modules/IndexedFasta`.
 
 3 section ``[ucsc]`` with the name of the database to use (default=``hg19``).
@@ -670,7 +673,7 @@ def buildContigBed(infile, outfile):
     Parameters
     ----------
     infile : str
-      infiles is constructed from `PARAMS` variable to retrieve
+      infile is constructed from `PARAMS` variable to retrieve
       the `genome` :term:`fasta` file
 
     Returns
@@ -703,7 +706,7 @@ def buildUngappedContigBed(infile, outfiles):
     Parameters
     ----------
     infile: str
-      infiles is constructed from `PARAMS` variable to retrieve
+      infile is constructed from `PARAMS` variable to retrieve
       the `genome` :term:`fasta` file
 
     assembly_gaps_min_size: int
@@ -772,7 +775,7 @@ def buildGenomeInformation(infile, outfile):
     Parameters
     ----------
     infile: str
-      infiles is constructed from ``PARAMS`` variable to retrieve
+      infile is constructed from ``PARAMS`` variable to retrieve
       the ``genome`` :term:`fasta` file
 
     Returns
@@ -821,7 +824,7 @@ def buildGenomeGCSegmentation(infile, outfile):
     Parameters
     ----------
     infile: str
-      infiles is constructed from `PARAMS` variable to retrieve
+      infile is constructed from `PARAMS` variable to retrieve
       the ``genome`` :term:`fasta` file
 
     segmentation_window_size: int
@@ -870,7 +873,7 @@ def runGenomeGCProfile(infile, outfile):
     Parameters
     ----------
     infile: str
-      infiles is constructed from ``PARAMS`` variable to retrieve
+      infile is constructed from ``PARAMS`` variable to retrieve
       the ``genome`` :term:`fasta` file
 
     segmentation_min_length: int
@@ -953,7 +956,7 @@ def buildCpGBed(infile, outfile):
     Parameters
     ----------
     infile: str
-      infiles is constructed from `PARAMS` variable to retrieve
+      infile is constructed from `PARAMS` variable to retrieve
       the `genome` :term:`fasta` file
 
     Returns
@@ -985,8 +988,8 @@ def buildCpGBed(infile, outfile):
 # -----------------------------------------------------------------
 # ENSEMBL gene set
 @follows(mkdir('ensembl.dir'))
-@files(PARAMS["ensembl_filename_gtf"], PARAMS['interface_geneset_all_gtf'])
-def buildGeneSet(infile, outfile):
+@files((PARAMS["ensembl_filename_gtf"], PARAMS["general_assembly_report"]), PARAMS['interface_geneset_all_gtf'])
+def buildGeneSet(infiles, outfile):
     '''output sanitized ENSEMBL geneset.
 
     This method outputs an ENSEMBL gene set after some sanitizing steps:
@@ -999,21 +1002,22 @@ def buildGeneSet(infile, outfile):
 
     Arguments
     ---------
-    infile : string
+    infiles : tuple
        ENSEMBL geneset in :term:`gtf` format.
+       NCBI Assembly report in `txt` format.
     outfile : string
        geneset in :term:`gtf` format.
 
     '''
+    gtf_file, assembly_report = infiles
 
-    statement = ['''zcat %(infile)s
+    statement = ['''zcat %(gtf_file)s
     | grep 'transcript_id'
     | cgat gff2gff
     --method=sanitize
-    --sanitize-method=genome
+    --sanitize-method=ucsc
     --skip-missing
-    --genome-file=%(genome_dir)s/%(genome)s
-    --log=%(outfile)s.log
+    --assembly-report=%(assembly_report)s
     ''']
 
     if PARAMS["ensembl_remove_contigs"]:

diff --git a/CGATPipelines/pipeline_annotations/pipeline.ini b/CGATPipelines/pipeline_annotations/pipeline.ini
@@ -5,11 +5,19 @@
 ##########################################################
 [general]
 
-# location of indexed genome 
-genome=hg19
+# base name of indexed genome, e.g. "hg38", without suffix
+genome=
 
-# location of indexed genome 
-genome_dir=/ifs/mirror/genomes/plain
+# directory containing indexed genome files
+# e.g. /ifs/mirror/genomes/plain
+genome_dir=
+
+# path to NCBI assembly full sequence report
+# allows contig mapping from UCSC to ENSEMBL.
+# Download from https://www.ncbi.nlm.nih.gov/assembly.
+# Required. Contigs not in file will be forced to ucsc format
+# e.g. /ifs/mirror/assembly/GRCh38.p10_assembly_report.txt
+assembly_report=
 
 # scratchdir for data not to be backed up
 scratchdir=/tmp
@@ -24,29 +32,34 @@ name=csvdb
 #-----------------------------------------------------------------
 # information about ENSEMBL gene set. The ENSEMBL gene set is
 # imported from the dumps provided by ENSEMBL.
+# e.g. /ifs/mirror/ensembl/hg38/Homo_sapiens.GRCh38.87.gtf.gz
+# e.g. /ifs/mirror/ensembl/hg38/Homo_sapiens.GRCh38.87.pep.all.fa.gz
+# e.g. /ifs/mirror/ensembl/hg38/Homo_sapiens.GRCh38.87.cdna.all.fa.gz
 [ensembl]
-
-filename_gtf=/ifs/mirror/ensembl/hg19/Homo_sapiens.GRCh37.60.gtf.gz
-filename_pep=/ifs/mirror/ensembl/hg19/Homo_sapiens.GRCh37.60.pep.all.fa.gz
-filename_cdna=/ifs/mirror/ensembl/hg19/Homo_sapiens.GRCh37.60.cdna.all.fa.gz
+filename_gtf=
+filename_pep=
+filename_cdna=
 
 # which ensembl build number is this?  This has an affect on certain
 # annotation attributes presence/absence
-build=?!
+# e.g. 87
+build=
 
 # comma separated list of regular expressions for contigs
 # to be removed from ensembl_filename_gtf during genome
-# sanitization
+# sanitization. Use ensembl notation.
+# e.g _alt|chrUn
 remove_contigs=
 
 # biomart dataset to use
 
 # Set biomart host.
 # Go to www.ensemble.org to find correct ensemble version and month for archives and current release
 # To access specfic version, use hosts formats such as these below for archived and/or current releases:
-# may2009.archive.ensembl.org
-# jul2016.archive.ensembl.org
-biomart_host=jul2016.archive.ensembl.org
+# current: www.ensembl.org
+# Ensembl54: may2009.archive.ensembl.org
+# Ensembl85: jul2016.archive.ensembl.org
+biomart_host=
 
 # When accessing an archive server, use
 # biomart_mart=ENSEMBL_MART_ENSEMBL as default web service
@@ -57,7 +70,8 @@ biomart_mart=ENSEMBL_MART_ENSEMBL
 #     library(biomaRt)
 #     ensembl <- useMart("ensembl")
 #     listDatasets(ensembl)
-biomart_dataset=hsapiens_gene_ensembl
+# example: hsapiens_gene_ensembl
+biomart_dataset=
 
 [biomart]
 # name of biomart attributes to fetch
@@ -88,8 +102,8 @@ host=genome-mysql.cse.ucsc.edu
 # UCSC database user name
 user=genome
 
-# UCSC database name
-database=hg19
+# UCSC database name, e.g. hg38
+database=
 
 # repeats to collect as ',' separated list
 repeattypes=DNA,LINE,SINE,LTR,Transposon
@@ -144,7 +158,8 @@ host=ensembldb.ensembl.org
 # to get the database name try something like:
 # mysql --user anonymous --port 5306 --host ensembldb.ensembl.org -e "show databases;"
 #   | grep homo | less
-database=homo_sapiens_core_75_37
+# e.g. homo_sapiens_core_75_37
+database=
 
 # ensembl port
 port=5306
@@ -159,7 +174,7 @@ url_goslim=http://www.geneontology.org/ontology/subsets/goslim_generic.obo
 # at http://www.geneontology.org/GO.downloads.annotations.shtml
 # eg. for human: gene_association.goa_human.gz
 # and for mouse: gene_association.mgi.gz NOT goa_mouse which is 7 years out of date!
-geneontology_file=must_specify
+geneontology_file=
 
 #----------------------------------------------------------
 [mapability]
@@ -214,7 +229,8 @@ mart=ensembl
 #   library(biomaRt)
 #   ensembl <- useMart(ensembl)
 #   listDatasets(ensembl)
-dataset=hsapiens_gene_ensembl
+# e.g. hsapiens_gene_ensembl
+dataset=
 
 # host
 # Default host is www.biomart.org
@@ -242,8 +258,6 @@ max_indel_length=
 # Best not to change
 [interface]
 # database
-
-
 ##=======================================
 ## Assembly derived annotations