Skip to content

Commit

Permalink
Merge pull request #36 from bigbio/dev
Browse files (browse the repository at this point in the history)
Clean code style
  • Loading branch information
ypriverol authored Feb 11, 2021
2 parents 7c68b20 + 87047ae commit 895cef2
Show file tree
Hide file tree
Showing 16 changed files with 1,537 additions and 1,510 deletions.
805 changes: 404 additions & 401 deletions pypgatk/cgenomes/cgenomes_proteindb.py

Large diffs are not rendered by default.

3 changes: 1 addition & 2 deletions pypgatk/cgenomes/models.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@


class SNP(object):

def __init__(self, gene=None, mrna=None, dna_mut=None, aa_mut=None, type=None):
Expand All @@ -15,4 +14,4 @@ def __init__(self, gene=None, mrna=None, dna_mut=None, aa_mut=None, type=None):
self.mrna = mrna
self.aa_mut = aa_mut
self.type = type
self.dna_mut = dna_mut
self.dna_mut = dna_mut
36 changes: 17 additions & 19 deletions pypgatk/commands/dnaseq_to_proteindb.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,14 @@

@click.command("dnaseq-to-proteindb", short_help="Generate peptides based on DNA sequences")
@click.option('-c', '--config_file', help='Configuration to perform conversion between ENSEMBL Files',
default= this_dir + '/../config/ensembl_config.yaml')
default=this_dir + '/../config/ensembl_config.yaml')
@click.option('--input_fasta', help='Path to sequences fasta')
@click.option('--translation_table', default=1, type=int, help='Translation Table (default 1)')
@click.option('--num_orfs', default=3, type=int, help='Number of ORFs (default 0)')
@click.option('--num_orfs_complement', default=0, type=int,
help='Number of ORFs from the reverse side (default 0)')
@click.option('--output_proteindb', default="peptide-database.fa", help="Output file name, exits if already exists")
@click.option('-p', '--var_prefix', default="", help="String to add before the variant peptides")

@click.option('--skip_including_all_cds',
help="By default any transcript that has a defined CDS will be translated, this option disables this features instead it only depends on the biotypes",
is_flag=True)
Expand All @@ -35,20 +34,19 @@ def dnaseq_to_proteindb(ctx, config_file, input_fasta, translation_table, num_or
output_proteindb, var_prefix,
skip_including_all_cds, include_biotypes, exclude_biotypes, biotype_str, expression_str,
expression_thresh):
if input_fasta is None:
print_help()

pipeline_arguments = {EnsemblDataService.TRANSLATION_TABLE: translation_table,
EnsemblDataService.PROTEIN_DB_OUTPUT: output_proteindb,
EnsemblDataService.HEADER_VAR_PREFIX: var_prefix,
EnsemblDataService.EXCLUDE_BIOTYPES: exclude_biotypes,
EnsemblDataService.SKIP_INCLUDING_ALL_CDS: skip_including_all_cds,
EnsemblDataService.INCLUDE_BIOTYPES: include_biotypes,
EnsemblDataService.BIOTYPE_STR: biotype_str, EnsemblDataService.NUM_ORFS: num_orfs,
EnsemblDataService.NUM_ORFS_COMPLEMENT: num_orfs_complement,
EnsemblDataService.EXPRESSION_STR: expression_str,
EnsemblDataService.EXPRESSION_THRESH: expression_thresh}

ensembl_data_service = EnsemblDataService(config_file, pipeline_arguments)
ensembl_data_service.dnaseq_to_proteindb(input_fasta)

if input_fasta is None:
print_help()

pipeline_arguments = {EnsemblDataService.TRANSLATION_TABLE: translation_table,
EnsemblDataService.PROTEIN_DB_OUTPUT: output_proteindb,
EnsemblDataService.HEADER_VAR_PREFIX: var_prefix,
EnsemblDataService.EXCLUDE_BIOTYPES: exclude_biotypes,
EnsemblDataService.SKIP_INCLUDING_ALL_CDS: skip_including_all_cds,
EnsemblDataService.INCLUDE_BIOTYPES: include_biotypes,
EnsemblDataService.BIOTYPE_STR: biotype_str, EnsemblDataService.NUM_ORFS: num_orfs,
EnsemblDataService.NUM_ORFS_COMPLEMENT: num_orfs_complement,
EnsemblDataService.EXPRESSION_STR: expression_str,
EnsemblDataService.EXPRESSION_THRESH: expression_thresh}

ensembl_data_service = EnsemblDataService(config_file, pipeline_arguments)
ensembl_data_service.dnaseq_to_proteindb(input_fasta)
27 changes: 15 additions & 12 deletions pypgatk/commands/ensembl_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,23 @@

this_dir, this_filename = os.path.split(__file__)


@click.command('ensembl-check', short_help="Command to check ensembl database for stop codons, gaps")
@click.option('-c' ,'--config_file',
@click.option('-c', '--config_file',
help='Configuration to perform Ensembl database check',
default= this_dir + '/../config/ensembl_config.yaml')
@click.option('-in' , '--input_fasta', help='input_fasta file to perform the translation')
@click.option('-out' , '--output', help='Output File', default="peptide-database.fa")
@click.option('-adds','--add_stop_codons', help='If a stop codons is found, add a new protein with suffix (_Codon_{num})', is_flag=True)
@click.option('-aa' , '--num_aa', help='Minimun number of aminoacids for a protein to be included in the database', default = 6)
default=this_dir + '/../config/ensembl_config.yaml')
@click.option('-in', '--input_fasta', help='input_fasta file to perform the translation')
@click.option('-out', '--output', help='Output File', default="peptide-database.fa")
@click.option('-adds', '--add_stop_codons',
help='If a stop codons is found, add a new protein with suffix (_Codon_{num})', is_flag=True)
@click.option('-aa', '--num_aa', help='Minimun number of aminoacids for a protein to be included in the database',
default=6)
@click.pass_context
def ensembl_check(ctx, config_file, input_fasta,output, add_stop_codons, num_aa):
if input_fasta is None:
print_help()
def ensembl_check(ctx, config_file, input_fasta, output, add_stop_codons, num_aa):
if input_fasta is None:
print_help()

pipeline_arguments = {EnsemblDataService.PROTEIN_DB_OUTPUT: output}
pipeline_arguments = {EnsemblDataService.PROTEIN_DB_OUTPUT: output}

ensembl_data_service = EnsemblDataService(config_file, pipeline_arguments)
ensembl_data_service.check_proteindb(input_fasta, add_stop_codons, num_aa)
ensembl_data_service = EnsemblDataService(config_file, pipeline_arguments)
ensembl_data_service.check_proteindb(input_fasta, add_stop_codons, num_aa)
114 changes: 58 additions & 56 deletions pypgatk/commands/ensembl_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

this_dir, this_filename = os.path.split(__file__)


@click.command('ensembl-downloader', short_help='Command to download the ensembl information')
@click.option('-c', '--config_file',
help='Configuration file for the ensembl data downloader pipeline',
Expand All @@ -16,77 +17,78 @@
help='Output directory for the peptide databases',
default="./database_ensembl/")
@click.option('-fp', '--folder_prefix_release',
help='Output folder prefix to download the data',
help='Output folder prefix to download the data',
default='release-')
@click.option('-t', '--taxonomy',
help='Taxonomy identifiers (comma separated list can be given) that will be use to download the data from Ensembl',
default='')
@click.option('-l', '--list_taxonomies',
help='Print the list of all the taxonomies in ENSEMBL (https://www.ensembl.org)', is_flag=True, default=False)
@click.option('-sg', '--skip_gtf', help="Skip the gtf file during the download", is_flag=True)
help='Print the list of all the taxonomies in ENSEMBL (https://www.ensembl.org)', is_flag=True,
default=False)
@click.option('-sg', '--skip_gtf', help="Skip the gtf file during the download", is_flag=True)
@click.option('-sp', '--skip_protein', help="Skip the protein fasta file during download", is_flag=True)
@click.option('-sc', '--skip_cds', help='Skip the CDS file download', is_flag=True)
@click.option('-sd', '--skip_cdna', help='Skip the cDNA file download', is_flag=True)
@click.option('-sn', '--skip_ncrna', help='Skip the ncRNA file download', is_flag=True)
@click.option('-sv', '--skip_vcf', help='Skip the VCF variant file', is_flag=True)
@click.option('-en', '--ensembl_name', help='Ensembl name code to download, it can be use instead of taxonomy (e.g. homo_sapiens)', default='')
@click.option('-en', '--ensembl_name',
help='Ensembl name code to download, it can be use instead of taxonomy (e.g. homo_sapiens)', default='')
@click.option('--grch37', help='Download a previous version GRCh37 of ensembl genomes', is_flag=True)
def ensembl_downloader(config_file, output_directory, folder_prefix_release, taxonomy, list_taxonomies,
skip_gtf, skip_protein, skip_cds, skip_cdna, skip_ncrna, skip_vcf, ensembl_name, grch37 = False):
""" This tool enables to download from enseml ftp the FASTA and GTF files"""

if config_file is None:
msg = "The config file for the pipeline is missing, please provide one "
logging.error(msg)
raise AppConfigException(msg)
skip_gtf, skip_protein, skip_cds, skip_cdna, skip_ncrna, skip_vcf, ensembl_name, grch37=False):
""" This tool enables to download from enseml ftp the FASTA and GTF files"""

# Parse pipelines parameters.
pipeline_arguments = {}
if output_directory is not None:
pipeline_arguments[EnsemblDataDownloadService.CONFIG_OUTPUT_DIRECTORY] = output_directory
if folder_prefix_release is not None:
pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_FOLDER_PREFIX_RELEASE] = folder_prefix_release
if taxonomy is not None:
pipeline_arguments[EnsemblDataDownloadService.CONFIG_TAXONOMY] = taxonomy
if ensembl_name is not None:
pipeline_arguments[EnsemblDataDownloadService.CONFIG_ENSEMBL_NAME] = ensembl_name
if list_taxonomies:
pipeline_arguments[EnsemblDataDownloadService.CONFIG_LIST_TAXONOMIES] = list_taxonomies
if skip_protein is not None and skip_protein:
pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_PROTEIN] = True
else:
pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_PROTEIN] = False
if skip_gtf is not None and skip_gtf:
pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_GTF] = True
else:
pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_GTF] = False
if skip_cds is not None and skip_cds:
pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_CDS] = True
else:
pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_CDS] = False
if skip_ncrna is not None and skip_ncrna:
pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_NCRNA] = True
else:
pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_NCRNA] = False
if skip_cdna is not None and skip_cdna:
pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_CDNA] = True
else:
pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_CDNA] = False
if skip_vcf is not None and skip_vcf:
pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_VCF] = True
else:
pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_VCF] = False
if config_file is None:
msg = "The config file for the pipeline is missing, please provide one "
logging.error(msg)
raise AppConfigException(msg)

ensembl_download_service = EnsemblDataDownloadService(config_file, pipeline_arguments)
# Parse pipelines parameters.
pipeline_arguments = {}
if output_directory is not None:
pipeline_arguments[EnsemblDataDownloadService.CONFIG_OUTPUT_DIRECTORY] = output_directory
if folder_prefix_release is not None:
pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_FOLDER_PREFIX_RELEASE] = folder_prefix_release
if taxonomy is not None:
pipeline_arguments[EnsemblDataDownloadService.CONFIG_TAXONOMY] = taxonomy
if ensembl_name is not None:
pipeline_arguments[EnsemblDataDownloadService.CONFIG_ENSEMBL_NAME] = ensembl_name
if list_taxonomies:
pipeline_arguments[EnsemblDataDownloadService.CONFIG_LIST_TAXONOMIES] = list_taxonomies
if skip_protein is not None and skip_protein:
pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_PROTEIN] = True
else:
pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_PROTEIN] = False
if skip_gtf is not None and skip_gtf:
pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_GTF] = True
else:
pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_GTF] = False
if skip_cds is not None and skip_cds:
pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_CDS] = True
else:
pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_CDS] = False
if skip_ncrna is not None and skip_ncrna:
pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_NCRNA] = True
else:
pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_NCRNA] = False
if skip_cdna is not None and skip_cdna:
pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_CDNA] = True
else:
pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_CDNA] = False
if skip_vcf is not None and skip_vcf:
pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_VCF] = True
else:
pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_VCF] = False

ensembl_download_service = EnsemblDataDownloadService(config_file, pipeline_arguments)

logger = ensembl_download_service.get_logger_for("Main Pipeline Ensembl Downloader")
logger.info("Pipeline STARTING ... ")
if list_taxonomies:
list_of_taxonomies = ensembl_download_service.get_species_from_rest()
for taxonomy_info in list_of_taxonomies:
print(taxonomy_info)
logger = ensembl_download_service.get_logger_for("Main Pipeline Ensembl Downloader")
logger.info("Pipeline STARTING ... ")
if list_taxonomies:
list_of_taxonomies = ensembl_download_service.get_species_from_rest()
for taxonomy_info in list_of_taxonomies:
print(taxonomy_info)

ensembl_download_service.download_database_by_species(grch37)
ensembl_download_service.download_database_by_species(grch37)

logger.info("Pipeline Finish !!!")
logger.info("Pipeline Finish !!!")
Loading

0 comments on commit 895cef2

Please sign in to comment.