Merge pull request #36 from bigbio/dev

Clean code style
bigbio · Feb 11, 2021 · 895cef2
2 parents 7c68b20 + 87047ae
commit 895cef2
Show file tree

Hide file tree

Showing 16 changed files with 1,537 additions and 1,510 deletions.
diff --git a/pypgatk/cgenomes/cgenomes_proteindb.py b/pypgatk/cgenomes/cgenomes_proteindb.py
diff --git a/pypgatk/cgenomes/models.py b/pypgatk/cgenomes/models.py
@@ -1,5 +1,4 @@
 
-
 class SNP(object):
 
     def __init__(self, gene=None, mrna=None, dna_mut=None, aa_mut=None, type=None):
@@ -15,4 +14,4 @@ def __init__(self, gene=None, mrna=None, dna_mut=None, aa_mut=None, type=None):
         self.mrna = mrna
         self.aa_mut = aa_mut
         self.type = type
-        self.dna_mut = dna_mut
+        self.dna_mut = dna_mut
diff --git a/pypgatk/commands/dnaseq_to_proteindb.py b/pypgatk/commands/dnaseq_to_proteindb.py
@@ -10,15 +10,14 @@
 
 @click.command("dnaseq-to-proteindb", short_help="Generate peptides based on DNA sequences")
 @click.option('-c', '--config_file', help='Configuration to perform conversion between ENSEMBL Files',
-              default= this_dir + '/../config/ensembl_config.yaml')
+              default=this_dir + '/../config/ensembl_config.yaml')
 @click.option('--input_fasta', help='Path to sequences fasta')
 @click.option('--translation_table', default=1, type=int, help='Translation Table (default 1)')
 @click.option('--num_orfs', default=3, type=int, help='Number of ORFs (default 0)')
 @click.option('--num_orfs_complement', default=0, type=int,
               help='Number of ORFs from the reverse side (default 0)')
 @click.option('--output_proteindb', default="peptide-database.fa", help="Output file name, exits if already exists")
 @click.option('-p', '--var_prefix', default="", help="String to add before the variant peptides")
-
 @click.option('--skip_including_all_cds',
               help="By default any transcript that has a defined CDS will be translated, this option disables this features instead it only depends on the biotypes",
               is_flag=True)
@@ -35,20 +34,19 @@ def dnaseq_to_proteindb(ctx, config_file, input_fasta, translation_table, num_or
                         output_proteindb, var_prefix,
                         skip_including_all_cds, include_biotypes, exclude_biotypes, biotype_str, expression_str,
                         expression_thresh):
-    if input_fasta is None:
-        print_help()
-
-    pipeline_arguments = {EnsemblDataService.TRANSLATION_TABLE: translation_table,
-                          EnsemblDataService.PROTEIN_DB_OUTPUT: output_proteindb,
-                          EnsemblDataService.HEADER_VAR_PREFIX: var_prefix,
-                          EnsemblDataService.EXCLUDE_BIOTYPES: exclude_biotypes,
-                          EnsemblDataService.SKIP_INCLUDING_ALL_CDS: skip_including_all_cds,
-                          EnsemblDataService.INCLUDE_BIOTYPES: include_biotypes,
-                          EnsemblDataService.BIOTYPE_STR: biotype_str, EnsemblDataService.NUM_ORFS: num_orfs,
-                          EnsemblDataService.NUM_ORFS_COMPLEMENT: num_orfs_complement,
-                          EnsemblDataService.EXPRESSION_STR: expression_str,
-                          EnsemblDataService.EXPRESSION_THRESH: expression_thresh}
-
-    ensembl_data_service = EnsemblDataService(config_file, pipeline_arguments)
-    ensembl_data_service.dnaseq_to_proteindb(input_fasta)
-
+  if input_fasta is None:
+    print_help()
+
+  pipeline_arguments = {EnsemblDataService.TRANSLATION_TABLE: translation_table,
+                        EnsemblDataService.PROTEIN_DB_OUTPUT: output_proteindb,
+                        EnsemblDataService.HEADER_VAR_PREFIX: var_prefix,
+                        EnsemblDataService.EXCLUDE_BIOTYPES: exclude_biotypes,
+                        EnsemblDataService.SKIP_INCLUDING_ALL_CDS: skip_including_all_cds,
+                        EnsemblDataService.INCLUDE_BIOTYPES: include_biotypes,
+                        EnsemblDataService.BIOTYPE_STR: biotype_str, EnsemblDataService.NUM_ORFS: num_orfs,
+                        EnsemblDataService.NUM_ORFS_COMPLEMENT: num_orfs_complement,
+                        EnsemblDataService.EXPRESSION_STR: expression_str,
+                        EnsemblDataService.EXPRESSION_THRESH: expression_thresh}
+
+  ensembl_data_service = EnsemblDataService(config_file, pipeline_arguments)
+  ensembl_data_service.dnaseq_to_proteindb(input_fasta)
diff --git a/pypgatk/commands/ensembl_database.py b/pypgatk/commands/ensembl_database.py
@@ -8,20 +8,23 @@
 
 this_dir, this_filename = os.path.split(__file__)
 
+
 @click.command('ensembl-check', short_help="Command to check ensembl database for stop codons, gaps")
-@click.option('-c'  ,'--config_file',
+@click.option('-c', '--config_file',
               help='Configuration to perform Ensembl database check',
-              default= this_dir + '/../config/ensembl_config.yaml')
-@click.option('-in' , '--input_fasta', help='input_fasta file to perform the translation')
-@click.option('-out' , '--output', help='Output File', default="peptide-database.fa")
-@click.option('-adds','--add_stop_codons', help='If a stop codons is found, add a new protein with suffix (_Codon_{num})', is_flag=True)
-@click.option('-aa'  , '--num_aa', help='Minimun number of aminoacids for a protein to be included in the database', default = 6)
+              default=this_dir + '/../config/ensembl_config.yaml')
+@click.option('-in', '--input_fasta', help='input_fasta file to perform the translation')
+@click.option('-out', '--output', help='Output File', default="peptide-database.fa")
+@click.option('-adds', '--add_stop_codons',
+              help='If a stop codons is found, add a new protein with suffix (_Codon_{num})', is_flag=True)
+@click.option('-aa', '--num_aa', help='Minimun number of aminoacids for a protein to be included in the database',
+              default=6)
 @click.pass_context
-def ensembl_check(ctx, config_file, input_fasta,output, add_stop_codons, num_aa):
-    if input_fasta is None:
-        print_help()
+def ensembl_check(ctx, config_file, input_fasta, output, add_stop_codons, num_aa):
+  if input_fasta is None:
+    print_help()
 
-    pipeline_arguments = {EnsemblDataService.PROTEIN_DB_OUTPUT: output}
+  pipeline_arguments = {EnsemblDataService.PROTEIN_DB_OUTPUT: output}
 
-    ensembl_data_service = EnsemblDataService(config_file, pipeline_arguments)
-    ensembl_data_service.check_proteindb(input_fasta, add_stop_codons, num_aa)
+  ensembl_data_service = EnsemblDataService(config_file, pipeline_arguments)
+  ensembl_data_service.check_proteindb(input_fasta, add_stop_codons, num_aa)
diff --git a/pypgatk/commands/ensembl_downloader.py b/pypgatk/commands/ensembl_downloader.py
@@ -8,6 +8,7 @@
 
 this_dir, this_filename = os.path.split(__file__)
 
+
 @click.command('ensembl-downloader', short_help='Command to download the ensembl information')
 @click.option('-c', '--config_file',
               help='Configuration file for the ensembl data downloader pipeline',
@@ -16,77 +17,78 @@
               help='Output directory for the peptide databases',
               default="./database_ensembl/")
 @click.option('-fp', '--folder_prefix_release',
-               help='Output folder prefix to download the data',
+              help='Output folder prefix to download the data',
               default='release-')
 @click.option('-t', '--taxonomy',
               help='Taxonomy identifiers (comma separated list can be given) that will be use to download the data from Ensembl',
               default='')
 @click.option('-l', '--list_taxonomies',
-              help='Print the list of all the taxonomies in ENSEMBL (https://www.ensembl.org)', is_flag=True, default=False)
-@click.option('-sg', '--skip_gtf',  help="Skip the gtf file during the download", is_flag=True)
+              help='Print the list of all the taxonomies in ENSEMBL (https://www.ensembl.org)', is_flag=True,
+              default=False)
+@click.option('-sg', '--skip_gtf', help="Skip the gtf file during the download", is_flag=True)
 @click.option('-sp', '--skip_protein', help="Skip the protein fasta file during download", is_flag=True)
 @click.option('-sc', '--skip_cds', help='Skip the CDS file download', is_flag=True)
 @click.option('-sd', '--skip_cdna', help='Skip the cDNA file download', is_flag=True)
 @click.option('-sn', '--skip_ncrna', help='Skip the ncRNA file download', is_flag=True)
 @click.option('-sv', '--skip_vcf', help='Skip the VCF variant file', is_flag=True)
-@click.option('-en', '--ensembl_name', help='Ensembl name code to download, it can be use instead of taxonomy (e.g. homo_sapiens)', default='')
+@click.option('-en', '--ensembl_name',
+              help='Ensembl name code to download, it can be use instead of taxonomy (e.g. homo_sapiens)', default='')
 @click.option('--grch37', help='Download a previous version GRCh37 of ensembl genomes', is_flag=True)
 def ensembl_downloader(config_file, output_directory, folder_prefix_release, taxonomy, list_taxonomies,
-                       skip_gtf, skip_protein, skip_cds, skip_cdna, skip_ncrna, skip_vcf, ensembl_name, grch37 = False):
-    """ This tool enables to download from enseml ftp the FASTA and GTF files"""
-
-    if config_file is None:
-        msg = "The config file for the pipeline is missing, please provide one "
-        logging.error(msg)
-        raise AppConfigException(msg)
+                       skip_gtf, skip_protein, skip_cds, skip_cdna, skip_ncrna, skip_vcf, ensembl_name, grch37=False):
+  """ This tool enables to download from enseml ftp the FASTA and GTF files"""
 
-    # Parse pipelines parameters.
-    pipeline_arguments = {}
-    if output_directory is not None:
-        pipeline_arguments[EnsemblDataDownloadService.CONFIG_OUTPUT_DIRECTORY] = output_directory
-    if folder_prefix_release is not None:
-        pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_FOLDER_PREFIX_RELEASE] = folder_prefix_release
-    if taxonomy is not None:
-        pipeline_arguments[EnsemblDataDownloadService.CONFIG_TAXONOMY] = taxonomy
-    if ensembl_name is not None:
-        pipeline_arguments[EnsemblDataDownloadService.CONFIG_ENSEMBL_NAME] = ensembl_name
-    if list_taxonomies:
-        pipeline_arguments[EnsemblDataDownloadService.CONFIG_LIST_TAXONOMIES] = list_taxonomies
-    if skip_protein is not None and skip_protein:
-        pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_PROTEIN] = True
-    else:
-        pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_PROTEIN] = False
-    if skip_gtf is not None and skip_gtf:
-        pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_GTF] = True
-    else:
-        pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_GTF] = False
-    if skip_cds is not None and skip_cds:
-        pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_CDS] = True
-    else:
-        pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_CDS] = False
-    if skip_ncrna is not None and skip_ncrna:
-        pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_NCRNA] = True
-    else:
-        pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_NCRNA] = False
-    if skip_cdna is not None and skip_cdna:
-        pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_CDNA] = True
-    else:
-        pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_CDNA] = False
-    if skip_vcf is not None and skip_vcf:
-        pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_VCF] = True
-    else:
-        pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_VCF] = False
+  if config_file is None:
+    msg = "The config file for the pipeline is missing, please provide one "
+    logging.error(msg)
+    raise AppConfigException(msg)
 
-    ensembl_download_service = EnsemblDataDownloadService(config_file, pipeline_arguments)
+  # Parse pipelines parameters.
+  pipeline_arguments = {}
+  if output_directory is not None:
+    pipeline_arguments[EnsemblDataDownloadService.CONFIG_OUTPUT_DIRECTORY] = output_directory
+  if folder_prefix_release is not None:
+    pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_FOLDER_PREFIX_RELEASE] = folder_prefix_release
+  if taxonomy is not None:
+    pipeline_arguments[EnsemblDataDownloadService.CONFIG_TAXONOMY] = taxonomy
+  if ensembl_name is not None:
+    pipeline_arguments[EnsemblDataDownloadService.CONFIG_ENSEMBL_NAME] = ensembl_name
+  if list_taxonomies:
+    pipeline_arguments[EnsemblDataDownloadService.CONFIG_LIST_TAXONOMIES] = list_taxonomies
+  if skip_protein is not None and skip_protein:
+    pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_PROTEIN] = True
+  else:
+    pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_PROTEIN] = False
+  if skip_gtf is not None and skip_gtf:
+    pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_GTF] = True
+  else:
+    pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_GTF] = False
+  if skip_cds is not None and skip_cds:
+    pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_CDS] = True
+  else:
+    pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_CDS] = False
+  if skip_ncrna is not None and skip_ncrna:
+    pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_NCRNA] = True
+  else:
+    pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_NCRNA] = False
+  if skip_cdna is not None and skip_cdna:
+    pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_CDNA] = True
+  else:
+    pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_CDNA] = False
+  if skip_vcf is not None and skip_vcf:
+    pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_VCF] = True
+  else:
+    pipeline_arguments[EnsemblDataDownloadService.CONFIG_KEY_SKIP_VCF] = False
 
+  ensembl_download_service = EnsemblDataDownloadService(config_file, pipeline_arguments)
 
-    logger = ensembl_download_service.get_logger_for("Main Pipeline Ensembl Downloader")
-    logger.info("Pipeline STARTING ... ")
-    if list_taxonomies:
-        list_of_taxonomies = ensembl_download_service.get_species_from_rest()
-        for taxonomy_info in list_of_taxonomies:
-            print(taxonomy_info)
+  logger = ensembl_download_service.get_logger_for("Main Pipeline Ensembl Downloader")
+  logger.info("Pipeline STARTING ... ")
+  if list_taxonomies:
+    list_of_taxonomies = ensembl_download_service.get_species_from_rest()
+    for taxonomy_info in list_of_taxonomies:
+      print(taxonomy_info)
 
-    ensembl_download_service.download_database_by_species(grch37)
+  ensembl_download_service.download_database_by_species(grch37)
 
-    logger.info("Pipeline Finish !!!")
+  logger.info("Pipeline Finish !!!")