diff --git a/.travis.yml b/.travis.yml index a023625..a830aad 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,7 +3,6 @@ os: - linux sudo: false python: - - "2.7" - "3.6" script: - python setup.py test @@ -23,6 +22,8 @@ jobs: include: - stage: deploy docs language: python + python: + - "3.6" install: - pip install mkdocs==1 - pip install mkdocs-material==3.0.3 diff --git a/docs/usage.md b/docs/usage.md index c769eed..5d51733 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -1,5 +1,121 @@ RefChef comes with two main commands (`refchef-cook` and `refchef-menu`). `refchef-cook` will read the recipes and execute the commands that will retrieve the references, indices, or annotations. `refchef-menu` provides an easy way to summarize the items already on the system. +- See the installation instructions for how to install refchef. +- Create your own local repository for tracking references: +``` +cd /Volumes/jwalla12 +git init local_references +``` + +- Create a directory for refchef to store your references: +``` +mkdir /Volumes/jwalla12/references +``` + +- Create a `master.yaml` file and save it in your git repository. This file will contain the commands that will be executed to download your references, as well as some additional metadata. For more information about the details of the .yaml file format, see (https://compbiocore.github.io/refchef/specs/). Note that the creation of the `final_checksums.md5` file should always be included in the `master.yaml` file. 
As a minimal example, here is a `master.yaml` file that will download the grch38 human genome from Ensembl: +``` +grch38: + metadata: + name: grch38_release87 + species: Homo sapiens + organization: ensembl + downloader: jrwallace + levels: + references: + - component: primary + complete: + status: false + commands: + - wget ftp://ftp.ensembl.org/pub/release-87/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz + - wget ftp://ftp.ensembl.org/pub/release-87/fasta/homo_sapiens/dna/CHECKSUMS + - md5sum *.gz > postdownload-checksums.md5 + - gunzip *.gz + - md5sum *.* > final_checksums.md5 + +``` +- In addition to the .yaml file, you will also need to specify the following details: 1. where you'd like the references to be saved, 2. the local git repository for version control of references, and 3. the remote github repository for version control of reference sequences. There are a few options for relaying this information to refchef -- they can be specified in a `cfg.ini` file or a `cfg.yaml` file, or you can pass them as arguments to `refchef-cook` -- the command that will read your `master.yaml` file and download the references. The following is an example where arguments are passed to `refchef-cook` and references are not pushed to a remote repository: + +``` +refchef-cook -e -o /Volumes/jwalla12/references -gl /Volumes/jwalla12/local_references +``` + +todo: add examples re: using a cfg file and remote repo + +- Then you'll see the following: +``` +/anaconda3/lib/python3.7/site-packages/refchef/utils.py:12: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details. + dict_ = yaml.load(yml) + 🐶 RefChef... 
getting reference: grch38, component: primary +Running command "wget ftp://ftp.ensembl.org/pub/release-87/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz" +--2019-07-12 15:56:56-- ftp://ftp.ensembl.org/pub/release-87/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz + => ‘Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz’ +Resolving ftp.ensembl.org (ftp.ensembl.org)... 193.62.193.8 +Connecting to ftp.ensembl.org (ftp.ensembl.org)|193.62.193.8|:21... connected. +Logging in as anonymous ... Logged in! +==> SYST ... done. ==> PWD ... done. +==> TYPE I ... done. ==> CWD (1) /pub/release-87/fasta/homo_sapiens/dna ... done. +==> SIZE Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz ... 881214448 +==> PASV ... done. ==> RETR Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz ... done. +Length: 881214448 (840M) (unauthoritative) + +Homo_sapiens.GRCh38.d 100%[=======================>] 840.39M 6.71MB/s in 4m 26s + +2019-07-12 16:01:25 (3.16 MB/s) - ‘Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz’ saved [881214448] + +Running command "wget ftp://ftp.ensembl.org/pub/release-87/fasta/homo_sapiens/dna/CHECKSUMS" +--2019-07-12 16:01:25-- ftp://ftp.ensembl.org/pub/release-87/fasta/homo_sapiens/dna/CHECKSUMS + => ‘CHECKSUMS’ +Resolving ftp.ensembl.org (ftp.ensembl.org)... 193.62.193.8 +Connecting to ftp.ensembl.org (ftp.ensembl.org)|193.62.193.8|:21... connected. +Logging in as anonymous ... Logged in! +==> SYST ... done. ==> PWD ... done. +==> TYPE I ... done. ==> CWD (1) /pub/release-87/fasta/homo_sapiens/dna ... done. +==> SIZE CHECKSUMS ... 5010 +==> PASV ... done. ==> RETR CHECKSUMS ... done. 
+Length: 5010 (4.9K) (unauthoritative) + +CHECKSUMS 100%[=======================>] 4.89K --.-KB/s in 0s + +2019-07-12 16:01:27 (97.5 MB/s) - ‘CHECKSUMS’ saved [5010] + +Running command "md5sum *.gz > postdownload-checksums.md5" +Running command "gunzip *.gz" +Running command "md5sum *.* > final_checksums.md5" + +``` + +- After this command is run, master.yaml will reflect that you have downloaded the references and it will now look like this: +``` +grch38: + metadata: + name: grch38_release87 + species: Homo sapiens + organization: ensembl + downloader: jrwallace + levels: + references: + - component: primary + complete: + status: true + time: 2019-07-12 16:02:25.505498 + commands: + - wget ftp://ftp.ensembl.org/pub/release-87/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz + - wget ftp://ftp.ensembl.org/pub/release-87/fasta/homo_sapiens/dna/CHECKSUMS + - md5sum *.gz > postdownload-checksums.md5 + - gunzip *.gz + - md5sum *.* > final_checksums.md5 + location: /Volumes/jwalla12/references/grch38/primary + files: + - CHECKSUMS + - final_checksums.md5 + - Homo_sapiens.GRCh38.dna.primary_assembly.fa + - metadata.txt + - postdownload-checksums.md5 + +``` + +todo: add information re: adding references already present elsewhere (should the command be more like a cp command?) + #### User workflow diagram ![Diagram](assets/refchef-diagram.svg) @@ -12,7 +128,7 @@ Both scripts can take a `--config (-c)` argument with the path for a config file config-yaml: path-settings: reference-directory: ~/data/references_dir # directory where references will be downloaded and processed. - github-directory: ~/data/git_local # local git repository where `master.yaml` is located. + git-directory: ~/data/git_local # local git repository where `master.yaml` is located. 
remote-repository: user/repo # remote user and repository for version control of `master.yaml` log-settings: log: 'yes' diff --git a/refchef/config.py b/refchef/config.py index bc485af..6e66c44 100644 --- a/refchef/config.py +++ b/refchef/config.py @@ -9,6 +9,7 @@ pass + class Config: def __init__(self, reference_dir, git_local, git_remote, log): self.reference_dir = os.path.expanduser(reference_dir) @@ -25,7 +26,7 @@ def yaml(path): d['reference_dir'] = dict_['config-yaml']['path-settings']['reference-directory'] d['git_local'] = dict_['config-yaml']['path-settings']['git-directory'] d['git_remote'] = dict_['config-yaml']['path-settings']['remote-repository'] - d['log'] = dict_['config-yaml']['log-settings']['log'] + d['log'] = utils.process_logical(dict_['config-yaml']['log-settings']['log']) # d['break_on_error'] = dict_['config-yaml']['runtime-settings']['break-on-error'] # d['verbose'] = dict_['config-yaml']['runtime-settings']['verbose'] @@ -40,7 +41,7 @@ def ini(path): d['reference_dir'] = config.get('path-settings', 'reference-directory') d['git_local'] = config.get('path-settings', 'git-directory') d['git_remote'] = config.get('path-settings', 'remote-repository') - d['log'] = config.get('log-settings', 'log') + d['log'] = utils.process_logical(config.get('log-settings', 'log')) # d['break_on_error'] = config.get('runtime-settings', 'break-on-error') # d['verbose'] = config.get('runtime-settings', 'verbose') diff --git a/refchef/github_utils.py b/refchef/github_utils.py index 4eeccce..bbf3d87 100644 --- a/refchef/github_utils.py +++ b/refchef/github_utils.py @@ -8,6 +8,8 @@ from refchef import config from refchef.utils import * + + def setup_git(conf): git_dir = os.path.join(conf.git_local, '.git') work_tree = os.path.join(conf.git_local, '') diff --git a/refchef/references.py b/refchef/references.py index ca52bdb..fa29fb6 100644 --- a/refchef/references.py +++ b/refchef/references.py @@ -13,6 +13,7 @@ def execute(conf, file_name): """Process all steps to 
create directories, fetch files, and update yaml for references/indices/annotations""" + yaml_file = os.path.join(conf.git_local, file_name) yaml_dict = utils.read_yaml(yaml_file) keys = list(yaml_dict.keys()) @@ -32,7 +33,6 @@ def execute(conf, file_name): k, component) logging.info(to_print) - print(to_print) # Fetch references fetch(entry['commands'], path_) @@ -71,7 +71,7 @@ def fetch(command_list, directory): """ Run all commands from within the given directory""" for c in command_list: with cd(directory): - print("Running command \"{}\"".format(c)) + logging.info("Running command \"{}\"".format(c)) subprocess.call(c, shell=True) def get_filenames(path_): @@ -82,14 +82,20 @@ def get_filenames(path_): def add_uuid(path_): """Reads final_checksums.md5 and returns id.""" - with open(os.path.join(path_, 'final_checksums.md5'), 'r') as f: - line = f.readline().replace('\n','') - if sys.platform == 'darwin': - id_ = line.split(" = ")[1] - else: - id_ = line.split(" ")[0] - - return str(uuid.uuid3(uuid.NAMESPACE_DNS, id_)) + if os.path.exists(os.path.join(path_, 'final_checksums.md5')): + with open(os.path.join(path_, 'final_checksums.md5'), 'r') as f: + line = f.readline().replace('\n','') + if sys.platform == 'darwin': + cs = line.split(" = ")[1] + else: + cs = line.split(" ")[0] + + return str(uuid.uuid3(uuid.NAMESPACE_DNS, cs)) + else: + logging.warning("No final_checksums.md found. 
UUID will not correspond to checksum.") + return str(uuid.uuid1()) + + return _id def create_metadata_file(metadata, path_): """Creates metadata.txt file.""" diff --git a/refchef/table_utils.py b/refchef/table_utils.py index ae865a6..ff82fa7 100644 --- a/refchef/table_utils.py +++ b/refchef/table_utils.py @@ -13,6 +13,7 @@ from refchef.github_utils import read_menu_from_github from refchef.utils import * + def get_full_menu(master): """Reads yaml file and converts to a table format""" diff --git a/refchef/utils.py b/refchef/utils.py index 40ac609..e04b913 100644 --- a/refchef/utils.py +++ b/refchef/utils.py @@ -6,6 +6,7 @@ from collections import OrderedDict, defaultdict, Mapping from future.utils import iteritems + def read_yaml(file_path): """Simple function to read yaml file""" with open(file_path) as yml: diff --git a/scripts/refchef-cook b/scripts/refchef-cook index 074313c..0043919 100644 --- a/scripts/refchef-cook +++ b/scripts/refchef-cook @@ -14,90 +14,128 @@ import glob import logging import datetime -parser = argparse.ArgumentParser(description='Controls how to run the reference parser') - -parser.add_argument('--execute', '-e', help = 'Executes the YAML file, either the new if it exists or the master if not', action='store_true') -parser.add_argument('--new', '-n', type=str, help = 'Denotes the new YAML') -parser.add_argument('--git', '-g', choices=['commit', 'push'], help='Git commands to use. 
Use `commit` if no `--git_remote` is passed.') -parser.add_argument('--config', '-c', type=str, help='Path do to config file in .yaml or .ini format.') -parser.add_argument('--outdir', '-o', type=str, default=False, help='Directory where references will be saved.') -parser.add_argument('--git_local', '-gl', type=str, default=False, help='Local git directory, where master.yaml will be located.') -parser.add_argument('--git_remote', '-gr', type=str, default=False, help='Remote Git repository.') -parser.add_argument('--logs', '-l', action='store_true', help='Logging mode on/off.') - - -# Parse arguments -arguments = parser.parse_args() - -# Check for config file or config arguments. -arg_dict = {'reference_dir': arguments.outdir, - 'git_local': arguments.git_local, - 'git_remote': arguments.git_remote, - 'log': arguments.logs} - -conf = False - -if arguments.config: - try: +def main(): + parser = argparse.ArgumentParser(description='Controls how to run the reference parser') + + parser.add_argument('--execute', '-e', help = 'Executes the YAML file, either the new if it exists or the master if not', action='store_true') + parser.add_argument('--new', '-n', type=str, help = 'Denotes the new YAML') + parser.add_argument('--git', '-g', choices=['commit', 'push'], help='Git commands to use. Use `commit` if no `--git_remote` is passed.') + parser.add_argument('--config', '-c', type=str, help='Path do to config file in .yaml or .ini format.') + parser.add_argument('--outdir', '-o', type=str, default=False, help='Directory where references will be saved.') + parser.add_argument('--git_local', '-gl', type=str, default=False, help='Local git directory, where master.yaml will be located.') + parser.add_argument('--git_remote', '-gr', type=str, default=False, help='Remote Git repository.') + parser.add_argument('--logs', '-l', action='store_true', help='Logging mode on/off.') + + # Parse arguments + arguments = parser.parse_args() + + # Check for config file or config arguments. 
+ arg_dict = {'reference_dir': arguments.outdir, + 'git_local': arguments.git_local, + 'git_remote': arguments.git_remote, + 'log': arguments.logs} + + conf = False + + if arguments.config: + print(read_yaml(arguments.config)) + print(config.yaml(arguments.config)) try: - d = config.yaml(arguments.config) + try: + d = config.yaml(arguments.config) + except: + d = config.ini(arguments.config) + conf = config.Config(**d) except: - d = config.ini(arguments.config) - conf = config.Config(**d) - except: - print("""Malformatted config file. See the documentation for details at - https://compbiocore.github.io/refchef - """) -else: - try: - conf = config.Config(**arg_dict) - except: - print("""No configuration file found. Try passing a .ini or .yaml file to --config (-c), -or at least the output directory (--outdir, -o) and the path to the local git directory -for this project (--git_local, -gl). For more details: refchef-cook --help, or see the -documentation at https://compbiocore.github.io/refchef - """) - -if conf: - - ### Log summary - if arguments.logs: - FORMAT = '%(asctime)-15s %(clientip)s %(user)-8s %(message)s' + print("""Malformatted config file. See the documentation for details at + https://compbiocore.github.io/refchef + """) + else: + try: + conf = config.Config(**arg_dict) + except: + print("""No configuration file found. Try passing a .ini or .yaml file to --config (-c), + or at least the output directory (--outdir, -o) and the path to the local git directory + for this project (--git_local, -gl). 
For more details: refchef-cook --help, or see the + documentation at https://compbiocore.github.io/refchef + """) + + if conf: + ### Log summary + FORMAT = '%(asctime)s %(levelname)s: %(message)s' now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") - path_ = os.path.join(conf.git_local, 'logs') - file_name = 'refchef_{}.log'.format(now) - if not os.path.exists(path_): - os.makedirs(path_) + if conf.log: + + path_ = os.path.join(conf.git_local, 'logs') + file_name = 'refchef_{}.log'.format(now) + + if not os.path.exists(path_): + os.makedirs(path_) + + logging.basicConfig(filename=os.path.join(path_, file_name), + format=FORMAT, + level=logging.DEBUG) - logging.basicConfig(filename=os.path.join(path_, file_name), - format=FORMAT, - level=logging.INFO) + else: + logging.basicConfig(format=FORMAT, + level=logging.INFO) - # Read menu (master.yaml) - master = read_menu(conf) + logging.getLogger().addHandler(logging.StreamHandler()) - # If new argument, append that to master and reload master. - if arguments.new is not None: - origin = arguments.new - destination = os.path.join(conf.git_local, 'master.yaml') - utils.append_yaml(origin, destination) + # Read menu (master.yaml) master = read_menu(conf) - ## Execute, commit and push steps. 
- if arguments.execute: - execute(conf, 'master.yaml') - if arguments.logs: - m = read_yaml(os.path.join(conf.git_local, 'master.yaml')) - logging.info("\n{0}".format(get_full_menu(m)[['type', 'name', 'component', 'organization', 'uuid']])) - - git_dir, work_tree = gh.setup_git(conf) - - ## Git Steps - if arguments.git == 'push': - gh.pull(git_dir, work_tree) - gh.commit(git_dir, work_tree) - gh.push(git_dir, work_tree) - elif arguments.git == 'commit': - gh.commit(git_dir, work_tree) + logging.info(u""" + =========================================== + REFCHEF \U0001F436 + ------------------------------------------- + - References will be downloaded to: {0} + - Remote repository for master.yaml {1} + - Local repository for master.yaml {2} + - Logs files: {3}/logs/ + ------------------------------------------- + """.format(conf.reference_dir, conf.git_remote, conf.git_local, conf.git_local)) + + + # If new argument, append that to master and reload master. + if arguments.new is not None: + origin = arguments.new + destination = os.path.join(conf.git_local, 'master.yaml') + utils.append_yaml(origin, destination) + master = read_menu(conf) + + for r in master.keys(): + for i in master[r]['levels']['references']: + if not i['complete']['status']: + logging.info(u""" + ------------------------------------------- + The folowing references will be downloaded: + - {0} + =========================================== + """.format(r)) + else: + logging.info(""" + No references to download. + """) + + ## Execute, commit and push steps. 
+ if arguments.execute: + execute(conf, 'master.yaml') + if arguments.logs: + m = read_yaml(os.path.join(conf.git_local, 'master.yaml')) + logging.info("\n{0}".format(get_full_menu(m)[['type', 'name', 'component', 'organization', 'uuid']])) + + git_dir, work_tree = gh.setup_git(conf) + + ## Git Steps + if arguments.git == 'push': + gh.pull(git_dir, work_tree) + gh.commit(git_dir, work_tree) + gh.push(git_dir, work_tree) + elif arguments.git == 'commit': + gh.commit(git_dir, work_tree) + +if __name__ == '__main__': + main() diff --git a/scripts/refchef-menu b/scripts/refchef-menu index 7b8efe6..651c32b 100644 --- a/scripts/refchef-menu +++ b/scripts/refchef-menu @@ -13,43 +13,47 @@ from refchef.table_utils import * from refchef.utils import * from refchef import config -parser = argparse.ArgumentParser(description='Get and filter references available in the system.') - -parser.add_argument("--filter", type=str, help="Field:value pair to filter menu on.") -# parser.add_argument("--regex", "-r", -# help="Whether value passed to filter is a regex expression.", -# action="store_true") -parser.add_argument('--master', '-m', type=str, help='Path do to master.yaml') -parser.add_argument('--config', '-c', type=str, help='Path do to config file in .yaml or .ini format.') -parser.add_argument('--full', action='store_true', help='Whether to show full table, including location and names of files.') -# Parse arguments -arguments = parser.parse_args() - -if arguments.config: - try: - d = config.yaml(arguments.config) - except: - d = config.ini(arguments.config) - conf = config.Config(**d) - - master = read_yaml(os.path.join(conf.git_local, 'master.yaml')) - -if arguments.master: - master = read_yaml(os.path.expanduser(arguments.master)) - - -menu = get_full_menu(master) - -if (arguments.filter is not None): - filtered = multiple_filter(menu, arguments.filter) - if arguments.full: - pretty_print(filtered) - else: - partial = filtered.drop(columns=['location', 'files']) - 
pretty_print(partial) -else: - if arguments.full: - pretty_print(menu) +def main(): + parser = argparse.ArgumentParser(description='Get and filter references available in the system.') + + parser.add_argument("--filter", type=str, help="Field:value pair to filter menu on.") + # parser.add_argument("--regex", "-r", + # help="Whether value passed to filter is a regex expression.", + # action="store_true") + parser.add_argument('--master', '-m', type=str, help='Path do to master.yaml') + parser.add_argument('--config', '-c', type=str, help='Path do to config file in .yaml or .ini format.') + parser.add_argument('--full', action='store_true', help='Whether to show full table, including location and names of files.') + # Parse arguments + arguments = parser.parse_args() + + if arguments.config: + try: + d = config.yaml(arguments.config) + except: + d = config.ini(arguments.config) + conf = config.Config(**d) + + master = read_yaml(os.path.join(conf.git_local, 'master.yaml')) + + if arguments.master: + master = read_yaml(os.path.expanduser(arguments.master)) + + + menu = get_full_menu(master) + + if (arguments.filter is not None): + filtered = multiple_filter(menu, arguments.filter) + if arguments.full: + pretty_print(filtered) + else: + partial = filtered.drop(columns=['location', 'files']) + pretty_print(partial) else: - partial = menu.drop(columns=['location', 'files']) - pretty_print(partial) + if arguments.full: + pretty_print(menu) + else: + partial = menu.drop(columns=['location', 'files']) + pretty_print(partial) + +if __name__ == '__main__': + main() diff --git a/tests/test_config.py b/tests/test_config.py index 536729b..b4f6823 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -3,6 +3,7 @@ import sys import shutil from refchef import config + try: input = raw_input except NameError: