From cd498dbe33bbb57ffa5c4d1009678ae25244f737 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Mon, 12 Jun 2023 10:29:40 -0500 Subject: [PATCH 01/17] add retreive samples logic --- .../workspace_downloader.py | 160 +++++++++++++----- 1 file changed, 115 insertions(+), 45 deletions(-) diff --git a/src/loaders/workspace_downloader/workspace_downloader.py b/src/loaders/workspace_downloader/workspace_downloader.py index d3f18e7d4..a95b41b18 100644 --- a/src/loaders/workspace_downloader/workspace_downloader.py +++ b/src/loaders/workspace_downloader/workspace_downloader.py @@ -1,7 +1,7 @@ """ usage: workspace_downloader.py [-h] --workspace_id WORKSPACE_ID [--kbase_collection KBASE_COLLECTION] [--source_version SOURCE_VERSION] [--root_dir ROOT_DIR] [--kb_base_url KB_BASE_URL] [--workers WORKERS] [--token_filepath TOKEN_FILEPATH] - [--keep_job_dir] + [--keep_job_dir] [--retrieve_sample] PROTOTYPE - Download genome files from the workspace service (WSS). @@ -24,6 +24,7 @@ --token_filepath TOKEN_FILEPATH A file path that stores KBase token --keep_job_dir Keep SDK job directory after download task is completed + --retrieve_sample Retrieve sample for each genome object e.g. 
@@ -52,6 +53,7 @@ from multiprocessing import Pool, Queue, cpu_count import docker +import requests from src.clients.AssemblyUtilClient import AssemblyUtil from src.clients.workspaceClient import Workspace @@ -68,26 +70,27 @@ class Conf: def __init__( - self, - job_dir, - output_dir, - workers, - kb_base_url, - token_filepath, + self, + job_dir, + output_dir, + workers, + kb_base_url, + token_filepath, ): port = loader_helper.find_free_port() - token = loader_helper.get_token(token_filepath) + self.token = loader_helper.get_token(token_filepath) self.start_callback_server( - docker.from_env(), uuid.uuid4().hex, job_dir, kb_base_url, token, port + docker.from_env(), uuid.uuid4().hex, job_dir, kb_base_url, self.token, port ) ws_url = os.path.join(kb_base_url, "ws") + self.sample_url = os.path.join(kb_base_url, "sampleservice") callback_url = "http://" + loader_helper.get_ip() + ":" + str(port) print("callback_url:", callback_url) - self.ws = Workspace(ws_url, token=token) - self.asu = AssemblyUtil(callback_url, token=token) + self.ws = Workspace(ws_url, token=self.token) + self.asu = AssemblyUtil(callback_url, token=self.token) self.queue = Queue() self.pth = output_dir self.job_dir = job_dir @@ -115,7 +118,7 @@ def setup_callback_server_envs(self, job_dir, kb_base_url, token, port): return env, vol def start_callback_server( - self, client, container_name, job_dir, kb_base_url, token, port + self, client, container_name, job_dir, kb_base_url, token, port ): env, vol = self.setup_callback_server_envs(job_dir, kb_base_url, token, port) self.container = client.containers.run( @@ -150,7 +153,7 @@ def _make_job_dir(root_dir, job_dir, username): def _make_collection_source_dir( - root_dir, collection_source_dir, collection, source_verion + root_dir, collection_source_dir, collection, source_verion ): """ Helper function that creates a collection & source_version and link in data @@ -213,9 +216,9 @@ def _create_softlink(csd_upa_dir, upa_dir): """ if 
os.path.exists(csd_upa_dir): if ( - os.path.isdir(csd_upa_dir) - and os.path.islink(csd_upa_dir) - and os.readlink(csd_upa_dir) == upa_dir + os.path.isdir(csd_upa_dir) + and os.path.islink(csd_upa_dir) + and os.readlink(csd_upa_dir) == upa_dir ): return raise ValueError( @@ -245,7 +248,7 @@ def _process_object_info(obj_info, genome_upa): def list_objects( - wsid, conf, filter_objects_name_by, include_metadata=False, batch_size=10000 + wsid, conf, filter_objects_name_by, include_metadata=False, batch_size=10000 ): """ List all objects information given a workspace ID. @@ -278,31 +281,99 @@ def process_input(conf): if not task: print("Stopping") break - upa, obj_info, genome_upa = task + upa, obj_info, genome_upa, retrieve_sample = task - # cfn points to the assembly file outside of the container - # get_assembly_as_fasta writes the file to /kb/module/workdir/tmp/ inside the container. - # workdir is shared between the container and the external file system - # Any file path get_assembly_as_fasta returns will be relative to inside the container, and so is not useful for this script + upa_dir = os.path.join(conf.pth, upa) + if not os.path.isdir(upa_dir) or not loader_helper.is_upa_info_complete(upa_dir): - cfn = os.path.join(conf.job_dir, "workdir/tmp", upa) - # upa file is downloaded to cfn - conf.asu.get_assembly_as_fasta({"ref": upa.replace("_", "/"), "filename": upa}) + # remove legacy upa_dir to avoid FileExistsError in hard link + if os.path.isdir(upa_dir): + shutil.rmtree(upa_dir) - # each upa in output_dir as a seperate directory - dstd = os.path.join(conf.pth, upa) - os.makedirs(dstd, exist_ok=True) + # cfn points to the assembly file outside of the container + # get_assembly_as_fasta writes the file to /kb/module/workdir/tmp/ inside the container. 
+ # workdir is shared between the container and the external file system + # Any file path get_assembly_as_fasta returns will be relative to inside the container, and so is not useful for this script + cfn = os.path.join(conf.job_dir, "workdir/tmp", upa) + # upa file is downloaded to cfn + conf.asu.get_assembly_as_fasta({"ref": upa.replace("_", "/"), "filename": upa}) - dst = os.path.join(dstd, f"{upa}.fa") - # Hard link .fa file from job_dir to output_dir in WS - os.link(cfn, dst) + # each upa in output_dir as a separate directory + dstd = os.path.join(conf.pth, upa) + os.makedirs(dstd, exist_ok=True) - metafile = os.path.join(dstd, f"{upa}.meta") - # save meta file with relevant object_info - with open(metafile, "w", encoding="utf8") as json_file: - json.dump(_process_object_info(obj_info, genome_upa), json_file, indent=2) + dst = os.path.join(dstd, f"{upa}.fa") + # Hard link .fa file from job_dir to output_dir in WS + os.link(cfn, dst) - print("Completed %s" % (upa)) + metafile = os.path.join(dstd, f"{upa}.meta") + # save meta file with relevant object_info + with open(metafile, "w", encoding="utf8") as json_file: + json.dump(_process_object_info(obj_info, genome_upa), json_file, indent=2) + + print("Completed %s" % (upa)) + else: + print(f"Skip downloading {upa} as it already exists") + + if retrieve_sample: + _download_sample_data(conf, upa) + + +def _download_sample_data(conf, upa): + # retrieve sample data from sample service and save to file + + dstd = os.path.join(conf.pth, upa) + os.makedirs(dstd, exist_ok=True) + sample_file = os.path.join(dstd, f"{upa}.sample") + + if os.path.isfile(sample_file): + print(f"Skip downloading sample for {upa} as it already exists") + return + + # retrieve data links associated with upa + links_ret = _post_sample_service(conf.token, + conf.sample_url, + "get_data_links_from_data", + {"upa": upa.replace("_", "/")}) + data_links = links_ret['links'] + if not data_links: + print(f"No sample data links found for {upa}") + 
return + + # there should only be one data link for each upa + if len(data_links) != 1: + raise ValueError(f"Expected 1 data link for {upa}, got {len(data_links)}") + + # retrieve sample data and save to file + sample_id = data_links[0]['id'] + sample_ret = _post_sample_service(conf.token, + conf.sample_url, + "get_sample", + {"id": sample_id}) + + with open(sample_file, "w", encoding="utf8") as json_file: + json.dump(sample_ret, json_file, indent=2) + + +def _post_sample_service(token, sample_url, method, params): + # Sends a POST request to the sample service API. + + headers = { + "Authorization": token, + "Content-Type": "application/json" + } + payload = { + "method": f"SampleService.{method}", + "id": str(uuid.uuid4()), + "params": [params] + } + resp = requests.post(url=sample_url, headers=headers, json=payload) + resp_json = resp.json() + if resp_json.get('error'): + raise RuntimeError(f"Error from SampleService - {resp_json['error']}") + result = resp_json['result'][0] + + return result def main(): @@ -361,6 +432,11 @@ def main(): action="store_true", help="Keep SDK job directory after download task is completed", ) + optional.add_argument( + "--retrieve_sample", + action="store_true", + help="Retrieve sample for each genome object", + ) args = parser.parse_args() @@ -372,6 +448,7 @@ def main(): workers = args.workers token_filepath = args.token_filepath keep_job_dir = args.keep_job_dir + retrieve_sample = args.retrieve_sample if bool(kbase_collection) ^ bool(source_version): parser.error( @@ -426,7 +503,7 @@ def main(): workspace_id, conf, loader_common_names.OBJECTS_NAME_ASSEMBLY, - ) + )[:5] assembly_genome_map, duplicate_map = _assembly_genome_lookup(genome_objs) if duplicate_map: for assembly_upa, id_and_date in duplicate_map.items(): @@ -445,15 +522,8 @@ def main(): for obj_info in assembly_objs: upa = "{6}_{0}_{4}".format(*obj_info) upas.append(upa) - upa_dir = os.path.join(output_dir, upa) - if os.path.isdir(upa_dir) and 
loader_helper.is_upa_info_complete(upa_dir): - continue - - # remove legacy upa_dir to avoid FileExistsError in hard link - if os.path.isdir(upa_dir): - shutil.rmtree(upa_dir) genome_upa = assembly_genome_map[upa.replace("_", "/")] - conf.queue.put([upa, obj_info, genome_upa]) + conf.queue.put([upa, obj_info, genome_upa, retrieve_sample]) for i in range(workers + 1): conf.queue.put(None) From 9bf75344a929cd91d86a8ed903d75a3e545cab75 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Mon, 12 Jun 2023 10:32:33 -0500 Subject: [PATCH 02/17] remove local testing logic --- src/loaders/workspace_downloader/workspace_downloader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/loaders/workspace_downloader/workspace_downloader.py b/src/loaders/workspace_downloader/workspace_downloader.py index a95b41b18..39c41ad2c 100644 --- a/src/loaders/workspace_downloader/workspace_downloader.py +++ b/src/loaders/workspace_downloader/workspace_downloader.py @@ -350,7 +350,7 @@ def _download_sample_data(conf, upa): conf.sample_url, "get_sample", {"id": sample_id}) - + with open(sample_file, "w", encoding="utf8") as json_file: json.dump(sample_ret, json_file, indent=2) @@ -503,7 +503,7 @@ def main(): workspace_id, conf, loader_common_names.OBJECTS_NAME_ASSEMBLY, - )[:5] + ) assembly_genome_map, duplicate_map = _assembly_genome_lookup(genome_objs) if duplicate_map: for assembly_upa, id_and_date in duplicate_map.items(): From d64341185fd9ba4cacc3615a73918586b3ba7062 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Mon, 12 Jun 2023 18:13:51 -0500 Subject: [PATCH 03/17] address comments --- .../workspace_downloader.py | 63 +++++++++++++------ 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/src/loaders/workspace_downloader/workspace_downloader.py b/src/loaders/workspace_downloader/workspace_downloader.py index 39c41ad2c..04a21466a 100644 --- a/src/loaders/workspace_downloader/workspace_downloader.py +++ 
b/src/loaders/workspace_downloader/workspace_downloader.py @@ -1,7 +1,7 @@ """ usage: workspace_downloader.py [-h] --workspace_id WORKSPACE_ID [--kbase_collection KBASE_COLLECTION] [--source_version SOURCE_VERSION] [--root_dir ROOT_DIR] [--kb_base_url KB_BASE_URL] [--workers WORKERS] [--token_filepath TOKEN_FILEPATH] - [--keep_job_dir] [--retrieve_sample] + [--keep_job_dir] [--retrieve_sample] [--ignore_no_sample_error] PROTOTYPE - Download genome files from the workspace service (WSS). @@ -25,6 +25,8 @@ A file path that stores KBase token --keep_job_dir Keep SDK job directory after download task is completed --retrieve_sample Retrieve sample for each genome object + --ignore_no_sample_error + Ignore error when no sample data is found for an object e.g. @@ -51,6 +53,7 @@ import uuid from collections import defaultdict from multiprocessing import Pool, Queue, cpu_count +from pathlib import Path import docker import requests @@ -67,6 +70,9 @@ # filename that logs genome duplicates for each assembly GENOME_DUPLICATE_FILE = "duplicate_genomes.json" +# key name for sample file in metadata file +SAMPLE_FILE_KEY = "sample_file" + class Conf: def __init__( @@ -76,10 +82,13 @@ def __init__( workers, kb_base_url, token_filepath, + retrieve_sample, + ignore_no_sample_error ): port = loader_helper.find_free_port() self.token = loader_helper.get_token(token_filepath) - + self.retrieve_sample = retrieve_sample + self.ignore_no_sample_error = ignore_no_sample_error self.start_callback_server( docker.from_env(), uuid.uuid4().hex, job_dir, kb_base_url, self.token, port ) @@ -281,9 +290,10 @@ def process_input(conf): if not task: print("Stopping") break - upa, obj_info, genome_upa, retrieve_sample = task + upa, obj_info, genome_upa = task upa_dir = os.path.join(conf.pth, upa) + metafile = os.path.join(upa_dir, f"{upa}.meta") if not os.path.isdir(upa_dir) or not loader_helper.is_upa_info_complete(upa_dir): # remove legacy upa_dir to avoid FileExistsError in hard link @@ -299,14 
+309,12 @@ def process_input(conf): conf.asu.get_assembly_as_fasta({"ref": upa.replace("_", "/"), "filename": upa}) # each upa in output_dir as a separate directory - dstd = os.path.join(conf.pth, upa) - os.makedirs(dstd, exist_ok=True) + os.makedirs(upa_dir, exist_ok=True) - dst = os.path.join(dstd, f"{upa}.fa") + dst = os.path.join(upa_dir, f"{upa}.fa") # Hard link .fa file from job_dir to output_dir in WS os.link(cfn, dst) - metafile = os.path.join(dstd, f"{upa}.meta") # save meta file with relevant object_info with open(metafile, "w", encoding="utf8") as json_file: json.dump(_process_object_info(obj_info, genome_upa), json_file, indent=2) @@ -315,18 +323,23 @@ def process_input(conf): else: print(f"Skip downloading {upa} as it already exists") - if retrieve_sample: - _download_sample_data(conf, upa) + if conf.retrieve_sample: + _download_sample_data(conf, upa, metafile) -def _download_sample_data(conf, upa): +def _download_sample_data(conf, upa, metafile): # retrieve sample data from sample service and save to file - dstd = os.path.join(conf.pth, upa) - os.makedirs(dstd, exist_ok=True) - sample_file = os.path.join(dstd, f"{upa}.sample") + with open(metafile, "r", encoding="utf8") as json_file: + meta = json.load(json_file) - if os.path.isfile(sample_file): + upa_dir = Path(metafile).parent + sample_file_name = f"{upa}.sample" + sample_file = os.path.join(upa_dir, sample_file_name) + + if (SAMPLE_FILE_KEY in meta and + meta[SAMPLE_FILE_KEY] == sample_file_name and + os.path.isfile(sample_file)): print(f"Skip downloading sample for {upa} as it already exists") return @@ -336,7 +349,7 @@ def _download_sample_data(conf, upa): "get_data_links_from_data", {"upa": upa.replace("_", "/")}) data_links = links_ret['links'] - if not data_links: + if not data_links and conf.ignore_no_sample_error: print(f"No sample data links found for {upa}") return @@ -348,12 +361,18 @@ def _download_sample_data(conf, upa): sample_id = data_links[0]['id'] sample_ret = 
_post_sample_service(conf.token, conf.sample_url, - "get_sample", - {"id": sample_id}) + "get_sample_via_data", + {"upa": upa.replace("_", "/"), + "id": sample_id, + "version": data_links[0]["version"]}) with open(sample_file, "w", encoding="utf8") as json_file: json.dump(sample_ret, json_file, indent=2) + meta[SAMPLE_FILE_KEY] = sample_file_name + with open(metafile, "w", encoding="utf8") as json_file: + json.dump(meta, json_file, indent=2) + def _post_sample_service(token, sample_url, method, params): # Sends a POST request to the sample service API. @@ -437,6 +456,11 @@ def main(): action="store_true", help="Retrieve sample for each genome object", ) + optional.add_argument( + "--ignore_no_sample_error", + action="store_true", + help="Ignore error when no sample data is found for an object", + ) args = parser.parse_args() @@ -449,6 +473,7 @@ def main(): token_filepath = args.token_filepath keep_job_dir = args.keep_job_dir retrieve_sample = args.retrieve_sample + ignore_no_sample_error = args.ignore_no_sample_error if bool(kbase_collection) ^ bool(source_version): parser.error( @@ -491,6 +516,8 @@ def main(): workers, kb_base_url, token_filepath, + retrieve_sample, + ignore_no_sample_error ) genome_objs = list_objects( @@ -523,7 +550,7 @@ def main(): upa = "{6}_{0}_{4}".format(*obj_info) upas.append(upa) genome_upa = assembly_genome_map[upa.replace("_", "/")] - conf.queue.put([upa, obj_info, genome_upa, retrieve_sample]) + conf.queue.put([upa, obj_info, genome_upa]) for i in range(workers + 1): conf.queue.put(None) From 674858d802ebc50132d41ad69a466515de6b3b84 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Mon, 12 Jun 2023 22:23:52 -0500 Subject: [PATCH 04/17] using sample service client --- src/clients/SampleServiceClient.py | 1055 +++++++++++++++++ src/loaders/common/loader_common_names.py | 5 +- .../workspace_downloader.py | 48 +- 3 files changed, 1069 insertions(+), 39 deletions(-) create mode 100644 src/clients/SampleServiceClient.py diff --git 
a/src/clients/SampleServiceClient.py b/src/clients/SampleServiceClient.py new file mode 100644 index 000000000..6671f348d --- /dev/null +++ b/src/clients/SampleServiceClient.py @@ -0,0 +1,1055 @@ +# -*- coding: utf-8 -*- +############################################################ +# +# Autogenerated by the KBase type compiler - +# any changes made here will be overwritten +# +############################################################ + +from __future__ import print_function +# the following is a hack to get the baseclient to import whether we're in a +# package or not. This makes pep8 unhappy hence the annotations. +try: + # baseclient and this client are in a package + from .baseclient import BaseClient as _BaseClient # @UnusedImport +except ImportError: + # no they aren't + from baseclient import BaseClient as _BaseClient # @Reimport + + +class SampleService(object): + + def __init__( + self, url=None, timeout=30 * 60, user_id=None, + password=None, token=None, ignore_authrc=False, + trust_all_ssl_certificates=False, + auth_svc='https://ci.kbase.us/services/auth/api/legacy/KBase/Sessions/Login'): + if url is None: + raise ValueError('A url is required') + self._service_ver = None + self._client = _BaseClient( + url, timeout=timeout, user_id=user_id, password=password, + token=token, ignore_authrc=ignore_authrc, + trust_all_ssl_certificates=trust_all_ssl_certificates, + auth_svc=auth_svc) + + def create_sample(self, params, context=None): + """ + Create a new sample or a sample version. + :param params: instance of type "CreateSampleParams" (Parameters for + creating a sample. If Sample.id is null, a new Sample is created + along with a new ID. Otherwise, a new version of Sample.id is + created. If Sample.id does not exist, an error is returned. Any + incoming user, version or timestamp in the incoming sample is + ignored. sample - the sample to save. 
prior_version - if non-null, + ensures that no other sample version is saved between + prior_version and the version that is created by this save. If + this is not the case, the sample will fail to save. as_admin - run + the method as a service administrator. The user must have full + administration permissions. as_user - create the sample as a + different user. Ignored if as_admin is not true. Neither the + administrator nor the impersonated user need have permissions to + the sample if a new version is saved.) -> structure: parameter + "sample" of type "Sample" (A Sample, consisting of a tree of + subsamples and replicates. id - the ID of the sample. user - the + user that saved the sample. node_tree - the tree(s) of sample + nodes in the sample. The the roots of all trees must be + BioReplicate nodes. All the BioReplicate nodes must be at the + start of the list, and all child nodes must occur after their + parents in the list. name - the name of the sample. Must be less + than 255 characters. save_date - the date the sample version was + saved. version - the version of the sample.) -> structure: + parameter "id" of type "sample_id" (A Sample ID. Must be globally + unique. Always assigned by the Sample service.), parameter "user" + of type "user" (A user's username.), parameter "node_tree" of list + of type "SampleNode" (A node in a sample tree. id - the ID of the + node. parent - the id of the parent node for the current node. + BioReplicate nodes, and only BioReplicate nodes, do not have a + parent. type - the type of the node. meta_controlled - metadata + restricted by the sample controlled vocabulary and validators. + source_meta - the pre-transformation keys and values of the + controlled metadata at the data source for controlled metadata + keys. In some cases the source metadata may be transformed prior + to ingestion by the Sample Service; the contents of this data + structure allows for reconstructing the original representation. 
+ The metadata here is not validated other than basic size checks + and is provided on an informational basis only. The metadata keys + in the SourceMetadata data structure must be a subset of the + meta_controlled mapping keys. meta_user - unrestricted metadata.) + -> structure: parameter "id" of type "node_id" (A SampleNode ID. + Must be unique within a Sample and be less than 255 characters.), + parameter "parent" of type "node_id" (A SampleNode ID. Must be + unique within a Sample and be less than 255 characters.), + parameter "type" of type "samplenode_type" (The type of a sample + node. One of: BioReplicate - a biological replicate. Always at the + top of the sample tree. TechReplicate - a technical replicate. + SubSample - a sub sample that is not a technical replicate.), + parameter "meta_controlled" of type "metadata" (Metadata attached + to a sample.) -> mapping from type "metadata_key" (A key in a + metadata key/value pair. Less than 1000 unicode characters.) to + type "metadata_value" (A metadata value, represented by a mapping + of value keys to primitive values. An example for a location + metadata key might be: { "name": "Castle Geyser", "lat": + 44.463816, "long": -110.836471 } "primitive values" means an int, + float, string, or equivalent typedefs. Including any collection + types is an error.) -> mapping from type "metadata_value_key" (A + key for a value associated with a piece of metadata. Less than + 1000 unicode characters. Examples: units, value, species) to + unspecified object, parameter "source_meta" of list of type + "SourceMetadata" (Information about a metadata key as it appeared + at the data source. The source key and value represents the + original state of the metadata before it was tranformed for + ingestion by the sample service. key - the metadata key. skey - + the key as it appeared at the data source. svalue - the value as + it appeared at the data source.) 
-> structure: parameter "key" of + type "metadata_key" (A key in a metadata key/value pair. Less than + 1000 unicode characters.), parameter "skey" of type "metadata_key" + (A key in a metadata key/value pair. Less than 1000 unicode + characters.), parameter "svalue" of type "metadata_value" (A + metadata value, represented by a mapping of value keys to + primitive values. An example for a location metadata key might be: + { "name": "Castle Geyser", "lat": 44.463816, "long": -110.836471 } + "primitive values" means an int, float, string, or equivalent + typedefs. Including any collection types is an error.) -> mapping + from type "metadata_value_key" (A key for a value associated with + a piece of metadata. Less than 1000 unicode characters. Examples: + units, value, species) to unspecified object, parameter + "meta_user" of type "metadata" (Metadata attached to a sample.) -> + mapping from type "metadata_key" (A key in a metadata key/value + pair. Less than 1000 unicode characters.) to type "metadata_value" + (A metadata value, represented by a mapping of value keys to + primitive values. An example for a location metadata key might be: + { "name": "Castle Geyser", "lat": 44.463816, "long": -110.836471 } + "primitive values" means an int, float, string, or equivalent + typedefs. Including any collection types is an error.) -> mapping + from type "metadata_value_key" (A key for a value associated with + a piece of metadata. Less than 1000 unicode characters. Examples: + units, value, species) to unspecified object, parameter "name" of + type "sample_name" (A sample name. Must be less than 255 + characters.), parameter "save_date" of type "timestamp" (A + timestamp in epoch milliseconds.), parameter "version" of type + "version" (The version of a sample. Always > 0.), parameter + "prior_version" of Long, parameter "as_admin" of type "boolean" (A + boolean value, 0 for false, 1 for true.), parameter "as_user" of + type "user" (A user's username.) 
+ :returns: instance of type "SampleAddress" (A Sample ID and version. + id - the ID of the sample. version - the version of the sample.) + -> structure: parameter "id" of type "sample_id" (A Sample ID. + Must be globally unique. Always assigned by the Sample service.), + parameter "version" of type "version" (The version of a sample. + Always > 0.) + """ + return self._client.call_method('SampleService.create_sample', + [params], self._service_ver, context) + + def get_sample(self, params, context=None): + """ + Get a sample. If the version is omitted the most recent sample is returned. + :param params: instance of type "GetSampleParams" (get_sample + parameters. id - the ID of the sample to retrieve. version - the + version of the sample to retrieve, or the most recent sample if + omitted. as_admin - get the sample regardless of ACLs as long as + the user has administration read permissions.) -> structure: + parameter "id" of type "sample_id" (A Sample ID. Must be globally + unique. Always assigned by the Sample service.), parameter + "version" of type "version" (The version of a sample. Always > + 0.), parameter "as_admin" of type "boolean" (A boolean value, 0 + for false, 1 for true.) + :returns: instance of type "Sample" (A Sample, consisting of a tree + of subsamples and replicates. id - the ID of the sample. user - + the user that saved the sample. node_tree - the tree(s) of sample + nodes in the sample. The the roots of all trees must be + BioReplicate nodes. All the BioReplicate nodes must be at the + start of the list, and all child nodes must occur after their + parents in the list. name - the name of the sample. Must be less + than 255 characters. save_date - the date the sample version was + saved. version - the version of the sample.) -> structure: + parameter "id" of type "sample_id" (A Sample ID. Must be globally + unique. 
Always assigned by the Sample service.), parameter "user" + of type "user" (A user's username.), parameter "node_tree" of list + of type "SampleNode" (A node in a sample tree. id - the ID of the + node. parent - the id of the parent node for the current node. + BioReplicate nodes, and only BioReplicate nodes, do not have a + parent. type - the type of the node. meta_controlled - metadata + restricted by the sample controlled vocabulary and validators. + source_meta - the pre-transformation keys and values of the + controlled metadata at the data source for controlled metadata + keys. In some cases the source metadata may be transformed prior + to ingestion by the Sample Service; the contents of this data + structure allows for reconstructing the original representation. + The metadata here is not validated other than basic size checks + and is provided on an informational basis only. The metadata keys + in the SourceMetadata data structure must be a subset of the + meta_controlled mapping keys. meta_user - unrestricted metadata.) + -> structure: parameter "id" of type "node_id" (A SampleNode ID. + Must be unique within a Sample and be less than 255 characters.), + parameter "parent" of type "node_id" (A SampleNode ID. Must be + unique within a Sample and be less than 255 characters.), + parameter "type" of type "samplenode_type" (The type of a sample + node. One of: BioReplicate - a biological replicate. Always at the + top of the sample tree. TechReplicate - a technical replicate. + SubSample - a sub sample that is not a technical replicate.), + parameter "meta_controlled" of type "metadata" (Metadata attached + to a sample.) -> mapping from type "metadata_key" (A key in a + metadata key/value pair. Less than 1000 unicode characters.) to + type "metadata_value" (A metadata value, represented by a mapping + of value keys to primitive values. 
An example for a location + metadata key might be: { "name": "Castle Geyser", "lat": + 44.463816, "long": -110.836471 } "primitive values" means an int, + float, string, or equivalent typedefs. Including any collection + types is an error.) -> mapping from type "metadata_value_key" (A + key for a value associated with a piece of metadata. Less than + 1000 unicode characters. Examples: units, value, species) to + unspecified object, parameter "source_meta" of list of type + "SourceMetadata" (Information about a metadata key as it appeared + at the data source. The source key and value represents the + original state of the metadata before it was tranformed for + ingestion by the sample service. key - the metadata key. skey - + the key as it appeared at the data source. svalue - the value as + it appeared at the data source.) -> structure: parameter "key" of + type "metadata_key" (A key in a metadata key/value pair. Less than + 1000 unicode characters.), parameter "skey" of type "metadata_key" + (A key in a metadata key/value pair. Less than 1000 unicode + characters.), parameter "svalue" of type "metadata_value" (A + metadata value, represented by a mapping of value keys to + primitive values. An example for a location metadata key might be: + { "name": "Castle Geyser", "lat": 44.463816, "long": -110.836471 } + "primitive values" means an int, float, string, or equivalent + typedefs. Including any collection types is an error.) -> mapping + from type "metadata_value_key" (A key for a value associated with + a piece of metadata. Less than 1000 unicode characters. Examples: + units, value, species) to unspecified object, parameter + "meta_user" of type "metadata" (Metadata attached to a sample.) -> + mapping from type "metadata_key" (A key in a metadata key/value + pair. Less than 1000 unicode characters.) to type "metadata_value" + (A metadata value, represented by a mapping of value keys to + primitive values. 
An example for a location metadata key might be: + { "name": "Castle Geyser", "lat": 44.463816, "long": -110.836471 } + "primitive values" means an int, float, string, or equivalent + typedefs. Including any collection types is an error.) -> mapping + from type "metadata_value_key" (A key for a value associated with + a piece of metadata. Less than 1000 unicode characters. Examples: + units, value, species) to unspecified object, parameter "name" of + type "sample_name" (A sample name. Must be less than 255 + characters.), parameter "save_date" of type "timestamp" (A + timestamp in epoch milliseconds.), parameter "version" of type + "version" (The version of a sample. Always > 0.) + """ + return self._client.call_method('SampleService.get_sample', + [params], self._service_ver, context) + + def get_samples(self, params, context=None): + """ + :param params: instance of type "GetSamplesParams" -> structure: + parameter "samples" of list of type "SampleIdentifier" -> + structure: parameter "id" of type "sample_id" (A Sample ID. Must + be globally unique. Always assigned by the Sample service.), + parameter "version" of type "version" (The version of a sample. + Always > 0.), parameter "as_admin" of type "boolean" (A boolean + value, 0 for false, 1 for true.) + :returns: instance of list of type "Sample" (A Sample, consisting of + a tree of subsamples and replicates. id - the ID of the sample. + user - the user that saved the sample. node_tree - the tree(s) of + sample nodes in the sample. The the roots of all trees must be + BioReplicate nodes. All the BioReplicate nodes must be at the + start of the list, and all child nodes must occur after their + parents in the list. name - the name of the sample. Must be less + than 255 characters. save_date - the date the sample version was + saved. version - the version of the sample.) -> structure: + parameter "id" of type "sample_id" (A Sample ID. Must be globally + unique. 
Always assigned by the Sample service.), parameter "user" + of type "user" (A user's username.), parameter "node_tree" of list + of type "SampleNode" (A node in a sample tree. id - the ID of the + node. parent - the id of the parent node for the current node. + BioReplicate nodes, and only BioReplicate nodes, do not have a + parent. type - the type of the node. meta_controlled - metadata + restricted by the sample controlled vocabulary and validators. + source_meta - the pre-transformation keys and values of the + controlled metadata at the data source for controlled metadata + keys. In some cases the source metadata may be transformed prior + to ingestion by the Sample Service; the contents of this data + structure allows for reconstructing the original representation. + The metadata here is not validated other than basic size checks + and is provided on an informational basis only. The metadata keys + in the SourceMetadata data structure must be a subset of the + meta_controlled mapping keys. meta_user - unrestricted metadata.) + -> structure: parameter "id" of type "node_id" (A SampleNode ID. + Must be unique within a Sample and be less than 255 characters.), + parameter "parent" of type "node_id" (A SampleNode ID. Must be + unique within a Sample and be less than 255 characters.), + parameter "type" of type "samplenode_type" (The type of a sample + node. One of: BioReplicate - a biological replicate. Always at the + top of the sample tree. TechReplicate - a technical replicate. + SubSample - a sub sample that is not a technical replicate.), + parameter "meta_controlled" of type "metadata" (Metadata attached + to a sample.) -> mapping from type "metadata_key" (A key in a + metadata key/value pair. Less than 1000 unicode characters.) to + type "metadata_value" (A metadata value, represented by a mapping + of value keys to primitive values. 
An example for a location + metadata key might be: { "name": "Castle Geyser", "lat": + 44.463816, "long": -110.836471 } "primitive values" means an int, + float, string, or equivalent typedefs. Including any collection + types is an error.) -> mapping from type "metadata_value_key" (A + key for a value associated with a piece of metadata. Less than + 1000 unicode characters. Examples: units, value, species) to + unspecified object, parameter "source_meta" of list of type + "SourceMetadata" (Information about a metadata key as it appeared + at the data source. The source key and value represents the + original state of the metadata before it was tranformed for + ingestion by the sample service. key - the metadata key. skey - + the key as it appeared at the data source. svalue - the value as + it appeared at the data source.) -> structure: parameter "key" of + type "metadata_key" (A key in a metadata key/value pair. Less than + 1000 unicode characters.), parameter "skey" of type "metadata_key" + (A key in a metadata key/value pair. Less than 1000 unicode + characters.), parameter "svalue" of type "metadata_value" (A + metadata value, represented by a mapping of value keys to + primitive values. An example for a location metadata key might be: + { "name": "Castle Geyser", "lat": 44.463816, "long": -110.836471 } + "primitive values" means an int, float, string, or equivalent + typedefs. Including any collection types is an error.) -> mapping + from type "metadata_value_key" (A key for a value associated with + a piece of metadata. Less than 1000 unicode characters. Examples: + units, value, species) to unspecified object, parameter + "meta_user" of type "metadata" (Metadata attached to a sample.) -> + mapping from type "metadata_key" (A key in a metadata key/value + pair. Less than 1000 unicode characters.) to type "metadata_value" + (A metadata value, represented by a mapping of value keys to + primitive values. 
An example for a location metadata key might be: + { "name": "Castle Geyser", "lat": 44.463816, "long": -110.836471 } + "primitive values" means an int, float, string, or equivalent + typedefs. Including any collection types is an error.) -> mapping + from type "metadata_value_key" (A key for a value associated with + a piece of metadata. Less than 1000 unicode characters. Examples: + units, value, species) to unspecified object, parameter "name" of + type "sample_name" (A sample name. Must be less than 255 + characters.), parameter "save_date" of type "timestamp" (A + timestamp in epoch milliseconds.), parameter "version" of type + "version" (The version of a sample. Always > 0.) + """ + return self._client.call_method('SampleService.get_samples', + [params], self._service_ver, context) + + def get_sample_acls(self, params, context=None): + """ + Get a sample's ACLs. + :param params: instance of type "GetSampleACLsParams" + (get_sample_acls parameters. id - the ID of the sample to + retrieve. as_admin - get the sample acls regardless of ACL + contents as long as the user has administration read permissions.) + -> structure: parameter "id" of type "sample_id" (A Sample ID. + Must be globally unique. Always assigned by the Sample service.), + parameter "as_admin" of type "boolean" (A boolean value, 0 for + false, 1 for true.) + :returns: instance of type "SampleACLs" (Access control lists for a + sample. Access levels include the privileges of the lower access + levels. owner - the user that created and owns the sample. admin - + users that can administrate (e.g. alter ACLs) the sample. write - + users that can write (e.g. create a new version) to the sample. + read - users that can view the sample. public_read - whether any + user can read the sample, regardless of permissions.) 
-> + structure: parameter "owner" of type "user" (A user's username.), + parameter "admin" of list of type "user" (A user's username.), + parameter "write" of list of type "user" (A user's username.), + parameter "read" of list of type "user" (A user's username.), + parameter "public_read" of type "boolean" (A boolean value, 0 for + false, 1 for true.) + """ + return self._client.call_method('SampleService.get_sample_acls', + [params], self._service_ver, context) + + def update_sample_acls(self, params, context=None): + """ + Update a sample's ACLs. + :param params: instance of type "UpdateSampleACLsParams" + (update_sample_acls parameters. id - the ID of the sample to + modify. admin - a list of users that will receive admin + privileges. Default none. write - a list of users that will + receive write privileges. Default none. read - a list of users + that will receive read privileges. Default none. remove - a list + of users that will have all privileges removed. Default none. + public_read - an integer that determines whether the sample will + be set to publicly readable: > 0: public read. 0: No change (the + default). < 0: private. at_least - false, the default, indicates + that the users should get the exact permissions as specified in + the user lists, which may mean a reduction in permissions. If + true, users that already exist in the sample ACLs will not have + their permissions reduced as part of the ACL update unless they + are in the remove list. E.g. if a user has write permissions and + read permissions are specified in the update, no changes will be + made to the user's permission. as_admin - update the sample acls + regardless of sample ACL contents as long as the user has full + service administration permissions.) -> structure: parameter "id" + of type "sample_id" (A Sample ID. Must be globally unique. 
Always + assigned by the Sample service.), parameter "admin" of list of + type "user" (A user's username.), parameter "write" of list of + type "user" (A user's username.), parameter "read" of list of type + "user" (A user's username.), parameter "remove" of list of type + "user" (A user's username.), parameter "public_read" of Long, + parameter "at_least" of type "boolean" (A boolean value, 0 for + false, 1 for true.), parameter "as_admin" of type "boolean" (A + boolean value, 0 for false, 1 for true.) + """ + return self._client.call_method('SampleService.update_sample_acls', + [params], self._service_ver, context) + + def update_samples_acls(self, params, context=None): + """ + Update the ACLs of many samples. + :param params: instance of type "UpdateSamplesACLsParams" + (update_samples_acls parameters. These parameters are the same as + update_sample_acls, except: ids - a list of IDs of samples to + modify.) -> structure: parameter "ids" of list of type "sample_id" + (A Sample ID. Must be globally unique. Always assigned by the + Sample service.), parameter "admin" of list of type "user" (A + user's username.), parameter "write" of list of type "user" (A + user's username.), parameter "read" of list of type "user" (A + user's username.), parameter "remove" of list of type "user" (A + user's username.), parameter "public_read" of Long, parameter + "at_least" of type "boolean" (A boolean value, 0 for false, 1 for + true.), parameter "as_admin" of type "boolean" (A boolean value, 0 + for false, 1 for true.) + """ + return self._client.call_method('SampleService.update_samples_acls', + [params], self._service_ver, context) + + def replace_sample_acls(self, params, context=None): + """ + Completely overwrite a sample's ACLs. Any current ACLs are replaced by the provided + ACLs, even if empty, and gone forever. + The sample owner cannot be changed via this method. + :param params: instance of type "ReplaceSampleACLsParams" + (replace_sample_acls parameters. 
id - the ID of the sample to + modify. acls - the ACLs to set on the sample. as_admin - replace + the sample acls regardless of sample ACL contents as long as the + user has full service administration permissions.) -> structure: + parameter "id" of type "sample_id" (A Sample ID. Must be globally + unique. Always assigned by the Sample service.), parameter "acls" + of type "SampleACLs" (Access control lists for a sample. Access + levels include the privileges of the lower access levels. owner - + the user that created and owns the sample. admin - users that can + administrate (e.g. alter ACLs) the sample. write - users that can + write (e.g. create a new version) to the sample. read - users that + can view the sample. public_read - whether any user can read the + sample, regardless of permissions.) -> structure: parameter + "owner" of type "user" (A user's username.), parameter "admin" of + list of type "user" (A user's username.), parameter "write" of + list of type "user" (A user's username.), parameter "read" of list + of type "user" (A user's username.), parameter "public_read" of + type "boolean" (A boolean value, 0 for false, 1 for true.), + parameter "as_admin" of type "boolean" (A boolean value, 0 for + false, 1 for true.) + """ + return self._client.call_method('SampleService.replace_sample_acls', + [params], self._service_ver, context) + + def get_metadata_key_static_metadata(self, params, context=None): + """ + Get static metadata for one or more metadata keys. + The static metadata for a metadata key is metadata *about* the key - e.g. it may + define the key's semantics or denote that the key is linked to an ontological ID. + The static metadata does not change without the service being restarted. Client caching is + recommended to improve performance. + :param params: instance of type "GetMetadataKeyStaticMetadataParams" + (get_metadata_key_static_metadata parameters. keys - the list of + metadata keys to interrogate. 
prefix - 0 (the default) to + interrogate standard metadata keys. 1 to interrogate prefix + metadata keys, but require an exact match to the prefix key. 2 to + interrogate prefix metadata keys, but any keys which are a prefix + of the provided keys will be included in the results.) -> + structure: parameter "keys" of list of type "metadata_key" (A key + in a metadata key/value pair. Less than 1000 unicode characters.), + parameter "prefix" of Long + :returns: instance of type "GetMetadataKeyStaticMetadataResults" + (get_metadata_key_static_metadata results. static_metadata - the + static metadata for the requested keys.) -> structure: parameter + "static_metadata" of type "metadata" (Metadata attached to a + sample.) -> mapping from type "metadata_key" (A key in a metadata + key/value pair. Less than 1000 unicode characters.) to type + "metadata_value" (A metadata value, represented by a mapping of + value keys to primitive values. An example for a location metadata + key might be: { "name": "Castle Geyser", "lat": 44.463816, "long": + -110.836471 } "primitive values" means an int, float, string, or + equivalent typedefs. Including any collection types is an error.) + -> mapping from type "metadata_value_key" (A key for a value + associated with a piece of metadata. Less than 1000 unicode + characters. Examples: units, value, species) to unspecified object + """ + return self._client.call_method('SampleService.get_metadata_key_static_metadata', + [params], self._service_ver, context) + + def create_data_link(self, params, context=None): + """ + Create a link from a KBase Workspace object to a sample. + The user must have admin permissions for the sample and write permissions for the + Workspace object. + :param params: instance of type "CreateDataLinkParams" + (create_data_link parameters. upa - the workspace UPA of the + object to be linked. dataid - the dataid of the data to be linked, + if any, within the object. 
If omitted the entire object is linked + to the sample. id - the sample id. version - the sample version. + node - the sample node. update - if false (the default), fail if a + link already exists from the data unit (the combination of the UPA + and dataid). if true, expire the old link and create the new link + unless the link is already to the requested sample node, in which + case the operation is a no-op. as_admin - run the method as a + service administrator. The user must have full administration + permissions. as_user - create the link as a different user. + Ignored if as_admin is not true. Neither the administrator nor the + impersonated user need have permissions to the data or sample.) -> + structure: parameter "upa" of type "ws_upa" (A KBase Workspace + service Unique Permanent Address (UPA). E.g. 5/6/7 where 5 is the + workspace ID, 6 the object ID, and 7 the object version.), + parameter "dataid" of type "data_id" (An id for a unit of data + within a KBase Workspace object. A single object may contain many + data units. A dataid is expected to be unique within a single + object. Must be less than 255 characters.), parameter "id" of type + "sample_id" (A Sample ID. Must be globally unique. Always assigned + by the Sample service.), parameter "version" of type "version" + (The version of a sample. Always > 0.), parameter "node" of type + "node_id" (A SampleNode ID. Must be unique within a Sample and be + less than 255 characters.), parameter "update" of type "boolean" + (A boolean value, 0 for false, 1 for true.), parameter "as_admin" + of type "boolean" (A boolean value, 0 for false, 1 for true.), + parameter "as_user" of type "user" (A user's username.) + :returns: instance of type "CreateDataLinkResults" (create_data_link + results. new_link - the new link.) -> structure: parameter + "new_link" of type "DataLink" (A data link from a KBase workspace + object to a sample. upa - the workspace UPA of the linked object. 
+ dataid - the dataid of the linked data, if any, within the object. + If omitted the entire object is linked to the sample. id - the + sample id. version - the sample version. node - the sample node. + createdby - the user that created the link. created - the time the + link was created. expiredby - the user that expired the link, if + any. expired - the time the link was expired, if at all.) -> + structure: parameter "linkid" of type "link_id" (A link ID. Must + be globally unique. Always assigned by the Sample service. + Typically only of use to service admins.), parameter "upa" of type + "ws_upa" (A KBase Workspace service Unique Permanent Address + (UPA). E.g. 5/6/7 where 5 is the workspace ID, 6 the object ID, + and 7 the object version.), parameter "dataid" of type "data_id" + (An id for a unit of data within a KBase Workspace object. A + single object may contain many data units. A dataid is expected to + be unique within a single object. Must be less than 255 + characters.), parameter "id" of type "sample_id" (A Sample ID. + Must be globally unique. Always assigned by the Sample service.), + parameter "version" of type "version" (The version of a sample. + Always > 0.), parameter "node" of type "node_id" (A SampleNode ID. + Must be unique within a Sample and be less than 255 characters.), + parameter "createdby" of type "user" (A user's username.), + parameter "created" of type "timestamp" (A timestamp in epoch + milliseconds.), parameter "expiredby" of type "user" (A user's + username.), parameter "expired" of type "timestamp" (A timestamp + in epoch milliseconds.) + """ + return self._client.call_method('SampleService.create_data_link', + [params], self._service_ver, context) + + def propagate_data_links(self, params, context=None): + """ + Propagates data links from a previous sample to the current (latest) version + The user must have admin permissions for the sample and write permissions for the + Workspace object. 
+ :param params: instance of type "PropagateDataLinkParams" + (propagate_data_links parameters. id - the sample id. version - + the sample version. (data links are propagated to) + previous_version - the previouse sample version. (data links are + propagated from) ignore_types - the workspace data type ignored + from propagating. default empty. update - if false (the default), + fail if a link already exists from the data unit (the combination + of the UPA and dataid). if true, expire the old link and create + the new link unless the link is already to the requested sample + node, in which case the operation is a no-op. effective_time - the + effective time at which the query should be run - the default is + the current time. Providing a time allows for reproducibility of + previous results. as_admin - run the method as a service + administrator. The user must have full administration permissions. + as_user - create the link as a different user. Ignored if as_admin + is not true. Neither the administrator nor the impersonated user + need have permissions to the data or sample.) -> structure: + parameter "id" of type "sample_id" (A Sample ID. Must be globally + unique. Always assigned by the Sample service.), parameter + "version" of type "version" (The version of a sample. Always > + 0.), parameter "previous_version" of type "version" (The version + of a sample. Always > 0.), parameter "ignore_types" of list of + type "ws_type_string" (A workspace type string. Specifies the + workspace data type a single string in the format + [module].[typename]: module - a string. The module name of the + typespec containing the type. typename - a string. The name of the + type as assigned by the typedef statement. 
Example: + KBaseSets.SampleSet), parameter "update" of type "boolean" (A + boolean value, 0 for false, 1 for true.), parameter + "effective_time" of type "timestamp" (A timestamp in epoch + milliseconds.), parameter "as_admin" of type "boolean" (A boolean + value, 0 for false, 1 for true.), parameter "as_user" of type + "user" (A user's username.) + :returns: instance of type "PropagateDataLinkResults" + (propagate_data_links results. links - the links.) -> structure: + parameter "links" of list of type "DataLink" (A data link from a + KBase workspace object to a sample. upa - the workspace UPA of the + linked object. dataid - the dataid of the linked data, if any, + within the object. If omitted the entire object is linked to the + sample. id - the sample id. version - the sample version. node - + the sample node. createdby - the user that created the link. + created - the time the link was created. expiredby - the user that + expired the link, if any. expired - the time the link was expired, + if at all.) -> structure: parameter "linkid" of type "link_id" (A + link ID. Must be globally unique. Always assigned by the Sample + service. Typically only of use to service admins.), parameter + "upa" of type "ws_upa" (A KBase Workspace service Unique Permanent + Address (UPA). E.g. 5/6/7 where 5 is the workspace ID, 6 the + object ID, and 7 the object version.), parameter "dataid" of type + "data_id" (An id for a unit of data within a KBase Workspace + object. A single object may contain many data units. A dataid is + expected to be unique within a single object. Must be less than + 255 characters.), parameter "id" of type "sample_id" (A Sample ID. + Must be globally unique. Always assigned by the Sample service.), + parameter "version" of type "version" (The version of a sample. + Always > 0.), parameter "node" of type "node_id" (A SampleNode ID. 
+ Must be unique within a Sample and be less than 255 characters.), + parameter "createdby" of type "user" (A user's username.), + parameter "created" of type "timestamp" (A timestamp in epoch + milliseconds.), parameter "expiredby" of type "user" (A user's + username.), parameter "expired" of type "timestamp" (A timestamp + in epoch milliseconds.) + """ + return self._client.call_method('SampleService.propagate_data_links', + [params], self._service_ver, context) + + def expire_data_link(self, params, context=None): + """ + Expire a link from a KBase Workspace object. + The user must have admin permissions for the sample and write permissions for the + Workspace object. + :param params: instance of type "ExpireDataLinkParams" + (expire_data_link parameters. upa - the workspace upa of the + object from which the link originates. dataid - the dataid, if + any, of the data within the object from which the link originates. + Omit for links where the link is from the entire object. as_admin + - run the method as a service administrator. The user must have + full administration permissions. as_user - expire the link as a + different user. Ignored if as_admin is not true. Neither the + administrator nor the impersonated user need have permissions to + the link if a new version is saved.) -> structure: parameter "upa" + of type "ws_upa" (A KBase Workspace service Unique Permanent + Address (UPA). E.g. 5/6/7 where 5 is the workspace ID, 6 the + object ID, and 7 the object version.), parameter "dataid" of type + "data_id" (An id for a unit of data within a KBase Workspace + object. A single object may contain many data units. A dataid is + expected to be unique within a single object. Must be less than + 255 characters.), parameter "as_admin" of type "boolean" (A + boolean value, 0 for false, 1 for true.), parameter "as_user" of + type "user" (A user's username.) 
+ """ + return self._client.call_method('SampleService.expire_data_link', + [params], self._service_ver, context) + + def get_data_links_from_sample(self, params, context=None): + """ + Get data links to Workspace objects originating from a sample. + The user must have read permissions to the sample. Only Workspace objects the user + can read are returned. + :param params: instance of type "GetDataLinksFromSampleParams" + (get_data_links_from_sample parameters. id - the sample ID. + version - the sample version. effective_time - the effective time + at which the query should be run - the default is the current + time. Providing a time allows for reproducibility of previous + results. as_admin - run the method as a service administrator. The + user must have read administration permissions.) -> structure: + parameter "id" of type "sample_id" (A Sample ID. Must be globally + unique. Always assigned by the Sample service.), parameter + "version" of type "version" (The version of a sample. Always > + 0.), parameter "effective_time" of type "timestamp" (A timestamp + in epoch milliseconds.), parameter "as_admin" of type "boolean" (A + boolean value, 0 for false, 1 for true.) + :returns: instance of type "GetDataLinksFromSampleResults" + (get_data_links_from_sample results. links - the links. + effective_time - the time at which the query was run. This + timestamp, if saved, can be used when running the method again to + ensure reproducible results. Note that changes to workspace + permissions may cause results to change over time.) -> structure: + parameter "links" of list of type "DataLink" (A data link from a + KBase workspace object to a sample. upa - the workspace UPA of the + linked object. dataid - the dataid of the linked data, if any, + within the object. If omitted the entire object is linked to the + sample. id - the sample id. version - the sample version. node - + the sample node. createdby - the user that created the link. 
+ created - the time the link was created. expiredby - the user that + expired the link, if any. expired - the time the link was expired, + if at all.) -> structure: parameter "linkid" of type "link_id" (A + link ID. Must be globally unique. Always assigned by the Sample + service. Typically only of use to service admins.), parameter + "upa" of type "ws_upa" (A KBase Workspace service Unique Permanent + Address (UPA). E.g. 5/6/7 where 5 is the workspace ID, 6 the + object ID, and 7 the object version.), parameter "dataid" of type + "data_id" (An id for a unit of data within a KBase Workspace + object. A single object may contain many data units. A dataid is + expected to be unique within a single object. Must be less than + 255 characters.), parameter "id" of type "sample_id" (A Sample ID. + Must be globally unique. Always assigned by the Sample service.), + parameter "version" of type "version" (The version of a sample. + Always > 0.), parameter "node" of type "node_id" (A SampleNode ID. + Must be unique within a Sample and be less than 255 characters.), + parameter "createdby" of type "user" (A user's username.), + parameter "created" of type "timestamp" (A timestamp in epoch + milliseconds.), parameter "expiredby" of type "user" (A user's + username.), parameter "expired" of type "timestamp" (A timestamp + in epoch milliseconds.), parameter "effective_time" of type + "timestamp" (A timestamp in epoch milliseconds.) + """ + return self._client.call_method('SampleService.get_data_links_from_sample', + [params], self._service_ver, context) + + def get_data_links_from_sample_set(self, params, context=None): + """ + Get all workspace object metadata linked to samples in a list of samples or sample set + refs. Returns metadata about links to data objects. A batch version of + get_data_links_from_sample. + The user must have read permissions to the sample. A permissions error is thrown when a + sample is found that the user has no access to. 
+ :param params: instance of type "GetDataLinksFromSampleSetParams" + (get_data_links_from_sample_set parameters. sample_ids - a list of + sample ids and versions effective_time - the time at which the + query was run. This timestamp, if saved, can be used when running + the method again to enqure reproducible results. Note that changes + to workspace permissions may cause results to change over time. + as_admin - run the method as a service administrator. The user + must have read administration permissions.) -> structure: + parameter "sample_ids" of list of type "SampleIdentifier" -> + structure: parameter "id" of type "sample_id" (A Sample ID. Must + be globally unique. Always assigned by the Sample service.), + parameter "version" of type "version" (The version of a sample. + Always > 0.), parameter "effective_time" of type "timestamp" (A + timestamp in epoch milliseconds.), parameter "as_admin" of type + "boolean" (A boolean value, 0 for false, 1 for true.) + :returns: instance of type "GetDataLinksFromSampleResults" + (get_data_links_from_sample results. links - the links. + effective_time - the time at which the query was run. This + timestamp, if saved, can be used when running the method again to + ensure reproducible results. Note that changes to workspace + permissions may cause results to change over time.) -> structure: + parameter "links" of list of type "DataLink" (A data link from a + KBase workspace object to a sample. upa - the workspace UPA of the + linked object. dataid - the dataid of the linked data, if any, + within the object. If omitted the entire object is linked to the + sample. id - the sample id. version - the sample version. node - + the sample node. createdby - the user that created the link. + created - the time the link was created. expiredby - the user that + expired the link, if any. expired - the time the link was expired, + if at all.) -> structure: parameter "linkid" of type "link_id" (A + link ID. Must be globally unique. 
Always assigned by the Sample + service. Typically only of use to service admins.), parameter + "upa" of type "ws_upa" (A KBase Workspace service Unique Permanent + Address (UPA). E.g. 5/6/7 where 5 is the workspace ID, 6 the + object ID, and 7 the object version.), parameter "dataid" of type + "data_id" (An id for a unit of data within a KBase Workspace + object. A single object may contain many data units. A dataid is + expected to be unique within a single object. Must be less than + 255 characters.), parameter "id" of type "sample_id" (A Sample ID. + Must be globally unique. Always assigned by the Sample service.), + parameter "version" of type "version" (The version of a sample. + Always > 0.), parameter "node" of type "node_id" (A SampleNode ID. + Must be unique within a Sample and be less than 255 characters.), + parameter "createdby" of type "user" (A user's username.), + parameter "created" of type "timestamp" (A timestamp in epoch + milliseconds.), parameter "expiredby" of type "user" (A user's + username.), parameter "expired" of type "timestamp" (A timestamp + in epoch milliseconds.), parameter "effective_time" of type + "timestamp" (A timestamp in epoch milliseconds.) + """ + return self._client.call_method('SampleService.get_data_links_from_sample_set', + [params], self._service_ver, context) + + def get_data_links_from_data(self, params, context=None): + """ + Get data links to samples originating from Workspace data. + The user must have read permissions to the workspace data. + :param params: instance of type "GetDataLinksFromDataParams" + (get_data_links_from_data parameters. upa - the data UPA. + effective_time - the effective time at which the query should be + run - the default is the current time. Providing a time allows for + reproducibility of previous results. as_admin - run the method as + a service administrator. The user must have read administration + permissions.) 
-> structure: parameter "upa" of type "ws_upa" (A + KBase Workspace service Unique Permanent Address (UPA). E.g. 5/6/7 + where 5 is the workspace ID, 6 the object ID, and 7 the object + version.), parameter "effective_time" of type "timestamp" (A + timestamp in epoch milliseconds.), parameter "as_admin" of type + "boolean" (A boolean value, 0 for false, 1 for true.) + :returns: instance of type "GetDataLinksFromDataResults" + (get_data_links_from_data results. links - the links. + effective_time - the time at which the query was run. This + timestamp, if saved, can be used when running the method again to + ensure reproducible results.) -> structure: parameter "links" of + list of type "DataLink" (A data link from a KBase workspace object + to a sample. upa - the workspace UPA of the linked object. dataid + - the dataid of the linked data, if any, within the object. If + omitted the entire object is linked to the sample. id - the sample + id. version - the sample version. node - the sample node. + createdby - the user that created the link. created - the time the + link was created. expiredby - the user that expired the link, if + any. expired - the time the link was expired, if at all.) -> + structure: parameter "linkid" of type "link_id" (A link ID. Must + be globally unique. Always assigned by the Sample service. + Typically only of use to service admins.), parameter "upa" of type + "ws_upa" (A KBase Workspace service Unique Permanent Address + (UPA). E.g. 5/6/7 where 5 is the workspace ID, 6 the object ID, + and 7 the object version.), parameter "dataid" of type "data_id" + (An id for a unit of data within a KBase Workspace object. A + single object may contain many data units. A dataid is expected to + be unique within a single object. Must be less than 255 + characters.), parameter "id" of type "sample_id" (A Sample ID. + Must be globally unique. Always assigned by the Sample service.), + parameter "version" of type "version" (The version of a sample. 
+ Always > 0.), parameter "node" of type "node_id" (A SampleNode ID. + Must be unique within a Sample and be less than 255 characters.), + parameter "createdby" of type "user" (A user's username.), + parameter "created" of type "timestamp" (A timestamp in epoch + milliseconds.), parameter "expiredby" of type "user" (A user's + username.), parameter "expired" of type "timestamp" (A timestamp + in epoch milliseconds.), parameter "effective_time" of type + "timestamp" (A timestamp in epoch milliseconds.) + """ + return self._client.call_method('SampleService.get_data_links_from_data', + [params], self._service_ver, context) + + def get_sample_via_data(self, params, context=None): + """ + Get a sample via a workspace object. Read permissions to a workspace object grants + read permissions to all versions of any linked samples, whether the links are expired or + not. This method allows for fetching samples when the user does not have explicit + read access to the sample. + :param params: instance of type "GetSampleViaDataParams" + (get_sample_via_data parameters. upa - the workspace UPA of the + target object. id - the target sample id. version - the target + sample version.) -> structure: parameter "upa" of type "ws_upa" (A + KBase Workspace service Unique Permanent Address (UPA). E.g. 5/6/7 + where 5 is the workspace ID, 6 the object ID, and 7 the object + version.), parameter "id" of type "sample_id" (A Sample ID. Must + be globally unique. Always assigned by the Sample service.), + parameter "version" of type "version" (The version of a sample. + Always > 0.) + :returns: instance of type "Sample" (A Sample, consisting of a tree + of subsamples and replicates. id - the ID of the sample. user - + the user that saved the sample. node_tree - the tree(s) of sample + nodes in the sample. The the roots of all trees must be + BioReplicate nodes. All the BioReplicate nodes must be at the + start of the list, and all child nodes must occur after their + parents in the list. 
name - the name of the sample. Must be less + than 255 characters. save_date - the date the sample version was + saved. version - the version of the sample.) -> structure: + parameter "id" of type "sample_id" (A Sample ID. Must be globally + unique. Always assigned by the Sample service.), parameter "user" + of type "user" (A user's username.), parameter "node_tree" of list + of type "SampleNode" (A node in a sample tree. id - the ID of the + node. parent - the id of the parent node for the current node. + BioReplicate nodes, and only BioReplicate nodes, do not have a + parent. type - the type of the node. meta_controlled - metadata + restricted by the sample controlled vocabulary and validators. + source_meta - the pre-transformation keys and values of the + controlled metadata at the data source for controlled metadata + keys. In some cases the source metadata may be transformed prior + to ingestion by the Sample Service; the contents of this data + structure allows for reconstructing the original representation. + The metadata here is not validated other than basic size checks + and is provided on an informational basis only. The metadata keys + in the SourceMetadata data structure must be a subset of the + meta_controlled mapping keys. meta_user - unrestricted metadata.) + -> structure: parameter "id" of type "node_id" (A SampleNode ID. + Must be unique within a Sample and be less than 255 characters.), + parameter "parent" of type "node_id" (A SampleNode ID. Must be + unique within a Sample and be less than 255 characters.), + parameter "type" of type "samplenode_type" (The type of a sample + node. One of: BioReplicate - a biological replicate. Always at the + top of the sample tree. TechReplicate - a technical replicate. + SubSample - a sub sample that is not a technical replicate.), + parameter "meta_controlled" of type "metadata" (Metadata attached + to a sample.) -> mapping from type "metadata_key" (A key in a + metadata key/value pair. 
Less than 1000 unicode characters.) to + type "metadata_value" (A metadata value, represented by a mapping + of value keys to primitive values. An example for a location + metadata key might be: { "name": "Castle Geyser", "lat": + 44.463816, "long": -110.836471 } "primitive values" means an int, + float, string, or equivalent typedefs. Including any collection + types is an error.) -> mapping from type "metadata_value_key" (A + key for a value associated with a piece of metadata. Less than + 1000 unicode characters. Examples: units, value, species) to + unspecified object, parameter "source_meta" of list of type + "SourceMetadata" (Information about a metadata key as it appeared + at the data source. The source key and value represents the + original state of the metadata before it was tranformed for + ingestion by the sample service. key - the metadata key. skey - + the key as it appeared at the data source. svalue - the value as + it appeared at the data source.) -> structure: parameter "key" of + type "metadata_key" (A key in a metadata key/value pair. Less than + 1000 unicode characters.), parameter "skey" of type "metadata_key" + (A key in a metadata key/value pair. Less than 1000 unicode + characters.), parameter "svalue" of type "metadata_value" (A + metadata value, represented by a mapping of value keys to + primitive values. An example for a location metadata key might be: + { "name": "Castle Geyser", "lat": 44.463816, "long": -110.836471 } + "primitive values" means an int, float, string, or equivalent + typedefs. Including any collection types is an error.) -> mapping + from type "metadata_value_key" (A key for a value associated with + a piece of metadata. Less than 1000 unicode characters. Examples: + units, value, species) to unspecified object, parameter + "meta_user" of type "metadata" (Metadata attached to a sample.) -> + mapping from type "metadata_key" (A key in a metadata key/value + pair. Less than 1000 unicode characters.) 
to type "metadata_value" + (A metadata value, represented by a mapping of value keys to + primitive values. An example for a location metadata key might be: + { "name": "Castle Geyser", "lat": 44.463816, "long": -110.836471 } + "primitive values" means an int, float, string, or equivalent + typedefs. Including any collection types is an error.) -> mapping + from type "metadata_value_key" (A key for a value associated with + a piece of metadata. Less than 1000 unicode characters. Examples: + units, value, species) to unspecified object, parameter "name" of + type "sample_name" (A sample name. Must be less than 255 + characters.), parameter "save_date" of type "timestamp" (A + timestamp in epoch milliseconds.), parameter "version" of type + "version" (The version of a sample. Always > 0.) + """ + return self._client.call_method('SampleService.get_sample_via_data', + [params], self._service_ver, context) + + def get_data_link(self, params, context=None): + """ + Get a link, expired or not, by its ID. This method requires read administration privileges + for the service. + :param params: instance of type "GetDataLinkParams" (get_data_link + parameters. linkid - the link ID.) -> structure: parameter + "linkid" of type "link_id" (A link ID. Must be globally unique. + Always assigned by the Sample service. Typically only of use to + service admins.) + :returns: instance of type "DataLink" (A data link from a KBase + workspace object to a sample. upa - the workspace UPA of the + linked object. dataid - the dataid of the linked data, if any, + within the object. If omitted the entire object is linked to the + sample. id - the sample id. version - the sample version. node - + the sample node. createdby - the user that created the link. + created - the time the link was created. expiredby - the user that + expired the link, if any. expired - the time the link was expired, + if at all.) -> structure: parameter "linkid" of type "link_id" (A + link ID. Must be globally unique. 
Always assigned by the Sample + service. Typically only of use to service admins.), parameter + "upa" of type "ws_upa" (A KBase Workspace service Unique Permanent + Address (UPA). E.g. 5/6/7 where 5 is the workspace ID, 6 the + object ID, and 7 the object version.), parameter "dataid" of type + "data_id" (An id for a unit of data within a KBase Workspace + object. A single object may contain many data units. A dataid is + expected to be unique within a single object. Must be less than + 255 characters.), parameter "id" of type "sample_id" (A Sample ID. + Must be globally unique. Always assigned by the Sample service.), + parameter "version" of type "version" (The version of a sample. + Always > 0.), parameter "node" of type "node_id" (A SampleNode ID. + Must be unique within a Sample and be less than 255 characters.), + parameter "createdby" of type "user" (A user's username.), + parameter "created" of type "timestamp" (A timestamp in epoch + milliseconds.), parameter "expiredby" of type "user" (A user's + username.), parameter "expired" of type "timestamp" (A timestamp + in epoch milliseconds.) + """ + return self._client.call_method('SampleService.get_data_link', + [params], self._service_ver, context) + + def validate_samples(self, params, context=None): + """ + :param params: instance of type "ValidateSamplesParams" (Provide + sample and run through the validation steps, but without saving + them. Allows all the samples to be evaluated for validity first so + potential errors can be addressed.) -> structure: parameter + "samples" of list of type "Sample" (A Sample, consisting of a tree + of subsamples and replicates. id - the ID of the sample. user - + the user that saved the sample. node_tree - the tree(s) of sample + nodes in the sample. The the roots of all trees must be + BioReplicate nodes. All the BioReplicate nodes must be at the + start of the list, and all child nodes must occur after their + parents in the list. name - the name of the sample. 
Must be less + than 255 characters. save_date - the date the sample version was + saved. version - the version of the sample.) -> structure: + parameter "id" of type "sample_id" (A Sample ID. Must be globally + unique. Always assigned by the Sample service.), parameter "user" + of type "user" (A user's username.), parameter "node_tree" of list + of type "SampleNode" (A node in a sample tree. id - the ID of the + node. parent - the id of the parent node for the current node. + BioReplicate nodes, and only BioReplicate nodes, do not have a + parent. type - the type of the node. meta_controlled - metadata + restricted by the sample controlled vocabulary and validators. + source_meta - the pre-transformation keys and values of the + controlled metadata at the data source for controlled metadata + keys. In some cases the source metadata may be transformed prior + to ingestion by the Sample Service; the contents of this data + structure allows for reconstructing the original representation. + The metadata here is not validated other than basic size checks + and is provided on an informational basis only. The metadata keys + in the SourceMetadata data structure must be a subset of the + meta_controlled mapping keys. meta_user - unrestricted metadata.) + -> structure: parameter "id" of type "node_id" (A SampleNode ID. + Must be unique within a Sample and be less than 255 characters.), + parameter "parent" of type "node_id" (A SampleNode ID. Must be + unique within a Sample and be less than 255 characters.), + parameter "type" of type "samplenode_type" (The type of a sample + node. One of: BioReplicate - a biological replicate. Always at the + top of the sample tree. TechReplicate - a technical replicate. + SubSample - a sub sample that is not a technical replicate.), + parameter "meta_controlled" of type "metadata" (Metadata attached + to a sample.) -> mapping from type "metadata_key" (A key in a + metadata key/value pair. Less than 1000 unicode characters.) 
to + type "metadata_value" (A metadata value, represented by a mapping + of value keys to primitive values. An example for a location + metadata key might be: { "name": "Castle Geyser", "lat": + 44.463816, "long": -110.836471 } "primitive values" means an int, + float, string, or equivalent typedefs. Including any collection + types is an error.) -> mapping from type "metadata_value_key" (A + key for a value associated with a piece of metadata. Less than + 1000 unicode characters. Examples: units, value, species) to + unspecified object, parameter "source_meta" of list of type + "SourceMetadata" (Information about a metadata key as it appeared + at the data source. The source key and value represents the + original state of the metadata before it was tranformed for + ingestion by the sample service. key - the metadata key. skey - + the key as it appeared at the data source. svalue - the value as + it appeared at the data source.) -> structure: parameter "key" of + type "metadata_key" (A key in a metadata key/value pair. Less than + 1000 unicode characters.), parameter "skey" of type "metadata_key" + (A key in a metadata key/value pair. Less than 1000 unicode + characters.), parameter "svalue" of type "metadata_value" (A + metadata value, represented by a mapping of value keys to + primitive values. An example for a location metadata key might be: + { "name": "Castle Geyser", "lat": 44.463816, "long": -110.836471 } + "primitive values" means an int, float, string, or equivalent + typedefs. Including any collection types is an error.) -> mapping + from type "metadata_value_key" (A key for a value associated with + a piece of metadata. Less than 1000 unicode characters. Examples: + units, value, species) to unspecified object, parameter + "meta_user" of type "metadata" (Metadata attached to a sample.) -> + mapping from type "metadata_key" (A key in a metadata key/value + pair. Less than 1000 unicode characters.) 
to type "metadata_value" + (A metadata value, represented by a mapping of value keys to + primitive values. An example for a location metadata key might be: + { "name": "Castle Geyser", "lat": 44.463816, "long": -110.836471 } + "primitive values" means an int, float, string, or equivalent + typedefs. Including any collection types is an error.) -> mapping + from type "metadata_value_key" (A key for a value associated with + a piece of metadata. Less than 1000 unicode characters. Examples: + units, value, species) to unspecified object, parameter "name" of + type "sample_name" (A sample name. Must be less than 255 + characters.), parameter "save_date" of type "timestamp" (A + timestamp in epoch milliseconds.), parameter "version" of type + "version" (The version of a sample. Always > 0.) + :returns: instance of type "ValidateSamplesResults" -> structure: + parameter "errors" of list of type "ValidateSamplesError" -> + structure: parameter "message" of String, parameter "dev_message" + of String, parameter "sample_name" of type "sample_name" (A sample + name. Must be less than 255 characters.), parameter "node" of type + "node_id" (A SampleNode ID. Must be unique within a Sample and be + less than 255 characters.), parameter "key" of type "metadata_key" + (A key in a metadata key/value pair. 
Less than 1000 unicode + characters.), parameter "subkey" of String + """ + return self._client.call_method('SampleService.validate_samples', + [params], self._service_ver, context) + + def status(self, context=None): + return self._client.call_method('SampleService.status', + [], self._service_ver, context) \ No newline at end of file diff --git a/src/loaders/common/loader_common_names.py b/src/loaders/common/loader_common_names.py index e38945a80..05f110f67 100644 --- a/src/loaders/common/loader_common_names.py +++ b/src/loaders/common/loader_common_names.py @@ -83,4 +83,7 @@ MASH_METADATA = 'mash_run_metadata.json' # The fatal error file created if a data file cannot be successfully processed -FATAL_ERROR_FILE = "fatal_error.json" \ No newline at end of file +FATAL_ERROR_FILE = "fatal_error.json" + +# key name for sample file in the metadata file for downloaded workspace objects +SAMPLE_FILE_KEY = "sample_file" \ No newline at end of file diff --git a/src/loaders/workspace_downloader/workspace_downloader.py b/src/loaders/workspace_downloader/workspace_downloader.py index 04a21466a..1c62014e0 100644 --- a/src/loaders/workspace_downloader/workspace_downloader.py +++ b/src/loaders/workspace_downloader/workspace_downloader.py @@ -56,9 +56,9 @@ from pathlib import Path import docker -import requests from src.clients.AssemblyUtilClient import AssemblyUtil +from src.clients.SampleServiceClient import SampleService from src.clients.workspaceClient import Workspace from src.loaders.common import loader_common_names, loader_helper @@ -70,9 +70,6 @@ # filename that logs genome duplicates for each assembly GENOME_DUPLICATE_FILE = "duplicate_genomes.json" -# key name for sample file in metadata file -SAMPLE_FILE_KEY = "sample_file" - class Conf: def __init__( @@ -100,6 +97,7 @@ def __init__( self.ws = Workspace(ws_url, token=self.token) self.asu = AssemblyUtil(callback_url, token=self.token) + self.ss = SampleService(self.sample_url, token=self.token) self.queue = Queue() 
self.pth = output_dir self.job_dir = job_dir @@ -337,17 +335,15 @@ def _download_sample_data(conf, upa, metafile): sample_file_name = f"{upa}.sample" sample_file = os.path.join(upa_dir, sample_file_name) - if (SAMPLE_FILE_KEY in meta and - meta[SAMPLE_FILE_KEY] == sample_file_name and + if (loader_common_names.SAMPLE_FILE_KEY in meta and + meta[loader_common_names.SAMPLE_FILE_KEY] == sample_file_name and os.path.isfile(sample_file)): print(f"Skip downloading sample for {upa} as it already exists") return # retrieve data links associated with upa - links_ret = _post_sample_service(conf.token, - conf.sample_url, - "get_data_links_from_data", - {"upa": upa.replace("_", "/")}) + links_ret = conf.ss.get_data_links_from_data({"upa": upa.replace("_", "/")}) + data_links = links_ret['links'] if not data_links and conf.ignore_no_sample_error: print(f"No sample data links found for {upa}") @@ -359,42 +355,18 @@ def _download_sample_data(conf, upa, metafile): # retrieve sample data and save to file sample_id = data_links[0]['id'] - sample_ret = _post_sample_service(conf.token, - conf.sample_url, - "get_sample_via_data", - {"upa": upa.replace("_", "/"), - "id": sample_id, - "version": data_links[0]["version"]}) + sample_ret = conf.ss.get_sample_via_data({"upa": upa.replace("_", "/"), + "id": sample_id, + "version": data_links[0]["version"]}) with open(sample_file, "w", encoding="utf8") as json_file: json.dump(sample_ret, json_file, indent=2) - meta[SAMPLE_FILE_KEY] = sample_file_name + meta[loader_common_names.SAMPLE_FILE_KEY] = sample_file_name with open(metafile, "w", encoding="utf8") as json_file: json.dump(meta, json_file, indent=2) -def _post_sample_service(token, sample_url, method, params): - # Sends a POST request to the sample service API. 
- - headers = { - "Authorization": token, - "Content-Type": "application/json" - } - payload = { - "method": f"SampleService.{method}", - "id": str(uuid.uuid4()), - "params": [params] - } - resp = requests.post(url=sample_url, headers=headers, json=payload) - resp_json = resp.json() - if resp_json.get('error'): - raise RuntimeError(f"Error from SampleService - {resp_json['error']}") - result = resp_json['result'][0] - - return result - - def main(): parser = argparse.ArgumentParser( description="PROTOTYPE - Download genome files from the workspace service (WSS).", From f4953871c0c8969a878f09386151ce96870ba0b8 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Tue, 13 Jun 2023 14:45:45 -0500 Subject: [PATCH 05/17] add retrieve node tree logic --- src/loaders/common/loader_common_names.py | 2 +- .../workspace_downloader.py | 47 ++++++++++++++++++- 2 files changed, 46 insertions(+), 3 deletions(-) diff --git a/src/loaders/common/loader_common_names.py b/src/loaders/common/loader_common_names.py index 05f110f67..cb4440362 100644 --- a/src/loaders/common/loader_common_names.py +++ b/src/loaders/common/loader_common_names.py @@ -86,4 +86,4 @@ FATAL_ERROR_FILE = "fatal_error.json" # key name for sample file in the metadata file for downloaded workspace objects -SAMPLE_FILE_KEY = "sample_file" \ No newline at end of file +SAMPLE_FILE_KEY = "sample_file" diff --git a/src/loaders/workspace_downloader/workspace_downloader.py b/src/loaders/workspace_downloader/workspace_downloader.py index 1c62014e0..ab05bc613 100644 --- a/src/loaders/workspace_downloader/workspace_downloader.py +++ b/src/loaders/workspace_downloader/workspace_downloader.py @@ -54,6 +54,7 @@ from collections import defaultdict from multiprocessing import Pool, Queue, cpu_count from pathlib import Path +from typing import Any import docker @@ -71,6 +72,10 @@ GENOME_DUPLICATE_FILE = "duplicate_genomes.json" +class BadNodeTreeError(Exception): + pass + + class Conf: def __init__( self, @@ -279,7 +284,7 @@ def 
list_objects( return res_objs -def process_input(conf): +def process_input(conf: Conf): """ Download .fa and .meta files from workspace and save a copy under output_dir. """ @@ -325,8 +330,9 @@ def process_input(conf): _download_sample_data(conf, upa, metafile) -def _download_sample_data(conf, upa, metafile): +def _download_sample_data(conf: Conf, upa: str, metafile: str) -> None: # retrieve sample data from sample service and save to file + # additionally, retrieve node data from the sample data and save it to a file with open(metafile, "r", encoding="utf8") as json_file: meta = json.load(json_file) @@ -366,6 +372,43 @@ def _download_sample_data(conf, upa, metafile): with open(metafile, "w", encoding="utf8") as json_file: json.dump(meta, json_file, indent=2) + sample_node_name = f"{upa}.node_tree.sample" + sample_node_file = os.path.join(upa_dir, sample_node_name) + if os.path.isfile(sample_node_file): + print(f"Skip generating sample node tree for {upa} as it already exists") + return + + try: + node_data = _retrieve_node_data(sample_ret['node_tree']) + except BadNodeTreeError as e: + raise ValueError(f"Error retrieving node data for {upa}") from e + + with open(sample_node_file, "w", encoding="utf8") as json_file: + json.dump(node_data, json_file, indent=2) + + +def _retrieve_node_data( + node_tree: list[dict[str, Any]] +) -> dict[str, str | int | float]: + # retrieve the meta_controlled node data from node tree + + node_data = dict() + + if len(node_tree) != 1: + raise BadNodeTreeError(f"Expected 1 node in node tree, got {len(node_tree)}") + + sample_node = node_tree[0] + + meta_controlled = sample_node['meta_controlled'] + for key, meta_value in meta_controlled.items(): + + if 'value' not in meta_value: + raise BadNodeTreeError(f"Expected 'value' key in meta_value, got {meta_value}") + + node_data[key] = meta_value['value'] + + return node_data + def main(): parser = argparse.ArgumentParser( From 06fab4c8d0d59e239b4d8570c4a31d7bb7f3ce3c Mon Sep 17 00:00:00 
2001 From: Tianhao-Gu Date: Tue, 13 Jun 2023 14:52:00 -0500 Subject: [PATCH 06/17] move file extension to common names for parser --- src/loaders/common/loader_common_names.py | 4 ++++ src/loaders/workspace_downloader/workspace_downloader.py | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/loaders/common/loader_common_names.py b/src/loaders/common/loader_common_names.py index cb4440362..63bd10393 100644 --- a/src/loaders/common/loader_common_names.py +++ b/src/loaders/common/loader_common_names.py @@ -87,3 +87,7 @@ # key name for sample file in the metadata file for downloaded workspace objects SAMPLE_FILE_KEY = "sample_file" + +# extension for source sample data file and sample node data file for downloaded workspace objects +SAMPLE_FILE_EXT = "sample" +SAMPLE_NODE_FILE_EXT = "node_tree.sample" diff --git a/src/loaders/workspace_downloader/workspace_downloader.py b/src/loaders/workspace_downloader/workspace_downloader.py index ab05bc613..01fff1342 100644 --- a/src/loaders/workspace_downloader/workspace_downloader.py +++ b/src/loaders/workspace_downloader/workspace_downloader.py @@ -338,7 +338,7 @@ def _download_sample_data(conf: Conf, upa: str, metafile: str) -> None: meta = json.load(json_file) upa_dir = Path(metafile).parent - sample_file_name = f"{upa}.sample" + sample_file_name = f"{upa}.{loader_common_names.SAMPLE_FILE_EXT}" sample_file = os.path.join(upa_dir, sample_file_name) if (loader_common_names.SAMPLE_FILE_KEY in meta and @@ -372,7 +372,7 @@ def _download_sample_data(conf: Conf, upa: str, metafile: str) -> None: with open(metafile, "w", encoding="utf8") as json_file: json.dump(meta, json_file, indent=2) - sample_node_name = f"{upa}.node_tree.sample" + sample_node_name = f"{upa}.{loader_common_names.SAMPLE_NODE_FILE_EXT}" sample_node_file = os.path.join(upa_dir, sample_node_name) if os.path.isfile(sample_node_file): print(f"Skip generating sample node tree for {upa} as it already exists") From 
26b7330fb7134c65380427d804618b7b171c765a Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Wed, 14 Jun 2023 08:25:10 -0500 Subject: [PATCH 07/17] process long and lat data --- src/loaders/common/loader_common_names.py | 13 ++- .../workspace_downloader.py | 109 +++++++++++++----- 2 files changed, 91 insertions(+), 31 deletions(-) diff --git a/src/loaders/common/loader_common_names.py b/src/loaders/common/loader_common_names.py index 63bd10393..bc468591e 100644 --- a/src/loaders/common/loader_common_names.py +++ b/src/loaders/common/loader_common_names.py @@ -85,9 +85,16 @@ # The fatal error file created if a data file cannot be successfully processed FATAL_ERROR_FILE = "fatal_error.json" -# key name for sample file in the metadata file for downloaded workspace objects +# key name for sample file and prepared sample file in the metadata file for downloaded workspace objects SAMPLE_FILE_KEY = "sample_file" +SAMPLE_PREPARED_KEY = "sample_prepared_file" -# extension for source sample data file and sample node data file for downloaded workspace objects +# extension for source sample data file and prepared sample node data file for downloaded workspace objects SAMPLE_FILE_EXT = "sample" -SAMPLE_NODE_FILE_EXT = "node_tree.sample" +SAMPLE_PREPARED_EXT = "prepared.sample" + +# key name for latitude and longitude data from sample's meta_controlled +SAMPLE_LATITUDE = "latitude" +SAMPLE_LONGITUDE = "longitude" +# key name for created sample geo-spatial data in format of [longitude, latitude] +SAMPLE_GEO = 'geo_spatial' diff --git a/src/loaders/workspace_downloader/workspace_downloader.py b/src/loaders/workspace_downloader/workspace_downloader.py index 01fff1342..9c6509061 100644 --- a/src/loaders/workspace_downloader/workspace_downloader.py +++ b/src/loaders/workspace_downloader/workspace_downloader.py @@ -340,13 +340,53 @@ def _download_sample_data(conf: Conf, upa: str, metafile: str) -> None: upa_dir = Path(metafile).parent sample_file_name = 
f"{upa}.{loader_common_names.SAMPLE_FILE_EXT}" sample_file = os.path.join(upa_dir, sample_file_name) - - if (loader_common_names.SAMPLE_FILE_KEY in meta and - meta[loader_common_names.SAMPLE_FILE_KEY] == sample_file_name and - os.path.isfile(sample_file)): + if _check_file_exists(loader_common_names.SAMPLE_FILE_KEY, meta, sample_file): print(f"Skip downloading sample for {upa} as it already exists") return + sample_ret = _retrieve_sample(conf, upa) + if not sample_ret: + return + + with open(sample_file, "w", encoding="utf8") as json_file: + json.dump(sample_ret, json_file, indent=2) + + sample_prepared_name = f"{upa}.{loader_common_names.SAMPLE_PREPARED_EXT}" + sample_prepared_file = os.path.join(upa_dir, sample_prepared_name) + if _check_file_exists(loader_common_names.SAMPLE_PREPARED_KEY, meta, sample_prepared_file): + print(f"Skip generating sample node tree for {upa} as it already exists") + return + + try: + node_data = _retrieve_node_data(sample_ret['node_tree']) + except BadNodeTreeError as e: + raise ValueError(f"Error retrieving node data for {upa}") from e + + with open(sample_prepared_file, "w", encoding="utf8") as json_file: + json.dump(node_data, json_file, indent=2) + + # write sample file and prepared sample node file name back to the meta file + meta[loader_common_names.SAMPLE_FILE_KEY] = sample_file_name + meta[loader_common_names.SAMPLE_PREPARED_KEY] = sample_prepared_name + with open(metafile, "w", encoding="utf8") as json_file: + json.dump(meta, json_file, indent=2) + + +def _check_file_exists( + file_key_name: str, + metadata: dict[str, Any], + file_path: str +) -> bool: + # check if file exists and if the metadata matches the file name + + return (file_key_name in metadata and + metadata[file_key_name] == Path(file_path).name and + os.path.isfile(file_path)) + + +def _retrieve_sample(conf: Conf, upa: str) -> dict[str, Any] | None: + # retrieve sample data from sample service + # retrieve data links associated with upa links_ret = 
conf.ss.get_data_links_from_data({"upa": upa.replace("_", "/")}) @@ -364,27 +404,10 @@ def _download_sample_data(conf: Conf, upa: str, metafile: str) -> None: sample_ret = conf.ss.get_sample_via_data({"upa": upa.replace("_", "/"), "id": sample_id, "version": data_links[0]["version"]}) + if not sample_ret: + raise ValueError(f"Retrieved empty sample data for {upa}") - with open(sample_file, "w", encoding="utf8") as json_file: - json.dump(sample_ret, json_file, indent=2) - - meta[loader_common_names.SAMPLE_FILE_KEY] = sample_file_name - with open(metafile, "w", encoding="utf8") as json_file: - json.dump(meta, json_file, indent=2) - - sample_node_name = f"{upa}.{loader_common_names.SAMPLE_NODE_FILE_EXT}" - sample_node_file = os.path.join(upa_dir, sample_node_name) - if os.path.isfile(sample_node_file): - print(f"Skip generating sample node tree for {upa} as it already exists") - return - - try: - node_data = _retrieve_node_data(sample_ret['node_tree']) - except BadNodeTreeError as e: - raise ValueError(f"Error retrieving node data for {upa}") from e - - with open(sample_node_file, "w", encoding="utf8") as json_file: - json.dump(node_data, json_file, indent=2) + return sample_ret def _retrieve_node_data( @@ -400,16 +423,46 @@ def _retrieve_node_data( sample_node = node_tree[0] meta_controlled = sample_node['meta_controlled'] + _check_dict_contains(meta_controlled, [loader_common_names.SAMPLE_LATITUDE, loader_common_names.SAMPLE_LONGITUDE]) for key, meta_value in meta_controlled.items(): - - if 'value' not in meta_value: - raise BadNodeTreeError(f"Expected 'value' key in meta_value, got {meta_value}") - + _validate_node_data(key, meta_value) node_data[key] = meta_value['value'] + # create and add geo-spatial data in the format of [latitude, longitude] + node_data[loader_common_names.SAMPLE_GEO] = [meta_controlled[loader_common_names.SAMPLE_LATITUDE]['value'], + meta_controlled[loader_common_names.SAMPLE_LONGITUDE]['value']] + return node_data +def 
_validate_node_data(key, meta_value): + # validate meta_value for a given key + + # validate latitude and longitude sample data + if key in [loader_common_names.SAMPLE_LATITUDE, loader_common_names.SAMPLE_LONGITUDE]: + expected_keys = ['value', 'units'] + _check_dict_keys(meta_value, expected_keys) + + if meta_value['units'] != 'degrees': + raise BadNodeTreeError(f"Expected 'units' to be 'degrees', got {meta_value['units']}") + # validate other general sample data + else: + expected_keys = ['value'] + _check_dict_keys(meta_value, expected_keys) + + +def _check_dict_keys(dictionary: dict[Any, Any], key_list: list[Any]): + # check if dictionary keys match key list + if not sorted(dictionary.keys()) == sorted(key_list): + raise BadNodeTreeError(f"Expected only {key_list} keys in node data, got {dictionary}") + + +def _check_dict_contains(dictionary: dict[Any, Any], key_list: list[Any]): + # check if dictionary contains all keys in key list + if not set(key_list).issubset(dictionary.keys()): + raise BadNodeTreeError(f"Expected all keys in {key_list} in node data, got {dictionary}") + + def main(): parser = argparse.ArgumentParser( description="PROTOTYPE - Download genome files from the workspace service (WSS).", From 48d7b1759713e4b9b277ea4211b70abbca07ff39 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Wed, 14 Jun 2023 09:06:24 -0500 Subject: [PATCH 08/17] retrieve samples from genome instead of assembly --- src/loaders/workspace_downloader/workspace_downloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/loaders/workspace_downloader/workspace_downloader.py b/src/loaders/workspace_downloader/workspace_downloader.py index 9c6509061..c707b9db3 100644 --- a/src/loaders/workspace_downloader/workspace_downloader.py +++ b/src/loaders/workspace_downloader/workspace_downloader.py @@ -327,7 +327,7 @@ def process_input(conf: Conf): print(f"Skip downloading {upa} as it already exists") if conf.retrieve_sample: - _download_sample_data(conf, upa, 
metafile) + _download_sample_data(conf, genome_upa, metafile) def _download_sample_data(conf: Conf, upa: str, metafile: str) -> None: From 114e92513ef9864d0ed11a24acdf9a6071685a0f Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Wed, 14 Jun 2023 09:19:21 -0500 Subject: [PATCH 09/17] fix genome_upa format --- src/loaders/workspace_downloader/workspace_downloader.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/loaders/workspace_downloader/workspace_downloader.py b/src/loaders/workspace_downloader/workspace_downloader.py index c707b9db3..db7cd96e0 100644 --- a/src/loaders/workspace_downloader/workspace_downloader.py +++ b/src/loaders/workspace_downloader/workspace_downloader.py @@ -327,12 +327,13 @@ def process_input(conf: Conf): print(f"Skip downloading {upa} as it already exists") if conf.retrieve_sample: - _download_sample_data(conf, genome_upa, metafile) + _download_sample_data(conf, genome_upa.replace("/", "_"), metafile) def _download_sample_data(conf: Conf, upa: str, metafile: str) -> None: # retrieve sample data from sample service and save to file # additionally, retrieve node data from the sample data and save it to a file + # NOTE: upa is in the format of "A_B_C" with open(metafile, "r", encoding="utf8") as json_file: meta = json.load(json_file) From 611f54f87b81efc5ce116902a2fb849dcba32159 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Wed, 14 Jun 2023 21:44:43 -0500 Subject: [PATCH 10/17] address comments --- src/loaders/common/loader_common_names.py | 4 +- .../workspace_downloader.py | 93 +++++++++++-------- 2 files changed, 59 insertions(+), 38 deletions(-) diff --git a/src/loaders/common/loader_common_names.py b/src/loaders/common/loader_common_names.py index bc468591e..e377b3b58 100644 --- a/src/loaders/common/loader_common_names.py +++ b/src/loaders/common/loader_common_names.py @@ -86,7 +86,9 @@ FATAL_ERROR_FILE = "fatal_error.json" # key name for sample file and prepared sample file in the metadata file for downloaded 
workspace objects +# sample file contains raw sample information SAMPLE_FILE_KEY = "sample_file" +# prepared sample file contains key-value pairs of parsed meta_controlled from node tree of the sample SAMPLE_PREPARED_KEY = "sample_prepared_file" # extension for source sample data file and prepared sample node data file for downloaded workspace objects @@ -97,4 +99,4 @@ SAMPLE_LATITUDE = "latitude" SAMPLE_LONGITUDE = "longitude" # key name for created sample geo-spatial data in format of [latitude, longitude] -SAMPLE_GEO = 'geo_spatial' +SAMPLE_GEO = '_geo_spatial' diff --git a/src/loaders/workspace_downloader/workspace_downloader.py b/src/loaders/workspace_downloader/workspace_downloader.py index db7cd96e0..4cf3ec9f0 100644 --- a/src/loaders/workspace_downloader/workspace_downloader.py +++ b/src/loaders/workspace_downloader/workspace_downloader.py @@ -76,6 +76,10 @@ class BadNodeTreeError(Exception): pass +class NoDataLinkError(Exception): + pass + + class Conf: def __init__( self, @@ -319,58 +323,76 @@ def process_input(conf: Conf): os.link(cfn, dst) # save meta file with relevant object_info - with open(metafile, "w", encoding="utf8") as json_file: - json.dump(_process_object_info(obj_info, genome_upa), json_file, indent=2) + _dump_json_to_file(metafile, _process_object_info(obj_info, genome_upa)) print("Completed %s" % (upa)) else: print(f"Skip downloading {upa} as it already exists") if conf.retrieve_sample: - _download_sample_data(conf, genome_upa.replace("/", "_"), metafile) + _download_sample_data(conf, [upa.replace("_", "/"), genome_upa], metafile) -def _download_sample_data(conf: Conf, upa: str, metafile: str) -> None: - # retrieve sample data from sample service and save to file +def _download_sample_data( + conf: Conf, + upas: list[str], + metafile: str) -> None: + # retrieve sample data from sample service and save to file for one and only one upa from input upas # additionally, retrieve node data from the sample data and save it to a file - # NOTE: upa 
is in the format of "A_B_C" - with open(metafile, "r", encoding="utf8") as json_file: - meta = json.load(json_file) - - upa_dir = Path(metafile).parent - sample_file_name = f"{upa}.{loader_common_names.SAMPLE_FILE_EXT}" - sample_file = os.path.join(upa_dir, sample_file_name) - if _check_file_exists(loader_common_names.SAMPLE_FILE_KEY, meta, sample_file): - print(f"Skip downloading sample for {upa} as it already exists") - return + sample_ret, sample_upa = _find_sample_upa(conf, upas) - sample_ret = _retrieve_sample(conf, upa) if not sample_ret: + if not conf.ignore_no_sample_error: + raise ValueError(f"Sample data not found for {upas}") return - with open(sample_file, "w", encoding="utf8") as json_file: - json.dump(sample_ret, json_file, indent=2) + with open(metafile, "r", encoding="utf8") as json_file: + meta = json.load(json_file) - sample_prepared_name = f"{upa}.{loader_common_names.SAMPLE_PREPARED_EXT}" + upa_dir = Path(metafile).parent + sample_file_name = f"{sample_upa}.{loader_common_names.SAMPLE_FILE_EXT}" + sample_file = os.path.join(upa_dir, sample_file_name) + sample_prepared_name = f"{sample_upa}.{loader_common_names.SAMPLE_PREPARED_EXT}" sample_prepared_file = os.path.join(upa_dir, sample_prepared_name) - if _check_file_exists(loader_common_names.SAMPLE_PREPARED_KEY, meta, sample_prepared_file): - print(f"Skip generating sample node tree for {upa} as it already exists") - return - try: - node_data = _retrieve_node_data(sample_ret['node_tree']) - except BadNodeTreeError as e: - raise ValueError(f"Error retrieving node data for {upa}") from e + if not _check_file_exists(loader_common_names.SAMPLE_FILE_KEY, meta, sample_file): + _dump_json_to_file(sample_file, sample_ret) - with open(sample_prepared_file, "w", encoding="utf8") as json_file: - json.dump(node_data, json_file, indent=2) + if not _check_file_exists(loader_common_names.SAMPLE_PREPARED_KEY, meta, sample_prepared_file): + node_data = _retrieve_node_data(sample_ret['node_tree']) + 
_dump_json_to_file(sample_prepared_file, node_data) # write sample file and prepared sample node file name back to the meta file meta[loader_common_names.SAMPLE_FILE_KEY] = sample_file_name meta[loader_common_names.SAMPLE_PREPARED_KEY] = sample_prepared_name - with open(metafile, "w", encoding="utf8") as json_file: - json.dump(meta, json_file, indent=2) + _dump_json_to_file(metafile, meta) + + +def _dump_json_to_file(json_file_path: str, json_data: dict[str, Any]) -> None: + with open(json_file_path, "w", encoding="utf8") as json_file: + json.dump(json_data, json_file, indent=2) + + +def _find_sample_upa( + conf: Conf, + upas: list[str] +) -> (dict[str, Any], str): + # find one and only one sample associated upa from input upas and retrieve the sample data + # raise error if multiple samples are found + + found_sample, sample_ret, sample_upa = False, None, None + + for upa in upas: + try: + sample_ret, sample_upa = _retrieve_sample(conf, upa), upa + if found_sample: + raise ValueError(f"Found multiple samples in input {upas}") + found_sample = True + except NoDataLinkError: + pass + + return sample_ret, sample_upa def _check_file_exists( @@ -389,12 +411,11 @@ def _retrieve_sample(conf: Conf, upa: str) -> dict[str, Any] | None: # retrieve sample data from sample service # retrieve data links associated with upa - links_ret = conf.ss.get_data_links_from_data({"upa": upa.replace("_", "/")}) + links_ret = conf.ss.get_data_links_from_data({"upa": upa}) data_links = links_ret['links'] - if not data_links and conf.ignore_no_sample_error: - print(f"No sample data links found for {upa}") - return + if not data_links: + raise NoDataLinkError(f"Expected at least 1 data link for {upa}") # there should only be one data link for each upa if len(data_links) != 1: @@ -402,11 +423,9 @@ def _retrieve_sample(conf: Conf, upa: str) -> dict[str, Any] | None: # retrieve sample data and save to file sample_id = data_links[0]['id'] - sample_ret = conf.ss.get_sample_via_data({"upa": 
upa.replace("_", "/"), + sample_ret = conf.ss.get_sample_via_data({"upa": upa, "id": sample_id, "version": data_links[0]["version"]}) - if not sample_ret: - raise ValueError(f"Retrieved empty sample data for {upa}") return sample_ret From 4b825debefc8ec90cdbcf66966b98ae709ee65e6 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Wed, 14 Jun 2023 22:05:58 -0500 Subject: [PATCH 11/17] fix file prefix --- .../workspace_downloader.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/loaders/workspace_downloader/workspace_downloader.py b/src/loaders/workspace_downloader/workspace_downloader.py index 4cf3ec9f0..9b7006e12 100644 --- a/src/loaders/workspace_downloader/workspace_downloader.py +++ b/src/loaders/workspace_downloader/workspace_downloader.py @@ -350,23 +350,26 @@ def _download_sample_data( with open(metafile, "r", encoding="utf8") as json_file: meta = json.load(json_file) - upa_dir = Path(metafile).parent - sample_file_name = f"{sample_upa}.{loader_common_names.SAMPLE_FILE_EXT}" + upa_dir, sample_file_prefix = Path(metafile).parent, sample_upa.replace("/", "_") + sample_file_name = f"{sample_file_prefix}.{loader_common_names.SAMPLE_FILE_EXT}" sample_file = os.path.join(upa_dir, sample_file_name) - sample_prepared_name = f"{sample_upa}.{loader_common_names.SAMPLE_PREPARED_EXT}" + sample_prepared_name = f"{sample_file_prefix}.{loader_common_names.SAMPLE_PREPARED_EXT}" sample_prepared_file = os.path.join(upa_dir, sample_prepared_name) + update_meta = False if not _check_file_exists(loader_common_names.SAMPLE_FILE_KEY, meta, sample_file): _dump_json_to_file(sample_file, sample_ret) + meta[loader_common_names.SAMPLE_FILE_KEY] = sample_file_name + update_meta = True if not _check_file_exists(loader_common_names.SAMPLE_PREPARED_KEY, meta, sample_prepared_file): node_data = _retrieve_node_data(sample_ret['node_tree']) _dump_json_to_file(sample_prepared_file, node_data) + meta[loader_common_names.SAMPLE_PREPARED_KEY] = 
sample_prepared_name + update_meta = True - # write sample file and prepared sample node file name back to the meta file - meta[loader_common_names.SAMPLE_FILE_KEY] = sample_file_name - meta[loader_common_names.SAMPLE_PREPARED_KEY] = sample_prepared_name - _dump_json_to_file(metafile, meta) + if update_meta: + _dump_json_to_file(metafile, meta) def _dump_json_to_file(json_file_path: str, json_data: dict[str, Any]) -> None: From 0a6490f2539d3d5c2cc972465a3b4c83cf745595 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Wed, 14 Jun 2023 22:19:02 -0500 Subject: [PATCH 12/17] make sure metadata file gets updated --- .../workspace_downloader.py | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/loaders/workspace_downloader/workspace_downloader.py b/src/loaders/workspace_downloader/workspace_downloader.py index 9b7006e12..002e23ae0 100644 --- a/src/loaders/workspace_downloader/workspace_downloader.py +++ b/src/loaders/workspace_downloader/workspace_downloader.py @@ -357,19 +357,20 @@ def _download_sample_data( sample_prepared_file = os.path.join(upa_dir, sample_prepared_name) update_meta = False - if not _check_file_exists(loader_common_names.SAMPLE_FILE_KEY, meta, sample_file): - _dump_json_to_file(sample_file, sample_ret) - meta[loader_common_names.SAMPLE_FILE_KEY] = sample_file_name - update_meta = True - - if not _check_file_exists(loader_common_names.SAMPLE_PREPARED_KEY, meta, sample_prepared_file): - node_data = _retrieve_node_data(sample_ret['node_tree']) - _dump_json_to_file(sample_prepared_file, node_data) - meta[loader_common_names.SAMPLE_PREPARED_KEY] = sample_prepared_name - update_meta = True - - if update_meta: - _dump_json_to_file(metafile, meta) + try: + if not _check_file_exists(loader_common_names.SAMPLE_FILE_KEY, meta, sample_file): + _dump_json_to_file(sample_file, sample_ret) + meta[loader_common_names.SAMPLE_FILE_KEY] = sample_file_name + update_meta = True + + if not 
_check_file_exists(loader_common_names.SAMPLE_PREPARED_KEY, meta, sample_prepared_file): + node_data = _retrieve_node_data(sample_ret['node_tree']) + _dump_json_to_file(sample_prepared_file, node_data) + meta[loader_common_names.SAMPLE_PREPARED_KEY] = sample_prepared_name + update_meta = True + finally: + if update_meta: + _dump_json_to_file(metafile, meta) def _dump_json_to_file(json_file_path: str, json_data: dict[str, Any]) -> None: From d9c8ea72ea93b9bb19dce5e5178cc4acb2418027 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Thu, 15 Jun 2023 14:01:35 -0500 Subject: [PATCH 13/17] add sample retrieve time --- src/loaders/common/loader_common_names.py | 1 + src/loaders/workspace_downloader/workspace_downloader.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/loaders/common/loader_common_names.py b/src/loaders/common/loader_common_names.py index e377b3b58..40444ba59 100644 --- a/src/loaders/common/loader_common_names.py +++ b/src/loaders/common/loader_common_names.py @@ -94,6 +94,7 @@ # extension for source sample data file and prepared sample node data file for downloaded workspace objects SAMPLE_FILE_EXT = "sample" SAMPLE_PREPARED_EXT = "prepared.sample" +SAMPLE_RETRIEVED_TIME = "sample_retrieved_time" # key name for latitude and longitude data from sample's meta_controlled SAMPLE_LATITUDE = "latitude" diff --git a/src/loaders/workspace_downloader/workspace_downloader.py b/src/loaders/workspace_downloader/workspace_downloader.py index 002e23ae0..1a6733c9b 100644 --- a/src/loaders/workspace_downloader/workspace_downloader.py +++ b/src/loaders/workspace_downloader/workspace_downloader.py @@ -360,6 +360,7 @@ def _download_sample_data( try: if not _check_file_exists(loader_common_names.SAMPLE_FILE_KEY, meta, sample_file): _dump_json_to_file(sample_file, sample_ret) + meta[loader_common_names.SAMPLE_RETRIEVED_TIME] = time.time() meta[loader_common_names.SAMPLE_FILE_KEY] = sample_file_name update_meta = True From f68b82ef408f1b9d1cc053f4727f81d74a0c65b5 Mon 
Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Thu, 15 Jun 2023 16:34:49 -0500 Subject: [PATCH 14/17] time --- src/loaders/workspace_downloader/workspace_downloader.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/loaders/workspace_downloader/workspace_downloader.py b/src/loaders/workspace_downloader/workspace_downloader.py index 1a6733c9b..fce1e81de 100644 --- a/src/loaders/workspace_downloader/workspace_downloader.py +++ b/src/loaders/workspace_downloader/workspace_downloader.py @@ -62,6 +62,7 @@ from src.clients.SampleServiceClient import SampleService from src.clients.workspaceClient import Workspace from src.loaders.common import loader_common_names, loader_helper +from src.service.timestamp import timestamp # setup KB_AUTH_TOKEN as env or provide a token_filepath in --token_filepath # export KB_AUTH_TOKEN="your-kb-auth-token" @@ -360,7 +361,7 @@ def _download_sample_data( try: if not _check_file_exists(loader_common_names.SAMPLE_FILE_KEY, meta, sample_file): _dump_json_to_file(sample_file, sample_ret) - meta[loader_common_names.SAMPLE_RETRIEVED_TIME] = time.time() + meta[loader_common_names.SAMPLE_RETRIEVED_TIME] = timestamp() meta[loader_common_names.SAMPLE_FILE_KEY] = sample_file_name update_meta = True From 3df23f6fbb392fa688ef906b33078dd26112f3a6 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Thu, 15 Jun 2023 17:02:19 -0500 Subject: [PATCH 15/17] time --- src/loaders/common/loader_common_names.py | 2 +- .../workspace_downloader.py | 19 +++++++++---------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/loaders/common/loader_common_names.py b/src/loaders/common/loader_common_names.py index 40444ba59..b54b197ee 100644 --- a/src/loaders/common/loader_common_names.py +++ b/src/loaders/common/loader_common_names.py @@ -94,7 +94,7 @@ # extension for source sample data file and prepared sample node data file for downloaded workspace objects SAMPLE_FILE_EXT = "sample" SAMPLE_PREPARED_EXT = "prepared.sample" 
-SAMPLE_RETRIEVED_TIME = "sample_retrieved_time" +SAMPLE_EFFECTIVE_TIME = "sample_effective_time" # key name for latitude and longitude data from sample's meta_controlled SAMPLE_LATITUDE = "latitude" diff --git a/src/loaders/workspace_downloader/workspace_downloader.py b/src/loaders/workspace_downloader/workspace_downloader.py index fce1e81de..cdbd32676 100644 --- a/src/loaders/workspace_downloader/workspace_downloader.py +++ b/src/loaders/workspace_downloader/workspace_downloader.py @@ -62,7 +62,6 @@ from src.clients.SampleServiceClient import SampleService from src.clients.workspaceClient import Workspace from src.loaders.common import loader_common_names, loader_helper -from src.service.timestamp import timestamp # setup KB_AUTH_TOKEN as env or provide a token_filepath in --token_filepath # export KB_AUTH_TOKEN="your-kb-auth-token" @@ -341,7 +340,7 @@ def _download_sample_data( # retrieve sample data from sample service and save to file for one and only one upa from input upas # additionally, retrieve node data from the sample data and save it to a file - sample_ret, sample_upa = _find_sample_upa(conf, upas) + sample_ret, sample_upa, sample_effective_time = _find_sample_upa(conf, upas) if not sample_ret: if not conf.ignore_no_sample_error: @@ -361,7 +360,7 @@ def _download_sample_data( try: if not _check_file_exists(loader_common_names.SAMPLE_FILE_KEY, meta, sample_file): _dump_json_to_file(sample_file, sample_ret) - meta[loader_common_names.SAMPLE_RETRIEVED_TIME] = timestamp() + meta[loader_common_names.SAMPLE_EFFECTIVE_TIME] = sample_effective_time meta[loader_common_names.SAMPLE_FILE_KEY] = sample_file_name update_meta = True @@ -387,18 +386,18 @@ def _find_sample_upa( # find one and only one sample associated upa from input upas and retrieve the sample data # raise error if multiple samples are found - found_sample, sample_ret, sample_upa = False, None, None + found_sample, sample_ret, sample_upa, sample_effective_time = False, None, None, None for upa in 
upas: try: - sample_ret, sample_upa = _retrieve_sample(conf, upa), upa + sample_ret, sample_effective_time = _retrieve_sample(conf, upa) if found_sample: raise ValueError(f"Found multiple samples in input {upas}") - found_sample = True + found_sample, sample_upa = True, upa except NoDataLinkError: pass - return sample_ret, sample_upa + return sample_ret, sample_upa, sample_effective_time def _check_file_exists( @@ -413,13 +412,13 @@ def _check_file_exists( os.path.isfile(file_path)) -def _retrieve_sample(conf: Conf, upa: str) -> dict[str, Any] | None: +def _retrieve_sample(conf: Conf, upa: str) -> (dict[str, Any] | None, int): # retrieve sample data from sample service # retrieve data links associated with upa links_ret = conf.ss.get_data_links_from_data({"upa": upa}) - data_links = links_ret['links'] + data_links, effective_time = links_ret['links'], links_ret['effective_time'] if not data_links: raise NoDataLinkError(f"Expected at least 1 data link for {upa}") @@ -433,7 +432,7 @@ def _retrieve_sample(conf: Conf, upa: str) -> dict[str, Any] | None: "id": sample_id, "version": data_links[0]["version"]}) - return sample_ret + return sample_ret, effective_time def _retrieve_node_data( From 67e2e8ddeeb4fc9e1fe3136d62e79a18441252fe Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Thu, 15 Jun 2023 17:31:38 -0500 Subject: [PATCH 16/17] check metadata file first --- src/loaders/common/loader_common_names.py | 2 +- .../workspace_downloader.py | 57 ++++++++----------- 2 files changed, 25 insertions(+), 34 deletions(-) diff --git a/src/loaders/common/loader_common_names.py b/src/loaders/common/loader_common_names.py index b54b197ee..142354020 100644 --- a/src/loaders/common/loader_common_names.py +++ b/src/loaders/common/loader_common_names.py @@ -90,11 +90,11 @@ SAMPLE_FILE_KEY = "sample_file" # prepared sample file contains key-value pairs of parsed meta_controlled from node tree of the sample SAMPLE_PREPARED_KEY = "sample_prepared_file" +SAMPLE_EFFECTIVE_TIME = 
"sample_effective_time" # extension for source sample data file and prepared sample node data file for downloaded workspace objects SAMPLE_FILE_EXT = "sample" SAMPLE_PREPARED_EXT = "prepared.sample" -SAMPLE_EFFECTIVE_TIME = "sample_effective_time" # key name for latitude and longitude data from sample's meta_controlled SAMPLE_LATITUDE = "latitude" diff --git a/src/loaders/workspace_downloader/workspace_downloader.py b/src/loaders/workspace_downloader/workspace_downloader.py index cdbd32676..8dd271180 100644 --- a/src/loaders/workspace_downloader/workspace_downloader.py +++ b/src/loaders/workspace_downloader/workspace_downloader.py @@ -340,38 +340,38 @@ def _download_sample_data( # retrieve sample data from sample service and save to file for one and only one upa from input upas # additionally, retrieve node data from the sample data and save it to a file - sample_ret, sample_upa, sample_effective_time = _find_sample_upa(conf, upas) + # check if sample information already exists in the metadata file + with open(metafile, "r", encoding="utf8") as json_file: + meta = json.load(json_file) + + sample_keys = [loader_common_names.SAMPLE_FILE_KEY, + loader_common_names.SAMPLE_PREPARED_KEY, + loader_common_names.SAMPLE_EFFECTIVE_TIME] + if all(key in meta for key in sample_keys): + print(f"Skip downloading sample data for {upas} as it already exists") + return + sample_ret, sample_upa, sample_effective_time = _find_sample_upa(conf, upas) if not sample_ret: if not conf.ignore_no_sample_error: raise ValueError(f"Sample data not found for {upas}") return + node_data = _retrieve_node_data(sample_ret['node_tree']) - with open(metafile, "r", encoding="utf8") as json_file: - meta = json.load(json_file) - + # save sample data and parsed key-value node data to file upa_dir, sample_file_prefix = Path(metafile).parent, sample_upa.replace("/", "_") sample_file_name = f"{sample_file_prefix}.{loader_common_names.SAMPLE_FILE_EXT}" sample_file = os.path.join(upa_dir, sample_file_name) 
sample_prepared_name = f"{sample_file_prefix}.{loader_common_names.SAMPLE_PREPARED_EXT}" sample_prepared_file = os.path.join(upa_dir, sample_prepared_name) - update_meta = False + _dump_json_to_file(sample_file, sample_ret) + _dump_json_to_file(sample_prepared_file, node_data) - try: - if not _check_file_exists(loader_common_names.SAMPLE_FILE_KEY, meta, sample_file): - _dump_json_to_file(sample_file, sample_ret) - meta[loader_common_names.SAMPLE_EFFECTIVE_TIME] = sample_effective_time - meta[loader_common_names.SAMPLE_FILE_KEY] = sample_file_name - update_meta = True - - if not _check_file_exists(loader_common_names.SAMPLE_PREPARED_KEY, meta, sample_prepared_file): - node_data = _retrieve_node_data(sample_ret['node_tree']) - _dump_json_to_file(sample_prepared_file, node_data) - meta[loader_common_names.SAMPLE_PREPARED_KEY] = sample_prepared_name - update_meta = True - finally: - if update_meta: - _dump_json_to_file(metafile, meta) + # update metadata file with sample information + meta[loader_common_names.SAMPLE_FILE_KEY] = sample_file_name + meta[loader_common_names.SAMPLE_PREPARED_KEY] = sample_prepared_name + meta[loader_common_names.SAMPLE_EFFECTIVE_TIME] = sample_effective_time + _dump_json_to_file(metafile, meta) def _dump_json_to_file(json_file_path: str, json_data: dict[str, Any]) -> None: @@ -400,19 +400,10 @@ def _find_sample_upa( return sample_ret, sample_upa, sample_effective_time -def _check_file_exists( - file_key_name: str, - metadata: dict[str, Any], - file_path: str -) -> bool: - # check if file exists and if the metadata matches the file name - - return (file_key_name in metadata and - metadata[file_key_name] == Path(file_path).name and - os.path.isfile(file_path)) - - -def _retrieve_sample(conf: Conf, upa: str) -> (dict[str, Any] | None, int): +def _retrieve_sample( + conf: Conf, + upa: str +) -> (dict[str, Any] | None, int): # retrieve sample data from sample service # retrieve data links associated with upa From 
f07f3ee28cb6bacd29f07126b3353279c7b126cf Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Thu, 15 Jun 2023 17:36:41 -0500 Subject: [PATCH 17/17] add more comments --- src/loaders/workspace_downloader/workspace_downloader.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/loaders/workspace_downloader/workspace_downloader.py b/src/loaders/workspace_downloader/workspace_downloader.py index 8dd271180..a524ae0eb 100644 --- a/src/loaders/workspace_downloader/workspace_downloader.py +++ b/src/loaders/workspace_downloader/workspace_downloader.py @@ -351,6 +351,7 @@ def _download_sample_data( print(f"Skip downloading sample data for {upas} as it already exists") return + # retrieve sample data from sample service and parse key-value node data sample_ret, sample_upa, sample_effective_time = _find_sample_upa(conf, upas) if not sample_ret: if not conf.ignore_no_sample_error: @@ -375,6 +376,7 @@ def _download_sample_data( def _dump_json_to_file(json_file_path: str, json_data: dict[str, Any]) -> None: + # dump json data to file with open(json_file_path, "w", encoding="utf8") as json_file: json.dump(json_data, json_file, indent=2)