diff --git a/README.md b/README.md index ce19e52..00d5358 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,20 @@ -# ua-cwl-transformer -Base image for University of Arizona transformers using Common Workflow Language +# University of Arizona Transformer for Common Workflow Language +This repo contains source for building base docker images for the University of Arizona (UA) environment that uses Common Workflow Language (CWL) for processing Gantry data. + +The images built here are from the AgPipeline image(s) created by the [base docker support](https://github.com/AgPipeline/base-docker-support) repo. + +## Details +The transformer_class.py file in the base images is replaced by our version, which provides the correct environment for derived transformers. + +The command line arguments are saved in the `args` variable of the class instance. + +## How to Contribute +If the current images don't provide the functionality needed, please put in a [feature request](https://github.com/AgPipeline/computing-pipeline/issues/new/choose) before creating a new image. +Your feedback is important to us and it's quite possible that we will want to incorporate your request in our existing images. + +If you need a separate environment, such as for the CyVerse Discovery Environment (DE), please consider using a separate repo for your work (within this organization is a good place). +If you are creating an environment *derived* from this one, you might also want to consider a separate repo for your work. + +If you are creating a new folder for a new image, please use a meaningful prefix to the folder name; for example, use a prefix of 'gdal' for an image that has gdal pre-installed. + +Also, be sure to read about how to [contribute](https://github.com/AgPipeline/Organization-info) to this organization. 
diff --git a/common-image/Dockerfile b/common-image/Dockerfile new file mode 100644 index 0000000..070dfd1 --- /dev/null +++ b/common-image/Dockerfile @@ -0,0 +1,79 @@ +FROM agpipeline/base-image:1.0 +LABEL maintainer="Chris Schnaufer " + +# Build environment values: each ARG can be overridden when building the image and is copied into an ENV for use by later steps +ARG arg_terrautil_url=https://github.com/terraref/terrautils.git +ENV terrautil_url=$arg_terrautil_url + +ARG arg_terrautil_branch=master +ENV terrautil_branch=$arg_terrautil_branch + +ARG arg_sensor_url=https://github.com/terraref/sensor-metadata.git +ENV sensor_url=$arg_sensor_url + +ARG arg_sensor_branch=master +ENV sensor_branch=$arg_sensor_branch + +# We need to explicitly set the user to root to install in system folders +USER root + +# Install applications we need +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + python3-gdal \ + gdal-bin + +# Install Python updates +RUN python3 -m pip install --upgrade --no-cache-dir pip + +RUN python3 -m pip install --upgrade --no-cache-dir setuptools +RUN python3 -m pip install --upgrade --no-cache-dir \ + numpy \ + influxdb \ + laspy \ + requests==2.21.0 \ + python-dateutil \ + utm \ + matplotlib \ + Pillow \ + scipy \ + cryptography \ + pyclowder + +RUN apt-get install -y --no-install-recommends \ + libgdal-dev \ + gcc \ + g++ \ + python3-dev && \ + python3 -m pip install --upgrade --no-cache-dir \ + pygdal==2.2.3.5 && \ + apt-get remove -y \ + libgdal-dev \ + gcc \ + g++ \ + python3-dev && \ + apt-get autoremove -y + +# Install terrautils and the sensor metadata from source; git is installed only for the clones and removed again in the same RUN +RUN apt-get install -y --no-install-recommends \ + git && \ + git clone $terrautil_url --branch $terrautil_branch --single-branch "/home/extractor/terrautil" && \ + python3 -m pip install "/home/extractor/terrautil/" && \ + rm -rf /home/extractor/terrautil && \ + git clone $sensor_url --branch $sensor_branch --single-branch "/home/extractor/sensor-metadata" && \ + mv "/home/extractor/sensor-metadata/sensors" "/home/extractor/sensors" && \ + rm -rf /home/extractor/sensor-metadata && \ + apt-get 
remove -y \ + git && \ + apt-get autoremove -y + +# Perform some cleanup +RUN apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* && \ + rm -rf ~/.cache/pip + +# Set to the user we want to be +USER extractor + +COPY *.py /home/extractor/ diff --git a/common-image/README.md b/common-image/README.md new file mode 100644 index 0000000..7fc851e --- /dev/null +++ b/common-image/README.md @@ -0,0 +1,27 @@ +# Common Image +This folder contains the base files for the [Gantry workflow at UA](https://github.com/AgPipeline/Organization-info) environment. +This environment builds off this organization's [base docker support](https://github.com/AgPipeline/base-docker-support), replacing the *transformer_class.py* file. +Please refer to that repo for additional context. + +## Purpose +As part of providing an environment suitable for the Gantry workflow at UA, there are additional variables defined in the [configuration.py](https://github.com/AgPipeline/ua-gantry-transformer/blob/common-extractor/common-image/configuration.py) file. + +Our [transformer_class.Transformer](https://github.com/AgPipeline/ua-gantry-transformer/blob/common-extractor/common-image/transformer_class.py) instance adds to the command line parameters. +Additionally, the metadata that's received is processed to make it easier for transformers to perform their work. +It's expected that common functions will be added to our Transformer class as additional TERRA REF extractors get ported to the new framework. + +## Relation to Finished Transformers +Now that an environment has been provided through this code, a transformer template needs to be cloned and developed. +In most cases the result of cloning and developing the transformer template will provide a final product. + +## Docker Image Notes +This section contains information on the Dockerfile and some build notes. 
+ +### What's Provided +Be sure to check the [Dockerfile](https://github.com/AgPipeline/ua-gantry-transformer/blob/common-extractor/common-image/Dockerfile) for an exact list of what's installed. + +The Python packages of `numpy` and `gdal` are installed to allow array manipulation and geo-spatial support. + +### Build Arguments +There are Docker build arguments defined in the Dockerfile. +These are intended to provide additional flexibility for installing packages when building a Docker image, without having to edit the Dockerfile. diff --git a/common-image/configuration.py b/common-image/configuration.py new file mode 100644 index 0000000..19c01a5 --- /dev/null +++ b/common-image/configuration.py @@ -0,0 +1,29 @@ +"""Contains transformer configuration information +""" + +# The version number of the transformer +TRANSFORMER_VERSION = '1.0' + +# The transformer description +TRANSFORMER_DESCRIPTION = '' + +# Short name of the transformer +TRANSFORMER_NAME = '' + +# The sensor associated with the transformer +TRANSFORMER_SENSOR = '' + +# The transformer type (eg: 'rgbmask', 'plotclipper') +TRANSFORMER_TYPE = '' + +# The name of the author of the extractor +AUTHOR_NAME = '' + +# The email of the author of the extractor +AUTHOR_EMAIL = '' + +# Contributors to this transformer +CONTRUBUTORS = [] + +# Repository URI of where the source code lives +REPOSITORY = '' diff --git a/common-image/transformer_class.py b/common-image/transformer_class.py new file mode 100644 index 0000000..859febd --- /dev/null +++ b/common-image/transformer_class.py @@ -0,0 +1,161 @@ +"""Class instance for Transformer +""" + +import os +import argparse +import logging + +from pyclowder.utils import setup_logging as pyc_setup_logging +from terrautils.metadata import get_terraref_metadata as tr_get_terraref_metadata, \ + get_season_and_experiment as tr_get_season_and_experiment, \ + get_extractor_metadata as tr_get_extractor_metadata +from terrautils.sensors import Sensors + +import configuration + 
from typing import Optional

import terrautils.lemnatec

# Have terrautils use the folder containing this file as its sensor metadata cache
terrautils.lemnatec.SENSOR_METADATA_CACHE = os.path.dirname(os.path.realpath(__file__))


class __internal__():
    """Class containing functions for this file only
    """
    def __init__(self):
        """Perform class level initialization
        """

    @staticmethod
    def get_metadata_timestamp(metadata: dict) -> Optional[str]:
        """Looks up the timestamp in the metadata
        Arguments:
            metadata: the metadata to find the timestamp in; when it has a 'content' key
                      the lookup is performed on that value instead
        Return:
            Returns the value of the 'timestamp' key when it's present. Otherwise the 'datetime'
            value of the 'gantry_variable_metadata' entry is returned when that's available.
            None is returned when neither is found
        """
        check_md = metadata['content'] if 'content' in metadata else metadata

        # A top-level timestamp takes precedence over the gantry's datetime value
        if 'timestamp' in check_md:
            return check_md['timestamp']

        if 'gantry_variable_metadata' in check_md:
            gantry_md = check_md['gantry_variable_metadata']
            if 'datetime' in gantry_md:
                return gantry_md['datetime']

        return None


class Transformer():
    """Generic class for supporting transformers
    """
    #pylint: disable=unused-argument
    def __init__(self, **kwargs):
        """Performs initialization of class instance
        Arguments:
            kwargs: additional parameters passed in to Transformer
        """
        # Both of these are assigned when get_transformer_params() is called
        self.sensor = None
        self.args = None

    @property
    def default_epsg(self):
        """Returns the default EPSG code that utilities expect
        """
        return 4326

    @property
    def sensor_name(self):
        """Returns the name of the sensor we represent
        """
        return configuration.TRANSFORMER_SENSOR

    # pylint: disable=no-self-use
    def generate_transformer_md(self) -> dict:
        """Generates metadata about this transformer
        Returns:
            Returns the transformer metadata: the version, name, author, description, and
            repository URL as defined in the configuration module
        """
        return {
            'version': configuration.TRANSFORMER_VERSION,
            'name': configuration.TRANSFORMER_NAME,
            'author': configuration.AUTHOR_NAME,
            'description': configuration.TRANSFORMER_DESCRIPTION,
            'repository': {'repUrl': configuration.REPOSITORY}
        }

    # pylint: disable=no-self-use
    def add_parameters(self, parser: argparse.ArgumentParser) -> None:
        """Adds processing parameters to existing parameters
        Arguments:
            parser: instance of argparse
        """
        # The LOGGING environment variable provides the default when the argument isn't specified
        parser.add_argument('--logging', '-l', nargs='?', default=os.getenv("LOGGING"),
                            help='file or url or logging configuration (default=None)')

        parser.epilog = configuration.TRANSFORMER_NAME + ' version ' + configuration.TRANSFORMER_VERSION + \
                        ' author ' + configuration.AUTHOR_NAME + ' ' + configuration.AUTHOR_EMAIL

    #pylint: disable=no-self-use
    def get_transformer_params(self, args: argparse.Namespace, metadata: dict) -> dict:
        """Returns a parameter list for processing data
        Arguments:
            args: result of calling argparse.parse_args
            metadata: the loaded metadata
        Return:
            On success a dictionary with the keys 'check_md', 'transformer_md', and 'full_md' is
            returned. On failure a dictionary with a negative 'code' value and an 'error' message
            is returned (-5001 when the Gantry metadata can't be loaded, -5002 when a timestamp
            can't be found)
        """
        # Setup logging
        pyc_setup_logging(args.logging)

        self.args = args

        # Determine if we're using JSONLD (which we should be)
        parse_md = metadata['content'] if 'content' in metadata else metadata

        terraref_md = tr_get_terraref_metadata(parse_md, configuration.TRANSFORMER_SENSOR)
        if not terraref_md:
            return {'code': -5001, 'error': "Unable to load Gantry information from metadata for '%s'" % \
                                            configuration.TRANSFORMER_TYPE}

        timestamp = __internal__.get_metadata_timestamp(parse_md)
        if not timestamp:
            return {'code': -5002, 'error': "Unable to locate timestamp in metadata for '%s'" % \
                                            configuration.TRANSFORMER_TYPE}

        # Fetch experiment name from terra metadata
        season_name, experiment_name, updated_experiment = \
            tr_get_season_and_experiment(timestamp, configuration.TRANSFORMER_TYPE, terraref_md)

        # Setup our sensor
        self.sensor = Sensors(base='', station='ua-mac', sensor=configuration.TRANSFORMER_SENSOR)
        leaf_name = self.sensor.get_display_name()

        # Get our trimmed metadata
        terraref_md_trim = tr_get_terraref_metadata(parse_md)
        if updated_experiment is not None:
            terraref_md_trim['experiment_metadata'] = updated_experiment

        # Get the list of files, if there are some, filtering out arguments that are
        # obviously not files
        if args.file_list:
            file_list = [one_file for one_file in args.file_list if not one_file.startswith('-')]
        else:
            file_list = []

        # Prepare our parameters
        check_md = {'timestamp': timestamp,
                    'season': season_name,
                    'experiment': experiment_name,
                    'container_name': None,
                    'target_container_name': leaf_name,  # TODO: Is this needed?
                    'trigger_name': None,
                    'context_md': terraref_md_trim,
                    'working_folder': args.working_space,
                    'list_files': lambda: file_list
                   }

        return {'check_md': check_md,
                'transformer_md': tr_get_extractor_metadata(terraref_md, configuration.TRANSFORMER_NAME),
                'full_md': parse_md
               }