diff --git a/images/build_images b/images/build_images
index 994c3b3..438f795 100755
--- a/images/build_images
+++ b/images/build_images
@@ -30,6 +30,8 @@ CSD_URL_TEMPLATE = 'http://{s3_bucket}/datacollector/{build}/csd/STREAMSETS-{0}.
 IMAGE_NAME_TEMPLATE = 'streamsets/clusterdock:topology_cdh-streamsets_datacollector-{}'
 PARCEL_URL_TEMPLATE = 'http://{s3_bucket}/datacollector/{build}/parcel/STREAMSETS_DATACOLLECTOR-{0}-el6.parcel'
 PARCEL_MANIFEST_URL_TEMPLATE = 'http://{s3_bucket}/datacollector/{build}/parcel/manifest.json'
+# Name of service in CDH cluster.
+SDC_PRODUCT_NAME = 'STREAMSETS_DATACOLLECTOR'
 
 
 def main():
@@ -50,7 +52,7 @@ def main():
     parser.add_argument('-p', '--push', help='Push Docker images after building', action='store_true')
     args = parser.parse_args()
 
-    image_folder = Path(Path(__file__).parent, 'sdc').resolve()
+    image_folder = Path(Path(__file__).parent, 'cloudera_service').resolve()
 
     if args.dry_run:
         logger.info('Doing dry-run of tool ...')
@@ -63,6 +65,7 @@ def main():
         cmd_elements = ['docker build -t {}'.format(image_name),
                         '--build-arg CSD_URL={}'.format(csd_url),
                         '--build-arg PARCEL_URL={}'.format(parcel_url),
+                        '--build-arg PRODUCT={}'.format(SDC_PRODUCT_NAME),
                         str(image_folder)]
         cmd = ' '.join(cmd_elements)
         logger.debug('Running Docker build command (%s) ...', cmd)
diff --git a/images/sdc/Dockerfile b/images/sdc/Dockerfile
deleted file mode 100644
index 73ab43e..0000000
--- a/images/sdc/Dockerfile
+++ /dev/null
@@ -1,37 +0,0 @@
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-FROM alpine:latest
-MAINTAINER Dima Spivak
-
-ARG CSD_DIRECTORY=/opt/cloudera/csd
-ARG CSD_URL=http://archives.streamsets.com/datacollector/3.1.0.0/csd/STREAMSETS-3.1.0.0.jar
-
-ARG PARCEL_REPO_DIRECTORY=/opt/cloudera/parcel-repo
-ARG PARCEL_URL=http://archives.streamsets.com/datacollector/3.1.0.0/parcel/STREAMSETS_DATACOLLECTOR-3.1.0.0-el6.parcel
-
-RUN apk --no-cache add tar
-
-RUN mkdir -p ${PARCEL_REPO_DIRECTORY} && \
-    wget -P ${PARCEL_REPO_DIRECTORY} ${PARCEL_URL} && \
-    PARCEL_NAME=$(basename ${PARCEL_URL}) && \
-    cd ${PARCEL_REPO_DIRECTORY} && \
-    sha1sum ${PARCEL_NAME} | awk '{ print $1 }' > ${PARCEL_NAME}.sha
-
-VOLUME ${PARCEL_REPO_DIRECTORY}
-
-RUN mkdir -p "${CSD_DIRECTORY}" && \
-    CSD_NAME=$(basename ${CSD_URL}) && \
-    wget -O "${CSD_DIRECTORY}/${CSD_NAME}" "${CSD_URL}"
-
-VOLUME ${CSD_DIRECTORY}
-
-CMD ["/bin/true"]
diff --git a/start.py b/start.py
index a366c9b..412dad8 100644
--- a/start.py
+++ b/start.py
@@ -160,6 +160,9 @@ def main(args):
     if args.spark2_version:
         _install_service_from_local_repo(cluster, product='SPARK2')
 
+    if args.sdc_version:
+        _install_service_from_local_repo(cluster, product='STREAMSETS_DATACOLLECTOR')
+
     if args.kerberos:
         cluster.kdc_node = kdc_node
         _configure_kdc(cluster, args.kerberos_principals, args.kerberos_ticket_lifetime, quiet=quiet)
@@ -273,15 +276,6 @@ def cm_server_not_dead(primary_node):
     deployment.update_cm_config(configs={'manages_parcels': True})
 
     if args.sdc_version:
-        # We install StreamSets DataCollector using local repo /opt/cloudera/parcel-repo.
-        # Set file and folder permissions correctly.
-        commands = ['chown cloudera-scm:cloudera-scm /opt/cloudera/csd',
-                    'chown cloudera-scm:cloudera-scm /opt/cloudera/parcel-repo',
-                    'chown cloudera-scm:cloudera-scm /opt/cloudera/csd/STREAMSETS*.jar',
-                    'chmod 644 /opt/cloudera/csd/STREAMSETS*.jar',
-                    'chown cloudera-scm:cloudera-scm /opt/cloudera/parcel-repo/STREAMSETS_*']
-        primary_node.execute(' && '.join(commands))
-
         # The parcel is already present. Hence just distribute and activate it after refresing parcel repos.
         product = 'STREAMSETS_DATACOLLECTOR'
         deployment.refresh_parcel_repos()
@@ -357,7 +351,7 @@ def cm_server_not_dead(primary_node):
 
     if args.sdc_version:
         logger.info('Configuring StreamSets Data Collector ...')
-        _configure_sdc(deployment, cluster, is_kerberos_enabled=args.kerberos)
+        _configure_sdc(deployment, cluster, args)
 
     if args.kerberos:
         logger.info('Configure Cloudera Manager for Kerberos ...')
@@ -899,7 +893,7 @@ def _setup_ssl_encryption_authentication(cluster, service):
     ]
     cluster.primary_node.execute(' && '.join(ssl_authentication_commands))
 
-def _configure_sdc(deployment, cluster, is_kerberos_enabled):
+def _configure_sdc(deployment, cluster, args):
     logger.info('Adding StreamSets service to cluster (%s) ...', DEFAULT_CLUSTER_NAME)
     datacollector_role = {'type': 'DATACOLLECTOR',
                           'hostRef': {'hostId': cluster.primary_node.host_id}}
@@ -908,18 +902,21 @@
                                                  'type': 'STREAMSETS',
                                                  'displayName': 'StreamSets',
                                                  'roles': [datacollector_role]}])
-    # When running an application with Spark2, the following
-    # environment variables must be set before starting StreamSets Data Collector.
-    environment_variables = {'SPARK_SUBMIT_YARN_COMMAND': '/usr/bin/spark2-submit',
-                             'SPARK_KAFKA_VERSION': '0.10'}
+    if args.spark2_version:
+        # When running an application with Spark2, the following
+        # environment variables must be set before starting StreamSets Data Collector.
+        environment_variables = {'SPARK_SUBMIT_YARN_COMMAND': '/usr/bin/spark2-submit',
+                                 'SPARK_KAFKA_VERSION': '0.10',
+                                 'SPARK_HOME': '/opt/cloudera/parcels/SPARK2/lib/spark2'}
+    else:
+        # When running an application on YARN, the Spark executor requires access to the spark-submit script located in
+        # the Spark installation directory. Default is directory specified by SPARK_HOME environment variable.
+        # Hence SPARK_HOME environment variable must be set before starting StreamSets Data Collector.
+        environment_variables = {'SPARK_HOME': '/opt/cloudera/parcels/CDH/lib/spark'}
     configs = {'sdc-env.sh_role_safety_valve': '\n'.join('export {}={}'.format(key, value)
                                                          for key, value in environment_variables.items())}
-    # When running an application on YARN, the Spark executor requires access to the spark-submit script located in
-    # the Spark installation directory. Default is directory specified by SPARK_HOME environment variable.
-    # Hence SPARK_HOME environment variable must be set before starting StreamSets Data Collector.
-    configs = {'sdc-env.sh_role_safety_valve': 'export SPARK_HOME=/opt/cloudera/parcels/CDH/lib/spark'}
 
-    if is_kerberos_enabled:
+    if args.kerberos:
         # Create JAAS config file on node-1. Needed to access kerberized Kafka.
         primary_node = cluster.primary_node
         sdc_principal = 'sdc/{kafka_node_name}@{realm}'.format(kafka_node_name=primary_node.fqdn,