Add env. variables for spark2 (#27)
kirtiv1 committed May 18, 2018
1 parent e39aaf6 commit dd87a3f
Showing 3 changed files with 21 additions and 58 deletions.
images/build_images (4 additions, 1 deletion)

@@ -30,6 +30,8 @@ CSD_URL_TEMPLATE = 'http://{s3_bucket}/datacollector/{build}/csd/STREAMSETS-{0}.
 IMAGE_NAME_TEMPLATE = 'streamsets/clusterdock:topology_cdh-streamsets_datacollector-{}'
 PARCEL_URL_TEMPLATE = 'http://{s3_bucket}/datacollector/{build}/parcel/STREAMSETS_DATACOLLECTOR-{0}-el6.parcel'
 PARCEL_MANIFEST_URL_TEMPLATE = 'http://{s3_bucket}/datacollector/{build}/parcel/manifest.json'
+# Name of service in CDH cluster.
+SDC_PRODUCT_NAME = 'STREAMSETS_DATACOLLECTOR'
 
 
 def main():
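
For illustration, these templates render via str.format, mixing the positional placeholder {0} (the version) with keyword placeholders; the bucket and build values below are hypothetical, not taken from this repository:

    PARCEL_URL_TEMPLATE = 'http://{s3_bucket}/datacollector/{build}/parcel/STREAMSETS_DATACOLLECTOR-{0}-el6.parcel'
    # 'nightly.example.com' and the version strings are made-up values.
    url = PARCEL_URL_TEMPLATE.format('3.2.0.0', s3_bucket='nightly.example.com', build='3.2.0.0-latest')
    # -> http://nightly.example.com/datacollector/3.2.0.0-latest/parcel/STREAMSETS_DATACOLLECTOR-3.2.0.0-el6.parcel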
@@ -50,7 +52,7 @@ def main():
     parser.add_argument('-p', '--push', help='Push Docker images after building', action='store_true')
     args = parser.parse_args()
 
-    image_folder = Path(Path(__file__).parent, 'sdc').resolve()
+    image_folder = Path(Path(__file__).parent, 'cloudera_service').resolve()
 
     if args.dry_run:
         logger.info('Doing dry-run of tool ...')
@@ -63,6 +65,7 @@
         cmd_elements = ['docker build -t {}'.format(image_name),
                         '--build-arg CSD_URL={}'.format(csd_url),
                         '--build-arg PARCEL_URL={}'.format(parcel_url),
+                        '--build-arg PRODUCT={}'.format(SDC_PRODUCT_NAME),
                         str(image_folder)]
         cmd = ' '.join(cmd_elements)
         logger.debug('Running Docker build command (%s) ...', cmd)
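For illustration, the elements above join into a single docker build invocation. In this sketch, every value except the PRODUCT argument is a hypothetical placeholder:

    # Hypothetical inputs; only the PRODUCT build argument comes from the diff above.
    image_name = 'streamsets/clusterdock:topology_cdh-streamsets_datacollector-3.2.0.0'
    csd_url = 'http://example-bucket/datacollector/3.2.0.0/csd/STREAMSETS-3.2.0.0.jar'
    parcel_url = 'http://example-bucket/datacollector/3.2.0.0/parcel/STREAMSETS_DATACOLLECTOR-3.2.0.0-el6.parcel'
    cmd_elements = ['docker build -t {}'.format(image_name),
                    '--build-arg CSD_URL={}'.format(csd_url),
                    '--build-arg PARCEL_URL={}'.format(parcel_url),
                    '--build-arg PRODUCT={}'.format('STREAMSETS_DATACOLLECTOR'),
                    '/path/to/images/cloudera_service']
    cmd = ' '.join(cmd_elements)
    # cmd is one shell command: docker build -t ... --build-arg CSD_URL=...
    #   --build-arg PARCEL_URL=... --build-arg PRODUCT=STREAMSETS_DATACOLLECTOR /path/to/images/cloudera_service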
images/sdc/Dockerfile (37 deletions)

This file was deleted.

start.py (17 additions, 20 deletions)
@@ -160,6 +160,9 @@ def main(args):
     if args.spark2_version:
         _install_service_from_local_repo(cluster, product='SPARK2')
 
+    if args.sdc_version:
+        _install_service_from_local_repo(cluster, product='STREAMSETS_DATACOLLECTOR')
+
     if args.kerberos:
         cluster.kdc_node = kdc_node
         _configure_kdc(cluster, args.kerberos_principals, args.kerberos_ticket_lifetime, quiet=quiet)
@@ -273,15 +276,6 @@ def cm_server_not_dead(primary_node):
     deployment.update_cm_config(configs={'manages_parcels': True})
 
     if args.sdc_version:
-        # We install StreamSets DataCollector using local repo /opt/cloudera/parcel-repo.
-        # Set file and folder permissions correctly.
-        commands = ['chown cloudera-scm:cloudera-scm /opt/cloudera/csd',
-                    'chown cloudera-scm:cloudera-scm /opt/cloudera/parcel-repo',
-                    'chown cloudera-scm:cloudera-scm /opt/cloudera/csd/STREAMSETS*.jar',
-                    'chmod 644 /opt/cloudera/csd/STREAMSETS*.jar',
-                    'chown cloudera-scm:cloudera-scm /opt/cloudera/parcel-repo/STREAMSETS_*']
-        primary_node.execute(' && '.join(commands))
-
         # The parcel is already present. Hence just distribute and activate it after refreshing parcel repos.
         product = 'STREAMSETS_DATACOLLECTOR'
         deployment.refresh_parcel_repos()
@@ -357,7 +351,7 @@ def cm_server_not_dead(primary_node):
 
     if args.sdc_version:
         logger.info('Configuring StreamSets Data Collector ...')
-        _configure_sdc(deployment, cluster, is_kerberos_enabled=args.kerberos)
+        _configure_sdc(deployment, cluster, args)
 
     if args.kerberos:
         logger.info('Configure Cloudera Manager for Kerberos ...')
@@ -899,7 +893,7 @@ def _setup_ssl_encryption_authentication(cluster, service):
     ]
     cluster.primary_node.execute(' && '.join(ssl_authentication_commands))
 
-def _configure_sdc(deployment, cluster, is_kerberos_enabled):
+def _configure_sdc(deployment, cluster, args):
     logger.info('Adding StreamSets service to cluster (%s) ...', DEFAULT_CLUSTER_NAME)
     datacollector_role = {'type': 'DATACOLLECTOR',
                           'hostRef': {'hostId': cluster.primary_node.host_id}}
@@ -908,18 +902,21 @@ def _configure_sdc(deployment, cluster, is_kerberos_enabled):
                                              'type': 'STREAMSETS',
                                              'displayName': 'StreamSets',
                                              'roles': [datacollector_role]}])
-    # When running an application with Spark2, the following
-    # environment variables must be set before starting StreamSets Data Collector.
-    environment_variables = {'SPARK_SUBMIT_YARN_COMMAND': '/usr/bin/spark2-submit',
-                             'SPARK_KAFKA_VERSION': '0.10'}
+    if args.spark2_version:
+        # When running an application with Spark2, the following
+        # environment variables must be set before starting StreamSets Data Collector.
+        environment_variables = {'SPARK_SUBMIT_YARN_COMMAND': '/usr/bin/spark2-submit',
+                                 'SPARK_KAFKA_VERSION': '0.10',
+                                 'SPARK_HOME': '/opt/cloudera/parcels/SPARK2/lib/spark2'}
+    else:
+        # When running an application on YARN, the Spark executor requires access to the spark-submit script located in
+        # the Spark installation directory. Default is directory specified by SPARK_HOME environment variable.
+        # Hence SPARK_HOME environment variable must be set before starting StreamSets Data Collector.
+        environment_variables = {'SPARK_HOME': '/opt/cloudera/parcels/CDH/lib/spark'}
     configs = {'sdc-env.sh_role_safety_valve': '\n'.join('export {}={}'.format(key, value)
                                                          for key, value in environment_variables.items())}
-    # When running an application on YARN, the Spark executor requires access to the spark-submit script located in
-    # the Spark installation directory. Default is directory specified by SPARK_HOME environment variable.
-    # Hence SPARK_HOME environment variable must be set before starting StreamSets Data Collector.
-    configs = {'sdc-env.sh_role_safety_valve': 'export SPARK_HOME=/opt/cloudera/parcels/CDH/lib/spark'}
 
-    if is_kerberos_enabled:
+    if args.kerberos:
         # Create JAAS config file on node-1. Needed to access kerberized Kafka.
         primary_node = cluster.primary_node
         sdc_principal = 'sdc/{kafka_node_name}@{realm}'.format(kafka_node_name=primary_node.fqdn,
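For reference, the '\n'.join expression in _configure_sdc renders the environment variables into the sdc-env.sh_role_safety_valve value as plain export lines. A minimal sketch of the spark2 branch (on pre-3.7 Python the dict iteration order is not guaranteed, so the line order may vary):

    environment_variables = {'SPARK_SUBMIT_YARN_COMMAND': '/usr/bin/spark2-submit',
                             'SPARK_KAFKA_VERSION': '0.10',
                             'SPARK_HOME': '/opt/cloudera/parcels/SPARK2/lib/spark2'}
    safety_valve = '\n'.join('export {}={}'.format(key, value)
                             for key, value in environment_variables.items())
    # safety_valve now holds:
    # export SPARK_SUBMIT_YARN_COMMAND=/usr/bin/spark2-submit
    # export SPARK_KAFKA_VERSION=0.10
    # export SPARK_HOME=/opt/cloudera/parcels/SPARK2/lib/spark2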