From b12aa51067b323fb0001aa7f57d9170b643fe073 Mon Sep 17 00:00:00 2001
From: Kevin W Monroe
Date: Wed, 7 Jun 2017 16:21:50 +0000
Subject: BIGTOP-2822: spark charm: leverage puppet config, gpu enablement

Closes #239
---
 .../src/charm/spark/layer-spark/README.md          | 17 ++++-
 .../layer-spark/lib/charms/layer/bigtop_spark.py   | 89 +++++++++++-----------
 .../src/charm/spark/layer-spark/reactive/spark.py  | 61 ++++++++++++++-
 3 files changed, 116 insertions(+), 51 deletions(-)

diff --git a/bigtop-packages/src/charm/spark/layer-spark/README.md b/bigtop-packages/src/charm/spark/layer-spark/README.md
index a5ac2c0b..f7feaab7 100644
--- a/bigtop-packages/src/charm/spark/layer-spark/README.md
+++ b/bigtop-packages/src/charm/spark/layer-spark/README.md
@@ -62,7 +62,7 @@ the unit acting as master, query Zookeeper as follows:
 
     juju run --unit zookeeper/0 'echo "get /spark/master_status" | /usr/lib/zookeeper/bin/zkCli.sh'
 
-### YARN-client and YARN-cluster
+### YARN
 This charm leverages our pluggable Hadoop model with the `hadoop-plugin`
 interface. This means that this charm can be related to an Apache Hadoop
 cluster to run Spark jobs there. The suggested deployment method is to use the
@@ -224,14 +224,25 @@ Set a different value with:
 does not exceed the NodeManager maximum (defined on each nodemanager as
 `yarn.nodemanager.resource.memory-mb` in `yarn-default.xml`).
 
-## spark_bench_enabled
+## install-cuda
+Provided by `layer-nvidia-cuda`, this option controls the installation
+of NVIDIA CUDA packages if capable GPU hardware is present. When `false` (the
+default), CUDA will not be installed or configured regardless of hardware
+support. Set this to `true` to fetch and install CUDA-related packages from
+the NVIDIA developer repository.
+
+    juju config spark install-cuda=true
+
+> **Note**: This option requires external network access to
+http://developer.download.nvidia.com/. Ensure appropriate proxies are
+configured if needed.
+
+## spark_bench_enabled
 Install the SparkBench benchmarking suite. If `true` (the default), this charm
 will download spark bench from the URL specified by `spark_bench_ppc64le` or
 `spark_bench_x86_64`, depending on the unit's architecture.
 
 ## spark_execution_mode
-
 Spark has four modes of execution: local, standalone, yarn-client, and
 yarn-cluster. The default mode is `standalone` and can be changed by setting
 the `spark_execution_mode` config option.
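The first two hunks of the Python diff below collapse the yarn-client and
yarn-cluster special cases in `get_master_url()`: any `yarn*` or `local*` mode
string is now returned verbatim as the master URL. A minimal sketch of the
resulting selection logic, for reference while reading the diff (the function
name and signature here are illustrative, not the charm's):

    def master_url(mode, master_host, zk_node_ips):
        # local[*], yarn-client, yarn-cluster: the mode string is the master.
        if mode.startswith('local') or mode.startswith('yarn'):
            return mode
        # HA standalone: list every master registered in Zookeeper.
        if zk_node_ips:
            return 'spark://' + ','.join(
                '{}:7077'.format(ip) for ip in zk_node_ips)
        # Plain standalone: single master.
        return 'spark://{}:7077'.format(master_host)

    assert master_url('yarn-cluster', None, []) == 'yarn-cluster'
    assert master_url('standalone', '10.0.0.2', []) == 'spark://10.0.0.2:7077'
    assert master_url('standalone', None, ['10.0.0.3', '10.0.0.4']) == \
        'spark://10.0.0.3:7077,10.0.0.4:7077'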
diff --git a/bigtop-packages/src/charm/spark/layer-spark/lib/charms/layer/bigtop_spark.py b/bigtop-packages/src/charm/spark/layer-spark/lib/charms/layer/bigtop_spark.py
index 990c2903..91fcbf79 100644
--- a/bigtop-packages/src/charm/spark/layer-spark/lib/charms/layer/bigtop_spark.py
+++ b/bigtop-packages/src/charm/spark/layer-spark/lib/charms/layer/bigtop_spark.py
@@ -36,7 +36,7 @@ class Spark(object):
         mode = hookenv.config()['spark_execution_mode']
         zk_units = unitdata.kv().get('zookeeper.units', [])
         master = None
-        if mode.startswith('local') or mode == 'yarn-cluster':
+        if mode.startswith('local') or mode.startswith('yarn'):
             master = mode
         elif mode == 'standalone' and not zk_units:
             master = 'spark://{}:7077'.format(spark_master_host)
@@ -47,8 +47,6 @@ class Spark(object):
                 nodes.append('{}:7077'.format(ip))
             nodes_str = ','.join(nodes)
             master = 'spark://{}'.format(nodes_str)
-        elif mode.startswith('yarn'):
-            master = 'yarn-client'
         return master
 
     def install_benchmark(self):
@@ -149,7 +147,8 @@ class Spark(object):
         self.install_demo()
 
     def setup_hdfs_logs(self):
-        # create hdfs storage space for history server
+        # Create hdfs storage space for history server and return the name
+        # of the created directory.
         dc = self.dist_config
         events_dir = dc.path('spark_events')
         events_dir = 'hdfs://{}'.format(events_dir)
@@ -159,18 +158,14 @@ class Spark(object):
             events_dir)
         return events_dir
 
-    def configure(self, available_hosts, zk_units, peers):
+    def configure(self, available_hosts, zk_units, peers, extra_libs):
         """
         This is the core logic of setting up spark.
 
-        Two flags are needed:
-
-        * Namenode exists aka HDFS is ready
-        * Resource manager exists aka YARN is ready
-
-        both flags are infered from the available hosts.
-
         :param dict available_hosts: Hosts that Spark should know about.
+        :param list zk_units: List of Zookeeper dicts with host/port info.
+        :param list peers: List of Spark peer tuples (unit name, IP).
+        :param list extra_libs: List of extra lib paths for driver/executors.
         """
         # Bootstrap spark
         if not unitdata.kv().get('spark.bootstrapped', False):
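The next hunk moves the driver/executor memory tuning ahead of the Puppet
override computation so the resolved values can be handed to Bigtop instead of
being patched into spark-env.sh afterwards. A self-contained sketch of the
percentage-to-megabytes conversion it performs; `total_ram_bytes` stands in
for charmhelpers' `host.get_total_ram()` and `local_mode` for the
`standalone`-or-`local*` check (both names are mine, not the charm's):

    def resolve_mem(requested, total_ram_bytes, local_mode):
        """Resolve a '50%' or '2g' style request to a Spark memory setting."""
        if not requested.endswith('%'):
            return requested  # already absolute, e.g. '1g' or '2048m'
        if not local_mode:
            return '1g'  # a percentage of local RAM is meaningless on YARN
        mem_mb = total_ram_bytes / 1024 / 1024
        return str(int(mem_mb * float(requested.strip('%')) / 100)) + 'm'

    assert resolve_mem('50%', 8 * 1024**3, True) == '4096m'
    assert resolve_mem('2g', 8 * 1024**3, True) == '2g'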
@@ -188,6 +183,33 @@ class Spark(object):
         mode = hookenv.config()['spark_execution_mode']
         master_ip = utils.resolve_private_address(available_hosts['spark-master'])
         master_url = self.get_master_url(master_ip)
+        req_driver_mem = hookenv.config()['driver_memory']
+        req_executor_mem = hookenv.config()['executor_memory']
+
+        # handle tuning options that may be set as percentages
+        driver_mem = '1g'
+        executor_mem = '1g'
+        if req_driver_mem.endswith('%'):
+            if mode == 'standalone' or mode.startswith('local'):
+                mem_mb = host.get_total_ram() / 1024 / 1024
+                req_percentage = float(req_driver_mem.strip('%')) / 100
+                driver_mem = str(int(mem_mb * req_percentage)) + 'm'
+            else:
+                hookenv.log("driver_memory percentage in non-local mode. Using 1g default.",
+                            level=None)
+        else:
+            driver_mem = req_driver_mem
+
+        if req_executor_mem.endswith('%'):
+            if mode == 'standalone' or mode.startswith('local'):
+                mem_mb = host.get_total_ram() / 1024 / 1024
+                req_percentage = float(req_executor_mem.strip('%')) / 100
+                executor_mem = str(int(mem_mb * req_percentage)) + 'm'
+            else:
+                hookenv.log("executor_memory percentage in non-local mode. Using 1g default.",
+                            level=None)
+        else:
+            executor_mem = req_executor_mem
 
         # Setup hosts dict
         hosts = {
             'spark': master_ip,
         }
@@ -196,7 +218,11 @@ class Spark(object):
         if 'namenode' in available_hosts:
             hosts['namenode'] = available_hosts['namenode']
             events_log_dir = self.setup_hdfs_logs()
-
+        else:
+            # Bigtop includes a default hadoop_head_node if we do not specify
+            # any namenode info. To ensure spark standalone doesn't get
+            # invalid hadoop config, set our NN to an empty string.
+            hosts['namenode'] = ''
 
         if 'resourcemanager' in available_hosts:
             hosts['resourcemanager'] = available_hosts['resourcemanager']
@@ -215,6 +241,10 @@ class Spark(object):
             'spark::common::master_url': master_url,
             'spark::common::event_log_dir': events_log_dir,
             'spark::common::history_log_dir': events_log_dir,
+            'spark::common::extra_lib_dirs':
+                ':'.join(extra_libs) if extra_libs else None,
+            'spark::common::driver_mem': driver_mem,
+            'spark::common::executor_mem': executor_mem,
         }
         if zk_units:
             zks = []
@@ -242,39 +272,6 @@ class Spark(object):
 
         self.patch_worker_master_url(master_ip, master_url)
 
-        # handle tuning options that may be set as percentages
-        driver_mem = '1g'
-        req_driver_mem = hookenv.config()['driver_memory']
-        executor_mem = '1g'
-        req_executor_mem = hookenv.config()['executor_memory']
-        if req_driver_mem.endswith('%'):
-            if mode == 'standalone' or mode.startswith('local'):
-                mem_mb = host.get_total_ram() / 1024 / 1024
-                req_percentage = float(req_driver_mem.strip('%')) / 100
-                driver_mem = str(int(mem_mb * req_percentage)) + 'm'
-            else:
-                hookenv.log("driver_memory percentage in non-local mode. Using 1g default.",
-                            level=None)
-        else:
-            driver_mem = req_driver_mem
-
-        if req_executor_mem.endswith('%'):
-            if mode == 'standalone' or mode.startswith('local'):
-                mem_mb = host.get_total_ram() / 1024 / 1024
-                req_percentage = float(req_executor_mem.strip('%')) / 100
-                executor_mem = str(int(mem_mb * req_percentage)) + 'm'
-            else:
-                hookenv.log("executor_memory percentage in non-local mode. Using 1g default.",
-                            level=None)
-        else:
-            executor_mem = req_executor_mem
-
-        spark_env = '/etc/spark/conf/spark-env.sh'
-        utils.re_edit_in_place(spark_env, {
-            r'.*SPARK_DRIVER_MEMORY.*': 'export SPARK_DRIVER_MEMORY={}'.format(driver_mem),
-            r'.*SPARK_EXECUTOR_MEMORY.*': 'export SPARK_EXECUTOR_MEMORY={}'.format(executor_mem),
-        }, append_non_matches=True)
-
         # Install SB (subsequent calls will reconfigure existing install)
         # SparkBench looks for the spark master in /etc/environment
         with utils.environment_edit_in_place('/etc/environment') as env:

diff --git a/bigtop-packages/src/charm/spark/layer-spark/reactive/spark.py b/bigtop-packages/src/charm/spark/layer-spark/reactive/spark.py
index b6b0ca7a..fc74fa10 100644
--- a/bigtop-packages/src/charm/spark/layer-spark/reactive/spark.py
+++ b/bigtop-packages/src/charm/spark/layer-spark/reactive/spark.py
@@ -38,6 +38,9 @@ def report_status():
     elif mode == 'standalone' and is_state('leadership.is_leader'):
         mode = mode + " - master"
 
+    if is_state('spark.cuda.configured'):
+        mode = mode + " with CUDA"
+
     if is_state('spark.started'):
         hookenv.status_set('active', 'ready ({})'.format(mode))
     else:
@@ -74,8 +77,15 @@ def install_spark_standalone(zks, peers):
         hookenv.log("Waiting 2m to ensure zk ensemble has settled: {}".format(zks))
         time.sleep(120)
 
+    # Let spark know if we have cuda libs installed.
+    # NB: spark packages prereq hadoop (boo), so even in standalone mode, we'll
+    # have hadoop libs installed. May as well include them in our lib path.
+    extra_libs = ["/usr/lib/hadoop/lib/native"]
+    if is_state('cuda.installed'):
+        extra_libs.append("/usr/local/cuda/lib64")
+
     spark = Spark()
-    spark.configure(hosts, zks, peers)
+    spark.configure(hosts, zk_units=zks, peers=peers, extra_libs=extra_libs)
     set_deployment_mode_state('spark.standalone.installed')
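The `extra_libs` list above is built the same way in the YARN handler below;
`Spark.configure()` joins it into the colon-separated string handed to Puppet
as `spark::common::extra_lib_dirs` (see the bigtop_spark.py hunk earlier in
this patch). A quick illustration of the value Puppet receives when CUDA is
present; `cuda_installed` is a stand-in for `is_state('cuda.installed')`:

    extra_libs = ['/usr/lib/hadoop/lib/native']
    cuda_installed = True  # stand-in for is_state('cuda.installed')
    if cuda_installed:
        extra_libs.append('/usr/local/cuda/lib64')

    # What spark::common::extra_lib_dirs ends up holding:
    print(':'.join(extra_libs) if extra_libs else None)
    # -> /usr/lib/hadoop/lib/native:/usr/local/cuda/lib64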
@@ -98,8 +108,13 @@ def install_spark_yarn():
     nns = hadoop.namenodes()
     hosts['namenode'] = nns[0]
 
+    # Always include native hadoop libs in yarn mode; add cuda libs if present.
+    extra_libs = ["/usr/lib/hadoop/lib/native"]
+    if is_state('cuda.installed'):
+        extra_libs.append("/usr/local/cuda/lib64")
+
     spark = Spark()
-    spark.configure(hosts, zk_units=None, peers=None)
+    spark.configure(hosts, zk_units=None, peers=None, extra_libs=extra_libs)
     set_deployment_mode_state('spark.yarn.installed')
@@ -160,6 +175,7 @@ def reinstall_spark():
     # If neither config nor our matrix is changing, there is nothing to do.
     if not (is_state('config.changed') or
             data_changed('deployment_matrix', deployment_matrix)):
+        report_status()
         return
 
     # (Re)install based on our execution mode
@@ -197,6 +213,47 @@ def leader_elected():
     set_state("master.elected")
 
 
+@when('spark.started', 'cuda.installed')
+@when_not('spark.cuda.configured')
+def configure_cuda():
+    """
+    Ensure cuda bits are configured.
+
+    We can't be sure that the config.changed handler in the nvidia-cuda
+    layer will fire before the handler in this layer. We might call
+    reinstall_spark on config-changed before the cuda.installed state is set,
+    thereby missing the cuda lib path configuration. Deal with this by
+    explicitly calling reinstall_spark after we *know* cuda.installed is set.
+    This may result in 2 calls to reinstall_spark when cuda-related config
+    changes, but it ensures our spark lib config is accurate.
+    """
+    hookenv.log("Configuring spark with CUDA library paths")
+    reinstall_spark()
+    set_state('spark.cuda.configured')
+    report_status()
+
+
+@when('spark.started', 'spark.cuda.configured')
+@when_not('cuda.installed')
+def unconfigure_cuda():
+    """
+    Ensure cuda bits are unconfigured.
+
+    Similar to the configure_cuda method, we can't be sure that the
+    config.changed handler in the nvidia-cuda layer will fire before the
+    handler in this layer. We might call reinstall_spark on config-changed
+    before the cuda.installed state is removed, thereby configuring spark with
+    a cuda lib path when the user wanted cuda config removed. Deal with this by
+    explicitly calling reinstall_spark after we *know* cuda.installed is gone.
+    This may result in 2 calls to reinstall_spark when cuda-related config
+    changes, but it ensures our spark lib config is accurate.
+    """
+    hookenv.log("Removing CUDA library paths from spark configuration")
+    reinstall_spark()
+    remove_state('spark.cuda.configured')
+    report_status()
+
+
 @when('spark.started', 'client.joined')
 def client_present(client):
     if is_state('leadership.is_leader'):
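Taken together, the `configure_cuda` and `unconfigure_cuda` handlers form a
small reconciliation loop around the `cuda.installed` flag owned by
layer-nvidia-cuda. A rough model of the dispatch they implement (simplified;
the real dispatch is driven by charms.reactive, and the function below is
only illustrative):

    def next_action(spark_started, cuda_installed, cuda_configured):
        if spark_started and cuda_installed and not cuda_configured:
            return 'configure_cuda'    # reinstall, then set spark.cuda.configured
        if spark_started and cuda_configured and not cuda_installed:
            return 'unconfigure_cuda'  # reinstall, then drop spark.cuda.configured
        return None  # states agree; nothing to do

    assert next_action(True, True, False) == 'configure_cuda'
    assert next_action(True, False, True) == 'unconfigure_cuda'
    assert next_action(True, True, True) is None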