Commit 45f8053b authored by Marcelo Vanzin, committed by Josh Rosen

[SPARK-13578][CORE] Modify launch scripts to not use assemblies.

Instead of looking for a specially-named assembly, the scripts will now
blindly add all jars under the libs directory to the classpath. This
libs directory is currently still the old assembly dir, so things should
keep working the same way as before until we make more packaging changes.
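To make the mechanism concrete: a classpath entry ending in "*" is expanded by the JVM itself to every .jar file in that single directory (not recursively), which is what lets the scripts add the whole directory as one entry. The following minimal Java sketch mimics that expansion so you can see what an entry like "$SPARK_JARS_DIR/*" picks up; the default path /opt/spark/lib is a made-up placeholder, not a value from this commit:

    import java.io.File;

    // Lists the jars a classpath entry like "<dir>/*" would match: the JVM
    // expands a trailing "*" to every file named *.jar in that one directory.
    public class WildcardClasspathDemo {
      public static void main(String[] args) {
        // Hypothetical install location; pass a real directory as the first argument.
        File dir = new File(args.length > 0 ? args[0] : "/opt/spark/lib");
        File[] jars = dir.listFiles(f -> f.isFile() && f.getName().endsWith(".jar"));
        if (jars == null) {
          System.err.println("Not a directory: " + dir);
          return;
        }
        for (File jar : jars) {
          System.out.println(jar.getAbsolutePath());
        }
      }
    }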

The only feature lost is the detection of multiple assemblies; I consider
that a minor nicety that only really affects a few developers, so it's
probably ok.

Tested locally by running spark-shell; also did some minor Win32 testing
(just made sure spark-shell started).

Author: Marcelo Vanzin <vanzin@cloudera.com>

Closes #11591 from vanzin/SPARK-13578.
Parent 9a87afd7
bin/spark-class
@@ -35,42 +35,27 @@ else
   fi
 fi
 
-# Find assembly jar
-SPARK_ASSEMBLY_JAR=
+# Find Spark jars.
+# TODO: change the directory name when Spark jars move from "lib".
 if [ -f "${SPARK_HOME}/RELEASE" ]; then
-  ASSEMBLY_DIR="${SPARK_HOME}/lib"
+  SPARK_JARS_DIR="${SPARK_HOME}/lib"
 else
-  ASSEMBLY_DIR="${SPARK_HOME}/assembly/target/scala-$SPARK_SCALA_VERSION"
+  SPARK_JARS_DIR="${SPARK_HOME}/assembly/target/scala-$SPARK_SCALA_VERSION"
 fi
 
-GREP_OPTIONS=
-num_jars="$(ls -1 "$ASSEMBLY_DIR" | grep "^spark-assembly.*hadoop.*\.jar$" | wc -l)"
-if [ "$num_jars" -eq "0" -a -z "$SPARK_ASSEMBLY_JAR" -a "$SPARK_PREPEND_CLASSES" != "1" ]; then
-  echo "Failed to find Spark assembly in $ASSEMBLY_DIR." 1>&2
+if [ ! -d "$SPARK_JARS_DIR" ]; then
+  echo "Failed to find Spark jars directory ($SPARK_JARS_DIR)." 1>&2
   echo "You need to build Spark before running this program." 1>&2
   exit 1
 fi
-if [ -d "$ASSEMBLY_DIR" ]; then
-  ASSEMBLY_JARS="$(ls -1 "$ASSEMBLY_DIR" | grep "^spark-assembly.*hadoop.*\.jar$" || true)"
-  if [ "$num_jars" -gt "1" ]; then
-    echo "Found multiple Spark assembly jars in $ASSEMBLY_DIR:" 1>&2
-    echo "$ASSEMBLY_JARS" 1>&2
-    echo "Please remove all but one jar." 1>&2
-    exit 1
-  fi
-fi
-
-SPARK_ASSEMBLY_JAR="${ASSEMBLY_DIR}/${ASSEMBLY_JARS}"
 
-LAUNCH_CLASSPATH="$SPARK_ASSEMBLY_JAR"
+LAUNCH_CLASSPATH="$SPARK_JARS_DIR/*"
 
 # Add the launcher build dir to the classpath if requested.
 if [ -n "$SPARK_PREPEND_CLASSES" ]; then
   LAUNCH_CLASSPATH="${SPARK_HOME}/launcher/target/scala-$SPARK_SCALA_VERSION/classes:$LAUNCH_CLASSPATH"
 fi
 
-export _SPARK_ASSEMBLY="$SPARK_ASSEMBLY_JAR"
-
 # For tests
 if [[ -n "$SPARK_TESTING" ]]; then
   unset YARN_CONF_DIR
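A note on the new LAUNCH_CLASSPATH value above: the asterisk in "$SPARK_JARS_DIR/*" must reach the JVM literally (hence the double quotes), because it is the JVM, not the shell, that expands classpath wildcards. A minimal Java sketch of the same launch shape using ProcessBuilder, which never involves a shell; the install path is a hypothetical placeholder, and the main class is the launcher entry point bin/spark-class ultimately invokes (not shown in this hunk):

    import java.io.IOException;
    import java.util.Arrays;
    import java.util.List;

    // Builds a launch command with a single wildcard classpath entry.
    // ProcessBuilder passes each argument verbatim -- no shell, no globbing --
    // so the "*" arrives at the JVM intact and is expanded there.
    public class LaunchSketch {
      public static void main(String[] args) throws IOException, InterruptedException {
        String jarsDir = "/opt/spark/lib"; // hypothetical SPARK_JARS_DIR
        List<String> cmd = Arrays.asList(
            "java", "-cp", jarsDir + "/*",
            "org.apache.spark.launcher.Main"); // entry point spark-class runs
        Process p = new ProcessBuilder(cmd).inheritIO().start();
        System.exit(p.waitFor());
      }
    }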
bin/spark-class2.cmd
@@ -28,33 +28,27 @@ if "x%1"=="x" (
   exit /b 1
 )
 
-rem Find assembly jar
-set SPARK_ASSEMBLY_JAR=0
-
+rem Find Spark jars.
+rem TODO: change the directory name when Spark jars move from "lib".
 if exist "%SPARK_HOME%\RELEASE" (
-  set ASSEMBLY_DIR="%SPARK_HOME%\lib"
+  set SPARK_JARS_DIR="%SPARK_HOME%\lib"
 ) else (
-  set ASSEMBLY_DIR="%SPARK_HOME%\assembly\target\scala-%SPARK_SCALA_VERSION%"
+  set SPARK_JARS_DIR="%SPARK_HOME%\assembly\target\scala-%SPARK_SCALA_VERSION%"
 )
 
-for %%d in (%ASSEMBLY_DIR%\spark-assembly*hadoop*.jar) do (
-  set SPARK_ASSEMBLY_JAR=%%d
-)
-if "%SPARK_ASSEMBLY_JAR%"=="0" (
+if not exist "%SPARK_JARS_DIR%"\ (
   echo Failed to find Spark assembly JAR.
   echo You need to build Spark before running this program.
   exit /b 1
 )
 
-set LAUNCH_CLASSPATH=%SPARK_ASSEMBLY_JAR%
+set LAUNCH_CLASSPATH=%SPARK_JARS_DIR%\*
 
 rem Add the launcher build dir to the classpath if requested.
 if not "x%SPARK_PREPEND_CLASSES%"=="x" (
   set LAUNCH_CLASSPATH="%SPARK_HOME%\launcher\target\scala-%SPARK_SCALA_VERSION%\classes;%LAUNCH_CLASSPATH%"
 )
 
-set _SPARK_ASSEMBLY=%SPARK_ASSEMBLY_JAR%
-
 rem Figure out where java is.
 set RUNNER=java
 if not "x%JAVA_HOME%"=="x" set RUNNER=%JAVA_HOME%\bin\java
launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java
@@ -19,7 +19,6 @@ package org.apache.spark.launcher;
 
 import java.io.BufferedReader;
 import java.io.File;
-import java.io.FileFilter;
 import java.io.FileInputStream;
 import java.io.InputStreamReader;
 import java.io.IOException;
@@ -172,21 +171,13 @@ abstract class AbstractCommandBuilder {
       addToClassPath(cp, String.format("%s/core/target/jars/*", sparkHome));
     }
 
-    // We can't rely on the ENV_SPARK_ASSEMBLY variable to be set. Certain situations, such as
-    // when running unit tests, or user code that embeds Spark and creates a SparkContext
-    // with a local or local-cluster master, will cause this code to be called from an
-    // environment where that env variable is not guaranteed to exist.
-    //
-    // For the testing case, we rely on the test code to set and propagate the test classpath
-    // appropriately.
-    //
-    // For the user code case, we fall back to looking for the Spark assembly under SPARK_HOME.
-    // That duplicates some of the code in the shell scripts that look for the assembly, though.
-    String assembly = getenv(ENV_SPARK_ASSEMBLY);
-    if (assembly == null && !isTesting) {
-      assembly = findAssembly();
+    // Add Spark jars to the classpath. For the testing case, we rely on the test code to set and
+    // propagate the test classpath appropriately. For normal invocation, look for the jars
+    // directory under SPARK_HOME.
+    String jarsDir = findJarsDir(!isTesting);
+    if (jarsDir != null) {
+      addToClassPath(cp, join(File.separator, jarsDir, "*"));
     }
-    addToClassPath(cp, assembly);
 
     // Datanucleus jars must be included on the classpath. Datanucleus jars do not work if only
     // included in the uber jar as plugin.xml metadata is lost. Both sbt and maven will populate
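The join call above is a small CommandBuilderUtils helper that is not part of this diff. As a hedged stand-in, assuming only that it concatenates its arguments with the given separator, the step reduces to turning the jars directory into a wildcard classpath entry:

    import java.io.File;

    // Stand-in for the join helper as used above: concatenates the parts with
    // the separator, e.g. "/opt/spark/lib" + "/" + "*" -> "/opt/spark/lib/*".
    public class JoinSketch {
      static String join(String sep, String... parts) {
        StringBuilder sb = new StringBuilder();
        for (String p : parts) {
          if (sb.length() > 0) sb.append(sep);
          sb.append(p);
        }
        return sb.toString();
      }

      public static void main(String[] args) {
        // Prints "/opt/spark/lib/*" on Unix-like systems, where File.separator is "/".
        System.out.println(join(File.separator, "/opt/spark/lib", "*"));
      }
    }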
@@ -320,28 +311,25 @@ abstract class AbstractCommandBuilder {
     return props;
   }
 
-  private String findAssembly() {
+  private String findJarsDir(boolean failIfNotFound) {
+    // TODO: change to the correct directory once the assembly build is changed.
     String sparkHome = getSparkHome();
     File libdir;
     if (new File(sparkHome, "RELEASE").isFile()) {
       libdir = new File(sparkHome, "lib");
-      checkState(libdir.isDirectory(), "Library directory '%s' does not exist.",
-        libdir.getAbsolutePath());
+      checkState(!failIfNotFound || libdir.isDirectory(),
+        "Library directory '%s' does not exist.",
+        libdir.getAbsolutePath());
     } else {
       libdir = new File(sparkHome, String.format("assembly/target/scala-%s", getScalaVersion()));
-    }
-
-    final Pattern re = Pattern.compile("spark-assembly.*hadoop.*\\.jar");
-    FileFilter filter = new FileFilter() {
-      @Override
-      public boolean accept(File file) {
-        return file.isFile() && re.matcher(file.getName()).matches();
+      if (!libdir.isDirectory()) {
+        checkState(!failIfNotFound,
+          "Library directory '%s' does not exist; make sure Spark is built.",
+          libdir.getAbsolutePath());
+        libdir = null;
       }
-    };
-    File[] assemblies = libdir.listFiles(filter);
-    checkState(assemblies != null && assemblies.length > 0, "No assemblies found in '%s'.", libdir);
-    checkState(assemblies.length == 1, "Multiple assemblies found in '%s'.", libdir);
-    return assemblies[0].getAbsolutePath();
+    }
+    return libdir != null ? libdir.getAbsolutePath() : null;
   }
 
   private String getConfDir() {
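The failIfNotFound flag gives findJarsDir two behaviors: under test (isTesting, so the argument is false) a missing directory yields null and the caller simply skips the classpath entry, while a normal invocation fails fast. A self-contained sketch of that contract, with a local checkState helper standing in for the CommandBuilderUtils method the real code uses, and the RELEASE/dev-layout branching simplified away:

    import java.io.File;

    // Simplified model of findJarsDir's contract: return the directory path,
    // return null, or throw, depending on failIfNotFound.
    public class FindJarsDirSketch {
      static void checkState(boolean ok, String msg, Object... args) {
        if (!ok) throw new IllegalStateException(String.format(msg, args));
      }

      static String findJarsDir(String sparkHome, boolean failIfNotFound) {
        File libdir = new File(sparkHome, "lib"); // simplified: release layout only
        if (!libdir.isDirectory()) {
          checkState(!failIfNotFound,
            "Library directory '%s' does not exist.", libdir.getAbsolutePath());
          return null;
        }
        return libdir.getAbsolutePath();
      }

      public static void main(String[] args) {
        System.out.println(findJarsDir("/nonexistent", false)); // prints "null"
        try {
          findJarsDir("/nonexistent", true); // throws
        } catch (IllegalStateException e) {
          System.out.println("threw: " + e.getMessage());
        }
      }
    }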
launcher/src/main/java/org/apache/spark/launcher/CommandBuilderUtils.java
@@ -30,7 +30,6 @@ class CommandBuilderUtils {
   static final String DEFAULT_MEM = "1g";
   static final String DEFAULT_PROPERTIES_FILE = "spark-defaults.conf";
   static final String ENV_SPARK_HOME = "SPARK_HOME";
-  static final String ENV_SPARK_ASSEMBLY = "_SPARK_ASSEMBLY";
 
   /** The set of known JVM vendors. */
   static enum JavaVendor {
launcher/src/test/java/org/apache/spark/launcher/SparkSubmitCommandBuilderSuite.java
@@ -282,7 +282,6 @@ public class SparkSubmitCommandBuilderSuite extends BaseSuite {
 
   private SparkSubmitCommandBuilder newCommandBuilder(List<String> args) {
     SparkSubmitCommandBuilder builder = new SparkSubmitCommandBuilder(args);
     builder.childEnv.put(CommandBuilderUtils.ENV_SPARK_HOME, System.getProperty("spark.test.home"));
-    builder.childEnv.put(CommandBuilderUtils.ENV_SPARK_ASSEMBLY, "dummy");
     return builder;
   }