2

This example is taken directly from the Spark example code so I'm at a bit of a loss figuring out what's going on.

import org.apache.spark.sql.*;
import org.apache.spark.sql.api.java.UDF1;
import org.apache.spark.sql.types.DataTypes;

/**
 * Minimal reproduction: registers a Java lambda as the Spark SQL UDF {@code plusOne}
 * and invokes it through SQL against a standalone master.
 */
public class TestSpark {

    /**
     * Runs a plain SQL query, registers the {@code plusOne} UDF, then runs a query that uses it.
     *
     * @param args unused
     */
    public static void main(String[] args) {

        // SparkSession is Closeable: try-with-resources stops the session even when
        // one of the queries below throws, instead of leaving it open until JVM exit.
        try (SparkSession spark = SparkSession.builder()
                .appName("testspark")
                .master("spark://0.0.0.0:7077")
                // Yes this file exists, and contains this class
                .config("spark.jars", "out/artifacts/testspark_jar/testspark.jar")
                .getOrCreate()) {

            // This works
            spark.sql("SELECT 5 + 1").show();

            // The explicit cast selects the UDF1 overload of register() for the lambda.
            spark.udf().register("plusOne", (UDF1<Integer, Integer>) x -> x + 1, DataTypes.IntegerType);

            // This fails
            spark.sql("SELECT plusOne(5)").show();
        }

    }

}

I'm running this on a Spark Standalone cluster running on localhost.

The worker consistently fails with:

Caused by: java.lang.ClassCastException: cannot assign instance of java.lang.invoke.SerializedLambda to field org.apache.spark.rdd.MapPartitionsRDD.f of type scala.Function3 in instance of org.apache.spark.rdd.MapPartitionsRDD
    at java.base/java.io.ObjectStreamClass$FieldReflector.setObjFieldValues(ObjectStreamClass.java:2205)
    at java.base/java.io.ObjectStreamClass$FieldReflector.checkObjectFieldValueTypes(ObjectStreamClass.java:2168)
    at java.base/java.io.ObjectStreamClass.checkObjFieldValueTypes(ObjectStreamClass.java:1422)
    at java.base/java.io.ObjectInputStream.defaultCheckFieldValues(ObjectInputStream.java:2450)
    at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2357)
    at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2166)
    at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1668)
    at java.base/java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2434)
    at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2328)
    at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2166)
    at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1668)
    at java.base/java.io.ObjectInputStream.readObject(ObjectInputStream.java:482)
    at java.base/java.io.ObjectInputStream.readObject(ObjectInputStream.java:440)
    at scala.collection.immutable.List$SerializationProxy.readObject(List.scala:488)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.base/java.lang.reflect.Method.invoke(Method.java:566)
    at java.base/java.io.ObjectStreamClass.invokeReadObject(ObjectStreamClass.java:1175)
    at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2295)
    at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2166)
    at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1668)
    at java.base/java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2434)
    at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2328)
    at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2166)
    at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1668)
    at java.base/java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2434)
    at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2328)
    at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2166)
    at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1668)
    at java.base/java.io.ObjectInputStream.readObject(ObjectInputStream.java:482)
    at java.base/java.io.ObjectInputStream.readObject(ObjectInputStream.java:440)
    at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:76)
    at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:115)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:83)
    at org.apache.spark.scheduler.Task.run(Task.scala:127)
    at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
    at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
    at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
    at java.base/java.lang.Thread.run(Thread.java:834)

I'm running on Java 11, using Spark 3.0.1.

I did find this very similar question which looks like it would be the answer: java.lang.ClassCastException using lambda expressions in spark job on remote server

However, after ensuring that my TestSpark is compiled into a JAR that is supplied to the SparkSession, I still get the same error.

Any help would be greatly appreciated. It seems as if something is going on at the Java/Scala border, but I don't know enough about Scala interop to analyze further.

levand
  • 8,440
  • 3
  • 41
  • 54

1 Answer

2

The answer in the question I already linked (java.lang.ClassCastException using lambda expressions in spark job on remote server) was correct.

The real cause was masked because I wasn't building the JAR correctly (I think it was something to do with the manifest, but I'm not 100% sure).

After compiling the JAR correctly (using mvn package instead of my IDE) and supplying that in the spark.jars config property, the code works as expected.

levand
  • 8,440
  • 3
  • 41
  • 54
  • Hi @levand, I'm facing a similar problem, but combined with Spring Boot. Could you please describe your solution step by step? – elgsylvain85 Jul 18 '22 at 22:52