I'm trying to read a simple CSV file from my local file system in a Jupyter Notebook that is running a Scala kernel.
This is what I do:
import org.apache.spark.sql._
val spark = {
NotebookSparkSession.builder()
.master("local[*]")
.getOrCreate()
}
val df = spark.read
.format("csv")
.option("header", "true")
.option("mode", "DROPMALFORMED")
.load("/home/user/ml-projects/housing-classification-example-scala/data/file.csv")
df.printSchema()
This is the content of my CSV file:
id,name
1,joesan
2,sanjoe
Here is the error I get:
ERROR Execute exception in user code (Illegal pattern component: XXX)
java.lang.IllegalArgumentException: Illegal pattern component: XXX
org.apache.commons.lang3.time.FastDatePrinter.parsePattern(FastDatePrinter.java:282)
org.apache.commons.lang3.time.FastDatePrinter.init(FastDatePrinter.java:149)
org.apache.commons.lang3.time.FastDatePrinter.<init>(FastDatePrinter.java:142)
org.apache.commons.lang3.time.FastDateFormat.<init>(FastDateFormat.java:384)
org.apache.commons.lang3.time.FastDateFormat.<init>(FastDateFormat.java:369)
org.apache.commons.lang3.time.FastDateFormat$1.createInstance(FastDateFormat.java:91)
org.apache.commons.lang3.time.FastDateFormat$1.createInstance(FastDateFormat.java:88)
org.apache.commons.lang3.time.FormatCache.getInstance(FormatCache.java:82)
org.apache.commons.lang3.time.FastDateFormat.getInstance(FastDateFormat.java:165)
org.apache.spark.sql.execution.datasources.csv.CSVOptions.<init>(CSVOptions.scala:140)
org.apache.spark.sql.execution.datasources.csv.CSVOptions.<init>(CSVOptions.scala:45)
org.apache.spark.sql.execution.datasources.csv.CSVFileFormat.inferSchema(CSVFileFormat.scala:58)
org.apache.spark.sql.execution.datasources.DataSource.$anonfun$getOrInferFileFormatSchema$12(DataSource.scala:183)
scala.Option.orElse(Option.scala:447)
org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:180)
org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:373)
org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:223)
org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)
ammonite.$sess.cmd1$Helper.<init>(cmd1.sc:13)
ammonite.$sess.cmd1$.<init>(cmd1.sc:7)
ammonite.$sess.cmd1$.<clinit>(cmd1.sc)
ammonite.$sess.cmd1.$main(cmd1.sc)
java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
java.base/java.lang.reflect.Method.invoke(Method.java:566)
ammonite.runtime.Evaluator$$anon$1.$anonfun$evalMain$1(Evaluator.scala:108)
ammonite.util.Util$.withContextClassloader(Util.scala:24)
ammonite.runtime.Evaluator$$anon$1.evalMain(Evaluator.scala:90)
ammonite.runtime.Evaluator$$anon$1.$anonfun$processLine$2(Evaluator.scala:127)
ammonite.util.Catching.map(Res.scala:117)