I have a 3.25-day job that failed when writing the dataframe to DBFS. Update: I have now had a second long job (also 3+ days) fail in the same manner. I can use this cluster for smaller jobs with no issues; it looks like only long-running jobs are affected.
java.io.IOException: Server failed to authenticate the request. Make sure the value of Authorization header is formed correctly including the signature. Please see the cause for further information.
at hadoop_azure_shaded.com.microsoft.azure.storage.core.Utility.initIOException(Utility.java:769)
It looks like Spark wasn't able to authenticate to the storage account Azure Databricks uses for the DBFS root. My notebook writes the dataframe to DBFS and then uploads the results to ADLS in a later stage. The run is failing before it gets to that part (as shown by the stack trace and by the lack of any trace message indicating it reached the upload stage).
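For context, the upload stage that never runs looks roughly like this (a simplified sketch; the ADLS container, account, and target path are illustrative placeholders, not my real ones):

# Stage 2: copy the staged results from the DBFS root up to ADLS.
# The run dies in stage 1 (the write to /data/Directory), so this never executes.
for f in dbutils.fs.ls("dbfs:/data/Directory/"):
    dbutils.fs.cp(f.path, "abfss://mycontainer@myaccount.dfs.core.windows.net/results/")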
Shorter jobs on the same cluster run just fine, so it doesn't appear to be a fundamental configuration problem.
Does anyone know what happened here and how I can make this more robust? I'd like to avoid losing all data from the run next time.
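One mitigation I'm considering is wrapping the write in a simple retry loop, so a transient auth failure doesn't sink a multi-day run. A minimal sketch (the helper name, retry count, and backoff are my own placeholders, not anything Databricks prescribes):

import time

def write_with_retry(df, path, max_attempts=3, backoff_seconds=60):
    # Retry the JSON write a few times; if the failure is a transient
    # credential/signature hiccup, a fresh attempt may succeed.
    for attempt in range(1, max_attempts + 1):
        try:
            df.coalesce(1).write.mode("overwrite").json(path)
            return
        except Exception as e:
            if attempt == max_attempts:
                raise
            print(f"Write attempt {attempt} failed: {e}; retrying in {backoff_seconds}s")
            time.sleep(backoff_seconds)

write_with_retry(app_details_json_dataframe2, "/data/Directory")

I don't know whether retrying would actually help if the credential for the DBFS root has gone stale partway through the run, which is part of what I'm asking.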
Here is the error trace:
Py4JJavaError Traceback (most recent call last)
<command-920141663601998> in <module>
121 app_details_json_dataframe2.show( truncate=100 )
122
--> 123 app_details_json_dataframe2.coalesce(1).write.mode("overwrite").format("delta").option("header","true").option("compression", "gzip").json(path="/data/Directory")
124
125 for f in dbutils.fs.ls("dbfs:/data/Directory/"):
/databricks/spark/python/pyspark/sql/readwriter.py in json(self, path, mode, compression, dateFormat, timestampFormat, lineSep, encoding, ignoreNullFields)
844 compression=compression, dateFormat=dateFormat, timestampFormat=timestampFormat,
845 lineSep=lineSep, encoding=encoding, ignoreNullFields=ignoreNullFields)
--> 846 self._jwrite.json(path)
847
848 def parquet(self, path, mode=None, partitionBy=None, compression=None):
/databricks/spark/python/lib/py4j-0.10.9.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
1302
1303 answer = self.gateway_client.send_command(command)
-> 1304 return_value = get_return_value(
1305 answer, self.gateway_client, self.target_id, self.name)
1306
/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
115 def deco(*a, **kw):
116 try:
--> 117 return f(*a, **kw)
118 except py4j.protocol.Py4JJavaError as e:
119 converted = convert_exception(e.java_exception)
/databricks/spark/python/lib/py4j-0.10.9.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
And this is the exception I get:
Py4JJavaError: An error occurred while calling o1085.json.
: org.apache.spark.SparkException: Job aborted.
at org.apache.spark.sql.errors.QueryExecutionErrors$.jobAbortedError(QueryExecutionErrors.scala:603)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:359)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:198)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:126)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:124)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:138)
at org.apache.spark.sql.execution.QueryExecution$$anonfun$$nestedInanonfun$eagerlyExecuteCommands$1$1.$anonfun$applyOrElse$1(QueryExecution.scala:160)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$8(SQLExecution.scala:209)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:356)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$1(SQLExecution.scala:160)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:958)
at org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:115)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:306)
at org.apache.spark.sql.execution.QueryExecution$$anonfun$$nestedInanonfun$eagerlyExecuteCommands$1$1.applyOrElse(QueryExecution.scala:160)
at org.apache.spark.sql.execution.QueryExecution$$anonfun$$nestedInanonfun$eagerlyExecuteCommands$1$1.applyOrElse(QueryExecution.scala:156)
at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:492)
at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:94)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:492)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:268)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:264)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:468)
at org.apache.spark.sql.execution.QueryExecution.$anonfun$eagerlyExecuteCommands$1(QueryExecution.scala:156)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.allowInvokingTransformsInAnalyzer(AnalysisHelper.scala:324)
at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:156)
at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:141)
at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:132)
at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:183)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:959)
at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:427)
at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:396)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:250)
at org.apache.spark.sql.DataFrameWriter.json(DataFrameWriter.scala:874)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380)
at py4j.Gateway.invoke(Gateway.java:295)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:251)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 27.0 failed 4 times, most recent failure: Lost task 0.3 in stage 27.0 (TID 58) (10.139.64.4 executor driver): org.apache.spark.SparkException: Task failed while writing rows.
at org.apache.spark.sql.errors.QueryExecutionErrors$.taskFailedWhileWritingRowsError(QueryExecutionErrors.scala:607)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:456)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$write$20(FileFormatWriter.scala:335)
at org.apache.spark.scheduler.ResultTask.$anonfun$runTask$3(ResultTask.scala:75)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.scheduler.ResultTask.$anonfun$runTask$1(ResultTask.scala:75)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:55)
at org.apache.spark.scheduler.Task.doRunTask(Task.scala:156)
at org.apache.spark.scheduler.Task.$anonfun$run$1(Task.scala:125)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.scheduler.Task.run(Task.scala:95)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$13(Executor.scala:826)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1670)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:829)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:684)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.IOException: Server failed to authenticate the request. Make sure the value of Authorization header is formed correctly including the signature. Please see the cause for further information.
at hadoop_azure_shaded.com.microsoft.azure.storage.core.Utility.initIOException(Utility.java:769)
at hadoop_azure_shaded.com.microsoft.azure.storage.blob.BlobOutputStreamInternal.writeBlock(BlobOutputStreamInternal.java:475)
at hadoop_azure_shaded.com.microsoft.azure.storage.blob.BlobOutputStreamInternal.access$000(BlobOutputStreamInternal.java:47)
at hadoop_azure_shaded.com.microsoft.azure.storage.blob.BlobOutputStreamInternal$1.call(BlobOutputStreamInternal.java:419)
at hadoop_azure_shaded.com.microsoft.azure.storage.blob.BlobOutputStreamInternal$1.call(BlobOutputStreamInternal.java:416)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
... 3 more
Suppressed: org.apache.spark.util.TaskCompletionListenerException: Self-suppression not permitted