spark-shell --packages com.databricks:spark-csv_2.11:1.2.0

1. Using SQLContext
~~~~~~~~~~~~~~~~~~~~
import org.apache.spark.sql.SQLContext
val sqlctx = new SQLContext(sc)
import sqlctx._
val df = sqlctx.read.format("com.databricks.spark.csv").option("inferSchema","true").option("delimiter",";").option("header","true").load("/user/cloudera/data.csv")
df.select(avg($"col1")).show() // this works fine
sqlctx.sql("select percentile_approx(balance,0.5) as median from port_bank_table").show() or sqlctx.sql("select percentile(balance,0.5) as median from port_bank_table").show() // neither works; both fail with the error below
org.apache.spark.sql.AnalysisException: undefined function percentile_approx; line 0 pos 0 at org.apache.spark.sql.catalyst.analysis.SimpleFunctionRegistry$$anonfun$2.apply(FunctionRegistry.scala:65) at org.apache.spark.sql.catalyst.analysis.SimpleFunctionRegistry$$anonfun$2.apply(FunctionRegistry.scala:65)
2. Using HiveContext
~~~~~~~~~~~~~~~~~~~~
Since `percentile_approx` and `percentile` are Hive UDAFs (not available through a plain SQLContext), I then tried using a HiveContext:

scala> import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.hive.HiveContext
scala> val hivectx = new HiveContext(sc) 18/01/09 22:51:06 WARN metastore.ObjectStore: Failed to get database default, returning NoSuchObjectException hivectx: org.apache.spark.sql.hive.HiveContext = org.apache.spark.sql.hive.HiveContext@5be91161
scala> import hivectx._ import hivectx._
This time I get the following error:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Failed to start database 'metastore_db' with class loader org.apache.spark.sql.hive.client.IsolatedClientLoader$$anon$1@be453c4,
see the next exception for details.