I get a `UnicodeEncodeError` when I try to display a Spark DataFrame in the terminal using this code:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType
import pyspark
from elasticsearch import Elasticsearch
from elasticsearch.exceptions import NotFoundError
# SparkSession is referenced below but was never imported by this script.
from pyspark.sql import SparkSession

### Creating Spark Session
spark = (
    SparkSession.builder
    .appName("test")
    .config("spark.executor.heartbeatInterval", "60s")
    .getOrCreate()
)
# Use UTC as the Spark SQL session time zone and limit Spark logging to errors.
spark.conf.set('spark.sql.session.timeZone', 'UTC')
spark.sparkContext.setLogLevel("ERROR")

# Elasticsearch host and port, passed to the Spark reader as strings.
es_server_ip = "elasticsearch"
es_server_port = "9200"
# Direct client connection to the same Elasticsearch node (plain HTTP).
es_conn = Elasticsearch("http://user:password@elasticsearch:9200",use_ssl=False,verify_certs=True)
#function to read dataframe from Elastic Search index
def readFromES(esIndex,esQuery):
    """Load an Elasticsearch index into a Spark DataFrame.

    Args:
        esIndex: Index name passed to the reader's ``load`` call.
        esQuery: Query string supplied as the ``es.query`` option.

    Returns:
        The DataFrame produced by the ``org.elasticsearch.spark.sql`` reader.
    """
    # Reader options, applied in the order listed.
    settings = [
        ("es.nodes", es_server_ip),
        ("es.port", es_server_port),
        ("es.net.http.auth.user", "user"),
        ("es.net.http.auth.pass", "password"),
        ("es.net.ssl", "false"),
        ("es.net.ssl.cert.allow.self.signed", "true"),
        ("es.read.metadata", "false"),
        ("es.mapping.date.rich", "false"),
        ("es.query", esQuery),
    ]
    reader = spark.read.format("org.elasticsearch.spark.sql")
    for key, value in settings:
        reader = reader.option(key, value)
    return reader.load(esIndex)
# Elasticsearch query body: a match_all query.
q_ci = """{
"query": {
"match_all": {}
}
}"""
# Read the "test_delete" index into df1 and display it without truncating
# column values.
df1 = readFromES(esIndex="test_delete", esQuery=q_ci)
df1.show(truncate=False)
Error:
df1.show(truncate=False)
File "/opt/spark/python/lib/pyspark.zip/pyspark/sql/dataframe.py", line 382, in show UnicodeEncodeError: 'ascii' codec can't encode character '\ufffd' in position 124: ordinal not in range(128)
I need the output to look like this:
+--------------------+------+-----+
|hostname |kpi |value|
+--------------------+------+-----+
|host4 |cpu |95 |
|host3 |disk |90 |
|Apr�ngli |cpu |78 |
|host2 |memory|85 |
+--------------------+------+-----+
You can reproduce the DataFrame with the code below:
# Sample rows in (hostname, kpi, value) order.
data1 = [
    ("Apr�ngli", "cpu", 78),
    ("host2", "memory", 85),
    ("host3", "disk", 90),
    ("host4", "cpu", 95),
]
# Explicit schema: two nullable string columns and one nullable integer column.
schema1 = StructType([
    StructField("hostname", StringType(), True),
    StructField("kpi", StringType(), True),
    StructField("value", IntegerType(), True),
])
df1 = spark.createDataFrame(data=data1, schema=schema1)
df1.printSchema()
df1.show(truncate=False)
Steps I took: as suggested in other Stack Overflow answers, I set the environment variable below, but I still receive the error:
export PYTHONIOENCODING=utf8
version details:
PYTHON_VERSION=3.6.8
Spark version 2.4.5