I have a Structured Streaming program running on GCP Dataproc. When I import KafkaProducer, I get the error shown below:
Imports :
import sys, datetime, time, os
from pyspark.sql.functions import col, rank, dense_rank, to_date, to_timestamp, format_number, row_number, lead, lag,monotonically_increasing_id
from google.cloud import storage
import datetime
from pyspark.sql.functions import to_timestamp,split as split_df, broadcast as s_broad, window
from pyspark.sql import SparkSession, Window, Row
import calendar
from pyspark.sql.types import StringType, StructField, StructType, BooleanType, ArrayType, IntegerType, LongType, DateType, TimestampType
from pyspark.sql.functions import lit, col, concat, regexp_replace, when, unix_timestamp, count as sql_count
import pandas as pd
import json
import copy
from google.cloud import bigquery
from kafka import KafkaProducer
The last import causes the error shown below:
Traceback (most recent call last):
File "/tmp/1b6daf68f83749b88536401dbcf7eb71/StructuredStreaming_GCP_Versa_Sase_gcloud.py", line 14, in <module>
from kafka import KafkaProducer
File "/opt/conda/default/lib/python3.8/site-packages/kafka/__init__.py", line 23, in <module>
from kafka.producer import KafkaProducer
File "/opt/conda/default/lib/python3.8/site-packages/kafka/producer/__init__.py", line 4, in <module>
from .simple import SimpleProducer
File "/opt/conda/default/lib/python3.8/site-packages/kafka/producer/simple.py", line 54
return '<SimpleProducer batch=%s>' % self.async
^
SyntaxError: invalid syntax
While creating the Dataproc cluster, I install the following as part of the 'initialization-actions':
pip install pypi
pip install kafka-python
pip install google-cloud-storage
pip install pandas
Any ideas on what needs to be done to debug/fix this? Thanks in advance!