0

I am new to Python, Databricks, and pydeequ. I'm trying to use pydeequ in Databricks. I installed the library via Maven using "com.amazon.deequ:deequ:2.0.4-spark-3.3". The analyzers work, but the ColumnProfilerRunner does not.

I am trying to run this example from the README:

Code:

from pydeequ.profiles import *
from pydeequ.profiles import ColumnProfilerRunner

result = ColumnProfilerRunner(spark) \
    .onData(df) \
    .run()
# KeyError: 'StringColumnProfile'

I am getting this error:

KeyError                                  Traceback (most recent call last)
File :5
      1 from pydeequ.profiles import *
      2 #from pydeequ.profiles import ColumnProfilerRunner
      3 
      4 #Se Crea una instancia de ColumnProfilerRunner y se le asigna un database 
----> 5 result = ColumnProfilerRunner(spark) \
      6     .onData(df) \
      7     .run()

File /local_disk0/.ephemeral_nfs/envs/pythonEnv-f96b57df-8a1b-408f-821a-896e4943bb1a/lib/python3.9/site-packages/pydeequ/profiles.py:122, in ColumnProfilerRunBuilder.run(self)
    116 """
    117 A method that runs a profile check on the data to obtain a ColumnProfiles class
    118
    119 :return: A ColumnProfiles result
    120 """
    121 run = self._ColumnProfilerRunBuilder.run()
--> 122 return ColumnProfilesBuilder(self._spark_session)._columnProfilesFromColumnRunBuilderRun(run)

File /local_disk0/.ephemeral_nfs/envs/pythonEnv-f96b57df-8a1b-408f-821a-896e4943bb1a/lib/python3.9/site-packages/pydeequ/profiles.py:256, in ColumnProfilesBuilder._columnProfilesFromColumnRunBuilderRun(self, run)
    254 self._run_result = run
    255 profile_map = self._jvm.scala.collection.JavaConversions.mapAsJavaMap(run.profiles())  # TODO from ScalaUtils
--> 256 self._profiles = {column: self._columnProfileBuilder(column, profile_map[column]) for column in profile_map}
    257 return self

File /local_disk0/.ephemeral_nfs/envs/pythonEnv-f96b57df-8a1b-408f-821a-896e4943bb1a/lib/python3.9/site-packages/pydeequ/profiles.py:256, in <dictcomp>(.0)
    254 self._run_result = run
    255 profile_map = self._jvm.scala.collection.JavaConversions.mapAsJavaMap(run.profiles())  # TODO from ScalaUtils
--> 256 self._profiles = {column: self._columnProfileBuilder(column, profile_map[column]) for column in profile_map}
    257 return self

File /local_disk0/.ephemeral_nfs/envs/pythonEnv-f96b57df-8a1b-408f-821a-896e4943bb1a/lib/python3.9/site-packages/pydeequ/profiles.py:275, in ColumnProfilesBuilder._columnProfileBuilder(self, column, java_column_profile)
    268 def _columnProfileBuilder(self, column, java_column_profile):
    269 """Factory function for ColumnProfile
    270 Returns a Java profile based on the designated column
    271
    272 :param column: The column to run a profile on
    273 :param java_column_profile: The profile mapped as a Java map
    274 """
--> 275 return self.columnProfileClasses[java_column_profile.getClass().getSimpleName()](
    276 self._spark_session, column, java_column_profile
    277 )

KeyError: 'StringColumnProfile'

0 Answers0