Some of my data in snowflake is in json str format, but in actual it is list of floats. I used an udf to convert json str to list of floats but seems like snowflake is internally auto converting list of floats to string format again. Just want to know if this is how it works with snowflake or is there any better method to store list of floats in their actual format. I don't want to process the data everytime to convert it from json str to list of floats.
Use below code to demostrate the problem
connection_parameters = {
"account": "MY_ACCOUNT"
"user": "USER",
"password": "PASSWORD",
"role": "MY_ROLE",
"warehouse": "MY_WH",
"database": "MY_DB",
"schema": "MY_SCHEMA"
}
table = "MY_TABLE"
sf_session = Session.builder.configs(connection_parameters).create()
from snowflake.snowpark.functions import udf
from snowflake.snowpark.types import ArrayType, DoubleType, StringType
import json
from typing import List
def parse_embedding_from_string(x: str) -> List[float]:
res = json.loads(x)
return res
retrieve_embedding = udf(parse_embedding_from_string)
df = sf_session.createDataFrame(data=[['[0.4, 2.57, 3.47]'], ['[34.50, 16.34, 12.9]'], ['[413.0, 1.211, 8.41]'], ['[0.4, 8.1, 10.11]'], ['[-6.89, 7.1, -12.1]'], ['[14.0, -21.0, 3.12]'], ['[11.0, 44.1, 26.2]'], ['[-4.4, 5.8, -0.10]']], schema=["embedding"])
df = df.withColumn("embedding_new", retrieve_embedding(col("embedding")))
# Output -
df.toPandas().iloc[0]["EMBEDDING_NEW"]
Below is the output
'[\n 0.4,\n 2.57,\n 3.47\n]'