0

I am iterating over the rows of a polars DataFrame that was read from a parquet file, using the per-column iterator approach described in this answer: Iterate over rows polars rust

I have constructed a HashMap that represents an individual row and I would like to now convert that row into JSON.

This is what my code looks like so far:

use polars::prelude::*;
use std::iter::zip;
use std::{fs::File, collections::HashMap};

/// Reads `0.parquet` into a polars `DataFrame`, builds a `HashMap` for the
/// first row (column name -> value) and tries to serialize it to JSON.
///
/// This is the reproducer for the reported problem: the
/// `serde_json::to_string(&row).unwrap()` call below is the line that panics
/// with "the enum variant AnyValue::Datetime cannot be serialized".
fn main() -> anyhow::Result<()> {

    let file = File::open("0.parquet").unwrap();
    let mut df = ParquetReader::new(file).finish()?;
    dbg!(df.schema());
    // Collect the column names once; they become the keys of every row map.
    let fields = df.fields();
    let columns: Vec<&String> = fields.iter().map(|x| x.name()).collect();

    // NOTE(review): presumably merges each column's chunks into one so the
    // per-column iterators below walk a single contiguous chunk - confirm
    // against the polars documentation for the version in use.
    df.as_single_chunk_par();
    // One value iterator per column, in the same order as `columns`.
    let mut iters = df.iter().map(|s| s.iter()).collect::<Vec<_>>();

    for _ in 0..df.height() {
        let mut row = HashMap::new();
        // Advance every column iterator by one step to assemble a single row.
        for (column, iter) in zip(&columns, &mut iters) {
            let value = iter.next().expect("should have as many iterations as rows");
            row.insert(column, value);
        }
        dbg!(&row);
        // This is where the reported serialization error is raised.
        let json = serde_json::to_string(&row).unwrap();
        dbg!(json);
        // Only the first row is processed in this example.
        break;
    }

    Ok(())
}

And I have the following feature flags enabled: ["parquet", "serde", "dtype-u8", "dtype-i8", "dtype-date", "dtype-datetime"].

I am running into the following error at the serde_json::to_string(&row).unwrap() line:

thread 'main' panicked at 'called `Result::unwrap()` on an `Err` value: Error("the enum variant AnyValue::Datetime cannot be serialized", line: 0, column: 0)', src/main.rs:47:48

I am also unable to implement my own serializer for AnyValue::Datetime, because only traits defined in the current crate can be implemented for types defined outside of the crate.

What's the best way to serialize this row into JSON?

kmdreko
  • 42,554
  • 6
  • 57
  • 106
Al Johri
  • 1,729
  • 22
  • 23

1 Answers1

1

I was able to resolve this error by using a match statement over value to change it from a Datetime to an Int64.

// Workaround: replace a millisecond-precision Datetime with an Int64 carrying
// the same raw integer, so the row no longer contains the variant that the
// serializer rejects. Every other value is passed through unchanged.
// NOTE(review): only `TimeUnit::Milliseconds` is matched here; Datetime values
// with another time unit fall into the catch-all arm and presumably still
// fail to serialize - confirm for your data.
let value = match value {
                AnyValue::Datetime(value, TimeUnit::Milliseconds, _) => AnyValue::Int64(value),
                x => x
            };
row.insert(column, value);

The root cause is that the impl Serialize block for AnyValue has no match arm for the Datetime variant: https://docs.rs/polars-core/0.24.0/src/polars_core/datatypes/mod.rs.html#298

Although this code now works, it outputs data that looks like:

{'myintcolumn': {'Int64': 22342342343},
 'mylistoclumn': {'List': {'datatype': 'Int32', 'name': '', 'values': []}},
 'mystrcolumn': {'Utf8': 'lorem ipsum lorem ipsum'}}

So you will likely need to customize the serialization here regardless of the data type.

Update: To get the JSON without all of the inner nesting, I had to write a gnarly match statement:

use polars::prelude::*;
use std::iter::zip;
use std::{fs::File, collections::HashMap};
use serde_json::json;

fn main() -> anyhow::Result<()> {

    let file = File::open("0.parquet").unwrap();
    let mut df = ParquetReader::new(file).finish()?;
    dbg!(df.schema());
    let fields = df.fields();
    let columns: Vec<&String> = fields.iter().map(|x| x.name()).collect();

    df.as_single_chunk_par();
    let mut iters = df.iter().map(|s| s.iter()).collect::<Vec<_>>();

    for _ in 0..df.height() {
        let mut row = HashMap::new();
        for (column, iter) in zip(&columns, &mut iters) {
            let value = iter.next().expect("should have as many iterations as rows");
            let value = match value {
                AnyValue::Null => json!(Option::<String>::None),
                AnyValue::Int64(val) => json!(val),
                AnyValue::Int32(val) => json!(val),
                AnyValue::Int8(val) => json!(val),
                AnyValue::Float32(val) => json!(val),
                AnyValue::Float64(val) => json!(val),
                AnyValue::Utf8(val) => json!(val),
                AnyValue::List(val) => {
                    match val.dtype() {
                        DataType::Int32 => ({let vec: Vec<Option<_>> = val.i32().unwrap().into_iter().collect(); json!(vec)}),
                        DataType::Float32 => ({let vec: Vec<Option<_>> = val.f32().unwrap().into_iter().collect(); json!(vec)}),
                        DataType::Utf8 => ({let vec: Vec<Option<_>> = val.utf8().unwrap().into_iter().collect(); json!(vec)}),
                        DataType::UInt8 => ({let vec: Vec<Option<_>> = val.u8().unwrap().into_iter().collect(); json!(vec)}),
                        x => panic!("unable to parse list column: {} with value: {} and type: {:?}", column, x, x.inner_dtype())
                    }
                },
                AnyValue::Datetime(val, TimeUnit::Milliseconds, _) => json!(val),
                x => panic!("unable to parse column: {} with value: {}", column, x)
            };
            row.insert(*column as &str, value);
        }
        let json = serde_json::to_string(&row).unwrap();
        dbg!(json);
        break;
    }

    Ok(())
}
Al Johri
  • 1,729
  • 22
  • 23