0

I've been following the examples in the calamine documentation for pulling values out of spreadsheets.

They work fine but when trying to use parse empty cells the optional parser examples do not work. I'm wondering if any of you can spot what I'm missing here.

We have a spreadsheet with say 5 columns.

Here's the code that is working:

fn example<P: AsRef<Path>>(xls_path: P) -> Result<(), Error> {
    let mut workbook: Xlsx<_> = open_workbook(xls_path)?;
    let range = workbook
        .worksheet_range("Billing")
        .ok_or(Error::Msg("Cannot find 'Billing' sheet"))??;

    let mut iter = RangeDeserializerBuilder::new().from_range(&range)?.enumerate();
    
    while let Some((i, result)) = iter.next() {
        println!("{:?}", result);
        let (
            invoice_num,
            billing_date,
            client,
            amount,
            date_received,
            quarter,
            direct_expenses,
            expenses,
            net,
            total_expenses,
            gross_profit,
        ): (
            Option<String>,
            Option<String>,
            Option<String>,
            Option<f64>,
            Option<String>,
            Option<String>,
            Option<f64>,
            Option<String>,
            Option<String>,
            Option<f64>,
            Option<f64>,
        ) = result?;

        println!("Number {:?}", i);
        println!("Invoice #: {:?}", invoice_num);
        // print them all..
    }

    Ok(())
}

Manually assigning them to Optional values causes every row to be printed regardless of what cells are empty inside of it.

Now the example code takes this approach:

#[derive(Serialize, Deserialize, Debug, Clone)]
struct RawExcelRow {
    #[serde(rename = "INVOICE #", deserialize_with = "de_opt_string")]
    invoice_num: Option<String>,
    #[serde(rename = "BILLING DATE", deserialize_with = "de_opt_string")]
    billing_date: Option<String>,
    #[serde(rename = "CLIENT/PROJECT", deserialize_with = "de_opt_string")]
    client_project: Option<String>,
    #[serde(rename = "AMOUNT", deserialize_with = "de_opt_f64")]
    amount: Option<f64>,
    #[serde(rename = "DATE REC'D", deserialize_with = "de_opt_string")]
    date_received: Option<String>,

}

fn example2() -> Result<(), Box<dyn std::error::Error>> {
    if env::args().count() > 2 {
        println!("{}", env::args().count());
        return Err(Box::new(Error::from("Enter the filename or leave blank for default")));
    }

    let path_string = format!("{}", env::current_dir().unwrap().display());
    let file_name = &env::args().nth(1).ok_or(Error::Msg("Unkown filename"))?;
    let xls_path = Path::new(&path_string).join(file_name);
    
    let mut excel: Xlsx<_> = open_workbook(xls_path)?;

    let range = excel
      .worksheet_range("Billing")
      .ok_or(calamine::Error::Msg("Cannot find Billing sheet"))??;

    let mut iter = RangeDeserializerBuilder::new().from_range::<_, RawExcelRowDate>(&range)?.enumerate();
    while let Some((i, Ok(row))) = iter.next() {
        println!("Number {}", i);
        println!("Invoice #: {:?}", row.invoice_num);
        // print them all..
    }
    Ok(())
}

And the deserializers:

fn de_opt_string<'de, D>(deserializer: D) -> Result<Option<String>, D::Error> 
where
    D: serde::Deserializer<'de>,
{
    let data_type = calamine::DataType::deserialize(deserializer);
    match data_type {
        Ok(calamine::DataType::Error(e)) => Ok(None),
        Ok(calamine::DataType::Float(f)) => Ok(Some(f.to_string())),
        Ok(calamine::DataType::Int(i)) => Ok(Some(i.to_string())),
        Ok(calamine::DataType::String(s)) => Ok(Some(s)),
        Ok(calamine::DataType::DateTime(d)) => Ok(Some(d.to_string())),
        _ => Ok(None),  
    }
}

fn de_opt_f64<'de, D>(deserializer: D) -> Result<Option<f64>, D::Error>
where
    D: serde::Deserializer<'de>,
{
    let data_type = calamine::DataType::deserialize(deserializer);
    match data_type {
        Ok(calamine::DataType::Error(_)) => Ok(None),
        Ok(calamine::DataType::Float(f)) => Ok(Some(f)),
        Ok(calamine::DataType::Int(i)) => Ok(Some(i as f64)),
        _ => Ok(None),
    }
}

What happens is anytime I add a column to the struct RawExcelRow it stops at the first row where one of columns corresponding to the struct has an empty cell. The value I'm expecting to see is None but instead I see the error: Deserializer error: missing field 'DATE REC'D' as DATE REC'D is the first cell that is empty in one of the rows. For example, row 18 has the first empty cell in the DATE REC'D column. So only rows 1-17 will be parsed.

I've looked through the source but I can't seem to find it's calling this error instead of just filling in a None value. I've also tried using a deserializer based on visitors to no avail.

P.S.

calling RangeDeserializerBuilder::with_headers(&COLUMNS) where &COLUMNS is an array of column names, as they do in the example, instead of ::new() makes no difference as ::new() just uses the default value for RangeDeserializer which uses all of the headers.

ss7
  • 2,902
  • 7
  • 41
  • 90

1 Answers1

0

The answer to this was somewhat elusive as Calamine is not very well documented concerning dates.

The answer in the link mentions providing the attribute #[serde(default)] to use the Option::default value when an input is missing.

This has the effect of causing Calamine to continue even when a cell is empty. The Calamine documentation leaves this out despite showing an "optional" parser as an example.

How can I deserialize an optional field with custom functions using Serde?

Knowing this, this question is somewhat of a duplicate of that one and the others listed in the answer there. However, I am not deleting the question as it is not immediately obvious what the problem actually is.

Solution

Changing the struct to look like this (and adding a date deserializer) fixed the problem:

#[derive(Serialize, Deserialize, Debug, Clone)]
struct RawExcelRow {
    #[serde(default, rename = "INVOICE #", deserialize_with = "de_opt_string")]
    invoice_num: Option<String>,
    #[serde(default, rename = "BILLING DATE", deserialize_with = "de_opt_date")]
    billing_date: Option<NaiveDate>,
    #[serde(default, rename = "CLIENT/PROJECT", deserialize_with = "de_opt_string")]
    client_project: Option<String>,
    #[serde(default, rename = "AMOUNT", deserialize_with = "de_opt_f64")]
    amount: Option<f64>,
    #[serde(default, rename = "DATE REC'D", deserialize_with = "de_opt_date")]
    date_received: Option<NaiveDate>,

}
ss7
  • 2,902
  • 7
  • 41
  • 90