0

I have some code, which iteratively receives data which it dumps to a HDF5 file. Here is a toy example of what I am trying to achieve:

#include <HDFql.hpp>

// Create (truncating any existing file) and open an HDF5 file, then lay out
// the group/dataset structure that writeData() appends to.
// @param filepath  Path of the HDF5 file to (re)create.
void createHDF(const std::string & filepath)
{
    char script_[1024];

    // snprintf instead of sprintf: guards against buffer overflow if
    // filepath is longer than the script buffer.
    snprintf(script_, sizeof(script_), "CREATE TRUNCATE FILE %s", filepath.c_str());
    HDFql::execute(script_);
    snprintf(script_, sizeof(script_), "USE FILE %s", filepath.c_str());
    HDFql::execute(script_);

    // Constant scripts need no formatting buffer at all.
    HDFql::execute("CREATE GROUP events");

    // One-dimensional, extendible datasets; chunk size is chosen
    // automatically by HDFql (no explicit CHUNKED(n) given).
    HDFql::execute("CREATE CHUNKED DATASET events/xs AS SMALLINT(UNLIMITED)");
    HDFql::execute("CREATE CHUNKED DATASET events/ys AS SMALLINT(UNLIMITED)");
    HDFql::execute("CREATE CHUNKED DATASET events/ts AS DOUBLE(UNLIMITED)");
    HDFql::execute("CREATE CHUNKED DATASET events/ps AS TINYINT(UNLIMITED)");

    HDFql::execute("CREATE GROUP frames");
    HDFql::execute("CREATE GROUP optic_flow");
}

// Append one batch of events (all four vectors must be the same length) to
// the UNLIMITED datasets under events/ by extending each dataset and writing
// the new tail region from registered memory.
// @param ts_v  timestamps (double)
// @param xs_v  x coordinates (int16)
// @param ys_v  y coordinates (int16)
// @param ps_v  polarities (int8)
void writeData(const std::vector<double>& ts_v, std::vector<int16_t>& xs_v, 
    std::vector<int16_t>& ys_v, std::vector<int8_t>& ps_v)
{
    // Input arrays are all the same size.
    const int data_size = ts_v.size();

    // Bug fix: script_ was used but never declared in this function.
    char script_[1024];

    // Open file.
    // NOTE(review): HDF5_path_ is presumably a file-scope/member path set
    // elsewhere — confirm against the surrounding class.
    snprintf(script_, sizeof(script_), "USE FILE %s", HDF5_path_.c_str());
    HDFql::execute(script_);

    // Grow each dataset by data_size elements before inserting.
    snprintf(script_, sizeof(script_), "ALTER DIMENSION events/xs TO +%d", data_size);
    HDFql::execute(script_);
    snprintf(script_, sizeof(script_), "ALTER DIMENSION events/ys TO +%d", data_size);
    HDFql::execute(script_);
    snprintf(script_, sizeof(script_), "ALTER DIMENSION events/ts TO +%d", data_size);
    HDFql::execute(script_);
    snprintf(script_, sizeof(script_), "ALTER DIMENSION events/ps TO +%d", data_size);
    HDFql::execute(script_);

    // For each dataset: register the buffer, write it into the last
    // data_size elements (hyperslab -N:1:1:N), then unregister.
    HDFql::variableRegister(&xs_v[0]);
    snprintf(script_, sizeof(script_), "INSERT INTO events/xs(-%d:1:1:%d) VALUES FROM MEMORY %d",
        data_size, data_size, HDFql::variableGetNumber(&xs_v[0]));
    HDFql::execute(script_);
    HDFql::variableUnregister(&xs_v[0]);

    HDFql::variableRegister(&ys_v[0]);
    snprintf(script_, sizeof(script_), "INSERT INTO events/ys(-%d:1:1:%d) VALUES FROM MEMORY %d",
        data_size, data_size, HDFql::variableGetNumber(&ys_v[0]));
    HDFql::execute(script_);
    HDFql::variableUnregister(&ys_v[0]);

    HDFql::variableRegister(&ts_v[0]);
    snprintf(script_, sizeof(script_), "INSERT INTO events/ts(-%d:1:1:%d) VALUES FROM MEMORY %d",
        data_size, data_size, HDFql::variableGetNumber(&ts_v[0]));
    HDFql::execute(script_);
    HDFql::variableUnregister(&ts_v[0]);

    HDFql::variableRegister(&ps_v[0]);
    snprintf(script_, sizeof(script_), "INSERT INTO events/ps(-%d:1:1:%d) VALUES FROM MEMORY %d",
        data_size, data_size, HDFql::variableGetNumber(&ps_v[0]));
    HDFql::execute(script_);
    HDFql::variableUnregister(&ps_v[0]);

    // NOTE(review): these look like file-scope/member counters — confirm
    // they are declared elsewhere.
    total_events_added_ += data_size;
    events_idx_++;
}

int main (int argc, const char * argv[]) {
    std::string path = "/tmp/test.h5";
    createHDF(path);

    const int data_size = 1000;
    const int iterations = 10000;

    // Vectors are constructed already holding data_size elements, so fill
    // them by index. (The original loop pushed onto nonexistent names
    // ts_v/xs_v/ys_v/ps_v — a compile error — and push_back on pre-sized
    // vectors would have doubled their length anyway.)
    std::vector<double> ts(data_size);
    std::vector<int16_t> xs(data_size);
    std::vector<int16_t> ys(data_size);
    std::vector<int8_t> ps(data_size);
    for(int i=0; i<data_size; i++)
    {
        ts[i] = i;
        xs[i] = static_cast<int16_t>(i);
        ys[i] = static_cast<int16_t>(i);
        ps[i] = 1;
    }

    // Repeatedly append the same batch to stress write throughput.
    for(int i=0; i<iterations; i++)
    {
        writeData(ts, xs, ys, ps);
    }
}

This code runs extremely slowly. Using other binary libraries such as cnpy, this executes in the blink of an eye, so it is not the amount of data being written that is the issue. I was wondering if that is just how things are in HDFql, or whether there is some blunder in the code somewhere.

Many thanks!

SOG
  • 876
  • 6
  • 10
Mr Squid
  • 1,196
  • 16
  • 34

2 Answers

1

Are you with cnpy executing the same operations that you are doing in HDFql (e.g. extending the dimensions of datasets events/xs, events/ys, events/ts and events/ps, using a chunk size equal to 1)?

Looking at your code, you may want to explicitly specify the chunk size of the datasets equal to ts_v.size() as this will most probably increase performance greatly. The way you have it now makes HDFql automatically calculate a chunk size for your convenience (using a best guess approach), which may not lead to an optimal performance. You need to explicitly specify the chunk size like, e.g., CREATE CHUNKED(10) DATASET events/xs AS SMALLINT(UNLIMITED).

SOG
  • 876
  • 6
  • 10
  • Yes, I am doing the same with `cnpy`. Unfortunately the data packages coming in are not all the same size, as in the above toy example and I have no way of knowing a-priori. – Mr Squid Apr 30 '20 at 02:47
  • Can you post the chunk sizes that `cnpy` specifies for the datasets? In addition, I will post another answer with your code more optimized. – SOG Apr 30 '20 at 06:54
1

Your code more optimized:

#include <HDFql.hpp>

// Create (truncating any existing file) and open an HDF5 file in one script,
// then create the group hierarchy and the extendible event datasets.
// @param filepath  Path of the HDF5 file to (re)create.
void createHDF(const std::string & filepath)
{
    char script_[1024];

    // snprintf instead of sprintf: guards against overflow for long paths.
    snprintf(script_, sizeof(script_), "CREATE TRUNCATE AND USE FILE %s", filepath.c_str());
    HDFql::execute(script_);

    // Several groups can be created in a single script.
    HDFql::execute("CREATE GROUP events, frames, optic_flow");

    HDFql::execute("CREATE CHUNKED DATASET events/xs AS SMALLINT(UNLIMITED)");
    HDFql::execute("CREATE CHUNKED DATASET events/ys AS SMALLINT(UNLIMITED)");
    HDFql::execute("CREATE CHUNKED DATASET events/ts AS DOUBLE(UNLIMITED)");
    HDFql::execute("CREATE CHUNKED DATASET events/ps AS TINYINT(UNLIMITED)");
}


// Append one batch of events to the events/ datasets. The data buffers are
// registered ONCE by the caller as variables 0..3 (xs, ys, ts, ps), so this
// function only extends the datasets and issues the inserts.
// @param ts_v  timestamps (double)        — registered as MEMORY 2
// @param xs_v  x coordinates (int16)      — registered as MEMORY 0
// @param ys_v  y coordinates (int16)      — registered as MEMORY 1
// @param ps_v  polarities (int8)          — registered as MEMORY 3
void writeData(const std::vector<double>& ts_v, std::vector<int16_t>& xs_v, std::vector<int16_t>& ys_v, std::vector<int8_t>& ps_v)
{
    // Input arrays are all the same size.
    const int data_size = ts_v.size();

    // Bug fix: script_ was used but never declared in this function.
    char script_[1024];

    // Open file.
    // NOTE(review): HDF5_path_ is presumably a file-scope/member path set
    // elsewhere — confirm; the answer also suggests hoisting this out of
    // the per-batch path entirely.
    snprintf(script_, sizeof(script_), "USE FILE %s", HDF5_path_.c_str());
    HDFql::execute(script_);

    // Extend all four datasets in a single script.
    snprintf(script_, sizeof(script_), "ALTER DIMENSION events/xs, events/ys, events/ts, events/ps TO +%d", data_size);
    HDFql::execute(script_);

    // Write into the last data_size elements of each dataset, reading from
    // the pre-registered memory variables (numbers fixed by registration
    // order in main).
    snprintf(script_, sizeof(script_), "INSERT INTO events/xs(-%d:1:1:%d) VALUES FROM MEMORY 0", data_size, data_size);
    HDFql::execute(script_);

    snprintf(script_, sizeof(script_), "INSERT INTO events/ys(-%d:1:1:%d) VALUES FROM MEMORY 1", data_size, data_size);
    HDFql::execute(script_);

    snprintf(script_, sizeof(script_), "INSERT INTO events/ts(-%d:1:1:%d) VALUES FROM MEMORY 2", data_size, data_size);
    HDFql::execute(script_);

    snprintf(script_, sizeof(script_), "INSERT INTO events/ps(-%d:1:1:%d) VALUES FROM MEMORY 3", data_size, data_size);
    HDFql::execute(script_);

    // NOTE(review): counters presumed declared elsewhere — confirm.
    total_events_added_ += data_size;
    events_idx_++;
}


int main (int argc, const char * argv[]) {
    std::string path = "/tmp/test.h5";
    createHDF(path);

    const int data_size = 1000;
    const int iterations = 10000;

    // Vectors are constructed already holding data_size elements, so fill
    // them by index. (The original loop used nonexistent names
    // ts_v/xs_v/ys_v/ps_v — a compile error — and push_back on pre-sized
    // vectors would have doubled their length.)
    std::vector<double> ts(data_size);
    std::vector<int16_t> xs(data_size);
    std::vector<int16_t> ys(data_size);
    std::vector<int8_t> ps(data_size);

    for(int i=0; i<data_size; i++)
    {
        ts[i] = i;
        xs[i] = static_cast<int16_t>(i);
        ys[i] = static_cast<int16_t>(i);
        ps[i] = 1;
    }

    // Register each buffer once, outside the write loop; registration order
    // determines the MEMORY numbers 0..3 used inside writeData.
    // NOTE(review): this registers the vector objects, as in the original
    // answer — if your HDFql version lacks a std::vector overload, register
    // the element storage instead (e.g. &xs[0]).
    HDFql::variableRegister(&xs);
    HDFql::variableRegister(&ys);
    HDFql::variableRegister(&ts);
    HDFql::variableRegister(&ps);

    for(int i=0; i<iterations; i++)
    {
        writeData(ts, xs, ys, ps);
    }
}

In addition, is it possible to move these two consecutive lines of code sprintf(script_, "USE FILE %s", HDF5_path_.c_str()); HDFql::execute(script_); outside the writeData function and just open the file once? Doing so will for sure make things faster.

SOG
  • 876
  • 6
  • 10
  • Thank you very much! Just out of curiosity, how long does it take for this code to execute, for you? I'm wondering if the slow speed might be some other issue. – Mr Squid Apr 30 '20 at 12:00