I have a C-library which reads binary data from a file, converts it and stores everything in a large char* to return the data to anything calling it. This works fine within C but using python/Cython I run into problems allocating the memory.
The library prototype is:
/* Reads the file described by opt, converts its contents and stores the
 * result in the caller-supplied buffer lineOut.
 * NOTE(review): the meaning of the int return value is not shown for this
 * version of the prototype -- confirm against the library. */
int readWrapper(struct options opt, char *lineOut);
In my pyx file I have:
from libc.string cimport strcpy, memset
from libc.stdlib cimport malloc, free
from libc.stdio cimport printf
# Declarations mirrored from the C header "reader.h".
cdef extern from "reader.h":
    # Options passed by value to the reader.
    struct options:
        int debug
        char *filename

    # Variable named ``opt`` of type ``options`` declared by the header.
    options opt

    # C entry point: writes its text output into the buffer ``lineOut``.
    int readWrapper(options opt, char *lineOut)
def pyreader(file, date, debug=0):
    """Read ``file`` through the C ``readWrapper`` and return a DataFrame.

    Parameters
    ----------
    file : str, bytes or os.PathLike
        Path of the file handed to the C reader.
    date
        Unused by this version; kept so callers need not change.
    debug : int
        Stored in ``options.debug`` for the C reader.

    Returns
    -------
    pandas.DataFrame
        The whitespace-separated text produced by the reader, parsed
        without a header row.

    Raises
    ------
    MemoryError
        If the output buffer cannot be allocated.

    Notes
    -----
    The output buffer has a fixed capacity of 50000 characters; the C
    reader must not write more than that.
    """
    import os
    from io import StringIO

    import pandas as pd

    cdef options options
    cdef size_t out_size = 50000
    cdef char *line_output = NULL

    # Hand the C side a real, NUL-terminated file name. ``file_enc`` must
    # stay referenced for as long as ``options.filename`` is in use, because
    # the char pointer borrows the bytes object's internal buffer.
    file_enc = os.fsencode(file)
    options.filename = file_enc
    options.debug = debug

    # One extra byte so the terminator below lands inside the allocation.
    line_output = <char *> malloc((out_size + 1) * sizeof(char))
    if line_output == NULL:
        raise MemoryError("could not allocate the reader output buffer")
    try:
        memset(line_output, 1, out_size)
        line_output[out_size] = 0

        # Call reader.
        # NOTE(review): the return code convention of this readWrapper
        # version is not shown here, so it is not interpreted.
        readWrapper(options, line_output)

        # Create dataframe
        data = StringIO(line_output.decode('UTF-8', 'strict'))
        return pd.read_csv(data, sep=r"\s+", header=None)
    finally:
        # Free memory even when decoding or parsing raises.
        free(line_output)
This works fine as long as the data written to `line_output` fits within `outSize`. But some files are larger, so how do I do this dynamically?
EDIT after DavidW's suggestions
The reader wrapper is like:
/*
 * Read the file named by opt.filename with reader() and collect every line
 * it produces into one growing, NUL-terminated string.
 *
 * opt      Reader options; opt.filename is the path to open.
 * lineOut  Out-parameter. On success *lineOut points to a malloc'ed buffer
 *          that the caller must free(); on failure *lineOut is NULL.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if lineOut is NULL, the file cannot
 * be opened, or memory cannot be (re)allocated.
 */
int readWrapper(struct options opt, char **lineOut)
{
    if (lineOut == NULL)
    {
        return (EXIT_FAILURE);
    }
    *lineOut = NULL;

    // Open file for reading
    // NOTE(review): the file is opened in text mode ("r") as before; if the
    // input is binary, "rb" may be needed on some platforms -- confirm.
    FILE *fp = fopen(opt.filename, "r");
    // Check for valid fp
    if (fp == NULL)
    {
        printf("file pointer is null, aborting\n");
        return (EXIT_FAILURE);
    }

    // Allocate memory
    size_t arraySize = 5000;
    const size_t memIncrease = 10000;
    char *outLine = malloc(arraySize * sizeof *outLine);
    if (outLine == NULL)
    {
        fprintf(stderr, "Memory allocation failed!");
        fclose(fp);
        return (EXIT_FAILURE);
    }
    // Valid empty string even if the loop below appends nothing.
    outLine[0] = '\0';

    // Create line and multi lines object
    char line[255];
    size_t numWritten = 0;
    while (!feof(fp) && !ferror(fp))
    {
        // Start from an empty line so that a reader() call which writes
        // nothing (presumably possible at end of file) appends nothing.
        line[0] = '\0';

        // Read part of file
        reader(fp, opt, line);
        size_t num2Write = strlen(line);

        if (arraySize < (numWritten + num2Write + 1))
        { // Won't fit so enlarge outLine; one step always suffices because
          // memIncrease is larger than the line buffer.
            arraySize += memIncrease;
            // Keep the old pointer until realloc succeeds, so it can still
            // be freed on failure.
            char *grown = realloc(outLine, arraySize * sizeof *outLine);
            if (grown == NULL)
            {
                fprintf(stderr, "Memory re-allocation failed!");
                free(outLine);
                fclose(fp);
                return (EXIT_FAILURE);
            }
            outLine = grown;
        }

        // Append the line (including its terminator) on every iteration,
        // not only when the buffer had to grow.
        memcpy(outLine + numWritten, line, num2Write + 1);
        numWritten += num2Write;
    } // data block loop

    fclose(fp);
    *lineOut = outLine;
    return (EXIT_SUCCESS);
}
The new pyx file:
from libc.string cimport strcpy, memset
from libc.stdlib cimport malloc, free
from libc.stdio cimport printf
# Declarations mirrored from the C header "reader.h".
cdef extern from "reader.h":
    # Options passed by value to the reader.
    struct options:
        int debug
        char *filename

    # Variable named ``opt`` of type ``options`` declared by the header.
    options opt

    # The C reader allocates the output itself and returns it through the
    # ``char **`` out-parameter, so the declaration must take a pointer to
    # the caller's ``char *`` (it is called as ``readWrapper(..., &buf)``).
    int readWrapper(options opt, char **lineOut)
def pyreader(file, date, debug=0):
    """Read ``file`` through the C ``readWrapper`` and return a DataFrame.

    The C side allocates the output buffer; this function frees it.

    Parameters
    ----------
    file : str, bytes or os.PathLike
        Path of the file handed to the C reader.
    date
        Unused by this version; kept so callers need not change.
    debug : int
        Stored in ``options.debug`` for the C reader.

    Returns
    -------
    pandas.DataFrame
        The whitespace-separated text produced by the reader, parsed
        without a header row.

    Raises
    ------
    RuntimeError
        If ``readWrapper`` reports a failure or returns no buffer.
    """
    import os
    from io import StringIO

    import pandas as pd

    cdef options options
    cdef char *line_output = NULL

    # Pass the actual file name as bytes. ``file_enc`` must stay referenced
    # while ``options.filename`` is in use, because the char pointer borrows
    # the bytes object's internal buffer (so nothing is malloc'ed or freed).
    file_enc = os.fsencode(file)
    options.filename = file_enc
    options.debug = debug

    try:
        # Call reader
        return_val = readWrapper(options, &line_output)
        if return_val != 0 or line_output == NULL:
            # Decoding a NULL buffer would crash the interpreter.
            raise RuntimeError(
                "readWrapper failed with exit code {}".format(return_val))

        # Create dataframe
        data = StringIO(line_output.decode('UTF-8', 'strict'))
        return pd.read_csv(data, sep=r"\s+", header=None)
    finally:
        # Free memory on every path; free(NULL) is a no-op.
        free(line_output)
This now works great, but using any `printf` or `fprintf(stdout, ...)` statements in both the wrapper (C) and the Python (pyx) part results in
Exception ignored in: <_io.TextIOWrapper name='<stdout>' mode='w' encoding='utf-8'>
BrokenPipeError: [Errno 32] Broken pipe
when using `python3 test.py | head`. Without the `head` no error is shown.
Finally, the suggestions regarding filename and its allocation also don't work for me. Using `options.filename = file` results in `TypeError: expected bytes, str found` at runtime, although it compiles. Interestingly, the BrokenPipeError only happens when I run the Python code that calls the wrapper as such: `python3 test.py | head`. Without the pipe and `head` the BrokenPipeError is not present. Hence, it's not a big deal, but I would like to understand what is causing it.
EDIT after some searching on the BrokenPipeError
This BrokenPipeError issue happens with head and not with tail. An explanation of this "error" can be found here: https://stackoverflow.com/a/30091579/2885280
Solution pyx file:
The final reader.pyx file that works with the aforementioned readWrapper.c. Memory allocation is handled by C and clean-up (at the end) by the pyx code.
from libc.stdlib cimport free
# Declarations mirrored from the C header "reader.h".
cdef extern from "reader.h":
    # Options passed by value to the reader.
    struct options:
        int debug
        char *filename
        char *DAY

    # Variable named ``opt`` of type ``options`` declared by the header.
    options opt

    # C entry point: returns its text output through the ``char **``
    # out-parameter ``lineOut``.
    int readWrapper(options opt, char **lineOut)
def pyreader(file, date, debug=0):
    """Read ``file`` through the C ``readWrapper`` and return a DataFrame.

    The C side allocates the output buffer; this function frees it on every
    path. Errors are logged rather than raised.

    Parameters
    ----------
    file : str
        Path of the file handed to the C reader (encoded as ASCII).
    date : str
        Date string stored in ``options.DAY`` (encoded as ASCII).
    debug : int
        Values above 0 switch logging to DEBUG; also stored in
        ``options.debug`` for the C reader.

    Returns
    -------
    pandas.DataFrame
        The parsed reader output, or an empty DataFrame if the inputs are
        invalid, the reader fails, or the output cannot be parsed.
    """
    import logging
    import sys
    from io import StringIO

    import pandas as pd

    cdef options options
    cdef char *line_output = NULL

    # Init return value
    a = pd.DataFrame()

    # logging
    logging.basicConfig(stream=sys.stdout,
                        format='%(asctime)s:%(process)d:%(filename)s:%(lineno)s:pyreader: %(message)s',
                        datefmt='%Y%m%d_%H.%M.%S',
                        level=logging.DEBUG if debug > 0 else logging.INFO)

    try:
        # Check inputs
        if file is None:
            raise Exception("No valid filename provided")
        if date is None:
            raise Exception("No valid date provided")

        # The encoded bytes objects must stay referenced while the struct is
        # in use: the char pointers borrow their internal buffers.
        file_enc = file.encode("ascii")
        options.filename = file_enc
        day_enc = date.encode('ascii')
        options.DAY = day_enc
        # Initialise every field so the struct is not passed to C with an
        # indeterminate ``debug`` value.
        options.debug = debug

        # Call reader
        return_val = readWrapper(options, &line_output)
        if return_val != 0 or line_output == NULL:
            logging.error("pyreadASTERIX2 failed with exitcode {}".format(return_val))
            return a

        try:
            # Decode once and reuse the text for both logging and parsing.
            text = line_output.decode('UTF-8', 'strict')
            logging.debug("return_val: {} and size: {}".format(return_val, len(text)))
            a = pd.read_csv(StringIO(text), sep=r"\s+", header=None, dtype={'id':str})
            if a.empty:
                logging.error("failed to load {} not enough data to construct DataFrame".format(file))
                return a
            logging.debug("converted data into pd")
        except Exception:
            logging.exception("Exception occured while loading: {} into DataFrame".format(file))
            return a

        logging.debug("Size of df: {}".format(len(a)))
        # Success, return DataFrame
        return a
    except Exception:
        logging.exception("pyreader returned with an exception:")
        return a
    finally:
        # Single clean-up point for the C-allocated buffer; free(NULL) is a
        # no-op, so this is safe when the reader never produced output.
        free(line_output)