memory leak in utf8 string parser

Question

typedef struct _utf8_multibyte {
    uint8_t bytes[4]; /* always max of 4 */
    int used; /* how many of the 4 bytes are used */
} utf8_multibyte_t;

typedef struct _utf8_string {
    uint8_t *raw; /* raw string buffer */
    uint32_t *chars; /* codepoints comprising the string */
    uint64_t length; /* number of codepoints in string */
} utf8_string_t;

...

utf8_string_t *utf8_init(unsigned char *s) {
    utf8_string_t *str = (utf8_string_t*)malloc(sizeof(utf8_string_t*));
    if (!str) {
        perror("utf8_string_t init");
        return NULL;
    }
    str->raw = (uint8_t*)malloc(strlen(s) * sizeof(uint8_t));
    if (!str->raw) {
        perror("malloc raw");
        return NULL;
    }
    memcpy(str->raw, s, strlen(s));
    str->length = utf8_strlen(s);
    str->chars = (uint32_t*)malloc(str->length * sizeof(uint32_t));
    if (!str->chars) {
        perror("malloc chars");
        return NULL;
    }
    uint8_t *p = s;
    utf8_multibyte_t mb;
    mb_clear(&mb);
    int j = 0;
    while (*p != '\0') {
        int len = utf8len(*p);
        mb.used = len;
        mb.bytes[0] = *p++;
        for (int i = 1; i < len; ++i) mb.bytes[i] = *p++;
        uint32_t cp = utf8_decode(&mb);
        str->chars[j++] = cp;
        printf("%x ", str->chars[j - 1]);
        mb_clear(&mb);
    }
    return str;
}

I am testing if the initialization is going correctly with this code:

    unsigned char data[] = {
        0xe5, 0xbd, 0xbc, 0xe5, 0xa5, 0xb3, 0xe3, 0x81, 0xaf, 0xe3, 0x82, 0xa2, 0xe3, 0x82, 0xa4, 0xe3, 
        0x82, 0xb9, 0xe3, 0x82, 0x92, 0xe9, 0xa3, 0x9f, 0xe3, 0x81, 0xb9, 0xe3, 0x81, 0x9f, 0xe3, 0x81, 
        0x8b, 0xe3, 0x81, 0xa3, 0xe3, 0x81, 0x9f, 0xe3, 0x81, 0x8b, 0xe3, 0x82, 0x89, 0xe8, 0xb2, 0xb7, 
        0xe3, 0x81, 0x84, 0xe3, 0x81, 0xab, 0xe5, 0x87, 0xba, 0xe3, 0x81, 0x8b, 0xe3, 0x81, 0x91, 0xe3, 
        0x81, 0xbe, 0xe3, 0x81, 0x97, 0xe3, 0x81, 0x9f, 0xe3, 0x80, 0x82, 0x0
    };
    unsigned char data1[] = {
        0xe5, 0xbd, 0xbc, 0xe5, 0xa5, 0xb3, 0xe3, 0x81, 0xaf, 0xe3, 0x82, 0xa2, 0xe3, 0x82, 0xa4, 0xe3, 
        0x82, 0xb9, 0xe3, 0x82, 0x92, 0xe9, 0xa3, 0x9f, 0xe3, 0x81, 0xb9, 0xe3, 0x81, 0x9f, 0xe3, 0x81, 
        0x8b, 0xe3, 0x81, 0xa3, 0xe3, 0x81, 0x9f, 0xe3, 0x81, 0x8b, 0xe3, 0x82, 0x89, 0xe8, 0xb2, 0xb7, 
        0xe3, 0x81, 0x84, 0xe3, 0x81, 0xab, 0xe5, 0x87, 0xba, 0xe3, 0x81, 0x8b, 0xe3, 0x81, 0x91, 0xe3, 
        0x81, 0xbe, 0xe3, 0x81, 0x97, 0xe3, 0x81, 0x9f, 0xe3, 0x80, 0x82, 0x0
    };

    utf8_string_t *str1 = utf8_init(data);
    utf8_string_t *str2 = utf8_init(data1);
    printf("%d %d\n", str1->length, str2->length);
    /*utf8_free(str1);
    utf8_free(str2);*/

One of the strings sometimes doesn't get initialized and when it does it reports it's length as 0 when it should be 25. I tried printing out the codepoints from the strings as they're initialized and they're completely ok so I am led to believe it's got something to do with the raw string allocation but I am not sure.

I know this has something to do with my memory management but I have been staring at this code for 2 days and still cannot figure out where my mistake is. Please help me.

EDIT: I think I managed to fix it thanks to the recommendations in the comments. Here's the updated code:

utf8_string_t *utf8_init(const uint8_t *s) {
    utf8_string_t *str = malloc(sizeof(utf8_string_t));
    if (!str) {
        free(str);
        return NULL;
    }

    uint64_t rawlen = strlen_uchar(s);
    str->raw = malloc((rawlen + 1) * sizeof(uint8_t));
    if (!str->raw) {
        free(str->raw);
        free(str);
        return NULL;
    }
    memcpy(str->raw, s, rawlen + 1);
    
    str->length = utf8_strlen(s);
    str->chars = malloc(str->length * sizeof(uint32_t));
    if (!str->chars) {
        free(str->raw);
        free(str->chars);
        free(str);
        return NULL;
    }
    
    uint8_t *p = (uint8_t*)s;
    utf8_multibyte_t mb;
    mb_clear(&mb);
    int j = 0;
    
    while (*p != '\0') {
        int len = utf8len(*p);
        mb.used = len;
        mb.bytes[0] = *p++;
        for (int i = 1; i < len; ++i) mb.bytes[i] = *p++;
        uint32_t cp = utf8_decode(&mb);
        str->chars[j++] = cp;
        mb_clear(&mb);
    }
    return str;
}

Please do tell if there are still parts of it which might bring about undefined behavior/bugs.

`malloc(sizeof(utf8_string_t*)` should be `malloc(sizeof(utf8_string_t)`. — Ian Abbott, Jun 24 '22 at 09:10
To begin with, in C you [don't have to (and really shouldn't) cast the result of `malloc`](https://stackoverflow.com/questions/605845/do-i-cast-the-result-of-malloc/). — Some programmer dude, Jun 24 '22 at 09:10
It is full of memory leaks. Several times you return NULL without freeing the already allocated memory — Giacomo Catenazzi, Jun 24 '22 at 09:11
And when allocating memory, use the variable you assign to or initialize in the `sizeof` expression. So instead of `utf8_string_t *str = (utf8_string_t*)malloc(sizeof(utf8_string_t*))` do `utf8_string_t *str = malloc(sizeof *str)` — Some programmer dude, Jun 24 '22 at 09:11
On another note, please don't call `strlen` multiple times. Call it *once* and store the result in a variable, which you can reuse. Will be a small but perhaps significant optimization. — Some programmer dude, Jun 24 '22 at 09:13
In addition to all the suggestions above my comment, consider using a `goto` that will free the memory and return `NULL` instead of doing the same thing over and over. — vmemmap, Jun 24 '22 at 09:34
_tell if there are still parts of it which might bring about undefined behavior/bugs_ - Yes, you might `free` NULL pointers which is not okay. You are not supposed to `free` a block of data that came from a failed `malloc`. — vmemmap, Jun 24 '22 at 16:06

memory leak in utf8 string parser

0 Answers0