typedef struct _utf8_multibyte {
uint8_t bytes[4]; /* always max of 4 */
int used; /* how many of the 4 bytes are used */
} utf8_multibyte_t;
typedef struct _utf8_string {
uint8_t *raw; /* raw string buffer */
uint32_t *chars; /* codepoints comprising the string */
uint64_t length; /* number of codepoints in string */
} utf8_string_t;
...
utf8_string_t *utf8_init(unsigned char *s) {
utf8_string_t *str = (utf8_string_t*)malloc(sizeof(utf8_string_t*));
if (!str) {
perror("utf8_string_t init");
return NULL;
}
str->raw = (uint8_t*)malloc(strlen(s) * sizeof(uint8_t));
if (!str->raw) {
perror("malloc raw");
return NULL;
}
memcpy(str->raw, s, strlen(s));
str->length = utf8_strlen(s);
str->chars = (uint32_t*)malloc(str->length * sizeof(uint32_t));
if (!str->chars) {
perror("malloc chars");
return NULL;
}
uint8_t *p = s;
utf8_multibyte_t mb;
mb_clear(&mb);
int j = 0;
while (*p != '\0') {
int len = utf8len(*p);
mb.used = len;
mb.bytes[0] = *p++;
for (int i = 1; i < len; ++i) mb.bytes[i] = *p++;
uint32_t cp = utf8_decode(&mb);
str->chars[j++] = cp;
printf("%x ", str->chars[j - 1]);
mb_clear(&mb);
}
return str;
}
I am testing if the initialization is going correctly with this code:
unsigned char data[] = {
0xe5, 0xbd, 0xbc, 0xe5, 0xa5, 0xb3, 0xe3, 0x81, 0xaf, 0xe3, 0x82, 0xa2, 0xe3, 0x82, 0xa4, 0xe3,
0x82, 0xb9, 0xe3, 0x82, 0x92, 0xe9, 0xa3, 0x9f, 0xe3, 0x81, 0xb9, 0xe3, 0x81, 0x9f, 0xe3, 0x81,
0x8b, 0xe3, 0x81, 0xa3, 0xe3, 0x81, 0x9f, 0xe3, 0x81, 0x8b, 0xe3, 0x82, 0x89, 0xe8, 0xb2, 0xb7,
0xe3, 0x81, 0x84, 0xe3, 0x81, 0xab, 0xe5, 0x87, 0xba, 0xe3, 0x81, 0x8b, 0xe3, 0x81, 0x91, 0xe3,
0x81, 0xbe, 0xe3, 0x81, 0x97, 0xe3, 0x81, 0x9f, 0xe3, 0x80, 0x82, 0x0
};
unsigned char data1[] = {
0xe5, 0xbd, 0xbc, 0xe5, 0xa5, 0xb3, 0xe3, 0x81, 0xaf, 0xe3, 0x82, 0xa2, 0xe3, 0x82, 0xa4, 0xe3,
0x82, 0xb9, 0xe3, 0x82, 0x92, 0xe9, 0xa3, 0x9f, 0xe3, 0x81, 0xb9, 0xe3, 0x81, 0x9f, 0xe3, 0x81,
0x8b, 0xe3, 0x81, 0xa3, 0xe3, 0x81, 0x9f, 0xe3, 0x81, 0x8b, 0xe3, 0x82, 0x89, 0xe8, 0xb2, 0xb7,
0xe3, 0x81, 0x84, 0xe3, 0x81, 0xab, 0xe5, 0x87, 0xba, 0xe3, 0x81, 0x8b, 0xe3, 0x81, 0x91, 0xe3,
0x81, 0xbe, 0xe3, 0x81, 0x97, 0xe3, 0x81, 0x9f, 0xe3, 0x80, 0x82, 0x0
};
utf8_string_t *str1 = utf8_init(data);
utf8_string_t *str2 = utf8_init(data1);
printf("%d %d\n", str1->length, str2->length);
/*utf8_free(str1);
utf8_free(str2);*/
One of the strings sometimes doesn't get initialized and when it does it reports it's length as 0 when it should be 25. I tried printing out the codepoints from the strings as they're initialized and they're completely ok so I am led to believe it's got something to do with the raw string allocation but I am not sure.
I know this has something to do with my memory management but I have been staring at this code for 2 days and still cannot figure out where my mistake is. Please help me.
EDIT: I think I managed to fix it thanks to the recommendations in the comments. Here's the updated code:
utf8_string_t *utf8_init(const uint8_t *s) {
utf8_string_t *str = malloc(sizeof(utf8_string_t));
if (!str) {
free(str);
return NULL;
}
uint64_t rawlen = strlen_uchar(s);
str->raw = malloc((rawlen + 1) * sizeof(uint8_t));
if (!str->raw) {
free(str->raw);
free(str);
return NULL;
}
memcpy(str->raw, s, rawlen + 1);
str->length = utf8_strlen(s);
str->chars = malloc(str->length * sizeof(uint32_t));
if (!str->chars) {
free(str->raw);
free(str->chars);
free(str);
return NULL;
}
uint8_t *p = (uint8_t*)s;
utf8_multibyte_t mb;
mb_clear(&mb);
int j = 0;
while (*p != '\0') {
int len = utf8len(*p);
mb.used = len;
mb.bytes[0] = *p++;
for (int i = 1; i < len; ++i) mb.bytes[i] = *p++;
uint32_t cp = utf8_decode(&mb);
str->chars[j++] = cp;
mb_clear(&mb);
}
return str;
}
Please do tell if there are still parts of it which might bring about undefined behavior/bugs.