I implemented simple syllabification algorithm following Improved Lansky algorithm but it's really slow when I need to run this algorithm on corpus over 2 million words. Could someone point me in the direction what causes it to be so slow? Algorithm below:
Everything after the last vowel (vowel group) belongs to the last syllable
Everything before the first vowel (vowel group) belongs to the first syllable
If the number of consonants between vowels is even (2n), they are divided into the halves first half belongs to the left vowel(s) and second to the right vowel(s) (n/n).
If the number of consonants between vowel(s) is odd(2n + 1), we divide them into n / n + 1 parts.
If there is only one consonant between vowels, it belongs to the left vowel(s).
#include <stdio.h> #include <string.h> #define VOWELS "aeiou" int get_n_consonant_between(char *word, int length) { int count = 0; int i = 0; while (i++ < length) { if (strchr(VOWELS, *word)) break; word++; count++; } return count; } void syllabification(char *word, int n_vowel_groups) { int i = 0, length = strlen(word), consonants; int syllables = 0, vowel_group = 0, syl_length = 0; char *syllable = word; char hola[length]; memset(hola, 0, length); if (n_vowel_groups < 2) { printf("CAN'T BE SPLIT INTO SYLLABLES\n\n"); return; } while (i < length) { if (strchr(VOWELS, word[i])) { syl_length++; i++; if (vowel_group) continue; vowel_group = 1; } else { if (vowel_group) { consonants = get_n_consonant_between(word + i, length - i); if (consonants == 1) { // printf("only one consonant\n"); syl_length++; strncpy(hola, syllable, syl_length); i++; } else { int count = consonants / 2; if ((consonants % 2) == 0) { /* number of consonants is 2n, first half belongs to the left vowel */ syl_length += count; } else { syl_length += count; } strncpy(hola, syllable, syl_length); i += count; } syllables++; if (syllables == n_vowel_groups) { printf("syllable done %d: %s\n", syllables, syllable); break; } printf("syllable %d: %s\n", syllables, hola); syllable = word + i; syl_length = 0; memset(hola, 0, length); } else { syl_length++; i++; } vowel_group = 0; } } } int count_vowel_groups(char *word) { int i, nvowels = 0; int vowel_group = 0; for (i = 0; i < strlen(word); i++) { if (strchr(VOWELS, word[i])) { if (vowel_group) continue; vowel_group = 1; } else { if (vowel_group) nvowels++; vowel_group = 0; } } // printf("%d vowel groups\n", nvowels); return nvowels; } void repl() { char *line = NULL; size_t len = 0; int i = 0; int count; FILE *file = fopen("../syllables.txt", "r"); while(i++ < 15) { getline(&line, &len, file); printf("\n\n%s\n", line); count = count_vowel_groups(line); syllabification(line, count); } } int main(int argc, char *argv[]) { // printf("Syllabification test:\n"); repl(); }
`