0

I implemented simple syllabification algorithm following Improved Lansky algorithm but it's really slow when I need to run this algorithm on corpus over 2 million words. Could someone point me in the direction what causes it to be so slow? Algorithm below:

  1. Everything after the last vowel (vowel group) belongs to the last syllable

  2. Everything before the first vowel (vowel group) belongs to the first syllable

  3. If the number of consonants between vowels is even (2n), they are divided into the halves first half belongs to the left vowel(s) and second to the right vowel(s) (n/n).

  4. If the number of consonants between vowel(s) is odd(2n + 1), we divide them into n / n + 1 parts.

  5. If there is only one consonant between vowels, it belongs to the left vowel(s).

    #include <stdio.h>
    #include <string.h>
    
    #define VOWELS "aeiou"
    
    int get_n_consonant_between(char *word, int length) {
         int count = 0;
         int i = 0;
    
         while (i++ < length) {
              if (strchr(VOWELS, *word)) break;
             word++;
             count++;
         } 
    
         return count;
     }
    
     void syllabification(char *word, int n_vowel_groups) {
         int i = 0, length = strlen(word), consonants;
         int syllables = 0, vowel_group = 0, syl_length = 0;
         char *syllable = word;
         char hola[length];
    
         memset(hola, 0, length);
    
         if (n_vowel_groups < 2) {
             printf("CAN'T BE SPLIT INTO SYLLABLES\n\n");
             return;
         }
    
         while (i < length) {
             if (strchr(VOWELS, word[i])) {
                 syl_length++;
                 i++;
                 if (vowel_group) continue;
                 vowel_group = 1;
             }
             else {
                 if (vowel_group) {
                      consonants = get_n_consonant_between(word + i, length - i);
                      if (consonants == 1) {
                          // printf("only one consonant\n");
                          syl_length++;
                          strncpy(hola, syllable, syl_length);
                          i++;
                      }
                      else {
                          int count = consonants / 2;
                          if ((consonants % 2) == 0) { /* number of consonants is 2n, first half belongs to the left vowel */
                         syl_length += count;
                }
                else {
                    syl_length += count;
                }
                strncpy(hola, syllable, syl_length);
                i += count;
            }
    
            syllables++;
            if (syllables == n_vowel_groups) {
                printf("syllable done %d: %s\n", syllables, syllable);
                break;
            }
            printf("syllable %d: %s\n", syllables, hola);
    
            syllable = word + i;
            syl_length = 0;
            memset(hola, 0, length);
        }
        else {
            syl_length++;
            i++;
        }
        vowel_group = 0;
         }
     }    
     }
    
     int count_vowel_groups(char *word) {
          int i, nvowels = 0;
          int vowel_group = 0;
    
          for (i = 0; i < strlen(word); i++) {
              if (strchr(VOWELS, word[i])) {
                  if (vowel_group) continue;
                  vowel_group = 1;
              }
              else {
                  if (vowel_group) nvowels++;
                  vowel_group = 0;
              }    
          }
          // printf("%d vowel groups\n", nvowels);
          return nvowels;
    }
    
     void repl() {
          char *line = NULL;
          size_t len = 0;
          int i = 0;
          int count;
          FILE *file = fopen("../syllables.txt", "r");
          while(i++ < 15) {
              getline(&line, &len, file);
              printf("\n\n%s\n", line);
              count = count_vowel_groups(line);
              syllabification(line, count);
          }
     }
    
     int main(int argc, char *argv[]) {
         // printf("Syllabification test:\n");
         repl();
     }
    

    `

  • This possibly belongs on [codereview.se], but basically I'd say your best bet is to avoid multiple scans of the input (why do you need to count the number of vowel groups at the beginning, for example?) and to find a faster way to identify vowels, such as a lookup table or even a simple bithack. – rici Mar 04 '18 at 02:36

2 Answers2

0

There are few things which you could do:

1) Profile the program and see where it spends most of the time.

2) Concentrate on most repetitive parts of the code.

3) Avoid multiple scans

4) Do not make unnecessary operations. Example:

a)

Do you need to always memset hola?

 memset(hola, 0, length);

Is seems to me that you can get of rid of it.

b)

for (i = 0; i < strlen(word); i++) {

No need to calculate strlen(word) inside the loop. You can do it outside:

 int len = strlen(word);
 for (i = 0; i < len; i++) {

You can get really good hints from profiling, follow them and zoom on the bottlenecks.

sg7
  • 6,108
  • 2
  • 32
  • 40
0

This is a lot of code to go through to even check if the implementation is correct, mainly because I don't know the terminology (like what exactly is a vowel group) of the algorithm. I've looked up and google returns me a lot of research papers (for which I can only see the abstract) for syllabification of different languages, so I'm not sure if the code is correct at all.

But I have a few suggestions that might make your code faster:

  1. Move all you strlen(word) out of the for-loop conditions. Save the length in a variable and use that variable instead. So from

    for (i = 0; i < strlen(word); i++)
    

    to

    size_t len = strlen(word);
    for(i = 0; i < len; i++)
    
  2. Don't use strchr for checking if a character is a vowel. I'd use a lookup table for this:

    // as global variable
    char vowels[256];
    
    int main(void)
    {
        vowels['a'] = 1;
        vowels['e'] = 1;
        vowels['i'] = 1;
        vowels['o'] = 1;
        vowels['u'] = 1;
        ...
    }
    

    and when you want to check if a character is a vowel:

    // 0x20 | c make c a lower case character
    if(vowel[0x20 | word[i]])
        syl_length++;
        i++;
        if (vowel_group) continue;
        vowel_group = 1;
    }
    

The first suggestion might give you a small performance increase, compilers are pretty clever and might optimize that anyway. The second suggestion might give you more performance, because it's just a look up. On the worst case strchr will have to go through the whole "aeiou" array many times.1

I also suggest that you profile your code. See this and this.


fotenotes

1I've made a very crud program that compares the runtime of the suggestion. I added a few extra bits of code in the hope that the compiler doesn't optimize the functions to aggressively.

#include <stdio.h>
#include <string.h>
#include <time.h>


int test1(time_t t)
{
    char text[] = "The lazy dog is very lazy";
    for(size_t i = 0; i < strlen(text); ++i)
        t += text[i];

    return t;
}

int test2(time_t t)
{
    char text[] = "The lazy dog is very lazy";
    size_t len = strlen(text);
    for(size_t i = 0; i < len; ++i)
        t += text[i];

    return t;
}

#define VOWELS "aeiou"
char vowels[256];

int test3(time_t t)
{
    char text[] = "The lazy dog is very lazy";
    size_t len = strlen(text);
    for(size_t i = 0; i < len; ++i)
    {
        if (strchr(VOWELS, text[i]))
            t += text[i];
        t += text[i];
    }

    return t;
}

int test4(time_t t)
{
    char text[] = "The lazy dog is very lazy";
    size_t len = strlen(text);
    for(size_t i = 0; i < len; ++i)
    {
        if(vowels[0x20 | text[i]])
            t += text[i];
        t += text[i];
    }

    return t;
}

int main(void)
{
    vowels['a'] = 1;
    vowels['e'] = 1;
    vowels['i'] = 1;
    vowels['o'] = 1;
    vowels['u'] = 1;
    long times = 50000000;

    long tmp = 0;

    clock_t t1 = 0, t2 = 0, t3 = 0, t4 = 0;

    for(long i = 0; i < times; ++i)
    {
        clock_t start,end;
        time_t t = time(NULL);

        start = clock();
        tmp += test1(t);
        end = clock();

        t1 += end - start;
        //t1 += ((double) (end - start)) / CLOCKS_PER_SEC;

        start = clock();
        tmp += test2(t);
        end = clock();

        t2 += end - start;

        start = clock();
        tmp += test3(t);
        end = clock();

        t3 += end - start;

        start = clock();
        tmp += test4(t);
        end = clock();

        t4 += end - start;
    }

    printf("t1: %lf %s\n", ((double) t1) / CLOCKS_PER_SEC, t1 < t2 ? "wins":"loses");
    printf("t2: %lf %s\n", ((double) t2) / CLOCKS_PER_SEC, t2 < t1 ? "wins":"loses");
    printf("t3: %lf %s\n", ((double) t3) / CLOCKS_PER_SEC, t3 < t4 ? "wins":"loses");
    printf("t4: %lf %s\n", ((double) t4) / CLOCKS_PER_SEC, t4 < t3 ? "wins":"loses");
    printf("tmp: %ld\n", tmp);


    return 0;
}

The results are:

$ gcc b.c -ob -Wall -O0
$ ./b 
t1: 10.866770 loses
t2: 7.588057 wins
t3: 10.801546 loses
t4: 8.366050 wins

$ gcc b.c -ob -Wall -O1
$ ./b
t1: 7.409297 loses
t2: 7.082418 wins
t3: 11.415080 loses
t4: 7.847086 wins

$ gcc b.c -ob -Wall -O2
$ ./b
t1: 6.292438 loses
t2: 5.855348 wins
t3: 9.306874 loses
t4: 6.584076 wins

$ gcc b.c -ob -Wall -O3
$ ./b
t1: 6.317390 loses
t2: 5.922087 wins
t3: 9.436450 loses
t4: 6.722685 wins
Pablo
  • 13,271
  • 4
  • 39
  • 59