2

I'm trying to find URLs within a large block of text

// Excerpt: repeatedly runs a POSIX extended regex over filecontent and prints
// every match together with its byte range.
// NOTE(review): filecontent and f are not declared in this excerpt; presumably
// they are set up by the elided file-reading code -- confirm.
regex_t reg;

const char *regex="REGEXGOESHERE";
regmatch_t matches[16];

//Read data into variable filecontent
// NOTE(review): the return value of regcomp is not checked here, so a pattern
// that fails to compile still reaches the regexec call below.
regcomp(&reg, regex, REG_EXTENDED);

int offset=0;
int j;
int found=0;
int start,end;
// Each iteration searches the text that remains after the previous match.
while( regexec(&reg, filecontent+offset, 16, matches, 0) == 0)
{
    printf("\n\n");
    // Match offsets are relative to filecontent+offset; adding offset turns
    // them into positions within filecontent itself.
    start = matches[0].rm_so+offset;
    // end is the inclusive index of the last matched byte.
    end = matches[0].rm_eo-1+offset;

    printf("regex /%s/ at bytes %d-%d\n",
                regex, start, end);

    // Print the matched bytes one character at a time.
    for (j=start; j<=end; j++)
    {
        printf("%c",filecontent[j]);
    }
    // Resume the next search just past this match.
    // NOTE(review): if rm_eo is 0 (an empty match) offset does not advance.
    offset += matches[0].rm_eo;
    found = 1;
}
close(f);

Now this works for a simple regex in const char * regex, like say regex = "https?.*.png". But if I want a complex regex for a URL like (https?:\/\/.*\.(?:png|jpg)), I have to escape the backslashes and hence it becomes:

"(https?:\\/\\/.*\\.(?:png|jpg))";

And then running it gives a segmentation fault.

What might be going wrong?

The Guy with The Hat
  • 10,836
  • 8
  • 57
  • 75
user1265125
  • 2,608
  • 8
  • 42
  • 65

1 Answer

2

You need to check the return value of regcomp; it will tell you that your regex is invalid.

#include <regex.h>
#include <stdio.h>

/*
 * Compile the URL pattern and report why compilation fails.
 *
 * Returns 0 if regcomp accepts the pattern; otherwise prints the message
 * produced by regerror and returns 1.
 */
int main(void) {
  regex_t reg;

  const char *regex="(https?:\\/\\/.*\\.(?:png|jpg))";
  int ret;
  char err[1024];

  /* regcomp returns non-zero when the pattern cannot be compiled. */
  ret = regcomp(&reg, regex, REG_EXTENDED);
  if (ret != 0) {
        /* Translate the error code into text; size is taken from the buffer. */
        regerror(ret, &reg, err, sizeof err);
        fprintf(stderr, "%s\n", err);
        return 1;
  }

  regfree(&reg);
  return 0;
}

You will get the error "Invalid preceding regular expression".

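The reason is the non-capturing group syntax (?: which is not supported by POSIX regular expressions, not even the extended ones. Use an ordinary group such as (png|jpg) instead.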

CMoi
  • 846
  • 4
  • 9