Don't get too clever. This is short enough that it doesn't need a library, but complex enough that we have to be careful to do it right. On the other hand, I'm not telling you not to use a library: if you'd rather use one, use one. But library shopping is off topic.
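Here's a routine that extracts the path (and only the path) from a URL, percent-decoding it along the way. It treats everything before the first ? or # as the path, so hand it the URL starting at the path.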
I'm accustomed to passing arguments to server scripts still completely encoded and letting the script handle the decoding, so to extract the script's argument portion (the query string), just cut between the ? and the optional #. This is trivial. (The # really shouldn't be there, but I've seen dumb things before.)
/*
 * Numeric value (0-15) of the hexadecimal digit c, accepting both upper-
 * and lower-case letters. Returns -1 when c is not a hexadecimal digit.
 */
static int hexdigit(char c)
{
    if (c >= '0' && c <= '9') {
        return c - '0';
    }
    if (c >= 'A' && c <= 'F') {
        return 10 + (c - 'A');
    }
    if (c >= 'a' && c <= 'f') {
        return 10 + (c - 'a');
    }
    return -1;
}
/*
 * Decode the path part of `url` into a freshly allocated string.
 *
 * Everything from the first character of `url` up to (but not including)
 * the first '?' or '#', or the end of the string, is treated as the path.
 * No scheme or host is stripped. Within that span each "%XX" escape becomes
 * the byte it encodes and each '+' becomes a space; every other character
 * is copied unchanged.
 *
 * Returns a NUL-terminated string that the caller must free(), or NULL on
 * any error; check errno:
 *   EINVAL  url is NULL, a '%' is not followed by two hex digits, or the
 *           escape is "%00"
 *   ENOMEM  the output buffer could not be allocated
 */
char *get_path(const char *url)
{
    if (!url) {
        errno = EINVAL;
        return NULL;
    }

    /* Pass 1: validate every escape and count the bytes the output needs. */
    size_t pathlen = 0;
    const char *s;
    for (s = url; *s && *s != '?' && *s != '#'; ++s) {
        if (*s == '%') {
            int hi = hexdigit(s[1]);
            /* Only look at s[2] once s[1] is known not to be the terminator. */
            int lo = (hi < 0) ? -1 : hexdigit(s[2]);
            /*
             * "%00" is rejected as well: a decoded NUL would silently
             * truncate the returned C string.
             */
            if (lo < 0 || (hi == 0 && lo == 0)) {
                errno = EINVAL;
                return NULL;
            }
            s += 2; /* together with the loop's ++s this skips all of "%XX" */
        }
        ++pathlen; /* one output byte per literal character or escape */
    }

    char *path = malloc(pathlen + 1);
    if (!path) {
        errno = ENOMEM; /* ISO C does not require malloc to set errno */
        return NULL;
    }

    /* Pass 2: the input is known to be well formed, so just emit bytes. */
    char *t = path;
    for (s = url; *s && *s != '?' && *s != '#'; ++s) {
        if (*s == '%') {
            *t++ = (char)((hexdigit(s[1]) << 4) | hexdigit(s[2]));
            s += 2;
        } else if (*s == '+') {
            *t++ = ' ';
        } else {
            *t++ = *s;
        }
    }
    *t = '\0';
    return path;
}
This is the standard way of working in C: we make two passes. The first pass validates the input, finds the end, and measures the output space required; the second pass generates the output.