A possible solution might be similar to a solution that I provided for fast string search with many queries to the same string. That solution has an example implementation in C.
- Create an array of pointers into each char of your string. Sort it.
- Look up the first occurrence of the query in the sorted auffix array with binary search.
- Look up the last occurrence of the query. You can do this by incrementing the last char, so that you look for "abd" instead of "abc" to find the first non-match.
- Count all occurrences between the two matches that start before
L
.
This solution, however, is not O(q log n), because sorting is already O(n log n) and the q query look-ups are O(q log n).
I've re-written the example I linked to for your problem. It is C, not C++ and it won't compile with a C++ compiler, because of the way malloc
is used. It shouldn't be too hard to rewrite it in idiomatic C++.
The solution requires a lot of memory for the suffix array. It can treat about 130,000 queries an a 1.3 megabyte file in a bit more than a minute.
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#define die(...) exit((fprintf(stderr, "Fatal: " __VA_ARGS__), \
putc(10, stderr), 1))
typedef struct Haystack Haystack;
struct Haystack {
size_t size; /* Number of chars in string */
char *buf; /* Null-terminated char buffer */
char **ptr; /* Pointers into char buffer */
};
/*
* Count occurrence of c in zero-terminated string p.
*/
size_t strcount(const char *p, int c)
{
size_t n = 0;
for (;;) {
p = strchr(p, c);
if (p == NULL) return n;
p++;
n++;
}
return 0;
}
/*
* String comparison via pointers to strings.
*/
int pstrcmp(const void *a, const void *b)
{
const char *const *aa = a;
const char *const *bb = b;
return strcmp(*aa, *bb);
}
/*
* Create and prepare a hayst, i.e. a text file to search.
*/
Haystack *hayst_new(const char *fn)
{
Haystack *hayst;
FILE *f = fopen(fn, "r");
char *p;
char **pp;
if (f == NULL) die("Couldn't open %s", fn);
hayst = malloc(sizeof(*hayst));
if (hayst == NULL) die("Allocation failed");
fseek(f, 0, SEEK_END);
hayst->size = ftell(f);
fseek(f, 0, SEEK_SET);
hayst->buf = malloc(hayst->size + 1);
hayst->ptr = malloc(hayst->size * sizeof(*hayst->ptr));
if (hayst->buf == NULL) die("Allocation failed");
if (hayst->ptr == NULL) die("Allocation failed");
fread(hayst->buf, 1, hayst->size, f);
hayst->buf[hayst->size] = '\0';
fclose(f);
p = hayst->buf;
pp = hayst->ptr;
while (*p) *pp++ = p++;
qsort(hayst->ptr, hayst->size, sizeof(*hayst->ptr), pstrcmp);
return hayst;
}
/*
* Clean up hayst.
*/
void hayst_delete(Haystack *hayst)
{
free(hayst->buf);
free(hayst->ptr);
free(hayst);
}
/*
* Binary range search for string pointers.
*/
static char **pstr_bsearch(const char *key, size_t len,
char **arr, size_t high)
{
size_t low = 0;
while (low < high) {
size_t mid = (low + high) / 2;
int diff = strncmp(key, arr[mid], len);
if (diff <= 0) high = mid;
else low = mid + 1;
}
return arr + low;
}
/*
* Count occurrences of the string key in the haystack.
*/
size_t hayst_find(Haystack *hayst, size_t offset, size_t len)
{
char *key = hayst->buf + offset;
char **begin, **end;
size_t n = 0;
if (offset + len > hayst->size) return 0;
begin = pstr_bsearch(key, len, hayst->ptr, hayst->size);
if (begin == NULL) return 0;
key[len - 1]++;
end = pstr_bsearch(key, len, hayst->ptr, hayst->size);
key[len - 1]--;
if (end == NULL) return 0;
if (end == begin) return 0;
while (begin < end) {
if (*begin < key) n++;
begin++;
}
return n;
}
/*
* Example client code
*/
int main(int argc, char **argv)
{
Haystack *hayst;
FILE *f;
if (argc != 3) die("Usage: %s string queries", *argv);
hayst = hayst_new(argv[1]);
f = fopen(argv[2], "r");
if (f == NULL) die("Can't open %s.", argv[1]);
for (;;) {
char str[80];
size_t p, len;
size_t n;
if (fgets(str, sizeof(str), f) == NULL) break;
if (sscanf(str, "%zu %zu", &p, &len) < 2) continue;
n = hayst_find(hayst, p, len);
printf("%8zu %.*s\n", n, (int) len, hayst->buf + p);
}
fclose(f);
hayst_delete(hayst);
return 0;
}