For each sheet that you are treating, you could build a suffix array as described in this article. Before you start your search, read the sheet, find the line beginnings (as integer indices into the sheet buffer), create the suffix array and sort it as described in the article.
Now, if you are looking for the lines in which a pattern, say "table", occurs, you can search for the next entry after "table" and the next entry after "tablf", which is the first non-match, where you have moved the right-most letter, odometer-style.
If both indices are the same, there are no matches. If they are different, you'll get a list of pointers into the sheet:
"tab. And now ..."
----------------------------------------------------------------
"table and ..." 0x0100ab30
"table water for ..." 0x0100132b
"tablet computer ..." 0x01000208
----------------------------------------------------------------
"tabloid reporter ..."
This will give you a list of pointers from which, by subtracting the base pointer of the sheet buffer, you can get the integer offsets. Comparison with the line beginnings will give you the line numbers that correspond to these pointers. (The line numbers are sorted, so you can do binary search here.)
The memory overhead is an array of pointers that has the same size as the sheet buffer, so for 30,000 strings of 200 chars, that will be about 48MB on a 64-bit machine. (The overhead of the line indices is negligible.)
Sorting the array will take long, but it is done only once for each sheet.
Edit: The idea seems to work well. I have implemented it and can scan a dictionary of about 130,000 words on a text file of nearly 600k in less then one second:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#define die(...) exit((fprintf(stderr, "Fatal: " __VA_ARGS__), \
putc(10, stderr), 1))
typedef struct Sheet Sheet;
struct Sheet {
size_t size; /* Number of chars */
char *buf; /* Null-terminated char buffer */
char **ptr; /* Pointers into char buffer */
size_t nline; /* number of lines */
int *line; /* array of offset of line beginnings */
size_t naux; /* size of scratch array */
char **aux; /* scratch array */
};
/*
* Count occurrence of c in zero-terminated string p.
*/
size_t strcount(const char *p, int c)
{
size_t n = 0;
for (;;) {
p = strchr(p, c);
if (p == NULL) return n;
p++;
n++;
}
return 0;
}
/*
* String comparison via pointers to strings.
*/
int pstrcmp(const void *a, const void *b)
{
const char *const *aa = a;
const char *const *bb = b;
return strcmp(*aa, *bb);
}
/*
* Pointer comparison.
*/
int ptrcmp(const void *a, const void *b)
{
const char *const *aa = a;
const char *const *bb = b;
if (*aa == *bb) return 0;
return (*aa < *bb) ? -1 : 1;
}
/*
* Create and prepare a sheet, i.e. a text file to search.
*/
Sheet *sheet_new(const char *fn)
{
Sheet *sheet;
FILE *f = fopen(fn, "r");
size_t n;
int last;
char *p;
char **pp;
if (f == NULL) die("Couldn't open %s", fn);
sheet = malloc(sizeof(*sheet));
if (sheet == NULL) die("Allocation failed");
fseek(f, 0, SEEK_END);
sheet->size = ftell(f);
fseek(f, 0, SEEK_SET);
sheet->buf = malloc(sheet->size + 1);
sheet->ptr = malloc(sheet->size * sizeof(*sheet->ptr));
if (sheet->buf == NULL) die("Allocation failed");
if (sheet->ptr == NULL) die("Allocation failed");
fread(sheet->buf, 1, sheet->size, f);
sheet->buf[sheet->size] = '\0';
fclose(f);
sheet->nline = strcount(sheet->buf, '\n');
sheet->line = malloc(sheet->nline * sizeof(*sheet->line));
sheet->aux = NULL;
sheet->naux = 0;
n = 0;
last = 0;
p = sheet->buf;
pp = sheet->ptr;
while (*p) {
*pp++ = p;
if (*p == '\n') {
sheet->line[n++] = last;
last = p - sheet->buf + 1;
}
p++;
}
qsort(sheet->ptr, sheet->size, sizeof(*sheet->ptr), pstrcmp);
return sheet;
}
/*
* Clean up sheet.
*/
void sheet_delete(Sheet *sheet)
{
free(sheet->buf);
free(sheet->ptr);
free(sheet->line);
free(sheet->aux);
free(sheet);
}
/*
* Binary range search for string pointers.
*/
static char **pstr_bsearch(const char *key,
char **arr, size_t high)
{
size_t low = 0;
while (low < high) {
size_t mid = (low + high) / 2;
int diff = strcmp(key, arr[mid]);
if (diff < 0) high = mid;
else low = mid + 1;
}
return arr + low;
}
/*
* Binary range search for line offsets.
*/
static const int *int_bsearch(int key, const int *arr, size_t high)
{
size_t low = 0;
while (low < high) {
size_t mid = (low + high) / 2;
int diff = key - arr[mid];
if (diff < 0) high = mid;
else low = mid + 1;
}
if (low < 1) return NULL;
return arr + low - 1;
}
/*
* Find occurrences of the string key in the sheet. Returns the
* number of lines in which the key occurs and assigns up to
* max lines to the line array. (If max is 0, line may be NULL.)
*/
int sheet_find(Sheet *sheet, char *key,
int line[], int max)
{
char **begin, **end;
int n = 0;
size_t i, m;
size_t last;
begin = pstr_bsearch(key, sheet->ptr, sheet->size);
if (begin == NULL) return 0;
key[strlen(key) - 1]++;
end = pstr_bsearch(key, sheet->ptr, sheet->size);
key[strlen(key) - 1]--;
if (end == NULL) return 0;
if (end == begin) return 0;
m = end - begin;
if (m > sheet->naux) {
if (sheet->naux == 0) sheet->naux = 0x100;
while (sheet->naux < m) sheet->naux *= 2;
sheet->aux = realloc(sheet->aux, sheet->naux * sizeof(*sheet->aux));
if (sheet->aux == NULL) die("Re-allocation failed");
}
memcpy(sheet->aux, begin, m * sizeof(*begin));
qsort(sheet->aux, m, sizeof(*begin), ptrcmp);
last = 0;
for (i = 0; i < m; i++) {
int offset = sheet->aux[i] - sheet->buf;
const int *p;
p = int_bsearch(offset, sheet->line + last, sheet->nline - last);
if (p) {
if (n < max) line[n] = p - sheet->line;
last = p - sheet->line + 1;
n++;
}
}
return n;
}
/*
* Example client code
*/
int main(int argc, char **argv)
{
Sheet *sheet;
FILE *f;
if (argc != 3) die("Usage: %s patterns corpus", *argv);
sheet = sheet_new(argv[2]);
f = fopen(argv[1], "r");
if (f == NULL) die("Can't open %s.", argv[1]);
for (;;) {
char str[80];
int line[50];
int i, n;
if (fgets(str, sizeof(str), f) == NULL) break;
strtok(str, "\n");
n = sheet_find(sheet, str, line, 50);
printf("%8d %s\n", n, str);
if (n > 50) n = 50;
for (i = 0; i < n; i++) printf(" [%d] %d\n", i, line[i] + 1);
}
fclose(f);
sheet_delete(sheet);
return 0;
}
The implementation has its rough edges, but it works. I'm not especially fond of the scratch array and the additional sorting on the found pointer range, but it turns out that even sorting the large suffix array doesn't take too long.
You can extend this solution to more sheets, if you like.