0

Problem: I am given a string s of size n and I have many queries of type
[L,R]
and for each query I have to return the number of those sub strings of s which are equal to s[L..R] and which starts before index L in s .

constraints:
n <= 2*10^6
queries <= 10^5

One brute force approach is using the suffix array where we traverse the LCP array to find answer to each query.

I think an O(q log n ) approach is good for this problem . Please suggest any ??

Thank in advance ..

v78
  • 2,803
  • 21
  • 44

2 Answers2

1

A possible solution might be similar to a solution that I provided for fast string search with many queries to the same string. That solution has an example implementation in C.

  • Create an array of pointers into each char of your string. Sort it.
  • Look up the first occurrence of the query in the sorted auffix array with binary search.
  • Look up the last occurrence of the query. You can do this by incrementing the last char, so that you look for "abd" instead of "abc" to find the first non-match.
  • Count all occurrences between the two matches that start before L.

This solution, however, is not O(q log n), because sorting is already O(n log n) and the q query look-ups are O(q log n).

I've re-written the example I linked to for your problem. It is C, not C++ and it won't compile with a C++ compiler, because of the way malloc is used. It shouldn't be too hard to rewrite it in idiomatic C++.

The solution requires a lot of memory for the suffix array. It can treat about 130,000 queries an a 1.3 megabyte file in a bit more than a minute.

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#define die(...) exit((fprintf(stderr, "Fatal: " __VA_ARGS__), \
    putc(10, stderr), 1))



typedef struct Haystack Haystack;    

struct Haystack {
    size_t size;    /* Number of chars in string */
    char *buf;      /* Null-terminated char buffer */
    char **ptr;     /* Pointers into char buffer */
};

/*
 *      Count occurrence of c in zero-terminated string p.
 */
size_t strcount(const char *p, int c)
{
    size_t n = 0;

    for (;;) {
        p = strchr(p, c);
        if (p == NULL) return n;
        p++;
        n++;        
    }

    return 0;
}

/*
 *      String comparison via pointers to strings.
 */
int pstrcmp(const void *a, const void *b)
{
    const char *const *aa = a;
    const char *const *bb = b;

    return strcmp(*aa, *bb);
}

/*
 *      Create and prepare a hayst, i.e. a text file to search.
 */
Haystack *hayst_new(const char *fn)
{
    Haystack *hayst;
    FILE *f = fopen(fn, "r");
    char *p;
    char **pp;

    if (f == NULL) die("Couldn't open %s", fn);

    hayst = malloc(sizeof(*hayst));
    if (hayst == NULL) die("Allocation failed");

    fseek(f, 0, SEEK_END);
    hayst->size = ftell(f);
    fseek(f, 0, SEEK_SET);

    hayst->buf = malloc(hayst->size + 1);
    hayst->ptr = malloc(hayst->size * sizeof(*hayst->ptr));

    if (hayst->buf == NULL) die("Allocation failed");
    if (hayst->ptr == NULL) die("Allocation failed");

    fread(hayst->buf, 1, hayst->size, f);
    hayst->buf[hayst->size] = '\0';
    fclose(f);

    p = hayst->buf;
    pp = hayst->ptr;
    while (*p) *pp++ = p++;

    qsort(hayst->ptr, hayst->size, sizeof(*hayst->ptr), pstrcmp);

    return hayst;
}

/*
 *      Clean up hayst.
 */
void hayst_delete(Haystack *hayst)
{
    free(hayst->buf);
    free(hayst->ptr);
    free(hayst);
}

/*
 *      Binary range search for string pointers.
 */
static char **pstr_bsearch(const char *key, size_t len,
    char **arr, size_t high)
{
    size_t low = 0;

    while (low < high) {
        size_t mid = (low + high) / 2;
        int diff = strncmp(key, arr[mid], len);

        if (diff <= 0) high = mid;
        else low = mid + 1;
    }

    return arr + low;
}

/*
 *      Count occurrences of the string key in the haystack.
 */
size_t hayst_find(Haystack *hayst, size_t offset, size_t len)
{
    char *key = hayst->buf + offset;
    char **begin, **end;
    size_t n = 0;

    if (offset + len > hayst->size) return 0;

    begin = pstr_bsearch(key, len, hayst->ptr, hayst->size);
    if (begin == NULL) return 0;

    key[len - 1]++;
    end = pstr_bsearch(key, len, hayst->ptr, hayst->size);
    key[len - 1]--;
    if (end == NULL) return 0;
    if (end == begin) return 0;

    while (begin < end) {
        if (*begin < key) n++;
        begin++;
    }

    return n;
}

/*
 *      Example client code
 */
int main(int argc, char **argv)
{
    Haystack *hayst;
    FILE *f;

    if (argc != 3) die("Usage: %s string queries", *argv);

    hayst = hayst_new(argv[1]);

    f = fopen(argv[2], "r");
    if (f == NULL) die("Can't open %s.", argv[1]);
    for (;;) {
        char str[80];
        size_t p, len;
        size_t n;

        if (fgets(str, sizeof(str), f) == NULL) break;
        if (sscanf(str, "%zu %zu", &p, &len) < 2) continue;
        n = hayst_find(hayst, p, len);
        printf("%8zu %.*s\n", n, (int) len, hayst->buf + p);
    }
    fclose(f);

    hayst_delete(hayst);

    return 0;
}
Community
  • 1
  • 1
M Oehm
  • 28,726
  • 3
  • 31
  • 42
1

I have proposed to build a sorted suffix array and to query it with binary search. Another approach is to build a trie of the queries and make a single pass through the string to accumulate counts:

  • Read all queries. A query has the left and right indices as well as a count.
  • Build a trie for all queries. A pointer to a query serves as end marker for each query.
  • Traverse the string once. For each char, traverse the trie and accumulate the count at nodes that have a pointer to a query.

This method seems to be faster. It cannot handle duplicate queries. It also only makes sense when all queries are known beforehand. (Unfortunately, my example implementation has shown that my code for the other answer has a serious bug, probably in the ranged binary search; it miscounts some queries.)

Edit: I've added my example implementation in C -- not in C++, sorry -- below.

The implementation is wasteful, because it allots child nodes for the whole range of possible chars except '\0', of which the great majority will be NULL. A better implementation would map the chars of the queries to a tighter alphabet.

The trie must be built in an extra loop after reading the whole array, so that reallocations don't invalidate reference pointers into that array.

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#define die(...) exit((fprintf(stderr, "Fatal: " __VA_ARGS__), \
    putc(10, stderr), 1))



typedef struct Query Query;
typedef struct Trie Trie;
typedef struct Trienode Trienode;

struct Query {
    char *p;                /* starting point in buffer */
    size_t len;             /* length of query in chars */
    size_t count;           /* number of occurrences */
};

struct Trie {
    Trienode *head;
};

struct Trienode {
    Query *query;           /* query reference */
    Trienode *next[255];    /* child nodes */
};

/*
 *      Read whole file to buffer and return size in n.
 */
char *slurp(const char *fn, size_t *n)
{
    FILE *f = fopen(fn, "r");
    size_t size;
    char *buf;

    if (f == NULL) die("Couldn't open %s", fn);

    fseek(f, 0, SEEK_END);
    size = ftell(f);
    fseek(f, 0, SEEK_SET);

    buf = malloc(size + 1);

    if (buf == NULL) die("Allocation failed");

    fread(buf, 1, size, f);
    buf[size] = '\0';
    fclose(f);

    if (n) *n = size;
    return buf;
}

/*
 *      Insert query string and reference into trie.
 */
void trie_insert(Trie *trie, Query *q)
{
    char *str = q->p;
    Trienode **n = &trie->head;
    size_t i;

    for (i = 0; i < q->len; i++) {    
        if (*n == NULL) {
            *n = malloc(sizeof(**n));
            if (*n == NULL) die("Coudn't allocate node");
            memset(*n, 0, sizeof(**n));
        }

        n = &(*n)->next[(unsigned char) str[i] - 1];
    }   

    if (*n == NULL) {
        *n = malloc(sizeof(**n));
        if (*n == NULL) die("Coudn't allocate node");
        memset(*n, 0, sizeof(**n));
    }

    (*n)->query = q;
}

static void trie_delete_node(Trienode *n)
{
    size_t i;

    for (i = 0; i < 255; i++) {
        if (n->next[i]) trie_delete_node(n->next[i]);
    }

    free(n);
}

/*
 *      Destroy trie and its nodes.
 */
void trie_delete(Trie *trie)
{
    if (trie->head) trie_delete_node(trie->head);
}

/*
 *      Find occurrences of all queries. The count member of all
 *      queries must be 0 before calling this routine.
 */
void search(char *buf, Trie *trie)
{
    while (*buf) {
        Trienode *n = trie->head;
        char *p = buf;

        while (n && *p) {
            if (n->query) {
                Query *q = n->query;

                if (buf < q->p) q->count++;
            }
            n = n->next[(unsigned char) *p - 1];
            p++;
        }

        buf++;
    }
}

/*
 *      Example client code
 */
int main(int argc, char **argv)
{
    Query *query = NULL;
    size_t nquery = 0;
    size_t squery = 0;

    char *buf;
    size_t nbuf;

    Trie trie = {NULL};
    FILE *f;
    size_t i;

    if (argc != 3) die("Usage: %s string queries", *argv);

    // Read string buffer from file
    buf = slurp(argv[1], &nbuf);

    // Read query array
    f = fopen(argv[2], "r");
    if (f == NULL) die("Can't open %s.", argv[1]);
    for (;;) {
        char str[80];
        size_t p, len;

        if (fgets(str, sizeof(str), f) == NULL) break;
        if (sscanf(str, "%zu %zu", &p, &len) < 2) continue;

        if (nquery >= squery) {
            squery *= 2;
            if (squery == 0) squery = 0x400;
            query = realloc(query, squery * sizeof(*query));
            if (query == NULL) die("Reallocation failed.");
        }

        query[nquery].p = buf + p;
        query[nquery].len = len;
        query[nquery].count = 0;
        nquery++;
    }
    fclose(f);

    // Build tree from queries
    for (i = 0; i < nquery; i++) {
        Query *q = query + i;
        trie_insert(&trie, q);
    }

    // Assign the query counts
    search(buf, &trie);

    // Print results
    for (i = 0; i < nquery; i++) {
        Query *q = query + i;
        printf("%8zu %.*s\n", q->count, (int) q->len, q->p);
    }

    // Clean up    
    trie_delete(&trie);
    free(buf);
    free(query);

    return 0;
}
M Oehm
  • 28,726
  • 3
  • 31
  • 42