Reason for Segmentation Fault During Depth First Search on Tree of Large Size C++

Question

I'm trying to solve https://open.kattis.com/problems/rootedsubtrees and part of the solution requires finding the minimum distance between any 2 nodes on the tree. To do this, I'm using Lowest Common Ancestor as a subroutine. Part of my LCA code uses a DFS to traverse the tree. Somehow, running this code on a line graph of size 200000 leads to a segmentation fault during the DFS section of the code.

#pragma GCC optimize("Ofast")
#pragma GCC target("sse,sse2,sse3,ssse3,sse4,popcnt,abm,mmx,avx,avx2,fma")
#include <bits/stdc++.h>
using namespace std;

typedef long long ll;
typedef vector<int> vi;
#define fast_cin()                    \
    ios_base::sync_with_stdio(false); \
    cin.tie(NULL);                    \
    cout.tie(NULL);

int n, q, idx;
vector<int> adjlist[200009];
vector<int> L, E,
    H;  // depth at traversal index, node at traversal index, first traversal index of node

void dfs(int cur, int depth) {
    cout << "dfs " << cur << " " << idx << endl;
    H[cur] = idx;
    E[idx] = cur;
    L[idx++] = depth;
    for (int &nxt : adjlist[cur]) {
        if (H[nxt] != -1) continue;
        dfs(nxt, depth + 1);
        E[idx] = cur;  // backtrack to current node
        L[idx++] = depth;
    }
}

class SparseTable {  // OOP style
   private:
    vi A, P2, L2;
    vector<vi> SpT;  // the Sparse Table
   public:
    SparseTable() {}  // default constructor

    SparseTable(vi &initialA) {  // pre-processing routine
        A = initialA;
        int n = (int)A.size();
        int L2_n = (int)log2(n) + 1;
        P2.assign(L2_n, 0);
        L2.assign(1 << L2_n, 0);
        for (int i = 0; i <= L2_n; ++i) {
            P2[i] = (1 << i);  // to speed up 2^i
            L2[(1 << i)] = i;  // to speed up log_2(i)
        }
        for (int i = 2; i < P2[L2_n]; ++i)
            if (L2[i] == 0) L2[i] = L2[i - 1];  // to fill in the blanks

        // the initialization phase
        SpT = vector<vi>(L2[n] + 1, vi(n));
        for (int j = 0; j < n; ++j) SpT[0][j] = j;  // RMQ of sub array [j..j]

        // the two nested loops below have overall time complexity = O(n log n)
        for (int i = 1; P2[i] <= n; ++i)               // for all i s.t. 2^i <= n
            for (int j = 0; j + P2[i] - 1 < n; ++j) {  // for all valid j
                int x = SpT[i - 1][j];                 // [j..j+2^(i-1)-1]
                int y = SpT[i - 1][j + P2[i - 1]];     // [j+2^(i-1)..j+2^i-1]
                SpT[i][j] = A[x] <= A[y] ? x : y;
            }
    }

    int RMQ(int i, int j) {
        int k = L2[j - i + 1];          // 2^k <= (j-i+1)
        int x = SpT[k][i];              // covers [i..i+2^k-1]
        int y = SpT[k][j - P2[k] + 1];  // covers [j-2^k+1..j]
        return A[x] <= A[y] ? x : y;
    }
};

int LCA(int u, int v, SparseTable &SpT) {
    if (H[u] > H[v]) swap(u, v);
    return E[SpT.RMQ(H[u], H[v])];
}

int APSP(int u, int v, SparseTable &SpT) {
    int ancestor = LCA(u, v, SpT);
    return L[H[u]] + L[H[v]] - 2 * L[H[ancestor]];
}

int main() {
    fast_cin();
    cin >> n >> q;

    L.assign(2 * (n + 9), 0);
    E.assign(2 * (n + 9), 0);
    H.assign(n + 9, -1);
    idx = 0;

    int u, v;
    for (int i = 0; i < n - 1; i++) {
        cin >> u >> v;
        u--;
        v--;
        adjlist[u].emplace_back(v);
        adjlist[v].emplace_back(u);
    }
    dfs(0, 0);
    SparseTable SpT(L);

    ll d;
    while (q--) {
        cin >> u >> v;
        u--;
        v--;
        d = (ll) APSP(u, v, SpT) + 1;
        cout << (ll) n - d + (d) * (d + 1) / 2 << endl;
    }

    return 0;
}

Using the following Python Code to generate the input of a large line graph

n = 200000
q = 1
print(n, q)
for i in range(1, n):
    print(i, i+1)
print(1, 200000)

I get the following last few lines of output before my program crashes.

.
.
.
dfs 174494 174494
dfs 174495 174495
dfs 174496 174496
dfs 174497 174497
dfs 174498 174498
Segmentation fault (core dumped)

Is the problem an issue of exhausting stack space with the recursion or something else?

you can also include the depth in your logging, and add sanity checks for the [ index ] values to make sure index >= 0 and if you have a sensible maximum index < max — Dave S, Apr 24 '22 at 03:29
`typedef long long ll; typedef vector vi;` -- What is the purpose for these macros? All they do is obfuscate the code. Then this: `#include ` is not a standard C++ header. Include the actual standard headers, i.e ``, ``, etc. — PaulMcKenzie, Apr 24 '22 at 03:37
*Somehow, running this code* -- You can figure out why the "somehow" is occurring by [debugging the code](https://stackoverflow.com/questions/25385173/what-is-a-debugger-and-how-can-it-help-me-diagnose-problems). — PaulMcKenzie, Apr 24 '22 at 03:41
`for (int i = 0; i <= L2_n; ++i) { P2[i] = (1 << i);` -- This is an out-of-bounds access when `i == L2_n`. Loops using `<=` as a condition are always suspect for off-by-one errors. — PaulMcKenzie, Apr 24 '22 at 04:03
@PaulMcKenzie Thanks for pointing out the the off-by-one error. As for the use of the macros and non-standard headers, I understand they are not good software development practices. Rather, I only use them in competitive programming settings to reduce the amount of code I need to type (to increase speed). — Brandon Tang, Apr 24 '22 at 04:13
That "increase in typing speed" is all negated by compilation time. When you introduce macros, the code takes longer to compile/build. Then that `#include — PaulMcKenzie, Apr 24 '22 at 04:14
Also, how is the python code related to the C++ code? I don't see a `print` function in the C++ code. I could understand if the python code was producing randomized cases that is fed into the C++ code. Maybe that's what you could do in the C++ code — PaulMcKenzie, Apr 24 '22 at 04:19
@PaulMcKenzie The python code is used to generate a large testcase for the C++ code. I could make it generate randomized test cases, but I felt that that was not required because the problem I had was with handling large input leading to crashing rather than handling large input leading to wrong answers. — Brandon Tang, Apr 24 '22 at 04:28

score 1 · Answer 1 · answered Apr 24 '22 at 04:09

You posted a lot of code, but here is one obvious error in the SparseMatrix class:

std::vector<int> P2;
//...
P2.assign(L2_n, 0);
for (int i = 0; i <= L2_n; ++i) 
{
   P2[i] = (1 << i);  // <-- Out of bounds access when i == L2_n

To show you the error, change that line of code to this:

   P2.at(i) = (1 << i);  // <-- Out of bounds access when i == L2_n

You will now get a std::out_of_range exception thrown.

If you write a loop using <=, that loop will be considered suspicious, since a lot of off-by-one and buffer overrun errors occur with loop conditions written this way.

score 0 · Answer 2 · answered Apr 24 '22 at 04:24

I believe stack exhaustion was the main problem in running the code on my machine. I re-implemented the DFS in an iterative fashion.

stack<tuple<int, int, bool>> st; // cur, depth, first_time
st.push ({0, 0, 1});
while (!st.empty()) {
    auto [cur, depth, first_time] = st.top();
    st.pop();

    if (first_time){
        H[cur] = idx;
    }
    E[idx] = cur;
    L[idx++] = depth;
    for (int &nxt : adjlist[cur]) {
        if (H[nxt] != -1) continue;
        st.push({cur, depth, 0});
        st.push({nxt, depth+1, 1});
        break;
    }
}

and my code was able to run the large testcase on my machine.

I'm not sure is this is relevant to the original question, but after this change, the code still flagged a run-time error on the online judge and I eventually realized that the issue was that the sparse table was using too much memory, so I fixed that by avoiding wasted declared but not used memory spaces in rows of the sparse table. Then the online judge deemed it as being too slow. So I reverted the DFS code back to the recursive version, and it was accepted. Note that the accepted solution actually crashes on my machine when running the large testcase... I guess my machine has a more limited stack space than the online grader.

The accepted solution is here

#pragma GCC optimize("Ofast")
#pragma GCC target("sse,sse2,sse3,ssse3,sse4,popcnt,abm,mmx,avx,avx2,fma")
#include <bits/stdc++.h>
using namespace std;

typedef long long ll;
typedef vector<int> vi;
#define fast_cin()                    \
    ios_base::sync_with_stdio(false); \
    cin.tie(NULL);                    \
    cout.tie(NULL);

int n, q, idx;
vector<int> adjlist[(int)2e5 + 9];
vector<int> L, E,
    H;  // depth at traversal index, node at traversal index, first traversal index of node

void dfs(int cur, int depth) {
    H[cur] = idx;
    E[idx] = cur;
    L[idx++] = depth;
    for (int &nxt : adjlist[cur]) {
        if (H[nxt] != -1) continue;
        dfs(nxt, depth + 1);
        E[idx] = cur;  // backtrack to current node
        L[idx++] = depth;
    }
}

class SparseTable {  // OOP style
   private:
    vi A, P2, L2;
    vector<vi> SpT;  // the Sparse Table
   public:
    SparseTable() {}  // default constructor

    SparseTable(vi &initialA) {  // pre-processing routine
        A = initialA;
        int n = (int)A.size();
        int L2_n = (int)log2(n) + 1;
        P2.assign(L2_n + 1, 0);
        L2.assign((1 << L2_n) + 1, 0);
        for (int i = 0; i <= L2_n; ++i) {
            P2[i] = (1 << i);  // to speed up 2^i
            L2[(1 << i)] = i;  // to speed up log_2(i)
        }
        for (int i = 2; i < P2[L2_n]; ++i)
            if (L2[i] == 0) L2[i] = L2[i - 1];  // to fill in the blanks

        // the initialization phase
        SpT = vector<vi>(L2[n] + 1, vi());
        SpT[0] = vi(n, 0);
        for (int j = 0; j < n; ++j) SpT[0][j] = j;  // RMQ of sub array [j..j]

        // the two nested loops below have overall time complexity = O(n log n)
        for (int i = 1; P2[i] <= n; ++i) {             // for all i s.t. 2^i <= n
            SpT[i] = vi(n + 1 - P2[i]);                // initialize SpT[i]
            for (int j = 0; j + P2[i] - 1 < n; ++j) {  // for all valid j
                int x = SpT[i - 1][j];                 // [j..j+2^(i-1)-1]
                int y = SpT[i - 1][j + P2[i - 1]];     // [j+2^(i-1)..j+2^i-1]
                SpT[i][j] = A[x] <= A[y] ? x : y;
            }
        }
    }

    int RMQ(int i, int j) {
        int k = L2[j - i + 1];          // 2^k <= (j-i+1)
        int x = SpT[k][i];              // covers [i..i+2^k-1]
        int y = SpT[k][j - P2[k] + 1];  // covers [j-2^k+1..j]
        return A[x] <= A[y] ? x : y;
    }
};

int LCA(int u, int v, SparseTable &SpT) {
    if (H[u] > H[v]) swap(u, v);
    return E[SpT.RMQ(H[u], H[v])];
}

int APSP(int u, int v, SparseTable &SpT) {
    int ancestor = LCA(u, v, SpT);
    return L[H[u]] + L[H[v]] - 2 * L[H[ancestor]];
}

int main() {
    fast_cin();
    cin >> n >> q;

    L.assign(2 * (n), 0);
    E.assign(2 * (n), 0);
    H.assign(n, -1);
    idx = 0;

    int u, v;
    for (int i = 0; i < n - 1; i++) {
        cin >> u >> v;
        u--;
        v--;
        adjlist[u].emplace_back(v);
        adjlist[v].emplace_back(u);
    }
    dfs(n - 1, 0);
    SparseTable SpT(L);

    ll d;
    while (q--) {
        cin >> u >> v;
        u--;
        v--;
        d = (ll)APSP(u, v, SpT) + 1LL;
        cout << (ll)n - d + (d) * (d + 1) / (ll)2 << endl;
    }

    return 0;
}

Reason for Segmentation Fault During Depth First Search on Tree of Large Size C++

2 Answers2