2

I'm learning C++ and am doing something I'm comfortable with in java to start out. Particle simulation and flocking using a quadtree to cheaply find particles in a region. Everything is working but when I use the quadtree to get the particles from a region it's really slow (about 1s for 5000 calls).

I tried replacing the vector with an array and measuring the execution time of various parts of the function. Am I making any rooky mistakes like unnecessarily copying objects etc.? I'm using 5000 particles, I can't imagine 1fps is the fastest it can go.

Full code for minimal reproducable example as per request:

main.cpp

#include <string>
#include <iostream>
#include <random>
#include <chrono>
#include <thread>
#include <cmath>

#include "Particle.h"
#include "Quadtree.h"

// Clock
using namespace std::chrono;
using namespace std::this_thread;

// Global constants
const int SCREEN_WIDTH = 640;
const int SCREEN_HEIGHT = 480;
const int desiredFPS = 30;
const int frameTimeMS = int(1000 / (double)desiredFPS);
const int numberOfParticles = 5000;

// Random number generation
std::random_device dev;
std::mt19937 rng(dev());
std::uniform_real_distribution<> dist(0, 1);

Particle particles[numberOfParticles];
Quadtree quadtree = Quadtree(0, 0, SCREEN_WIDTH, SCREEN_HEIGHT);

int main(int argc, char* args[])
{
    for (int i = 0; i < numberOfParticles; i++)
    {
        particles[i] = Particle(dist(rng) * SCREEN_WIDTH, dist(rng) * SCREEN_HEIGHT);
    }

    // Clock for making all frames equally long and achieving the desired framerate when possible
    auto lapStartTime = system_clock::now();

    // Main loop
    for (int i = 0; i < 1; i++)
    {
        // Insert the particles into the quadtree
        quadtree = Quadtree(0, 0, SCREEN_WIDTH, SCREEN_HEIGHT);
        for (int i = 0; i < numberOfParticles; i++)
        {
            quadtree.insert(&particles[i]);
        }

        double neighbourhoodRadius = 40;
        for (int i = 0; i < numberOfParticles; i++)
        {
            // THIS IS THE PART THAT IS SLOW
            std::vector<Particle*> neighbours = quadtree.getCircle(
                particles[i].x,
                particles[i].y,
                neighbourhoodRadius
            );
        }

        // Update clocks
        auto nextFrameTime = lapStartTime + milliseconds(frameTimeMS);
        sleep_until(nextFrameTime);
        lapStartTime = nextFrameTime;
    }
    return 0;
}

Quadtree.h

#pragma once

#include <vector>
#include "Particle.h"
#include "Rect.h"

class Quadtree
{
public:
    const static int capacity = 10; // Capacity of any section
    Quadtree(double px, double py, double width, double height);
    Quadtree(Rect r);
    bool insert(Particle* p); // Add a particle to the tree
    std::vector<Particle*> getCircle(double px, double py, double r);
    int numberOfItems(); // Total amount in the quadtree
private:
    std::vector<Particle*> particles; // Particles stored by this section
    std::vector<Quadtree> sections; // Branches (only if split)
    Rect area; // Region occupied by the quadtree
    bool isSplit() { return sections.size() > 0; }
    void split(); // Split the quadtree into 4 branches
};

Quadtree.cpp

#include <iostream>
#include "Quadtree.h"

Quadtree::Quadtree(double px, double py, double width, double height)
{
    area = Rect(px, py, width, height);
    sections = {};
    particles = {};
}

Quadtree::Quadtree(Rect r)
{
    area = r;
    sections = {};
    particles = {};
}

bool Quadtree::insert(Particle* p)
{
    if (area.intersectPoint(p->x, p->y))
    {
        if (!isSplit() && particles.size() < capacity)
        {
            particles.push_back(p);
        }
        else
        {
            if (!isSplit()) // Capacity is reached and tree is not split yet
            {
                split();
            }

            // That this is a reference is very important!
            // Otherwise a copy of the tree will be modified
            for (Quadtree& s : sections)
            {
                if (s.insert(p))
                {
                    return true;
                }
            }
        }

        return true;
    }
    else
    {
        return false;
    }
}

std::vector<Particle*> Quadtree::getCircle(double px, double py, double r)
{
    std::vector<Particle*> selection = {};
    if (!isSplit())
    {
        // Add all particles from this section that lie within the circle
        for (Particle* p : particles)
        {
            double a = px - p->x;
            double b = py - p->y;
            if (a * a + b * b <= r * r)
            {
                selection.push_back(p);
            }
        }
    }
    else
    {
        // The section is split so add all the particles from the
        // branches together
        for (Quadtree& s : sections)
        {
            // Check if the branch and the circle even have any intersection
            if (s.area.intersectRect(Rect(px - r, py - r, 2 * r, 2 * r)))
            {
                // Get the particles from the branch and add them to selection
                std::vector<Particle*> branchSelection = s.getCircle(px, py, r);
                selection.insert(selection.end(), branchSelection.begin(), branchSelection.end());
            }
        }
    }
    return selection;
}

void Quadtree::split()
{
    sections.push_back(Quadtree(area.getSection(2, 2, 0, 0)));
    sections.push_back(Quadtree(area.getSection(2, 2, 0, 1)));
    sections.push_back(Quadtree(area.getSection(2, 2, 1, 0)));
    sections.push_back(Quadtree(area.getSection(2, 2, 1, 1)));

    std::vector<Particle*> oldParticles{ particles };
    particles.clear();

    for (Particle* p : oldParticles)
    {
        bool success = insert(p);
    }
}

int Quadtree::numberOfItems()
{
    if (!isSplit())
    {
        return particles.size();
    }
    else
    {
        int result = 0;
        for (Quadtree& q : sections)
        {
            result += q.numberOfItems();
        }
        return result;
    }
}

Particle.h

#pragma once

class Particle {
public:
    double x;
    double y;
    Particle(double px, double py) : x(px), y(py) {}
    Particle() = default;
};

Rect.h

#pragma once

class Rect
{
public:
    double x;
    double y;
    double w;
    double h;
    Rect(double px, double py, double width, double height);
    Rect() : x(0), y(0), w(0), h(0) {}
    bool intersectPoint(double px, double py);
    bool intersectRect(Rect r);
    Rect getSection(int rows, int cols, int ix, int iy);
};

Rect.cpp

#include "Rect.h"

Rect::Rect(double px, double py, double width, double height)
{
    x = px;
    y = py;
    w = width;
    h = height;
}

bool Rect::intersectPoint(double px, double py)
{
    return px >= x && px < x + w && py >= y && py < y + h;
}

bool Rect::intersectRect(Rect r)
{
    return x + w >= r.x && y + h >= r.y && x <= r.x + r.w && y <= r.y + r.w;
}

Rect Rect::getSection(int cols, int rows, int ix, int iy)
{
    return Rect(x + ix * w / cols, y + iy * h / rows, w / cols, h / rows);
}
Fractal Salamander
  • 127
  • 1
  • 2
  • 6
  • Don't use vectors for performance critical applications. But for now: pass a reference to the output `vector` as an argument to that function (instead of returning a new vector every time), append results into that, and ditch the `insert`. – beothunder Jul 30 '22 at 09:41
  • 1
    @beothunder Why not return the vector by value? What's wrong with using vectors for performance critical applications in general? – Ted Lyngmo Jul 30 '22 at 09:45
  • I'd like to see a [mre] of this code so I could run tests myself. I'd remove the recursiveness to see where that gets me. – Ted Lyngmo Jul 30 '22 at 09:46
  • 1
    *instead of returning a new vector every time* -- C++ and return value optimization has come a long way since the early 1990's. – PaulMcKenzie Jul 30 '22 at 09:46
  • 1. `insert` has to copy everything, right? 2. Try stepping through any vector function (vectors are designed to be general and/or convinient, not performant). – beothunder Jul 30 '22 at 09:47
  • @PaulMcKenzie the pass by reference, instead of return, is exclusively in order to axe the `insert`. – beothunder Jul 30 '22 at 09:49
  • @beothunder 1. Yes, `std::vector::insert` copies/moves elements. An alternative would be `std::list::splice`. 2. Stepping through a vector is super-fast. It's designed to be so. – Ted Lyngmo Jul 30 '22 at 09:50
  • 2
    *but when I use the quadtree to get the particles from a region it's really slow* -- Are you running an unoptimized "debug" build or an optimized build? What compiler options did you use to build your application? Any question that asks "why is C++ slow?" should be accompanied by the compiler and the compiler options used to build the application. – PaulMcKenzie Jul 30 '22 at 09:52
  • @beothunder The insert() is taking place at the back of the vector, which doesn't move any existing elements. Second, [copy elision](https://stackoverflow.com/questions/12953127/what-are-copy-elision-and-return-value-optimization). – PaulMcKenzie Jul 30 '22 at 10:02
  • 1
    @beothunder -- *vectors are designed to be general and/or convinient, not performant* -- This is not necessarily true. The writers of a compiler's standard library will place performance of vector at top or near top of their list. That's why they craft it using placement-new (in almost all cases) instead of a naive `new[]` and `delete[]` wrapper. – PaulMcKenzie Jul 30 '22 at 10:05
  • @PaulMcKenzie Bruh, step into something silly, like `push_back` with optimized build, look at the disassembly, and explain to me how _that_ is performant. A single compare and move (preferably wide) is performant, not that. – beothunder Jul 30 '22 at 10:08
  • @beothunder -- Until the OP confirms that they are running an optimized build, your suggestions to the OP may not lead to any speed-up whatsoever. – PaulMcKenzie Jul 30 '22 at 10:12
  • @beothunder The `insert` in this case will, if the `vector` already has capacity enough, most probably be one single `memcpy`/`memmove`. If it needs to realloc, it's a different matter. – Ted Lyngmo Jul 30 '22 at 10:12
  • @TedLyngmo exactly, instead of _none_ `memcpy`/`memmove`. Mind you for every (recursive and branching!) call. realloc can be fixed by setting the capacity high enough, not really a concern – beothunder Jul 30 '22 at 10:13
  • @beothunder Well, I've already optimized away the recursiveness in my head as I mentioned at the top :-) – Ted Lyngmo Jul 30 '22 at 10:15
  • @TedLyngmo `for (Quadtree& s : sections)` - optimize away that – beothunder Jul 30 '22 at 10:16
  • @PaulMcKenzie I was not running an optimized build I'm ashamed to say, I didn't know that was a thing to be honest. I have since switched to a performance build and it has gotten much faster but is still only at about 8 fps. – Fractal Salamander Jul 30 '22 at 10:20
  • `std::vector branchSelection = s.getCircle(px, py, r);` -- Please post `getCircle()`. If it supposed to return an existing circle, then returning a copy each time should be replaced by return a reference to the existing circle. Or if it does return a reference, then `branchSelection` should be a reference. – PaulMcKenzie Jul 30 '22 at 10:23
  • @beothunder Optimize away the recursive calls to `getCircle`. You can usually do everything in one function without too much of a hassle. – Ted Lyngmo Jul 30 '22 at 10:23
  • 2
    @TedLyngmo -- ok, I see it now, my mistake. – PaulMcKenzie Jul 30 '22 at 10:25
  • 1
    @TedLyngmo not here, at least not without additional storage (here it is the stack, and there is no reason why it should be something different). The quad tree branches (thats how they be), and multiple children can collide with the target primitive, so there isn't one call to `getCircle` at the end, there are multiple, there have to be, otherwise you couldn't walk the tree. – beothunder Jul 30 '22 at 10:31
  • @beothunder Yes, one usually would have to use a local stack (or similar) to avoid the recursive calls and it often helps. I can't say that it will definitely will do in this case without actually being able to test it though, which is why I've asked for a [mre] multiple times. – Ted Lyngmo Jul 30 '22 at 10:37
  • @FractalSalamander -- Just for laughs, try: `std::vector branchSelection(std::move(s.getCircle(px, py, r)));` to see if that makes any improvement. – PaulMcKenzie Jul 30 '22 at 10:38
  • @TedLyngmo I've added a minimal reproducible example. Sorry that it is so much but it really is the bare minimum. – Fractal Salamander Jul 30 '22 at 10:44
  • @FractalSalamander `s.getCircle(px, py, r);` -- The other thing is that if `px, py, r` do not change often for a given `Quadtree`, but is still being called many times, then possibly memoizing `px, py, r;` to compute the value once, and simply return the same value if `px, py, r` is detected again may be something you could try. Of course you would probably need to store a `map` or similar structure for the memoization. It all depends on how often the `px, py, r` triplet gets used, and if there aren't too many different combinations of `px, py, r`. – PaulMcKenzie Jul 30 '22 at 10:47
  • @PaulMcKenzie did you miss the "Particle simulation and flocking using a quadtree" part? Those values change every time. – beothunder Jul 30 '22 at 10:52
  • @PaulMcKenzie `r` is practically never going to change but `px, py` will vary a lot since these will be the position of a particle which is finding every other particle in it's neighbourhood. Since `getCircle()` will be called for every particle it changes constantly. I have added passing the vector as a reference but it hasn't sped it up very noticably. – Fractal Salamander Jul 30 '22 at 10:52
  • 1
    @FractalSalamander `Quadtree::Quadtree(double px, double py, double width, double height) : area(px, py, width, height) {}` -- Use the member-initialization list, and there is no need to set the vector's to empty, since the default for vector is for it to be empty. You should also use the member-initialization list for your other class constructors. – PaulMcKenzie Jul 30 '22 at 10:55
  • @FractalSalamander *I have added passing the vector as a reference but it hasn't sped it up very noticably.* -- Well, I kind of felt that would be the case. Did you try the `std::move` I mentioned earlier? – PaulMcKenzie Jul 30 '22 at 10:58
  • 4
    Perhaps don't create a new quadtree every frame. In any case you should **profile** your application in order to find out where the time is spend. – n. m. could be an AI Jul 30 '22 at 10:59
  • 1
    It’s probably malloc/free. Reuse your vectors: don’t recreate the tree instead add the `clear()` method, don’t return new vectors from `getCircle` instead pass a reference.. – Soonts Jul 30 '22 at 11:14

1 Answers1

7

So... In the original code creating the quadtree takes about 0.001s (relatively insignificant), and the neighbor search takes about 0.06s - here is our culprit (as mentioned by the OP).

Passing the std::vector<Particle*> neighbours as a reference to the getCircle function, gets rid of the insert call at the end of the function as well as new vector allocations (hi to everyone saying "oh, it will be optimized away automatically"). The time is reduced to 0.011s.

The nieghbours vector can be taken out of the main loop, and cleared after use, so that it only resizes on the first frame.

I do not see any more immediately obvious targets (without doing a complete rewrite). Maybe I will add something later.


I decided to approach this more systematically: I added an #if switch for every change I made and actually recorded some statistics, instead of eyeballing it. (Evey change is added incrementally, times include tree construction).

original by reference out of loop
min time: 0.0638s 0.0127s 0.0094s
avg time: 0.0664s 0.0136s 0.0104s
max time: 0.0713s 0.0157s 0.0137s

All measurements were done on my machine, with optimized build, using QueryPerfoemanceCounter.


I did end up rewriting the whole thing...

Got rid of vectors.

  • The Quadtree::particles is now Particle* particles[capacity] with a count.
  • sections is a pointer; isSplit just checks if sections is 0.
  • Since the total (or maximum) number of particles is known, the number of particles that can be returned by getCircle can't be more than that. So I allocate that much outside of the main loop to store neighbours. Adding another result involves just bumping a pointer (without even a check in release). And resetting it after use is done by setting the count to 0 (see arena or bump allocator).
  • The maximum number of quadtree nodes can be inferred from the number of particles. So, similarly, splitting just bumps the pointer by 4.

Trying to precompute the Rect in getCircle, or put px, py, r (and/or that rect as well) in a struct (passed as value or reference) does not yield any improvement (or is detremental). (was suggested by Goswin von Brederlow).

Then I flipped the recursion (was suggested by Ted Lyngmo). The temporary stack is, again, preallocated. And then I did the same thing for insert.

rewrite non-recursive insert as well
min_time: 0.0077 0.0069 0.0068
avg_time: 0.0089 0.0073 0.0070
max_time: 0.0084 0.0078 0.0074

So in the end the most impactful thing was the very first - not inserting and not creating unnecessary vectors every call, but instead passing the same one by reference.

One last thing - might want to store the quadtree particles separately, since most of the time getCircle is traversing nodes, where particles are not stored.

Otherwise, I do not see how to improve this any more. At this point it would require someone actually smart or crazy...

beothunder
  • 551
  • 2
  • 14
  • 1
    Instead of adding `QueryPerformanceCounter` I would recommand using a profiler. For code like that, it should really works well. Many profiler have trials and if OP is serious about performance because he want highest possible FPS, such tool should really be useful and worth every pennies. – Phil1970 Jul 30 '22 at 12:37
  • Yup. But I do not have access to a good profiler (RAD's Telemetry would be nice). And you can get actual results with QPC. Profilers have to use it behind the scenes anyway. – beothunder Jul 30 '22 at 12:41
  • Using -fLTO or adding the functions for `Rect` to the header should give a good boost to the speed. Also if the area of a node is completely in the circle all points can be added recursively without any further "inside" checks. – Goswin von Brederlow Jul 30 '22 at 14:49
  • Does moving `Rect(px - r, py - r, 2 * r, 2 * r)` out of the `getCircle` improve things? Maybe make a `struct Circle` that has a bounding rectangle and pass a `const Circle &`. – Goswin von Brederlow Jul 30 '22 at 14:52
  • @GoswinvonBrederlow What do you mean by adding the `Rect` functions to the header? Which functions and which header? – Fractal Salamander Jul 30 '22 at 15:19
  • 1
    @FractalSalamander Instead of declaring functions in `Rect.h` and then definig them in `Rect.cpp`, define them in `Rect.h` (not just declare them). – beothunder Jul 30 '22 at 15:28
  • @FractalSalamander If you define the methods of `Rect` in the header instead of a separate compilation unit then the compiler can inline them. For example in `s.area.intersectRect(Rect(px - r, py - r, 2 * r, 2 * r))` this would avoid having to actually create a Rect on the stack at all and just do the comparisons directly on the values. – Goswin von Brederlow Jul 31 '22 at 20:57
  • @beothunder Great "going all-in" approach. +1 – Ted Lyngmo Aug 01 '22 at 01:29