How do you measure peak memory bandwidth in OpenGL?

Question

Just to get an idea of what kind of speeds I should be expecting I have been trying to benchmark transfer between global memory and shaders, rather than relying on GPU spec sheets. However I can't get close to the theoretical maximum. In fact I'm out by a factor of 50!.

I'm using a GTX Titan X, which is said to have 336.5GB/s. Linux x64 driver 352.21.

I found a CUDA benchmark here which gives me ~240–250GB/s (this is more what I expect).

I'm trying to match exactly what they do with shaders. I've tried vertex shaders, compute shaders, accessing buffer objects via image_load_store and NV_shader_buffer_store, with floats, vec4s, loops inside the shader (with coalesced addressing within the work group) and various methods of timing. I'm stuck at ~7GB/s (see the update below).

Why is GL so much slower? Am I doing something wrong and if so, how should it be done?

Here's my MWE with three methods (1. vertex shader with image_load_store, 2. vertex shader with bindless graphics, 3. compute shader with bindless graphics):

//#include <windows.h>
#include <assert.h>
#include <stdio.h>
#include <memory.h>
#include <GL/glew.h>
#include <GL/glut.h>

// Vertex shader, image_load_store variant: each vertex stores `val` into
// element gl_VertexID of the r32f image buffer `data`. gl_Position is set to
// the zero vector; the position itself is never used for anything else here.
const char* imageSource =
    "#version 440\n"
    "uniform layout(r32f) imageBuffer data;\n"
    "uniform float val;\n"
    "void main() {\n"
    "   imageStore(data, gl_VertexID, vec4(val, 0.0, 0.0, 0.0));\n"
    "   gl_Position = vec4(0.0);\n"
    "}\n";

// Vertex shader, bindless variant: `data` is a raw float pointer uniform
// (NV_gpu_shader5 / NV_shader_buffer_load are enabled for this) and each
// vertex writes `val` to data[gl_VertexID].
const char* bindlessSource =
    "#version 440\n"
    "#extension GL_NV_gpu_shader5 : enable\n"
    "#extension GL_NV_shader_buffer_load : enable\n"
    "uniform float* data;\n"
    "uniform float val;\n"
    "void main() {\n"
    "   data[gl_VertexID] = val;\n"
    "   gl_Position = vec4(0.0);\n"
    "}\n";

// Compute shader, bindless variant: work groups of 256 invocations, each
// invocation writes `val` to data[gl_GlobalInvocationID.x].
const char* bindlessComputeSource =
    "#version 440\n"
    "#extension GL_NV_gpu_shader5 : enable\n"
    "#extension GL_NV_shader_buffer_load : enable\n"
    "layout(local_size_x = 256) in;\n"
    "uniform float* data;\n"
    "uniform float val;\n"
    "void main() {\n"
    "   data[gl_GlobalInvocationID.x] = val;\n"
    "}\n";

// Compiles `shaderSrc` as a shader of the given `type`, links it into a new
// program object and returns that program.
//
// On a compile or link failure the driver's info log (if any) is printed to
// stdout and the process exits with EXIT_FAILURE; the function does not return
// in that case.
GLuint compile(GLenum type, const char* shaderSrc)
{
    GLuint shader = glCreateShader(type);
    glShaderSource(shader, 1, (const GLchar**)&shaderSrc, NULL);
    glCompileShader(shader);

    GLint success = GL_FALSE;
    glGetShaderiv(shader, GL_COMPILE_STATUS, &success);
    if (!success)
    {
        GLint loglen = 0;
        glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &loglen);
        //only print when there is a log: a zero-length buffer holds no terminator
        if (loglen > 0)
        {
            GLchar* log = new GLchar[loglen];
            glGetShaderInfoLog(shader, loglen, NULL, log);
            printf("%s\n", log);
            delete[] log;
        }
        glDeleteShader(shader);
        exit(EXIT_FAILURE);
    }

    GLuint program = glCreateProgram();
    glAttachShader(program, shader);
    glLinkProgram(program);
    //the shader stays attached; this only flags it for deletion with the program
    glDeleteShader(shader);

    GLint linked = GL_FALSE;
    glGetProgramiv(program, GL_LINK_STATUS, &linked);
    if (!linked)
    {
        GLint loglen = 0;
        glGetProgramiv(program, GL_INFO_LOG_LENGTH, &loglen);
        if (loglen > 0)
        {
            GLchar* log = new GLchar[loglen];
            glGetProgramInfoLog(program, loglen, NULL, log);
            printf("%s\n", log);
            delete[] log;
        }
        glDeleteProgram(program);
        exit(EXIT_FAILURE);
    }
    return program;
}

GLuint timerQueries[2];
void start()
{
    glGenQueries(2, timerQueries);
    glQueryCounter(timerQueries[0], GL_TIMESTAMP);
}

float stop()
{
    glMemoryBarrier(GL_ALL_BARRIER_BITS);
    GLsync sync = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
    glWaitSync(sync, 0, GL_TIMEOUT_IGNORED);
    glQueryCounter(timerQueries[1], GL_TIMESTAMP);
    GLint available = 0;
    while (!available) //sometimes gets stuck here for whatever reason
        glGetQueryObjectiv(timerQueries[1], GL_QUERY_RESULT_AVAILABLE, &available);
    GLuint64 a, b;
    glGetQueryObjectui64v(timerQueries[0], GL_QUERY_RESULT, &a);
    glGetQueryObjectui64v(timerQueries[1], GL_QUERY_RESULT, &b);
    glDeleteQueries(2, timerQueries);
    return b - a;
}

int main(int argc, char** argv)
{
    float* check;
    glutInit(&argc, argv);
    glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB | GLUT_DEPTH);
    glutCreateWindow("test");
    glewInit();

    int bufferSize = 64 * 1024 * 1024; //64MB
    int loops = 500;

    glEnable(GL_RASTERIZER_DISCARD);

    float* dat = new float[bufferSize/sizeof(float)];
    memset(dat, 0, bufferSize);

    //create a buffer with data
    GLuint buffer;
    glGenBuffers(1, &buffer);
    glBindBuffer(GL_TEXTURE_BUFFER, buffer);
    glBufferData(GL_TEXTURE_BUFFER, bufferSize, NULL, GL_STATIC_DRAW);

    //get a bindless address
    GLuint64 address;
    glMakeBufferResidentNV(GL_TEXTURE_BUFFER, GL_READ_WRITE);
    glGetBufferParameterui64vNV(GL_TEXTURE_BUFFER, GL_BUFFER_GPU_ADDRESS_NV, &address);

    //make a texture alias for it
    GLuint bufferTexture;
    glGenTextures(1, &bufferTexture);
    glBindTexture(GL_TEXTURE_BUFFER, bufferTexture);
    glTexBuffer(GL_TEXTURE_BUFFER, GL_R32F, buffer);
    glBindImageTextureEXT(0, bufferTexture, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R32F);

    //compile the shaders
    GLuint imageShader = compile(GL_VERTEX_SHADER, imageSource);
    GLuint bindlessShader = compile(GL_VERTEX_SHADER, bindlessSource);
    GLuint bindlessComputeShader = compile(GL_COMPUTE_SHADER, bindlessComputeSource);

    //warm-up and check values
    glBufferData(GL_TEXTURE_BUFFER, bufferSize, dat, GL_STATIC_DRAW);
    glUseProgram(imageShader);
    glUniform1i(glGetUniformLocation(imageShader, "data"), 0);
    glUniform1f(glGetUniformLocation(imageShader, "val"), 1.0f);
    glDrawArrays(GL_POINTS, 0, bufferSize/sizeof(float));
    glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
    //check = (float*)glMapBuffer(GL_TEXTURE_BUFFER, GL_READ_ONLY);
    //for (int i = 0; i < bufferSize/sizeof(float); ++i)
    //  assert(check[i] == 1.0f);
    //glUnmapBuffer(GL_TEXTURE_BUFFER);

    glBufferData(GL_TEXTURE_BUFFER, bufferSize, dat, GL_STATIC_DRAW);
    glUseProgram(bindlessShader);
    glProgramUniformui64NV(bindlessShader, glGetUniformLocation(bindlessShader, "data"), address);
    glUniform1f(glGetUniformLocation(bindlessShader, "val"), 1.0f);
    glDrawArrays(GL_POINTS, 0, bufferSize/sizeof(float));
    //glMemoryBarrier(GL_ALL_BARRIER_BITS); //this causes glDispatchCompute to segfault later, so don't uncomment
    //check = (float*)glMapBuffer(GL_TEXTURE_BUFFER, GL_READ_ONLY);
    //for (int i = 0; i < bufferSize/sizeof(float); ++i)
    //  assert(check[i] == 1.0f);
    //glUnmapBuffer(GL_TEXTURE_BUFFER);

    glBufferData(GL_TEXTURE_BUFFER, bufferSize, dat, GL_STATIC_DRAW);
    glUseProgram(bindlessComputeShader);
    glProgramUniformui64NV(bindlessComputeShader, glGetUniformLocation(bindlessComputeShader, "data"), address);
    glUniform1f(glGetUniformLocation(bindlessComputeShader, "val"), 1.0f);
    glDispatchCompute(bufferSize/(sizeof(float) * 256), 1, 1);
    glMemoryBarrier(GL_ALL_BARRIER_BITS);
    //check = (float*)glMapBuffer(GL_TEXTURE_BUFFER, GL_READ_ONLY);
    //for (int i = 0; i < bufferSize/sizeof(float); ++i)
    //  assert(check[i] == 1.0f); //glDispatchCompute doesn't actually write anything with bindless graphics
    //glUnmapBuffer(GL_TEXTURE_BUFFER);
    glFinish();

    //time image_load_store
    glUseProgram(imageShader);
    glUniform1i(glGetUniformLocation(imageShader, "data"), 0);
    glUniform1f(glGetUniformLocation(imageShader, "val"), 1.0f);
    start();
    for (int i = 0; i < loops; ++i)
        glDrawArrays(GL_POINTS, 0, bufferSize/sizeof(float));
    GLuint64 imageTime = stop();
    printf("image_load_store: %.2fGB/s\n", (float)((bufferSize * (double)loops) / imageTime));

    //time bindless
    glUseProgram(bindlessShader);
    glProgramUniformui64NV(bindlessShader, glGetUniformLocation(bindlessShader, "data"), address);
    glUniform1f(glGetUniformLocation(bindlessShader, "val"), 1.0f);
    start();
    for (int i = 0; i < loops; ++i)
        glDrawArrays(GL_POINTS, 0, bufferSize/sizeof(float));
    GLuint64 bindlessTime = stop();
    printf("bindless: %.2fGB/s\n", (float)((bufferSize * (double)loops) / bindlessTime));

    //time bindless in a compute shader
    glUseProgram(bindlessComputeShader);
    glProgramUniformui64NV(bindlessComputeShader, glGetUniformLocation(bindlessComputeShader, "data"), address);
    glUniform1f(glGetUniformLocation(bindlessComputeShader, "val"), 1.0f);
    start();
    for (int i = 0; i < loops; ++i)
        glDispatchCompute(bufferSize/(sizeof(float) * 256), 1, 1);
    GLuint64 bindlessComputeTime = stop();
    printf("bindless compute: %.2fGB/s\n", (float)((bufferSize * (double)loops) / bindlessComputeTime));
    assert(glGetError() == GL_NO_ERROR);
    return 0;
}

My output:

image_load_store: 6.66GB/s
bindless: 6.68GB/s
bindless compute: 6.65GB/s

Some notes:

Compute shaders with bindless graphics don't appear to write anything (the commented out assert fails), or at least the data isn't retrieved with glMapBuffer even though the speed matches the other methods. Using image_load_store in the compute shader works and gives the same speed the vertex shaders (though I thought that'd be one too many permutations to post).
Calling glMemoryBarrier(GL_ALL_BARRIER_BITS) before glDispatchCompute causes a crash in the driver.
Commenting out the three glBufferData(GL_TEXTURE_BUFFER, bufferSize, dat, GL_STATIC_DRAW);, which are used to check the output, raises the speed of the first two tests to 17GB/s and the compute shader skyrockets to 292GB/s which is much closer to what I'd like but this can't be trusted because of point 1.
Sometimes while (!available) hangs for ages (ctrl-c when I get tired of waiting shows its still in the loop).

For reference, here's the CUDA code:

//http://www.ks.uiuc.edu/Research/vmd/doxygen/CUDABench_8cu-source.html

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>

// Checks cudaGetLastError(); if an error is pending, prints its description
// with the current file and line and executes `return -1` in the enclosing
// function. Expands to a bare brace block (no trailing semicolon needed), so
// it is only usable inside functions that return an int-compatible type.
#define CUERR { cudaError_t err; \
    if ((err = cudaGetLastError()) != cudaSuccess) { \
    printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
    return -1; }}

//
// GPU device global memory bandwidth benchmark
//
// Copy kernel: each thread copies exactly one element of type T from `src` to
// `dest`, at the thread's global 1-D index. There is no bounds check, so the
// launch must cover exactly the number of elements in both arrays.
template <class T>
__global__ void gpuglobmemcpybw(T *dest, const T *src) {
    const unsigned int blockStart = blockIdx.x * blockDim.x;
    const unsigned int element = threadIdx.x + blockStart;
    dest[element] = src[element];
}

// Fill kernel: each thread stores `val` into one element of `dest`, at the
// thread's global 1-D index. There is no bounds check, so the launch must
// cover exactly the number of elements in `dest`.
template <class T>
__global__ void gpuglobmemsetbw(T *dest, const T val) {
    // unsigned, matching gpuglobmemcpybw: the index is computed from unsigned
    // built-ins and must not be narrowed to a signed int
    const unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
    dest[idx] = val;
}

// Element type the benchmark kernels are instantiated with.
typedef float4 datatype;

// Reports the pending CUDA runtime error, if any, in the same format as the
// CUERR macro. Returns true when no error is pending. Unlike CUERR it does not
// return from the caller, so the caller can release its resources first.
static bool cudaglobmembw_ok(int line) {
    const cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        return true;
    printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, line);
    return false;
}

// Measures device memory bandwidth on CUDA device `cudadev` with a fill kernel
// and a copy kernel, each launched `loops` times over `len` float4 elements.
//
// Outputs (both set to 0.0 first and left at 0.0 if their measurement is not reached):
//   *gpumemsetgbsec - fill rate:  bytes written per launch / time per launch
//   *gpumemcpygbsec - copy rate:  bytes read + written per launch / time per launch
// Both are expressed in units of 2^20 bytes per millisecond
// (cudaEventElapsedTime reports milliseconds), i.e. roughly, but not exactly, GB/s.
//
// Returns 0 on success and -1 on any CUDA error. Device buffers and events are
// released on every path past device selection.
static int cudaglobmembw(int cudadev, double *gpumemsetgbsec, double *gpumemcpygbsec) {
    // All locals are declared before the first goto: C++ does not allow
    // jumping over an initialization.
    const int len = 1 << 22; // one thread per data element
    const int loops = 500;
    const datatype val = make_float4(1.0f, 1.0f, 1.0f, 1.0f);
    const dim3 BSz(256, 1, 1);
    const dim3 GSz(len / (BSz.x * BSz.y * BSz.z), 1, 1);
    datatype *src = NULL;
    datatype *dest = NULL;
    cudaEvent_t start = NULL;
    cudaEvent_t end = NULL;
    float memsettime = 0.0f;
    float memcpytime = 0.0f;
    int status = -1;
    int i;
    cudaError_t rc;

    // initialize to zero for starters
    *gpumemsetgbsec = 0.0;
    *gpumemcpygbsec = 0.0;

    // attach to the selected device (nothing has been allocated yet, so a
    // plain return is safe here)
    rc = cudaSetDevice(cudadev);
    if (rc != cudaSuccess) {
        #if CUDART_VERSION >= 2010
        rc = cudaGetLastError(); // query last error and reset error state
        if (rc != cudaErrorSetOnActiveProcess)
            return -1; // abort and return an error
        #else
        cudaGetLastError(); // just ignore and reset error state, since older CUDA
        // revs don't have a cudaErrorSetOnActiveProcess enum
        #endif
    }

    cudaMalloc((void **) &src, sizeof(datatype)*len);
    if (!cudaglobmembw_ok(__LINE__)) goto cleanup;
    cudaMalloc((void **) &dest, sizeof(datatype)*len);
    if (!cudaglobmembw_ok(__LINE__)) goto cleanup;

    // do a warm-up pass
    gpuglobmemsetbw<datatype><<< GSz, BSz >>>(src, val);
    if (!cudaglobmembw_ok(__LINE__)) goto cleanup;
    gpuglobmemsetbw<datatype><<< GSz, BSz >>>(dest, val);
    if (!cudaglobmembw_ok(__LINE__)) goto cleanup;
    gpuglobmemcpybw<datatype><<< GSz, BSz >>>(dest, src);
    if (!cudaglobmembw_ok(__LINE__)) goto cleanup;

    cudaEventCreate(&start);
    if (!cudaglobmembw_ok(__LINE__)) goto cleanup;
    cudaEventCreate(&end);
    if (!cudaglobmembw_ok(__LINE__)) goto cleanup;

    // execute the memset kernel
    cudaEventRecord(start, 0);
    for (i = 0; i < loops; i++) {
        gpuglobmemsetbw<datatype><<< GSz, BSz >>>(dest, val);
    }
    if (!cudaglobmembw_ok(__LINE__)) goto cleanup;
    cudaEventRecord(end, 0);
    if (!cudaglobmembw_ok(__LINE__)) goto cleanup;
    cudaEventSynchronize(start);
    if (!cudaglobmembw_ok(__LINE__)) goto cleanup;
    cudaEventSynchronize(end);
    if (!cudaglobmembw_ok(__LINE__)) goto cleanup;
    cudaEventElapsedTime(&memsettime, start, end);
    if (!cudaglobmembw_ok(__LINE__)) goto cleanup;

    // execute the memcpy kernel
    cudaEventRecord(start, 0);
    for (i = 0; i < loops; i++) {
        gpuglobmemcpybw<datatype><<< GSz, BSz >>>(dest, src);
    }
    if (!cudaglobmembw_ok(__LINE__)) goto cleanup;
    cudaEventRecord(end, 0);
    if (!cudaglobmembw_ok(__LINE__)) goto cleanup;
    cudaEventSynchronize(start);
    if (!cudaglobmembw_ok(__LINE__)) goto cleanup;
    cudaEventSynchronize(end);
    if (!cudaglobmembw_ok(__LINE__)) goto cleanup;
    cudaEventElapsedTime(&memcpytime, start, end);
    if (!cudaglobmembw_ok(__LINE__)) goto cleanup;

    // per-launch size (in 2^20 bytes) divided by per-launch time (in ms);
    // the copy moves every element twice (one read, one write)
    *gpumemsetgbsec = (len * sizeof(datatype) / (1024.0 * 1024.0)) / (memsettime / loops);
    *gpumemcpygbsec = (2 * len * sizeof(datatype) / (1024.0 * 1024.0)) / (memcpytime / loops);
    status = 0;

cleanup:
    // events are only destroyed if they were created; cudaFree(NULL) is a no-op
    if (end != NULL)
        cudaEventDestroy(end);
    if (start != NULL)
        cudaEventDestroy(start);
    cudaFree(dest);
    cudaFree(src);
    if (!cudaglobmembw_ok(__LINE__))
        status = -1;

    return status;
}

// Runs the bandwidth benchmark on device 0 and prints the fill and copy rates
// on one line. Returns 1, after printing a message to stderr, if the benchmark
// reports failure.
int main()
{
    double a = 0.0, b = 0.0;
    if (cudaglobmembw(0, &a, &b) != 0) {
        // do not print figures from a run that did not complete
        fprintf(stderr, "CUDA bandwidth benchmark failed\n");
        return 1;
    }
    printf("%f %f\n", (float)a, (float)b);
    return 0;
}

Update:

It seems that the buffer gets made non-resident on my glBufferData calls which were there to check output was being written. As per the extension:

A buffer is also made non-resident implicitly as a result of being respecified via BufferData or being deleted.
...
BufferData is specified to "delete the existing data store", so the GPU address of that data should become invalid. The buffer is therefore made non-resident in the current context.

At a guess, OpenGL then streams in the buffer object data each frame and doesn't cache it in video memory. This explains why the compute shader failed the assert, however there's a slight anomaly that bindless graphics in the vertex shader still worked when not resident, but I'll ignore that for now. I have no idea why a 64MB buffer object wouldn't default to being resident (though perhaps after first use) when there's 12GB available.

So after each call to glBufferData I make it resident again and get the address in case its changed:

glBufferData(GL_TEXTURE_BUFFER, bufferSize, dat, GL_STATIC_DRAW);
glMakeBufferResidentNV(GL_TEXTURE_BUFFER, GL_READ_WRITE);
glGetBufferParameterui64vNV(GL_TEXTURE_BUFFER, GL_BUFFER_GPU_ADDRESS_NV, &address);
assert(glIsBufferResidentNV(GL_TEXTURE_BUFFER)); //sanity check

I'm now getting 270–290GB/s with the compute shader using either image_load_store or bindless graphics. Now my question includes:

Given the buffer seems to be resident for each test and the compute shader is nice and fast, why are the vertex shader versions still so slow?
Without the bindless graphics extension, how should regular OpenGL users put data into video memory (actually put and not idly suggest that the driver might just like to)?

I'm pretty sure I would have noticed this problem in real world situations, and it's this contrived benchmark that hits a slow path, so how could I trick the driver into making a buffer object resident? Running a compute shader first doesn't change anything.

Seems like the CUDA benchmark is measuring GPU local memory bandwidth, while your OpenGL benchmark does measure PCI-E link bandwidth, as if the OpenGL driver would make a shadow copy of the results of the computation. I'd try again with a compute shader. — datenwolf, Jun 30 '15 at 17:00
@datenwolf Thanks for the interest. I assume you mean CUDA is measuring GPU global memory (no [local array](http://stackoverflow.com/questions/10297067/local-arrays-in-cuda) is declared in the kernel). Unless the GPU is virtualizing the buffer objects in system memory, the only PCI-E data transfers are from the initial `glBufferData`s (and `glMapBuffer` is uncommented) which is before the actual benchmark. Is there something different I should be doing with regard to the compute shader's assert failing? — jozxyqk, Jun 30 '15 at 17:39
With `GPU local` I meant "RAM installed locally on the GPU board" instead of the memory of the system the GPU is connected to (which is nonlocal memory as far as the GPU is concerned, bcause it can't access it without going through the peripheral bus). I didn't mean CUDA semantics there. — datenwolf, Jun 30 '15 at 19:16
The numbers you quoted are so eeriely close to GPU local memory access vs. PCI-E link bandwidth, that this would be the first thing I'd investigate. Also OpenGL implementations often have to make shadow copies from GPU to system memory for certain things the specification requires to be actually implemented. — datenwolf, Jun 30 '15 at 19:18
@datenwolf indeed! `assert(glIsBufferResidentNV)` before the tests failed. see the update. Vertex shaders are still dead slow. Perhaps `gl_VertexID` is not sequential across shaders in the work group/"warp" and I'm hitting worst case cache performance. — jozxyqk, Jul 01 '15 at 08:19

score 1 · Answer 1 · answered Nov 07 '15 at 15:04

1

You are asking the driver to read from your process memory, dat. This causes extensive cache coherency traffic. When the GPU reads that memory, it can't be sure that it is up to date, it might be in the CPU cache, modified, and not written back to RAM yet. This causes the GPU to actually have to read from the CPU cache, which is far more expensive than bypassing the CPU and reading the RAM. The RAM is often idle during normal operation, because a modern CPU's hit rate is typically 95% to 99%. The cache is used continuously.

To achieve maximum performance, you need to let the driver allocate the memory. Normal memory your program uses, like global variables and the heap are allocated in writeback memory. Driver allocated memory will usually be allocated as write combining or uncacheable, which eliminates the coherency traffic.

Peak advertised bandwidth numbers will be achieved only without cache coherency overhead.

To let the driver allocate it, use glBufferData with a nullptr for the data.

It isn't all rosy though, if you manage to coerce the driver into using a system memory write combining buffer. CPU reads to such addresses will be very slow. Sequential writes are optimized by the CPU, but random writes will cause the write combining buffer to flush frequently, hurting performance.

answered Nov 07 '15 at 15:04

doug65536

6,562
3
43
53

Thanks for the interest! I don't believe `glBufferData` will maintain coherency with the array you pass it. In fact I think you can free the memory after calling `glBufferData`. Yes, I would very much like to know how to force the GPU to allocate memory for the buffer and have it stay resident (without using the vendor specific `glMakeBufferResidentNV`) but I believe `glBufferData` with a null pointer (as in the first call in my code) is the same as supplying data, just without initializing the buffer. It seems like the `GL_STATIC_DRAW` hint is not being acted upon. – jozxyqk Nov 10 '15 at 02:19
@jozxyqk I didn't say anything about "maintaining" coherency, I was referring to the initial read of memory by the GPU - it is writeback memory which might be in the cache, when the GPU reads it, so the CPU cache is involved, bottlenecking it on CPU L1 bandwidth, causing that data transfer to be slower than a memory region that is not eligible to be in the cache. – doug65536 Nov 10 '15 at 12:22
I'm more interested in shader--gpu memory bandwidth. The buffer calls are just there to initialize/zero the data and prove my tests work. I appear to be running into the issue that OpenGL is internally keeping a copy of the data in system memory and streaming it in over PCIE rather than store it in gpu memory. – jozxyqk Nov 10 '15 at 12:30
@jozxyqk http://www.nvidia.com/content/pci_express/PCI_Express_files/v3_document.html – doug65536 Nov 10 '15 at 12:32
@jozxyqk Your question is "why are they getting X GB/sec and I am getting Y GB/sec", isn't it? – doug65536 Nov 10 '15 at 12:37
1

Yes, and as per the comments, it seems like the reason it's sometimes a lot slower is the buffer is not resident in GPU memory and instead has to be passed through the PCIE bus every time I access it. I'm looking for the right way to force it to become resident so I get consistent benchmarks. – jozxyqk Nov 10 '15 at 12:46

How do you measure peak memory bandwidth in OpenGL?

1 Answers1