4

I was recently tasked with improving some embedded code where a disproportionate amount of time was being spent in malloc.

The client wanted an easy fix that could be effected with some very simple search-and-replace commands (i.e., no massive, complex changes to the source code) while still delivering a substantial benefit.

Upon analysis, it turned out that the vast majority (~80%) of allocations were for 128 bytes or less.

To that end, I put together some proof-of-concept code using a quickpool. The idea is that one allocation from malloc is done at start-up (enough to hold, for example, 1024 blocks of 128 bytes each), and any allocation which could be satisfied from this quickpool would be. I was hoping that removing the more complex calculations in malloc (more complex since it has to handle variable-sized blocks) would result in a speed increase.

Then, all source code is changed to use the quickpool instead of regular malloc. In the event that the quickpool was exhausted or the allocation required more than the block size, the request would be passed through to malloc.
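
In practice, the per-call-site change is mechanical. A minimal sketch of what a converted routine might look like is below (the routines and the global g_quickpool are illustrative names of my own, not from the actual code base):

#include <string.h>
#include "quickpool.h"

// Illustrative global pool, created once at start-up.
extern qp_pool *g_quickpool;

// A hypothetical routine after the search and replace: only the two
// call sites change, everything around them stays as it was.
char *duplicate_buffer (const char *src, size_t len) {
    char *copy = qp_malloc (g_quickpool, len);   // was: malloc (len)
    if (copy != NULL)
        memcpy (copy, src, len);
    return copy;
}

void release_buffer (char *buf) {
    qp_free (g_quickpool, buf);                  // was: free (buf)
}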

The results weren't bad, giving me a (roughly) 50% reduction in run time, but I'm wondering if I can do better.

The quickpool functions are shown below. First the header file:

#ifndef QUICKPOOL_H_INCLUDED
    #define QUICKPOOL_H_INCLUDED

    #include <stdlib.h>

    typedef struct qp_pool_s qp_pool;

    qp_pool *qp_create (size_t, size_t);
    qp_pool *qp_destroy (qp_pool *);
    void *qp_malloc (qp_pool *, size_t);
    void *qp_free (qp_pool *, void *);
#endif

The create and destroy functions allow for different pools to be created with different block sizes and block counts:

#include <string.h>
#include "quickpool.h"

// Various global values.

#define DEFLT_BLKS 1024
#define DEFLT_SZ    128

struct qp_pool_s {
    char *data;         // Actual blocks.
    char *enddata;      // First byte beyond data.
    char *isused;       // Used map for blocks.
    size_t total_blks;  // Number of blocks in pool.
    size_t free_blks;   // Free blocks in pool.
    size_t blk_sz;      // Size of each block.
    size_t low_free;    // Lowest free block.
};

 

qp_pool *qp_create (size_t quant, size_t sz) {
    // Zero means a default size.

    if (quant == 0) quant = DEFLT_BLKS;
    if (sz == 0)    sz = DEFLT_SZ;

    // Allocate memory blocks for pool.

    qp_pool *pool = malloc (sizeof (*pool));
    if (pool == NULL) return NULL;

    if ((pool->data = malloc (quant * sz)) == NULL) {
            free (pool);
            return NULL;
    }

    if ((pool->isused = malloc (quant)) == NULL) {
            free (pool->data);
            free (pool);
            return NULL;
    }

    // Set information on pool and return it.

    pool->enddata = &(pool->data[quant * sz]);
    memset (pool->isused, 0, quant);
    pool->total_blks = quant;
    pool->free_blks = quant;
    pool->blk_sz = sz;
    pool->low_free = 0;

    return pool;
}

 

qp_pool *qp_destroy (qp_pool *pool) {
    // Just free all the memory for pool.

    if (pool != NULL) {
            free (pool->data);
            free (pool->isused);
            free (pool);
    }

    return NULL;
}

And then there are the malloc and free counterparts:

void *qp_malloc (qp_pool *pool, size_t sz) {
    size_t index;

    // If there's no pool, the request exceeds the block size, or the pool is empty, fall back to the standard malloc.

    if (pool == NULL) return malloc (sz);

    if ((sz > pool->blk_sz) || (pool->free_blks == 0))
            return malloc (sz);

    // Otherwise, get from quickpool. First we find a free block.

    for (index = pool->low_free; pool->isused[index]; index++)
            ;

    // Then we mark it used.

    pool->isused[index] = 1;
    pool->free_blks--;

    // Set lowest possible free block for speeding up next search.

    pool->low_free = index + 1;

    return &(pool->data[index * pool->blk_sz]);
}

 

void *qp_free (qp_pool *pool, void *ptr) {
    size_t index;

    // No pool created, use default.

    if (pool == NULL) {
            free (ptr);
            return NULL;
    }

    // Not in quick pool, use default.

    if (((char*)ptr < pool->data) || ((char*)ptr >= pool->enddata)) {
            free (ptr);
            return NULL;
    }

    // This is a quickpool address, free it.

    index = ((char*)ptr - pool->data) / pool->blk_sz;
    pool->isused[index] = 0;
    pool->free_blks++;

    // Optimise next search.

    if (index < pool->low_free)
            pool->low_free = index;

    return NULL;
}

For completeness, the main program for testing is below:

#include <stdio.h>
#include <string.h>
#include <time.h>

#include "quickpool.h"

#define FREE  0
#define ALLOC 1

#define NUMPTRS 512
static void *pointer[NUMPTRS];
static size_t numPointers = 0;

int main (int argCount, char *argVal[]) {
    int count, val, index, memsz, stMode, seed;

    qp_pool *quickPool;

    if (argCount < 3) return 1;  // Need a seed and either "standard" or "quickpool".

    seed = atoi (argVal[1]);
    stMode = (strcmp (argVal[2], "standard") == 0);

    srand (seed);
    clock_t baseline = clock();
    quickPool = qp_create (0, 0);

    for (count = 0; count < 1000000; count++) {
        if (numPointers == 0)
            val = ALLOC;
        else if (numPointers == NUMPTRS)
            val = FREE;
        else if (numPointers > NUMPTRS/2)
            val = ((rand() % 100) < 50) ? FREE : ALLOC;
        else
            val = ((rand() % 100) < 33) ? FREE : ALLOC;

        if (val == FREE) {
            index = rand() % numPointers;
            if (stMode)
                free (pointer[index]);
            else
                qp_free (quickPool, pointer[index]);
            pointer[index] = pointer[--numPointers];
        } else {
            memsz = rand() % 160;
            if (stMode)
                pointer[numPointers++] = malloc (memsz);
            else
                pointer[numPointers++] = qp_malloc (quickPool, memsz);
        }
    }

    quickPool = qp_destroy (quickPool);

    baseline = clock() - baseline;
    printf ("%ld\n", (long) (baseline * 1000 / CLOCKS_PER_SEC));

    return 0;
}

along with a shell script for analysis:

#!/usr/bin/bash
normal=0
quick=0
printf "    %10s  %10s\n" Normal Quick
printf "    ==========  ==========\n"
for iter1 in 0 1 ; do
    for iter2 in 0 1 2 3 4 5 6 7 8 9 ; do
            seed=$RANDOM

            val=$(./qptest.exe $seed standard)
            printf "${iter1}${iter2}  %10d  " $val
            ((normal = normal + val))

            val=$(./qptest.exe $seed quickpool)
            printf "%10d\n" $val
            ((quick = quick + val))
    done
done
printf "    ==========  ==========\n"
((pct = quick * 100 / normal))
printf "sum %10d  %10d (%d%%)\n" $normal $quick $pct

which outputs:

        Normal       Quick
    ==========  ==========
00         469         219
01         453         219
02         453         235
03         453         219
04         453         235
05         453         219
06         469         219
07         453         234
08         453         219
09         453         219
10         453         219
11         453         235
12         453         235
13         453         219
14         453         219
15         453         235
16         453         235
17         453         235
18         469         219
19         469         234
    ==========  ==========
sum       9124        4522 (49%)

Now my question is as follows: is there any other scope for optimising the quickpool code that doesn't involve bypassing the requirements:

  • an easy-to-integrate fix, requiring only a simple global search-and-replace on the source code;
  • the ability to have different sized pools (# of blocks) with different sized blocks;
  • dropping through to malloc if the quickpool cannot satisfy the request.
paxdiablo
  • The mix of bash scripts with `.exe` files is something of an anachronism! – caf Sep 01 '11 at 03:41
  • @caf, it was done under Cygwin :-) – paxdiablo Sep 01 '11 at 03:42
  • Couldn't you drop in jemalloc? – Steve-o Sep 01 '11 at 03:44
  • @Mark, if you mean the test behaviour, I did a million iterations where free/alloc was chosen randomly. The alloc/free mix was 100%/0% if none were alloced, 0/100% if 512 alloced, 50/50 if more than 255 alloced, otherwise 67/33. The amount to alloc was between 0 and 160, which should put about 80% below the 128-byte threshold. – paxdiablo Sep 01 '11 at 03:47
  • @Steve, not sure on the licencing of jemalloc but the customer would be _very_ reticent about using external code, regardless of licence. – paxdiablo Sep 01 '11 at 03:48
  • @pax I'm thinking finer grained line-oriented stats (like you get from using gprof) so we know which parts of the new code are taking the most time. My initial thought was you could better optimize the "search" for free blocks, but on closer inspection I'd wager that it's actually almost always a single iteration. – Mark Elliot Sep 01 '11 at 03:51
  • @Mark, put in the suggestion to use `gprof` as an answer. That's worth an upvote. – paxdiablo Sep 01 '11 at 04:07

4 Answers

4

If you can afford to spend a little bit of memory (say, a pointer per block) then you can keep the free blocks in a LIFO stack and eliminate the searches in qp_malloc() and qp_free(). It does make the code slightly more complicated but assures an O(1) time for all allocations.

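A minimal sketch of the idea (field names like free_stack and stack_top are just for illustration): the indexes of the free blocks live in an array used as a LIFO stack, so both allocating and freeing become a pop or a push with no scanning at all.

struct qp_pool_s {
    char   *data;        // Actual blocks.
    char   *enddata;     // First byte beyond data.
    size_t *free_stack;  // Indexes of currently free blocks (LIFO).
    size_t  stack_top;   // Number of entries on the stack.
    size_t  blk_sz;      // Size of each block.
};

// At create time, every block index is pushed onto the stack once.
static void qp_init_stack (qp_pool *pool, size_t quant) {
    size_t i;
    for (i = 0; i < quant; i++)
        pool->free_stack[i] = i;
    pool->stack_top = quant;
}

// Pop a free block; the caller has already checked stack_top > 0.
static void *qp_pop_block (qp_pool *pool) {
    size_t index = pool->free_stack[--pool->stack_top];
    return &(pool->data[index * pool->blk_sz]);
}

// Push a freed block's index back onto the stack.
static void qp_push_block (qp_pool *pool, void *ptr) {
    size_t index = ((char *) ptr - pool->data) / pool->blk_sz;
    pool->free_stack[pool->stack_top++] = index;
}
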
Amardeep AC9MF
  • Memory's not too much of an issue (within reason). But I see your point. I could have a circular buffer with the free indexes which would bypass a sequential search altogether. Good one. – paxdiablo Sep 01 '11 at 03:49
  • Not too shabby. This change has dropped the running time from 50% down to 43% at the cost of a single `size_t` array holding the free indexes (and head/tail indicators for the circular buffer of course). – paxdiablo Sep 01 '11 at 04:04
  • I'm glad to hear it helped. Performance comes easy if you can throw memory at the problem. :-) – Amardeep AC9MF Sep 01 '11 at 04:19
4

I have obtained a good speedup over your version (approx 25% on my hardware) while maintaining the existing interfaces by using a free list instead of a free map.

As a bonus, the code even gets simpler:

#include <string.h>
#include "quickpool.h"

// Various global values.

#define DEFLT_BLKS 1024
#define DEFLT_SZ    128

struct qp_pool_s {
    char *data;         // Actual blocks.
    char *enddata;      // First byte beyond data.
    size_t blk_sz;      // Size of each block.
    void *next_free;    // Next free block
};

qp_pool *qp_create (size_t quant, size_t sz)
{
    char *blk;

    // Zero means a default size.  sizeof(void *) is minimum block size.

    if (quant == 0) quant = DEFLT_BLKS;
    if (sz == 0)    sz = DEFLT_SZ;

    /* Round up size to a multiple of sizeof(void *) */
    sz = (sz + sizeof(void *) - 1) & ~(sizeof(void *) - 1);

    // Allocate memory blocks for pool.

    qp_pool *pool = malloc (sizeof (*pool));
    if (pool == NULL) return NULL;

    if ((pool->data = malloc (quant * sz)) == NULL) {
            free (pool);
            return NULL;
    }

    /* Set up free chain */
    for (blk = pool->data; blk < &pool->data[(quant - 1) * sz]; blk += sz)
            *(void **)blk = blk + sz;
    *(void **)blk = NULL;
    pool->next_free = pool->data;

    // Set information on pool and return it.

    pool->enddata = &(pool->data[quant * sz]);
    pool->blk_sz = sz;

    return pool;
}

qp_pool *qp_destroy (qp_pool *pool) {
    // Just free all the memory for pool.

    if (pool != NULL) {
            free (pool->data);
            free (pool);
    }

    return NULL;
}

void *qp_malloc (qp_pool *pool, size_t sz) {
    void *blk;

    // If there's no pool, the request exceeds the block size, or the pool is exhausted, fall back to the standard malloc.

    if (pool == NULL) return malloc (sz);

    if ((sz > pool->blk_sz) || (pool->next_free == NULL))
            return malloc (sz);

    // Otherwise, get from quickpool. First we find a free block.
    blk = pool->next_free;

    // Then we mark it used.
    pool->next_free = *(void **)blk;

    return blk;
}

void *qp_free (qp_pool *pool, void *ptr) {

    // No pool created, use default.

    if (pool == NULL) {
            free (ptr);
            return NULL;
    }

    // Not in quick pool, use default.

    if (((char*)ptr < pool->data) || ((char*)ptr >= pool->enddata)) {
            free (ptr);
            return NULL;
    }

    // This is a quickpool address, free it.
    *(void **)ptr = pool->next_free;
    pool->next_free = ptr;

    return NULL;
}

Results (my system malloc is obviously quite fast):

        Normal       Quick         Caf
    ==========  ==========  ==========
00         210         140         100
01         130         140         100
02         130         130         100
03         130         140         100
04         130         130         100
05         130         130          90
06         130         140         100
07         130         130         100
08         130         140         100
09         130         140         100
10         120         140         100
11         120         140         100
12         130         130         100
13         130         140         100
14         120         130         110
15         130         140         100
16         130         130         100
17         130         140         100
18         130         130         100
19         130         140         100
    ==========  ==========  ==========
sum       2650        2720        2000
                (    102%)  (     75%)
caf
  • Nice one, caf, this drops run time to 36%. As per the comment to Mark's answer, this change makes `main` the major user of resources (72%) so the quickpool code is now pretty good. – paxdiablo Sep 01 '11 at 05:12
  • @paxdiablo: I've made a slight bugfix - it now rounds up the block size to ensure it's a multiple of `sizeof(void *)`, so that it's always correctly aligned for that access. Depending on what you do with the returned blocks, you might want to align it even more strictly. – caf Sep 01 '11 at 05:50
  • Thanks, caf, I'll leave it as is for now. The actual chip is a Dallas 68302, based on the Motorola 68K chips, and I can't remember the specifics but I know `malloc` itself will give me a correctly aligned value. And, since the default block size is 128, I don't think that'll be a problem. We'll just stipulate that, if callers don't use the default, they're responsible for ensuring alignment by specifying a decent sz in the qp_create call: ie, no `qp_create (1024,13)` :-) The actual sz values are going to be 32, 64, 128 and so on anyway, just because they're nice round numbers. – paxdiablo Sep 01 '11 at 07:24
2

You could store the used/unused block flags as single bits in a bit array - assuming a 32-bit system, you could store the 1024 flags in 32 ints.

Then finding a spare block is just a matter of walking over the 32 ints, looking for one that isn't 0xffffffff.
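
A rough sketch of that search, assuming 1024 blocks and 32-bit words (the array name used_bits is just for illustration):

#include <stdint.h>

#define NUM_BLKS  1024
#define NUM_WORDS (NUM_BLKS / 32)

static uint32_t used_bits[NUM_WORDS];   // One bit per block: 1 = used, 0 = free.

// Returns the index of a free block and marks it used, or -1 if all are used.
static int find_free_block (void) {
    int word, bit;

    for (word = 0; word < NUM_WORDS; word++) {
        if (used_bits[word] != 0xFFFFFFFFu) {         // At least one free bit here.
            for (bit = 0; bit < 32; bit++) {
                if (!(used_bits[word] & (1u << bit))) {
                    used_bits[word] |= 1u << bit;     // Mark it used.
                    return word * 32 + bit;
                }
            }
        }
    }

    return -1;
}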

Martin Beckett
  • I did actually think of that but I wasn't sure that the increase in search speed would outweigh the cost of needing bitwise operations. But, since I keep harping on at everyone here about my "measure, don't guess" mantra, I'll give it a try :-) Cheers. – paxdiablo Sep 01 '11 at 03:41
  • Bitwise operations are super fast, plus you are only doing an int compare until you need to find which bit to look at. And reducing the amount of space spent storing meta-info improves cache performance. – Martin Beckett Sep 01 '11 at 03:43
1

At this point I think you ought to be using a code profiling tool, like gprof, to figure out what parts of your new code are really costing you the most time. It might also be worth running a profile on the whole program to figure out where to spend your time optimizing.

Mark Elliot
  • Thanks, Mark. I haven't been able to get gprof to give me line-by-line stuff (yet) but it's told me that `qp_malloc` represents 46% of run time at 119ns/call, `main` is 31%, `qp_free` is 15% at 40ns/call, others accumulate to less than 10%. So I'll focus on `qp_malloc` for now. – paxdiablo Sep 01 '11 at 04:32
  • @pax some good info on line-by-line funkiness [here](http://sourceware.org/binutils/docs/gprof/Line_002dby_002dline.html), but if all else fails you can split qp_malloc into sub-calls and "binary search" to find the costly code. – Mark Elliot Sep 01 '11 at 04:38
  • Compiling with all the right flags and `gprof -l` finally got me the line-by-line info. 31% of execution time is spent on that `for` loop searching for a free block (hence why the caf and Amardeep changes had such an effect). – paxdiablo Sep 01 '11 at 04:50
  • @pax awesome! rinse and repeat :) (where does caf's solution spend its time?) – Mark Elliot Sep 01 '11 at 04:51
  • 16% on each of `main:call-qpfree`, `main:call-qpmalloc` and `main:if(val==free)`. 8% on each of `main:call-rand-50%`, `main:remove-pointer-from-list-after-free`, `main:for-loop`, `qpmalloc:first-if`, `qpmalloc:second-if` and `qpmalloc:call-malloc`. In other words, `main` is the main culprit at 72%. – paxdiablo Sep 01 '11 at 05:10
  • @pax - I'd try increasing the pool size; stalling on the second `if` might mean you're hitting the size limit. Other than that, there's nothing left on the table with memory management. – Mark Elliot Sep 01 '11 at 05:15
  • I agree - don't stop optimizing just because you've fixed one problem. You should rinse and repeat. On the other hand, I wouldn't recommend [gprof, for these reasons](http://stackoverflow.com/questions/1777556/alternatives-to-gprof/1779343#1779343). If you can't get a wall-time line-percent-reporting stack-sampler like Zoom, it can be done by hand, which I do. – Mike Dunlavey Sep 01 '11 at 16:23