How to execute faster than "snprintf(mystr, 22, "{%+0.4f,%+0.4f}", (double)3.14159265, (double) 2.718281828459);" on a 32 bit mcu

Question

I've tried a few things, any it seems that at best I'm 1.5x slower than the printf() family of functions, which boggles my mind a bit. I think what I'm up against in this situation is the addressing of my device is 32bit, and I don't have an FPU. I've tried a couple of "ftoa()" implementations and constrained them to only look for 2 digits on the left of the decimal point, and left myself some breadcrumbs as to what the total length is of a larger overall string that I'm trying to build. At the end of the day, it seems like the nature of an array of 8-bit elements on a 32bit system is leading to a bunch of hidden shift operations, bitwise "OR" and bitwise NAND operations that are just slowing things down ridiculously...

Anyone have any general tips for this situation? (other than a re-architect to an 8.24 fixed point design) I've tried the compiler optimizations from wysiwyg to execution speed focused, nothing seems to beat snprintf.

Here's the fastest one that I had tried:

#if (__DEBUG)
 #define DATA_FIFO_SIZE         (8)
#else
 #define DATA_FIFO_SIZE         (1024)
#endif

typedef struct
{
    int32_t  rval[4];
    double   cval[4];
    uint16_t idx;
    uint16_t padding; //@attention the compiler was padding with 2 bytes to align to 32bit
} data_fifo_entry;

const char V_ERR_MSG[7] = "ERROR,\0";

static data_fifo_entry data_fifo[DATA_FIFO_SIZE];
static char embed_text[256];

/****
 * float to ASCII, adapted from
 * https://stackoverflow.com/questions/2302969/how-to-implement-char-ftoafloat-num-without-sprintf-library-function-i#7097567
 *
 ****/

 //@attention the following floating point #defs are linked!!
#define MAX_DIGITS_TO_PRINT_FLOAT           (6)
#define MAX_SUPPORTED_PRINTABLE_FLOAT       (+999999.99999999999999999999999999)
#define MIN_SUPPORTED_PRINTABLE_FLOAT       (-999999.99999999999999999999999999)
#define FLOAT_TEST6                         (100000.0)
#define FLOAT_TEST5                         (10000.0)
#define FLOAT_TEST4                         (1000.0)
#define FLOAT_TEST3                         (100.0)
#define FLOAT_TEST2                         (10.0)
#define FLOAT_TEST1                         (1.0)

static inline int ftoa(char *s, const float f_in, const uint8_t precision)
{
    float f_p    = 0.0001;
    float n      = f_in;
    int   neg    = (n < 0.0);
    int   length = 0;

    switch (precision)
    {
        case (1):
        {
            f_p = 0.1;
            break;
        }
        case (2):
        {
            f_p = 0.01;
            break;
        }
        case (3):
        {
            f_p = 0.001;
            break;
        }
        //case (4) is the default assumption
        case (5):
        {
            f_p = 0.00001;
            break;
        }
        case (6):
        {
            f_p = 0.000001;
            break;
        }
        default:   //already assumed, no assignments here
        {
            break;
        }
    }              /* switch */
    // handle special cases
    if (isnan(n))
    {
        strcpy(s, "nan\0");
        length = 4;
    }
    else if ((isinf(n)) || (n >= MAX_SUPPORTED_PRINTABLE_FLOAT) ||
        ((-1.0 * n) < MIN_SUPPORTED_PRINTABLE_FLOAT))
    {
        strcpy(s, "inf\0");
        length = 4;
    }
    else if (n == 0.0)
    {
        int idx;
        s[length++] = '+';
        s[length++] = '0';
        s[length++] = '.';
        for (idx = 0; idx < precision; idx++)
        {
            s[length++] = '0';
        }
        s[length++] = '\0';
    }
    else if (((n > 0.0) && (n < f_p)) || ((n < 0.0) && ((-1.0 * n) < f_p)))
    {
        int idx;
        if (n >= 0.0)
        {
            s[length++] = '+';
        }
        else
        {
            s[length++] = '-';
        }
        s[length++] = '0';
        s[length++] = '.';
        for (idx = 1; idx < precision; idx++)
        {
            s[length++] = '0';
        }
        s[length++] = '\0';
    }
    else
    {
        int digit, m;
        if (neg)
        {
            n = -n;
        }
        // calculate magnitude
        if (n >= FLOAT_TEST6)
        {
            m = 6;
        }
        else if (n >= FLOAT_TEST5)
        {
            m = 5;
        }
        else if (n >= FLOAT_TEST4)
        {
            m = 4;
        }
        else if (n >= FLOAT_TEST3)
        {
            m = 3;
        }
        else if (n >= FLOAT_TEST2)
        {
            m = 2;
        }
        else if (n >= FLOAT_TEST1)
        {
            m = 1;
        }
        else
        {
            m = 0;
        }
        if (neg)
        {
            s[length++] = '-';
        }
        else
        {
            s[length++] = '+';
        }
        // set up for scientific notation
        if (m < 1.0)
        {
            m = 0;
        }
        // convert the number
        while (n > f_p || m >= 0)
        {
            double weight = pow(10.0, m);
            if ((weight > 0) && !isinf(weight))
            {
                digit       = floor(n / weight);
                n          -= (digit * weight);
                s[length++] = '0' + digit;
            }
            if ((m == 0) && (n > 0))
            {
                s[length++] = '.';
            }
            m--;
        }
        s[length++] = '\0';
    }
    return (length - 1);
}                  /* ftoa */


static inline void print2_and_idx(int8_t idx1, int8_t idx2, uint16_t fifo_idx)
{
    //@attention 10 characters already in the buffer, idx does NOT start at zero
    uint8_t idx         = V_PREFIX_LENGTH;
    char    scratch[16] = {'\0'};
    char *  p_fifo_id;

    if ((idx1 >= 0) && (idx1 < MAX_IDX) && (idx2 >= 0) && (idx2 < MAX_IDX) &&
        (fifo_idx >= 0) && (fifo_idx < DATA_FIFO_SIZE))
    {
        ftoa(scratch, data_fifo[fifo_idx].cval[idx1], 4);
        memcpy((void *)&embed_text[idx += 7], (void *)scratch, 7);
        embed_text[idx++] = ',';
        ftoa(scratch, data_fifo[fifo_idx].cval[idx2], 4);
        memcpy((void *)&embed_text[idx += 7], (void *)scratch, 7);
        embed_text[idx++] = ',';
        //!\todo maybe print the .idx as fixed width, zero pad to 5 digits
        p_fifo_id         = utoa((char *)&embed_text[idx], (unsigned int)data_fifo[fifo_idx].idx, 10);
        idx              += strlen(p_fifo_id);
        embed_text[idx++] = ',';
    }
    else
    {
        memcpy((void *)&embed_text[idx], (void *)V_ERR_MSG, 7);
    }
}                  /* print2_and_idx */

once you add float from the language perspective it is going to explode in size and drop in performance, much of the time you can do everything in fixed point (the soft fpu is implemented using fixed point, if they can do the overkill stuff in fixed you can do exactly what you need in fixed with less overhead). you can try to reduce the floating point you are using, and if you have a C library that supports snprintf or anything printf you are again looking for a big binary, just do it yourself. — old_timer, Sep 27 '17 at 18:59
Use `long long pi, e; ... pi = 3.14159265*1000.0; e = 2.718281828459*1000.0;` and then print the 2 integers. IOWs, pay the price for code to do FP multiply and conversion to an integer, yet leave the formatting and print in the integer domain. — chux - Reinstate Monica, Sep 27 '17 at 19:11
@chux good thoughts, but I have real variables to print... The "pi" and "e" are just examples of what I'm starting with — grambo, Sep 27 '17 at 19:17
@old_timer I'm onboard with a rewrite in fixed point, but the program does not have the calendar time for that — grambo, Sep 27 '17 at 19:19
@grambo I am counting on that as " pay the price for code to do FP multiply". Had these been fixed values, there would be no need for run-time multiplication as the compiler would have done `pi = 31415;` — chux - Reinstate Monica, Sep 27 '17 at 19:20
As compared to `void format2double_1(char *mystr, double pi, double e) { snprintf(mystr, 22, "{%+0.4f,%+0.4f}", pi, e); }`, my code was about 20x faster. Look forward to comparing against your posted effort. — chux - Reinstate Monica, Sep 27 '17 at 19:57
if the speed of snprintf is essential - I think that the algorithm is just wrong. printf family functions are really well written and optimised. — 0___________, Sep 27 '17 at 20:11
`case (1):` - I saw something new. Like `return (length - 1);`. Parentheses are not necessary in both cases. — i486, Sep 27 '17 at 20:12
With `void format2double_3(char *mystr, double pi, double e) { mystr[0] = '{'; int offset = 1; offset += ftoa(mystr + offset , pi, 4); mystr[offset++] = ','; offset += ftoa(mystr + offset , e, 4); mystr[offset++] = '}'; mystr[offset] = 0; }` Your `ftoa()` code is rounding wrong and create a leading 0 `{+03.1415,+02.7182}`, unlike `snprintf(mystr, 22, ”{%+0.4f,%+0.4f}“, 3.14159265, 2.718281828459);` --> `{+3.1416,+2.7183}`. With inconsistent functionality, the goal is unclear. Your code about 10x faster than baseline. — chux - Reinstate Monica, Sep 27 '17 at 20:27
@PeterJ_01: as well written as they may be, they have to handle several corner cases of FP, plus the parsing of the format string, and the code must be generic to handle all the possible formatting requirements; if this functionality is never required (say, you always have "normal" numbers, a fixed format and, once scaled, all your numbers can fit an integer) you can gain quite some performance. — Matteo Italia, Sep 27 '17 at 20:36
@chux the leading zero bug wasn't chased down as the performance was so poor... — grambo, Sep 27 '17 at 20:40
@grambo Your `ftoa(buf, x, 4)` fails many corner cases, primarily precision ones even exceeding 0.0001 difference, missing trailing zeros, extra leading zeros and an annoying `0.0001(16 zeros)5` converted to `"+0"` a total loss of precision, Concern about performance maybe should be secondary. — chux - Reinstate Monica, Sep 27 '17 at 21:04
Tip: to make code _faster_, spend more time specifying the details of the goal. The more you narrow the needed functionality, the more speed can be had. The title's equivalent functionality is a good narrowing (yet could be more narrowed if restricted in type (e.g. `float`) and FP range. Narrowed functionality and speed comes at a price, more development time, less flexibility. BTW: Is it fast as measured by averaging typical FP values or fast by worst case time - again, the more info --> more speed. — chux - Reinstate Monica, Sep 28 '17 at 14:14
Sorry that I am another one questioning the requirements of your code, but: why does the runtime of a routine which is essentially written to produce _human_ readable output, matter at all? Who is (or are) these humans that are going read at the pace of your CPU output bandwidth? — Vroomfondel, Sep 29 '17 at 13:48

chux - Reinstate Monica · Answer 1 · 2017-09-28T14:50:58.153

Instead of using *printf() with FP arguments, convert the FP values first into scaled integers.
With still calling snprintf(), yet with integer and simple character arguments, my code was about 20x faster than the baseline.

Your mileage may vary. YMMV.

//baseline
void format2double_1(char *mystr, double pi, double e) {
  snprintf(mystr, 22, "{%+0.4f,%+0.4f}", pi, e);
  //puts(mystr);
}

void format2double_2(char *mystr, double pi, double e) {
  int pi_i = (int) lrint(pi * 10000.0);
  int api_i = abs(pi_i);
  int e_i = (int) lrint(e * 10000.0);
  int ae_i = abs(e_i);
  snprintf(mystr, 22, "{%c%d.%04d,%c%d.%04d}", //
      "+-"[pi_i < 0], api_i / 10000, api_i % 10000, //
      "+-"[e_i < 0], ae_i / 10000, ae_i % 10000);
  //puts(mystr);
}

[edit]

For a proper -0.0 text, use "+-"[!!signbit(pi)]

[edit]

Some idea for OP to consider as a ftoa() replacement. Central code is lrint(f_in * fscale[precision]); which rounds and scales. Untested.

#define PRINTABLE_MAGNITUDE_LIMIT       1000000

int ftoa_1(char *s, const float f_in, const uint8_t precision) {
  int n;
  sprintf(s, "%+.*f%n", precision, f_in, &n);
  return n;
}

int ftoa_2(char *s, const float f_in, const uint8_t precision) {
  float fscale[] = { 1, 10, 100, 1000, 10000, 100000, 1000000 };
  long iscale[] = { 1, 10, 100, 1000, 10000, 100000, 1000000 };
  assert(precision > 0 && precision < sizeof fscale / sizeof fscale[0]);
  // gross range check
  if (f_in > -PRINTABLE_MAGNITUDE_LIMIT && f_in < PRINTABLE_MAGNITUDE_LIMIT) {
    long value = lrint(f_in * fscale[precision]);
    value = labs(value);
    long scale = iscale[precision];
    long ipart = value / scale;
    long fpart = value % scale;
    // fine range check
    if (ipart < PRINTABLE_MAGNITUDE_LIMIT) {
      int n;
      sprintf(s, "%c%ld:%0*ld%n", signbit(f_in) ? '-' : '+', ipart, precision,
          fpart, &n);
      return n;
    }
  }
  // Out of range values need not be of performance concern for now.
  return ftoa_1(s, f_in, precision);
}

[edit]

To convert a positive or 0 integer to a string quickly without the need to shift the buffer or reverse it, see below. It also returns the string length for subsequent string building.

// Convert an unsigned to a decimal string and return its length
size_t utoa_length(char *dest, unsigned u) {
  size_t len = 0;
  if (u >= 10) {
    len = utoa_length(dest, u/10);
    dest += len;
  }
  dest[0] = '0' + u%10;
  dest[1] = '\0';
  return len + 1;
}

Impressive solution! should work for most floating point values smaller than `INT_MAX / 10000`, but using `long long` would give more range, also one small detail: `-0.0` print as `+0.0000` instead of `-0.0000`. — chqrlie, Sep 27 '17 at 21:18
Thanks, it looks good, I should be able to benchmark that tomorrow on my platform... I will play with a couple of different variations on the choice of sign character too. — grambo, Sep 27 '17 at 21:27
@chqrlie -Fie! ;-). You may enjoy [+0.0 and -0.0 give different ...](https://stackoverflow.com/q/25332133/2410359). — chux - Reinstate Monica, Sep 27 '17 at 21:30
@chqrlie True about range, yet with a buffer of 22 for 2 FPs, I suspect OP is working in a `fabs(x) < INT_MAX/1000.0` range — chux - Reinstate Monica, Sep 27 '17 at 21:35
@chqrlie As it turns out, `signbit(f_in) ? '-' : '+'` is much better than `"+-"[pi_i < 0]` for all sorts of values between `-0.00005 < x <= -0.0` as they all round to `0`. — chux - Reinstate Monica, Sep 27 '17 at 22:37
Indeed! I wrote a test program to verify the behavior on `-0.0` and did test negative values smaller than `0.0001`, but I focussed on `-0.0` and did not even see the bigger picture. — chqrlie, Sep 27 '17 at 22:43
Let us [continue this discussion in chat](http://chat.stackoverflow.com/rooms/155468/discussion-between-chux-and-chqrlie). — chux - Reinstate Monica, Sep 28 '17 at 01:55

Matteo Italia · Answer 2 · 2017-09-27T22:30:44.720

In a similar vein of @chux's answer, if the remaining snprintf is still slow you can go down the rabbit hole of hand-composing strings/hand-rendering integers.

char *fmtp04f(char *buf, char *lim, double d) {
    // if there's no space at all don't bother
    if(buf==lim) return buf;
    // 10 characters in maximum 32 bit integer, one for the dot,
    // one for the terminating NUL in debug prints
    char b[12];
    // current position in the buffer
    char *bp = b;
    // scale and round
    int32_t i = lrint(d * 10000.);
    // write sign and fix i sign
    // (we do have at least one character available in buf)
    if(signbit(d)) {
        *buf++='-';
        i = -i;
    } else {
        *buf++='+';
    }
    // *always* write down the last 4 digits, even if they are zeroes
    // (they'll become the 4 digits after the decimal dot)
    for(; bp!=b+4; ) {
        *bp++ = '0' + i%10;
        i/=10;
    }
    *bp++='.';
    // write down the remaining digits, writing at least one
    do {
        *bp++ = '0' + i%10;
        i/=10;
    } while(i != 0);
    // bp is at the character after the last, step back
    --bp;
    // data is now into b *in reversed order*;
    // reverse-copy it into the user-provided buffer
    while(buf!=lim) {
        *buf++ = *bp;
        // check before decrementing, as a pointer to one-before-first
        // is not allowed in C
        if(bp == b) break;
        --bp;
    }
    if(buf!=lim) *buf=0;    // "regular" case: terminate *after*
    else         lim[-1]=0; // bad case: truncate
    return buf;
}

void doformat(char *buf, char *lim, double a, double b) {
    if(buf==lim) return; // cannot do anything
    *buf++='{';
    if(buf==lim) goto end;
    buf = fmtp04f(buf, lim, a);
    if(buf==lim) return; // already terminated by fmtp04f
    *buf++=',';
    if(buf==lim) goto end;
    buf = fmtp04f(buf, lim, b);
    if(buf==lim) return; // idem
    *buf++='}';
    if(buf==lim) goto end;
    *buf++=0;
end:
    lim[-1]=0;   // always terminate
}

It passes some random tests, so I'm reasonably confident that it is not too wrong.

For some reason, @chux version on my machine (64 bit Linux, gcc 6.3) is generally 2/3 times faster than the baseline, while my version is usually 10/30 times faster than the baseline. I don't know if this is because my snprintf is particularly good or particularly bad. As said above, YMMV.

Yes, dropping `sprintf()` completely is even faster! I coded something like yours with `utoa_length()` from [here](https://stackoverflow.com/a/46456777/2410359) and was about 1.3 faster than your fine solution. We could have fun all day, yet duty calls. — chux - Reinstate Monica, Sep 28 '17 at 15:18

How to execute faster than "snprintf(mystr, 22, "{%+0.4f,%+0.4f}", (double)3.14159265, (double) 2.718281828459);" on a 32 bit mcu

2 Answers2