Parsing numbers in natural languages is not trivial. You can detect small numbers with string matching and it is easy to parse numbers like "twenty-three" by splitting them, but a general approach is difficult. For example, the word "hundred" in "three hundred thousand" refers to the 100,000s, not the 100s.
There is no library function for this, because numbers are better and more universally represented as decimal numbers. Also, such a function would work in one language only.
But you can roll your own. The code below divides the phrase into chunks, each consisting of a value (e.g. four, twenty-three) and a multiplier (e.g. thousand). Both are optional. It keeps a stack of such value-multiplier pairs to keep track of "precedence": "one hundred and four thousand" means (1 * 100 + 4) * 1000, but "one thousand one hundred and four" means 1 * 1000 + 1 * 100 + 4. When a multiplier is smaller than the one that follows it, it is evaluated first.
The code is case-sensitive, but it does not modify its input, so it works on string literals. A small test suite is in the main function below.
#include <errno.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * Fetch the next token from the text at *p.
 *
 * Tokens are runs of characters delimited by spaces and hyphens.  Leading
 * delimiters are skipped first.  On return *p points just past the token
 * (or at the terminating NUL when no token is left).
 *
 * p    in/out cursor into a NUL-terminated string
 * len  if not NULL, receives the token length; left untouched when no
 *      token is found
 *
 * Returns a pointer to the first character of the token (the token is NOT
 * NUL-terminated), or NULL when the end of the string has been reached.
 */
static const char *next(const char **p, int *len)
{
    const char *cur = *p;
    const char *start;

    /* Skip any separators in front of the token. */
    while (*cur == ' ' || *cur == '-') {
        cur++;
    }

    if (*cur == '\0') {
        *p = cur;
        return NULL;
    }

    /* The first character is known to belong to the token. */
    start = cur;
    do {
        cur++;
    } while (*cur != '\0' && *cur != ' ' && *cur != '-');

    *p = cur;
    if (len != NULL) {
        *len = (int)(cur - start);
    }
    return start;
}
/*
 * Compare a NUL-terminated word with a length-delimited token.
 *
 * q    NUL-terminated reference word
 * p    start of the token (need not be NUL-terminated); may be NULL
 * len  number of token characters to compare
 *
 * Returns 1 when the first len characters of p are exactly the word q
 * (same characters, same length), 0 otherwise.  A NULL token never matches.
 */
static int eq(const char *q, const char *p, int len)
{
    const char *word = q;
    const char *tok = p;
    int remaining;

    if (tok == NULL) {
        return 0;
    }

    for (remaining = len; remaining != 0; remaining--) {
        /* A differing character, or either string ending early, is a miss. */
        if (*tok != *word || *tok == '\0') {
            return 0;
        }
        tok++;
        word++;
    }

    /* The word must end exactly where the token does. */
    return (*word == '\0') ? 1 : 0;
}
/*
 * Look up the token p[0..len-1] in the NULL-terminated word list q.
 *
 * Returns the zero-based index of the first matching entry, or -1 when the
 * token is NULL or matches no entry.
 */
static int in(const char **q, const char *p, int len)
{
    int ix;

    /* No token means "not found"; it must never look like index 0. */
    if (p == NULL) {
        return -1;
    }
    for (ix = 0; q[ix] != NULL; ix++) {
        if (eq(q[ix], p, len)) {
            return ix;
        }
    }
    return -1;
}

/* Upper bound on the number of pending value/multiplier pairs. */
#define MAX 32

/*
 * Compute *acc += v * b without signed overflow.
 *
 * Expects *acc >= 0, v >= 0 and b >= 1.  Returns 1 on success; returns 0
 * and leaves *acc unchanged when the result would exceed INT64_MAX.
 */
static int add_product(int64_t *acc, int64_t v, int64_t b)
{
    if (v > (INT64_MAX - *acc) / b) {
        return 0;
    }
    *acc += v * b;
    return 1;
}

/*
 * Parse an English cardinal number such as "three hundred and nineteen",
 * "minus eight trillion" or "4 billion 294 million" into a 64-bit integer.
 *
 * Words are separated by spaces and/or hyphens and are matched
 * case-sensitively (lower case).  The text is read as a sequence of chunks,
 * each an optional value (a number word below one hundred, or decimal
 * digits) followed by an optional multiplier ("hundred" .. "quadrillion").
 * Chunks are kept on a stack; when a chunk's multiplier is larger than the
 * multipliers on top of the stack, those pairs are folded into the chunk's
 * value first, so "one hundred and four thousand" is (1 * 100 + 4) * 1000
 * while "one thousand one hundred and four" is 1 * 1000 + 1 * 100 + 4.
 *
 * p  NUL-terminated text to parse; it is not modified
 * n  receives the result; it is set to 0 when parsing fails
 *
 * Returns 1 on success and 0 on failure: a NULL argument, an unknown word,
 * no number at all, a trailing "and", a tens word followed by anything but
 * "one".."nine" as its ones word, two adjacent chunks with the same
 * multiplier, too many pending chunks, or a value that does not fit in an
 * int64_t.
 */
int parse_num(const char *p, int64_t *n)
{
    static const char *ones[] = {
        "zero", "one", "two", "three", "four", "five", "six",
        "seven", "eight", "nine", "ten", "eleven", "twelve",
        "thirteen", "fourteen", "fifteen", "sixteen",
        "seventeen", "eighteen", "nineteen", NULL
    };
    /* "-" can never be a token (hyphens are separators); the two
     * placeholders only make the index equal to the tens digit. */
    static const char *tens[] = {
        "-", "-", "twenty", "thirty", "forty", "fifty",
        "sixty", "seventy", "eighty", "ninety", NULL
    };
    static const struct {
        const char *word;
        int64_t mult;
    } scales[] = {
        { "quadrillion", 1000000000000000LL },
        { "trillion",    1000000000000LL },
        { "billion",     1000000000LL },
        { "million",     1000000LL },
        { "thousand",    1000LL },
        { "hundred",     100LL }
    };

    int64_t value[MAX];
    int64_t base[MAX];
    int64_t total = 0;
    int nvalue = 0;
    const char *t;
    int neg = 0;
    int len = 0;    /* next() leaves len untouched when it returns NULL */
    int x;

    if (n == NULL) {
        return 0;
    }
    *n = 0;
    if (p == NULL) {
        return 0;
    }

    t = next(&p, &len);
    if (eq("minus", t, len)) {
        neg = 1;
        t = next(&p, &len);
    }

    while (t) {
        int64_t num = 0;
        int64_t mult = 1;
        int have_tens = 0;
        int have_value = 0;
        int folded = 0;
        size_t i;

        /* "and" is only filler between chunks and must be followed by one. */
        if (nvalue && eq("and", t, len)) {
            t = next(&p, &len);
            if (t == NULL) {
                return 0;
            }
        }

        x = in(tens, t, len);
        if (x >= 0) {
            num += x * 10;
            t = next(&p, &len);
            have_tens = 1;
            have_value = 1;
        }

        x = in(ones, t, len);
        if (x >= 0) {
            /* "twenty zero" or "twenty eleven" are not numbers. */
            if (have_tens && (x < 1 || x > 9)) {
                return 0;
            }
            num += x;
            t = next(&p, &len);
            have_value = 1;
        }

        /* No number word: try decimal digits.  t is non-NULL here because
         * it is only advanced when a word was recognised. */
        if (!have_value) {
            long long l;
            char *end;

            errno = 0;
            l = strtoll(t, &end, 10);
            /* p points just past the token, so this requires the whole
             * token to have been consumed as digits. */
            if (end == p) {
                if (errno == ERANGE || l < 0) {
                    return 0;
                }
                num = l;
                have_value = 1;
                t = next(&p, &len);
            }
        }

        for (i = 0; i < sizeof scales / sizeof scales[0]; i++) {
            if (eq(scales[i].word, t, len)) {
                mult = scales[i].mult;
                t = next(&p, &len);
                break;
            }
        }

        /* A chunk needs a value, a multiplier, or both. */
        if (!have_value && mult == 1) {
            return 0;
        }
        if (nvalue && base[nvalue - 1] == mult) {
            return 0;
        }

        /* Smaller multipliers before a larger one bind to it:
         * "nine hundred thousand" is (9 * 100) * 1000. */
        while (nvalue && base[nvalue - 1] < mult) {
            nvalue--;
            if (!add_product(&num, value[nvalue], base[nvalue])) {
                return 0;
            }
            folded = 1;
        }

        if (nvalue == MAX) {
            return 0;
        }

        /* A bare multiplier ("hundred") counts as one of it; an explicit
         * zero ("zero thousand") stays zero. */
        if (mult > 1 && !have_value && !folded) {
            num = 1;
        }

        value[nvalue] = num;
        base[nvalue] = mult;
        nvalue++;
    }

    if (nvalue == 0) {
        return 0;
    }

    while (nvalue--) {
        if (!add_product(&total, value[nvalue], base[nvalue])) {
            return 0;
        }
    }

    *n = neg ? -total : total;
    return 1;
}
/*
 * Small self-test: feeds a list of sample phrases to parse_num and prints
 * one line per phrase.  The first column (18 characters wide, right
 * aligned) holds the parsed value, or "---" when parse_num returns 0; the
 * phrase itself follows.
 */
int main()
{
    const char *samples[] = {
        "zero",
        "three",
        "minus one",
        "ten",
        "twenty-one",
        "eighty",
        "eight hundred eighty-eight",
        "three hundred and nineteen",
        "eleven hundred",
        "one hundred and twenty one",
        "twenty-four thousand",
        "thirty thousand one",
        "two million",
        "two hundred thirty thousand and eleven",
        "three million two hundred and thirty thousand and eleven",
        "minus eight trillion",
        "fifty-five billion one million nine thousand and twelve",
        "one thousand million",
        "nine hundred thousand",
        "nine thousand hundred",
        "one hundred thousand million",
        "nineteen hundred eighty-four",
        "4 billion 294 million 967 thousand 296",
        "two thousand thousand",
        "minus",
        "thirty-something",
        NULL
    };
    int i;

    /* The list is NULL-terminated, so no separate count is needed. */
    for (i = 0; samples[i] != NULL; i++) {
        int64_t n;

        if (parse_num(samples[i], &n)) {
            printf("%18" PRId64, n);
        } else {
            printf("%18s", "---");
        }
        printf(" %s\n", samples[i]);
    }

    return 0;
}