If one wants to parse files like,
2409, blah, blah, blah
0x10,foo, bar, baz, qux
# This is more difficult.
010 , a\
a, b b\#\\\,still b,c
one is probably better off just using a parser generator such as lex and yacc or, my favourite, re2c.
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include <limits.h>
#include <assert.h>
/* Tokens. */
/* X-macro list: `PARAM` expands each name to a bare identifier (for the
 enum); `STRINGISE` expands it to a string literal (for the name table).
 The two expansions are therefore guaranteed to stay in the same order. */
#define PARAM(A) A
#define STRINGISE(A) #A
#define TOKENS(X) X(ERROR), X(END), X(COMMA), X(NEWLINE), \
X(ESCAPE), X(WSP), X(NUMBER), X(WORD)
enum Token { TOKENS(PARAM) }; /* ERROR = 0, END, COMMA, ... */
/* Human-readable token names, same order as `enum Token`; presumably for
 debugging -- not referenced in the visible code. */
static const char *const tokens[] = { TOKENS(STRINGISE) };
/* `line` is the 1-based current line; `marker` is re2c's backtracking
 point; the current token occupies [from, cursor). */
struct Lexer { size_t line; char *marker, *from, *cursor; };
/** Returns the next significant token, advancing `lexer->cursor` past it;
 `lexer->from` is set to the token's first byte. Backslash-newline
 continuations and `#` comments are consumed silently (they jump back to
 `scan`), bumping `lexer->line` as they go. The input must be
 '\0'-terminated; END is returned at the terminator.
 NOTE(review): this file is re2c INPUT, not plain C -- the re2c sections
 below must be expanded by the re2c tool before compilation; as plain C
 this function has no return statement. */
static enum Token lex(struct Lexer *lexer) {
assert(lexer);
/*!re2c
re2c:yyfill:enable = 0;
re2c:define:YYCTYPE = char;
re2c:define:YYCURSOR = lexer->cursor;
re2c:define:YYMARKER = lexer->marker; // Rules overlap.
newline = "\n" | ("\r" "\n"?);
oct = "0" [0-7]*;
dec = [1-9][0-9]*;
hex = '0x' [0-9a-fA-F]+;
num = oct | dec | hex;
word = [^\x00\\\n\r \t\v\f,0-9]+;
comment = "#" [^\x00\n\r]* newline;
*/
/* Re-entry point for input that is skipped rather than returned. */
scan:
lexer->from = lexer->cursor;
/*!re2c
* { return ERROR; }
"\x00" { return END; }
[ \t\v\f]+ { return WSP; }
newline { lexer->line++; return NEWLINE; }
"\\\n" | comment { lexer->line++; goto scan; }
"\\\\" | "\\," | "\\ " | "\\n" | "\\#" { return ESCAPE; }
"," { return COMMA; }
word { return WORD; }
num { return NUMBER; }
*/
}
/* Growable byte buffer: `size` bytes of `data` are in use out of
 `capacity` allocated. */
struct Buffer {
	char *data;
	size_t size, capacity;
};
/** Ensures at least `reserve` free bytes follow `buf->size`, growing the
 allocation geometrically when needed. Does not change `buf->size`.
 @return Pointer to the first free byte, or null with `errno` set
 (ERANGE on size overflow, otherwise whatever `realloc` set). */
static char *buffer_reserve(struct Buffer *const buf, const size_t reserve) {
	size_t min, c;
	char *data;
	assert(buf); /* Assert before the first dereference. */
	min = buf->size + reserve;
	c = buf->capacity;
	/* Guard both the `size + reserve` sum and the doubling loop below
	 against overflow. */
	if(reserve > (size_t)-1 - buf->size || min > ((size_t)-1 >> 1) + 1)
		{ errno = ERANGE; return 0; }
	if(min > c) {
		if(!c) c = 1;
		/* Was `while(min <= c)`, which is always false inside this
		 branch, so the capacity never actually grew past 1 and the
		 caller would write out of bounds. */
		while(c < min) c <<= 1;
		if(!(data = realloc(buf->data, c))) return 0;
		buf->data = data;
		buf->capacity = c;
	}
	return buf->data + buf->size;
}
/* One field of a record, the half-open byte range [start, end) into the
 input buffer; a null `start` marks a missing field. */
struct Word { char *start, *end; };
/* Accumulates one `id, field, ...` record while it is being lexed. */
struct Parser {
	int id, id_set, first_comma;
	size_t num_words;
	struct Word words[64]; /* Lazy. */
	char *start_words, *end_words;
};
/* Capacity of `words`, computed without needing an instance. */
static size_t parser_max_words = sizeof ((struct Parser *)0)->words
	/ sizeof *((struct Parser *)0)->words;
/** Puts `parser` back in the start-of-record state. `id` is deliberately
 left stale; it is only meaningful while `id_set` is non-zero. */
static void clear_parser(struct Parser *const parser) {
	assert(parser);
	parser->num_words = 0;
	parser->first_comma = 1;
	parser->id_set = 0;
	parser->end_words = parser->start_words = 0;
}
/** Prints one finished record to stdout as `#id: <f1>, <f2>, ...`.
 A null field prints `<null>`, a zero-length field `<empty>`. The cast
 to int is safe because `expand_word` clamps word length to INT_MAX. */
static void print_parser(const struct Parser *const parser) {
	const struct Word *word, *word_end;
	/* Was dereferencing `parser` in the declarations above this assert;
	 a null `parser` would fault before the assert could fire. */
	assert(parser && parser->id_set && parser->num_words <= parser_max_words);
	word = parser->words;
	word_end = parser->words + parser->num_words;
	printf("#%d: ", parser->id);
	for( ; word < word_end; word++) {
		if(word != parser->words) printf(", ");
		if(!word->start) { printf("<null>"); continue; }
		assert(word->start <= word->end);
		if(word->start == word->end) { printf("<empty>"); continue; }
		printf("<%.*s>", (int)(word->end - word->start), word->start);
	}
	fputc('\n', stdout);
}
/** Extends the current word run to cover the just-lexed token,
 [lexer->from, lexer->cursor). The end is clamped so a word never exceeds
 INT_MAX bytes, because `print_parser` passes the length as an int. */
static void expand_word(struct Parser *const parser,
	const struct Lexer *const lexer) {
	assert(parser && lexer && lexer->from < lexer->cursor);
	if(!parser->start_words) {
		assert(!parser->end_words);
		parser->start_words = lexer->from;
	}
	/* Was `lexer->from + INT_MAX >= lexer->cursor`: forming
	 `from + INT_MAX` can point far past the buffer, which is undefined
	 pointer arithmetic; compare the in-bounds difference instead. The
	 clamped `from + INT_MAX` in the false branch is in bounds because
	 `cursor` is then even further along. */
	parser->end_words = (lexer->cursor - lexer->from <= INT_MAX) ?
		lexer->cursor : lexer->from + INT_MAX;
}
/** Commits the word run accumulated by `expand_word` as the next field of
 the record, then resets the run. Capacity is `parser_max_words` fields.
 @return Success, otherwise zero with `errno = EILSEQ` when the record is
 full. */
static int store_word(struct Parser *const parser) {
	struct Word *slot;
	assert(parser);
	if(parser->num_words >= parser_max_words) { errno = EILSEQ; return 0; }
	slot = parser->words + parser->num_words;
	parser->num_words++;
	slot->start = parser->start_words;
	slot->end = parser->end_words;
	parser->end_words = 0;
	parser->start_words = 0;
	return 1;
}
/** Reads the file named by `argv[1]` wholly into memory, lexes it, and
 echoes each `id, field, ...` record to stdout via `print_parser`.
 @return EXIT_SUCCESS, or EXIT_FAILURE with a diagnostic on stderr. */
int main(int argc, char **argv) {
	const size_t granularity = 1024;
	struct Lexer lexer = { 1, 0, 0, 0 };
	struct Parser parser;
	size_t nread;
	struct Buffer buf = { 0, 0, 0 };
	char *b;
	FILE *fp = 0;
	/* Was `int success = 0`: 0 equals EXIT_SUCCESS on conforming
	 platforms, so every `goto catch` path reported success to the
	 shell. */
	int success = EXIT_FAILURE, end_of_buffer = 0;
	/* Open. */
	if(argc != 2) return fprintf(stderr, "Needs filename.\n"), EXIT_FAILURE;
	if(!(fp = fopen(argv[1], "r"))) goto catch;
	/* Read the whole file into `buf`. */
	do {
		if(!(b = buffer_reserve(&buf, granularity))) goto catch;
		nread = fread(b, 1, granularity, fp);
		buf.size += nread;
	} while(nread == granularity);
	if(ferror(fp)) goto catch;
	fclose(fp), fp = 0;
	if(!(b = buffer_reserve(&buf, 1))) goto catch;
	*b = '\0'; /* Make sure it's a string; the lexer stops on '\0'. */
	/* Parse. */
	lexer.cursor = buf.data;
	clear_parser(&parser);
	do {
		enum Token tok;
		switch((tok = lex(&lexer))) {
		case ERROR: goto catch;
		case END:
			/* Flush a final record that lacks a trailing newline;
			 previously such a record was silently dropped. */
			if(parser.id_set) {
				if(!store_word(&parser)) goto catch;
				print_parser(&parser);
				clear_parser(&parser);
			} else if(parser.start_words) {
				errno = EILSEQ; goto catch;
			}
			end_of_buffer = 1;
			break;
		case COMMA:
			/* The first comma only separates the id from the fields;
			 later commas terminate a field each. */
			if(!parser.id_set) { errno = EILSEQ; goto catch; }
			if(parser.first_comma) { parser.first_comma = 0; break; }
			if(!store_word(&parser)) goto catch;
			break;
		case NEWLINE:
			if(parser.id_set) {
				/* We require at least key, data. */
				if(!store_word(&parser)) goto catch;
				print_parser(&parser);
				clear_parser(&parser);
			} else if(parser.start_words) {
				errno = EILSEQ; goto catch;
			}
			break;
		case ESCAPE:
			if(!parser.id_set) { errno = EILSEQ; goto catch; }
			expand_word(&parser, &lexer);
			break;
		case WSP: break;
		case NUMBER:
			if(parser.id_set) {
				/* Numbers inside a record are just word content. */
				expand_word(&parser, &lexer);
			} else {
				/* The leading number is the record id; base 0 accepts
				 decimal, 0x... hex, and 0... octal, matching the lexer. */
				char *end;
				long i = strtol(lexer.from, &end, 0);
				if(end != lexer.cursor || i < INT_MIN || i > INT_MAX)
					{ errno = EDOM; goto catch; }
				parser.id = (int)i;
				parser.id_set = 1;
			}
			break;
		case WORD:
			expand_word(&parser, &lexer);
			break;
		}
	} while(!end_of_buffer);
	success = EXIT_SUCCESS;
	goto finally;
catch:
	fprintf(stderr, "While on line %lu.\n", (unsigned long)lexer.line);
	perror("parsing");
	/* Was `lexer.from + INT_MAX >= lexer.cursor`, which can form an
	 out-of-bounds pointer; compare the difference instead. */
	assert(!lexer.from || (lexer.from < lexer.cursor
		&& lexer.cursor - lexer.from <= INT_MAX));
	if(lexer.from) fprintf(stderr, "While on %.*s.\n",
		(int)(lexer.cursor - lexer.from), lexer.from);
finally:
	free(buf.data);
	if(fp) fclose(fp);
	return success;
}
Prints,
#2409: <blah>, <blah>, <blah>
#16: <foo>, <bar>, <baz>, <qux>
#8: <a\
a>, <b b\#\\\,still b>, <c>
but that's probably overkill.