0

I want to parse lines like this one:

20191201 170003296,1.102290,1.102470,0

To do so, I have written the following basic yacc/lex grammar/lexer:

grammar.y:

%{

#include <time.h>
#include <stdio.h>

/*
 * The generated parser calls yylex() and yyerror(); yylex() is produced by
 * lex from lexer.l and yyerror() is defined in lexer.l.  Declare both here
 * so grammar.c does not depend on implicit function declarations.
 */
int yylex(void);
void yyerror(const char* msg);

%}

/* SEMANTIC VALUE CARRIED BY EACH TYPED TOKEN */
%union {

        struct tm       datetime;               /* BROKEN-DOWN DATE TIME VALUES */
        double          float_number;           /* DOUBLE PRECISION VALUE */
        long            integer_number;         /* LONG INTEGER VALUE (WIDTH IS PLATFORM DEPENDENT) */

}

%token  <datetime>              DATETIME
%token  <float_number>          FLOAT_NUMBER
%token  <integer_number>        INTEGER_NUMBER

%%

/*
 * One record: DATETIME, two FLOAT_NUMBER fields and one INTEGER_NUMBER,
 * separated by commas and terminated by a newline.
 * printf() takes a double with "%f"; "%lf" is only defined for printf from
 * C99 onwards and the build uses -std=c89.  The printed text is unchanged.
 */
lastbid_lastask:        DATETIME ',' FLOAT_NUMBER ',' FLOAT_NUMBER ',' INTEGER_NUMBER '\n'
                                { printf("MATCH %f %f %ld\n", $3, $5, $7); }
                        ;

%%

/*
 * Program entry point: runs the generated parser once and reports its
 * status to the caller.
 *
 * Returns the value of yyparse(): 0 when the input was accepted, non-zero
 * when parsing failed.
 */
int main(int argc, char *argv[]) {

        /* Command-line arguments are not used. */
        (void) argc;
        (void) argv;

        /* Propagate the parser status instead of always reporting success. */
        return yyparse();

}

lexer.l:

%{

#include <stdio.h>      /* printf, fprintf, sscanf */
#include <stdlib.h>     /* exit */
#include <string.h>     /* memory/string helpers */
#include <time.h>       /* struct tm, mktime, strftime; must precede grammar.h, whose union holds a struct tm */
#include "grammar.h"    /* token codes and yylval, generated from grammar.y */

/* Helpers called from the rule actions; each one fills in yylval. */
void read_float_number(void);
void read_integer_number(void);
void read_date_YYYYMMDD_HHMMSSmmm(void);

/* Error callback; defined at the bottom of this file. */
void yyerror(const char* msg);

%}


%%

        /* Timestamp token: YYYYMMDD HHMMSSmmm */
[12][09][0-9]{2}[01][0-9][0-3][0-9]" "[0-2][0-9][0-5][0-9][0-5][0-9][0-9]{3}    { read_date_YYYYMMDD_HHMMSSmmm(); return DATETIME; }

        /* Digits, a dot, then more digits */
[0-9]+"."[0-9]+         { read_float_number(); return FLOAT_NUMBER; }

        /* Plain run of digits */
[0-9]+                  { read_integer_number(); return INTEGER_NUMBER; }

        /* Comma is handed to the parser as a literal character token */
","                     { return ','; }

        /* Newline is handed to the parser as a literal character token */
\n                      { return '\n'; }

        /* Any other single character is returned as its own character code */
.                       { return yytext[0]; }


%%

/* READ FLOAT NUMBER */
/*
 * Converts the current token text (yytext) to a double, stores it in
 * yylval.float_number and writes trace output to stdout.
 */
void read_float_number(void) {
        printf("void read_float_number(void)\n");
        printf("#%s#\n", yytext);
        /* The scanf family needs "%lf" to store into a double. */
        sscanf(yytext, "%lf", &yylval.float_number);
        /* printf takes a double with "%f"; "%lf" is undefined before C99. */
        printf("%f\n", yylval.float_number);
}

/* READ INTEGER NUMBER */
/*
 * Converts the current token text (yytext) to a long, stores it in
 * yylval.integer_number and writes trace output to stdout.
 */
void read_integer_number(void) {
        long *const slot = &yylval.integer_number;

        fputs("void read_integer_number(void)\n", stdout);
        printf("#%s#\n", yytext);
        sscanf(yytext, "%ld", slot);
        printf("%ld\n", *slot);
}

/* READ YYYYMMDD HHMMSSmmm DATE */
/*
 * Parses the current token text (yytext) into a struct tm, normalizes it
 * with mktime(), prints it to stdout and stores it in yylval.datetime.
 *
 * The sscanf format below reads six fields (year, month, day, hour, minute,
 * second); the trailing three digits of the token (presumably milliseconds,
 * going by the "mmm" in the name) are not read and are therefore not stored.
 */
void read_date_YYYYMMDD_HHMMSSmmm(void) {

        /*
         * Declarations come first because C89 does not allow them after
         * statements.  The struct starts zeroed so that no field is left
         * indeterminate if mktime() fails and leaves it unmodified.
         */
        struct tm dt = {0};
        char buffer[80];

        printf("void read_date_YYYYMMDD_HHMMSSmmm(void)\n");
        printf("#%s#\n", yytext);

        /* READ VALUES */
        sscanf(yytext, "%4d%2d%2d %2d%2d%2d", &dt.tm_year, &dt.tm_mon, &dt.tm_mday, &dt.tm_hour, &dt.tm_min, &dt.tm_sec);

        /* NORMALIZE VALUES */
        dt.tm_year = dt.tm_year - 1900;         /* struct tm COUNTS YEARS FROM 1900 */
        dt.tm_mon = dt.tm_mon - 1;              /* struct tm MONTHS ARE 0-BASED */
        dt.tm_isdst = -1;                       /* NO INFORMATION ABOUT DST */
        mktime(&dt);                            /* NORMALIZE STRUCT TM */

        /* PRINT DATE TIME */
        if (strftime(buffer, sizeof buffer, "%c %Z", &dt) == 0) {
                /* Buffer contents are unspecified when strftime() returns 0. */
                buffer[0] = '\0';
        }
        printf("%s\n", buffer);

        /* COPY STRUCT TM TO YACC RETURN VALUE */
        yylval.datetime = dt;

}

/* YYERROR */
/*
 * Error callback: writes the given message to stderr with a "yyerror "
 * prefix and terminates the process with exit status 1.
 */
void yyerror(const char* msg) {
        enum { STATUS_ON_ERROR = 1 };

        (void) fprintf(stderr, "yyerror %s\n", msg);
        exit(STATUS_ON_ERROR);
}

makefile:

# Compile-only flags for every C file, hand-written or generated.
CCFLAGS = -std=c89 -c
# -d makes yacc also write the header with the token codes and the value
# union; together with -ogrammar.c the build expects it as grammar.h.
YFLAGS = -d
# Lex support library.
LDLIBS = -ll
OBJS = lexer.o grammar.o
TARGET = readfile

# These names are commands, not files.
.PHONY: all clean

all: $(TARGET)

$(TARGET): $(OBJS)
	$(CC) -std=c89 $(OBJS) $(LDLIBS) -o $@

# A single yacc run writes both grammar.c and grammar.h.  Only grammar.c
# carries the recipe, so the generator never runs twice (a rule headed
# "grammar.h grammar.o:" would be two independent rules sharing one recipe).
grammar.c: grammar.y
	$(YACC) $(YFLAGS) -ogrammar.c grammar.y

# grammar.h is a by-product of the rule above.  Fail clearly if it is
# missing, otherwise refresh its timestamp so it is never older than
# grammar.c.
grammar.h: grammar.c
	@test -f $@ || { echo "$@ is missing: run 'make clean' and rebuild" >&2; exit 1; }
	@touch $@

grammar.o: grammar.c
	$(CC) $(CCFLAGS) grammar.c

lexer.c: lexer.l
	$(LEX) -olexer.c lexer.l

# The scanner includes grammar.h, so it must exist before compiling.
lexer.o: lexer.c grammar.h
	$(CC) $(CCFLAGS) lexer.c

clean:
	rm -f $(TARGET) $(OBJS) grammar.c grammar.h lexer.c

When I test the program I get the following behaviour: the first line is parsed correctly, but the second line causes a yyerror:

% ./readfile 
20191201 170003296,1.102290,1.102470,0
void read_date_YYYYMMDD_HHMMSSmmm(void)
#20191201 170003296#
Sun Dec  1 17:00:03 2019 CET
void read_float_number(void)
#1.102290#
1.102290
void read_float_number(void)
#1.102470#
1.102470
void read_integer_number(void)
#0#
0
MATCH 1.102290 1.102470 0
20191201 170004413,1.102320,1.102470,0
void read_date_YYYYMMDD_HHMMSSmmm(void)
#20191201 170004413#
Sun Dec  1 17:00:04 2019 CET
yyerror syntax error

Why does the second line fail?

M.E.
  • 4,955
  • 4
  • 49
  • 128
  • 1
    yacc is not parsing lines but the *entire file*, yet you've declared that your language will only ever have one `lastbid_lastask` in it... – Antti Haapala -- Слава Україні Jul 14 '20 at 18:20
  • My goal is for the grammar to be applied to every input line. In this example there is only one rule in the grammar, but the idea is that there will be more, so the program will be able to parse a predefined set of CSV file formats at once. Any tips on how that should be done in yacc (I mean reading line by line and parsing)? – M.E. Jul 14 '20 at 19:00
  • I have added a return statement in the grammar rule. Without it, program fails on subsequent parsing, now I am struggling to detect when there is no more input (CONTROL+D) – M.E. Jul 14 '20 at 20:04
  • Don't put return statements into yacc rules. Unlike the lexer, the parser does not expect that and it may end up with internal variables in an inconsistent state. If you want a rule to immediately accept the input, use `YYACCEPT` (or `YYABORT` if you want the parse to fail with a non-zero return code.) – rici Jul 14 '20 at 20:47

1 Answer

1

Your grammar matches exactly one line.

If you want to match multiple lines, add at the beginning:

/* input is one or more lastbid_lastask records */
input: lastbid_lastask
     | input lastbid_lastask

Or, if you are happy to accept empty files (and why not?)

/* input is zero or more lastbid_lastask records */
input: /* empty */
     | input lastbid_lastask
rici
  • 234,347
  • 28
  • 237
  • 341
  • That fixes the multiline, and now I realize it is the structure used in the books. But how do I handle CONTROL+D to reflect that there is no more input? That would be technically another question but highly related to this – M.E. Jul 14 '20 at 20:50
  • You don't need to do anything. That will all happen automatically. (If you have some issue with ctrl+d, please describe what it is.) – rici Jul 14 '20 at 20:51
  • Ok, I will include that in another question because I do have issues with CONTROL+D. – M.E. Jul 14 '20 at 20:52
  • Related: https://stackoverflow.com/questions/62903945/why-this-yacclex-basic-parser-do-not-handle-controld-eof – M.E. Jul 14 '20 at 21:02