]> begriffs open source - sa-parse/blob - src/csv.l
need reentrant lexer
[sa-parse] / src / csv.l
1 %{
2 #include <stdio.h>
3 #include <string.h>
4 #include <stdlib.h>
5 #include <unistd.h>
6 #include "csv.tab.h"
7
8 /* Fix for bison-bridge with prefix */
9 #define YYSTYPE CSV_STYPE
10
/*
 * Scratch buffer in which the text of a quoted field is assembled piece by
 * piece.  It lives on the heap and is doubled whenever it fills up, so the
 * length of a field is limited only by available memory (a fixed-size array
 * would silently drop everything past its capacity).
 *
 * NOTE(review): this state is file-static, so all scanner instances share it
 * even though the scanner is generated with %option reentrant.  Do not run
 * two scanners at the same time until it is moved into per-scanner storage.
 */
enum { FIELD_INITIAL_CAPACITY = 8192 };

static char *field_buffer = NULL;  /* NUL-terminated once allocated */
static size_t field_cap = 0;       /* bytes allocated for field_buffer */
static size_t field_pos = 0;       /* bytes of content currently stored */

/*
 * Ensure that `extra` more content bytes plus the terminating NUL fit in the
 * buffer, growing it geometrically when they do not.  Follows the file's
 * out-of-memory policy: report on stderr and exit.
 */
static void reserve_field_space(size_t extra) {
    size_t needed;
    size_t new_cap;
    char *grown;

    /* Guard the size arithmetic below against wrap-around. */
    if (extra >= (size_t)-1 - field_pos) {
        fprintf(stderr, "Memory allocation failed in reserve_field_space()\n");
        exit(1);
    }

    needed = field_pos + extra + 1;
    if (needed <= field_cap) {
        return;
    }

    new_cap = field_cap ? field_cap : FIELD_INITIAL_CAPACITY;
    while (new_cap < needed) {
        if (new_cap > (size_t)-1 / 2) {
            new_cap = needed;  /* doubling would overflow; take the exact size */
            break;
        }
        new_cap *= 2;
    }

    /* Keep the old pointer until realloc is known to have succeeded. */
    grown = realloc(field_buffer, new_cap);
    if (!grown) {
        fprintf(stderr, "Memory allocation failed in reserve_field_space()\n");
        exit(1);
    }
    field_buffer = grown;
    field_cap = new_cap;
}

/* Discard the collected content.  The allocation is kept for reuse. */
void reset_field_buffer(void) {
    field_pos = 0;
    if (field_buffer) {
        field_buffer[0] = '\0';
    }
}

/* Append the single character `c` to the field being collected. */
void append_to_field(char c) {
    reserve_field_space(1);
    field_buffer[field_pos++] = c;
    field_buffer[field_pos] = '\0';
}

/*
 * Append the NUL-terminated string `s` to the field being collected.
 * `s` must not point into the field buffer itself.
 */
void append_string_to_field(const char *s) {
    size_t len = strlen(s);

    reserve_field_space(len);
    memcpy(field_buffer + field_pos, s, len + 1);  /* copies the NUL as well */
    field_pos += len;
}

/*
 * Return a newly allocated copy of the collected content, up to its first
 * NUL byte.  The caller owns the result and must free() it.  Never returns
 * NULL: the process exits if the copy cannot be allocated.
 */
char *get_field_content(void) {
    const char *src = field_buffer ? field_buffer : "";
    size_t len = strlen(src);
    /* malloc + memcpy instead of strdup keeps this helper within ISO C. */
    char *result = malloc(len + 1);

    if (!result) {
        fprintf(stderr, "Memory allocation failed in get_field_content()\n");
        exit(1);  /* Conservative approach: exit on memory failure */
    }
    memcpy(result, src, len + 1);
    return result;
}
42 %}
43
/*
 * Scanner options:
 *   noyywrap      - end of input ends the scan; no yywrap() is called
 *   yylineno      - flex keeps the current input line number
 *   nounput,
 *   noinput       - do not generate the unput()/input() helpers
 *   prefix        - generated symbols start with csv_ instead of yy
 *   bison-bridge  - the scanner receives a yylval pointer (the rules below
 *                   store field text through yylval->str)
 *   reentrant     - generate a reentrant scanner
 */
44 %option noyywrap
45 %option yylineno
46 %option nounput
47 %option noinput
48 %option prefix="csv_"
49 %option bison-bridge
50 %option reentrant
51
/* Exclusive start condition that is active between the opening and the
   closing double quote of a quoted field. */
52 %x ESCAPED_FIELD
53
/*
 * Named patterns, written as hex codes in the manner of the RFC 4180 ABNF.
 * TEXTDATA is printable ASCII (0x20-0x7E) without the double quote (0x22)
 * and the comma (0x2C).
 */
54 TEXTDATA    [\x20-\x21\x23-\x2B\x2D-\x7E]
55 COMMA       \x2C
56 CR          \x0D
57 LF          \x0A
58 DQUOTE      \x22
59 CRLF        {CR}{LF}
60
61 %%
62
{COMMA}                 {
                          /* Only the separator itself is reported here.
                             Per the original note, working out whether an
                             empty field belongs next to it is left to the
                             parser grammar. */
                          return COMMA_TOK;
                        }

{CR}?{LF}               {
                          /* CRLF and a bare LF both end a record. */
                          return CRLF_TOK;
                        }

{DQUOTE}                {
                          /* Opening quote: start from an empty field and
                             switch to the quoted-field rules. */
                          reset_field_buffer();
                          BEGIN(ESCAPED_FIELD);
                        }
77
<ESCAPED_FIELD>{DQUOTE}{DQUOTE}  {
                                   /* A doubled quote stands for one literal
                                      quote character. */
                                   append_to_field('"');
                                 }

<ESCAPED_FIELD>{DQUOTE}          {
                                   /* Closing quote: hand the collected text
                                      to the parser.  The copy is heap
                                      allocated and must be freed by whoever
                                      consumes the token. */
                                   char *text = get_field_content();
                                   BEGIN(INITIAL);
                                   yylval->str = text;
                                   return FIELD_TOK;
                                 }

<ESCAPED_FIELD>{LF}|.            {
                                   /* Every other byte, line breaks included,
                                      is ordinary field content.  The quote
                                      rules above are listed first, so they
                                      still win for a double quote. */
                                   append_to_field(yytext[0]);
                                 }

<ESCAPED_FIELD><<EOF>>           {
                                   /* Input ended before the closing quote:
                                      emit what was collected as a field. */
                                   char *text = get_field_content();
                                   BEGIN(INITIAL);
                                   yylval->str = text;
                                   return FIELD_TOK;
                                 }
102
{TEXTDATA}+             {
                          /* Unquoted field: a run of TEXTDATA characters.
                             The token carries a heap copy of the text;
                             whoever consumes FIELD_TOK must free() it. */
                          yylval->str = strdup(yytext);
                          if (!yylval->str) {
                              fprintf(stderr, "Memory allocation failed in TEXTDATA rule\n");
                              exit(1);  /* Conservative approach: exit on memory failure */
                          }
                          return FIELD_TOK;
                        }

[ \t]                   {
                          /* Dropped silently.  In practice only TAB gets
                             here: a space is also TEXTDATA, and that rule is
                             listed first and matches at least as much. */
                        }

.                       {
                          /* Anything else goes to the parser as a
                             single-character token.  Convert through
                             unsigned char so that bytes >= 0x80 are not
                             returned as negative values where char is
                             signed: Bison treats a token number <= 0 as
                             end of input. */
                          return (unsigned char) yytext[0];
                        }
115
116 %%