// Lexer state shared by the functions in this file.
// lexer_buffer: tokens handed back through lexer_unget(); lexer_next() pops from it first.
11 static list_t *lexer_buffer = &SENTINEL_LIST;
// lexer_continuation: one pushed-back input character, or a negative value when none is pending.
19 static int lexer_continuation = -1;
// lexer_file: the input currently being lexed (name and stream are set by lexer_init()).
20 static lexer_file_t lexer_file;
// Startup hook: the constructor attribute makes this run before main().
// Points the lexer at standard input and labels it "(stdin)".
22 __attribute__((constructor)) void lexer_init(void) {
23 lexer_file.file = "(stdin)";
25 lexer_file.fp = stdin;
// Push `ch` back so that the next lexer_file_get() returns it.
// Only one character fits in lexer_continuation; if one is already pending it is
// first handed back to the underlying stream with ungetc().
// NOTE(review): ISO C guarantees only a single character of ungetc() pushback;
// confirm callers never stack more than that.
28 static void lexer_file_unget(int ch) {
31 if (lexer_continuation >= 0)
32 ungetc(lexer_continuation, lexer_file.fp);
33 lexer_continuation = ch;
// Return the next input character, taking the pending pushed-back character
// (if any) in preference to reading the stream. The pending slot is always cleared.
36 static int lexer_file_get(void) {
37 int ch = (lexer_continuation < 0) ? getc(lexer_file.fp) : lexer_continuation;
38 lexer_continuation = -1;
// A newline read here makes the function start over with the following character.
// NOTE(review): the enclosing condition is not visible in this view; presumably this
// is the backslash-newline line splice. Confirm against the full source.
40 if ((ch = getc(lexer_file.fp)) == '\n') {
42 return lexer_file_get();
// Return a freshly malloc()ed bitwise copy of *token.
// NOTE(review): the malloc() result goes straight into memcpy() without a NULL
// check, so an allocation failure is undefined behaviour.
54 static lexer_token_t *lexer_token_copy(lexer_token_t *token) {
55 return memcpy(malloc(sizeof(lexer_token_t)), token, sizeof(lexer_token_t));
// Build a heap-allocated identifier token whose text is the buffer of `str`.
58 static lexer_token_t *lexer_identifier(string_t *str) {
59 return lexer_token_copy(&(lexer_token_t){
60 .type = LEXER_TOKEN_IDENTIFIER,
61 .string = string_buffer(str)
// Build a heap-allocated string-literal token whose text is the buffer of `str`.
64 static lexer_token_t *lexer_strtok(string_t *str) {
65 return lexer_token_copy(&(lexer_token_t){
66 .type = LEXER_TOKEN_STRING,
67 .string = string_buffer(str)
// Build a heap-allocated punctuation token for `punct`.
// (The initializer line that stores `punct` is not visible in this view.)
70 static lexer_token_t *lexer_punct(int punct) {
71 return lexer_token_copy(&(lexer_token_t){
72 .type = LEXER_TOKEN_PUNCT,
// Build a heap-allocated number token from the textual form `string`.
// (Which token field receives `string` is not visible in this view.)
76 static lexer_token_t *lexer_number(char *string) {
77 return lexer_token_copy(&(lexer_token_t){
78 .type = LEXER_TOKEN_NUMBER,
// Build a heap-allocated character-constant token for `value`.
// (The initializer line that stores `value` is not visible in this view.)
82 static lexer_token_t *lexer_char(char value) {
83 return lexer_token_copy(&(lexer_token_t){
84 .type = LEXER_TOKEN_CHAR,
// Consume characters through lexer_file_get().
// NOTE(review): presumably discards input up to the end of the current line; the
// loop and its exit condition are not visible in this view.
89 static void lexer_skip_comment_line(void) {
91 int c = lexer_file_get();
// Consume a block comment using a small state machine that starts in
// comment_outside and moves to comment_astrick under a condition not visible here.
// NOTE(review): presumably the '/' seen while in comment_astrick ends the comment;
// the statements taken on that branch are not visible. Confirm.
101 static void lexer_skip_comment_block(void) {
105 } state = comment_outside;
108 int c = lexer_file_get();
110 state = comment_astrick;
111 else if (state == comment_astrick && c == '/')
// Falls back to comment_outside; the condition guarding this is not visible here.
114 state = comment_outside;
// Read characters until EOF, testing each one for whitespace.
// NOTE(review): what happens for whitespace vs. other characters, and the return
// value, are not visible in this view.
118 static int lexer_skip(void) {
120 while ((c = lexer_file_get()) != EOF) {
// NOTE(review): isspace() already matches '\n' and '\r', so the two explicit
// comparisons look redundant.
121 if (isspace(c) || c == '\n' || c == '\r')
// Lex a numeric literal whose first character `c` has already been read.
// Characters are accumulated while they are digits, letters or '.'; the first
// character that is none of these ends the literal and a number token is returned
// for the accumulated text.
129 static lexer_token_t *lexer_read_number(int c) {
130 string_t *string = string_create();
131 string_cat(string, c);
133 int p = lexer_file_get();
134 if (!isdigit(p) && !isalpha(p) && p != '.') {
// (One statement before this return is not visible in this view.)
136 return lexer_number(string_buffer(string));
138 string_cat(string, p);
// If `c` is an octal digit, append it to the value in *r (shift left three bits,
// then OR in the digit).
// NOTE(review): the return statements are not visible; presumably true when a
// digit was consumed. Confirm.
143 static bool lexer_read_character_octal_brace(int c, int *r) {
144 if ('0' <= c && c <= '7') {
145 *r = (*r << 3) | (c - '0');
// Read an octal escape whose first digit `c` was already consumed.
// Up to two further characters are fetched and offered to
// lexer_read_character_octal_brace().
// NOTE(review): initialisation of `r`, handling of a non-octal character, and the
// return value are not visible in this view.
151 static int lexer_read_character_octal(int c) {
153 if (lexer_read_character_octal_brace((c = lexer_file_get()), &r)) {
154 if (!lexer_read_character_octal_brace((c = lexer_file_get()), &r))
// Decide whether `c` is an acceptable value for a universal character name.
161 static bool lexer_read_character_universal_test(unsigned int c) {
// NOTE(review): C11 6.4.3 excludes the surrogate range D800-DFFF; the lower bound
// 0x800 here looks like a typo for 0xD800, which would also affect 0x0800-0xD7FF.
// The statement executed for this range is not visible; confirm before changing.
162 if (0x800 <= c && c<= 0xDFFF)
// Otherwise accept anything from 0xA0 upward, plus the characters '$', '@' and '`'.
164 return 0xA0 <= c || c == '$' || c == '@' || c == '`';
// Read a universal character name consisting of `length` hexadecimal digits
// (the callers in this file pass 4 for \u and 8 for \U).
// Each digit adds four bits to the accumulated value `r`; its declaration and the
// final return are not visible in this view.
167 static int lexer_read_character_universal(int length) {
169 for (int i = 0; i < length; i++) {
170 int c = lexer_file_get();
// Case ranges are a GNU C extension.
172 case '0' ... '9': r = (r << 4) | (c - '0'); continue;
173 case 'a' ... 'f': r = (r << 4) | (c - 'a' + 10); continue;
174 case 'A' ... 'F': r = (r << 4) | (c - 'A' + 10); continue;
// Anything that is not a hexadecimal digit is reported as an error.
176 compile_error("not a valid universal character: %c", c);
// The assembled value must also pass lexer_read_character_universal_test().
180 if (!lexer_read_character_universal_test(r)) {
182 "not a valid universal character: \\%c%0*x",
183 (length == 4) ? 'u' : 'U',
// Read the digits of a \x escape, adding four bits per hexadecimal digit to `r`.
// NOTE(review): the check that triggers the error below, the loop exit and the
// return value are not visible in this view.
191 static int lexer_read_character_hexadecimal(void) {
192 int c = lexer_file_get();
196 compile_error("malformatted hexadecimal character");
198 for (;; c = lexer_file_get()) {
200 case '0' ... '9': r = (r << 4) | (c - '0'); continue;
201 case 'a' ... 'f': r = (r << 4) | (c - 'a' + 10); continue;
202 case 'A' ... 'F': r = (r << 4) | (c - 'A' + 10); continue;
// Translate the character following a backslash into the value it denotes.
// Simple escapes map directly; octal, \x, \u and \U escapes are delegated to
// their dedicated readers. An unrecognised escape reports
// "malformatted escape sequence".
212 static int lexer_read_character_escaped(void) {
213 int c = lexer_file_get();
216 case '\'': return '\'';
217 case '"': return '"';
218 case '?': return '?';
219 case '\\': return '\\';
220 case 'a': return '\a';
221 case 'b': return '\b';
222 case 'f': return '\f';
223 case 'n': return '\n';
224 case 'r': return '\r';
225 case 't': return '\t';
226 case 'v': return '\v';
// \e is not an ISO C escape; it is accepted here and maps to octal 033.
227 case 'e': return '\033';
// The first octal digit is already in `c`, so it is passed along.
228 case '0' ... '7': return lexer_read_character_octal(c);
229 case 'x': return lexer_read_character_hexadecimal();
230 case 'u': return lexer_read_character_universal(4);
231 case 'U': return lexer_read_character_universal(8);
233 compile_error("malformatted escape sequence");
// Lex the rest of a character constant (lexer_read_token() calls this after
// reading the opening quote). Reads one character, decoding it when it starts
// with a backslash, and requires the next character to be the closing quote.
240 static lexer_token_t *lexer_read_character(void) {
241 int c = lexer_file_get();
242 int r = (c == '\\') ? lexer_read_character_escaped() : c;
244 if (lexer_file_get() != '\'')
245 compile_error("unterminated character");
// The value is narrowed to char before being stored in the token.
247 return lexer_char((char)r);
// Lex the rest of a string literal (lexer_read_token() calls this after reading
// the opening double quote). Characters are accumulated in `string`, with escapes
// decoded through lexer_read_character_escaped().
// NOTE(review): the loop, its termination test and the condition guarding the
// error below are not visible in this view.
250 static lexer_token_t *lexer_read_string(void) {
251 string_t *string = string_create();
253 int c = lexer_file_get();
255 compile_error("Expected termination for string literal");
260 c = lexer_read_character_escaped();
261 string_cat(string, c);
263 return lexer_strtok(string);
// Lex an identifier whose first character `c1` has already been read.
// Following characters are appended while they are alphanumeric, '_' or '$'.
266 static lexer_token_t *lexer_read_identifier(int c1) {
267 string_t *string = string_create();
268 string_cat(string, (char)c1);
271 int c2 = lexer_file_get();
272 if (isalnum(c2) || c2 == '_' || c2 == '$') {
273 string_cat(string, c2);
// The first character that does not belong to the identifier is pushed back.
275 lexer_file_unget(c2);
276 return lexer_identifier(string);
// Read one more character and return either punctuator `a` or punctuator `e`.
// NOTE(review): the selecting condition is not visible; presumably `a` is chosen
// when the character equals `expect1`, otherwise `e`. Confirm.
282 static lexer_token_t *lexer_read_reclassify_one(int expect1, int a, int e) {
283 int c = lexer_file_get();
285 return lexer_punct(a);
287 return lexer_punct(e);
// Two-alternative variant of lexer_read_reclassify_one(): reads one more character
// and returns punctuator `a`, `b` or `e`.
// NOTE(review): the selecting conditions are not visible; presumably `a` for
// `expect1`, `b` for `expect2`, otherwise `e`. Confirm.
289 static lexer_token_t *lexer_read_reclassify_two(int expect1, int a, int expect2, int b, int e) {
290 int c = lexer_file_get();
292 return lexer_punct(a);
294 return lexer_punct(b);
296 return lexer_punct(e);
// Forward declaration: lexer_minicpp() calls lexer_read_token(), defined below.
299 static lexer_token_t *lexer_read_token(void);
// Minimal preprocessor-line handler, called by lexer_read_token().
// Matches the following input against "pragma" and recognises the directive names
// "warning_disable" and "warning_enable", which toggle compile_warning.
301 static lexer_token_t *lexer_minicpp(void) {
302 string_t *string = string_create();
303 string_t *method = string_create();
// Compare the input with "pragma" one character at a time; a mismatching
// character is appended to `string`.
307 for (const char *p = "pragma"; *p; p++) {
308 if ((ch = lexer_file_get()) != *p) {
309 string_cat(string, ch);
// Collect characters into `method` until a newline or a zero character.
// NOTE(review): this visible condition has no EOF test; confirm the hidden loop
// body handles end of input.
314 for (ch = lexer_file_get(); ch && ch != '\n'; ch = lexer_file_get()) {
317 string_cat(method, ch);
320 buffer = string_buffer(method);
322 if (!strcmp(buffer, "warning_disable"))
323 compile_warning = false;
324 if (!strcmp(buffer, "warning_enable"))
325 compile_warning = true;
// Push the characters collected in `string` back to the input in reverse order.
// NOTE(review): the walk starts at buffer[string_length(string)], one past the last
// character, and stops at &buffer[-1], a pointer before the array that ISO C does
// not allow forming. lexer_file_unget() also has a single slot. Confirm intent.
330 buffer = string_buffer(string);
331 for (char *beg = &buffer[string_length(string)]; beg != &buffer[-1]; --beg)
332 lexer_file_unget(*beg);
// Skip input via lexer_skip_comment_line(), then continue with the next token.
335 lexer_skip_comment_line();
336 return lexer_read_token();
// Read and return the next token from the input, dispatching on its first character.
// Many case labels are not visible in this view, so the section comments below
// describe only what the visible statements show.
339 static lexer_token_t *lexer_read_token(void) {
345 switch ((c = lexer_file_get())) {
// Literals: a leading digit starts a number, a double quote a string, a single
// quote a character constant.
346 case '0' ... '9': return lexer_read_number(c);
347 case '"': return lexer_read_string();
348 case '\'': return lexer_read_character();
353 return lexer_read_identifier(c);
// Identifier start that is only accepted when the EXTENSION_DOLLAR option is on.
355 if (opt_extension_test(EXTENSION_DOLLAR))
356 return lexer_read_identifier(c);
// Prefixed literal: a quote after the prefix reads a string or character constant;
// otherwise the prefix is treated as the start of an ordinary identifier ('L').
360 switch ((c = lexer_file_get())) {
361 case '"': return lexer_read_string();
362 case '\'': return lexer_read_character();
365 return lexer_read_identifier('L');
// Comment or division: comments are skipped and lexing restarts; otherwise a
// compound-division or plain '/' punctuator is produced.
368 switch ((c = lexer_file_get())) {
370 lexer_skip_comment_line();
371 return lexer_read_token();
373 lexer_skip_comment_block();
374 return lexer_read_token();
377 return lexer_punct(LEXER_TOKEN_COMPOUND_DIV);
379 return lexer_punct('/');
381 // ignore preprocessor lines for now
// (lexer_minicpp() still recognises a small "pragma" subset.)
383 return lexer_minicpp();
// Single-character punctuators are returned as themselves.
391 return lexer_punct(c);
// Operators that may combine with a following character into a longer token.
393 case '+': return lexer_read_reclassify_two('+', LEXER_TOKEN_INCREMENT, '=', LEXER_TOKEN_COMPOUND_ADD, '+');
394 case '&': return lexer_read_reclassify_two('&', LEXER_TOKEN_AND, '=', LEXER_TOKEN_COMPOUND_AND, '&');
395 case '|': return lexer_read_reclassify_two('|', LEXER_TOKEN_OR, '=', LEXER_TOKEN_COMPOUND_OR, '|');
396 case '*': return lexer_read_reclassify_one('=', LEXER_TOKEN_COMPOUND_MUL, '*');
397 case '%': return lexer_read_reclassify_one('=', LEXER_TOKEN_COMPOUND_MOD, '%');
398 case '=': return lexer_read_reclassify_one('=', LEXER_TOKEN_EQUAL, '=');
399 case '!': return lexer_read_reclassify_one('=', LEXER_TOKEN_NEQUAL, '!');
400 case '^': return lexer_read_reclassify_one('=', LEXER_TOKEN_COMPOUND_XOR, '^');
// Minus family: decrement, arrow, compound subtraction, or plain '-'.
403 switch ((c = lexer_file_get())) {
404 case '-': return lexer_punct(LEXER_TOKEN_DECREMENT);
405 case '>': return lexer_punct(LEXER_TOKEN_ARROW);
406 case '=': return lexer_punct(LEXER_TOKEN_COMPOUND_SUB);
411 return lexer_punct('-');
// Less-than family: less-or-equal, left shift (optionally compound), or plain '<'.
414 if ((c = lexer_file_get()) == '=')
415 return lexer_punct(LEXER_TOKEN_LEQUAL);
417 return lexer_read_reclassify_one('=', LEXER_TOKEN_COMPOUND_LSHIFT, LEXER_TOKEN_LSHIFT);
419 return lexer_punct('<');
// Greater-than family: greater-or-equal, right shift (optionally compound), or plain '>'.
421 if ((c = lexer_file_get()) == '=')
422 return lexer_punct(LEXER_TOKEN_GEQUAL);
424 return lexer_read_reclassify_one('=', LEXER_TOKEN_COMPOUND_RSHIFT, LEXER_TOKEN_RSHIFT);
426 return lexer_punct('>');
// Dot family: may continue as a number, become an identifier token spelled ".."
// plus the next character, or stay a plain '.'.
// NOTE(review): the conditions selecting between these are not visible here.
429 n = lexer_file_get();
432 return lexer_read_number(c);
435 string_t *str = string_create();
436 string_catf(str, "..%c", lexer_file_get());
437 return lexer_identifier(str);
440 return lexer_punct('.');
// Anything not matched above is reported as an unexpected character.
446 compile_error("Unexpected character: `%c`", c);
// True when `token` is non-NULL, is a punctuation token, and its punctuator equals `c`.
451 bool lexer_ispunct(lexer_token_t *token, int c) {
452 return token && (token->type == LEXER_TOKEN_PUNCT) && (token->punct == c);
// Hand `token` back to the lexer by pushing it onto lexer_buffer, from which
// lexer_next() takes tokens before reading new input.
// (Statements between the signature and the push are not visible in this view.)
455 void lexer_unget(lexer_token_t *token) {
458 list_push(lexer_buffer, token);
// Return the next token: one popped from lexer_buffer if any are queued there,
// otherwise a freshly lexed one from the input.
461 lexer_token_t *lexer_next(void) {
462 if (list_length(lexer_buffer) > 0)
463 return list_pop(lexer_buffer);
464 return lexer_read_token();
// Obtain the next token via lexer_next().
// NOTE(review): the rest of the body is not visible; presumably the token is
// handed back with lexer_unget() before being returned. Confirm.
467 lexer_token_t *lexer_peek(void) {
468 lexer_token_t *token = lexer_next();
// Produce a textual form of `token`, chosen by its type.
// An unhandled token reaches the internal-compiler-error report at the end.
473 char *lexer_token_string(lexer_token_t *token) {
474 string_t *string = string_create();
477 switch (token->type) {
478 case LEXER_TOKEN_PUNCT:
// The equality punctuator is spelled out as "=="; handling of other punctuators
// is not visible in this view.
479 if (token->punct == LEXER_TOKEN_EQUAL) {
480 string_catf(string, "==");
481 return string_buffer(string);
483 case LEXER_TOKEN_CHAR:
484 string_cat(string, token->character);
485 return string_buffer(string);
486 case LEXER_TOKEN_NUMBER:
// NOTE(review): this formats token->integer with %d, while lexer_number() is
// given a char *; confirm which field number tokens actually populate.
487 string_catf(string, "%d", token->integer);
488 return string_buffer(string);
489 case LEXER_TOKEN_STRING:
// String tokens are rendered wrapped in double quotes.
490 string_catf(string, "\"%s\"", token->string);
491 return string_buffer(string);
492 case LEXER_TOKEN_IDENTIFIER:
// Returns the token's own text rather than a copy; `string` goes unused on this path.
493 return token->string;
497 compile_ice("unexpected token");
// Return the current input position formatted as "<file>:<line>".
// NOTE(review): %zu assumes lexer_file.line is a size_t; its declaration is not
// visible here.
501 char *lexer_marker(void) {
502 string_t *string = string_create();
503 string_catf(string, "%s:%zu", lexer_file.file, lexer_file.line);
504 return string_buffer(string);