pd.if.org Git - lice/blob - lexer.c

   1 #include <stdlib.h>
   2 #include <ctype.h>
   3 #include <stdio.h>
   4 #include <string.h>
   5
   6 #include "lexer.h"
   7 #include "util.h"
   8 #include "lice.h"
   9 #include "opt.h"
  10
  11 static list_t *lexer_buffer = &SENTINEL_LIST;
  12
  13 typedef struct {
  14     char   *file;
  15     size_t  line;
  16     FILE   *fp;
  17 } lexer_file_t;
  18
  19 static int          lexer_continuation = -1;
  20 static lexer_file_t lexer_file;
  21
  22 __attribute__((constructor)) void lexer_init(void) {
  23     lexer_file.file = "(stdin)";
  24     lexer_file.line = 1;
  25     lexer_file.fp   = stdin;
  26 }
  27
  28 static void lexer_file_unget(int ch) {
  29     if (ch == '\n')
  30         lexer_file.line --;
  31     if (lexer_continuation >= 0)
  32         ungetc(lexer_continuation, lexer_file.fp);
  33     lexer_continuation = ch;
  34 }
  35
  36 static int lexer_file_get(void) {
  37     int ch = (lexer_continuation < 0) ? getc(lexer_file.fp) : lexer_continuation;
  38     lexer_continuation = -1;
  39     if (ch == '\\') {
  40         if ((ch = getc(lexer_file.fp)) == '\n') {
  41             lexer_file.line ++;
  42             return lexer_file_get();
  43         }
  44         lexer_file_unget(ch);
  45         return '\\';
  46
  47     }
  48     if (ch == '\n')
  49         lexer_file.line ++;
  50
  51     return ch;
  52 }
  53
  54 static lexer_token_t *lexer_token_copy(lexer_token_t *token) {
  55     return memcpy(malloc(sizeof(lexer_token_t)), token, sizeof(lexer_token_t));
  56 }
  57
  58 static lexer_token_t *lexer_identifier(string_t *str) {
  59     return lexer_token_copy(&(lexer_token_t){
  60         .type      = LEXER_TOKEN_IDENTIFIER,
  61         .string    = string_buffer(str)
  62     });
  63 }
  64 static lexer_token_t *lexer_strtok(string_t *str) {
  65     return lexer_token_copy(&(lexer_token_t){
  66         .type      = LEXER_TOKEN_STRING,
  67         .string    = string_buffer(str)
  68     });
  69 }
  70 static lexer_token_t *lexer_punct(int punct) {
  71     return lexer_token_copy(&(lexer_token_t){
  72         .type      = LEXER_TOKEN_PUNCT,
  73         .punct     = punct
  74     });
  75 }
  76 static lexer_token_t *lexer_number(char *string) {
  77     return lexer_token_copy(&(lexer_token_t){
  78         .type      = LEXER_TOKEN_NUMBER,
  79         .string    = string
  80     });
  81 }
  82 static lexer_token_t *lexer_char(char value) {
  83     return lexer_token_copy(&(lexer_token_t){
  84         .type      = LEXER_TOKEN_CHAR,
  85         .character = value
  86     });
  87 }
  88
  89 static void lexer_skip_comment_line(void) {
  90     for (;;) {
  91         int c = lexer_file_get();
  92         if (c == EOF)
  93             return;
  94         if (c == '\n') {
  95             lexer_file_unget(c);
  96             return;
  97         }
  98     }
  99 }
 100
 101 static void lexer_skip_comment_block(void) {
 102     enum {
 103         comment_outside,
 104         comment_astrick
 105     } state = comment_outside;
 106
 107     for (;;) {
 108         int c = lexer_file_get();
 109         if (c == '*')
 110             state = comment_astrick;
 111         else if (state == comment_astrick && c == '/')
 112             return;
 113         else
 114             state = comment_outside;
 115     }
 116 }
 117
 118 static int lexer_skip(void) {
 119     int c;
 120     while ((c = lexer_file_get()) != EOF) {
 121         if (isspace(c) || c == '\n' || c == '\r')
 122             continue;
 123         lexer_file_unget(c);
 124         return c;
 125     }
 126     return EOF;
 127 }
 128
 129 static lexer_token_t *lexer_read_number(int c) {
 130     string_t *string = string_create();
 131     string_cat(string, c);
 132     for (;;) {
 133         int p = lexer_file_get();
 134         if (!isdigit(p) && !isalpha(p) && p != '.') {
 135             lexer_file_unget(p);
 136             return lexer_number(string_buffer(string));
 137         }
 138         string_cat(string, p);
 139     }
 140     return NULL;
 141 }
 142
 143 static bool lexer_read_character_octal_brace(int c, int *r) {
 144     if ('0' <= c && c <= '7') {
 145         *r = (*r << 3) | (c - '0');
 146         return true;
 147     }
 148     return false;
 149 }
 150
 151 static int lexer_read_character_octal(int c) {
 152     int r = c - '0';
 153     if (lexer_read_character_octal_brace((c = lexer_file_get()), &r)) {
 154         if (!lexer_read_character_octal_brace((c = lexer_file_get()), &r))
 155             lexer_file_unget(c);
 156     } else
 157         lexer_file_unget(c);
 158     return r;
 159 }
 160
 161 static bool lexer_read_character_universal_test(unsigned int c) {
 162     if (0x800 <= c && c<= 0xDFFF)
 163         return false;
 164     return 0xA0 <= c || c == '$' || c == '@' || c == '`';
 165 }
 166
 167 static int lexer_read_character_universal(int length) {
 168     unsigned int r = 0;
 169     for (int i = 0; i < length; i++) {
 170         int c = lexer_file_get();
 171         switch (c) {
 172             case '0' ... '9': r = (r << 4) | (c - '0');      continue;
 173             case 'a' ... 'f': r = (r << 4) | (c - 'a' + 10); continue;
 174             case 'A' ... 'F': r = (r << 4) | (c - 'A' + 10); continue;
 175             default:
 176                 compile_error("not a valid universal character: %c", c);
 177
 178         }
 179     }
 180     if (!lexer_read_character_universal_test(r)) {
 181         compile_error(
 182             "not a valid universal character: \\%c%0*x",
 183             (length == 4) ? 'u' : 'U',
 184             length,
 185             r
 186         );
 187     }
 188     return r;
 189 }
 190
 191 static int lexer_read_character_hexadecimal(void) {
 192     int c = lexer_file_get();
 193     int r = 0;
 194
 195     if (!isxdigit(c))
 196         compile_error("malformatted hexadecimal character");
 197
 198     for (;; c = lexer_file_get()) {
 199         switch (c) {
 200             case '0' ... '9': r = (r << 4) | (c - '0');      continue;
 201             case 'a' ... 'f': r = (r << 4) | (c - 'a' + 10); continue;
 202             case 'A' ... 'F': r = (r << 4) | (c - 'A' + 10); continue;
 203
 204             default:
 205                 lexer_file_unget(c);
 206                 return r;
 207         }
 208     }
 209     return -1;
 210 }
 211
 212 static int lexer_read_character_escaped(void) {
 213     int c = lexer_file_get();
 214
 215     switch (c) {
 216         case '\'':        return '\'';
 217         case '"':         return '"';
 218         case '?':         return '?';
 219         case '\\':        return '\\';
 220         case 'a':         return '\a';
 221         case 'b':         return '\b';
 222         case 'f':         return '\f';
 223         case 'n':         return '\n';
 224         case 'r':         return '\r';
 225         case 't':         return '\t';
 226         case 'v':         return '\v';
 227         case 'e':         return '\033';
 228         case '0' ... '7': return lexer_read_character_octal(c);
 229         case 'x':         return lexer_read_character_hexadecimal();
 230         case 'u':         return lexer_read_character_universal(4);
 231         case 'U':         return lexer_read_character_universal(8);
 232         case EOF:
 233             compile_error("malformatted escape sequence");
 234
 235         default:
 236             return c;
 237     }
 238 }
 239
 240 static lexer_token_t *lexer_read_character(void) {
 241     int c = lexer_file_get();
 242     int r = (c == '\\') ? lexer_read_character_escaped() : c;
 243
 244     if (lexer_file_get() != '\'')
 245         compile_error("unterminated character");
 246
 247     return lexer_char((char)r);
 248 }
 249
 250 static lexer_token_t *lexer_read_string(void) {
 251     string_t *string = string_create();
 252     for (;;) {
 253         int c = lexer_file_get();
 254         if (c == EOF)
 255             compile_error("Expected termination for string literal");
 256
 257         if (c == '"')
 258             break;
 259         if (c == '\\')
 260             c = lexer_read_character_escaped();
 261         string_cat(string, c);
 262     }
 263     return lexer_strtok(string);
 264 }
 265
 266 static lexer_token_t *lexer_read_identifier(int c1) {
 267     string_t *string = string_create();
 268     string_cat(string, (char)c1);
 269
 270     for (;;) {
 271         int c2 = lexer_file_get();
 272         if (isalnum(c2) || c2 == '_' || c2 == '$') {
 273             string_cat(string, c2);
 274         } else {
 275             lexer_file_unget(c2);
 276             return lexer_identifier(string);
 277         }
 278     }
 279     return NULL;
 280 }
 281
 282 static lexer_token_t *lexer_read_reclassify_one(int expect1, int a, int e) {
 283     int c = lexer_file_get();
 284     if (c == expect1)
 285         return lexer_punct(a);
 286     lexer_file_unget(c);
 287     return lexer_punct(e);
 288 }
 289 static lexer_token_t *lexer_read_reclassify_two(int expect1, int a, int expect2, int b, int e) {
 290     int c = lexer_file_get();
 291     if (c == expect1)
 292         return lexer_punct(a);
 293     if (c == expect2)
 294         return lexer_punct(b);
 295     lexer_file_unget(c);
 296     return lexer_punct(e);
 297 }
 298
 299 static lexer_token_t *lexer_read_token(void);
 300
 301 static lexer_token_t *lexer_minicpp(void) {
 302     string_t *string = string_create();
 303     string_t *method = string_create();
 304     char     *buffer;
 305     int       ch;
 306
 307     for (const char *p = "pragma"; *p; p++) {
 308         if ((ch = lexer_file_get()) != *p) {
 309             string_cat(string, ch);
 310             goto error;
 311         }
 312     }
 313
 314     for (ch = lexer_file_get(); ch && ch != '\n'; ch = lexer_file_get()) {
 315         if (isspace(ch))
 316             continue;
 317         string_cat(method, ch);
 318     }
 319
 320     buffer = string_buffer(method);
 321
 322     if (!strcmp(buffer, "warning_disable"))
 323         compile_warning = false;
 324     if (!strcmp(buffer, "warning_enable"))
 325         compile_warning = true;
 326
 327     goto fall;
 328
 329 error:
 330     buffer = string_buffer(string);
 331     for (char *beg = &buffer[string_length(string)]; beg != &buffer[-1]; --beg)
 332         lexer_file_unget(*beg);
 333
 334 fall:
 335     lexer_skip_comment_line();
 336     return lexer_read_token();
 337 }
 338
 339 static lexer_token_t *lexer_read_token(void) {
 340     int c;
 341     int n;
 342
 343     lexer_skip();
 344
 345     switch ((c = lexer_file_get())) {
 346         case '0' ... '9':  return lexer_read_number(c);
 347         case '"':          return lexer_read_string();
 348         case '\'':         return lexer_read_character();
 349         case 'a' ... 'z':
 350         case 'A' ... 'K':
 351         case 'M' ... 'Z':
 352         case '_':
 353             return lexer_read_identifier(c);
 354         case '$':
 355             if (opt_extension_test(EXTENSION_DOLLAR))
 356                 return lexer_read_identifier(c);
 357             break;
 358
 359         case 'L':
 360             switch ((c = lexer_file_get())) {
 361                 case '"':  return lexer_read_string();
 362                 case '\'': return lexer_read_character();
 363             }
 364             lexer_file_unget(c);
 365             return lexer_read_identifier('L');
 366
 367         case '/':
 368             switch ((c = lexer_file_get())) {
 369                 case '/':
 370                     lexer_skip_comment_line();
 371                     return lexer_read_token();
 372                 case '*':
 373                     lexer_skip_comment_block();
 374                     return lexer_read_token();
 375             }
 376             if (c == '=')
 377                 return lexer_punct(LEXER_TOKEN_COMPOUND_DIV);
 378             lexer_file_unget(c);
 379             return lexer_punct('/');
 380
 381         // ignore preprocessor lines for now
 382         case '#':
 383             return lexer_minicpp();
 384
 385         case '(': case ')':
 386         case ',': case ';':
 387         case '[': case ']':
 388         case '{': case '}':
 389         case '?': case ':':
 390         case '~':
 391             return lexer_punct(c);
 392
 393         case '+': return lexer_read_reclassify_two('+', LEXER_TOKEN_INCREMENT,    '=', LEXER_TOKEN_COMPOUND_ADD, '+');
 394         case '&': return lexer_read_reclassify_two('&', LEXER_TOKEN_AND,          '=', LEXER_TOKEN_COMPOUND_AND, '&');
 395         case '|': return lexer_read_reclassify_two('|', LEXER_TOKEN_OR,           '=', LEXER_TOKEN_COMPOUND_OR,  '|');
 396         case '*': return lexer_read_reclassify_one('=', LEXER_TOKEN_COMPOUND_MUL, '*');
 397         case '%': return lexer_read_reclassify_one('=', LEXER_TOKEN_COMPOUND_MOD, '%');
 398         case '=': return lexer_read_reclassify_one('=', LEXER_TOKEN_EQUAL,        '=');
 399         case '!': return lexer_read_reclassify_one('=', LEXER_TOKEN_NEQUAL,       '!');
 400         case '^': return lexer_read_reclassify_one('=', LEXER_TOKEN_COMPOUND_XOR, '^');
 401
 402         case '-':
 403             switch ((c = lexer_file_get())) {
 404                 case '-': return lexer_punct(LEXER_TOKEN_DECREMENT);
 405                 case '>': return lexer_punct(LEXER_TOKEN_ARROW);
 406                 case '=': return lexer_punct(LEXER_TOKEN_COMPOUND_SUB);
 407                 default:
 408                     break;
 409             }
 410             lexer_file_unget(c);
 411             return lexer_punct('-');
 412
 413         case '<':
 414             if ((c = lexer_file_get()) == '=')
 415                 return lexer_punct(LEXER_TOKEN_LEQUAL);
 416             if (c == '<')
 417                 return lexer_read_reclassify_one('=', LEXER_TOKEN_COMPOUND_LSHIFT, LEXER_TOKEN_LSHIFT);
 418             lexer_file_unget(c);
 419             return lexer_punct('<');
 420         case '>':
 421             if ((c = lexer_file_get()) == '=')
 422                 return lexer_punct(LEXER_TOKEN_GEQUAL);
 423             if (c == '>')
 424                 return lexer_read_reclassify_one('=', LEXER_TOKEN_COMPOUND_RSHIFT, LEXER_TOKEN_RSHIFT);
 425             lexer_file_unget(c);
 426             return lexer_punct('>');
 427
 428         case '.':
 429             n = lexer_file_get();
 430             if (isdigit(n)) {
 431                 lexer_file_unget(n);
 432                 return lexer_read_number(c);
 433             }
 434             if (n == '.') {
 435                 string_t *str = string_create();
 436                 string_catf(str, "..%c", lexer_file_get());
 437                 return lexer_identifier(str);
 438             }
 439             lexer_file_unget(n);
 440             return lexer_punct('.');
 441
 442         case EOF:
 443             return NULL;
 444
 445         default:
 446             compile_error("Unexpected character: `%c`", c);
 447     }
 448     return NULL;
 449 }
 450
 451 bool lexer_ispunct(lexer_token_t *token, int c) {
 452     return token && (token->type == LEXER_TOKEN_PUNCT) && (token->punct == c);
 453 }
 454
 455 void lexer_unget(lexer_token_t *token) {
 456     if (!token)
 457         return;
 458     list_push(lexer_buffer, token);
 459 }
 460
 461 lexer_token_t *lexer_next(void) {
 462     if (list_length(lexer_buffer) > 0)
 463         return list_pop(lexer_buffer);
 464     return lexer_read_token();
 465 }
 466
 467 lexer_token_t *lexer_peek(void) {
 468     lexer_token_t *token = lexer_next();
 469     lexer_unget(token);
 470     return token;
 471 }
 472
 473 char *lexer_token_string(lexer_token_t *token) {
 474     string_t *string = string_create();
 475     if (!token)
 476         return "(null)";
 477     switch (token->type) {
 478         case LEXER_TOKEN_PUNCT:
 479             if (token->punct == LEXER_TOKEN_EQUAL) {
 480                 string_catf(string, "==");
 481                 return string_buffer(string);
 482             }
 483         case LEXER_TOKEN_CHAR:
 484             string_cat(string, token->character);
 485             return string_buffer(string);
 486         case LEXER_TOKEN_NUMBER:
 487             string_catf(string, "%d", token->integer);
 488             return string_buffer(string);
 489         case LEXER_TOKEN_STRING:
 490             string_catf(string, "\"%s\"", token->string);
 491             return string_buffer(string);
 492         case LEXER_TOKEN_IDENTIFIER:
 493             return token->string;
 494         default:
 495             break;
 496     }
 497     compile_ice("unexpected token");
 498     return NULL;
 499 }
 500
 501 char *lexer_marker(void) {
 502     string_t *string = string_create();
 503     string_catf(string, "%s:%zu", lexer_file.file, lexer_file.line);
 504     return string_buffer(string);
 505 }