pd.if.org Git - pdutils/blob - utils/sh/tok.c

   1 #include <stdio.h>
   2 #include "gram.h"
   3
   4 /* START HEADER */
/* One lexical token produced by get_token(). */
struct token {
        int type; /* token type; TOKEN_* value, or EOF at end of input */
        int rtype; /* a "real type", e.g. In, NAME, etc. */
        int length; /* number of characters stored in text */
        int delimiter; /* character that ended the token, recorded by delimit() */
        int nondigits; /* consulted by delimit() when deciding on TOKEN_IO_NUMBER */
        int nonnames; /* not referenced by the tokenizer code in this file */
        struct token *next; /* not referenced by the tokenizer code in this file */
        char text[2048]; /* token characters; zero-filled before a token is built */
};
  15
/* Tokenizer state carried between calls to get_token(). */
struct token_state {
        FILE *input; /* stream read with fgetc() when is_string is false */
        struct token *tok; /* token currently being built (set by get_token) */

        char *strinput; /* characters to read when is_string is true */
        int is_string; /* true: read from strinput instead of input */
        int strlen; /* number of characters available in strinput */
        int cursor; /* index of the next unread character in strinput */

        int pc; /* previous char */
        int cc; /* current char */
        int nc; /* next char */
        int push_back; /* true if next char is valid */

        int prev_op; /* previous character was used as part of an operator */
        int prev_word; /* previous character is part of a word */
        int quoting; /* type of quoting */

        char buf[2048]; /* only buf[0] is written (by ts_init) in this file */
        int buflen; /* not referenced by the tokenizer code in this file */
};
  37
/* Prepare ts to read tokens from f; returns ts, or NULL when ts is NULL. */
struct token_state *ts_init(struct token_state *ts, FILE *f);
/* Read the next token into tok and return its type (EOF at end of input). */
int get_token(struct token_state *ts, struct token *tok);
  40
  41 /* END HEADER */
  42
  43 #if DEBUG_TOKEN
  44 static void ptoken(struct token_state *ts) {
  45         fprintf(stderr, "token %d = %s\n", ts->tok->type, ts->tok->text);
  46 }
  47 #endif
  48
/* Forward declarations for helpers defined further down in this file. */
static int nextch(struct token_state *ts);
static int peek(struct token_state *ts);
static int nextchar(struct token_state *ts);
static int process_char(struct token_state *ts);
  53
  54 static int peek(struct token_state *ts) {
  55         if (!ts->push_back) {
  56                 ts->nc = nextchar(ts);
  57                 ts->push_back = 1;
  58         }
  59         return ts->nc;
  60 }
  61
  62 static int nextch(struct token_state *ts) {
  63         if (!ts->push_back) {
  64                 ts->nc = nextchar(ts);
  65         }
  66         ts->pc = ts->cc;
  67         ts->cc = ts->nc;
  68         ts->push_back = 0;
  69         return ts->cc;
  70 }
  71
#if 0
/*
 * Disabled (compiled out).  Moves the current character into the lookahead
 * slot and makes the previous character current again; returns the new
 * current character.
 * NOTE(review): push_back is not set here, so a following nextch() would
 * read a fresh character over nc -- confirm before re-enabling.
 */
static int pushback(struct token_state *ts) {
        /* TODO error if already pushed back? */
        ts->nc = ts->cc;
        ts->cc = ts->pc;
        return ts->cc;
}
#endif
  80
  81 static int add_to_token(struct token_state *ts, int ch) {
  82         ts->tok->text[ts->tok->length++] = (char)ch;
  83 #if DEBUG_TOKEN
  84         ptoken(ts);
  85 #endif
  86         return ts->tok->length;
  87 }
  88
  89 static int start_token(struct token_state *ts, int type, int ch) {
  90         int i;
  91         for (i=0;i<sizeof ts->tok->text; i++) {
  92                 ts->tok->text[i] = 0;
  93         }
  94         ts->tok->type = type;
  95         ts->tok->text[0] = (char)ch;
  96         ts->tok->length = 1;
  97 #if DEBUG_TOKEN
  98         fprintf(stderr, "starting ");
  99         ptoken(ts);
 100 #endif
 101         return 1;
 102 }
 103
 104 struct token_state *ts_init(struct token_state *ts, FILE *f) {
 105         if (!ts) return NULL;
 106         /* TODO malloc one */
 107
 108         ts->is_string = 0;
 109         ts->push_back = 0;
 110         ts->prev_op = 0;
 111         ts->prev_word = 0;
 112         ts->quoting = 0;
 113
 114         ts->input = f;
 115         ts->buf[0] = 0;
 116         return ts;
 117 }
 118
 119 int get_token(struct token_state *ts, struct token *tok) {
 120         int i;
 121         ts->tok = tok;
 122         tok->type = 0;
 123         tok->length = 0;
 124         ts->prev_op = 0;
 125         ts->prev_word = 0;
 126         ts->quoting = 0;
 127         for (i=0;i<sizeof tok->text; i++) {
 128                 tok->text[i] = 0;
 129         }
 130
 131         while (!process_char(ts)) {
 132         }
 133         return tok->type;
 134 }
 135
 136 /* true if the character could be used with the given token to make
 137  * an operator
 138  */
 139 static int can_op(int type, int ch) {
 140         /* TODO */
 141         switch (type) {
 142                 case TOKEN_Ampersand:
 143                         if (ch == '&') return TOKEN_AND_IF; break;
 144                 case TOKEN_Pipe:
 145                         if (ch == '|') return TOKEN_OR_IF; break;
 146                 case TOKEN_Semicolon:
 147                         if (ch == ';') return TOKEN_DSEMI; break;
 148                 case TOKEN_DLESS:
 149                         if (ch == '-') return TOKEN_DLESSDASH; break;
 150                 case TOKEN_Lessthan:
 151                         switch (ch) {
 152                                 case '<': return TOKEN_DLESS; break;
 153                                 case '&': return TOKEN_LESSAND; break;
 154                                 case '>': return TOKEN_LESSGREAT; break;
 155                                 default: break;
 156                         }
 157                         break;
 158                 case TOKEN_Greaterthan:
 159                         switch (ch) {
 160                                 case '>': return TOKEN_DGREAT; break;
 161                                 case '&': return TOKEN_GREATAND; break;
 162                                 case '|': return TOKEN_CLOBBER; break;
 163                                 default: break;
 164                         }
 165                         break;
 166                 default:
 167                         break;
 168         }
 169         return 0;
 170 }
 171
 172 static int valid_token(struct token_state *ts) {
 173         return ts->tok && ts->tok->length;
 174 }
 175
 176 static int delimit(struct token_state *ts, int ch) {
 177         if (valid_token(ts)) {
 178                 ts->tok->delimiter = ch;
 179                 if (ts->tok->length == 1 && ts->tok->text[0] == '\n') {
 180                         ts->tok->type = TOKEN_NEWLINE;
 181                 } else if (!ts->tok->nondigits && (ts->tok->delimiter == '>' || ts->tok->delimiter == '<')) {
 182                         ts->tok->type = TOKEN_IO_NUMBER;
 183                 }
 184 #if DEBUG_TOKEN
 185         fprintf(stderr, "delimited ");
 186         ptoken(ts);
 187 #endif
 188                 return 1;
 189         }
 190
 191         return 0;
 192 }
 193
 194 /* return true if delimited */
 195 static int process_char(struct token_state *ts) {
 196         int cc;
 197
 198         cc = peek(ts);
 199 #if 1
 200         fprintf(stderr, "looking at a '%c'\n", cc);
 201 #endif
 202
 203 /*
 204  * If the end of input is recognized, the current token shall be delimited. If
 205  * there is no current token, the end-of-input indicator shall be returned as
 206  * the token.
 207  */
 208         if (cc == EOF) {
 209                 if (delimit(ts,cc)) {
 210                         return 1;
 211                 }
 212                 fprintf(stderr, "EOF Token\n");
 213                 ts->tok->type = EOF;
 214                 return 1;
 215         }
 216
 217 /*
 218  * If the previous character was used as part of an operator and the current
 219  * character is not quoted and can be used with the current characters to form
 220  * an operator, it shall be used as part of that (operator) token.
 221  */
 222         if (ts->prev_op && !ts->quoting) {
 223                 int newop = can_op(ts->tok->type, cc);
 224                 if (newop) {
 225                         ts->tok->type = newop;
 226                         add_to_token(ts,cc);
 227                         nextch(ts);
 228                         return 0;
 229                 }
 230         }
 231
 232 /*
 233  * If the previous character was used as part of an operator and the current
 234  * character cannot be used with the current characters to form an
 235  * operator, the operator containing the previous character shall be
 236  * delimited.
 237  */
 238         if (ts->prev_op && ! can_op(ts->tok->type, cc)) {
 239                 if (delimit(ts,cc)) return 1;
 240                 /* TODO error here, should be impossible */
 241         }
 242
 243         /* match quote */
 244 /*
 245  * If the current character is <backslash>, single-quote, or double-quote and
 246  * it is not quoted, it shall affect quoting for subsequent characters up to
 247  * the end of the quoted text. The rules for quoting are as described in
 248  * Quoting . During token recognition no substitutions shall be actually
 249  * performed, and the result token shall contain exactly the characters that
 250  * appear in the input (except for <newline> joining), unmodified, including
 251  * any embedded or enclosing quotes or substitution operators, between the
 252  * <quotation-mark> and the end of the quoted text. The token shall not be
 253  * delimited by the end of the quoted field.
 254  */
 255         if (!ts->quoting) {
 256                 if (cc == '\\' || cc == '\'' || cc == '"') {
 257                         ts->quoting = cc;
 258                 }
 259         }
 260
 261         /* match expansion */
 262 /*
 263  * If the current character is an unquoted '$' or '`', the shell shall identify
 264  * the start of any candidates for parameter expansion (Parameter Expansion),
 265  * command substitution (Command Substitution), or arithmetic expansion
 266  * (Arithmetic Expansion) from their introductory unquoted character sequences:
 267  * '$' or "${", "$(" or '`', and "$((", respectively. The shell shall read
 268  * sufficient input to determine the end of the unit to be expanded (as
 269  * explained in the cited sections). While processing the characters, if
 270  * instances of expansions or quoting are found nested within the substitution,
 271  * the shell shall recursively process them in the manner specified for the
 272  * construct that is found. The characters found from the beginning of the
 273  * substitution to its end, allowing for any recursion necessary to recognize
 274  * embedded constructs, shall be included unmodified in the result token,
 275  * including any embedded or enclosing substitution operators or quotes.  The
 276  * token shall not be delimited by the end of the substitution.
 277  */
 278 #if 0
 279         if (!ts->quoting) {
 280                 if (cc == '$' || cc == '`') {
 281                         int nc;
 282                         nc = peek(ts);
 283                         switch (nc) {
 284                                 case '{':
 285                                         ts->expanding = EXP_BRACKET; break;
 286                                 case '(':
 287                                         ts->expanding = EXP_COMMAND; break;
 288                                 default:
 289                                         ts->expanding = EXP_WORDVAR; break;
 290                         }
 291                         do_expansion(ts, cc);
 292                 } else if (cc == '`') {
 293                         ts->expanding = EXP_COMMAND;
 294                 }
 295         }
 296 #endif
 297
 298 /*
 299  * If the current character is not quoted and can be used as the first
 300  * character of a new operator, the current token (if any) shall be delimited.
 301  * The current character shall be used as the beginning of the next (operator)
 302  * token.
 303  */
 304
 305 /*
 306  * If the current character is an unquoted <newline>, the current token shall
 307  * be delimited.
 308  */
 309
 310 /*
 311  * If the current character is an unquoted <blank>, any token containing the
 312  * previous character is delimited and the current character shall be
 313  * discarded.
 314  */
 315
 316 /*
 317  * The application shall quote the following characters if they are to
 318  * represent themselves:
 319  *
 320  * |  &  ;  <  >  (  )  $  `  \  "  '  <space>  <tab>  <newline>
 321  */
 322
 323
 324         if (! ts->quoting) {
 325                 switch (cc) {
 326                         case '&':
 327                                 if (delimit(ts,cc)) return 1;
 328                                 nextch(ts);
 329                                 ts->prev_op = 1;
 330                                 start_token(ts, TOKEN_Ampersand, cc);
 331                                 return 0; break;
 332                         case '|':
 333                                 if (delimit(ts,cc)) return 1;
 334                                 nextch(ts);
 335                                 ts->prev_op = 1;
 336                                 start_token(ts, TOKEN_Pipe,cc);
 337                                 return 0; break;
 338                         case ';':
 339                                 if (delimit(ts,cc)) return 1;
 340                                 nextch(ts);
 341                                 ts->prev_op = 1;
 342                                 start_token(ts, TOKEN_Semicolon,cc);
 343                                 return 0; break;
 344                         case '<':
 345                                 if (delimit(ts,cc)) return 1;
 346                                 nextch(ts);
 347                                 ts->prev_op = 1;
 348                                 start_token(ts, TOKEN_Lessthan,cc);
 349                                 return 0; break;
 350                         case '>':
 351                                 if (delimit(ts,cc)) return 1;
 352                                 nextch(ts);
 353                                 ts->prev_op = 1;
 354                                 start_token(ts, TOKEN_Greaterthan,cc);
 355                                 return 0; break;
 356                         case '\n':
 357                                 if (delimit(ts,cc)) return 1;
 358                                 fprintf(stderr, "newline\n");
 359                                 nextch(ts);
 360                                 start_token(ts, TOKEN_NEWLINE,cc);
 361                                 if (delimit(ts,cc)) return 1;
 362                                 break;
 363                         case '(':
 364                                 if (delimit(ts,cc)) return 1;
 365                                 nextch(ts);
 366                                 start_token(ts, TOKEN_Lparen,cc);
 367                                 if (delimit(ts,cc)) return 1;
 368                                 break;
 369                         case ')':
 370                                 if (delimit(ts,cc)) return 1;
 371                                 nextch(ts);
 372                                 start_token(ts, TOKEN_Rparen,cc);
 373                                 if (delimit(ts,cc)) return 1;
 374                                 break;
 375                         case ' ':
 376                         case '\t':
 377                                 if (delimit(ts,cc)) return 1;
 378                                 nextch(ts);
 379                                 return 0; break;
 380                         default:
 381                                 break;
 382                 }
 383         }
 384
 385 /*
 386  * If the previous character was part of a word, the current character shall
 387  * be appended to that word.
 388  */
 389         if (ts->prev_word) {
 390                 add_to_token(ts, cc);
 391                 nextch(ts);
 392                 return 0;
 393         }
 394
 395 /*
 396  * If the current character is a '#', it and all subsequent characters up
 397  * to, but excluding, the next <newline> shall be discarded as a comment.
 398  * The <newline> that ends the line is not considered part of the comment.
 399  */
 400         if (cc == '#') {
 401                 cc = nextch(ts);
 402                 cc = peek(ts);
 403                 while (cc != '\n' && cc != EOF) {
 404                         cc = nextch(ts);
 405                         cc = peek(ts);
 406                 }
 407                 return 0;
 408         }
 409
 410 /*
 411  * The current character is used as the start of a new word.
 412  */
 413         ts->prev_word = 1;
 414         start_token(ts, TOKEN_WORD, cc);
 415         nextch(ts);
 416         return 0;
 417 }
 418
 419 static int nextchar(struct token_state *ts) {
 420         if (ts->is_string) {
 421                 if (ts->cursor >= ts->strlen) {
 422                         return EOF;
 423                 }
 424                 return (int)(unsigned char)ts->strinput[ts->cursor++];
 425         }
 426
 427         return fgetc(ts->input);
 428 }
 429