#include #include "gram.h" /* START HEADER */ struct token { int type; int rtype; /* a "real type", e.g. In, NAME, etc. */ int length; int delimiter; int nondigits; int nonnames; struct token *next; char text[2048]; }; struct token_state { FILE *input; struct token *tok; char *strinput; int is_string; int strlen; int cursor; int pc; /* previous char */ int cc; /* current char */ int nc; /* next char */ int push_back; /* true if next char is valid */ int prev_op; /* previous character was used as part of an operator */ int prev_word; /* previous character is part of a word */ int quoting; /* type of quoting */ char buf[2048]; int buflen; }; struct token_state *ts_init(struct token_state *ts, FILE *f); int get_token(struct token_state *ts, struct token *tok); /* END HEADER */ #if DEBUG_TOKEN static void ptoken(struct token_state *ts) { fprintf(stderr, "token %d = %s\n", ts->tok->type, ts->tok->text); } #endif static int nextch(struct token_state *ts); static int peek(struct token_state *ts); static int nextchar(struct token_state *ts); static int process_char(struct token_state *ts); static int peek(struct token_state *ts) { if (!ts->push_back) { ts->nc = nextchar(ts); ts->push_back = 1; } return ts->nc; } static int nextch(struct token_state *ts) { if (!ts->push_back) { ts->nc = nextchar(ts); } ts->pc = ts->cc; ts->cc = ts->nc; ts->push_back = 0; return ts->cc; } #if 0 static int pushback(struct token_state *ts) { /* TODO error if already pushed back? */ ts->nc = ts->cc; ts->cc = ts->pc; return ts->cc; } #endif static int add_to_token(struct token_state *ts, int ch) { ts->tok->text[ts->tok->length++] = (char)ch; #if DEBUG_TOKEN ptoken(ts); #endif return ts->tok->length; } static int start_token(struct token_state *ts, int type, int ch) { int i; for (i=0;itok->text; i++) { ts->tok->text[i] = 0; } ts->tok->type = type; ts->tok->text[0] = (char)ch; ts->tok->length = 1; #if DEBUG_TOKEN fprintf(stderr, "starting "); ptoken(ts); #endif return 1; } struct token_state *ts_init(struct token_state *ts, FILE *f) { if (!ts) return NULL; /* TODO malloc one */ ts->is_string = 0; ts->push_back = 0; ts->prev_op = 0; ts->prev_word = 0; ts->quoting = 0; ts->input = f; ts->buf[0] = 0; return ts; } int get_token(struct token_state *ts, struct token *tok) { int i; ts->tok = tok; tok->type = 0; tok->length = 0; ts->prev_op = 0; ts->prev_word = 0; ts->quoting = 0; for (i=0;itext; i++) { tok->text[i] = 0; } while (!process_char(ts)) { } return tok->type; } /* true if the character could be used with the given token to make * an operator */ static int can_op(int type, int ch) { /* TODO */ switch (type) { case TOKEN_Ampersand: if (ch == '&') return TOKEN_AND_IF; break; case TOKEN_Pipe: if (ch == '|') return TOKEN_OR_IF; break; case TOKEN_Semicolon: if (ch == ';') return TOKEN_DSEMI; break; case TOKEN_DLESS: if (ch == '-') return TOKEN_DLESSDASH; break; case TOKEN_Lessthan: switch (ch) { case '<': return TOKEN_DLESS; break; case '&': return TOKEN_LESSAND; break; case '>': return TOKEN_LESSGREAT; break; default: break; } break; case TOKEN_Greaterthan: switch (ch) { case '>': return TOKEN_DGREAT; break; case '&': return TOKEN_GREATAND; break; case '|': return TOKEN_CLOBBER; break; default: break; } break; default: break; } return 0; } static int valid_token(struct token_state *ts) { return ts->tok && ts->tok->length; } static int delimit(struct token_state *ts, int ch) { if (valid_token(ts)) { ts->tok->delimiter = ch; if (ts->tok->length == 1 && ts->tok->text[0] == '\n') { ts->tok->type = TOKEN_NEWLINE; } else if (!ts->tok->nondigits && (ts->tok->delimiter == '>' || ts->tok->delimiter == '<')) { ts->tok->type = TOKEN_IO_NUMBER; } #if DEBUG_TOKEN fprintf(stderr, "delimited "); ptoken(ts); #endif return 1; } return 0; } /* return true if delimited */ static int process_char(struct token_state *ts) { int cc; cc = peek(ts); #if 1 fprintf(stderr, "looking at a '%c'\n", cc); #endif /* * If the end of input is recognized, the current token shall be delimited. If * there is no current token, the end-of-input indicator shall be returned as * the token. */ if (cc == EOF) { if (delimit(ts,cc)) { return 1; } fprintf(stderr, "EOF Token\n"); ts->tok->type = EOF; return 1; } /* * If the previous character was used as part of an operator and the current * character is not quoted and can be used with the current characters to form * an operator, it shall be used as part of that (operator) token. */ if (ts->prev_op && !ts->quoting) { int newop = can_op(ts->tok->type, cc); if (newop) { ts->tok->type = newop; add_to_token(ts,cc); nextch(ts); return 0; } } /* * If the previous character was used as part of an operator and the current * character cannot be used with the current characters to form an * operator, the operator containing the previous character shall be * delimited. */ if (ts->prev_op && ! can_op(ts->tok->type, cc)) { if (delimit(ts,cc)) return 1; /* TODO error here, should be impossible */ } /* match quote */ /* * If the current character is , single-quote, or double-quote and * it is not quoted, it shall affect quoting for subsequent characters up to * the end of the quoted text. The rules for quoting are as described in * Quoting . During token recognition no substitutions shall be actually * performed, and the result token shall contain exactly the characters that * appear in the input (except for joining), unmodified, including * any embedded or enclosing quotes or substitution operators, between the * and the end of the quoted text. The token shall not be * delimited by the end of the quoted field. */ if (!ts->quoting) { if (cc == '\\' || cc == '\'' || cc == '"') { ts->quoting = cc; } } /* match expansion */ /* * If the current character is an unquoted '$' or '`', the shell shall identify * the start of any candidates for parameter expansion (Parameter Expansion), * command substitution (Command Substitution), or arithmetic expansion * (Arithmetic Expansion) from their introductory unquoted character sequences: * '$' or "${", "$(" or '`', and "$((", respectively. The shell shall read * sufficient input to determine the end of the unit to be expanded (as * explained in the cited sections). While processing the characters, if * instances of expansions or quoting are found nested within the substitution, * the shell shall recursively process them in the manner specified for the * construct that is found. The characters found from the beginning of the * substitution to its end, allowing for any recursion necessary to recognize * embedded constructs, shall be included unmodified in the result token, * including any embedded or enclosing substitution operators or quotes. The * token shall not be delimited by the end of the substitution. */ #if 0 if (!ts->quoting) { if (cc == '$' || cc == '`') { int nc; nc = peek(ts); switch (nc) { case '{': ts->expanding = EXP_BRACKET; break; case '(': ts->expanding = EXP_COMMAND; break; default: ts->expanding = EXP_WORDVAR; break; } do_expansion(ts, cc); } else if (cc == '`') { ts->expanding = EXP_COMMAND; } } #endif /* * If the current character is not quoted and can be used as the first * character of a new operator, the current token (if any) shall be delimited. * The current character shall be used as the beginning of the next (operator) * token. */ /* * If the current character is an unquoted , the current token shall * be delimited. */ /* * If the current character is an unquoted , any token containing the * previous character is delimited and the current character shall be * discarded. */ /* * The application shall quote the following characters if they are to * represent themselves: * * | & ; < > ( ) $ ` \ " ' */ if (! ts->quoting) { switch (cc) { case '&': if (delimit(ts,cc)) return 1; nextch(ts); ts->prev_op = 1; start_token(ts, TOKEN_Ampersand, cc); return 0; break; case '|': if (delimit(ts,cc)) return 1; nextch(ts); ts->prev_op = 1; start_token(ts, TOKEN_Pipe,cc); return 0; break; case ';': if (delimit(ts,cc)) return 1; nextch(ts); ts->prev_op = 1; start_token(ts, TOKEN_Semicolon,cc); return 0; break; case '<': if (delimit(ts,cc)) return 1; nextch(ts); ts->prev_op = 1; start_token(ts, TOKEN_Lessthan,cc); return 0; break; case '>': if (delimit(ts,cc)) return 1; nextch(ts); ts->prev_op = 1; start_token(ts, TOKEN_Greaterthan,cc); return 0; break; case '\n': if (delimit(ts,cc)) return 1; fprintf(stderr, "newline\n"); nextch(ts); start_token(ts, TOKEN_NEWLINE,cc); if (delimit(ts,cc)) return 1; break; case '(': if (delimit(ts,cc)) return 1; nextch(ts); start_token(ts, TOKEN_Lparen,cc); if (delimit(ts,cc)) return 1; break; case ')': if (delimit(ts,cc)) return 1; nextch(ts); start_token(ts, TOKEN_Rparen,cc); if (delimit(ts,cc)) return 1; break; case ' ': case '\t': if (delimit(ts,cc)) return 1; nextch(ts); return 0; break; default: break; } } /* * If the previous character was part of a word, the current character shall * be appended to that word. */ if (ts->prev_word) { add_to_token(ts, cc); nextch(ts); return 0; } /* * If the current character is a '#', it and all subsequent characters up * to, but excluding, the next shall be discarded as a comment. * The that ends the line is not considered part of the comment. */ if (cc == '#') { cc = nextch(ts); cc = peek(ts); while (cc != '\n' && cc != EOF) { cc = nextch(ts); cc = peek(ts); } return 0; } /* * The current character is used as the start of a new word. */ ts->prev_word = 1; start_token(ts, TOKEN_WORD, cc); nextch(ts); return 0; } static int nextchar(struct token_state *ts) { if (ts->is_string) { if (ts->cursor >= ts->strlen) { return EOF; } return (int)(unsigned char)ts->strinput[ts->cursor++]; } return fgetc(ts->input); }