+++ /dev/null
-#include <stdio.h>
-#include "gram.h"
-
-/* START HEADER */
/*
 * One lexical token, filled in by get_token().
 */
struct token {
    int type;           /* token type; reset to 0 by get_token(), EOF at end of input */
    int rtype;          /* a "real type", e.g. In, NAME, etc. */
    int length;         /* number of characters currently stored in text */
    int delimiter;      /* character that ended the token, recorded by delimit() */
    int nondigits;      /* consulted when deciding whether a token is a TOKEN_IO_NUMBER */
    int nonnames;       /* NOTE(review): not referenced in this file; presumably flags
                         * characters that cannot appear in a NAME - confirm */
    struct token *next; /* link to another token; never touched by the tokenizer here */
    char text[2048];    /* token characters exactly as they appeared in the input */
};
-
/*
 * Scanner state shared by all tokenizer routines.
 */
struct token_state {
    /* stream input, used when is_string is 0 */
    FILE *input;
    /* token currently being assembled; set by get_token() */
    struct token *tok;

    /* in-memory input, used when is_string is non-zero */
    char *strinput;
    int is_string;
    int strlen;     /* number of bytes available in strinput */
    int cursor;     /* index of the next byte to read from strinput */

    /* one-character lookahead window */
    int pc;         /* previous char */
    int cc;         /* current char */
    int nc;         /* next char */
    int push_back;  /* true if next char is valid */

    /* per-token recognition flags, cleared at the start of every get_token() */
    int prev_op;    /* previous character was used as part of an operator */
    int prev_word;  /* previous character is part of a word */
    int quoting;    /* type of quoting */

    /* NOTE(review): buf is only ever given a leading NUL in this file and
     * buflen is never used here; presumably reserved for later use - confirm */
    char buf[2048];
    int buflen;
};
-
/* Prepare ts for reading tokens from stream f; returns ts, or NULL if ts is NULL. */
struct token_state *ts_init(struct token_state *ts, FILE *f);

/* Scan the next token from ts into tok and return its type (EOF at end of input). */
int get_token(struct token_state *ts, struct token *tok);
-
-/* END HEADER */
-
#if DEBUG_TOKEN
/* Debug aid: print the type and text of the token under construction to stderr. */
static void ptoken(struct token_state *ts) {
    const struct token *current = ts->tok;

    fprintf(stderr, "token %d = %s\n", current->type, current->text);
}
#endif
-
/* Character-level input helpers. */
static int nextchar(struct token_state *ts);    /* read one raw character from the input */
static int peek(struct token_state *ts);        /* look at the next character without consuming it */
static int nextch(struct token_state *ts);      /* consume the next character */

/* Token recognition step; returns non-zero once the current token is complete. */
static int process_char(struct token_state *ts);
-
-static int peek(struct token_state *ts) {
- if (!ts->push_back) {
- ts->nc = nextchar(ts);
- ts->push_back = 1;
- }
- return ts->nc;
-}
-
-static int nextch(struct token_state *ts) {
- if (!ts->push_back) {
- ts->nc = nextchar(ts);
- }
- ts->pc = ts->cc;
- ts->cc = ts->nc;
- ts->push_back = 0;
- return ts->cc;
-}
-
#if 0
/*
 * Disabled: step the character window back by one position and return the
 * character that becomes current again.
 *
 * NOTE(review): ts->push_back is not set here, so peek()/nextch() would not
 * notice the restored character - confirm before re-enabling.
 */
static int pushback(struct token_state *ts) {
    /* TODO error if already pushed back? */
    int restored = ts->pc;

    ts->nc = ts->cc;
    ts->cc = restored;
    return restored;
}
#endif
-
-static int add_to_token(struct token_state *ts, int ch) {
- ts->tok->text[ts->tok->length++] = (char)ch;
-#if DEBUG_TOKEN
- ptoken(ts);
-#endif
- return ts->tok->length;
-}
-
-static int start_token(struct token_state *ts, int type, int ch) {
- int i;
- for (i=0;i<sizeof ts->tok->text; i++) {
- ts->tok->text[i] = 0;
- }
- ts->tok->type = type;
- ts->tok->text[0] = (char)ch;
- ts->tok->length = 1;
-#if DEBUG_TOKEN
- fprintf(stderr, "starting ");
- ptoken(ts);
-#endif
- return 1;
-}
-
-struct token_state *ts_init(struct token_state *ts, FILE *f) {
- if (!ts) return NULL;
- /* TODO malloc one */
-
- ts->is_string = 0;
- ts->push_back = 0;
- ts->prev_op = 0;
- ts->prev_word = 0;
- ts->quoting = 0;
-
- ts->input = f;
- ts->buf[0] = 0;
- return ts;
-}
-
-int get_token(struct token_state *ts, struct token *tok) {
- int i;
- ts->tok = tok;
- tok->type = 0;
- tok->length = 0;
- ts->prev_op = 0;
- ts->prev_word = 0;
- ts->quoting = 0;
- for (i=0;i<sizeof tok->text; i++) {
- tok->text[i] = 0;
- }
-
- while (!process_char(ts)) {
- }
- return tok->type;
-}
-
-/* true if the character could be used with the given token to make
- * an operator
- */
-static int can_op(int type, int ch) {
- /* TODO */
- switch (type) {
- case TOKEN_Ampersand:
- if (ch == '&') return TOKEN_AND_IF; break;
- case TOKEN_Pipe:
- if (ch == '|') return TOKEN_OR_IF; break;
- case TOKEN_Semicolon:
- if (ch == ';') return TOKEN_DSEMI; break;
- case TOKEN_DLESS:
- if (ch == '-') return TOKEN_DLESSDASH; break;
- case TOKEN_Lessthan:
- switch (ch) {
- case '<': return TOKEN_DLESS; break;
- case '&': return TOKEN_LESSAND; break;
- case '>': return TOKEN_LESSGREAT; break;
- default: break;
- }
- break;
- case TOKEN_Greaterthan:
- switch (ch) {
- case '>': return TOKEN_DGREAT; break;
- case '&': return TOKEN_GREATAND; break;
- case '|': return TOKEN_CLOBBER; break;
- default: break;
- }
- break;
- default:
- break;
- }
- return 0;
-}
-
-static int valid_token(struct token_state *ts) {
- return ts->tok && ts->tok->length;
-}
-
-static int delimit(struct token_state *ts, int ch) {
- if (valid_token(ts)) {
- ts->tok->delimiter = ch;
- if (ts->tok->length == 1 && ts->tok->text[0] == '\n') {
- ts->tok->type = TOKEN_NEWLINE;
- } else if (!ts->tok->nondigits && (ts->tok->delimiter == '>' || ts->tok->delimiter == '<')) {
- ts->tok->type = TOKEN_IO_NUMBER;
- }
-#if DEBUG_TOKEN
- fprintf(stderr, "delimited ");
- ptoken(ts);
-#endif
- return 1;
- }
-
- return 0;
-}
-
-/* return true if delimited */
-static int process_char(struct token_state *ts) {
- int cc;
-
- cc = peek(ts);
-#if 1
- fprintf(stderr, "looking at a '%c'\n", cc);
-#endif
-
-/*
- * If the end of input is recognized, the current token shall be delimited. If
- * there is no current token, the end-of-input indicator shall be returned as
- * the token.
- */
- if (cc == EOF) {
- if (delimit(ts,cc)) {
- return 1;
- }
- fprintf(stderr, "EOF Token\n");
- ts->tok->type = EOF;
- return 1;
- }
-
-/*
- * If the previous character was used as part of an operator and the current
- * character is not quoted and can be used with the current characters to form
- * an operator, it shall be used as part of that (operator) token.
- */
- if (ts->prev_op && !ts->quoting) {
- int newop = can_op(ts->tok->type, cc);
- if (newop) {
- ts->tok->type = newop;
- add_to_token(ts,cc);
- nextch(ts);
- return 0;
- }
- }
-
-/*
- * If the previous character was used as part of an operator and the current
- * character cannot be used with the current characters to form an
- * operator, the operator containing the previous character shall be
- * delimited.
- */
- if (ts->prev_op && ! can_op(ts->tok->type, cc)) {
- if (delimit(ts,cc)) return 1;
- /* TODO error here, should be impossible */
- }
-
- /* match quote */
-/*
- * If the current character is <backslash>, single-quote, or double-quote and
- * it is not quoted, it shall affect quoting for subsequent characters up to
- * the end of the quoted text. The rules for quoting are as described in
- * Quoting . During token recognition no substitutions shall be actually
- * performed, and the result token shall contain exactly the characters that
- * appear in the input (except for <newline> joining), unmodified, including
- * any embedded or enclosing quotes or substitution operators, between the
- * <quotation-mark> and the end of the quoted text. The token shall not be
- * delimited by the end of the quoted field.
- */
- if (!ts->quoting) {
- if (cc == '\\' || cc == '\'' || cc == '"') {
- ts->quoting = cc;
- }
- }
-
- /* match expansion */
-/*
- * If the current character is an unquoted '$' or '`', the shell shall identify
- * the start of any candidates for parameter expansion (Parameter Expansion),
- * command substitution (Command Substitution), or arithmetic expansion
- * (Arithmetic Expansion) from their introductory unquoted character sequences:
- * '$' or "${", "$(" or '`', and "$((", respectively. The shell shall read
- * sufficient input to determine the end of the unit to be expanded (as
- * explained in the cited sections). While processing the characters, if
- * instances of expansions or quoting are found nested within the substitution,
- * the shell shall recursively process them in the manner specified for the
- * construct that is found. The characters found from the beginning of the
- * substitution to its end, allowing for any recursion necessary to recognize
- * embedded constructs, shall be included unmodified in the result token,
- * including any embedded or enclosing substitution operators or quotes. The
- * token shall not be delimited by the end of the substitution.
- */
-#if 0
- if (!ts->quoting) {
- if (cc == '$' || cc == '`') {
- int nc;
- nc = peek(ts);
- switch (nc) {
- case '{':
- ts->expanding = EXP_BRACKET; break;
- case '(':
- ts->expanding = EXP_COMMAND; break;
- default:
- ts->expanding = EXP_WORDVAR; break;
- }
- do_expansion(ts, cc);
- } else if (cc == '`') {
- ts->expanding = EXP_COMMAND;
- }
- }
-#endif
-
-/*
- * If the current character is not quoted and can be used as the first
- * character of a new operator, the current token (if any) shall be delimited.
- * The current character shall be used as the beginning of the next (operator)
- * token.
- */
-
-/*
- * If the current character is an unquoted <newline>, the current token shall
- * be delimited.
- */
-
-/*
- * If the current character is an unquoted <blank>, any token containing the
- * previous character is delimited and the current character shall be
- * discarded.
- */
-
-/*
- * The application shall quote the following characters if they are to
- * represent themselves:
- *
- * | & ; < > ( ) $ ` \ " ' <space> <tab> <newline>
- */
-
-
- if (! ts->quoting) {
- switch (cc) {
- case '&':
- if (delimit(ts,cc)) return 1;
- nextch(ts);
- ts->prev_op = 1;
- start_token(ts, TOKEN_Ampersand, cc);
- return 0; break;
- case '|':
- if (delimit(ts,cc)) return 1;
- nextch(ts);
- ts->prev_op = 1;
- start_token(ts, TOKEN_Pipe,cc);
- return 0; break;
- case ';':
- if (delimit(ts,cc)) return 1;
- nextch(ts);
- ts->prev_op = 1;
- start_token(ts, TOKEN_Semicolon,cc);
- return 0; break;
- case '<':
- if (delimit(ts,cc)) return 1;
- nextch(ts);
- ts->prev_op = 1;
- start_token(ts, TOKEN_Lessthan,cc);
- return 0; break;
- case '>':
- if (delimit(ts,cc)) return 1;
- nextch(ts);
- ts->prev_op = 1;
- start_token(ts, TOKEN_Greaterthan,cc);
- return 0; break;
- case '\n':
- if (delimit(ts,cc)) return 1;
- fprintf(stderr, "newline\n");
- nextch(ts);
- start_token(ts, TOKEN_NEWLINE,cc);
- if (delimit(ts,cc)) return 1;
- break;
- case '(':
- if (delimit(ts,cc)) return 1;
- nextch(ts);
- start_token(ts, TOKEN_Lparen,cc);
- if (delimit(ts,cc)) return 1;
- break;
- case ')':
- if (delimit(ts,cc)) return 1;
- nextch(ts);
- start_token(ts, TOKEN_Rparen,cc);
- if (delimit(ts,cc)) return 1;
- break;
- case ' ':
- case '\t':
- if (delimit(ts,cc)) return 1;
- nextch(ts);
- return 0; break;
- default:
- break;
- }
- }
-
-/*
- * If the previous character was part of a word, the current character shall
- * be appended to that word.
- */
- if (ts->prev_word) {
- add_to_token(ts, cc);
- nextch(ts);
- return 0;
- }
-
-/*
- * If the current character is a '#', it and all subsequent characters up
- * to, but excluding, the next <newline> shall be discarded as a comment.
- * The <newline> that ends the line is not considered part of the comment.
- */
- if (cc == '#') {
- cc = nextch(ts);
- cc = peek(ts);
- while (cc != '\n' && cc != EOF) {
- cc = nextch(ts);
- cc = peek(ts);
- }
- return 0;
- }
-
-/*
- * The current character is used as the start of a new word.
- */
- ts->prev_word = 1;
- start_token(ts, TOKEN_WORD, cc);
- nextch(ts);
- return 0;
-}
-
-static int nextchar(struct token_state *ts) {
- if (ts->is_string) {
- if (ts->cursor >= ts->strlen) {
- return EOF;
- }
- return (int)(unsigned char)ts->strinput[ts->cursor++];
- }
-
- return fgetc(ts->input);
-}
-