X-Git-Url: https://pd.if.org/git/?p=pdutils;a=blobdiff_plain;f=utils%2Fsh%2Ftok.c;fp=utils%2Fsh%2Ftok.c;h=a92634352a1d2b426376049532686f1fd2623573;hp=0000000000000000000000000000000000000000;hb=dce1707cec7cd4268fe73371c7488052e5058181;hpb=5cd80baaa2b300ac0e2bded7bf9dfc7b74022ab8 diff --git a/utils/sh/tok.c b/utils/sh/tok.c new file mode 100644 index 0000000..a926343 --- /dev/null +++ b/utils/sh/tok.c @@ -0,0 +1,429 @@ +#include +#include "gram.h" + +/* START HEADER */ +struct token { + int type; + int rtype; /* a "real type", e.g. In, NAME, etc. */ + int length; + int delimiter; + int nondigits; + int nonnames; + struct token *next; + char text[2048]; +}; + +struct token_state { + FILE *input; + struct token *tok; + + char *strinput; + int is_string; + int strlen; + int cursor; + + int pc; /* previous char */ + int cc; /* current char */ + int nc; /* next char */ + int push_back; /* true if next char is valid */ + + int prev_op; /* previous character was used as part of an operator */ + int prev_word; /* previous character is part of a word */ + int quoting; /* type of quoting */ + + char buf[2048]; + int buflen; +}; + +struct token_state *ts_init(struct token_state *ts, FILE *f); +int get_token(struct token_state *ts, struct token *tok); + +/* END HEADER */ + +#if DEBUG_TOKEN +static void ptoken(struct token_state *ts) { + fprintf(stderr, "token %d = %s\n", ts->tok->type, ts->tok->text); +} +#endif + +static int nextch(struct token_state *ts); +static int peek(struct token_state *ts); +static int nextchar(struct token_state *ts); +static int process_char(struct token_state *ts); + +static int peek(struct token_state *ts) { + if (!ts->push_back) { + ts->nc = nextchar(ts); + ts->push_back = 1; + } + return ts->nc; +} + +static int nextch(struct token_state *ts) { + if (!ts->push_back) { + ts->nc = nextchar(ts); + } + ts->pc = ts->cc; + ts->cc = ts->nc; + ts->push_back = 0; + return ts->cc; +} + +#if 0 +static int pushback(struct token_state *ts) { + /* TODO error if already pushed back? */ + ts->nc = ts->cc; + ts->cc = ts->pc; + return ts->cc; +} +#endif + +static int add_to_token(struct token_state *ts, int ch) { + ts->tok->text[ts->tok->length++] = (char)ch; +#if DEBUG_TOKEN + ptoken(ts); +#endif + return ts->tok->length; +} + +static int start_token(struct token_state *ts, int type, int ch) { + int i; + for (i=0;itok->text; i++) { + ts->tok->text[i] = 0; + } + ts->tok->type = type; + ts->tok->text[0] = (char)ch; + ts->tok->length = 1; +#if DEBUG_TOKEN + fprintf(stderr, "starting "); + ptoken(ts); +#endif + return 1; +} + +struct token_state *ts_init(struct token_state *ts, FILE *f) { + if (!ts) return NULL; + /* TODO malloc one */ + + ts->is_string = 0; + ts->push_back = 0; + ts->prev_op = 0; + ts->prev_word = 0; + ts->quoting = 0; + + ts->input = f; + ts->buf[0] = 0; + return ts; +} + +int get_token(struct token_state *ts, struct token *tok) { + int i; + ts->tok = tok; + tok->type = 0; + tok->length = 0; + ts->prev_op = 0; + ts->prev_word = 0; + ts->quoting = 0; + for (i=0;itext; i++) { + tok->text[i] = 0; + } + + while (!process_char(ts)) { + } + return tok->type; +} + +/* true if the character could be used with the given token to make + * an operator + */ +static int can_op(int type, int ch) { + /* TODO */ + switch (type) { + case TOKEN_Ampersand: + if (ch == '&') return TOKEN_AND_IF; break; + case TOKEN_Pipe: + if (ch == '|') return TOKEN_OR_IF; break; + case TOKEN_Semicolon: + if (ch == ';') return TOKEN_DSEMI; break; + case TOKEN_DLESS: + if (ch == '-') return TOKEN_DLESSDASH; break; + case TOKEN_Lessthan: + switch (ch) { + case '<': return TOKEN_DLESS; break; + case '&': return TOKEN_LESSAND; break; + case '>': return TOKEN_LESSGREAT; break; + default: break; + } + break; + case TOKEN_Greaterthan: + switch (ch) { + case '>': return TOKEN_DGREAT; break; + case '&': return TOKEN_GREATAND; break; + case '|': return TOKEN_CLOBBER; break; + default: break; + } + break; + default: + break; + } + return 0; +} + +static int valid_token(struct token_state *ts) { + return ts->tok && ts->tok->length; +} + +static int delimit(struct token_state *ts, int ch) { + if (valid_token(ts)) { + ts->tok->delimiter = ch; + if (ts->tok->length == 1 && ts->tok->text[0] == '\n') { + ts->tok->type = TOKEN_NEWLINE; + } else if (!ts->tok->nondigits && (ts->tok->delimiter == '>' || ts->tok->delimiter == '<')) { + ts->tok->type = TOKEN_IO_NUMBER; + } +#if DEBUG_TOKEN + fprintf(stderr, "delimited "); + ptoken(ts); +#endif + return 1; + } + + return 0; +} + +/* return true if delimited */ +static int process_char(struct token_state *ts) { + int cc; + + cc = peek(ts); +#if 1 + fprintf(stderr, "looking at a '%c'\n", cc); +#endif + +/* + * If the end of input is recognized, the current token shall be delimited. If + * there is no current token, the end-of-input indicator shall be returned as + * the token. + */ + if (cc == EOF) { + if (delimit(ts,cc)) { + return 1; + } + fprintf(stderr, "EOF Token\n"); + ts->tok->type = EOF; + return 1; + } + +/* + * If the previous character was used as part of an operator and the current + * character is not quoted and can be used with the current characters to form + * an operator, it shall be used as part of that (operator) token. + */ + if (ts->prev_op && !ts->quoting) { + int newop = can_op(ts->tok->type, cc); + if (newop) { + ts->tok->type = newop; + add_to_token(ts,cc); + nextch(ts); + return 0; + } + } + +/* + * If the previous character was used as part of an operator and the current + * character cannot be used with the current characters to form an + * operator, the operator containing the previous character shall be + * delimited. + */ + if (ts->prev_op && ! can_op(ts->tok->type, cc)) { + if (delimit(ts,cc)) return 1; + /* TODO error here, should be impossible */ + } + + /* match quote */ +/* + * If the current character is , single-quote, or double-quote and + * it is not quoted, it shall affect quoting for subsequent characters up to + * the end of the quoted text. The rules for quoting are as described in + * Quoting . During token recognition no substitutions shall be actually + * performed, and the result token shall contain exactly the characters that + * appear in the input (except for joining), unmodified, including + * any embedded or enclosing quotes or substitution operators, between the + * and the end of the quoted text. The token shall not be + * delimited by the end of the quoted field. + */ + if (!ts->quoting) { + if (cc == '\\' || cc == '\'' || cc == '"') { + ts->quoting = cc; + } + } + + /* match expansion */ +/* + * If the current character is an unquoted '$' or '`', the shell shall identify + * the start of any candidates for parameter expansion (Parameter Expansion), + * command substitution (Command Substitution), or arithmetic expansion + * (Arithmetic Expansion) from their introductory unquoted character sequences: + * '$' or "${", "$(" or '`', and "$((", respectively. The shell shall read + * sufficient input to determine the end of the unit to be expanded (as + * explained in the cited sections). While processing the characters, if + * instances of expansions or quoting are found nested within the substitution, + * the shell shall recursively process them in the manner specified for the + * construct that is found. The characters found from the beginning of the + * substitution to its end, allowing for any recursion necessary to recognize + * embedded constructs, shall be included unmodified in the result token, + * including any embedded or enclosing substitution operators or quotes. The + * token shall not be delimited by the end of the substitution. + */ +#if 0 + if (!ts->quoting) { + if (cc == '$' || cc == '`') { + int nc; + nc = peek(ts); + switch (nc) { + case '{': + ts->expanding = EXP_BRACKET; break; + case '(': + ts->expanding = EXP_COMMAND; break; + default: + ts->expanding = EXP_WORDVAR; break; + } + do_expansion(ts, cc); + } else if (cc == '`') { + ts->expanding = EXP_COMMAND; + } + } +#endif + +/* + * If the current character is not quoted and can be used as the first + * character of a new operator, the current token (if any) shall be delimited. + * The current character shall be used as the beginning of the next (operator) + * token. + */ + +/* + * If the current character is an unquoted , the current token shall + * be delimited. + */ + +/* + * If the current character is an unquoted , any token containing the + * previous character is delimited and the current character shall be + * discarded. + */ + +/* + * The application shall quote the following characters if they are to + * represent themselves: + * + * | & ; < > ( ) $ ` \ " ' + */ + + + if (! ts->quoting) { + switch (cc) { + case '&': + if (delimit(ts,cc)) return 1; + nextch(ts); + ts->prev_op = 1; + start_token(ts, TOKEN_Ampersand, cc); + return 0; break; + case '|': + if (delimit(ts,cc)) return 1; + nextch(ts); + ts->prev_op = 1; + start_token(ts, TOKEN_Pipe,cc); + return 0; break; + case ';': + if (delimit(ts,cc)) return 1; + nextch(ts); + ts->prev_op = 1; + start_token(ts, TOKEN_Semicolon,cc); + return 0; break; + case '<': + if (delimit(ts,cc)) return 1; + nextch(ts); + ts->prev_op = 1; + start_token(ts, TOKEN_Lessthan,cc); + return 0; break; + case '>': + if (delimit(ts,cc)) return 1; + nextch(ts); + ts->prev_op = 1; + start_token(ts, TOKEN_Greaterthan,cc); + return 0; break; + case '\n': + if (delimit(ts,cc)) return 1; + fprintf(stderr, "newline\n"); + nextch(ts); + start_token(ts, TOKEN_NEWLINE,cc); + if (delimit(ts,cc)) return 1; + break; + case '(': + if (delimit(ts,cc)) return 1; + nextch(ts); + start_token(ts, TOKEN_Lparen,cc); + if (delimit(ts,cc)) return 1; + break; + case ')': + if (delimit(ts,cc)) return 1; + nextch(ts); + start_token(ts, TOKEN_Rparen,cc); + if (delimit(ts,cc)) return 1; + break; + case ' ': + case '\t': + if (delimit(ts,cc)) return 1; + nextch(ts); + return 0; break; + default: + break; + } + } + +/* + * If the previous character was part of a word, the current character shall + * be appended to that word. + */ + if (ts->prev_word) { + add_to_token(ts, cc); + nextch(ts); + return 0; + } + +/* + * If the current character is a '#', it and all subsequent characters up + * to, but excluding, the next shall be discarded as a comment. + * The that ends the line is not considered part of the comment. + */ + if (cc == '#') { + cc = nextch(ts); + cc = peek(ts); + while (cc != '\n' && cc != EOF) { + cc = nextch(ts); + cc = peek(ts); + } + return 0; + } + +/* + * The current character is used as the start of a new word. + */ + ts->prev_word = 1; + start_token(ts, TOKEN_WORD, cc); + nextch(ts); + return 0; +} + +static int nextchar(struct token_state *ts) { + if (ts->is_string) { + if (ts->cursor >= ts->strlen) { + return EOF; + } + return (int)(unsigned char)ts->strinput[ts->cursor++]; + } + + return fgetc(ts->input); +} +