7 int rtype; /* a "real type", e.g. In, NAME, etc. */
25 int pc; /* previous char */
26 int cc; /* current char */
27 int nc; /* next char */
28 int push_back; /* true if next char is valid */
30 int prev_op; /* previous character was used as part of an operator */
31 int prev_word; /* previous character is part of a word */
32 int quoting; /* type of quoting */
38 struct token_state *ts_init(struct token_state *ts, FILE *f); /* initialise tokenizer state; returns NULL when ts is NULL */
39 int get_token(struct token_state *ts, struct token *tok); /* fill tok: loops on process_char() until it reports a delimited token */
44 static void ptoken(struct token_state *ts) {
45 fprintf(stderr, "token %d = %s\n", ts->tok->type, ts->tok->text);
49 static int nextch(struct token_state *ts); /* NOTE(review): presumably consumes one char; can refill ts->nc via nextchar() */
50 static int peek(struct token_state *ts); /* NOTE(review): presumably look-ahead without consuming; can fill ts->nc via nextchar() */
51 static int nextchar(struct token_state *ts); /* raw read: next byte of ts->strinput (as unsigned char) or fgetc(ts->input) */
52 static int process_char(struct token_state *ts); /* handle one input char; returns true once a token is delimited */
54 static int peek(struct token_state *ts) {
56 ts->nc = nextchar(ts);
62 static int nextch(struct token_state *ts) {
64 ts->nc = nextchar(ts);
73 static int pushback(struct token_state *ts) {
74 /* TODO error if already pushed back? */
81 static int add_to_token(struct token_state *ts, int ch) {
82 ts->tok->text[ts->tok->length++] = (char)ch;
86 return ts->tok->length;
89 static int start_token(struct token_state *ts, int type, int ch) {
91 for (i=0;i<sizeof ts->tok->text; i++) {
95 ts->tok->text[0] = (char)ch;
98 fprintf(stderr, "starting ");
104 struct token_state *ts_init(struct token_state *ts, FILE *f) {
105 if (!ts) return NULL;
106 /* TODO malloc one */
119 int get_token(struct token_state *ts, struct token *tok) {
127 for (i=0;i<sizeof tok->text; i++) {
131 while (!process_char(ts)) {
136 /* true if the character could be used with the given token to make
139 static int can_op(int type, int ch) {
142 case TOKEN_Ampersand:
143 if (ch == '&') return TOKEN_AND_IF; break;
145 if (ch == '|') return TOKEN_OR_IF; break;
146 case TOKEN_Semicolon:
147 if (ch == ';') return TOKEN_DSEMI; break;
149 if (ch == '-') return TOKEN_DLESSDASH; break;
152 case '<': return TOKEN_DLESS; break;
153 case '&': return TOKEN_LESSAND; break;
154 case '>': return TOKEN_LESSGREAT; break;
158 case TOKEN_Greaterthan:
160 case '>': return TOKEN_DGREAT; break;
161 case '&': return TOKEN_GREATAND; break;
162 case '|': return TOKEN_CLOBBER; break;
172 static int valid_token(struct token_state *ts) {
173 return ts->tok && ts->tok->length;
176 static int delimit(struct token_state *ts, int ch) {
177 if (valid_token(ts)) {
178 ts->tok->delimiter = ch;
179 if (ts->tok->length == 1 && ts->tok->text[0] == '\n') {
180 ts->tok->type = TOKEN_NEWLINE;
181 } else if (!ts->tok->nondigits && (ts->tok->delimiter == '>' || ts->tok->delimiter == '<')) {
182 ts->tok->type = TOKEN_IO_NUMBER;
185 fprintf(stderr, "delimited ");
194 /* return true if delimited */
195 static int process_char(struct token_state *ts) {
200 fprintf(stderr, "looking at a '%c'\n", cc);
204 * If the end of input is recognized, the current token shall be delimited. If
205 * there is no current token, the end-of-input indicator shall be returned as
209 if (delimit(ts,cc)) {
212 fprintf(stderr, "EOF Token\n");
218 * If the previous character was used as part of an operator and the current
219 * character is not quoted and can be used with the current characters to form
220 * an operator, it shall be used as part of that (operator) token.
222 if (ts->prev_op && !ts->quoting) {
223 int newop = can_op(ts->tok->type, cc);
225 ts->tok->type = newop;
233 * If the previous character was used as part of an operator and the current
234 * character cannot be used with the current characters to form an
235 * operator, the operator containing the previous character shall be
238 if (ts->prev_op && ! can_op(ts->tok->type, cc)) {
239 if (delimit(ts,cc)) return 1;
240 /* TODO error here, should be impossible */
245 * If the current character is <backslash>, single-quote, or double-quote and
246 * it is not quoted, it shall affect quoting for subsequent characters up to
247 * the end of the quoted text. The rules for quoting are as described in
248 * Quoting. During token recognition no substitutions shall be actually
249 * performed, and the result token shall contain exactly the characters that
250 * appear in the input (except for <newline> joining), unmodified, including
251 * any embedded or enclosing quotes or substitution operators, between the
252 * <quotation-mark> and the end of the quoted text. The token shall not be
253 * delimited by the end of the quoted field.
256 if (cc == '\\' || cc == '\'' || cc == '"') {
261 /* match expansion */
263 * If the current character is an unquoted '$' or '`', the shell shall identify
264 * the start of any candidates for parameter expansion (Parameter Expansion),
265 * command substitution (Command Substitution), or arithmetic expansion
266 * (Arithmetic Expansion) from their introductory unquoted character sequences:
267 * '$' or "${", "$(" or '`', and "$((", respectively. The shell shall read
268 * sufficient input to determine the end of the unit to be expanded (as
269 * explained in the cited sections). While processing the characters, if
270 * instances of expansions or quoting are found nested within the substitution,
271 * the shell shall recursively process them in the manner specified for the
272 * construct that is found. The characters found from the beginning of the
273 * substitution to its end, allowing for any recursion necessary to recognize
274 * embedded constructs, shall be included unmodified in the result token,
275 * including any embedded or enclosing substitution operators or quotes. The
276 * token shall not be delimited by the end of the substitution.
280 if (cc == '$' || cc == '`') {
285 ts->expanding = EXP_BRACKET; break;
287 ts->expanding = EXP_COMMAND; break;
289 ts->expanding = EXP_WORDVAR; break;
291 do_expansion(ts, cc);
292 } else if (cc == '`') {
293 ts->expanding = EXP_COMMAND;
299 * If the current character is not quoted and can be used as the first
300 * character of a new operator, the current token (if any) shall be delimited.
301 * The current character shall be used as the beginning of the next (operator)
306 * If the current character is an unquoted <newline>, the current token shall
311 * If the current character is an unquoted <blank>, any token containing the
312 * previous character is delimited and the current character shall be
317 * The application shall quote the following characters if they are to
318 * represent themselves:
320 * | & ; < > ( ) $ ` \ " ' <space> <tab> <newline>
327 if (delimit(ts,cc)) return 1;
330 start_token(ts, TOKEN_Ampersand, cc);
333 if (delimit(ts,cc)) return 1;
336 start_token(ts, TOKEN_Pipe,cc);
339 if (delimit(ts,cc)) return 1;
342 start_token(ts, TOKEN_Semicolon,cc);
345 if (delimit(ts,cc)) return 1;
348 start_token(ts, TOKEN_Lessthan,cc);
351 if (delimit(ts,cc)) return 1;
354 start_token(ts, TOKEN_Greaterthan,cc);
357 if (delimit(ts,cc)) return 1;
358 fprintf(stderr, "newline\n");
360 start_token(ts, TOKEN_NEWLINE,cc);
361 if (delimit(ts,cc)) return 1;
364 if (delimit(ts,cc)) return 1;
366 start_token(ts, TOKEN_Lparen,cc);
367 if (delimit(ts,cc)) return 1;
370 if (delimit(ts,cc)) return 1;
372 start_token(ts, TOKEN_Rparen,cc);
373 if (delimit(ts,cc)) return 1;
377 if (delimit(ts,cc)) return 1;
386 * If the previous character was part of a word, the current character shall
387 * be appended to that word.
390 add_to_token(ts, cc);
396 * If the current character is a '#', it and all subsequent characters up
397 * to, but excluding, the next <newline> shall be discarded as a comment.
398 * The <newline> that ends the line is not considered part of the comment.
403 while (cc != '\n' && cc != EOF) {
411 * The current character is used as the start of a new word.
414 start_token(ts, TOKEN_WORD, cc);
419 static int nextchar(struct token_state *ts) {
421 if (ts->cursor >= ts->strlen) {
424 return (int)(unsigned char)ts->strinput[ts->cursor++];
427 return fgetc(ts->input);