X-Git-Url: https://pd.if.org/git/?p=pdutils;a=blobdiff_plain;f=utils%2Fsh%2Ftok.c;fp=utils%2Fsh%2Ftok.c;h=a92634352a1d2b426376049532686f1fd2623573;hp=0000000000000000000000000000000000000000;hb=dce1707cec7cd4268fe73371c7488052e5058181;hpb=5cd80baaa2b300ac0e2bded7bf9dfc7b74022ab8

diff --git a/utils/sh/tok.c b/utils/sh/tok.c
new file mode 100644
index 0000000..a926343
--- /dev/null
+++ b/utils/sh/tok.c
@@ -0,0 +1,429 @@
+#include <stdio.h>
+#include "gram.h"
+
+/* START HEADER */
+/*
+ * One token as filled in by get_token().
+ */
+struct token {
+	int type; /* token type: a TOKEN_* value, or EOF at end of input */
+	int rtype; /* a "real type", e.g. In, NAME, etc. */
+	int length; /* number of characters currently stored in text */
+	int delimiter; /* the character that delimited the token; set by delimit() */
+	int nondigits; /* consulted by delimit(): zero lets a token delimited by '<' or '>' become an IO_NUMBER */
+	int nonnames; /* NOTE(review): presumably "text has a character not valid in a name"; no code in this file acts on it */
+	struct token *next; /* NOTE(review): presumably a list link; not used by the tokenizer itself */
+	char text[2048]; /* the token's characters; zero-filled when a token is started */
+};
+
+/*
+ * Tokenizer state: where characters come from, a one-character look-ahead
+ * window, and the flags process_char() carries from one character to the
+ * next while building a token.
+ */
+struct token_state {
+	FILE *input; /* stream read with fgetc() when is_string is zero */
+	struct token *tok; /* token currently being built; set by get_token() */
+
+	/* alternative input source: an in-memory string, used when is_string is non-zero */
+	char *strinput;
+	int is_string;
+	int strlen; /* number of characters available in strinput */
+	int cursor; /* index of the next character to read from strinput */
+
+	int pc; /* previous char */
+	int cc; /* current char */
+	int nc; /* next char */
+	int push_back; /* true if next char is valid */
+
+	int prev_op; /* previous character was used as part of an operator */
+	int prev_word; /* previous character is part of a word */
+	int quoting; /* type of quoting: 0 when unquoted, else the quote character */
+
+	char buf[2048]; /* NOTE(review): only buf[0] is cleared (in ts_init); otherwise unused in this file */
+	int buflen;
+};
+
+/* Prepare ts for reading tokens from f. Returns ts, or NULL when ts is NULL. */
+struct token_state *ts_init(struct token_state *ts, FILE *f);
+/* Read the next token into tok. Returns tok->type, which is EOF at end of input. */
+int get_token(struct token_state *ts, struct token *tok);
+
+/* END HEADER */
+
+#if DEBUG_TOKEN
+/* Debug aid: write the type and text of the token under construction to stderr. */
+static void ptoken(struct token_state *ts) {
+	const struct token *t = ts->tok;
+
+	fprintf(stderr, "token %d = %s\n", t->type, t->text);
+}
+#endif
+
+/* Forward declarations of the character-level helpers defined in this file. */
+/* nextch: consume one character and return it */
+static int nextch(struct token_state *ts);
+/* peek: return the upcoming character without consuming it */
+static int peek(struct token_state *ts);
+/* nextchar: fetch one raw character from the string or stream input */
+static int nextchar(struct token_state *ts); 
+/* process_char: apply one token-recognition step; true once a token is delimited */
+static int process_char(struct token_state *ts);
+
+/*
+ * Return the upcoming input character without consuming it.
+ *
+ * The character is fetched at most once: it is parked in ts->nc and
+ * ts->push_back marks it as valid until nextch() consumes it.
+ */
+static int peek(struct token_state *ts) {
+	if (ts->push_back) {
+		/* look-ahead already loaded */
+		return ts->nc;
+	}
+
+	ts->nc = nextchar(ts);
+	ts->push_back = 1;
+	return ts->nc;
+}
+
+/*
+ * Consume one input character and return it.
+ *
+ * Uses the character parked by peek() when there is one, otherwise reads a
+ * fresh one. The old current character moves to ts->pc and the look-ahead
+ * is marked empty.
+ */
+static int nextch(struct token_state *ts) {
+	int ch = ts->push_back ? ts->nc : nextchar(ts);
+
+	ts->push_back = 0;
+	ts->pc = ts->cc;
+	ts->nc = ch;
+	ts->cc = ch;
+	return ch;
+}
+
+#if 0
+/*
+ * Disabled (compiled out by "#if 0"): step the pc/cc/nc window back by one
+ * character and return the restored current character.
+ *
+ * NOTE(review): this does not set ts->push_back, so peek() and nextch()
+ * would still call nextchar() and overwrite the restored ts->nc. That has
+ * to be fixed before this is enabled.
+ */
+static int pushback(struct token_state *ts) {
+	/* TODO error if already pushed back? */
+	ts->nc = ts->cc;
+	ts->cc = ts->pc;
+	return ts->cc;
+}
+#endif
+
+/*
+ * Append ch to the token being built and return the resulting token length.
+ *
+ * The last byte of the text buffer is never written, so the text stays
+ * NUL-terminated as long as the buffer was zero-filled when the token was
+ * started. Once the buffer is full, further characters are dropped (the
+ * token is truncated) rather than written past the end of the array.
+ */
+static int add_to_token(struct token_state *ts, int ch) {
+	struct token *tok = ts->tok;
+
+	/* refuse to write outside text[], keeping one byte for the terminator */
+	if (tok->length < 0 || (size_t)tok->length >= sizeof tok->text - 1) {
+		return tok->length;
+	}
+
+	tok->text[tok->length++] = (char)ch;
+#if DEBUG_TOKEN
+	ptoken(ts);
+#endif
+	return tok->length;
+}
+
+/*
+ * Begin a new token of the given type whose first character is ch.
+ *
+ * The whole text buffer is zeroed first, so the text is NUL-terminated
+ * from the start. Always returns 1.
+ */
+static int start_token(struct token_state *ts, int type, int ch) {
+	struct token *t = ts->tok;
+	size_t remaining = sizeof t->text;
+
+	/* wipe the buffer back to front */
+	while (remaining > 0) {
+		t->text[--remaining] = 0;
+	}
+
+	t->text[0] = (char)ch;
+	t->length = 1;
+	t->type = type;
+#if DEBUG_TOKEN
+	fprintf(stderr, "starting ");
+	ptoken(ts);
+#endif
+	return 1;
+}
+
+/*
+ * Put a caller-supplied tokenizer state into a known initial condition,
+ * reading from the stream f.
+ *
+ * Every field is given a defined value, so no later code (for example
+ * nextch(), which copies ts->cc into ts->pc) reads an indeterminate one.
+ * String input is switched off here; a caller wanting to tokenize a string
+ * must set strinput, strlen, cursor and is_string after this call.
+ *
+ * Returns ts, or NULL when ts is NULL.
+ */
+struct token_state *ts_init(struct token_state *ts, FILE *f) {
+	if (!ts) return NULL;
+	/* TODO malloc one */
+
+	ts->input = f;
+	ts->tok = NULL;
+
+	ts->strinput = NULL;
+	ts->is_string = 0;
+	ts->strlen = 0;
+	ts->cursor = 0;
+
+	/* no character has been read yet and the look-ahead is empty */
+	ts->pc = 0;
+	ts->cc = 0;
+	ts->nc = 0;
+	ts->push_back = 0;
+
+	ts->prev_op = 0;
+	ts->prev_word = 0;
+	ts->quoting = 0;
+
+	ts->buf[0] = 0;
+	ts->buflen = 0;
+	return ts;
+}
+
+/*
+ * Read the next token from ts into tok.
+ *
+ * tok becomes the token under construction. Its per-token fields (type,
+ * length, delimiter, nondigits, nonnames, text) and the per-token flags in
+ * ts are reset, so nothing is left over from an earlier token or from an
+ * uninitialized struct. Characters are then processed one at a time until
+ * process_char() reports that a token is complete.
+ *
+ * Returns tok->type, which is EOF when the input is exhausted.
+ */
+int get_token(struct token_state *ts, struct token *tok) {
+	size_t i;
+
+	ts->tok = tok;
+	ts->prev_op = 0;
+	ts->prev_word = 0;
+	ts->quoting = 0;
+
+	tok->type = 0;
+	tok->length = 0;
+	tok->delimiter = 0;
+	tok->nondigits = 0;
+	tok->nonnames = 0;
+	for (i = 0; i < sizeof tok->text; i++) {
+		tok->text[i] = 0;
+	}
+
+	while (!process_char(ts)) {
+		/* keep feeding characters until the token is delimited */
+	}
+	return tok->type;
+}
+
+/*
+ * Decide whether ch can extend the operator token of the given type.
+ *
+ * Returns the type of the longer operator that results (for example
+ * TOKEN_Ampersand followed by '&' gives TOKEN_AND_IF), or 0 when ch
+ * cannot be combined with that token.
+ */
+static int can_op(int type, int ch) {
+	/* TODO */
+	if (type == TOKEN_Ampersand) {
+		return ch == '&' ? TOKEN_AND_IF : 0;
+	}
+	if (type == TOKEN_Pipe) {
+		return ch == '|' ? TOKEN_OR_IF : 0;
+	}
+	if (type == TOKEN_Semicolon) {
+		return ch == ';' ? TOKEN_DSEMI : 0;
+	}
+	if (type == TOKEN_DLESS) {
+		return ch == '-' ? TOKEN_DLESSDASH : 0;
+	}
+	if (type == TOKEN_Lessthan) {
+		if (ch == '<') return TOKEN_DLESS;
+		if (ch == '&') return TOKEN_LESSAND;
+		if (ch == '>') return TOKEN_LESSGREAT;
+		return 0;
+	}
+	if (type == TOKEN_Greaterthan) {
+		if (ch == '>') return TOKEN_DGREAT;
+		if (ch == '&') return TOKEN_GREATAND;
+		if (ch == '|') return TOKEN_CLOBBER;
+		return 0;
+	}
+	return 0;
+}
+
+/* True (1) when a token is attached to ts and it holds at least one character. */
+static int valid_token(struct token_state *ts) {
+	if (!ts->tok) {
+		return 0;
+	}
+	return ts->tok->length != 0;
+}
+
+/* True when the token text is non-empty and consists only of decimal digits. */
+static int token_all_digits(const struct token *tok) {
+	int i;
+
+	for (i = 0; i < tok->length; i++) {
+		if (tok->text[i] < '0' || tok->text[i] > '9') {
+			return 0;
+		}
+	}
+	return tok->length > 0;
+}
+
+/*
+ * Finish the current token, if there is one, because ch delimits it.
+ *
+ * Records ch as the token's delimiter and settles the final type: a lone
+ * newline becomes TOKEN_NEWLINE, and a token made only of digits that is
+ * delimited by '<' or '>' becomes TOKEN_IO_NUMBER. The digits-only test is
+ * computed from the token text and stored in tok->nondigits, because no
+ * other code in this file maintains that field.
+ *
+ * Returns 1 when a token was delimited, 0 when there was no token.
+ */
+static int delimit(struct token_state *ts, int ch) {
+	if (valid_token(ts)) {
+		ts->tok->delimiter = ch;
+		ts->tok->nondigits = !token_all_digits(ts->tok);
+		if (ts->tok->length == 1 && ts->tok->text[0] == '\n') {
+			ts->tok->type = TOKEN_NEWLINE;
+		} else if (!ts->tok->nondigits && (ch == '>' || ch == '<')) {
+			ts->tok->type = TOKEN_IO_NUMBER;
+		}
+#if DEBUG_TOKEN
+		fprintf(stderr, "delimited ");
+		ptoken(ts);
+#endif
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Look at the next input character and apply the first matching token
+ * recognition rule to the token under construction (ts->tok). The rules are
+ * quoted from the POSIX shell specification in the comments below.
+ *
+ * The character is peeked first and only consumed (nextch) by the rule that
+ * uses it, so a character that delimits one token is seen again on the next
+ * call and can start the following token.
+ *
+ * Tracing output goes to stderr only when DEBUG_TOKEN is set.
+ *
+ * Returns true if the current token was delimited (or end of input was
+ * reached), false if more characters are needed.
+ */
+static int process_char(struct token_state *ts) {
+	int cc;
+
+	cc = peek(ts);
+
+/*
+ * If the end of input is recognized, the current token shall be delimited. If
+ * there is no current token, the end-of-input indicator shall be returned as
+ * the token.
+ */
+	if (cc == EOF) {
+		if (delimit(ts,cc)) {
+			return 1;
+		}
+#if DEBUG_TOKEN
+		fprintf(stderr, "EOF Token\n");
+#endif
+		ts->tok->type = EOF;
+		return 1;
+	}
+
+#if DEBUG_TOKEN
+	fprintf(stderr, "looking at a '%c'\n", cc);
+#endif
+
+/*
+ * If the previous character was used as part of an operator and the current
+ * character is not quoted and can be used with the current characters to form
+ * an operator, it shall be used as part of that (operator) token.
+ */
+	if (ts->prev_op && !ts->quoting) {
+		int newop = can_op(ts->tok->type, cc);
+		if (newop) {
+			ts->tok->type = newop;
+			add_to_token(ts,cc);
+			nextch(ts);
+			return 0;
+		}
+	}
+
+/*
+ * If the previous character was used as part of an operator and the current
+ * character cannot be used with the current characters to form an
+ * operator, the operator containing the previous character shall be
+ * delimited.
+ */
+	if (ts->prev_op && ! can_op(ts->tok->type, cc)) {
+		if (delimit(ts,cc)) return 1;
+		/* TODO error here, should be impossible */
+	}
+
+	/* match quote */
+/*
+ * If the current character is <backslash>, single-quote, or double-quote and
+ * it is not quoted, it shall affect quoting for subsequent characters up to
+ * the end of the quoted text. The rules for quoting are as described in
+ * Quoting . During token recognition no substitutions shall be actually
+ * performed, and the result token shall contain exactly the characters that
+ * appear in the input (except for <newline> joining), unmodified, including
+ * any embedded or enclosing quotes or substitution operators, between the
+ * <quotation-mark> and the end of the quoted text. The token shall not be
+ * delimited by the end of the quoted field.
+ */
+	if (ts->quoting) {
+		/*
+		 * Inside quoted text. The opening quote character has already
+		 * started or extended a word, so there is a token to append to.
+		 * The character is kept verbatim and the quoting ends when:
+		 *   - it was a backslash, which quotes exactly one character;
+		 *   - the matching ' or " is seen.
+		 * Inside double quotes a backslash also protects the character
+		 * after it, so that \" does not close the string.
+		 * Not handled here: <backslash><newline> joining, and
+		 * substitutions nested inside double quotes.
+		 */
+		int quote = ts->quoting;
+
+		add_to_token(ts, cc);
+		nextch(ts);
+		if (quote == '\\' || cc == quote) {
+			ts->quoting = 0;
+		} else if (quote == '"' && cc == '\\') {
+			int escaped = peek(ts);
+			if (escaped != EOF) {
+				add_to_token(ts, escaped);
+				nextch(ts);
+			}
+		}
+		return 0;
+	}
+
+	if (cc == '\\' || cc == '\'' || cc == '"') {
+		/*
+		 * Start of quoted text. The quote character itself is added to
+		 * the word by the word rules further down.
+		 */
+		ts->quoting = cc;
+	}
+
+	/* match expansion */
+/*
+ * If the current character is an unquoted '$' or '`', the shell shall identify
+ * the start of any candidates for parameter expansion (Parameter Expansion),
+ * command substitution (Command Substitution), or arithmetic expansion
+ * (Arithmetic Expansion) from their introductory unquoted character sequences:
+ * '$' or "${", "$(" or '`', and "$((", respectively. The shell shall read
+ * sufficient input to determine the end of the unit to be expanded (as
+ * explained in the cited sections). While processing the characters, if
+ * instances of expansions or quoting are found nested within the substitution,
+ * the shell shall recursively process them in the manner specified for the
+ * construct that is found. The characters found from the beginning of the
+ * substitution to its end, allowing for any recursion necessary to recognize
+ * embedded constructs, shall be included unmodified in the result token,
+ * including any embedded or enclosing substitution operators or quotes.  The
+ * token shall not be delimited by the end of the substitution.
+ */
+#if 0
+	if (!ts->quoting) {
+		if (cc == '$' || cc == '`') {
+			int nc;
+			nc = peek(ts);
+			switch (nc) {
+				case '{':
+					ts->expanding = EXP_BRACKET; break;
+				case '(':
+					ts->expanding = EXP_COMMAND; break;
+				default:
+					ts->expanding = EXP_WORDVAR; break;
+			}
+			do_expansion(ts, cc);
+		} else if (cc == '`') {
+			ts->expanding = EXP_COMMAND;
+		}
+	}
+#endif
+
+/*
+ * If the current character is not quoted and can be used as the first
+ * character of a new operator, the current token (if any) shall be delimited.
+ * The current character shall be used as the beginning of the next (operator)
+ * token.
+ */
+
+/*
+ * If the current character is an unquoted <newline>, the current token shall
+ * be delimited.
+ */
+
+/*
+ * If the current character is an unquoted <blank>, any token containing the
+ * previous character is delimited and the current character shall be
+ * discarded.
+ */
+
+/* 
+ * The application shall quote the following characters if they are to
+ * represent themselves:
+ *
+ * |  &  ;  <  >  (  )  $  `  \  "  '  <space>  <tab>  <newline>
+ */
+
+
+	if (! ts->quoting) {
+		switch (cc) {
+			/*
+			 * Characters that may begin a multi-character operator:
+			 * finish any pending token first, otherwise start the
+			 * operator and let the next character extend or delimit it.
+			 */
+			case '&':
+				if (delimit(ts,cc)) return 1;
+				nextch(ts);
+				ts->prev_op = 1;
+				start_token(ts, TOKEN_Ampersand, cc);
+				return 0;
+			case '|':
+				if (delimit(ts,cc)) return 1;
+				nextch(ts);
+				ts->prev_op = 1;
+				start_token(ts, TOKEN_Pipe,cc);
+				return 0;
+			case ';':
+				if (delimit(ts,cc)) return 1;
+				nextch(ts);
+				ts->prev_op = 1;
+				start_token(ts, TOKEN_Semicolon,cc);
+				return 0;
+			case '<':
+				if (delimit(ts,cc)) return 1;
+				nextch(ts);
+				ts->prev_op = 1;
+				start_token(ts, TOKEN_Lessthan,cc);
+				return 0;
+			case '>':
+				if (delimit(ts,cc)) return 1;
+				nextch(ts);
+				ts->prev_op = 1;
+				start_token(ts, TOKEN_Greaterthan,cc);
+				return 0;
+			/*
+			 * Single-character tokens: finish any pending token
+			 * first, otherwise emit the character as a complete
+			 * token straight away.
+			 */
+			case '\n':
+				if (delimit(ts,cc)) return 1;
+#if DEBUG_TOKEN
+				fprintf(stderr, "newline\n");
+#endif
+				nextch(ts);
+				start_token(ts, TOKEN_NEWLINE,cc);
+				if (delimit(ts,cc)) return 1;
+				break;
+			case '(':
+				if (delimit(ts,cc)) return 1;
+				nextch(ts);
+				start_token(ts, TOKEN_Lparen,cc);
+				if (delimit(ts,cc)) return 1;
+				break;
+			case ')':
+				if (delimit(ts,cc)) return 1;
+				nextch(ts);
+				start_token(ts, TOKEN_Rparen,cc);
+				if (delimit(ts,cc)) return 1;
+				break;
+			/* blanks end the pending token and are otherwise dropped */
+			case ' ':
+			case '\t':
+				if (delimit(ts,cc)) return 1;
+				nextch(ts);
+				return 0;
+			default:
+				break;
+		}
+	}
+
+/*
+ * If the previous character was part of a word, the current character shall
+ * be appended to that word.
+ */
+	if (ts->prev_word) {
+		add_to_token(ts, cc);
+		nextch(ts);
+		return 0;
+	}
+
+/*
+ * If the current character is a '#', it and all subsequent characters up
+ * to, but excluding, the next <newline> shall be discarded as a comment.
+ * The <newline> that ends the line is not considered part of the comment.
+ */
+	if (cc == '#') {
+		cc = nextch(ts);
+		cc = peek(ts);
+		while (cc != '\n' && cc != EOF) {
+			cc = nextch(ts);
+			cc = peek(ts);
+		}
+		return 0;
+	}
+
+/*
+ * The current character is used as the start of a new word.
+ */
+	ts->prev_word = 1;
+	start_token(ts, TOKEN_WORD, cc);
+	nextch(ts);
+	return 0;
+}
+
+/*
+ * Fetch one raw character from the active input source.
+ *
+ * In string mode the next byte of ts->strinput is returned as an unsigned
+ * char value and the cursor advances; EOF is returned once the cursor
+ * reaches ts->strlen. Otherwise the character comes from fgetc() on
+ * ts->input.
+ */
+static int nextchar(struct token_state *ts) {
+	if (!ts->is_string) {
+		return fgetc(ts->input);
+	}
+
+	if (ts->cursor < ts->strlen) {
+		unsigned char byte = (unsigned char)ts->strinput[ts->cursor];
+
+		ts->cursor++;
+		return (int)byte;
+	}
+	return EOF;
+}
+