2 * This file contains code for
4 * int rexpr(char *expr, char *s);
8 * 1 if 's' is in the language described by the regular expression 'expr'
10 * -1 if the regular expression is invalid
12 * Language membership is determined by constructing a non-deterministic
13 * finite automata (NFA) from the regular expression. A depth-
14 * first-search is performed on the NFA (graph) to check for a match of 's'.
15 * Each non-epsilon arc consumes one character from 's'. Backtracking is
16 * performed to check all possible paths through the NFA.
18 * Regular expressions follow the meta-language:
20 * <regExpr> ::= <andExpr> ( '|' <andExpr> )*
22 * <andExpr> ::= <expr> ( <expr> )*
24 * <expr> ::= {'~'} '[' <atomList> ']' <repeatSymbol>
25 * | '(' <regExpr> ')' <repeatSymbol>
26 * | '{' <regExpr> '}' <repeatSymbol>
27 * | <atom> <repeatSymbol>
29 * <repeatSymbol> ::= { '*' | '+' }
31 * <atomList> ::= <atom> ( <atom> )*
32 * | { <atomList> } <atom> '-' <atom> { <atomList> }
34 * <atom> ::= Token[Atom]
37 * ~ means complement the set in [..]. i.e. all characters not listed
38 * * means match 0 or more times (can be on expression or atom)
39 * + means match 1 or more times (can be on expression or atom)
43 * x-y all characters from x to y (found only in [..])
44 * \xx the character with value xx
48 * match 1 or more lower-case letters (e.g. variable)
51 * match a hex number with 0x on front (e.g. 0xA1FF)
53 * [0-9]+.[0-9]+{e[0-9]+}
54 * match a floating point number (e.g. 3.14e21)
57 * if ( rexpr("[a-zA-Z][a-zA-Z0-9]+", str) ) then str is keyword
74 static int regExpr( GraphPtr g );
75 static int andExpr( GraphPtr g );
76 static int expr( GraphPtr g );
77 static int repeatSymbol( GraphPtr g );
78 static int atomList( char *p, int complement );
79 static int next( void );
80 static ArcPtr newGraphArc( void );
81 static NodePtr newNode( void );
82 static int ArcBetweenGraphNode( NodePtr i, NodePtr j, int label );
83 static Graph BuildNFA_atom( int label );
84 static Graph BuildNFA_AB( Graph A, Graph B );
85 static Graph BuildNFA_AorB( Graph A, Graph B );
86 static Graph BuildNFA_set( char *s );
87 static Graph BuildNFA_Astar( Graph A );
88 static Graph BuildNFA_Aplus( Graph A );
89 static Graph BuildNFA_Aoptional( Graph A );
94 static int repeatSymbol();
95 static int atomList();
97 static ArcPtr newGraphArc();
98 static NodePtr newNode();
99 static int ArcBetweenGraphNode();
100 static Graph BuildNFA_atom();
101 static Graph BuildNFA_AB();
102 static Graph BuildNFA_AorB();
103 static Graph BuildNFA_set();
104 static Graph BuildNFA_Astar();
105 static Graph BuildNFA_Aplus();
106 static Graph BuildNFA_Aoptional();
110 static int token, tokchar;
111 static NodePtr accept;
112 static NodePtr freelist = NULL;
115 * return 1 if s in language described by expr
117 * -1 if expr is an invalid regular expression
126 fprintf(stderr, "rexpr(%s,%s);\n", expr,s);
130 if ( regExpr(&nfa) == -1 ) return -1;
132 result = match(nfa.left, s);
133 /* free all your memory */
135 while ( p!=NULL ) { q = p->track; free(p); p = q; }
140 * do a depth-first-search on the NFA looking for a path from start to
141 * accept state labelled with the characters of 's'.
149 if ( automaton == accept && *s == '\0' ) return 1; /* match */
151 for (p=automaton->arcs; p!=NULL; p=p->next) /* try all arcs */
153 if ( p->label == Epsilon )
155 if ( match(p->target, s) ) return 1;
157 else if ( p->label == *s )
158 if ( match(p->target, s+1) ) return 1;
164 * <regExpr> ::= <andExpr> ( '|' {<andExpr>} )*
166 * Return -1 if syntax error
167 * Return 0 if none found
168 * Return 1 if a regExrp was found
176 if ( andExpr(&g1) == -1 )
181 while ( token == '|' )
186 if ( a == -1 ) return -1; /* syntax error below */
187 else if ( !a ) return 1; /* empty alternative */
188 g1 = BuildNFA_AorB(g1, g2);
191 if ( token!='\0' ) return -1;
198 * <andExpr> ::= <expr> ( <expr> )*
206 if ( expr(&g1) == -1 )
211 while ( token==Atom || token=='{' || token=='(' || token=='~' || token=='[' )
213 if (expr(&g2) == -1) return -1;
214 g1 = BuildNFA_AB(g1, g2);
222 * <expr> ::= {'~'} '[' <atomList> ']' <repeatSymbol>
223 * | '(' <regExpr> ')' <repeatSymbol>
224 * | '{' <regExpr> '}' <repeatSymbol>
225 * | <atom> <repeatSymbol>
232 char s[257]; /* alloc space for string of char in [] */
234 if ( token == '~' || token == '[' )
236 if ( token == '~' ) {complement = 1; next();}
237 if ( token != '[' ) return -1;
239 if ( atomList( s, complement ) == -1 ) return -1;
240 *g = BuildNFA_set( s );
241 if ( token != ']' ) return -1;
249 if ( regExpr( g ) == -1 ) return -1;
250 if ( token != ')' ) return -1;
258 if ( regExpr( g ) == -1 ) return -1;
259 if ( token != '}' ) return -1;
261 /* S p e c i a l C a s e O p t i o n a l { } */
262 if ( token != '*' && token != '+' )
264 *g = BuildNFA_Aoptional( *g );
271 *g = BuildNFA_atom( tokchar );
281 * <repeatSymbol> ::= { '*' | '+' }
289 case '*' : *g = BuildNFA_Astar( *g ); next(); break;
290 case '+' : *g = BuildNFA_Aplus( *g ); next(); break;
296 * <atomList> ::= <atom> { <atom> }*
297 * { <atomList> } <atom> '-' <atom> { <atomList> }
303 atomList(p, complement)
307 static unsigned char set[256]; /* no duplicates */
311 if ( token != Atom ) return -1;
313 for (i=0; i<256; i++) set[i] = 0;
314 while ( token == Atom )
316 if ( !set[tokchar] ) *s++ = tokchar;
317 set[tokchar] = 1; /* Add atom to set */
319 if ( token == '-' ) /* have we found '-' */
321 first = *(s-1); /* Get last char */
323 if ( token != Atom ) return -1;
328 for (i = first+1; i <= last; i++)
330 if ( !set[tokchar] ) *s++ = i;
331 set[i] = 1; /* Add atom to set */
339 for (i=0; i<256; i++) set[i] = !set[i];
340 for (i=1,s=p; i<256; i++) if ( set[i] ) *s++ = i;
346 /* a somewhat stupid lexical analyzer */
350 while ( *_c==' ' || *_c=='\t' || *_c=='\n' ) _c++;
357 while ( isdigit(*_c) )
359 n = n*10 + (*_c++ - '0');
368 case 'n' : tokchar = '\n'; break;
369 case 't' : tokchar = '\t'; break;
370 case 'r' : tokchar = '\r'; break;
371 default : tokchar = *_c;
377 else if ( isgraph(*_c) && *_c!='[' && *_c!='(' && *_c!='{' &&
378 *_c!='-' && *_c!='}' && *_c!=')' && *_c!=']' &&
379 *_c!='+' && *_c!='*' && *_c!='~' && *_c!='|' )
386 token = tokchar = *_c++;
390 /* N F A B u i l d i n g R o u t i n e s */
397 p = (ArcPtr) calloc(1, sizeof(Arc));
398 if ( p==NULL ) {fprintf(stderr,"rexpr: out of memory\n"); exit(-1);}
399 if ( freelist != NULL ) p->track = (ArcPtr) freelist;
400 freelist = (NodePtr) p;
409 p = (NodePtr) calloc(1, sizeof(Node));
410 if ( p==NULL ) {fprintf(stderr,"rexpr: out of memory\n"); exit(-1);}
411 if ( freelist != NULL ) p->track = freelist;
417 ArcBetweenGraphNodes(i, j, label)
424 if ( i->arcs == NULL ) i->arctail = i->arcs = a;
425 else {(i->arctail)->next = a; i->arctail = a;}
438 ArcBetweenGraphNodes(g.left, g.right, label);
448 ArcBetweenGraphNodes(A.right, B.left, Epsilon);
461 ArcBetweenGraphNodes(g.left, A.left, Epsilon);
462 ArcBetweenGraphNodes(g.left, B.left, Epsilon);
464 ArcBetweenGraphNodes(A.right, g.right, Epsilon);
465 ArcBetweenGraphNodes(B.right, g.right, Epsilon);
475 if ( s == NULL ) return g;
481 ArcBetweenGraphNodes(g.left, g.right, *s++);
495 ArcBetweenGraphNodes(g.left, A.left, Epsilon);
496 ArcBetweenGraphNodes(g.left, g.right, Epsilon);
497 ArcBetweenGraphNodes(A.right, g.right, Epsilon);
498 ArcBetweenGraphNodes(A.right, A.left, Epsilon);
507 ArcBetweenGraphNodes(A.right, A.left, Epsilon);
513 BuildNFA_Aoptional( A )
521 ArcBetweenGraphNodes(g.left, A.left, Epsilon);
522 ArcBetweenGraphNodes(g.left, g.right, Epsilon);
523 ArcBetweenGraphNodes(A.right, g.right, Epsilon);