X-Git-Url: https://pd.if.org/git/?p=pccts;a=blobdiff_plain;f=support%2Frexpr%2Frexpr.c;fp=support%2Frexpr%2Frexpr.c;h=4b7f95e33351ba2955f4b0c2b3816b4320022ccd;hp=0000000000000000000000000000000000000000;hb=56dd00148e59773742903ee71be791eaa49a8616;hpb=cb15b978c765a661bf3154d865fa3e2401d649f5 diff --git a/support/rexpr/rexpr.c b/support/rexpr/rexpr.c new file mode 100755 index 0000000..4b7f95e --- /dev/null +++ b/support/rexpr/rexpr.c @@ -0,0 +1,526 @@ +/* + * This file contains code for + * + * int rexpr(char *expr, char *s); + * + * which answers + * + * 1 if 's' is in the language described by the regular expression 'expr' + * 0 if it is not + * -1 if the regular expression is invalid + * + * Language membership is determined by constructing a non-deterministic + * finite automata (NFA) from the regular expression. A depth- + * first-search is performed on the NFA (graph) to check for a match of 's'. + * Each non-epsilon arc consumes one character from 's'. Backtracking is + * performed to check all possible paths through the NFA. + * + * Regular expressions follow the meta-language: + * + * ::= ( '|' )* + * + * ::= ( )* + * + * ::= {'~'} '[' ']' + * | '(' ')' + * | '{' '}' + * | + * + * ::= { '*' | '+' } + * + * ::= ( )* + * | { } '-' { } + * + * ::= Token[Atom] + * + * Notes: + * ~ means complement the set in [..]. i.e. all characters not listed + * * means match 0 or more times (can be on expression or atom) + * + means match 1 or more times (can be on expression or atom) + * {} optional + * () grouping + * [] set of atoms + * x-y all characters from x to y (found only in [..]) + * \xx the character with value xx + * + * Examples: + * [a-z]+ + * match 1 or more lower-case letters (e.g. variable) + * + * 0x[0-9A-Fa-f]+ + * match a hex number with 0x on front (e.g. 0xA1FF) + * + * [0-9]+.[0-9]+{e[0-9]+} + * match a floating point number (e.g. 3.14e21) + * + * Code example: + * if ( rexpr("[a-zA-Z][a-zA-Z0-9]+", str) ) then str is keyword + * + * Terence Parr + * Purdue University + * April 1991 + */ + +#include +#include +#ifdef __STDC__ +#include +#else +#include +#endif +#include "rexpr.h" + +#ifdef __STDC__ +static int regExpr( GraphPtr g ); +static int andExpr( GraphPtr g ); +static int expr( GraphPtr g ); +static int repeatSymbol( GraphPtr g ); +static int atomList( char *p, int complement ); +static int next( void ); +static ArcPtr newGraphArc( void ); +static NodePtr newNode( void ); +static int ArcBetweenGraphNode( NodePtr i, NodePtr j, int label ); +static Graph BuildNFA_atom( int label ); +static Graph BuildNFA_AB( Graph A, Graph B ); +static Graph BuildNFA_AorB( Graph A, Graph B ); +static Graph BuildNFA_set( char *s ); +static Graph BuildNFA_Astar( Graph A ); +static Graph BuildNFA_Aplus( Graph A ); +static Graph BuildNFA_Aoptional( Graph A ); +#else +static int regExpr(); +static int andExpr(); +static int expr(); +static int repeatSymbol(); +static int atomList(); +static int next(); +static ArcPtr newGraphArc(); +static NodePtr newNode(); +static int ArcBetweenGraphNode(); +static Graph BuildNFA_atom(); +static Graph BuildNFA_AB(); +static Graph BuildNFA_AorB(); +static Graph BuildNFA_set(); +static Graph BuildNFA_Astar(); +static Graph BuildNFA_Aplus(); +static Graph BuildNFA_Aoptional(); +#endif + +static char *_c; +static int token, tokchar; +static NodePtr accept; +static NodePtr freelist = NULL; + +/* + * return 1 if s in language described by expr + * 0 if s is not + * -1 if expr is an invalid regular expression + */ +rexpr(expr, s) +char *expr, *s; +{ + NodePtr p,q; + Graph nfa; + int result; + + fprintf(stderr, "rexpr(%s,%s);\n", expr,s); + freelist = NULL; + _c = expr; + next(); + if ( regExpr(&nfa) == -1 ) return -1; + accept = nfa.right; + result = match(nfa.left, s); + /* free all your memory */ + p = q = freelist; + while ( p!=NULL ) { q = p->track; free(p); p = q; } + return result; +} + +/* + * do a depth-first-search on the NFA looking for a path from start to + * accept state labelled with the characters of 's'. + */ +match(automaton, s) +NodePtr automaton; +char *s; +{ + ArcPtr p; + + if ( automaton == accept && *s == '\0' ) return 1; /* match */ + + for (p=automaton->arcs; p!=NULL; p=p->next) /* try all arcs */ + { + if ( p->label == Epsilon ) + { + if ( match(p->target, s) ) return 1; + } + else if ( p->label == *s ) + if ( match(p->target, s+1) ) return 1; + } + return 0; +} + +/* + * ::= ( '|' {} )* + * + * Return -1 if syntax error + * Return 0 if none found + * Return 1 if a regExrp was found + */ +static +regExpr(g) +GraphPtr g; +{ + Graph g1, g2; + + if ( andExpr(&g1) == -1 ) + { + return -1; + } + + while ( token == '|' ) + { + int a; + next(); + a = andExpr(&g2); + if ( a == -1 ) return -1; /* syntax error below */ + else if ( !a ) return 1; /* empty alternative */ + g1 = BuildNFA_AorB(g1, g2); + } + + if ( token!='\0' ) return -1; + + *g = g1; + return 1; +} + +/* + * ::= ( )* + */ +static +andExpr(g) +GraphPtr g; +{ + Graph g1, g2; + + if ( expr(&g1) == -1 ) + { + return -1; + } + + while ( token==Atom || token=='{' || token=='(' || token=='~' || token=='[' ) + { + if (expr(&g2) == -1) return -1; + g1 = BuildNFA_AB(g1, g2); + } + + *g = g1; + return 1; +} + +/* + * ::= {'~'} '[' ']' + * | '(' ')' + * | '{' '}' + * | + */ +static +expr(g) +GraphPtr g; +{ + int complement = 0; + char s[257]; /* alloc space for string of char in [] */ + + if ( token == '~' || token == '[' ) + { + if ( token == '~' ) {complement = 1; next();} + if ( token != '[' ) return -1; + next(); + if ( atomList( s, complement ) == -1 ) return -1; + *g = BuildNFA_set( s ); + if ( token != ']' ) return -1; + next(); + repeatSymbol( g ); + return 1; + } + if ( token == '(' ) + { + next(); + if ( regExpr( g ) == -1 ) return -1; + if ( token != ')' ) return -1; + next(); + repeatSymbol( g ); + return 1; + } + if ( token == '{' ) + { + next(); + if ( regExpr( g ) == -1 ) return -1; + if ( token != '}' ) return -1; + next(); + /* S p e c i a l C a s e O p t i o n a l { } */ + if ( token != '*' && token != '+' ) + { + *g = BuildNFA_Aoptional( *g ); + } + repeatSymbol( g ); + return 1; + } + if ( token == Atom ) + { + *g = BuildNFA_atom( tokchar ); + next(); + repeatSymbol( g ); + return 1; + } + + return -1; +} + +/* + * ::= { '*' | '+' } + */ +static +repeatSymbol(g) +GraphPtr g; +{ + switch ( token ) + { + case '*' : *g = BuildNFA_Astar( *g ); next(); break; + case '+' : *g = BuildNFA_Aplus( *g ); next(); break; + } + return 1; +} + +/* + * ::= { }* + * { } '-' { } + * + * a-b is same as ab + * q-a is same as q + */ +static +atomList(p, complement) +char *p; +int complement; +{ + static unsigned char set[256]; /* no duplicates */ + int first, last, i; + char *s = p; + + if ( token != Atom ) return -1; + + for (i=0; i<256; i++) set[i] = 0; + while ( token == Atom ) + { + if ( !set[tokchar] ) *s++ = tokchar; + set[tokchar] = 1; /* Add atom to set */ + next(); + if ( token == '-' ) /* have we found '-' */ + { + first = *(s-1); /* Get last char */ + next(); + if ( token != Atom ) return -1; + else + { + last = tokchar; + } + for (i = first+1; i <= last; i++) + { + if ( !set[tokchar] ) *s++ = i; + set[i] = 1; /* Add atom to set */ + } + next(); + } + } + *s = '\0'; + if ( complement ) + { + for (i=0; i<256; i++) set[i] = !set[i]; + for (i=1,s=p; i<256; i++) if ( set[i] ) *s++ = i; + *s = '\0'; + } + return 1; +} + +/* a somewhat stupid lexical analyzer */ +static +next() +{ + while ( *_c==' ' || *_c=='\t' || *_c=='\n' ) _c++; + if ( *_c=='\\' ) + { + _c++; + if ( isdigit(*_c) ) + { + int n=0; + while ( isdigit(*_c) ) + { + n = n*10 + (*_c++ - '0'); + } + if ( n>255 ) n=255; + tokchar = n; + } + else + { + switch (*_c) + { + case 'n' : tokchar = '\n'; break; + case 't' : tokchar = '\t'; break; + case 'r' : tokchar = '\r'; break; + default : tokchar = *_c; + } + _c++; + } + token = Atom; + } + else if ( isgraph(*_c) && *_c!='[' && *_c!='(' && *_c!='{' && + *_c!='-' && *_c!='}' && *_c!=')' && *_c!=']' && + *_c!='+' && *_c!='*' && *_c!='~' && *_c!='|' ) + { + token = Atom; + tokchar = *_c++; + } + else + { + token = tokchar = *_c++; + } +} + +/* N F A B u i l d i n g R o u t i n e s */ + +static +ArcPtr +newGraphArc() +{ + ArcPtr p; + p = (ArcPtr) calloc(1, sizeof(Arc)); + if ( p==NULL ) {fprintf(stderr,"rexpr: out of memory\n"); exit(-1);} + if ( freelist != NULL ) p->track = (ArcPtr) freelist; + freelist = (NodePtr) p; + return p; +} + +static +NodePtr +newNode() +{ + NodePtr p; + p = (NodePtr) calloc(1, sizeof(Node)); + if ( p==NULL ) {fprintf(stderr,"rexpr: out of memory\n"); exit(-1);} + if ( freelist != NULL ) p->track = freelist; + freelist = p; + return p; +} + +static +ArcBetweenGraphNodes(i, j, label) +NodePtr i, j; +int label; +{ + ArcPtr a; + + a = newGraphArc(); + if ( i->arcs == NULL ) i->arctail = i->arcs = a; + else {(i->arctail)->next = a; i->arctail = a;} + a->label = label; + a->target = j; +} + +static Graph +BuildNFA_atom(label) +int label; +{ + Graph g; + + g.left = newNode(); + g.right = newNode(); + ArcBetweenGraphNodes(g.left, g.right, label); + return( g ); +} + +static Graph +BuildNFA_AB(A, B) +Graph A, B; +{ + Graph g; + + ArcBetweenGraphNodes(A.right, B.left, Epsilon); + g.left = A.left; + g.right = B.right; + return( g ); +} + +static Graph +BuildNFA_AorB(A, B) +Graph A, B; +{ + Graph g; + + g.left = newNode(); + ArcBetweenGraphNodes(g.left, A.left, Epsilon); + ArcBetweenGraphNodes(g.left, B.left, Epsilon); + g.right = newNode(); + ArcBetweenGraphNodes(A.right, g.right, Epsilon); + ArcBetweenGraphNodes(B.right, g.right, Epsilon); + return( g ); +} + +static Graph +BuildNFA_set( s ) +char *s; +{ + Graph g; + + if ( s == NULL ) return g; + + g.left = newNode(); + g.right = newNode(); + while ( *s != '\0' ) + { + ArcBetweenGraphNodes(g.left, g.right, *s++); + } + return g; +} + +static Graph +BuildNFA_Astar( A ) +Graph A; +{ + Graph g; + + g.left = newNode(); + g.right = newNode(); + + ArcBetweenGraphNodes(g.left, A.left, Epsilon); + ArcBetweenGraphNodes(g.left, g.right, Epsilon); + ArcBetweenGraphNodes(A.right, g.right, Epsilon); + ArcBetweenGraphNodes(A.right, A.left, Epsilon); + + return( g ); +} + +static Graph +BuildNFA_Aplus( A ) +Graph A; +{ + ArcBetweenGraphNodes(A.right, A.left, Epsilon); + + return( A ); +} + +static Graph +BuildNFA_Aoptional( A ) +Graph A; +{ + Graph g; + + g.left = newNode(); + g.right = newNode(); + + ArcBetweenGraphNodes(g.left, A.left, Epsilon); + ArcBetweenGraphNodes(g.left, g.right, Epsilon); + ArcBetweenGraphNodes(A.right, g.right, Epsilon); + + return( g ); +}