RegExpr provides regular expression matching of text strings
MODULE------------------------------------------------------------ compiling ---; IMPORT Text; REVEAL T = BRANDED "RegExpr.T" REF RECORD body : TEXT; root : CARDINAL; ops : REF ARRAY OF Desc; END; TYPE Op = { Or, And, Concat, AnyString, ThisString }; StrLen = [0..16_ffff]; Desc = RECORD a, b: CARDINAL; (* operands: (left, right) or string: (start, len) *) min, max: StrLen; ch: CHAR; op: Op; END; RegExpr
TYPE
ParseState = RECORD
body : TEXT;
ops : REF ARRAY OF Desc;
ch : CHAR;
len : CARDINAL;
next_ch : CARDINAL;
next_buf : CARDINAL;
next_op : CARDINAL;
END;
PROCEDURE Compile (pattern: TEXT): T RAISES {Error} =
VAR t: T; s: ParseState;
BEGIN
IF (pattern = NIL) THEN RAISE Error ("NIL pattern"); END;
t := NEW (T, body := pattern);
s.len := Text.Length (pattern);
IF (s.len < 1) THEN
s.next_op := 0;
t.ops := NEW (REF ARRAY OF Desc, 1);
t.root := EmptyString (s);
ELSE
s.body := pattern;
s.ops := NEW (REF ARRAY OF Desc, s.len + s.len);
s.ch := Text.GetChar (pattern, 0);
s.next_ch := 1;
s.next_buf := 0;
s.next_op := 0;
t.root := ParseExpr (s);
t.ops := s.ops;
END;
RETURN t;
END Compile;
PROCEDURE ParseExpr (VAR s: ParseState): CARDINAL RAISES {Error} =
VAR a, b: CARDINAL;
BEGIN
a := ParseTerm (s);
WHILE (s.ch = '|') DO
NextCh (s);
b := ParseTerm (s);
WITH z = s.ops [s.next_op] DO
z.op := Op.Or; z.a := a; z.b := b;
z.min := MIN (s.ops[a].min, s.ops[b].min);
z.max := MAX (s.ops[b].max, s.ops[b].max);
END;
a := s.next_op;
INC (s.next_op);
END;
RETURN a;
END ParseExpr;
PROCEDURE ParseTerm (VAR s: ParseState): CARDINAL RAISES {Error} =
VAR a, b: CARDINAL;
BEGIN
a := ParseFactor (s);
WHILE (s.ch = '&') DO
NextCh (s);
b := ParseFactor (s);
WITH z = s.ops [s.next_op] DO
z.op := Op.And; z.a := a; z.b := b;
z.min := MAX (s.ops[a].min, s.ops[b].min);
z.max := MIN (s.ops[b].max, s.ops[b].max);
END;
a := s.next_op;
INC (s.next_op);
END;
RETURN a;
END ParseTerm;
PROCEDURE ParseFactor (VAR s: ParseState): CARDINAL RAISES {Error} =
VAR a, b: CARDINAL;
BEGIN
a := ParsePrimary (s);
WHILE (s.next_ch <= s.len)
AND (s.ch # '|') AND (s.ch # '&') AND (s.ch # ')') DO
b := ParsePrimary (s);
WITH z = s.ops [s.next_op] DO
z.op := Op.Concat; z.a := a; z.b := b;
z.min := MIN (s.ops[a].min + s.ops[b].min, LAST (StrLen));
z.max := MIN (s.ops[a].max + s.ops[b].max, LAST (StrLen));
END;
a := s.next_op;
INC (s.next_op);
END;
RETURN a;
END ParseFactor;
PROCEDURE ParsePrimary (VAR s: ParseState): CARDINAL RAISES {Error} =
VAR x := s.next_op;
BEGIN
CASE s.ch OF
| '&', '|', ')' =>
RETURN EmptyString (s);
| '*' =>
NextCh (s);
WITH z = s.ops [x] DO
z.op := Op.AnyString;
z.min := 0;
z.max := LAST (StrLen);
END;
INC (s.next_op);
RETURN x;
| '@' =>
NextCh (s);
WITH z = s.ops [x] DO
z.op := Op.AnyString;
z.min := 1;
z.max := 1;
END;
INC (s.next_op);
RETURN x;
| '(' =>
NextCh (s);
x := ParseExpr (s);
IF (s.ch = ')') THEN
NextCh (s); (* ok *)
ELSE
RAISE Error ("unmatched parenthesis");
END;
RETURN x;
ELSE
RETURN ParseString (s);
END;
END ParsePrimary;
PROCEDURE ParseString (VAR s: ParseState): CARDINAL =
VAR x := s.next_op;
BEGIN
INC (s.next_op);
WITH z = s.ops [x] DO
z.op := Op.ThisString;
z.a := s.next_buf;
z.b := 0;
WHILE (s.next_ch <= s.len)
AND (s.ch # '|') AND (s.ch # '&') AND (s.ch # '*')
AND (s.ch # '@') AND (s.ch # '(') AND (s.ch # ')') DO
IF (s.ch = '\134') AND (s.next_ch < s.len) THEN
NextCh (s); (* eat the backslash escape *)
END;
s.ops[s.next_buf].ch := s.ch; INC (s.next_buf);
INC (z.b);
NextCh (s);
END;
z.min := z.b;
z.max := z.b
END;
RETURN x;
END ParseString;
PROCEDURE EmptyString (VAR s: ParseState): CARDINAL =
VAR x := s.next_op;
BEGIN
WITH z = s.ops[x] DO
z.op := Op.ThisString;
z.a := 0;
z.b := 0;
z.min := 0;
z.max := 0;
END;
INC (s.next_op);
RETURN x;
END EmptyString;
PROCEDURE NextCh (VAR s: ParseState) =
BEGIN
IF (s.next_ch < s.len) THEN
s.ch := Text.GetChar (s.body, s.next_ch);
INC (s.next_ch);
ELSE
s.ch := '\000';
INC (s.next_ch);
END;
END NextCh;
------------------------------------------------------------- matching ---
PROCEDURE----------------------------------------------------------------- misc ---Match (t: T; txt: TEXT): BOOLEAN = BEGIN RETURN MatchSubstring (t, txt, 0, 0); END Match; PROCEDUREMatchSubstring (t: T; txt: TEXT; pre, post: CARDINAL): BOOLEAN = VAR len: INTEGER; buf: ARRAY [0..255] OF CHAR; ref: REF ARRAY OF CHAR; BEGIN IF (t = NIL) THEN RETURN TRUE; END; IF (txt = NIL) THEN RETURN FALSE; END; len := Text.Length (txt) - pre - post; IF (len <= NUMBER (buf)) THEN Text.SetChars (buf, txt, pre); RETURN MatchSub (t, SUBARRAY (buf, 0, len)); ELSE ref := NEW (REF ARRAY OF CHAR, len); Text.SetChars (buf, txt, pre); RETURN MatchSub (t, ref^); END; END MatchSubstring; PROCEDUREMatchSub (t: T; READONLY str: ARRAY OF CHAR): BOOLEAN = BEGIN IF (t = NIL) THEN RETURN TRUE; END; IF (NUMBER (str) <= 0) THEN RETURN FALSE; END; RETURN DoMatch (t.ops, t.root, str, 0, NUMBER (str)); END MatchSub; PROCEDUREDoMatch (ops: REF ARRAY OF Desc; x: CARDINAL; READONLY txt: ARRAY OF CHAR; start, len: INTEGER): BOOLEAN = BEGIN WITH z = ops[x] DO IF (len < z.min) OR (z.max < len) THEN RETURN FALSE; END; CASE z.op OF | Op.Or => RETURN DoMatch (ops, z.a, txt, start, len) OR DoMatch (ops, z.b, txt, start, len); | Op.And => RETURN DoMatch (ops, z.a, txt, start, len) AND DoMatch (ops, z.b, txt, start, len); | Op.AnyString => RETURN TRUE; | Op.ThisString => FOR i := 0 TO z.b - 1 DO IF (ops[z.a + i].ch # txt [start + i]) THEN RETURN FALSE; END; END; RETURN TRUE; | Op.Concat => WITH za = ops[z.a], zb = ops[z.b] DO VAR max_a := MIN (za.max, len - zb.min); min_a := MAX (za.min, len - zb.max); BEGIN FOR i := max_a TO min_a BY -1 DO IF DoMatch (ops, z.a, txt, start, i ) AND DoMatch (ops, z.b, txt, start+i, len-i) THEN RETURN TRUE; END; END; RETURN FALSE; END; END; END; (* CASE*) END; (* WITH *) END DoMatch;
PROCEDURESimpleString (t: T): TEXT = BEGIN IF (t = NIL) THEN RETURN NIL; END; WITH z = t.ops [t.root] DO IF (z.op = Op.ThisString) AND (z.a = 0) AND (z.b = Text.Length (t.body)) THEN RETURN t.body; END; END; RETURN NIL; END SimpleString; BEGIN END RegExpr.