#include #include #include #include #include #include #include #include "libframe/frame.h" #include #include #include #include "dat.h" #include "fns.h" Rangeset sel; Rune* lastregexp; /* * Machine Information */ typedef struct Inst Inst; struct Inst { uint type; /* < OPERATOR ==> literal, otherwise action */ union { int sid; int subid; int class; Inst* other; Inst* right; } u; union { Inst* left; Inst* next; } u1; }; #define NPROG 1024 Inst program[NPROG]; Inst* progp; Inst* startinst; /* First inst. of program; might not be program[0] */ Inst* bstartinst; /* same for backwards machine */ Channel* rechan; /* chan(Inst*) */ typedef struct Ilist Ilist; struct Ilist { Inst* inst; /* Instruction of the thread */ Rangeset se; uint startp; /* first char of match */ }; #define NLIST 127 Ilist *tl, *nl; /* This list, next list */ Ilist list[2][NLIST + 1]; /* +1 for trailing null */ static Rangeset sempty; /* * Actions and Tokens * * 0x10000xx are operators, value == precedence * 0x20000xx are tokens, i.e. operands for operators */ #define OPERATOR 0x1000000 /* Bit set in all operators */ #define START (OPERATOR + 0) /* Start, used for marker on stack */ #define RBRA (OPERATOR + 1) /* Right bracket, ) */ #define LBRA (OPERATOR + 2) /* Left bracket, ( */ #define OR (OPERATOR + 3) /* Alternation, | */ #define CAT (OPERATOR + 4) /* Concatentation, implicit operator */ #define STAR (OPERATOR + 5) /* Closure, * */ #define PLUS (OPERATOR + 6) /* a+ == aa* */ #define QUEST (OPERATOR + 7) /* a? == a|nothing, i.e. 0 or 1 a's */ #define ANY 0x2000000 /* Any character but newline, . */ #define NOP (ANY + 1) /* No operation, internal use only */ #define BOL (ANY + 2) /* Beginning of line, ^ */ #define EOL (ANY + 3) /* End of line, $ */ #define CCLASS (ANY + 4) /* Character class, [] */ #define NCCLASS (ANY + 5) /* Negated character class, [^] */ #define END (ANY + 0x77) /* Terminate: match found */ #define ISATOR OPERATOR #define ISAND ANY #define QUOTED 0x4000000 /* Bit set for \-ed lex characters */ /* * Parser Information */ typedef struct Node Node; struct Node { Inst* first; Inst* last; }; #define NSTACK 20 Node andstack[NSTACK]; Node* andp; int atorstack[NSTACK]; int* atorp; int lastwasand; /* Last token was operand */ int cursubid; int subidstack[NSTACK]; int* subidp; int backwards; int nbra; Rune* exprp; /* pointer to next character in source expression */ #define DCLASS 10 /* allocation increment */ int nclass; /* number active */ int Nclass; /* high water mark */ Rune** class; int negateclass; int addinst(Ilist* l, Inst* inst, Rangeset* sep); void newmatch(Rangeset*); void bnewmatch(Rangeset*); void pushand(Inst*, Inst*); void pushator(int); Node* popand(int); int popator(void); void startlex(Rune*); int lex(void); void operator(int); void operand(int); void evaluntil(int); void optimize(Inst*); void bldcclass(void); void rxinit(void) { rechan = chancreate(sizeof(Inst*), 0); chansetname(rechan, "rechan"); lastregexp = runemalloc(1); } void regerror(char* e) { lastregexp[0] = 0; warning(nil, "regexp: %s\n", e); sendp(rechan, nil); threadexits(nil); } Inst* newinst(int t) { if (progp >= &program[NPROG]) regerror("expression too long"); progp->type = t; progp->u1.left = nil; progp->u.right = nil; return progp++; } void realcompile(void* arg) { int token; Rune* s; threadsetname("regcomp"); s = arg; startlex(s); atorp = atorstack; andp = andstack; subidp = subidstack; cursubid = 0; lastwasand = FALSE; /* Start with a low priority operator to prime parser */ pushator(START - 1); while ((token = lex()) != END) { if ((token & ISATOR) == OPERATOR) operator(token); else operand(token); } /* Close with a low priority operator */ evaluntil(START); /* Force END */ operand(END); evaluntil(START); if (nbra) regerror("unmatched `('"); --andp; /* points to first and only operand */ sendp(rechan, andp->first); threadexits(nil); } /* r is null terminated */ int rxcompile(Rune* r) { int i, nr; Inst* oprogp; nr = runestrlen(r) + 1; if (runeeq(lastregexp, runestrlen(lastregexp) + 1, r, nr) == TRUE) return TRUE; lastregexp[0] = 0; for (i = 0; i < nclass; i++) free(class[i]); nclass = 0; progp = program; backwards = FALSE; bstartinst = nil; threadcreate(realcompile, r, STACK); startinst = recvp(rechan); if (startinst == nil) return FALSE; optimize(program); oprogp = progp; backwards = TRUE; threadcreate(realcompile, r, STACK); bstartinst = recvp(rechan); if (bstartinst == nil) return FALSE; optimize(oprogp); lastregexp = runerealloc(lastregexp, nr); runemove(lastregexp, r, nr); return TRUE; } void operand(int t) { Inst* i; if (lastwasand) operator(CAT); /* catenate is implicit */ i = newinst(t); if (t == CCLASS) { if (negateclass) i->type = NCCLASS; /* UGH */ i->u.class = nclass - 1; /* UGH */ } pushand(i, i); lastwasand = TRUE; } void operator(int t) { if (t == RBRA && --nbra < 0) regerror("unmatched `)'"); if (t == LBRA) { cursubid++; /* silently ignored */ nbra++; if (lastwasand) operator(CAT); } else evaluntil(t); if (t != RBRA) pushator(t); lastwasand = FALSE; if (t == STAR || t == QUEST || t == PLUS || t == RBRA) lastwasand = TRUE; /* these look like operands */ } void pushand(Inst* f, Inst* l) { if (andp >= &andstack[NSTACK]) error("operand stack overflow"); andp->first = f; andp->last = l; andp++; } void pushator(int t) { if (atorp >= &atorstack[NSTACK]) error("operator stack overflow"); *atorp++ = t; if (cursubid >= NRange) *subidp++ = -1; else *subidp++ = cursubid; } Node* popand(int op) { char buf[64]; if (andp <= &andstack[0]) if (op) { sprint(buf, "missing operand for %c", op); regerror(buf); } else regerror("malformed regexp"); return --andp; } int popator() { if (atorp <= &atorstack[0]) error("operator stack underflow"); --subidp; return *--atorp; } void evaluntil(int pri) { Node *op1, *op2, *t; Inst *inst1, *inst2; while (pri == RBRA || atorp[-1] >= pri) { switch (popator()) { case LBRA: op1 = popand('('); inst2 = newinst(RBRA); inst2->u.subid = *subidp; op1->last->u1.next = inst2; inst1 = newinst(LBRA); inst1->u.subid = *subidp; inst1->u1.next = op1->first; pushand(inst1, inst2); return; /* must have been RBRA */ default: error("unknown regexp operator"); break; case OR: op2 = popand('|'); op1 = popand('|'); inst2 = newinst(NOP); op2->last->u1.next = inst2; op1->last->u1.next = inst2; inst1 = newinst(OR); inst1->u.right = op1->first; inst1->u1.left = op2->first; pushand(inst1, inst2); break; case CAT: op2 = popand(0); op1 = popand(0); if (backwards && op2->first->type != END) { t = op1; op1 = op2; op2 = t; } op1->last->u1.next = op2->first; pushand(op1->first, op2->last); break; case STAR: op2 = popand('*'); inst1 = newinst(OR); op2->last->u1.next = inst1; inst1->u.right = op2->first; pushand(inst1, inst1); break; case PLUS: op2 = popand('+'); inst1 = newinst(OR); op2->last->u1.next = inst1; inst1->u.right = op2->first; pushand(op2->first, inst1); break; case QUEST: op2 = popand('?'); inst1 = newinst(OR); inst2 = newinst(NOP); inst1->u1.left = inst2; inst1->u.right = op2->first; op2->last->u1.next = inst2; pushand(inst1, inst2); break; } } } void optimize(Inst* start) { Inst *inst, *target; for (inst = start; inst->type != END; inst++) { target = inst->u1.next; while (target->type == NOP) target = target->u1.next; inst->u1.next = target; } } void startlex(Rune* s) { exprp = s; nbra = 0; } int lex(void) { int c; c = *exprp++; switch (c) { case '\\': if (*exprp) if ((c = *exprp++) == 'n') c = '\n'; break; case 0: c = END; --exprp; /* In case we come here again */ break; case '*': c = STAR; break; case '?': c = QUEST; break; case '+': c = PLUS; break; case '|': c = OR; break; case '.': c = ANY; break; case '(': c = LBRA; break; case ')': c = RBRA; break; case '^': c = BOL; break; case '$': c = EOL; break; case '[': c = CCLASS; bldcclass(); break; } return c; } int nextrec(void) { if (exprp[0] == 0 || (exprp[0] == '\\' && exprp[1] == 0)) regerror("malformed `[]'"); if (exprp[0] == '\\') { exprp++; if (*exprp == 'n') { exprp++; return '\n'; } return *exprp++ | QUOTED; } return *exprp++; } void bldcclass(void) { int c1, c2, n, na; Rune* classp; classp = runemalloc(DCLASS); n = 0; na = DCLASS; /* we have already seen the '[' */ if (*exprp == '^') { classp[n++] = '\n'; /* don't match newline in negate case */ negateclass = TRUE; exprp++; } else negateclass = FALSE; while ((c1 = nextrec()) != ']') { if (c1 == '-') { Error: free(classp); regerror("malformed `[]'"); } if (n + 4 >= na) { /* 3 runes plus NUL */ na += DCLASS; classp = runerealloc(classp, na); } if (*exprp == '-') { exprp++; /* eat '-' */ if ((c2 = nextrec()) == ']') goto Error; classp[n + 0] = Runemax; classp[n + 1] = c1; classp[n + 2] = c2; n += 3; } else classp[n++] = c1 & ~QUOTED; } classp[n] = 0; if (nclass == Nclass) { Nclass += DCLASS; class = realloc(class, Nclass * sizeof(Rune*)); } class[nclass++] = classp; } int classmatch(int classno, int c, int negate) { Rune* p; p = class[classno]; while (*p) { if (*p == Runemax) { if (p[1] <= c && c <= p[2]) return !negate; p += 3; } else if (*p++ == c) return !negate; } return negate; } /* * Note optimization in addinst: * *l must be pending when addinst called; if *l has been looked * at already, the optimization is a bug. */ int addinst(Ilist* l, Inst* inst, Rangeset* sep) { Ilist* p; for (p = l; p->inst; p++) { if (p->inst == inst) { if ((sep)->r[0].q0 < p->se.r[0].q0) p->se = *sep; /* this would be bug */ return 0; /* It's already there */ } } p->inst = inst; p->se = *sep; (p + 1)->inst = nil; return 1; } int rxnull(void) { return startinst == nil || bstartinst == nil; } /* either t!=nil or r!=nil, and we match the string in the appropriate place */ int rxexecute(Text* t, Rune* r, uint startp, uint eof, Rangeset* rp) { int flag; Inst* inst; Ilist* tlp; uint p; int nnl, ntl; int nc, c; int wrapped; int startchar; flag = 0; p = startp; startchar = 0; wrapped = 0; nnl = 0; if (startinst->type < OPERATOR) startchar = startinst->type; list[0][0].inst = list[1][0].inst = nil; sel.r[0].q0 = -1; if (t != nil) nc = t->file->b.nc; else nc = runestrlen(r); /* Execute machine once for each character */ for (;; p++) { doloop: if (p >= eof || p >= nc) { switch (wrapped++) { case 0: /* let loop run one more click */ case 2: break; case 1: /* expired; wrap to beginning */ if (sel.r[0].q0 >= 0 || eof != Infinity) goto Return; list[0][0].inst = list[1][0].inst = nil; p = 0; goto doloop; default: goto Return; } c = 0; } else { if (((wrapped && p >= startp) || sel.r[0].q0 > 0) && nnl == 0) break; if (t != nil) c = textreadc(t, p); else c = r[p]; } /* fast check for first char */ if (startchar && nnl == 0 && c != startchar) continue; tl = list[flag]; nl = list[flag ^= 1]; nl->inst = nil; ntl = nnl; nnl = 0; if (sel.r[0].q0 < 0 && (!wrapped || p < startp || startp == eof)) { /* Add first instruction to this list */ sempty.r[0].q0 = p; if (addinst(tl, startinst, &sempty)) if (++ntl >= NLIST) { Overflow: warning(nil, "regexp list overflow\n"); sel.r[0].q0 = -1; goto Return; } } /* Execute machine until this list is empty */ for (tlp = tl; inst = tlp->inst; tlp++) { /* assignment = */ Switchstmt: switch (inst->type) { default: /* regular character */ if (inst->type == c) { Addinst: if (addinst(nl, inst->u1.next, &tlp->se)) if (++nnl >= NLIST) goto Overflow; } break; case LBRA: if (inst->u.subid >= 0) tlp->se.r[inst->u.subid].q0 = p; inst = inst->u1.next; goto Switchstmt; case RBRA: if (inst->u.subid >= 0) tlp->se.r[inst->u.subid].q1 = p; inst = inst->u1.next; goto Switchstmt; case ANY: if (c != '\n') goto Addinst; break; case BOL: if ( p == 0 || (t != nil && textreadc(t, p - 1) == '\n') || (r != nil && r[p - 1] == '\n')) { Step: inst = inst->u1.next; goto Switchstmt; } break; case EOL: if (c == '\n') goto Step; break; case CCLASS: if (c >= 0 && classmatch(inst->u.class, c, 0)) goto Addinst; break; case NCCLASS: if (c >= 0 && classmatch(inst->u.class, c, 1)) goto Addinst; break; case OR: /* evaluate right choice later */ if (addinst(tlp, inst->u.right, &tlp->se)) if (++ntl >= NLIST) goto Overflow; /* efficiency: advance and re-evaluate */ inst = inst->u1.left; goto Switchstmt; case END: /* Match! */ tlp->se.r[0].q1 = p; newmatch(&tlp->se); break; } } } Return: *rp = sel; return sel.r[0].q0 >= 0; } void newmatch(Rangeset* sp) { if ( sel.r[0].q0 < 0 || sp->r[0].q0 < sel.r[0].q0 || (sp->r[0].q0 == sel.r[0].q0 && sp->r[0].q1 > sel.r[0].q1)) sel = *sp; } int rxbexecute(Text* t, uint startp, Rangeset* rp) { int flag; Inst* inst; Ilist* tlp; int p; int nnl, ntl; int c; int wrapped; int startchar; flag = 0; nnl = 0; wrapped = 0; p = startp; startchar = 0; if (bstartinst->type < OPERATOR) startchar = bstartinst->type; list[0][0].inst = list[1][0].inst = nil; sel.r[0].q0 = -1; /* Execute machine once for each character, including terminal NUL */ for (;; --p) { doloop: if (p <= 0) { switch (wrapped++) { case 0: /* let loop run one more click */ case 2: break; case 1: /* expired; wrap to end */ if (sel.r[0].q0 >= 0) goto Return; list[0][0].inst = list[1][0].inst = nil; p = t->file->b.nc; goto doloop; case 3: default: goto Return; } c = 0; } else { if (((wrapped && p <= startp) || sel.r[0].q0 > 0) && nnl == 0) break; c = textreadc(t, p - 1); } /* fast check for first char */ if (startchar && nnl == 0 && c != startchar) continue; tl = list[flag]; nl = list[flag ^= 1]; nl->inst = nil; ntl = nnl; nnl = 0; if (sel.r[0].q0 < 0 && (!wrapped || p > startp)) { /* Add first instruction to this list */ /* the minus is so the optimizations in addinst work */ sempty.r[0].q0 = -p; if (addinst(tl, bstartinst, &sempty)) if (++ntl >= NLIST) { Overflow: warning(nil, "regexp list overflow\n"); sel.r[0].q0 = -1; goto Return; } } /* Execute machine until this list is empty */ for (tlp = tl; inst = tlp->inst; tlp++) { /* assignment = */ Switchstmt: switch (inst->type) { default: /* regular character */ if (inst->type == c) { Addinst: if (addinst(nl, inst->u1.next, &tlp->se)) if (++nnl >= NLIST) goto Overflow; } break; case LBRA: if (inst->u.subid >= 0) tlp->se.r[inst->u.subid].q0 = p; inst = inst->u1.next; goto Switchstmt; case RBRA: if (inst->u.subid >= 0) tlp->se.r[inst->u.subid].q1 = p; inst = inst->u1.next; goto Switchstmt; case ANY: if (c != '\n') goto Addinst; break; case BOL: if (c == '\n' || p == 0) { Step: inst = inst->u1.next; goto Switchstmt; } break; case EOL: if (p < t->file->b.nc && textreadc(t, p) == '\n') goto Step; break; case CCLASS: if (c > 0 && classmatch(inst->u.class, c, 0)) goto Addinst; break; case NCCLASS: if (c > 0 && classmatch(inst->u.class, c, 1)) goto Addinst; break; case OR: /* evaluate right choice later */ if (addinst(tl, inst->u.right, &tlp->se)) if (++ntl >= NLIST) goto Overflow; /* efficiency: advance and re-evaluate */ inst = inst->u1.left; goto Switchstmt; case END: /* Match! */ tlp->se.r[0].q0 = -tlp->se.r[0].q0; /* minus sign */ tlp->se.r[0].q1 = p; bnewmatch(&tlp->se); break; } } } Return: *rp = sel; return sel.r[0].q0 >= 0; } void bnewmatch(Rangeset* sp) { int i; if ( sel.r[0].q0 < 0 || sp->r[0].q0 > sel.r[0].q1 || (sp->r[0].q0 == sel.r[0].q1 && sp->r[0].q1 < sel.r[0].q0)) for (i = 0; i < NRange; i++) { /* note the reversal; q0<=q1 */ sel.r[i].q0 = sp->r[i].q1; sel.r[i].q1 = sp->r[i].q0; } }