lib/re1.5: Move re1.5 code from extmod to lib.

It's third-party code, and not necessarily tied to extmod. Signed-off-by: Damien George <damien@micropython.org>
2026-01-07 04:30:24 +01:00 · 2021-07-12 16:33:19 +10:00
parent e2f0b181f9
commit d1d172f536
5 changed files with 0 additions and 0 deletions
--- a/lib/re1.5/charclass.c
+++ b/lib/re1.5/charclass.c
@@ -0,0 +1,33 @@
+#include "re1.5.h"
+
+int _re1_5_classmatch(const char *pc, const char *sp)
+{
+    // pc points to "cnt" byte after opcode
+    int is_positive = (pc[-1] == Class);
+    int cnt = *pc++;
+    while (cnt--) {
+        if (*sp >= *pc && *sp <= pc[1]) return is_positive;
+        pc += 2;
+    }
+    return !is_positive;
+}
+
+int _re1_5_namedclassmatch(const char *pc, const char *sp)
+{
+    // pc points to name of class
+    int off = (*pc >> 5) & 1;
+    if ((*pc | 0x20) == 'd') {
+        if (!(*sp >= '0' && *sp <= '9')) {
+            off ^= 1;
+        }
+    } else if ((*pc | 0x20) == 's') {
+        if (!(*sp == ' ' || (*sp >= '\t' && *sp <= '\r'))) {
+            off ^= 1;
+        }
+    } else { // w
+        if (!((*sp >= 'A' && *sp <= 'Z') || (*sp >= 'a' && *sp <= 'z') || (*sp >= '0' && *sp <= '9') || *sp == '_')) {
+            off ^= 1;
+        }
+    }
+    return off;
+}
--- a/lib/re1.5/compilecode.c
+++ b/lib/re1.5/compilecode.c
@@ -0,0 +1,229 @@
+// Copyright 2014 Paul Sokolovsky.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "re1.5.h"
+
+#define INSERT_CODE(at, num, pc) \
+    ((code ? memmove(code + at + num, code + at, pc - at) : 0), pc += num)
+#define REL(at, to) (to - at - 2)
+#define EMIT(at, byte) (code ? (code[at] = byte) : (at))
+#define EMIT_CHECKED(at, byte) (_emit_checked(at, code, byte, &err))
+#define PC (prog->bytelen)
+
+static void _emit_checked(int at, char *code, int val, bool *err) {
+    *err |= val != (int8_t)val;
+    if (code) {
+        code[at] = val;
+    }
+}
+
+static const char *_compilecode(const char *re, ByteProg *prog, int sizecode)
+{
+    char *code = sizecode ? NULL : prog->insts;
+    bool err = false;
+    int start = PC;
+    int term = PC;
+    int alt_label = 0;
+
+    for (; *re && *re != ')'; re++) {
+        switch (*re) {
+        case '\\':
+            re++;
+            if (!*re) return NULL; // Trailing backslash
+            if ((*re | 0x20) == 'd' || (*re | 0x20) == 's' || (*re | 0x20) == 'w') {
+                term = PC;
+                EMIT(PC++, NamedClass);
+                EMIT(PC++, *re);
+                prog->len++;
+                break;
+            }
+            MP_FALLTHROUGH
+        default:
+            term = PC;
+            EMIT(PC++, Char);
+            EMIT(PC++, *re);
+            prog->len++;
+            break;
+        case '.':
+            term = PC;
+            EMIT(PC++, Any);
+            prog->len++;
+            break;
+        case '[': {
+            int cnt;
+            term = PC;
+            re++;
+            if (*re == '^') {
+                EMIT(PC++, ClassNot);
+                re++;
+            } else {
+                EMIT(PC++, Class);
+            }
+            PC++; // Skip # of pair byte
+            prog->len++;
+            for (cnt = 0; *re != ']'; re++, cnt++) {
+                if (*re == '\\') {
+                    ++re;
+                }
+                if (!*re) return NULL;
+                EMIT(PC++, *re);
+                if (re[1] == '-' && re[2] != ']') {
+                    re += 2;
+                }
+                EMIT(PC++, *re);
+            }
+            EMIT_CHECKED(term + 1, cnt);
+            break;
+        }
+        case '(': {
+            term = PC;
+            int sub = 0;
+            int capture = re[1] != '?' || re[2] != ':';
+
+            if (capture) {
+                sub = ++prog->sub;
+                EMIT(PC++, Save);
+                EMIT_CHECKED(PC++, 2 * sub);
+                prog->len++;
+            } else {
+                    re += 2;
+            }
+
+            re = _compilecode(re + 1, prog, sizecode);
+            if (re == NULL || *re != ')') return NULL; // error, or no matching paren
+
+            if (capture) {
+                EMIT(PC++, Save);
+                EMIT_CHECKED(PC++, 2 * sub + 1);
+                prog->len++;
+            }
+
+            break;
+        }
+        case '?':
+            if (PC == term) return NULL; // nothing to repeat
+            INSERT_CODE(term, 2, PC);
+            if (re[1] == '?') {
+                EMIT(term, RSplit);
+                re++;
+            } else {
+                EMIT(term, Split);
+            }
+            EMIT_CHECKED(term + 1, REL(term, PC));
+            prog->len++;
+            term = PC;
+            break;
+        case '*':
+            if (PC == term) return NULL; // nothing to repeat
+            INSERT_CODE(term, 2, PC);
+            EMIT(PC, Jmp);
+            EMIT_CHECKED(PC + 1, REL(PC, term));
+            PC += 2;
+            if (re[1] == '?') {
+                EMIT(term, RSplit);
+                re++;
+            } else {
+                EMIT(term, Split);
+            }
+            EMIT_CHECKED(term + 1, REL(term, PC));
+            prog->len += 2;
+            term = PC;
+            break;
+        case '+':
+            if (PC == term) return NULL; // nothing to repeat
+            if (re[1] == '?') {
+                EMIT(PC, Split);
+                re++;
+            } else {
+                EMIT(PC, RSplit);
+            }
+            EMIT_CHECKED(PC + 1, REL(PC, term));
+            PC += 2;
+            prog->len++;
+            term = PC;
+            break;
+        case '|':
+            if (alt_label) {
+                EMIT_CHECKED(alt_label, REL(alt_label, PC) + 1);
+            }
+            INSERT_CODE(start, 2, PC);
+            EMIT(PC++, Jmp);
+            alt_label = PC++;
+            EMIT(start, Split);
+            EMIT_CHECKED(start + 1, REL(start, PC));
+            prog->len += 2;
+            term = PC;
+            break;
+        case '^':
+            EMIT(PC++, Bol);
+            prog->len++;
+            term = PC;
+            break;
+        case '$':
+            EMIT(PC++, Eol);
+            prog->len++;
+            term = PC;
+            break;
+        }
+    }
+
+    if (alt_label) {
+        EMIT_CHECKED(alt_label, REL(alt_label, PC) + 1);
+    }
+    return err ? NULL : re;
+}
+
+int re1_5_sizecode(const char *re)
+{
+    ByteProg dummyprog = {
+         // Save 0, Save 1, Match; more bytes for "search" (vs "match") prefix code
+        .bytelen = 5 + NON_ANCHORED_PREFIX
+    };
+
+    if (_compilecode(re, &dummyprog, /*sizecode*/1) == NULL) return -1;
+
+    return dummyprog.bytelen;
+}
+
+int re1_5_compilecode(ByteProg *prog, const char *re)
+{
+    prog->len = 0;
+    prog->bytelen = 0;
+    prog->sub = 0;
+
+    // Add code to implement non-anchored operation ("search"),
+    // for anchored operation ("match"), this code will be just skipped.
+    // TODO: Implement search in much more efficient manner
+    prog->insts[prog->bytelen++] = RSplit;
+    prog->insts[prog->bytelen++] = 3;
+    prog->insts[prog->bytelen++] = Any;
+    prog->insts[prog->bytelen++] = Jmp;
+    prog->insts[prog->bytelen++] = -5;
+    prog->len += 3;
+
+    prog->insts[prog->bytelen++] = Save;
+    prog->insts[prog->bytelen++] = 0;
+    prog->len++;
+
+    re = _compilecode(re, prog, /*sizecode*/0);
+    if (re == NULL || *re) return 1;
+
+    prog->insts[prog->bytelen++] = Save;
+    prog->insts[prog->bytelen++] = 1;
+    prog->len++;
+
+    prog->insts[prog->bytelen++] = Match;
+    prog->len++;
+
+    return 0;
+}
+
+#if 0
+int main(int argc, char *argv[])
+{
+    int pc = 0;
+    ByteProg *code = re1_5_compilecode(argv[1]);
+    re1_5_dumpcode(code);
+}
+#endif
--- a/lib/re1.5/dumpcode.c
+++ b/lib/re1.5/dumpcode.c
@@ -0,0 +1,65 @@
+// Copyright 2014 Paul Sokolovsky.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "re1.5.h"
+
+void re1_5_dumpcode(ByteProg *prog)
+{
+    int pc = 0;
+    char *code = prog->insts;
+    while (pc < prog->bytelen) {
+                printf("%2d: ", pc);
+                switch(code[pc++]) {
+                default:
+                        assert(0);
+//                        re1_5_fatal("printprog");
+                case Split:
+                        printf("split %d (%d)\n", pc + (signed char)code[pc] + 1, (signed char)code[pc]);
+                        pc++;
+                        break;
+                case RSplit:
+                        printf("rsplit %d (%d)\n", pc + (signed char)code[pc] + 1, (signed char)code[pc]);
+                        pc++;
+                        break;
+                case Jmp:
+                        printf("jmp %d (%d)\n", pc + (signed char)code[pc] + 1, (signed char)code[pc]);
+                        pc++;
+                        break;
+                case Char:
+                        printf("char %c\n", code[pc++]);
+                        break;
+                case Any:
+                        printf("any\n");
+                        break;
+                case Class:
+                case ClassNot: {
+                        int num = code[pc];
+                        printf("class%s %d", (code[pc - 1] == ClassNot ? "not" : ""), num);
+                        pc++;
+                        while (num--) {
+                            printf(" 0x%02x-0x%02x", code[pc], code[pc + 1]);
+                            pc += 2;
+                        }
+                        printf("\n");
+                        break;
+                }
+                case NamedClass:
+                        printf("namedclass %c\n", code[pc++]);
+                        break;
+                case Match:
+                        printf("match\n");
+                        break;
+                case Save:
+                        printf("save %d\n", (unsigned char)code[pc++]);
+                        break;
+                case Bol:
+                        printf("assert bol\n");
+                        break;
+                case Eol:
+                        printf("assert eol\n");
+                        break;
+                }
+    }
+    printf("Bytes: %d, insts: %d\n", prog->bytelen, prog->len);
+}
--- a/lib/re1.5/re1.5.h
+++ b/lib/re1.5/re1.5.h
@@ -0,0 +1,154 @@
+// Copyright 2007-2009 Russ Cox.  All Rights Reserved.
+// Copyright 2014 Paul Sokolovsky.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef _RE1_5_REGEXP__H
+#define _RE1_5_REGEXP__H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <assert.h>
+
+#define nil ((void*)0)
+#define nelem(x) (sizeof(x)/sizeof((x)[0]))
+
+typedef struct Regexp Regexp;
+typedef struct Prog Prog;
+typedef struct ByteProg ByteProg;
+typedef struct Inst Inst;
+typedef struct Subject Subject;
+
+struct Regexp
+{
+	int type;
+	int n;
+	int ch;
+	Regexp *left;
+	Regexp *right;
+};
+
+enum	/* Regexp.type */
+{
+	Alt = 1,
+	Cat,
+	Lit,
+	Dot,
+	Paren,
+	Quest,
+	Star,
+	Plus,
+};
+
+Regexp *parse(char*);
+Regexp *reg(int type, Regexp *left, Regexp *right);
+void printre(Regexp*);
+#ifndef re1_5_fatal
+void re1_5_fatal(char*);
+#endif
+#ifndef re1_5_stack_chk
+#define re1_5_stack_chk()
+#endif
+void *mal(int);
+
+struct Prog
+{
+	Inst *start;
+	int len;
+};
+
+struct ByteProg
+{
+	int bytelen;
+	int len;
+	int sub;
+	char insts[0];
+};
+
+struct Inst
+{
+	int opcode;
+	int c;
+	int n;
+	Inst *x;
+	Inst *y;
+	int gen;	// global state, oooh!
+};
+
+enum	/* Inst.opcode */
+{
+	// Instructions which consume input bytes (and thus fail if none left)
+	CONSUMERS = 1,
+	Char = CONSUMERS,
+	Any,
+	Class,
+	ClassNot,
+	NamedClass,
+
+	ASSERTS = 0x50,
+	Bol = ASSERTS,
+	Eol,
+
+	// Instructions which take relative offset as arg
+	JUMPS = 0x60,
+	Jmp = JUMPS,
+	Split,
+	RSplit,
+
+	// Other (special) instructions
+	Save = 0x7e,
+	Match = 0x7f,
+};
+
+#define inst_is_consumer(inst) ((inst) < ASSERTS)
+#define inst_is_jump(inst) ((inst) & 0x70 == JUMPS)
+
+Prog *compile(Regexp*);
+void printprog(Prog*);
+
+extern int gen;
+
+enum {
+	MAXSUB = 20
+};
+
+typedef struct Sub Sub;
+
+struct Sub
+{
+	int ref;
+	int nsub;
+	const char *sub[MAXSUB];
+};
+
+Sub *newsub(int n);
+Sub *incref(Sub*);
+Sub *copy(Sub*);
+Sub *update(Sub*, int, const char*);
+void decref(Sub*);
+
+struct Subject {
+	const char *begin;
+	const char *end;
+};
+
+
+#define NON_ANCHORED_PREFIX 5
+#define HANDLE_ANCHORED(bytecode, is_anchored) ((is_anchored) ? (bytecode) + NON_ANCHORED_PREFIX : (bytecode))
+
+int re1_5_backtrack(ByteProg*, Subject*, const char**, int, int);
+int re1_5_pikevm(ByteProg*, Subject*, const char**, int, int);
+int re1_5_recursiveloopprog(ByteProg*, Subject*, const char**, int, int);
+int re1_5_recursiveprog(ByteProg*, Subject*, const char**, int, int);
+int re1_5_thompsonvm(ByteProg*, Subject*, const char**, int, int);
+
+int re1_5_sizecode(const char *re);
+int re1_5_compilecode(ByteProg *prog, const char *re);
+void re1_5_dumpcode(ByteProg *prog);
+void cleanmarks(ByteProg *prog);
+int _re1_5_classmatch(const char *pc, const char *sp);
+int _re1_5_namedclassmatch(const char *pc, const char *sp);
+
+#endif /*_RE1_5_REGEXP__H*/
--- a/lib/re1.5/recursiveloop.c
+++ b/lib/re1.5/recursiveloop.c
@@ -0,0 +1,87 @@
+// Copyright 2007-2009 Russ Cox.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "re1.5.h"
+
+static int
+recursiveloop(char *pc, const char *sp, Subject *input, const char **subp, int nsubp)
+{
+	const char *old;
+	int off;
+
+	re1_5_stack_chk();
+
+	for(;;) {
+		if(inst_is_consumer(*pc)) {
+			// If we need to match a character, but there's none left, it's fail
+			if(sp >= input->end)
+				return 0;
+		}
+		switch(*pc++) {
+		case Char:
+			if(*sp != *pc++)
+				return 0;
+			MP_FALLTHROUGH
+		case Any:
+			sp++;
+			continue;
+		case Class:
+		case ClassNot:
+			if (!_re1_5_classmatch(pc, sp))
+				return 0;
+			pc += *(unsigned char*)pc * 2 + 1;
+			sp++;
+			continue;
+                case NamedClass:
+			if (!_re1_5_namedclassmatch(pc, sp))
+				return 0;
+			pc++;
+			sp++;
+			continue;
+		case Match:
+			return 1;
+		case Jmp:
+			off = (signed char)*pc++;
+			pc = pc + off;
+			continue;
+		case Split:
+			off = (signed char)*pc++;
+			if(recursiveloop(pc, sp, input, subp, nsubp))
+				return 1;
+			pc = pc + off;
+			continue;
+		case RSplit:
+			off = (signed char)*pc++;
+			if(recursiveloop(pc + off, sp, input, subp, nsubp))
+				return 1;
+			continue;
+		case Save:
+			off = (unsigned char)*pc++;
+			if(off >= nsubp) {
+				continue;
+			}
+			old = subp[off];
+			subp[off] = sp;
+			if(recursiveloop(pc, sp, input, subp, nsubp))
+				return 1;
+			subp[off] = old;
+			return 0;
+		case Bol:
+			if(sp != input->begin)
+				return 0;
+			continue;
+		case Eol:
+			if(sp != input->end)
+				return 0;
+			continue;
+		}
+		re1_5_fatal("recursiveloop");
+	}
+}
+
+int
+re1_5_recursiveloopprog(ByteProg *prog, Subject *input, const char **subp, int nsubp, int is_anchored)
+{
+	return recursiveloop(HANDLE_ANCHORED(prog->insts, is_anchored), input->begin, input, subp, nsubp);
+}