extmod/modre: Add support for start- and endpos.

The `search` and `match` methods of compiled pattern objects now accept two
optional parameters, `pos` and `endpos`, which give the start and end
positions of the region of the string to be searched.

This allows for searching a sub-string without creating a slice.  However,
one caveat of using the start-pos rather than a slice is that the start
anchor (`^`) remains anchored to the beginning of the text.

Signed-off-by: Jared Hancock <jared@greezybacon.me>
This commit is contained in:
Jared Hancock
2024-03-25 20:58:51 -05:00
committed by Damien George
parent 485dac783b
commit 14ccdeb4d7
3 changed files with 114 additions and 3 deletions

View File

@@ -154,8 +154,8 @@ Regex objects
Compiled regular expression. Instances of this class are created using Compiled regular expression. Instances of this class are created using
`re.compile()`. `re.compile()`.
.. method:: regex.match(string) .. method:: regex.match(string, [pos, [endpos]])
regex.search(string) regex.search(string, [pos, [endpos]])
regex.sub(replace, string, count=0, flags=0, /) regex.sub(replace, string, count=0, flags=0, /)
Similar to the module-level functions :meth:`match`, :meth:`search` Similar to the module-level functions :meth:`match`, :meth:`search`
@@ -163,6 +163,16 @@ Compiled regular expression. Instances of this class are created using
Using methods is (much) more efficient if the same regex is applied to Using methods is (much) more efficient if the same regex is applied to
multiple strings. multiple strings.
The optional second parameter *pos* gives an index in the string where the
search is to start; it defaults to ``0``. This is not completely equivalent
to slicing the string; the ``'^'`` pattern character matches at the real
beginning of the string and at positions just after a newline, but not
necessarily at the index where the search is to start.
The optional parameter *endpos* limits how far the string will be searched;
it will be as if the string is *endpos* characters long, so only the
characters from *pos* to ``endpos - 1`` will be searched for a match.
.. method:: regex.split(string, max_split=-1, /) .. method:: regex.split(string, max_split=-1, /)
Split a *string* using regex. If *max_split* is given, it specifies Split a *string* using regex. If *max_split* is given, it specifies

View File

@@ -196,10 +196,11 @@ static void re_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind_t
// Note: this function can't be named re_exec because it may clash with system headers, eg on FreeBSD // Note: this function can't be named re_exec because it may clash with system headers, eg on FreeBSD
static mp_obj_t re_exec_helper(bool is_anchored, uint n_args, const mp_obj_t *args) { static mp_obj_t re_exec_helper(bool is_anchored, uint n_args, const mp_obj_t *args) {
(void)n_args;
mp_obj_re_t *self; mp_obj_re_t *self;
bool was_compiled = false;
if (mp_obj_is_type(args[0], (mp_obj_type_t *)&re_type)) { if (mp_obj_is_type(args[0], (mp_obj_type_t *)&re_type)) {
self = MP_OBJ_TO_PTR(args[0]); self = MP_OBJ_TO_PTR(args[0]);
was_compiled = true;
} else { } else {
self = MP_OBJ_TO_PTR(mod_re_compile(1, args)); self = MP_OBJ_TO_PTR(mod_re_compile(1, args));
} }
@@ -207,6 +208,28 @@ static mp_obj_t re_exec_helper(bool is_anchored, uint n_args, const mp_obj_t *ar
size_t len; size_t len;
subj.begin_line = subj.begin = mp_obj_str_get_data(args[1], &len); subj.begin_line = subj.begin = mp_obj_str_get_data(args[1], &len);
subj.end = subj.begin + len; subj.end = subj.begin + len;
if (was_compiled && n_args > 2) {
// Arg #2 is starting-pos
mp_int_t startpos = mp_obj_get_int(args[2]);
if (startpos > (mp_int_t)len) {
startpos = len;
} else if (startpos < 0) {
startpos = 0;
}
subj.begin += startpos;
if (n_args > 3) {
// Arg #3 is ending-pos
mp_int_t endpos = mp_obj_get_int(args[3]);
if (endpos > (mp_int_t)len) {
endpos = len;
} else if (endpos < startpos) {
endpos = startpos;
}
subj.end = subj.begin_line + endpos;
}
}
int caps_num = (self->re.sub + 1) * 2; int caps_num = (self->re.sub + 1) * 2;
mp_obj_match_t *match = m_new_obj_var(mp_obj_match_t, caps, char *, caps_num); mp_obj_match_t *match = m_new_obj_var(mp_obj_match_t, caps, char *, caps_num);
// cast is a workaround for a bug in msvc: it treats const char** as a const pointer instead of a pointer to pointer to const char // cast is a workaround for a bug in msvc: it treats const char** as a const pointer instead of a pointer to pointer to const char

View File

@@ -0,0 +1,78 @@
# test start and end pos specification
try:
import re
except ImportError:
print("SKIP")
raise SystemExit
def print_groups(match):
    """Print a separator line, then every group of *match* in index order.

    Groups are read with ``match.group(i)`` for i = 0, 1, 2, ... until the
    call raises IndexError.  When *match* is None only the separator line
    is printed.
    """
    print("----")
    try:
        if match is not None:
            i = 0
            while True:
                print(match.group(i))
                i += 1
    except IndexError:
        # IndexError from group() is what terminates the enumeration loop.
        pass
# --- match() with pos / endpos ---
p = re.compile(r"o")
m = p.match("dog")
print_groups(m)

m = p.match("dog", 1)
print_groups(m)

m = p.match("dog", 2)
print_groups(m)

# No match past end of input
m = p.match("dog", 5)
print_groups(m)

m = p.match("dog", 0, 1)
print_groups(m)

# Caret only matches the actual beginning
p = re.compile(r"^o")
m = p.match("dog", 1)
print_groups(m)

# End at beginning means searching empty string
p = re.compile(r"o")
m = p.match("dog", 1, 1)
print_groups(m)

# End before the beginning doesn't match anything
m = p.match("dog", 2, 1)
print_groups(m)

# Negative starting values don't crash.  These use match() so that the
# anchored path is covered too; the search() variants are checked below.
m = p.match("dog", -2)
print_groups(m)

m = p.match("dog", -2, -5)
print_groups(m)

# --- search() with pos / endpos ---
# Search also works
print("--search")

p = re.compile(r"o")
m = p.search("dog")
print_groups(m)

m = p.search("dog", 1)
print_groups(m)

m = p.search("dog", 2)
print_groups(m)

# Negative starting values don't crash
m = p.search("dog", -2)
print_groups(m)

m = p.search("dog", -2, -5)
print_groups(m)