From d4751a164e485e1353bc78135700690a37409153 Mon Sep 17 00:00:00 2001 From: Damien George Date: Tue, 6 Jan 2026 12:30:42 +1100 Subject: [PATCH] py: Add support for PEP 750's t-strings. This commit adds support for t-strings by leveraging the existing f-string parser in the lexer. It includes: - t-string parsing in `py/lexer.c` - new built-in `__template__()` function to construct t-string objects - new built-in `Template` and `Interpolation` classes which implement all the functionality from PEP 750 - new built-in `string` module with `templatelib` sub-module, which contains the classes `Template` and `Interpolation` The way the t-string parser works is that an input t-string like: t"hello {name:5}" is converted character-by-character by the lexer/tokenizer to: __template__(("hello ", "",), name, "name", None, "5") For reference, if it were an f-string it would be converted to: "hello {:5}".format(name) Some properties of this implementation: - it's enabled by default at the full feature level, MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_FULL_FEATURES - when enabled on a Cortex-M bare-metal port it costs about +3000 bytes - there are no limits on the size or complexity of t-strings, and it allows arbitrary levels of nesting of f-strings and t-strings (up to the memory available to the compiler) - the 'a' (ascii) conversion specifier is not supported (MicroPython does not have the built-in `ascii` function) - space after conversion specifier, eg t"{x!r :10}", is not supported - arguments to `__template__` and `Interpolation` are not fully validated (it's not necessary, it won't crash if the wrong arguments are passed in) Otherwise the implementation here matches CPython. 
Signed-off-by: Damien George --- py/lexer.c | 221 +++++++++++++- py/modbuiltins.c | 6 + py/modstring.c | 56 ++++ py/mpconfig.h | 8 +- py/obj.h | 5 + py/objtemplate.c | 395 +++++++++++++++++++++++++ py/py.cmake | 2 + py/py.mk | 2 + py/qstrdefs.h | 4 + tests/ports/unix/extra_coverage.py.exp | 6 +- 10 files changed, 691 insertions(+), 14 deletions(-) create mode 100644 py/modstring.c create mode 100644 py/objtemplate.c diff --git a/py/lexer.c b/py/lexer.c index 8ddbe42339..22ff4d2cd3 100644 --- a/py/lexer.c +++ b/py/lexer.c @@ -59,7 +59,11 @@ static bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) { return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3; } -#if MICROPY_PY_FSTRINGS +#if MICROPY_PY_TSTRINGS +static bool is_char_or5(mp_lexer_t *lex, byte c1, byte c2, byte c3, byte c4, byte c5) { + return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3 || lex->chr0 == c4 || lex->chr0 == c5; +} +#elif MICROPY_PY_FSTRINGS static bool is_char_or4(mp_lexer_t *lex, byte c1, byte c2, byte c3, byte c4) { return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3 || lex->chr0 == c4; } @@ -108,7 +112,11 @@ static bool is_following_odigit(mp_lexer_t *lex) { static bool is_string_or_bytes(mp_lexer_t *lex) { return is_char_or(lex, '\'', '\"') - #if MICROPY_PY_FSTRINGS + #if MICROPY_PY_TSTRINGS + || (is_char_or5(lex, 'r', 'u', 'b', 'f', 't') && is_char_following_or(lex, '\'', '\"')) + || (((is_char_and(lex, 'r', 'f') || is_char_and(lex, 'f', 'r') || is_char_and(lex, 'r', 't') || is_char_and(lex, 't', 'r')) + && is_char_following_following_or(lex, '\'', '\"'))) + #elif MICROPY_PY_FSTRINGS || (is_char_or4(lex, 'r', 'u', 'b', 'f') && is_char_following_or(lex, '\'', '\"')) || (((is_char_and(lex, 'r', 'f') || is_char_and(lex, 'f', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) @@ -312,7 +320,7 @@ static bool get_hex(mp_lexer_t *lex, size_t num_digits, mp_uint_t *result) { return true; } -static void parse_string_literal(mp_lexer_t *lex, bool is_raw, 
bool is_fstring) { +static void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring, bool is_tstring) { // get first quoting character char quote_char = '\''; if (is_char(lex, '\"')) { @@ -345,20 +353,52 @@ static void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring) } } #endif + #if MICROPY_PY_TSTRINGS + if (is_tstring) { + if (vstr_len(&lex->fstring_args) == 0) { + vstr_add_byte(&lex->vstr, '('); + vstr_add_byte(&lex->vstr, '('); + for (size_t q = 0; q < num_quotes; ++q) { + vstr_add_byte(&lex->vstr, quote_char); + } + } + } + #endif + + #if MICROPY_PY_TSTRINGS + size_t tstring_num_interpolations = 0; + size_t end_of_format_index = 0; + size_t nested_formatting_in_tstring = 0; + bool nested_formatting_needs_fstring = false; + #endif while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) { if (is_char(lex, quote_char)) { n_closing += 1; vstr_add_char(&lex->vstr, CUR_CHAR(lex)); + #if MICROPY_PY_TSTRINGS + } else if (is_tstring && is_char(lex, '\n')) { + // handle multi-line t-strings + vstr_add_byte(&lex->vstr, '\\'); + vstr_add_byte(&lex->vstr, 'n'); + #endif } else { n_closing = 0; #if MICROPY_PY_FSTRINGS - while (is_fstring && is_char(lex, '{')) { + while ((is_fstring || is_tstring) && is_char(lex, '{')) { + #if MICROPY_PY_TSTRINGS + if (nested_formatting_in_tstring) { + ++nested_formatting_in_tstring; + break; + } + #endif next_char(lex); if (is_char(lex, '{')) { // "{{" is passed through unchanged to be handled by str.format - vstr_add_byte(&lex->vstr, '{'); + if (!is_tstring) { + vstr_add_byte(&lex->vstr, '{'); + } next_char(lex); } else { // wrap each argument in (), e.g. 
@@ -395,24 +435,102 @@ static void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring) vstr_add_byte(&lex->fstring_args, c); next_char(lex); } + #if MICROPY_PY_TSTRINGS + bool was_debug = false; + #endif if (lex->fstring_args.buf[lex->fstring_args.len - 1] == '=') { // if the last character of the arg was '=', then inject "arg=" before the '{'. // f'{a=}' --> 'a={}'.format(a) vstr_add_strn(&lex->vstr, lex->fstring_args.buf + i, lex->fstring_args.len - i); // remove the trailing '=' lex->fstring_args.len--; + #if MICROPY_PY_TSTRINGS + was_debug = true; + #endif + } + #if MICROPY_PY_TSTRINGS + if (is_tstring) { + // truncate trailing spaces + while (lex->fstring_args.len && unichar_isspace(lex->fstring_args.buf[lex->fstring_args.len - 1])) { + lex->fstring_args.len--; + } + } + #endif + if (lex->fstring_args.len == i) { + // empty format, eg f'{}' + // (should apply to both f-strings and t-strings, needs test) + lex->tok_kind = MP_TOKEN_MALFORMED_FSTRING; } // close the paren-wrapped arg to .format(). vstr_add_byte(&lex->fstring_args, ')'); // comma-separate args to .format(). 
vstr_add_byte(&lex->fstring_args, ','); + #if MICROPY_PY_TSTRINGS + if (is_tstring) { + // start the interpolation part + + // duplicate expression to a string + vstr_add_byte(&lex->fstring_args, quote_char); + size_t nn = lex->fstring_args.len - i - 3; + for (size_t j = 0; j < nn; ++j) { + byte b = lex->fstring_args.buf[i + j]; + if (b == quote_char) { + vstr_add_byte(&lex->fstring_args, '\\'); + } else if (b == '\\') { + vstr_add_byte(&lex->fstring_args, '\\'); + } + vstr_add_byte(&lex->fstring_args, b); + } + vstr_add_byte(&lex->fstring_args, quote_char); + vstr_add_byte(&lex->fstring_args, ','); + + // start next part of string as next __template__ argument + for (size_t q = 0; q < num_quotes; ++q) { + vstr_add_byte(&lex->vstr, quote_char); + } + vstr_add_byte(&lex->vstr, ','); + for (size_t q = 0; q < num_quotes; ++q) { + vstr_add_byte(&lex->vstr, quote_char); + } + + // process conv and format spec + if (is_char(lex, '!')) { + next_char(lex); + vstr_add_byte(&lex->fstring_args, quote_char); + vstr_add_byte(&lex->fstring_args, CUR_CHAR(lex)); + next_char(lex); + vstr_add_byte(&lex->fstring_args, quote_char); + vstr_add_byte(&lex->fstring_args, ','); + } else if (was_debug && !is_char(lex, ':')) { + vstr_add_str(&lex->fstring_args, "'r',"); + } else { + vstr_add_str(&lex->fstring_args, "None,"); + } + + // start format str + if (is_char(lex, ':')) { + next_char(lex); + } + nested_formatting_in_tstring = 1; + end_of_format_index = lex->vstr.len; + } + #endif } vstr_add_byte(&lex->vstr, '{'); goto continue_outer; } #endif - if (is_char(lex, '\\')) { + if (is_tstring && is_char(lex, '\\')) { + // it'll be reparsed as a string + vstr_add_byte(&lex->vstr, '\\'); + if (is_raw) { + vstr_add_byte(&lex->vstr, '\\'); + } else { + next_char(lex); + vstr_add_byte(&lex->vstr, CUR_CHAR(lex)); + } + } else if (is_char(lex, '\\')) { next_char(lex); unichar c = CUR_CHAR(lex); if (is_raw) { @@ -511,10 +629,36 @@ static void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool 
is_fstring) // Character out of range; this raises a generic SyntaxError. lex->tok_kind = MP_TOKEN_INVALID; } + #if MICROPY_PY_TSTRINGS + } else if (is_tstring && nested_formatting_in_tstring && is_char(lex, '}')) { + if (--nested_formatting_in_tstring > 0) { + nested_formatting_needs_fstring = true; + vstr_add_byte(&lex->vstr, CUR_CHAR(lex)); + } else { + // finished the current interpolation + ++tstring_num_interpolations; + if (nested_formatting_needs_fstring) { + vstr_add_byte(&lex->fstring_args, 'f'); + nested_formatting_needs_fstring = false; + } + vstr_add_byte(&lex->fstring_args, quote_char); + vstr_add_strn(&lex->fstring_args, lex->vstr.buf + end_of_format_index + 1, lex->vstr.len - end_of_format_index - 1); + lex->vstr.len = end_of_format_index; + vstr_add_byte(&lex->fstring_args, quote_char); + vstr_add_byte(&lex->fstring_args, ','); + } + #endif } else { // Add the "character" as a byte so that we remain 8-bit clean. // This way, strings are parsed correctly whether or not they contain utf-8 chars. 
vstr_add_byte(&lex->vstr, CUR_CHAR(lex)); + #if MICROPY_PY_TSTRINGS + if (is_tstring && is_char_and(lex, '}', '}')) { + next_char(lex); + } else if (is_tstring && is_char(lex, '}')) { + lex->tok_kind = MP_TOKEN_MALFORMED_FSTRING; + } + #endif } } continue_parsing_string_literal: @@ -529,8 +673,23 @@ static void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring) lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN; } - // cut off the end quotes from the token text - vstr_cut_tail_bytes(&lex->vstr, n_closing); + #if MICROPY_PY_TSTRINGS + if (is_tstring) { + if (nested_formatting_in_tstring > 0) { + lex->tok_kind = MP_TOKEN_MALFORMED_FSTRING; + } + + if (1 + tstring_num_interpolations * 4 > 255) { + // too many arguments for function call, so wrap interpolations in a tuple + vstr_ins_byte(&lex->fstring_args, 0, '('); + vstr_add_byte(&lex->fstring_args, ')'); + } + } else + #endif + { + // cut off the end quotes from the token text + vstr_cut_tail_bytes(&lex->vstr, n_closing); + } } // This function returns whether it has crossed a newline or not. 
@@ -621,11 +780,16 @@ void mp_lexer_to_next(mp_lexer_t *lex) { // MP_TOKEN_END is used to indicate that this is the first string token lex->tok_kind = MP_TOKEN_END; + #if MICROPY_PY_TSTRINGS + bool had_tstring = false; + #endif + // Loop to accumulate string/bytes literals do { // parse type codes bool is_raw = false; bool is_fstring = false; + bool is_tstring = false; mp_token_kind_t kind = MP_TOKEN_STRING; int n_char = 0; if (is_char(lex, 'u')) { @@ -645,11 +809,17 @@ void mp_lexer_to_next(mp_lexer_t *lex) { n_char = 2; } #if MICROPY_PY_FSTRINGS - if (is_char_following(lex, 'f')) { + else if (is_char_following(lex, 'f')) { is_fstring = true; n_char = 2; } #endif + #if MICROPY_PY_TSTRINGS + else if (is_char_following(lex, 't')) { + is_tstring = true; + n_char = 2; + } + #endif } #if MICROPY_PY_FSTRINGS else if (is_char(lex, 'f')) { @@ -661,6 +831,22 @@ void mp_lexer_to_next(mp_lexer_t *lex) { } } #endif + #if MICROPY_PY_TSTRINGS + else if (is_char(lex, 't')) { + is_tstring = true; + n_char = 1; + if (is_char_following(lex, 'r')) { + is_raw = true; + n_char = 2; + } + } + #endif + + #if MICROPY_PY_TSTRINGS + if (is_tstring) { + had_tstring = true; + } + #endif // Set or check token kind if (lex->tok_kind == MP_TOKEN_END) { @@ -679,13 +865,28 @@ void mp_lexer_to_next(mp_lexer_t *lex) { } // Parse the literal - parse_string_literal(lex, is_raw, is_fstring); + parse_string_literal(lex, is_raw, is_fstring, is_tstring); // Skip whitespace so we can check if there's another string following skip_whitespace(lex, true); } while (is_string_or_bytes(lex)); + #if MICROPY_PY_TSTRINGS + if (had_tstring) { + vstr_add_byte(&lex->vstr, ','); + vstr_add_byte(&lex->vstr, ')'); + vstr_add_byte(&lex->vstr, ','); + vstr_ins_strn(&lex->fstring_args, 0, lex->vstr.buf, lex->vstr.len); + if (lex->tok_kind > MP_TOKEN_MALFORMED_FSTRING) { + // next token is __template__ for the function + lex->tok_kind = MP_TOKEN_NAME; + vstr_reset(&lex->vstr); + vstr_add_str(&lex->vstr, "__template__"); + } 
+ } + #endif + #if MICROPY_PY_FSTRINGS if (lex->fstring_args.len) { // If there was an f-string then it's now complete. diff --git a/py/modbuiltins.c b/py/modbuiltins.c index 51cf3137bf..6f085f2df3 100644 --- a/py/modbuiltins.c +++ b/py/modbuiltins.c @@ -599,6 +599,9 @@ MP_DEFINE_CONST_FUN_OBJ_0(mp_builtin_locals_obj, mp_builtin_locals); // These are defined in terms of MicroPython API functions right away MP_DEFINE_CONST_FUN_OBJ_1(mp_builtin_id_obj, mp_obj_id); MP_DEFINE_CONST_FUN_OBJ_1(mp_builtin_len_obj, mp_obj_len); +#if MICROPY_PY_TSTRINGS +static MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(mp_builtin___template___obj, 1, MP_OBJ_FUN_ARGS_MAX, mp_obj_new_template); +#endif static const mp_rom_map_elem_t mp_module_builtins_globals_table[] = { { MP_ROM_QSTR(MP_QSTR___name__), MP_ROM_QSTR(MP_QSTR_builtins) }, @@ -607,6 +610,9 @@ static const mp_rom_map_elem_t mp_module_builtins_globals_table[] = { { MP_ROM_QSTR(MP_QSTR___build_class__), MP_ROM_PTR(&mp_builtin___build_class___obj) }, { MP_ROM_QSTR(MP_QSTR___import__), MP_ROM_PTR(&mp_builtin___import___obj) }, { MP_ROM_QSTR(MP_QSTR___repl_print__), MP_ROM_PTR(&mp_builtin___repl_print___obj) }, + #if MICROPY_PY_TSTRINGS + { MP_ROM_QSTR(MP_QSTR___template__), MP_ROM_PTR(&mp_builtin___template___obj) }, + #endif // built-in types { MP_ROM_QSTR(MP_QSTR_bool), MP_ROM_PTR(&mp_type_bool) }, diff --git a/py/modstring.c b/py/modstring.c new file mode 100644 index 0000000000..26e0c2d83c --- /dev/null +++ b/py/modstring.c @@ -0,0 +1,56 @@ +/* + * This file is part of the MicroPython project, http://micropython.org/ + * + * The MIT License (MIT) + * + * Copyright (c) 2026 Damien P. 
George + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "py/obj.h" + +#if MICROPY_PY_TSTRINGS + +static const mp_rom_map_elem_t mp_module_string_templatelib_globals_table[] = { + { MP_ROM_QSTR(MP_QSTR___name__), MP_ROM_QSTR(MP_QSTR_string_dot_templatelib) }, + { MP_ROM_QSTR(MP_QSTR_Template), MP_ROM_PTR(&mp_type_template) }, + { MP_ROM_QSTR(MP_QSTR_Interpolation), MP_ROM_PTR(&mp_type_interpolation) }, +}; +static MP_DEFINE_CONST_DICT(mp_module_string_templatelib_globals, mp_module_string_templatelib_globals_table); + +static const mp_obj_module_t mp_module_string_templatelib = { + .base = { &mp_type_module }, + .globals = (mp_obj_dict_t *)&mp_module_string_templatelib_globals, +}; + +static const mp_rom_map_elem_t mp_module_string_globals_table[] = { + { MP_ROM_QSTR(MP_QSTR___name__), MP_ROM_QSTR(MP_QSTR_string) }, + { MP_ROM_QSTR(MP_QSTR_templatelib), MP_ROM_PTR(&mp_module_string_templatelib) }, +}; +static MP_DEFINE_CONST_DICT(mp_module_string_globals, mp_module_string_globals_table); + +const mp_obj_module_t mp_module_string = { + .base = { &mp_type_module }, + .globals = (mp_obj_dict_t *)&mp_module_string_globals, +}; + +MP_REGISTER_EXTENSIBLE_MODULE(MP_QSTR_string, mp_module_string); + +#endif // MICROPY_PY_TSTRINGS diff --git a/py/mpconfig.h b/py/mpconfig.h index e5b7b523a7..d0150ec7b9 100644 --- a/py/mpconfig.h +++ b/py/mpconfig.h @@ -1137,7 +1137,7 @@ typedef time_t mp_timestamp_t; // have __init__ methods. Instead, the top-level package's __init__ should // initialise all sub-packages. 
#ifndef MICROPY_MODULE_BUILTIN_SUBPACKAGES -#define MICROPY_MODULE_BUILTIN_SUBPACKAGES (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EVERYTHING) +#define MICROPY_MODULE_BUILTIN_SUBPACKAGES (MICROPY_PY_TSTRINGS || MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EVERYTHING) #endif // Whether to support module-level __getattr__ (see PEP 562) @@ -1322,6 +1322,12 @@ typedef time_t mp_timestamp_t; #define MICROPY_PY_FSTRINGS (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES) #endif +// Support for template strings, t-strings (see PEP 750, Python 3.14+) +// Requires MICROPY_PY_FSTRINGS to be enabled. +#ifndef MICROPY_PY_TSTRINGS +#define MICROPY_PY_TSTRINGS (MICROPY_PY_FSTRINGS && MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_FULL_FEATURES) +#endif + // Support for assignment expressions with := (see PEP 572, Python 3.8+) #ifndef MICROPY_PY_ASSIGN_EXPR #define MICROPY_PY_ASSIGN_EXPR (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES) diff --git a/py/obj.h b/py/obj.h index 8c953d720c..3c9122a69c 100644 --- a/py/obj.h +++ b/py/obj.h @@ -831,6 +831,8 @@ extern const mp_obj_type_t mp_type_NoneType; extern const mp_obj_type_t mp_type_bool; extern const mp_obj_type_t mp_type_int; extern const mp_obj_type_t mp_type_str; +extern const mp_obj_type_t mp_type_template; +extern const mp_obj_type_t mp_type_interpolation; extern const mp_obj_type_t mp_type_bytes; extern const mp_obj_type_t mp_type_bytearray; extern const mp_obj_type_t mp_type_memoryview; @@ -1011,6 +1013,9 @@ mp_obj_t mp_obj_new_bytes_from_vstr(vstr_t *vstr); mp_obj_t mp_obj_new_bytes(const byte *data, size_t len); mp_obj_t mp_obj_new_bytearray(size_t n, const void *items); mp_obj_t mp_obj_new_bytearray_by_ref(size_t n, void *items); +#if MICROPY_PY_TSTRINGS +mp_obj_t mp_obj_new_template(size_t n_args, const mp_obj_t *args); +#endif #if MICROPY_PY_BUILTINS_FLOAT mp_obj_t mp_obj_new_int_from_float(mp_float_t val); mp_obj_t mp_obj_new_complex(mp_float_t real, mp_float_t imag); diff --git a/py/objtemplate.c b/py/objtemplate.c new file mode 100644 index 
0000000000..0fc51a78d9 --- /dev/null +++ b/py/objtemplate.c @@ -0,0 +1,395 @@ +/* + * This file is part of the MicroPython project, http://micropython.org/ + * + * The MIT License (MIT) + * + * Copyright (c) 2025 Koudai Aono + * Copyright (c) 2026 Damien P. George + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "py/runtime.h" + +#if MICROPY_PY_TSTRINGS + +typedef struct _mp_obj_template_t { + mp_obj_base_t base; + mp_obj_t strings; + mp_obj_t interpolations; +} mp_obj_template_t; + +typedef struct _mp_obj_interpolation_t { + mp_obj_base_t base; + mp_obj_t value; + mp_obj_t expression; + mp_obj_t conversion; + mp_obj_t format_spec; +} mp_obj_interpolation_t; + +static mp_obj_t mp_obj_new_interpolation(mp_obj_t value, mp_obj_t expr, mp_obj_t conv, mp_obj_t spec); + +static mp_obj_t mp_obj_template_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_kw, const mp_obj_t *args) { + mp_arg_check_num(n_args, n_kw, 0, MP_OBJ_FUN_ARGS_MAX, false); + + mp_obj_t strings_obj; + mp_obj_t interpolations_obj; + + if (n_args == 0) { + mp_obj_t empty = MP_OBJ_NEW_QSTR(MP_QSTR_); + strings_obj = mp_obj_new_tuple(1, &empty); + interpolations_obj = mp_obj_new_tuple(0, NULL); + } else { + size_t n_interpolations = 0; + size_t n_str_args = 0; + for (size_t i = 0; i < n_args; i++) { + if (mp_obj_is_exact_type(args[i], &mp_type_interpolation)) { + n_interpolations++; + } else if (mp_obj_is_str(args[i])) { + n_str_args++; + } else { + mp_raise_TypeError(MP_ERROR_TEXT("expected str or Interpolation")); + } + } + + if (n_interpolations == 0) { + if (n_str_args == 1) { + strings_obj = mp_obj_new_tuple(1, &args[0]); + } else { + size_t total_len = 0; + for (size_t i = 0; i < n_args; i++) { + size_t str_len; + (void)mp_obj_str_get_data(args[i], &str_len); + total_len += str_len; + } + vstr_t vstr; + vstr_init(&vstr, total_len); + for (size_t i = 0; i < n_args; i++) { + size_t str_len; + const char *str_data = mp_obj_str_get_data(args[i], &str_len); + vstr_add_strn(&vstr, str_data, str_len); + } + mp_obj_t str_items[1]; + str_items[0] = mp_obj_new_str_from_vstr(&vstr); + strings_obj = mp_obj_new_tuple(1, str_items); + } + interpolations_obj = mp_obj_new_tuple(0, NULL); + } else { + size_t n_strings = n_interpolations + 1; + mp_obj_tuple_t *strings_tuple = 
mp_obj_malloc_var(mp_obj_tuple_t, items, mp_obj_t, n_strings, &mp_type_tuple); + mp_obj_tuple_t *interpolations_tuple = mp_obj_malloc_var(mp_obj_tuple_t, items, mp_obj_t, n_interpolations, &mp_type_tuple); + strings_tuple->len = n_strings; + interpolations_tuple->len = n_interpolations; + + size_t string_idx = 0; + size_t interp_idx = 0; + mp_obj_t current_str = MP_OBJ_NULL; + bool current_vstr_active = false; + vstr_t current_vstr = {0}; + + for (size_t i = 0; i <= n_args; i++) { + if (i == n_args || mp_obj_is_exact_type(args[i], &mp_type_interpolation)) { + mp_obj_t out_str; + if (current_vstr_active) { + out_str = mp_obj_new_str_from_vstr(&current_vstr); + current_vstr_active = false; + } else if (current_str != MP_OBJ_NULL) { + out_str = current_str; + } else { + out_str = MP_OBJ_NEW_QSTR(MP_QSTR_); + } + strings_tuple->items[string_idx++] = out_str; + current_str = MP_OBJ_NULL; + if (i < n_args) { + interpolations_tuple->items[interp_idx++] = args[i]; + } + } else { + size_t str_len; + const char *str_data = mp_obj_str_get_data(args[i], &str_len); + if (current_vstr_active) { + vstr_add_strn(&current_vstr, str_data, str_len); + } else if (current_str == MP_OBJ_NULL) { + current_str = args[i]; + } else { + size_t prev_len; + const char *prev_data = mp_obj_str_get_data(current_str, &prev_len); + vstr_init(&current_vstr, prev_len + str_len); + vstr_add_strn(&current_vstr, prev_data, prev_len); + vstr_add_strn(&current_vstr, str_data, str_len); + current_vstr_active = true; + current_str = MP_OBJ_NULL; + } + } + } + + strings_obj = MP_OBJ_FROM_PTR(strings_tuple); + interpolations_obj = MP_OBJ_FROM_PTR(interpolations_tuple); + } + } + + mp_obj_template_t *self = mp_obj_malloc(mp_obj_template_t, type); + self->strings = strings_obj; + self->interpolations = interpolations_obj; + return MP_OBJ_FROM_PTR(self); +} + +static void mp_obj_template_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind_t kind) { + (void)kind; + mp_obj_template_t *self = MP_OBJ_TO_PTR(self_in); +
mp_printf(print, "%q(%q=", MP_QSTR_Template, MP_QSTR_strings); + mp_obj_print_helper(print, self->strings, PRINT_REPR); + mp_printf(print, ", %q=", MP_QSTR_interpolations); + mp_obj_print_helper(print, self->interpolations, PRINT_REPR); + mp_print_str(print, ")"); +} + +static mp_obj_t mp_obj_template_binary_op(mp_binary_op_t op, mp_obj_t lhs_in, mp_obj_t rhs_in) { + mp_obj_template_t *lhs = MP_OBJ_TO_PTR(lhs_in); + + switch (op) { + case MP_BINARY_OP_ADD: { + if (!mp_obj_is_exact_type(rhs_in, &mp_type_template)) { + return MP_OBJ_NULL; // op not supported + } + + mp_obj_template_t *rhs = MP_OBJ_TO_PTR(rhs_in); + + mp_obj_tuple_t *lhs_strings = MP_OBJ_TO_PTR(lhs->strings); + mp_obj_tuple_t *lhs_interps = MP_OBJ_TO_PTR(lhs->interpolations); + mp_obj_tuple_t *rhs_strings = MP_OBJ_TO_PTR(rhs->strings); + mp_obj_tuple_t *rhs_interps = MP_OBJ_TO_PTR(rhs->interpolations); + + size_t new_strings_len = lhs_strings->len + rhs_strings->len - 1; + size_t new_interps_len = lhs_interps->len + rhs_interps->len; + + // Create tuples directly to avoid GC issues. + mp_obj_tuple_t *new_strings_tuple = mp_obj_malloc_var(mp_obj_tuple_t, items, mp_obj_t, new_strings_len, &mp_type_tuple); + mp_obj_tuple_t *new_interps_tuple = mp_obj_malloc_var(mp_obj_tuple_t, items, mp_obj_t, new_interps_len, &mp_type_tuple); + new_strings_tuple->len = new_strings_len; + new_interps_tuple->len = new_interps_len; + + // Copy all but the last string from lhs. + for (size_t i = 0; i < lhs_strings->len - 1; i++) { + new_strings_tuple->items[i] = lhs_strings->items[i]; + } + + // Merge last string from lhs with first string from rhs. 
+ size_t lhs_last_len, rhs_first_len; + const char *lhs_last_str = mp_obj_str_get_data(lhs_strings->items[lhs_strings->len - 1], &lhs_last_len); + const char *rhs_first_str = mp_obj_str_get_data(rhs_strings->items[0], &rhs_first_len); + + vstr_t vstr; + vstr_init(&vstr, lhs_last_len + rhs_first_len); + vstr_add_strn(&vstr, lhs_last_str, lhs_last_len); + vstr_add_strn(&vstr, rhs_first_str, rhs_first_len); + new_strings_tuple->items[lhs_strings->len - 1] = mp_obj_new_str_from_vstr(&vstr); + + // Copy remaining strings from rhs. + for (size_t i = 1; i < rhs_strings->len; i++) { + new_strings_tuple->items[lhs_strings->len - 1 + i] = rhs_strings->items[i]; + } + + // Copy interpolations from both sides. + for (size_t i = 0; i < lhs_interps->len; i++) { + new_interps_tuple->items[i] = lhs_interps->items[i]; + } + + for (size_t i = 0; i < rhs_interps->len; i++) { + new_interps_tuple->items[lhs_interps->len + i] = rhs_interps->items[i]; + } + + mp_obj_template_t *result = mp_obj_malloc(mp_obj_template_t, &mp_type_template); + result->strings = MP_OBJ_FROM_PTR(new_strings_tuple); + result->interpolations = MP_OBJ_FROM_PTR(new_interps_tuple); + return MP_OBJ_FROM_PTR(result); + } + + default: + return MP_OBJ_NULL; // op not supported + } +} + +static void mp_obj_template_attr(mp_obj_t self_in, qstr attr, mp_obj_t *dest) { + mp_obj_template_t *self = MP_OBJ_TO_PTR(self_in); + + if (dest[0] == MP_OBJ_NULL) { + // Load attribute. 
+ if (attr == MP_QSTR_strings) { + dest[0] = self->strings; + } else if (attr == MP_QSTR_interpolations) { + dest[0] = self->interpolations; + } else if (attr == MP_QSTR_values) { + mp_obj_tuple_t *interps = MP_OBJ_TO_PTR(self->interpolations); + mp_obj_tuple_t *values_tuple = MP_OBJ_TO_PTR(mp_obj_new_tuple(interps->len, NULL)); + for (size_t i = 0; i < interps->len; i++) { + mp_obj_interpolation_t *interp = MP_OBJ_TO_PTR(interps->items[i]); + values_tuple->items[i] = interp->value; + } + dest[0] = MP_OBJ_FROM_PTR(values_tuple); + } + } +} + +typedef struct _mp_obj_template_iter_t { + mp_obj_base_t base; + mp_fun_1_t iternext; + mp_obj_t template; + size_t index; +} mp_obj_template_iter_t; + +static mp_obj_t template_iter_iternext(mp_obj_t self_in) { + mp_obj_template_iter_t *self = MP_OBJ_TO_PTR(self_in); + mp_obj_template_t *tmpl = MP_OBJ_TO_PTR(self->template); + mp_obj_tuple_t *strings = MP_OBJ_TO_PTR(tmpl->strings); + mp_obj_tuple_t *interps = MP_OBJ_TO_PTR(tmpl->interpolations); + + while (self->index < strings->len + interps->len) { + if ((self->index & 1) == 0) { + // A string. + mp_obj_t str_obj = strings->items[self->index++ / 2]; + size_t str_len; + mp_obj_str_get_data(str_obj, &str_len); + if (str_len > 0) { + return str_obj; + } + } else { + // An interpolation. 
+ return interps->items[self->index++ / 2]; + } + } + + return MP_OBJ_STOP_ITERATION; +} + +static mp_obj_t mp_obj_template_iter(mp_obj_t self_in, mp_obj_iter_buf_t *iter_buf) { + assert(sizeof(mp_obj_template_iter_t) <= sizeof(mp_obj_iter_buf_t)); + mp_obj_template_iter_t *iter = (mp_obj_template_iter_t *)iter_buf; + iter->base.type = &mp_type_polymorph_iter; + iter->iternext = template_iter_iternext; + iter->template = self_in; + iter->index = 0; + return MP_OBJ_FROM_PTR(iter); +} + +MP_DEFINE_CONST_OBJ_TYPE( + mp_type_template, + MP_QSTR_Template, + MP_TYPE_FLAG_NONE, + make_new, mp_obj_template_make_new, + print, mp_obj_template_print, + binary_op, mp_obj_template_binary_op, + attr, mp_obj_template_attr, + iter, mp_obj_template_iter + ); + +mp_obj_t mp_obj_new_template(size_t n_args, const mp_obj_t *args) { + mp_obj_template_t *o = mp_obj_malloc(mp_obj_template_t, &mp_type_template); + o->strings = args[0]; + if (n_args == 2) { + // Unpack interpolations from second argument (which is a tuple). + mp_obj_t *iargs; + mp_obj_get_array(args[1], &n_args, &iargs); + args = iargs; + } else { + // Unpack interpolations directly from arguments. 
+ --n_args; + ++args; + } + size_t n_interpolations = n_args / 4; + mp_obj_tuple_t *interpolations = MP_OBJ_TO_PTR(mp_obj_new_tuple(n_interpolations, NULL)); + for (size_t i = 0; i < n_interpolations; ++i) { + interpolations->items[i] = mp_obj_new_interpolation(args[i * 4], args[i * 4 + 1], args[i * 4 + 2], args[i * 4 + 3]); + } + o->interpolations = MP_OBJ_FROM_PTR(interpolations); + return MP_OBJ_FROM_PTR(o); +} + +///////////////////////////////////////////////////////////////// + +static mp_obj_t mp_obj_interpolation_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_kw, const mp_obj_t *all_args) { + enum { ARG_value, ARG_expression, ARG_conversion, ARG_format_spec }; + static const mp_arg_t allowed_args[] = { + { MP_QSTR_value, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_obj = MP_OBJ_NULL} }, + { MP_QSTR_expression, MP_ARG_OBJ, {.u_rom_obj = MP_ROM_QSTR(MP_QSTR_)} }, + { MP_QSTR_conversion, MP_ARG_OBJ, {.u_rom_obj = MP_ROM_NONE} }, + { MP_QSTR_format_spec, MP_ARG_OBJ, {.u_rom_obj = MP_ROM_QSTR(MP_QSTR_)} }, + }; + + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all_kw_array(n_args, n_kw, all_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + + mp_obj_interpolation_t *self = mp_obj_malloc(mp_obj_interpolation_t, &mp_type_interpolation); + self->value = args[ARG_value].u_obj; + self->expression = args[ARG_expression].u_obj; + self->conversion = args[ARG_conversion].u_obj; + self->format_spec = args[ARG_format_spec].u_obj; + + return MP_OBJ_FROM_PTR(self); +} + +static void mp_obj_interpolation_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind_t kind) { + (void)kind; + mp_obj_interpolation_t *self = MP_OBJ_TO_PTR(self_in); + mp_printf(print, "%q(", MP_QSTR_Interpolation); + mp_obj_print_helper(print, self->value, PRINT_REPR); + mp_print_str(print, ", "); + mp_obj_print_helper(print, self->expression, PRINT_REPR); + mp_print_str(print, ", "); + mp_obj_print_helper(print, self->conversion, PRINT_REPR); + mp_print_str(print, ", 
"); + mp_obj_print_helper(print, self->format_spec, PRINT_REPR); + mp_print_str(print, ")"); +} + +static void mp_obj_interpolation_attr(mp_obj_t self_in, qstr attr, mp_obj_t *dest) { + mp_obj_interpolation_t *self = MP_OBJ_TO_PTR(self_in); + + if (dest[0] == MP_OBJ_NULL) { + // load attribute + if (attr == MP_QSTR_value) { + dest[0] = self->value; + } else if (attr == MP_QSTR_expression) { + dest[0] = self->expression; + } else if (attr == MP_QSTR_conversion) { + dest[0] = self->conversion; + } else if (attr == MP_QSTR_format_spec) { + dest[0] = self->format_spec; + } + } +} + +static mp_obj_t mp_obj_new_interpolation(mp_obj_t value, mp_obj_t expression, mp_obj_t conversion, mp_obj_t format_spec) { + mp_obj_interpolation_t *o = mp_obj_malloc(mp_obj_interpolation_t, &mp_type_interpolation); + o->value = value; + o->expression = expression; + o->conversion = conversion; + o->format_spec = format_spec; + return MP_OBJ_FROM_PTR(o); +} + +MP_DEFINE_CONST_OBJ_TYPE( + mp_type_interpolation, + MP_QSTR_Interpolation, + MP_TYPE_FLAG_NONE, + make_new, mp_obj_interpolation_make_new, + print, mp_obj_interpolation_print, + attr, mp_obj_interpolation_attr + ); + +#endif // MICROPY_PY_TSTRINGS diff --git a/py/py.cmake b/py/py.cmake index ec2a5d832d..8b3c857ef4 100644 --- a/py/py.cmake +++ b/py/py.cmake @@ -49,6 +49,7 @@ set(MICROPY_SOURCE_PY ${MICROPY_PY_DIR}/modio.c ${MICROPY_PY_DIR}/modmath.c ${MICROPY_PY_DIR}/modmicropython.c + ${MICROPY_PY_DIR}/modstring.c ${MICROPY_PY_DIR}/modstruct.c ${MICROPY_PY_DIR}/modsys.c ${MICROPY_PY_DIR}/modthread.c @@ -106,6 +107,7 @@ set(MICROPY_SOURCE_PY ${MICROPY_PY_DIR}/objstr.c ${MICROPY_PY_DIR}/objstringio.c ${MICROPY_PY_DIR}/objstrunicode.c + ${MICROPY_PY_DIR}/objtemplate.c ${MICROPY_PY_DIR}/objtuple.c ${MICROPY_PY_DIR}/objtype.c ${MICROPY_PY_DIR}/objzip.c diff --git a/py/py.mk b/py/py.mk index e7716a1adc..a8b50b8d24 100644 --- a/py/py.mk +++ b/py/py.mk @@ -180,6 +180,7 @@ PY_CORE_O_BASENAME = $(addprefix py/,\ objstr.o \ objstrunicode.o \ 
objstringio.o \ + objtemplate.o \ objtuple.o \ objtype.o \ objzip.o \ @@ -198,6 +199,7 @@ PY_CORE_O_BASENAME = $(addprefix py/,\ modmath.o \ modcmath.o \ modmicropython.o \ + modstring.o \ modstruct.o \ modsys.o \ moderrno.o \ diff --git a/py/qstrdefs.h b/py/qstrdefs.h index 0b50d279f9..96d4ece614 100644 --- a/py/qstrdefs.h +++ b/py/qstrdefs.h @@ -76,3 +76,7 @@ Q(/rom/lib) #if MICROPY_ENABLE_PYSTACK Q(pystack exhausted) #endif + +#if MICROPY_PY_TSTRINGS +Q(string.templatelib) +#endif diff --git a/tests/ports/unix/extra_coverage.py.exp b/tests/ports/unix/extra_coverage.py.exp index d11e5ee6f4..3a46994c23 100644 --- a/tests/ports/unix/extra_coverage.py.exp +++ b/tests/ports/unix/extra_coverage.py.exp @@ -73,9 +73,9 @@ example_package ffi framebuf gc hashlib heapq io json machine marshal math os platform random re -select socket struct sys -termios time tls uctypes -vfs websocket +select socket string struct +sys termios time tls +uctypes vfs websocket me micropython machine marshal math