From 44b0d5cff846ba487c526ed95be1b3d1cd3d762a Mon Sep 17 00:00:00 2001 From: Chris Angelico Date: Sun, 8 Jun 2014 06:32:44 +1000 Subject: [PATCH] Use utf8_get/next_char in building up a string's repr --- py/objstr.c | 59 +++++++++++++++++++++++------------------------------ 1 file changed, 26 insertions(+), 33 deletions(-) diff --git a/py/objstr.c b/py/objstr.c index c9fc452f77..8ace52b8ee 100644 --- a/py/objstr.c +++ b/py/objstr.c @@ -83,40 +83,33 @@ void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *e quote_char = '"'; } print(env, "%c", quote_char); - for (const byte *s = str_data, *top = str_data + str_len; s < top; s++) { - if (*s == quote_char) { - print(env, "\\%c", quote_char); - } else if (*s == '\\') { - print(env, "\\\\"); - } else if (32 <= *s && *s <= 126) { - print(env, "%c", *s); - } else if (*s == '\n') { - print(env, "\\n"); - } else if (*s == '\r') { - print(env, "\\r"); - } else if (*s == '\t') { - print(env, "\\t"); - } else if (*s == '\x7f') { - print(env, "\\x7f"); - } else if (is_bytes) { - print(env, "\\x%02x", *s); + const char *s = (const char *)str_data, *top = (const char *)str_data + str_len; + while (s < top) { + unichar ch; + if (is_bytes) { + ch = *(unsigned char *)s++; // Don't sign-extend bytes } else { - // Non-ASCII character. Decode UTF-8. - machine_int_t ord = *s++ & 0x7F; - for (machine_int_t mask = 0x40; ord & mask; mask >>= 1) { - ord &= ~mask; - } - while (UTF8_IS_CONT(*s)) { - ord = (ord << 6) | (*s++ & 0x3F); - } - --s; // s will be incremented by the main loop - if (ord < 0x100) { - print(env, "\\x%02x", ord); - } else if (ord < 0x10000) { - print(env, "\\u%04x", ord); - } else { - print(env, "\\U%08x", ord); - } + ch = utf8_get_char(s); + s = utf8_next_char(s); + } + if (ch == quote_char) { + print(env, "\\%c", quote_char); + } else if (ch == '\\') { + print(env, "\\\\"); + } else if (32 <= ch && ch <= 126) { + print(env, "%c", ch); + } else if (ch == '\n') { + print(env, "\\n"); + } else if (ch == '\r') { + print(env, "\\r"); + } else if (ch == '\t') { + print(env, "\\t"); + } else if (ch < 0x100) { + print(env, "\\x%02x", ch); + } else if (ch < 0x10000) { + print(env, "\\u%04x", ch); + } else { + print(env, "\\U%08x", ch); } } print(env, "%c", quote_char);