From 44b0d5cff846ba487c526ed95be1b3d1cd3d762a Mon Sep 17 00:00:00 2001
From: Chris Angelico <rosuav@gmail.com>
Date: Sun, 8 Jun 2014 06:32:44 +1000
Subject: [PATCH] Use utf8_get/next_char in building up a string's repr

---
 py/objstr.c | 59 +++++++++++++++++++++++------------------------------
 1 file changed, 26 insertions(+), 33 deletions(-)

diff --git a/py/objstr.c b/py/objstr.c
index c9fc452f77..8ace52b8ee 100644
--- a/py/objstr.c
+++ b/py/objstr.c
@@ -83,40 +83,33 @@ void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *e
         quote_char = '"';
     }
     print(env, "%c", quote_char);
-    for (const byte *s = str_data, *top = str_data + str_len; s < top; s++) {
-        if (*s == quote_char) {
-            print(env, "\\%c", quote_char);
-        } else if (*s == '\\') {
-            print(env, "\\\\");
-        } else if (32 <= *s && *s <= 126) {
-            print(env, "%c", *s);
-        } else if (*s == '\n') {
-            print(env, "\\n");
-        } else if (*s == '\r') {
-            print(env, "\\r");
-        } else if (*s == '\t') {
-            print(env, "\\t");
-        } else if (*s == '\x7f') {
-            print(env, "\\x7f");
-        } else if (is_bytes) {
-            print(env, "\\x%02x", *s);
+    const char *s = (const char *)str_data, *top = (const char *)str_data + str_len;
+    while (s < top) {
+        unichar ch;
+        if (is_bytes) {
+            ch = *(unsigned char *)s++; // Don't sign-extend bytes
         } else {
-            // Non-ASCII character. Decode UTF-8.
-            machine_int_t ord = *s++ & 0x7F;
-            for (machine_int_t mask = 0x40; ord & mask; mask >>= 1) {
-                ord &= ~mask;
-            }
-            while (UTF8_IS_CONT(*s)) {
-                ord = (ord << 6) | (*s++ & 0x3F);
-            }
-            --s; // s will be incremented by the main loop
-            if (ord < 0x100) {
-                print(env, "\\x%02x", ord);
-            } else if (ord < 0x10000) {
-                print(env, "\\u%04x", ord);
-            } else {
-                print(env, "\\U%08x", ord);
-            }
+            ch = utf8_get_char(s);
+            s = utf8_next_char(s);
+        }
+        if (ch == quote_char) {
+            print(env, "\\%c", quote_char);
+        } else if (ch == '\\') {
+            print(env, "\\\\");
+        } else if (32 <= ch && ch <= 126) {
+            print(env, "%c", ch);
+        } else if (ch == '\n') {
+            print(env, "\\n");
+        } else if (ch == '\r') {
+            print(env, "\\r");
+        } else if (ch == '\t') {
+            print(env, "\\t");
+        } else if (ch < 0x100) {
+            print(env, "\\x%02x", ch);
+        } else if (ch < 0x10000) {
+            print(env, "\\u%04x", ch);
+        } else {
+            print(env, "\\U%08x", ch);
         }
     }
     print(env, "%c", quote_char);