Add support for \u and \U escapes, but not \N (with explanatory comment)

2026-03-18 14:50:18 +01:00 · 2014-06-07 08:37:27 +10:00
parent 231031ac5f
commit e924659b85
1 changed files with 17 additions and 4 deletions
--- a/py/lexer.c
+++ b/py/lexer.c
@@ -502,19 +502,32 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
                            case 'v': c = 0x0b; break;
                            case 'f': c = 0x0c; break;
                            case 'r': c = 0x0d; break;
+                            case 'u':
+                            case 'U':
+				if (is_bytes) {
+				    // b'\u1234' == b'\\u1234'
+				    vstr_add_char(&lex->vstr, '\\');
+				    break;
+				}
+				// Otherwise fall through.
                            case 'x':
                            {
                                uint num = 0;
-                                if (!get_hex(lex, 2, &num)) {
+                                if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) {
                                    // TODO error message
                                    assert(0);
                                }
                                c = num;
                                break;
                            }
-                            case 'N': break; // TODO \N{name} only in strings
-                            case 'u': break; // TODO \uxxxx only in strings
-                            case 'U': break; // TODO \Uxxxxxxxx only in strings
+                            case 'N':
+                                // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
+                                // entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
+                                // 3MB of text; even gzip-compressed and with minimal structure, it'll take
+                                // roughly half a meg of storage. This form of Unicode escape may be added
+                                // later on, but it's definitely not a priority right now. -- CJA 20140607
+                                assert(!"Unicode name escapes not supported");
+			        break;
                            default:
                                if (c >= '0' && c <= '7') {
                                    // Octal sequence, 1-3 chars