Files
micropython/tests/basics/lexer.py
Damien George 617c7dba3b py/lexer: Use null char as lexer EOF sentinel.
The null byte cannot exist in source code (per CPython), so use it to
indicate the end of the input stream (instead of `(mp_uint_t)-1`).  This
allows the cache chars (chr0/1/2 and their saved versions) to be 8-bit
bytes, making it clear that they are not `unichar` values.  It also saves a
bit of memory in the `mp_lexer_t` data structure.  (And in a future commit
allows the saved cache chars to be eliminated entirely by storing them in
a vstr instead.)

In order to keep code size down, the frequently used `chr0` is still of
type `uint32_t`.  Having it 32-bit means that machine instructions to load
it are smaller (it adds about +80 bytes to Thumb code if `chr0` is changed
to `uint8_t`).

Also add tests for invalid bytes in the input stream to make sure there are
no regressions in this regard.

Signed-off-by: Damien George <damien@micropython.org>
2026-02-04 23:19:09 +11:00

102 lines
1.9 KiB
Python

# test the lexer

# Feature check: merely naming `eval` and `exec` raises NameError on a build
# that does not provide them.  In that case report SKIP and stop, because
# every check below depends on both builtins.
try:
    eval
    exec
except NameError:
    print("SKIP")
    raise SystemExit
# `__debug__` is a special symbol; show the type of the value it evaluates to
debug_symbol_type = type(__debug__)
print(debug_symbol_type)
# short input
# Statements: empty source, bare line terminators / whitespace, and code that
# follows a leading \n, \r or \r\n.
for statement_source in (
    "",
    "\n",
    "\n\n",
    "\r",
    "\r\r",
    "\t",
    "\r\n",
    "\nprint(1)",
    "\rprint(2)",
    "\r\nprint(3)",
    "\n5",
    "\r6",
    "\r\n7",
):
    exec(statement_source)

# Expressions: 1-, 2- and 3-digit integers with no terminator, then with a
# trailing \n, then with a trailing \r.
for expression_source in (
    "1",
    "12",
    "123",
    "1\n",
    "12\n",
    "123\n",
    "1\r",
    "12\r",
    "123\r",
):
    print(eval(expression_source))
# line continuation
# A backslash followed by \r, \n or \r\n joins two adjacent string literals,
# first with spaces around the continuation and then without.
for continued_source in (
    "'123' \\\r '456'",
    "'123' \\\n '456'",
    "'123' \\\r\n '456'",
    "'123'\\\r'456'",
    "'123'\\\n'456'",
    "'123'\\\r\n'456'",
):
    print(eval(continued_source))
# backslash used to escape a line-break in a string
# (the backslash and the physical newline after it are both dropped from the
# literal, so this prints "ab"; the two-line layout below is the test itself
# and must not be reformatted)
print('a\
b')
# lots of indentation
# `a` nests 15 `if` levels so the lexer has to track a deep stack of
# indentation levels; it prints `x` only when `x` is truthy.
def a(x):
    if x:
        if x:
            if x:
                if x:
                    if x:
                        if x:
                            if x:
                                if x:
                                    if x:
                                        if x:
                                            if x:
                                                if x:
                                                    if x:
                                                        if x:
                                                            if x:
                                                                print(x)


a(1)
# badly formed hex escape sequences
# Each literal has one hex digit fewer than its escape requires
# (\x needs 2, \u needs 4, \U needs 8), so compiling it must raise SyntaxError.
try:
    exec(r"'\x0'")
except SyntaxError:
    print("SyntaxError")

try:
    exec(r"b'\x0'")
except SyntaxError:
    print("SyntaxError")

try:
    exec(r"'\u000'")
except SyntaxError:
    print("SyntaxError")

try:
    exec(r"'\U0000000'")
except SyntaxError:
    print("SyntaxError")
# Properly formed integer literals
# (a literal made only of zeros is accepted and evaluates to 0)
print(eval("00"))

# badly formed integer literals
# (a leading zero on a non-zero decimal literal must be rejected)
try:
    eval("01")
except SyntaxError:
    print("SyntaxError")
# Bytes 0-8 inclusive are not allowed in input stream.
# Earlier CPython (eg 3.10.12) raises ValueError, later CPython (eg 3.11.14) raises SyntaxError.
# Both exception types are reported as "SyntaxError" so the output is the same
# on either version.  The range stops at 9 (exclusive) because byte 9 is TAB,
# which is valid whitespace and would evaluate successfully.
for invalid_byte_value in range(0, 9):
    try:
        print(eval(b"123" + bytes([invalid_byte_value])))
    except (ValueError, SyntaxError):
        print("byte {}: SyntaxError".format(invalid_byte_value))