File: //snap/google-cloud-cli/394/platform/gsutil/third_party/pyparsing/examples/unicode_denormalizer.py
# unicode_denormalizer.py
#
# Demonstration of pyparsing's transform_string() method, to
# convert identifiers in Python source code to equivalent Unicode
# characters. Python's compiler automatically normalizes Unicode
# characters back to their ASCII equivalents, so that identifiers may
# be rewritten using other Unicode characters, and normalize back to
# the same identifier. For instance, Python treats "print" and "𝕡𝓻ᵢ𝓃𝘁"
# and "𝖕𝒓𝗂𝑛ᵗ" all as the same identifier.
#
# The converter must take care to *only* transform identifiers -
# Python keywords must always be represented in base ASCII form. To
# skip over keywords, they are added to the parser/transformer, but
# contain no transforming parse action.
#
# The converter also detects identifiers in placeholders within f-strings.
#
# Copyright 2022, by Paul McGuire
#
import keyword
import random
import unicodedata
import pyparsing as pp
ppu = pp.pyparsing_unicode

# the ASCII characters that may appear in an identifier, plus the
# middle dot (which Python also accepts inside an identifier)
_· = "_·"
ident_chars = "".join(
    [
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
        "0123456789",
        _·,
    ]
)

# For each of those characters, collect a list of every identifier
# character in the Basic Multilingual Plane whose NFKC normal form
# is that character.
ident_char_map = {c: list() for c in ident_chars}
for ch in ppu.BMP.identbodychars:
    normal = unicodedata.normalize("NFKC", ch)
    if normal not in ident_char_map:
        # normalizes to something other than a single tracked character
        continue
    ident_char_map[normal].append(ch)
# Ligatures and other compatibility characters that NFKC-normalize back
# to a short run of ASCII letters. Each key is the ASCII run; each value
# is the pool that random.choice() draws a replacement from. The
# single-character form is listed twice so that it is picked more often
# than the plain ASCII spelling, which is always the last element.
#
# The characters are written as \u escapes on purpose: a literal glyph is
# silently turned into its ASCII spelling by any tool that applies
# compatibility normalization to this file, which would leave the table
# mapping ASCII to ASCII.
ligature_map = {
    # upper-case digraphs
    'IJ': ('\u0132', '\u0132', 'IJ'),  # LATIN CAPITAL LIGATURE IJ
    'LJ': ('\u01c7', '\u01c7', 'LJ'),  # LATIN CAPITAL LETTER LJ
    'NJ': ('\u01ca', '\u01ca', 'NJ'),  # LATIN CAPITAL LETTER NJ
    'DZ': ('\u01f1', '\u01f1', 'DZ'),  # LATIN CAPITAL LETTER DZ
    # upper-case Roman numerals
    'II': ('\u2161', '\u2161', 'II'),  # ROMAN NUMERAL TWO
    'IV': ('\u2163', '\u2163', 'IV'),  # ROMAN NUMERAL FOUR
    'VI': ('\u2165', '\u2165', 'VI'),  # ROMAN NUMERAL SIX
    'IX': ('\u2168', '\u2168', 'IX'),  # ROMAN NUMERAL NINE
    'XI': ('\u216a', '\u216a', 'XI'),  # ROMAN NUMERAL ELEVEN
    # f-ligatures; the three-letter runs can also be written with a
    # two-letter ligature plus one plain letter
    'ffl': ('\ufb04', '\ufb04', 'f\ufb02', '\ufb00l', 'ffl'),
    'ffi': ('\ufb03', '\ufb03', 'f\ufb01', '\ufb00i', 'ffi'),
    'ff': ('\ufb00', '\ufb00', 'ff'),  # LATIN SMALL LIGATURE FF
    'fi': ('\ufb01', '\ufb01', 'fi'),  # LATIN SMALL LIGATURE FI
    'fl': ('\ufb02', '\ufb02', 'fl'),  # LATIN SMALL LIGATURE FL
    # lower-case digraphs
    'ij': ('\u0133', '\u0133', 'ij'),  # LATIN SMALL LIGATURE IJ
    'lj': ('\u01c9', '\u01c9', 'lj'),  # LATIN SMALL LETTER LJ
    'nj': ('\u01cc', '\u01cc', 'nj'),  # LATIN SMALL LETTER NJ
    'dz': ('\u01f3', '\u01f3', 'dz'),  # LATIN SMALL LETTER DZ
    # lower-case Roman numerals
    'ii': ('\u2171', '\u2171', 'ii'),  # SMALL ROMAN NUMERAL TWO
    'iv': ('\u2173', '\u2173', 'iv'),  # SMALL ROMAN NUMERAL FOUR
    'vi': ('\u2175', '\u2175', 'vi'),  # SMALL ROMAN NUMERAL SIX
    'ix': ('\u2178', '\u2178', 'ix'),  # SMALL ROMAN NUMERAL NINE
    'xi': ('\u217a', '\u217a', 'xi'),  # SMALL ROMAN NUMERAL ELEVEN
}
# expression matching any key of ligature_map; each match is replaced
# by a randomly chosen entry from that key's pool
ligature_transformer = pp.one_of(ligature_map)
ligature_transformer.add_parse_action(
    lambda toks: random.choice(ligature_map[toks[0]])
)
def make_mixed_font(t):
    """Parse action: respell a matched identifier with look-alike characters.

    ``t[0]`` is the identifier text. The first character and the rest are
    handled separately: a leading underscore is kept as ASCII, any other
    leading character is swapped for a random entry of ``ident_char_map``.
    The rest first has ligatures substituted, then each remaining
    character is swapped the same way. Characters with no entry in
    ``ident_char_map`` (such as a ligature just inserted) are kept as-is.

    Returns the rewritten identifier as a single string.
    """
    word = t[0]
    lead, remainder = word[0], word[1:]

    # a leading '_' must be written using the ASCII character '_'
    if lead == '_':
        pieces = ['_']
    else:
        pieces = [random.choice(ident_char_map.get(lead, lead))]

    # collapse letter runs into ligatures before the per-character pass
    remainder = ligature_transformer.transform_string(remainder)
    for c in remainder:
        # falling back to c itself makes random.choice() return c unchanged
        pieces.append(random.choice(ident_char_map.get(c, c)))

    return ''.join(pieces)
# define a pyparsing expression to match any identifier; add a parse
# action to convert to mixed Unicode characters.
# pp.pyparsing_common.identifier is a single expression object shared by
# everything that uses pyparsing in this process, so the parse action is
# attached to a copy instead of being added to the shared object itself.
identifier = pp.pyparsing_common.identifier.copy()
identifier.add_parse_action(make_mixed_font)
# Match a Python string literal, optionally preceded by an f/F prefix.
# The prefix (when present) and the literal itself are exposed under
# separate results names for use by the parse action.
python_quoted_string = (
    pp.Opt(pp.Char("fF").set_results_name("f_string_prefix"))
    + pp.python_quoted_string.set_results_name("quoted_string_body")
)
def mix_fstring_expressions(t):
    """Parse action: denormalize identifiers inside f-string fields.

    Does nothing (returns None, leaving the tokens as parsed) when the
    matched literal has no f/F prefix. Otherwise every ``{...}`` span in
    the string body has its contents run through the module-level
    ``transformer`` and the prefix plus the rewritten body is returned.

    NOTE(review): the whole text between the braces is transformed, which
    would include any conversion flag or format spec - confirm that is
    acceptable for the inputs this is used on.
    """
    prefix = t.f_string_prefix
    if not prefix:
        return None

    def remix_field(tt):
        # tt[0] is the field text without its braces; put them back
        return "{" + transformer.transform_string(tt[0]) + "}"

    # expression and transformer for the embedded f-string field
    # expressions
    fstring_arg = pp.QuotedString("{", end_quote_char="}")
    fstring_arg.add_parse_action(remix_field)

    return prefix + fstring_arg.transform_string(t.quoted_string_body)
# f-strings get their placeholder identifiers transformed; other string
# literals pass through the parse action unchanged
python_quoted_string.add_parse_action(mix_fstring_expressions)

# Keywords (and soft keywords, on Python versions that define them) are
# matched ahead of identifiers and carry no parse action, so they stay
# in their original ASCII.
any_keyword = pp.one_of(
    [*keyword.kwlist, *getattr(keyword, "softkwlist", [])],
    as_keyword=True,
)

# Alternatives are tried in this order, so string literals and keywords
# are consumed before the identifier rule can see them.
transformer = (
    python_quoted_string
    | any_keyword
    | identifier
)
def demo():
    """Show the transformer on a small sample, then on a stdlib module.

    Prints the denormalized form of an embedded "hello world" snippet,
    compiles and executes that output to show Python still accepts the
    rewritten identifiers, and finally prints the denormalized source of
    ``unittest.util``.
    """
    import inspect
    import textwrap
    import unittest.util as lib_module

    # The sample needs its own block indentation inside the literal:
    # textwrap.dedent() only removes the margin common to every line, and
    # without nested indentation the snippet cannot be compiled.
    hello_source = textwrap.dedent("""
    def hello():
        try:
            hello_ = "Hello"
            world_ = "World"
            print(f"{hello_}, {world_}!")
        except TypeError as exc:
            print("failed: {}".format(exc))
    if __name__ == "__main__":
        hello()
    """)

    # use transformer to generate code with denormalized identifiers
    transformed = transformer.transform_string(hello_source)
    print(transformed)

    # does it really work? compile the transformed code and run it!
    # exec() is given no namespaces, so the sample resolves __name__ from
    # this module's globals: its guard calls hello() only when this file
    # is itself being run as a script.
    code = compile(transformed, "inline source", mode="exec")
    exec(code)

    # pick some code from the stdlib
    source = inspect.getsource(lib_module)
    transformed = transformer.transform_string(source)
    print()
    print(transformed)
# run the demonstration only when this file is executed as a script
if __name__ == '__main__':
    demo()