HEX
Server: Apache/2.4.65 (Ubuntu)
System: Linux ielts-store-v2 6.8.0-1036-gcp #38~22.04.1-Ubuntu SMP Thu Aug 14 01:19:18 UTC 2025 x86_64
User: root (0)
PHP: 7.2.34-54+ubuntu20.04.1+deb.sury.org+1
Disabled: pcntl_alarm,pcntl_fork,pcntl_waitpid,pcntl_wait,pcntl_wifexited,pcntl_wifstopped,pcntl_wifsignaled,pcntl_wifcontinued,pcntl_wexitstatus,pcntl_wtermsig,pcntl_wstopsig,pcntl_signal,pcntl_signal_get_handler,pcntl_signal_dispatch,pcntl_get_last_error,pcntl_strerror,pcntl_sigprocmask,pcntl_sigwaitinfo,pcntl_sigtimedwait,pcntl_exec,pcntl_getpriority,pcntl_setpriority,pcntl_async_signals,
Upload Files
File: //snap/google-cloud-cli/394/platform/gsutil/third_party/pyparsing/examples/unicode_denormalizer.py
# unicode_denormalizer.py
#
# Demonstration of the pyparsing's transform_string() method, to
# convert identifiers in Python source code to equivalent Unicode
# characters. Python's compiler automatically normalizes Unicode
# characters back to their ASCII equivalents, so that identifiers may
# be rewritten using other Unicode characters, and normalize back to
# the same identifier. For instance, Python treats "print" and "𝕡𝓻ᵢ𝓃𝘁"
# and "𝖕𝒓𝗂𝑛ᵗ" all as the same identifier.
#
# The converter must take care to *only* transform identifiers -
# Python keywords must always be represented in base ASCII form. To
# skip over keywords, they are added to the parser/transformer, but
# contain no transforming parse action.
#
# The converter also detects identifiers in placeholders within f-strings.
#
# Copyright 2022, by Paul McGuire
#
import keyword
import random
import unicodedata

import pyparsing as pp
ppu = pp.pyparsing_unicode

_· = "_·"
ident_chars = (
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    + "0123456789" + _·
)

# build map of each ASCII character to a string of
# all the characters in the Basic Multilingual Plane
# that NFKC normalizes back to that ASCII character
ident_char_map = {c: [] for c in ident_chars}
for ch in ppu.BMP.identbodychars:
    normal = unicodedata.normalize("NFKC", ch)
    if normal in ident_char_map:
        ident_char_map[normal].append(ch)

# ligatures will also normalize back to ASCII
# (doubled elements have higher chance of being chosen by random.choice)
ligature_map = {
    'IJ': ('IJ', 'IJ', 'IJ'),
    'LJ': ('LJ', 'LJ', 'LJ'),
    'NJ': ('NJ', 'NJ', 'NJ'),
    'DZ': ('DZ', 'DZ', 'DZ'),
    'II': ('Ⅱ', 'Ⅱ', 'II'),
    'IV': ('Ⅳ', 'Ⅳ', 'IV'),
    'VI': ('Ⅵ', 'Ⅵ', 'VI'),
    'IX': ('Ⅸ', 'Ⅸ', 'IX'),
    'XI': ('Ⅺ', 'Ⅺ', 'XI'),
    'ffl': ('ffl', 'ffl', 'ffl', 'ffl', 'ffl'),
    'ffi': ('ffi', 'ffi', 'ffi', 'ffi', 'ffi'),
    'ff': ('ff', 'ff', 'ff'),
    'fi': ('fi', 'fi', 'fi'),
    'fl': ('fl', 'fl', 'fl'),
    'ij': ('ij', 'ij', 'ij'),
    'lj': ('lj', 'lj', 'lj'),
    'nj': ('nj', 'nj', 'nj'),
    'dz': ('dz', 'dz', 'dz'),
    'ii': ('ⅱ', 'ⅱ', 'ii'),
    'iv': ('ⅳ', 'ⅳ', 'iv'),
    'vi': ('ⅵ', 'ⅵ', 'vi'),
    'ix': ('ⅸ', 'ⅸ', 'ix'),
    'xi': ('ⅺ', 'ⅺ', 'xi'),
}

ligature_transformer = pp.one_of(ligature_map).add_parse_action(
    lambda t: random.choice(ligature_map[t[0]])
)


def make_mixed_font(t):
    # extract leading character and remainder to process separately
    t_first, t_rest = t[0][0], t[0][1:]

    # a leading '_' must be written using the ASCII character '_'
    ret = ['_' if t_first == '_'
           else random.choice(ident_char_map.get(t_first, t_first))]
    t_rest = ligature_transformer.transform_string(t_rest)
    ret.extend(random.choice(ident_char_map.get(c, c)) for c in t_rest)
    return ''.join(ret)


# define a pyparsing expression to match any identifier; add a parse
# action to convert to mixed Unicode characters
identifier = pp.pyparsing_common.identifier
identifier.add_parse_action(make_mixed_font)

# match quoted strings (which may be f-strings)
python_quoted_string = pp.Opt(pp.Char("fF")("f_string_prefix")) + (
        pp.python_quoted_string
)("quoted_string_body")


def mix_fstring_expressions(t):
    if not t.f_string_prefix:
        return

    # define an expression and transformer to handle embedded
    # f-string field expressions
    fstring_arg = pp.QuotedString("{", end_quote_char="}")
    fstring_arg.add_parse_action(
        lambda tt: "{" + transformer.transform_string(tt[0]) + "}"
    )

    return (
        t.f_string_prefix
        + fstring_arg.transform_string(t.quoted_string_body)
    )

# add parse action to transform identifiers in f-strings
python_quoted_string.add_parse_action(mix_fstring_expressions)

# match keywords separately from identifiers - keywords must be kept in their
# original ASCII
any_keyword = pp.one_of(
    list(keyword.kwlist) + getattr(keyword, "softkwlist", []),
    as_keyword=True
)

# quoted strings and keywords will be parsed, but left untransformed
transformer = python_quoted_string | any_keyword | identifier


def demo():
    import textwrap
    hello_source = textwrap.dedent("""
    def hello():
        try:
            hello_ = "Hello"
            world_ = "World"
            print(f"{hello_}, {world_}!")
        except TypeError as exc:
            print("failed: {}".format(exc))
    
    if __name__ == "__main__":
        hello()
    """)

    # use transformer to generate code with denormalized identifiers
    transformed = transformer.transform_string(hello_source)
    print(transformed)

    # does it really work? compile the transformed code and run it!
    code = compile(transformed, "inline source", mode="exec")
    exec(code)

    if 1:
        # pick some code from the stdlib
        import unittest.util as lib_module
        import inspect
        source = inspect.getsource(lib_module)
        transformed = transformer.transform_string(source)
        print()
        print(transformed)

if __name__ == '__main__':
    demo()