HEX
Server: Apache/2.4.65 (Ubuntu)
System: Linux ielts-store-v2 6.8.0-1036-gcp #38~22.04.1-Ubuntu SMP Thu Aug 14 01:19:18 UTC 2025 x86_64
User: root (0)
PHP: 7.2.34-54+ubuntu20.04.1+deb.sury.org+1
Disabled: pcntl_alarm,pcntl_fork,pcntl_waitpid,pcntl_wait,pcntl_wifexited,pcntl_wifstopped,pcntl_wifsignaled,pcntl_wifcontinued,pcntl_wexitstatus,pcntl_wtermsig,pcntl_wstopsig,pcntl_signal,pcntl_signal_get_handler,pcntl_signal_dispatch,pcntl_get_last_error,pcntl_strerror,pcntl_sigprocmask,pcntl_sigwaitinfo,pcntl_sigtimedwait,pcntl_exec,pcntl_getpriority,pcntl_setpriority,pcntl_async_signals,
Upload Files
File: //snap/google-cloud-cli/396/platform/gsutil/third_party/pyparsing/examples/html_stripper.py
#
# html_stripper.py
#
#  Sample code for stripping HTML markup tags and scripts from
#  HTML source files.
#
# Copyright (c) 2006, 2016, 2023, Paul McGuire
#
from urllib.request import urlopen
from pyparsing import (
    LineEnd,
    quoted_string,
    make_html_tags,
    common_html_entity,
    replace_html_entity,
    html_comment,
    any_open_tag,
    any_close_tag,
    replace_with,
)

# A <script> element is stripped as a whole: the opening tag, everything up
# to the closing tag, and the closing tag itself.
script_open, script_close = make_html_tags("script")
script_body = script_open + ... + script_close

# Entity references (&amp;, &lt;, &gt;, etc.) are translated rather than
# removed. Note that the parse action is attached to the expression object
# imported from pyparsing, not to a copy of it.
common_html_entity.set_parse_action(replace_html_entity)

# Everything that should disappear from the output. suppress() makes
# transform_string() drop whatever these alternatives match.
_discarded_markup = (
    html_comment | script_body | any_open_tag | any_close_tag
).suppress()

# Alternatives are tried left to right: quoted strings come first so that
# tags enclosed in quotes are kept, then entities, then markup to discard.
stripper = quoted_string | common_html_entity | _discarded_markup

# Runs of two or more line ends are replaced by exactly one blank line.
repeated_newlines = LineEnd()[2, ...].set_parse_action(replace_with("\n\n"))


if __name__ == '__main__':
    # Fetch a sample HTML page to run the stripper against.
    target_url = "https://wiki.python.org/moin/PythonDecoratorLibrary"

    # The timeout keeps the demo from hanging forever on an unresponsive host.
    with urlopen(target_url, timeout=30) as target_page:
        # Prefer the charset declared in the Content-Type header; fall back to
        # UTF-8 (the previously hard-coded value) when the server names none.
        declared_charset = target_page.headers.get_content_charset() or "UTF-8"
        raw_html = target_page.read()

    try:
        target_html = raw_html.decode(declared_charset)
    except LookupError:
        # The server declared a codec Python does not know; use UTF-8 instead.
        target_html = raw_html.decode("UTF-8")

    # First pass: strip out tags and translate entities.
    # (transform_string() is used instead of parse_string() because it applies
    # the suppressions and parse actions across the whole input.)
    first_pass = stripper.transform_string(target_html)

    # The first pass leaves many blank lines behind; collapse them.
    second_pass = repeated_newlines.transform_string(first_pass)

    print(second_pass)