Python HTML特殊符号的转义与反转义

需求:在 Web 开发过程中,经常需要对特殊符号进行转义,使浏览器把它们当作普通文本而不是 HTML 标记来处理,从而减少针对前端的注入攻击。
注意:此代码来源于 Tornado 源码。
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import html.entities
import typing
from typing import Union, Optional, Dict

# Types that to_unicode() hands back untouched.
_TO_UNICODE_TYPES = (str, type(None))


def to_unicode(value: Union[None, str, bytes]) -> Optional[str]:
    """Convert a byte string to ``str``, decoding it as UTF-8.

    Args:
        value: A ``str``, ``bytes`` or ``None``.

    Returns:
        ``value`` itself when it is already a ``str`` or ``None``;
        otherwise the UTF-8 decoded text of the bytes.

    Raises:
        TypeError: If ``value`` is not ``str``, ``bytes`` or ``None``.
    """
    if isinstance(value, bytes):
        return value.decode("utf-8")
    if isinstance(value, _TO_UNICODE_TYPES):
        return value
    raise TypeError("Expected bytes, unicode, or None; got %r" % type(value))

# Matches each single character that xhtml_escape() replaces.
_XHTML_ESCAPE_RE = re.compile("[&<>\"']")
# Replacement text for every character matched by _XHTML_ESCAPE_RE.
_XHTML_ESCAPE_DICT = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
}


def xhtml_escape(value: Union[str, bytes]) -> str:
    """Escape the characters ``<``, ``>``, ``"``, ``'`` and ``&``.

    Args:
        value: Text to escape; bytes are first converted with
            ``to_unicode``.

    Returns:
        The text with each of the five characters replaced by its entry
        in ``_XHTML_ESCAPE_DICT``.
    """

    def _replacement(found):
        # The pattern matches exactly one character, which is the dict key.
        return _XHTML_ESCAPE_DICT[found.group(0)]

    text = to_unicode(value)
    return _XHTML_ESCAPE_RE.sub(_replacement, text)

def _build_unicode_map() -> Dict[str, str]:
    """Build the lookup table from HTML entity names to characters.

    Returns:
        A dict that maps every name in ``html.entities.name2codepoint``
        (for example ``"amp"``) to the character for its code point.
    """
    return {
        entity: chr(codepoint)
        for entity, codepoint in html.entities.name2codepoint.items()
    }


# Built once at import time; maps an entity name to its character.
_HTML_UNICODE_MAP = _build_unicode_map()

def _convert_entity(m: typing.Match) -> str:
    """Return the replacement text for one matched HTML entity.

    Written as a ``re.sub`` callback. The match must provide two groups:
    group 1 is ``"#"`` for a numeric reference (empty otherwise) and
    group 2 is the entity body.

    Numeric references are decoded with ``chr``: hexadecimal when the body
    starts with ``x``/``X``, decimal otherwise. Named references are looked
    up in ``_HTML_UNICODE_MAP``. An entity that cannot be resolved is
    returned as its ``&...;`` text instead of raising.

    Args:
        m: Match object for a single entity.

    Returns:
        The decoded character, or the entity text when it is not valid.
    """
    body = m.group(2)
    if m.group(1) == "#":
        try:
            if body[:1].lower() == "x":
                return chr(int(body[1:], 16))
            return chr(int(body))
        except (ValueError, OverflowError):
            # ValueError: the body is not a number, or the code point is
            # outside the Unicode range.
            # OverflowError: chr() can raise this instead of ValueError when
            # the number is extremely large; such input must not crash the
            # whole substitution.
            return "&#%s;" % body
    try:
        return _HTML_UNICODE_MAP[body]
    except KeyError:
        return "&%s;" % body


def xhtml_unescape(value: Union[str, bytes]) -> str:
    """Turn HTML entities in ``value`` back into the characters they stand for.

    Args:
        value: Escaped text; bytes are first converted with ``to_unicode``.

    Returns:
        The text with every ``&name;`` / ``&#number;`` sequence replaced by
        whatever ``_convert_entity`` returns for it.
    """
    text = to_unicode(value)
    return re.sub(pattern=r"&(#?)(\w+?);", repl=_convert_entity, string=text)

if __name__ == '__main__':
    # Demo: escape a script tag, then unescape the result again.
    original = '<script>alert(1)</script>'
    escaped = xhtml_escape(original)
    print(escaped)
    restored = xhtml_unescape(escaped)
    print(restored)

    # Expected output:
    #     &lt;script&gt;alert(1)&lt;/script&gt;
    #     <script>alert(1)</script>