C# NHtmlFilter 帮你过滤Html危险脚本 防止XSS攻击

转:http://www.oschina.net/code/snippet_222150_9776

与原文代码略有改动

 /// <summary>
        /// Whitelist-based HTML filter intended to strip dangerous markup (XSS) from
        /// user-submitted HTML.
        /// </summary>
        /// <remarks>
        /// Only the elements and attributes registered in <see cref="vAllowed"/> survive;
        /// everything else is dropped. <see cref="filter"/> mutates per-instance state
        /// (<see cref="vTagCounts"/>), so one instance must not be used from several
        /// threads at the same time.
        /// NOTE(review): the "style" attribute is whitelisted for many elements and its
        /// CSS content is not inspected by this class.
        /// </remarks>
        public class NHtmlFilter
        {
            protected static readonly RegexOptions REGEX_FLAGS_SI = RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled;

            private static readonly string P_COMMENTS = "<!--(.*?)-->";
            private static readonly Regex P_COMMENT = new Regex("^!--(.*)--$", REGEX_FLAGS_SI);
            private static readonly string P_TAGS = "<(.*?)>";
            private static readonly Regex P_END_TAG = new Regex("^/([a-z0-9]+)", REGEX_FLAGS_SI);
            private static readonly Regex P_START_TAG = new Regex("^([a-z0-9]+)(.*?)(/?)$", REGEX_FLAGS_SI);
            // Attribute names may contain letters, digits, '_' and '-' so that whitelisted
            // names such as "_src" and "data-id" can actually be matched.
            private static readonly Regex P_QUOTED_ATTRIBUTES = new Regex("([a-z0-9_\\-]+)=([\"'])(.*?)\\2", REGEX_FLAGS_SI);
            private static readonly Regex P_UNQUOTED_ATTRIBUTES = new Regex("([a-z0-9_\\-]+)(=)([^\"\\s']+)", REGEX_FLAGS_SI);
            private static readonly Regex P_PROTOCOL = new Regex("^([^:]+):", REGEX_FLAGS_SI);
            private static readonly Regex P_ENTITY = new Regex("&#(\\d+);?");
            private static readonly Regex P_ENTITY_UNICODE = new Regex("&#x([0-9a-f]+);?");
            private static readonly Regex P_ENCODE = new Regex("%([0-9a-f]{2});?");
            private static readonly Regex P_VALID_ENTITIES = new Regex("&([^&;]*)(?=(;|&|$))");
            private static readonly Regex P_VALID_QUOTES = new Regex("(>|^)([^<]+?)(<|$)", RegexOptions.Singleline | RegexOptions.Compiled);
            private static readonly string P_END_ARROW = "^>";
            private static readonly string P_BODY_TO_END = "<([^>]*?)(?=<|$)";
            private static readonly string P_XML_CONTENT = "(^|>)([^<]*?)(?=>)";
            private static readonly string P_STRAY_LEFT_ARROW = "<([^>]*?)(?=<|$)";
            private static readonly string P_STRAY_RIGHT_ARROW = "(^|>)([^<]*?)(?=>)";
            private static readonly string P_QUOTE = "\"";
            private static readonly string P_LEFT_ARROW = "<";
            private static readonly string P_RIGHT_ARROW = ">";
            private static readonly string P_BOTH_ARROWS = "<>";

            /**
             * flag determining whether to try to make tags when presented with "unbalanced"
             * angle brackets (e.g. "<b text </b>" becomes "<b> text </b>").  If set to false,
             * unbalanced angle brackets will be html escaped.
             */
            protected static bool alwaysMakeTags = true;

            /**
             * flag determining whether comments are stripped from the input String.
             */
            protected static bool stripComment = true;


            /// <summary>
            /// Elements that are removed together with their content and are never allowed.
            /// </summary>
            private String[] vDisallowed { get; set; }
            /// <summary>
            /// Whitelist: element name mapped to the attribute names allowed on it.
            /// </summary>
            protected Dictionary<String, List<String>> vAllowed { get; set; }

            /** counts of open tags for each (allowable) html element **/
            protected Dictionary<String, int> vTagCounts;

            /** html elements which must always be self-closing (e.g. "<img />") **/
            protected String[] vSelfClosingTags;

            /** html elements which must always have separate opening and closing tags (e.g. "<b></b>") **/
            protected String[] vNeedClosingTags;

            /** attributes which should be checked for valid protocols **/
            protected String[] vProtocolAtts;

            /** allowed protocols **/
            protected String[] vAllowedProtocols;

            /** tags which should be removed if they contain no content (e.g. "<b></b>" or "<b />") **/
            protected String[] vRemoveBlanks;

            /** entities allowed within html markup **/
            protected String[] vAllowedEntities;


            /// <summary>
            /// When true, every filtering stage is traced through <see cref="debug"/>.
            /// </summary>
            protected bool vDebug;

            /// <summary>Creates a filter with debug tracing switched off.</summary>
            public NHtmlFilter() : this(false) { }

            /// <summary>Creates a filter and installs the default whitelist.</summary>
            /// <param name="debug">true to trace each filtering stage to the debug output.</param>
            public NHtmlFilter(bool debug)
            {
                vAllowed = new Dictionary<String, List<String>>();
                #region whitelist of elements and their attributes

                vAllowed.Add("a", new List<string>() { "target", "href", "title", "class", "style" });
                // NOTE(review): "addr" is not an HTML element; presumably "abbr" was intended.
                // Both are registered so existing behavior is kept.
                vAllowed.Add("abbr", new List<string>() { "title", "class", "style" });
                vAllowed.Add("addr", new List<string>() { "title", "class", "style" });
                vAllowed.Add("address", new List<string>() { "class", "style" });
                vAllowed.Add("area", new List<string>() { "shape", "coords", "href", "alt" });
                vAllowed.Add("article", new List<string>() { });
                vAllowed.Add("aside", new List<string>() { });
                vAllowed.Add("audio", new List<string>() { "autoplay", "controls", "loop", "preload", "src", "class", "style" });
                vAllowed.Add("b", new List<string>() { "class", "style" });
                vAllowed.Add("bdi", new List<string>() { "dir" });
                vAllowed.Add("bdo", new List<string>() { "dir" });
                vAllowed.Add("big", new List<string>() { });
                vAllowed.Add("blockquote", new List<string>() { "cite", "class", "style" });
                vAllowed.Add("br", new List<string>() { });
                vAllowed.Add("caption", new List<string>() { "class", "style" });
                vAllowed.Add("center", new List<string>() { });
                vAllowed.Add("cite", new List<string>() { });
                vAllowed.Add("code", new List<string>() { "class", "style" });
                vAllowed.Add("col", new List<string>() { "align", "valign", "span", "width", "class", "style" });
                vAllowed.Add("colgroup", new List<string>() { "align", "valign", "span", "width", "class", "style" });
                vAllowed.Add("dd", new List<string>() { "class", "style" });
                vAllowed.Add("del", new List<string>() { "datetime" });
                vAllowed.Add("details", new List<string>() { "open" });
                vAllowed.Add("div", new List<string>() { "class", "style" });
                vAllowed.Add("dl", new List<string>() { "class", "style" });
                vAllowed.Add("dt", new List<string>() { "class", "style" });
                vAllowed.Add("em", new List<string>() { "class", "style" });
                vAllowed.Add("font", new List<string>() { "color", "size", "face" });
                vAllowed.Add("footer", new List<string>() { });
                vAllowed.Add("h1", new List<string>() { "class", "style" });
                vAllowed.Add("h2", new List<string>() { "class", "style" });
                vAllowed.Add("h3", new List<string>() { "class", "style" });
                vAllowed.Add("h4", new List<string>() { "class", "style" });
                vAllowed.Add("h5", new List<string>() { "class", "style" });
                vAllowed.Add("h6", new List<string>() { "class", "style" });
                vAllowed.Add("header", new List<string>() { });
                vAllowed.Add("hr", new List<string>() { });
                vAllowed.Add("i", new List<string>() { "class", "style" });
                vAllowed.Add("img", new List<string>() { "src", "alt", "title", "style", "width", "height", "id", "_src", "loadingclass", "class", "data-latex", "data-id", "data-type", "data-s" });
                vAllowed.Add("ins", new List<string>() { "datetime" });
                vAllowed.Add("li", new List<string>() { "class", "style" });
                vAllowed.Add("mark", new List<string>() { });
                vAllowed.Add("nav", new List<string>() { });
                vAllowed.Add("ol", new List<string>() { "class", "style" });
                vAllowed.Add("p", new List<string>() { "class", "style" });
                vAllowed.Add("pre", new List<string>() { "class", "style" });
                vAllowed.Add("s", new List<string>() { });
                vAllowed.Add("section", new List<string>() { });
                vAllowed.Add("small", new List<string>() { });
                vAllowed.Add("span", new List<string>() { "class", "style" });
                vAllowed.Add("sub", new List<string>() { "class", "style" });
                vAllowed.Add("sup", new List<string>() { "class", "style" });
                vAllowed.Add("strong", new List<string>() { "class", "style" });
                vAllowed.Add("table", new List<string>() { "width", "border", "align", "valign", "class", "style" });
                vAllowed.Add("tbody", new List<string>() { "align", "valign", "class", "style" });
                vAllowed.Add("td", new List<string>() { "width", "rowspan", "colspan", "align", "valign", "class", "style" });
                vAllowed.Add("tfoot", new List<string>() { "align", "valign", "class", "style" });
                vAllowed.Add("th", new List<string>() { "width", "rowspan", "colspan", "align", "valign", "class", "style" });
                vAllowed.Add("thead", new List<string>() { "align", "valign", "class", "style" });
                vAllowed.Add("tr", new List<string>() { "rowspan", "align", "valign", "class", "style" });
                vAllowed.Add("tt", new List<string>() { });
                vAllowed.Add("u", new List<string>() { });
                vAllowed.Add("ul", new List<string>() { "class", "style" });
                vAllowed.Add("video", new List<string>() { "autoplay", "controls", "loop", "preload", "src", "height", "width", "class", "style" });
                #endregion


                vDebug = debug;
                vTagCounts = new Dictionary<String, int>();

                // Void elements never take a closing tag. Without listing them here an
                // input "<br>" would be counted as open and a bogus "</br>" appended.
                vSelfClosingTags = new String[] { "img", "br", "hr", "col", "area" };
                vNeedClosingTags = new String[] { "a", "b", "strong", "i", "em" };
                vDisallowed = new String[] { "script" };
                vAllowedProtocols = new String[] { "http", "https", "mailto" }; // no ftp.
                vProtocolAtts = new String[] { "src", "href" };
                vRemoveBlanks = new String[] { "a", "b", "strong", "i", "em" };
                vAllowedEntities = new String[] { "amp", "gt", "lt", "quot" };
                // Kept from the original: constructing an instance resets the static flags.
                stripComment = true;
                alwaysMakeTags = true;
            }


            /// <summary>Forgets the open-tag counts gathered by a previous <see cref="filter"/> run.</summary>
            protected void reset()
            {
                vTagCounts = new Dictionary<String, int>();
            }

            /// <summary>Writes <paramref name="msg"/> to the debug output when debugging is enabled.</summary>
            protected void debug(String msg)
            {
                if (vDebug)
                {
                    System.Diagnostics.Debug.WriteLine(msg);
                }
            }

            //---------------------------------------------------------------
            // my versions of some PHP library functions

            /// <summary>
            /// Converts a character code to a string.
            /// </summary>
            /// <param name="dec">Character code.</param>
            /// <returns>
            /// A surrogate pair for codes in U+10000..U+10FFFF; otherwise the single
            /// UTF-16 code unit obtained by truncating <paramref name="dec"/> to 16 bits.
            /// </returns>
            public static String chr(int dec)
            {
                if (dec > 0xFFFF && dec <= 0x10FFFF)
                {
                    return char.ConvertFromUtf32(dec);
                }
                return "" + unchecked((char)dec);
            }

            /// <summary>
            /// Escapes double quotes and angle brackets as entities and turns "\n" into "&lt;br&gt;".
            /// </summary>
            /// <remarks>The ampersand is deliberately left untouched, as in the original code.</remarks>
            /// <param name="str">Text to escape; must not be null.</param>
            /// <returns>The escaped text.</returns>
            public static String htmlSpecialChars(String str)
            {
                str = str.Replace(P_QUOTE, "&quot;");
                str = str.Replace(P_LEFT_ARROW, "&lt;");
                str = str.Replace(P_RIGHT_ARROW, "&gt;");
                str = str.Replace("\n", "<br>");
                return str;
            }

            //---------------------------------------------------------------

            /**
             * given a user submitted input String, filter out any invalid or restricted
             * html.
             * 
             * @param input text (i.e. submitted by a user) than may contain html; null is treated as empty
             * @return "clean" version of input, with only valid, whitelisted html elements allowed
             */
            public String filter(String input)
            {
                if (input == null)
                {
                    return String.Empty;
                }

                reset();
                String s = input;

                debug("************************************************");
                debug("              INPUT: " + input);

                s = escapeComments(s);
                debug("     escapeComments: " + s);

                s = balanceHTML(s);
                debug("        balanceHTML: " + s);

                s = checkTags(s);
                debug("          checkTags: " + s);

                s = processRemoveBlanks(s);
                debug("processRemoveBlanks: " + s);

                s = validateEntities(s);
                debug("    validateEntites: " + s);

                debug("************************************************\n\n");
                return s;
            }

            /// <summary>Escapes the content of every "&lt;!-- ... --&gt;" comment via <see cref="htmlSpecialChars"/>.</summary>
            protected String escapeComments(String s)
            {
                return Regex.Replace(s, P_COMMENTS, new MatchEvaluator(ConverMatchComments), RegexOptions.Singleline);
            }

            /// <summary>Thin wrapper over <c>Regex.Replace</c> with the (pattern, replacement, input) argument order.</summary>
            protected String regexReplace(String regex_pattern, String replacement, String s)
            {
                return Regex.Replace(s, regex_pattern, replacement);
            }

            /// <summary>
            /// Repairs unbalanced angle brackets: either completes them into tags
            /// (<see cref="alwaysMakeTags"/> is true) or escapes the stray brackets.
            /// </summary>
            protected String balanceHTML(String s)
            {
                if (alwaysMakeTags)
                {
                    //
                    // try and form html
                    //
                    s = regexReplace(P_END_ARROW, "", s);
                    s = regexReplace(P_BODY_TO_END, "<$1>", s);
                    s = regexReplace(P_XML_CONTENT, "$1<$2", s);

                }
                else
                {
                    //
                    // escape stray brackets
                    //
                    s = regexReplace(P_STRAY_LEFT_ARROW, "&lt;$1", s);
                    s = regexReplace(P_STRAY_RIGHT_ARROW, "$1$2&gt;<", s);

                    //
                    // the last regexp causes '<>' entities to appear
                    // (we need to do a lookahead assertion so that the last bracket can
                    // be used in the next pass of the regexp)
                    //
                    s = s.Replace(P_BOTH_ARROWS, "");
                }
                return s;
            }

            /// <summary>
            /// Removes disallowed elements with their content, rewrites every remaining tag
            /// through <see cref="processTag"/> and appends closing tags for elements left open.
            /// </summary>
            protected String checkTags(String s)
            {
                // Remove disallowed elements including their content. The match is
                // case-insensitive and spans line breaks so "<SCRIPT>" and multi-line
                // scripts do not leave their body behind as text.
                foreach (var item in vDisallowed)
                {
                    String pattern = string.Format(@"<{0}\b.*?>.*?</{0}\s*>", Regex.Escape(item));
                    s = Regex.Replace(s, pattern, "", RegexOptions.IgnoreCase | RegexOptions.Singleline);
                }
                s = Regex.Replace(s, P_TAGS, new MatchEvaluator(ConverMatchTags), RegexOptions.Singleline);

                // these get tallied in processTag
                // (remember to reset before subsequent calls to filter method)
                System.Text.StringBuilder closed = new System.Text.StringBuilder(s);
                foreach (KeyValuePair<String, int> open in vTagCounts)
                {
                    for (int ii = 0; ii < open.Value; ii++)
                    {
                        closed.Append("</").Append(open.Key).Append(">");
                    }
                }

                return closed.ToString();
            }

            /// <summary>Removes empty occurrences ("&lt;b&gt;&lt;/b&gt;", "&lt;b /&gt;") of the tags in <see cref="vRemoveBlanks"/>.</summary>
            protected String processRemoveBlanks(String s)
            {
                foreach (String tag in vRemoveBlanks)
                {
                    String t = Regex.Escape(tag);
                    s = regexReplace("<" + t + "(\\s[^>]*)?></" + t + ">", "", s);
                    s = regexReplace("<" + t + "(\\s[^>]*)?/>", "", s);
                }
                return s;
            }

            /// <summary>
            /// Rewrites the inside of one tag (the text between '&lt;' and '&gt;').
            /// </summary>
            /// <param name="s">Tag content without the surrounding angle brackets.</param>
            /// <returns>The sanitized tag, or an empty string when the tag must be dropped.</returns>
            private String processTag(String s)
            {
                // ending tags
                Match m = P_END_TAG.Match(s);
                if (m.Success)
                {
                    // Invariant lower-casing: culture-sensitive ToLower() breaks names such
                    // as "IMG" under the Turkish culture.
                    string endName = m.Groups[1].Value.ToLowerInvariant();
                    int open;
                    // Emit a closing tag only while a matching opening tag is outstanding;
                    // otherwise surplus "</b>" tags would pass through.
                    if (allowed(endName)
                        && !inArray(endName, vSelfClosingTags)
                        && vTagCounts.TryGetValue(endName, out open)
                        && open > 0)
                    {
                        vTagCounts[endName] = open - 1;
                        return "</" + endName + ">";
                    }
                    return "";
                }

                // starting tags
                m = P_START_TAG.Match(s);
                if (m.Success)
                {
                    String name = m.Groups[1].Value.ToLowerInvariant();
                    String body = m.Groups[2].Value;
                    String ending = m.Groups[3].Value;

                    if (!allowed(name))
                    {
                        return "";
                    }

                    List<String> paramNames = new List<String>();
                    List<String> paramValues = new List<String>();
                    foreach (Match match in P_QUOTED_ATTRIBUTES.Matches(body))
                    {
                        paramNames.Add(match.Groups[1].Value);
                        paramValues.Add(match.Groups[3].Value);
                    }
                    // Look for unquoted attributes only outside the quoted ones, so that
                    // text like href="/p?title=x" does not yield a spurious title attribute.
                    String unquotedPart = P_QUOTED_ATTRIBUTES.Replace(body, " ");
                    foreach (Match match in P_UNQUOTED_ATTRIBUTES.Matches(unquotedPart))
                    {
                        paramNames.Add(match.Groups[1].Value);
                        paramValues.Add(match.Groups[3].Value);
                    }

                    System.Text.StringBuilder attributes = new System.Text.StringBuilder();
                    for (int ii = 0; ii < paramNames.Count; ii++)
                    {
                        String paramName = paramNames[ii].ToLowerInvariant();
                        String paramValue = paramValues[ii];

                        if (!allowedAttribute(name, paramName))
                        {
                            continue;
                        }
                        if (inArray(paramName, vProtocolAtts))
                        {
                            paramValue = processParamProtocol(paramValue);
                        }
                        // The value is re-emitted inside double quotes, so quotes and angle
                        // brackets must be escaped; otherwise a single-quoted value such as
                        // 'x" onmouseover="..."' injects new attributes.
                        attributes.Append(" ").Append(paramName).Append("=\"")
                                  .Append(escapeAttributeValue(paramValue)).Append("\"");
                    }

                    if (inArray(name, vSelfClosingTags))
                    {
                        ending = " /";
                    }

                    if (inArray(name, vNeedClosingTags))
                    {
                        ending = "";
                    }

                    if (ending == null || ending.Length < 1)
                    {
                        // Opening tag: remember it so it can be closed later.
                        int count;
                        vTagCounts.TryGetValue(name, out count);
                        vTagCounts[name] = count + 1;
                    }
                    else
                    {
                        ending = " /";
                    }
                    return "<" + name + attributes + ending + ">";
                }

                // comments
                m = P_COMMENT.Match(s);
                if (!stripComment && m.Success)
                {
                    return "<" + m.Value + ">";
                }

                return "";
            }

            /// <summary>Escapes the characters that could terminate a double-quoted attribute value or open a tag.</summary>
            private static String escapeAttributeValue(String value)
            {
                return value.Replace(P_QUOTE, "&quot;")
                            .Replace(P_LEFT_ARROW, "&lt;")
                            .Replace(P_RIGHT_ARROW, "&gt;");
            }

            /// <summary>
            /// Decodes a URL-like attribute value and, when it starts with a protocol that
            /// is not whitelisted, turns it into a local anchor ("#...").
            /// </summary>
            private String processParamProtocol(String s)
            {
                s = decodeEntities(s);
                Match m = P_PROTOCOL.Match(s);
                if (m.Success)
                {
                    String protocol = m.Groups[1].Value;
                    if (!isAllowedProtocol(protocol))
                    {
                        // bad protocol, turn into local anchor link instead
                        s = "#" + s.Substring(protocol.Length + 1);
                        if (s.StartsWith("#//", StringComparison.Ordinal))
                        {
                            s = "#" + s.Substring(3);
                        }
                    }
                }
                return s;
            }

            /// <summary>Case-insensitive membership test against <see cref="vAllowedProtocols"/> ("HTTP" equals "http").</summary>
            private bool isAllowedProtocol(String protocol)
            {
                foreach (String item in vAllowedProtocols)
                {
                    if (string.Equals(item, protocol, StringComparison.OrdinalIgnoreCase))
                    {
                        return true;
                    }
                }
                return false;
            }

            /// <summary>Decodes numeric character references and %xx escapes, then re-validates entities.</summary>
            private String decodeEntities(String s)
            {
                s = P_ENTITY.Replace(s, new MatchEvaluator(ConverMatchEntity));
                s = P_ENTITY_UNICODE.Replace(s, new MatchEvaluator(ConverMatchEntityUnicode));
                s = P_ENCODE.Replace(s, new MatchEvaluator(ConverMatchEntityUnicode));
                s = validateEntities(s);
                return s;
            }

            /// <summary>
            /// Escapes every '&amp;' that does not start a whitelisted entity and escapes
            /// double quotes that appear in text outside of tags.
            /// </summary>
            private String validateEntities(String s)
            {
                s = P_VALID_ENTITIES.Replace(s, new MatchEvaluator(ConverMatchValidEntities));
                s = P_VALID_QUOTES.Replace(s, new MatchEvaluator(ConverMatchValidQuotes));
                return s;
            }

            /// <summary>Case-sensitive membership test that ignores null array items.</summary>
            private static bool inArray(String s, String[] array)
            {
                foreach (String item in array)
                {
                    if (item != null && item.Equals(s))
                    {
                        return true;
                    }
                }
                return false;
            }

            /// <summary>True when the element is whitelisted (or the whitelist is empty) and not disallowed.</summary>
            private bool allowed(String name)
            {
                return (vAllowed.Count == 0 || vAllowed.ContainsKey(name)) && !inArray(name, vDisallowed);
            }

            /// <summary>True when <paramref name="paramName"/> is whitelisted for element <paramref name="name"/>.</summary>
            private bool allowedAttribute(String name, String paramName)
            {
                if (!allowed(name))
                {
                    return false;
                }
                if (vAllowed.Count == 0)
                {
                    return true;
                }
                List<String> attributes;
                return vAllowed.TryGetValue(name, out attributes) && attributes.Contains(paramName);
            }

            /// <summary>Keeps "&amp;name" when it is a terminated, whitelisted entity; otherwise escapes the ampersand.</summary>
            private String checkEntity(String preamble, String term)
            {
                return ";".Equals(term) && isValidEntity(preamble)
                        ? "&" + preamble
                        : "&amp;" + preamble;
            }

            private bool isValidEntity(String entity)
            {
                return inArray(entity, vAllowedEntities);
            }

            private static string ConverMatchComments(Match match)
            {
                return "<!--" + htmlSpecialChars(match.Groups[1].Value) + "-->";
            }

            private string ConverMatchTags(Match match)
            {
                return processTag(match.Groups[1].Value);
            }

            /// <summary>Decodes a decimal character reference ("&amp;#58;").</summary>
            private string ConverMatchEntity(Match match)
            {
                return decodeCodePoint(match, System.Globalization.NumberStyles.None);
            }

            /// <summary>Decodes a hexadecimal character reference ("&amp;#x3a;") or a "%3a" escape.</summary>
            private string ConverMatchEntityUnicode(Match match)
            {
                return decodeCodePoint(match, System.Globalization.NumberStyles.AllowHexSpecifier);
            }

            /// <summary>
            /// Parses group 1 of <paramref name="match"/> as a code point. Values that do
            /// not parse, overflow, or fall outside the valid Unicode scalar range are left
            /// undecoded instead of throwing, so crafted input cannot crash the filter.
            /// </summary>
            private static string decodeCodePoint(Match match, System.Globalization.NumberStyles style)
            {
                int code;
                bool parsed = int.TryParse(
                    match.Groups[1].Value,
                    style,
                    System.Globalization.CultureInfo.InvariantCulture,
                    out code);
                bool isScalar = parsed
                    && code >= 0
                    && code <= 0x10FFFF
                    && (code < 0xD800 || code > 0xDFFF);
                return isScalar ? chr(code) : match.Value;
            }

            private string ConverMatchValidEntities(Match match)
            {
                String one = match.Groups[1].Value; //([^&;]*)
                String two = match.Groups[2].Value; //(?=(;|&|$))
                return checkEntity(one, two);
            }

            private string ConverMatchValidQuotes(Match match)
            {
                String one = match.Groups[1].Value; //(>|^)
                String two = match.Groups[2].Value; //([^<]+?)
                String three = match.Groups[3].Value;//(<|$)
                return one + two.Replace(P_QUOTE, "&quot;") + three;
            }

            /// <summary>Returns the current value of the <see cref="alwaysMakeTags"/> flag.</summary>
            public bool isAlwaysMakeTags()
            {
                return alwaysMakeTags;
            }

            /// <summary>Returns the current value of the <see cref="stripComment"/> flag.</summary>
            public bool isStripComments()
            {
                return stripComment;
            }

        }

调用:

// Usage example: create a filter with debug tracing disabled (the `false` argument).
// NOTE(review): assumes NHtmlFilter is declared inside a `Common` namespace, which is not shown in this file.
var x = new Common.NHtmlFilter(false);
// Sample input: plain text, an attribute-less anchor and a <script> element.
string content="文本<a >xxxxxxx</a><script>alert(1)</script>";
// filter() returns the sanitized markup; "script" is on the filter's disallowed list,
// so the <script> element is removed from the result.
string str = x.filter(content);