/// <summary>
/// Helpers for truncating, stripping, repairing and encoding HTML strings.
/// </summary>
public static class HtmlHelper
{
    // U+00A7 (section sign) stands in for exactly one HTML entity while visible text is
    // being counted, so that an entity costs one character and is never cut in half.
    private const char PlaceholderChar = '\u00A7';
    private const string PlaceholderText = "\u00A7";

    // Void elements never get a closing tag. The look-ahead stops "col" from matching "colgroup".
    private static readonly Regex VoidTagRegex = new Regex(
        @"^<(area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr)(?=[\s/>])",
        RegexOptions.Compiled | RegexOptions.IgnoreCase);

    // Tag name of an opening or closing tag: a letter followed by anything up to whitespace, '/' or '>'.
    private static readonly Regex TagNameRegex = new Regex(
        @"^</?([a-z][^\s/>]*)", RegexOptions.Compiled | RegexOptions.IgnoreCase);

    // Anything that looks like a tag.
    private static readonly Regex HtmlTagRegex = new Regex("<[^>]+>", RegexOptions.Compiled);

    // Named, decimal and hexadecimal character references.
    private static readonly Regex EntityRegex = new Regex(
        "&(#[0-9]{1,7}|#x[0-9a-f]{1,6}|[a-z][a-z0-9]{0,31});",
        RegexOptions.Compiled | RegexOptions.IgnoreCase);

    // Entities that NoHTML converts to plain text, plus any decimal reference.
    private static readonly Regex TextEntityRegex = new Regex(
        @"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#\d+);",
        RegexOptions.Compiled | RegexOptions.IgnoreCase);

    /// <summary>
    /// Truncates an HTML string by the length of its visible text while keeping the markup
    /// that precedes the cut, then closes every element that is still open.
    /// </summary>
    /// <param name="html">The HTML string to truncate.</param>
    /// <param name="len">
    /// Maximum number of text characters to keep. Tags do not count and each HTML entity
    /// counts as one character.
    /// </param>
    /// <param name="endString">Text appended at the cut position (for example "..."); may be null.</param>
    /// <returns>
    /// <paramref name="html"/> unchanged when it is null, empty, or not longer than
    /// <paramref name="len"/>; otherwise the rebuilt HTML. <paramref name="endString"/> is
    /// appended only when text was actually removed. In rebuilt output a literal section
    /// sign (U+00A7) is emitted as the entity <c>&amp;sect;</c>.
    /// </returns>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="len"/> is negative.</exception>
    public static string HTMLSubstring(string html, int len, string endString)
    {
        if (string.IsNullOrEmpty(html) || html.Length <= len)
        {
            return html;
        }
        if (len < 0)
        {
            throw new ArgumentOutOfRangeException("len", len, "The length must not be negative.");
        }

        // Escape literal section signs first so that, after the next step, every
        // placeholder character in the string corresponds to exactly one collected entity.
        string escaped = html.Replace(PlaceholderText, "&sect;");
        MatchCollection entities = EntityRegex.Matches(escaped);
        string counted = EntityRegex.Replace(escaped, PlaceholderText);

        // segments[i] is the text before tags[i]; the last segment follows the last tag.
        MatchCollection tags = HtmlTagRegex.Matches(counted);
        string[] segments = HtmlTagRegex.Split(counted);

        int totalText = 0;
        foreach (string segment in segments)
        {
            totalText += segment.Length;
        }
        // Only cut (and append endString) when there is more text than requested.
        bool mustCut = totalText > len;

        var body = new System.Text.StringBuilder(counted.Length);
        var openTags = new System.Collections.Generic.List<string>();
        int taken = 0;
        for (int i = 0; i < segments.Length; i++)
        {
            string segment = segments[i];
            if (mustCut && taken + segment.Length >= len)
            {
                int take = len - taken;
                // Do not leave half of a surrogate pair at the end of the kept text.
                if (take > 0 && take < segment.Length && char.IsHighSurrogate(segment[take - 1]))
                {
                    take--;
                }
                body.Append(segment, 0, take);
                break;
            }

            body.Append(segment);
            taken += segment.Length;
            if (i < tags.Count)
            {
                string tag = tags[i].Value;
                TrackTag(tag, openTags);
                body.Append(tag);
            }
        }

        // Put the entities back before endString is appended, so that a section sign
        // inside endString is never mistaken for a placeholder.
        string kept = body.ToString();
        var result = new System.Text.StringBuilder(kept.Length + 16);
        int nextEntity = 0;
        foreach (char c in kept)
        {
            if (c == PlaceholderChar && nextEntity < entities.Count)
            {
                result.Append(entities[nextEntity].Value);
                nextEntity++;
            }
            else
            {
                result.Append(c);
            }
        }

        if (mustCut)
        {
            result.Append(endString);
        }
        // Close the elements that are still open, innermost first.
        for (int i = openTags.Count - 1; i >= 0; i--)
        {
            result.Append("</").Append(openTags[i]).Append('>');
        }
        return result.ToString();
    }

    // Updates the stack of open element names for one tag that was copied to the output.
    private static void TrackTag(string tag, System.Collections.Generic.List<string> openTags)
    {
        Match name = TagNameRegex.Match(tag);
        if (!name.Success)
        {
            return; // comment, doctype, processing instruction or a stray '<' in text
        }

        string tagName = name.Groups[1].Value;
        if (tag[1] == '/')
        {
            // Closing tag: close the nearest matching element and everything nested in it.
            // A closing tag without a matching open element is ignored.
            for (int i = openTags.Count - 1; i >= 0; i--)
            {
                if (string.Equals(openTags[i], tagName, StringComparison.OrdinalIgnoreCase))
                {
                    openTags.RemoveRange(i, openTags.Count - i);
                    break;
                }
            }
            return;
        }

        if (!VoidTagRegex.IsMatch(tag))
        {
            openTags.Add(tagName);
        }
    }

    /// <summary>
    /// Strips scripts, styles and tags from an HTML string and converts a small set of
    /// entities to plain text.
    /// </summary>
    /// <param name="Htmlstring">The HTML string to clean.</param>
    /// <returns>
    /// The remaining text, or the input unchanged when it is null or empty. Entities are
    /// decoded in a single pass, so escaped text such as <c>&amp;amp;lt;</c> yields
    /// <c>&amp;lt;</c> rather than a less-than sign. Non-breaking spaces are removed and
    /// unrecognised decimal references become a space. The result is plain text and is
    /// not safe to emit as HTML without encoding it again.
    /// </returns>
    public static string NoHTML(this string Htmlstring)
    {
        if (string.IsNullOrEmpty(Htmlstring))
        {
            return Htmlstring;
        }

        // Remove elements whose content must not survive as text.
        Htmlstring = Regex.Replace(Htmlstring, @"<script[\s\S]*?</script>", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"<noscript[\s\S]*?</noscript>", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"<style[\s\S]*?</style>", "", RegexOptions.IgnoreCase);
        // Single-line tags are dropped; tags that span lines are replaced by a space.
        Htmlstring = Regex.Replace(Htmlstring, @"<.*?>", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", " ", RegexOptions.IgnoreCase);
        // Collapse a line break plus the following whitespace into one space.
        Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", " ", RegexOptions.IgnoreCase);
        // Leftovers of comments that were not removed as tags.
        Htmlstring = Regex.Replace(Htmlstring, @"-->", " ", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", " ", RegexOptions.IgnoreCase);
        // One pass over all entities: the output of one replacement is never decoded again.
        return TextEntityRegex.Replace(Htmlstring, DecodeTextEntity);
    }

    // Maps one entity matched by TextEntityRegex to its plain-text replacement.
    private static string DecodeTextEntity(Match entity)
    {
        switch (entity.Groups[1].Value.ToLowerInvariant())
        {
            case "quot":
            case "#34":
                return "\"";
            case "amp":
            case "#38":
                return "&";
            case "lt":
            case "#60":
                return "<";
            case "gt":
            case "#62":
                return ">";
            case "nbsp":
            case "#160":
                return "";
            case "iexcl":
            case "#161":
                return "\u00A1";
            case "cent":
            case "#162":
                return "\u00A2";
            case "pound":
            case "#163":
                return "\u00A3";
            case "copy":
            case "#169":
                return "\u00A9";
            default:
                return " "; // any other decimal character reference
        }
    }

    /// <summary>
    /// Returns the inner text of an HTML string, optionally truncated.
    /// </summary>
    /// <param name="html">The source HTML.</param>
    /// <param name="length">Maximum number of characters to return; 0 or less returns all text.</param>
    /// <returns>The text, or the input unchanged when it is null or empty.</returns>
    public static string RemoveHtmlTag(this string html, int length = 0)
    {
        if (string.IsNullOrEmpty(html))
        {
            return html;
        }

        // NOTE(review): HtmlDocument is presumably HtmlAgilityPack's parser - confirm against the file's usings.
        var doc = new HtmlDocument();
        doc.LoadHtml(html);
        var strText = doc.DocumentNode.InnerText;
        if (length > 0 && strText.Length > length)
        {
            return strText.Substring(0, length);
        }
        return strText;
    }

    /// <summary>
    /// Parses an HTML string and returns the inner HTML of the resulting document, which is
    /// used to complete unclosed tags.
    /// </summary>
    /// <param name="html">The source HTML.</param>
    /// <returns>The re-serialised HTML, or the input unchanged when it is null or empty.</returns>
    public static string GetHtmlTag(this string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return html;
        }

        var doc = new HtmlDocument();
        doc.LoadHtml(html);
        var strText = doc.DocumentNode.InnerHtml;
        return strText;
    }

    /// <summary>
    /// Decodes an HTML-encoded string using <see cref="System.Net.WebUtility.HtmlDecode(string)"/>.
    /// </summary>
    /// <param name="value">The string to decode.</param>
    /// <returns>The decoded string.</returns>
    public static string HtmlDecode(this string value)
    {
        return System.Net.WebUtility.HtmlDecode(value);
    }

    /// <summary>
    /// HTML-encodes a string using <see cref="System.Net.WebUtility.HtmlEncode(string)"/>.
    /// </summary>
    /// <param name="value">The string to encode.</param>
    /// <returns>The encoded string.</returns>
    public static string HtmlEncode(this string value)
    {
        return System.Net.WebUtility.HtmlEncode(value);
    }
}