228 lines
11 KiB
C#
228 lines
11 KiB
C#
using System;
|
||
using System.Collections;
|
||
using System.Collections.Generic;
|
||
using System.Net;
|
||
using System.Text;
|
||
using System.Text.RegularExpressions;
|
||
|
||
namespace NetPanel.Help
|
||
{
|
||
/// <summary>
/// A lightweight, regex-based view of one HTML element: its tag name, the raw
/// opening tag, the raw inner HTML and the quoted attributes parsed from the
/// opening tag. Instances are produced by the <c>FindTag</c> family of methods.
/// </summary>
public class HtmlTag
{
    private String m_Name;
    private String m_BeginTag;
    private String m_InnerHTML;

    // Attribute names are looked up case-insensitively (ordinal), so no
    // culture-sensitive upper-casing of the key is required.
    private Dictionary<String, String> m_Attributes = new Dictionary<String, String>(StringComparer.OrdinalIgnoreCase);

    // name = 'value' or name = "value". The closing quote must be the same
    // character as the opening one (backreference \2), so a value may contain
    // the other kind of quote. Group 1 = name, group 3 = value.
    private static readonly Regex attrReg = new Regex(@"([a-zA-Z0-9_-]+)\s*=\s*(\x27|\x22)([\s\S]*?)\2", RegexOptions.IgnoreCase);

    // Matches an <img> tag that carries a src attribute.
    private static readonly Regex imgReg = new Regex(@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);

    // yyyy-M-d style date with a year between 1600 and 3999.
    private static readonly Regex dateReg = new Regex(@"(?<date>((1[6-9]|[2-3]\d)\d{2})-(\d{1,2})-(\d{1,2}))");

    // Patterns shared by checkStr / checkStr2. The block patterns are lazy so
    // that text located between two separate blocks is not swallowed.
    private static readonly Regex scriptBlockReg = new Regex(@"<script[\s\S]+?</script *>", RegexOptions.IgnoreCase);
    // Confined to a single tag ([^<>]) so it cannot span unrelated markup.
    private static readonly Regex scriptHrefReg = new Regex(@" href *= *[^<>]*?script *:", RegexOptions.IgnoreCase);
    // on... event attributes; the lookahead restricts matches to text that is
    // followed by '>' without an intervening '<', i.e. the inside of a tag.
    private static readonly Regex eventAttrReg = new Regex(@" on\w+\s*=(?=[^<>]*>)", RegexOptions.IgnoreCase);
    private static readonly Regex iframeBlockReg = new Regex(@"<iframe[\s\S]+?</iframe *>", RegexOptions.IgnoreCase);
    private static readonly Regex framesetBlockReg = new Regex(@"<frameset[\s\S]+?</frameset *>", RegexOptions.IgnoreCase);
    private static readonly Regex imgTagReg = new Regex(@"\<img[^\>]+\>", RegexOptions.IgnoreCase);
    private static readonly Regex paragraphCloseReg = new Regex(@"</p>", RegexOptions.IgnoreCase);
    private static readonly Regex paragraphOpenReg = new Regex(@"<p>", RegexOptions.IgnoreCase);
    private static readonly Regex anyTagReg = new Regex(@"<[^>]*>", RegexOptions.IgnoreCase);

    // Patterns used by HtmlToText.
    private static readonly Regex textScriptReg = new Regex(@"<script[^>]*>[\s\S]*?</script[^>]*>", RegexOptions.IgnoreCase);
    private static readonly Regex textStyleReg = new Regex(@"<style[^>]*>[\s\S]*?</style[^>]*>", RegexOptions.IgnoreCase);
    private static readonly Regex textSelectReg = new Regex(@"<select[^>]*>[\s\S]*?</select[^>]*>", RegexOptions.IgnoreCase);
    // NOTE(review): the second alternative is a plain space as it appears in
    // this file; it may originally have been the "&nbsp;" entity - confirm.
    private static readonly Regex textTagOrSpaceReg = new Regex("(<[^>]+?>)| ", RegexOptions.IgnoreCase);
    private static readonly Regex textWhitespaceReg = new Regex("(\\s)+", RegexOptions.IgnoreCase);

    /// <summary>
    /// Creates a tag and parses the quoted attributes found in <paramref name="beginTag"/>.
    /// When an attribute occurs more than once, the last occurrence wins.
    /// </summary>
    private HtmlTag(string name, string beginTag, string innerHTML)
    {
        m_Name = name;
        m_BeginTag = beginTag;
        m_InnerHTML = innerHTML;

        foreach (Match attr in attrReg.Matches(beginTag))
        {
            m_Attributes[attr.Groups[1].Value] = attr.Groups[3].Value;
        }
    }

    /// <summary>Returns the raw opening tag text, e.g. <c>&lt;div class="x"&gt;</c>.</summary>
    public string GetBeginTag()
    {
        return m_BeginTag;
    }

    /// <summary>
    /// Finds elements called <paramref name="name"/> inside this tag's inner HTML.
    /// <paramref name="name"/> is inserted into a regular expression unescaped.
    /// </summary>
    public List<HtmlTag> FindTag(String name)
    {
        return FindTag(m_InnerHTML, name, String.Format(@"<{0}(\s[^<>]*|)>", name));
    }

    /// <summary>
    /// Finds every <c>&lt;img&gt;</c> tag with a <c>src</c> attribute inside this tag's inner HTML.
    /// </summary>
    /// <remarks>
    /// <c>&lt;img&gt;</c> has no closing tag, so the matches are turned into tags
    /// directly (with empty inner HTML) instead of going through the
    /// closing-tag search of <c>FindTag</c>, which would never find a match for them.
    /// </remarks>
    public List<HtmlTag> FindImgTag()
    {
        List<HtmlTag> images = new List<HtmlTag>();
        if (string.IsNullOrEmpty(m_InnerHTML))
        {
            return images;
        }

        foreach (Match img in imgReg.Matches(m_InnerHTML))
        {
            images.Add(new HtmlTag("img", img.Value, String.Empty));
        }
        return images;
    }

    /// <summary>
    /// Finds elements inside this tag's inner HTML whose opening tag matches the
    /// regular expression <paramref name="format"/>.
    /// </summary>
    public List<HtmlTag> FindTag(String name, String format)
    {
        return FindTag(m_InnerHTML, name, format);
    }

    /// <summary>
    /// Finds elements called <paramref name="tagName"/> inside this tag's inner HTML whose
    /// attribute <paramref name="attrName"/> has the value <paramref name="attrValue"/>.
    /// </summary>
    public List<HtmlTag> FindTagByAttr(String tagName, String attrName, String attrValue)
    {
        return FindTagByAttr(m_InnerHTML, tagName, attrName, attrValue);
    }

    /// <summary>The name this tag was searched by.</summary>
    public String TagName
    {
        get { return m_Name; }
    }

    /// <summary>The raw markup between the opening tag and its matching closing tag.</summary>
    public String InnerHTML
    {
        get { return m_InnerHTML; }
    }

    /// <summary>The inner HTML after it has been passed through <see cref="checkStr"/>.</summary>
    public String InnerText
    {
        get { return checkStr(m_InnerHTML); }
    }

    /// <summary>
    /// Returns the value of the quoted attribute <paramref name="name"/> (case-insensitive),
    /// or <c>null</c> when the attribute is absent or <paramref name="name"/> is <c>null</c>.
    /// </summary>
    public String GetAttribute(string name)
    {
        if (name == null)
        {
            return null;
        }

        String value;
        return m_Attributes.TryGetValue(name, out value) ? value : null;
    }

    /// <summary>
    /// The first <c>yyyy-M-d</c> date found in <see cref="InnerText"/>, or <c>null</c>
    /// when the text contains no such date.
    /// </summary>
    public String FindDate
    {
        get
        {
            String text = InnerText;
            if (string.IsNullOrEmpty(text))
            {
                return null;
            }

            // Match.Groups always contains at least group 0, so Success is the
            // only reliable indicator that a date was actually found.
            Match m = dateReg.Match(text);
            return m.Success ? m.Groups["date"].Value : null;
        }
    }

    /// <summary>
    /// Strips markup from <paramref name="html"/> like <see cref="checkStr"/>, but keeps
    /// literal <c>&lt;br&gt;</c> and <c>&lt;br/&gt;</c> line breaks in the result.
    /// </summary>
    /// <returns>The stripped text; <c>null</c> or empty input is returned unchanged.</returns>
    public static string checkStr2(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return html;
        }

        // Protect the line breaks with placeholders while all tags are removed.
        html = html.Replace("<br>", "$br$");
        html = html.Replace("<br/>", "$br/$");

        html = StripMarkup(html, false);

        html = html.Replace("$br$", "<br>");
        html = html.Replace("$br/$", "<br/>");
        return html;
    }

    /// <summary>
    /// Removes script/iframe/frameset blocks and every remaining tag from
    /// <paramref name="html"/>, then removes space characters.
    /// </summary>
    /// <returns>The stripped text; <c>null</c> or empty input is returned unchanged.</returns>
    public static string checkStr(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return html;
        }

        html = StripMarkup(html, true);
        // NOTE(review): as it appears in this file this replaces a space with a
        // space (a no-op); one side may originally have been "&nbsp;" - confirm.
        html = html.Replace(" ", " ");
        return html;
    }

    /// <summary>
    /// Shared implementation of checkStr / checkStr2: removes active content and
    /// all tags; <paramref name="removeParagraphTags"/> additionally removes
    /// <c>&lt;p&gt;</c> / <c>&lt;/p&gt;</c> before the generic tag removal.
    /// </summary>
    private static string StripMarkup(string html, bool removeParagraphTags)
    {
        html = scriptBlockReg.Replace(html, "");                 // <script>...</script> blocks
        html = scriptHrefReg.Replace(html, "");                  // href=javascript: style attributes
        html = eventAttrReg.Replace(html, " _disibledevent=");   // on... event attributes
        html = iframeBlockReg.Replace(html, "");                 // <iframe>...</iframe> blocks
        html = framesetBlockReg.Replace(html, "");               // <frameset>...</frameset> blocks
        html = imgTagReg.Replace(html, "");                      // <img ...> tags
        if (removeParagraphTags)
        {
            html = paragraphCloseReg.Replace(html, "");          // </p>
            html = paragraphOpenReg.Replace(html, "");           // <p>
        }
        html = anyTagReg.Replace(html, "");                      // every remaining tag

        // NOTE(review): the literal is a plain space as it appears in this file;
        // it may originally have been the "&nbsp;" entity - confirm.
        html = html.Replace(" ", "");
        html = html.Replace("</strong>", "");
        html = html.Replace("<strong>", "");
        return html;
    }

    /// <summary>
    /// Converts HTML to text: drops script, style and select blocks, removes the
    /// remaining tags and collapses every run of whitespace into a single space.
    /// </summary>
    /// <returns>The text; <c>null</c> or empty input is returned unchanged.</returns>
    public static string HtmlToText(string str)
    {
        if (string.IsNullOrEmpty(str))
        {
            return str;
        }

        string text = textScriptReg.Replace(str, "");
        text = textStyleReg.Replace(text, "");
        text = textSelectReg.Replace(text, "");
        text = textTagOrSpaceReg.Replace(text, "");
        text = textWhitespaceReg.Replace(text, " ");
        return text;
    }

    /// <summary>
    /// Searches <paramref name="html"/> for every element named <paramref name="tagName"/>
    /// whose attribute <paramref name="attrName"/> has the value <paramref name="attrValue"/>.
    /// Example: <c>FindTagByAttr(html, "div", "class", "demo")</c> returns all
    /// <c>div</c> elements whose class is <c>demo</c>.
    /// </summary>
    /// <remarks>
    /// The three arguments are inserted into a regular expression unescaped. The
    /// attribute name must start at an attribute boundary, so <c>class</c> does
    /// not match <c>data-class</c>.
    /// </remarks>
    public static List<HtmlTag> FindTagByAttr(String html, String tagName, String attrName, String attrValue)
    {
        String format = String.Format(@"<{0}\s[^<>]*(?<![\w-]){1}\s*=\s*(\x27|\x22){2}(\x27|\x22)[^<>]*>", tagName, attrName, attrValue);
        return FindTag(html, tagName, format);
    }

    /// <summary>
    /// Searches <paramref name="html"/> for elements whose opening tag matches the regular
    /// expression <paramref name="format"/> and pairs each one with its matching
    /// <c>&lt;/name&gt;</c> closing tag, taking nested elements of the same name into account.
    /// </summary>
    /// <param name="html">Markup to search; <c>null</c> or empty yields an empty list.</param>
    /// <param name="name">Element name, inserted unescaped into the open/close tag pattern.</param>
    /// <param name="format">Regular expression (case-insensitive) for the opening tag.</param>
    /// <returns>
    /// The elements found, in document order. Elements nested inside a returned
    /// element are not returned themselves. An opening tag without a matching
    /// closing tag is skipped and the search resumes right after it.
    /// </returns>
    public static List<HtmlTag> FindTag(String html, String name, String format)
    {
        Regex beginReg = new Regex(format, RegexOptions.IgnoreCase);
        Regex tagReg = new Regex(String.Format(@"<(\/|)({0})(\s[^<>]*|)>", name), RegexOptions.IgnoreCase);

        List<HtmlTag> tags = new List<HtmlTag>();
        if (string.IsNullOrEmpty(html))
        {
            return tags;
        }

        int start = 0;
        while (start <= html.Length)
        {
            Match begin = beginReg.Match(html, start);
            if (!begin.Success)
            {
                break;
            }

            int innerStart = begin.Index + begin.Length;
            Match end = FindClosingTag(tagReg, html, innerStart);
            if (end == null)
            {
                // Unclosed element: skip just this opening tag instead of
                // abandoning the search. The +1 guards against a zero-length
                // match of a caller-supplied pattern looping forever.
                start = begin.Length > 0 ? innerStart : begin.Index + 1;
                continue;
            }

            tags.Add(new HtmlTag(name, begin.Value, html.Substring(innerStart, end.Index - innerStart)));
            start = end.Index + end.Length;
        }

        return tags;
    }

    /// <summary>
    /// Starting at <paramref name="start"/> (just after an opening tag), returns the match of
    /// the closing tag that balances it, or <c>null</c> when the element is never closed.
    /// </summary>
    private static Match FindClosingTag(Regex tagReg, String html, int start)
    {
        int depth = 1;
        for (Match tag = tagReg.Match(html, start); tag.Success; tag = tag.NextMatch())
        {
            // Group 1 is "/" for a closing tag and empty for an opening tag.
            if (tag.Groups[1].Value == "/")
            {
                depth--;
            }
            else
            {
                depth++;
            }

            if (depth == 0)
            {
                return tag;
            }
        }
        return null;
    }
}
|
||
}
|