This commit is contained in:
c
2023-07-23 00:36:17 +08:00
commit 5e5ffd309b
200 changed files with 152313 additions and 0 deletions

227
NetPanel.Help/HtmlTag.cs Normal file
View File

@@ -0,0 +1,227 @@
using System;
using System.Collections;
using System.Collections.Generic;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
namespace NetPanel.Help
{
public class HtmlTag
{
private String m_Name;
private String m_BeginTag;
private String m_InnerHTML;
private Hashtable m_Attributes = new Hashtable();
static Regex attrReg = new Regex(@"([a-zA-Z1-9_-]+)\s*=\s*(\x27|\x22)([^\x27\x22]*)(\x27|\x22)", RegexOptions.IgnoreCase);
private HtmlTag(string name, string beginTag, string innerHTML)
{
m_Name = name;
m_BeginTag = beginTag;
m_InnerHTML = innerHTML;
MatchCollection matchs = attrReg.Matches(beginTag);
foreach (Match match in matchs)
{
m_Attributes[match.Groups[1].Value.ToUpper()] = match.Groups[3].Value;
}
}
public string GetBeginTag()
{
return m_BeginTag;
}
public List<HtmlTag> FindTag(String name)
{
return FindTag(m_InnerHTML, name, String.Format(@"<{0}(\s[^<>]*|)>", name));
}
public List<HtmlTag> FindImgTag()
{
return FindTag(m_InnerHTML, "img", @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>");
}
public List<HtmlTag> FindTag(String name, String format)
{
return FindTag(m_InnerHTML, name, format);
}
public List<HtmlTag> FindTagByAttr(String tagName, String attrName, String attrValue)
{
return FindTagByAttr(m_InnerHTML, tagName, attrName, attrValue);
}
public String TagName
{
get { return m_Name; }
}
public String InnerHTML
{
get { return m_InnerHTML; }
}
public String InnerText
{
get { return checkStr(m_InnerHTML); }
}
public String GetAttribute(string name)
{
return m_Attributes[name.ToUpper()] as String;
}
public String FindDate
{
get
{
Match m = Regex.Match(InnerText, @"(?<date>((1[6-9]|[2-3]\d)\d{2})-(\d{1,2})-(\d{1,2}))");
if (m.Groups.Count > 0)
{
return m.Groups["date"].Value;
}
else
{
return null;
}
}
}
public static string checkStr2(string html)
{
html = html.Replace("<br>", "$br$");
html = html.Replace("<br/>", "$br/$");
System.Text.RegularExpressions.Regex regex1 = new System.Text.RegularExpressions.Regex(@"<script[\s\S]+</script *>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
System.Text.RegularExpressions.Regex regex2 = new System.Text.RegularExpressions.Regex(@" href *= *[\s\S]*script *:", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
System.Text.RegularExpressions.Regex regex3 = new System.Text.RegularExpressions.Regex(@" no[\s\S]*=", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
System.Text.RegularExpressions.Regex regex4 = new System.Text.RegularExpressions.Regex(@"<iframe[\s\S]+</iframe *>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
System.Text.RegularExpressions.Regex regex5 = new System.Text.RegularExpressions.Regex(@"<frameset[\s\S]+</frameset *>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
System.Text.RegularExpressions.Regex regex6 = new System.Text.RegularExpressions.Regex(@"\<img[^\>]+\>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
System.Text.RegularExpressions.Regex regex7 = new System.Text.RegularExpressions.Regex(@"</p>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
System.Text.RegularExpressions.Regex regex8 = new System.Text.RegularExpressions.Regex(@"<p>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
System.Text.RegularExpressions.Regex regex9 = new System.Text.RegularExpressions.Regex(@"<[^>]*>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
html = regex1.Replace(html, ""); //过滤<script></script>标记
html = regex2.Replace(html, ""); //过滤href=javascript: (<A>) 属性
html = regex3.Replace(html, " _disibledevent="); //过滤其它控件的on...事件
html = regex4.Replace(html, ""); //过滤iframe
html = regex5.Replace(html, ""); //过滤frameset
html = regex6.Replace(html, ""); //过滤frameset
// html = regex7.Replace(html, ""); //过滤frameset
// html = regex8.Replace(html, ""); //过滤frameset
html = regex9.Replace(html, "");
html = html.Replace(" ", "");
html = html.Replace("</strong>", "");
html = html.Replace("<strong>", "");
html = html.Replace("$br$", "<br>");
html = html.Replace("$br/$", "<br/>");
return html;
}
public static string checkStr(string html)
{
if (string.IsNullOrEmpty(html))
{
return html;
}
System.Text.RegularExpressions.Regex regex1 = new System.Text.RegularExpressions.Regex(@"<script[\s\S]+</script *>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
System.Text.RegularExpressions.Regex regex2 = new System.Text.RegularExpressions.Regex(@" href *= *[\s\S]*script *:", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
System.Text.RegularExpressions.Regex regex3 = new System.Text.RegularExpressions.Regex(@" no[\s\S]*=", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
System.Text.RegularExpressions.Regex regex4 = new System.Text.RegularExpressions.Regex(@"<iframe[\s\S]+</iframe *>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
System.Text.RegularExpressions.Regex regex5 = new System.Text.RegularExpressions.Regex(@"<frameset[\s\S]+</frameset *>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
System.Text.RegularExpressions.Regex regex6 = new System.Text.RegularExpressions.Regex(@"\<img[^\>]+\>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
System.Text.RegularExpressions.Regex regex7 = new System.Text.RegularExpressions.Regex(@"</p>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
System.Text.RegularExpressions.Regex regex8 = new System.Text.RegularExpressions.Regex(@"<p>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
System.Text.RegularExpressions.Regex regex9 = new System.Text.RegularExpressions.Regex(@"<[^>]*>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
html = regex1.Replace(html, ""); //过滤<script></script>标记
html = regex2.Replace(html, ""); //过滤href=javascript: (<A>) 属性
html = regex3.Replace(html, " _disibledevent="); //过滤其它控件的on...事件
html = regex4.Replace(html, ""); //过滤iframe
html = regex5.Replace(html, ""); //过滤frameset
html = regex6.Replace(html, ""); //过滤frameset
html = regex7.Replace(html, ""); //过滤frameset
html = regex8.Replace(html, ""); //过滤frameset
html = regex9.Replace(html, "");
html = html.Replace(" ", "");
html = html.Replace("</strong>", "");
html = html.Replace("<strong>", "");
html = html.Replace("&nbsp;", " ");
return html;
}
public static string HtmlToText(string str)
{
string m_outstr = str;
m_outstr = new Regex(@"(?m)<script[^>]*>(\w|\W)*?</script[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");
m_outstr = new Regex(@"(?m)<style[^>]*>(\w|\W)*?</style[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");
m_outstr = new Regex(@"(?m)<select[^>]*>(\w|\W)*?</select[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");
//m_outstr = new Regex(@"(?m)<a[^>]*>(\w|\W)*?</a[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");
Regex objReg = new System.Text.RegularExpressions.Regex("(<[^>]+?>)|&nbsp;", RegexOptions.Multiline | RegexOptions.IgnoreCase);
m_outstr = objReg.Replace(m_outstr, "");
Regex objReg2 = new System.Text.RegularExpressions.Regex("(\\s)+", RegexOptions.Multiline | RegexOptions.IgnoreCase);
m_outstr = objReg2.Replace(m_outstr, " ");
return m_outstr;
}
/// <summary>
/// 在文本html的文本查找标志名为tagName,并且属性attrName的值为attrValue的所有标志
/// 例如FindTagByAttr(html, "div", "class", "demo")
/// 返回所有class为demo的div标志
/// </summary>
public static List<HtmlTag> FindTagByAttr(String html, String tagName, String attrName, String attrValue)
{
String format = String.Format(@"<{0}\s[^<>]*{1}\s*=\s*(\x27|\x22){2}(\x27|\x22)[^<>]*>", tagName, attrName, attrValue);
return FindTag(html, tagName, format);
}
public static List<HtmlTag> FindTag(String html, String name, String format)
{
Regex reg = new Regex(format, RegexOptions.IgnoreCase);
Regex tagReg = new Regex(String.Format(@"<(\/|)({0})(\s[^<>]*|)>", name), RegexOptions.IgnoreCase);
List<HtmlTag> tags = new List<HtmlTag>();
int start = 0;
while (true)
{
Match match = reg.Match(html, start);
if (match.Success)
{
start = match.Index + match.Length;
Match tagMatch = null;
int beginTagCount = 1;
while (true)
{
tagMatch = tagReg.Match(html, start);
if (!tagMatch.Success)
{
tagMatch = null;
break;
}
start = tagMatch.Index + tagMatch.Length;
if (tagMatch.Groups[1].Value == "/") beginTagCount--;
else beginTagCount++;
if (beginTagCount == 0) break;
}
if (tagMatch != null)
{
HtmlTag tag = new HtmlTag(name, match.Value, html.Substring(match.Index + match.Length, tagMatch.Index - match.Index - match.Length));
tags.Add(tag);
}
else
{
break;
}
}
else
{
break;
}
}
return tags;
}
}
}