// NetPanel/NetPanel.Help/HtmlTag.cs

using System;
using System.Collections;
using System.Collections.Generic;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;

namespace NetPanel.Help
{
    /// <summary>
    /// A lightweight, regex-based view of one HTML element: its name, the text of its
    /// opening tag, the attributes parsed from that opening tag, and its inner HTML.
    /// Instances are produced by the FindTag family of methods; this is not a full HTML parser.
    /// </summary>
    public class HtmlTag
    {
        private String m_Name;
        private String m_BeginTag;
        private String m_InnerHTML;

        // Attribute values keyed by attribute name. The comparer is ordinal and
        // case-insensitive so lookups do not depend on the current culture.
        private Dictionary<String, String> m_Attributes = new Dictionary<String, String>(StringComparer.OrdinalIgnoreCase);

        // name = "value" | name = 'value' | name = value
        // Group 1: attribute name, group 2: double-quoted value, group 3: single-quoted
        // value, group 4: unquoted value. A quoted value must be closed by the same quote
        // character that opened it, so it may contain the other quote character.
        static readonly Regex attrReg = new Regex(
            @"([a-zA-Z0-9_-]+)\s*=\s*(?:\x22([^\x22]*)\x22|\x27([^\x27]*)\x27|([^\s\x22\x27<>]+))",
            RegexOptions.IgnoreCase);

        // Elements removed together with their content by checkStr/checkStr2. The
        // quantifier is lazy so each element is removed on its own and the text between
        // two such elements is kept.
        static readonly Regex scriptBlockReg = new Regex(@"<script[\s\S]+?</script *>", RegexOptions.IgnoreCase);
        static readonly Regex iframeBlockReg = new Regex(@"<iframe[\s\S]+?</iframe *>", RegexOptions.IgnoreCase);
        static readonly Regex framesetBlockReg = new Regex(@"<frameset[\s\S]+?</frameset *>", RegexOptions.IgnoreCase);

        // Any remaining tag.
        static readonly Regex anyTagReg = new Regex(@"<[^>]*>", RegexOptions.IgnoreCase);

        // Patterns used by HtmlToText.
        static readonly Regex textScriptReg = new Regex(@"<script[^>]*>[\s\S]*?</script[^>]*>", RegexOptions.IgnoreCase);
        static readonly Regex textStyleReg = new Regex(@"<style[^>]*>[\s\S]*?</style[^>]*>", RegexOptions.IgnoreCase);
        static readonly Regex textSelectReg = new Regex(@"<select[^>]*>[\s\S]*?</select[^>]*>", RegexOptions.IgnoreCase);
        static readonly Regex textTagOrNbspReg = new Regex("(<[^>]+?>)|&nbsp;", RegexOptions.IgnoreCase);
        static readonly Regex whitespaceRunReg = new Regex(@"\s+");

        // Date of the form yyyy-M-d with a year between 1600 and 3999.
        static readonly Regex dateReg = new Regex(@"(?<date>((1[6-9]|[2-3]\d)\d{2})-(\d{1,2})-(\d{1,2}))");

        // HTML void elements: they never have a closing tag, so FindTag must not look for one.
        static readonly String[] voidElements = new String[]
        {
            "area", "base", "br", "col", "embed", "hr", "img", "input",
            "link", "meta", "param", "source", "track", "wbr"
        };

        /// <summary>
        /// Creates a tag and parses the attributes found in its opening tag.
        /// When an attribute appears more than once, the last occurrence wins.
        /// </summary>
        private HtmlTag(string name, string beginTag, string innerHTML)
        {
            m_Name = name;
            m_BeginTag = beginTag;
            m_InnerHTML = innerHTML;

            foreach (Match match in attrReg.Matches(beginTag))
            {
                String value;
                if (match.Groups[2].Success)
                {
                    value = match.Groups[2].Value;
                }
                else if (match.Groups[3].Success)
                {
                    value = match.Groups[3].Value;
                }
                else
                {
                    value = match.Groups[4].Value;
                }
                m_Attributes[match.Groups[1].Value] = value;
            }
        }

        /// <summary>Returns the text of the opening tag exactly as it was matched.</summary>
        public string GetBeginTag()
        {
            return m_BeginTag;
        }

        /// <summary>Finds the tags called <paramref name="name"/> inside this tag's inner HTML.</summary>
        public List<HtmlTag> FindTag(String name)
        {
            return FindTag(m_InnerHTML, name, String.Format(@"<{0}(\s[^<>]*|)>", name));
        }

        /// <summary>Finds the img tags that carry a src attribute inside this tag's inner HTML.</summary>
        public List<HtmlTag> FindImgTag()
        {
            return FindTag(m_InnerHTML, "img", @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>");
        }

        /// <summary>
        /// Finds tags inside this tag's inner HTML whose opening tag matches the regular
        /// expression <paramref name="format"/>.
        /// </summary>
        public List<HtmlTag> FindTag(String name, String format)
        {
            return FindTag(m_InnerHTML, name, format);
        }

        /// <summary>
        /// Finds tags inside this tag's inner HTML that are called <paramref name="tagName"/>
        /// and whose attribute <paramref name="attrName"/> has the value <paramref name="attrValue"/>.
        /// </summary>
        public List<HtmlTag> FindTagByAttr(String tagName, String attrName, String attrValue)
        {
            return FindTagByAttr(m_InnerHTML, tagName, attrName, attrValue);
        }

        /// <summary>The tag name that was passed to the search that produced this tag.</summary>
        public String TagName
        {
            get { return m_Name; }
        }

        /// <summary>The markup between the opening tag and its matching closing tag.</summary>
        public String InnerHTML
        {
            get { return m_InnerHTML; }
        }

        /// <summary>The inner HTML run through <see cref="checkStr"/>.</summary>
        public String InnerText
        {
            get { return checkStr(m_InnerHTML); }
        }

        /// <summary>
        /// Returns the value of the attribute called <paramref name="name"/> (case-insensitive),
        /// or null when the opening tag has no such attribute or <paramref name="name"/> is null.
        /// </summary>
        public String GetAttribute(string name)
        {
            if (name == null)
            {
                return null;
            }

            String value;
            if (m_Attributes.TryGetValue(name, out value))
            {
                return value;
            }
            return null;
        }

        /// <summary>
        /// The first yyyy-M-d date (years 1600-3999) found in <see cref="InnerText"/>,
        /// or null when the text contains no such date.
        /// </summary>
        public String FindDate
        {
            get
            {
                String text = InnerText;
                if (String.IsNullOrEmpty(text))
                {
                    return null;
                }

                // Match.Groups always contains group 0, so the group count cannot tell a
                // hit from a miss; Success is the reliable test.
                Match m = dateReg.Match(text);
                if (m.Success)
                {
                    return m.Groups["date"].Value;
                }
                return null;
            }
        }

        /// <summary>
        /// Same as <see cref="checkStr"/>, except that the literal line breaks
        /// "&lt;br&gt;" and "&lt;br/&gt;" are kept in the result and "&amp;nbsp;" is left untouched.
        /// Returns the input unchanged when it is null or empty.
        /// </summary>
        public static string checkStr2(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return html;
            }

            // Protect the line breaks with placeholders so the tag stripping leaves them alone.
            html = html.Replace("<br>", "$br$");
            html = html.Replace("<br/>", "$br/$");
            html = StripMarkup(html);
            html = html.Replace("$br$", "<br>");
            html = html.Replace("$br/$", "<br/>");
            return html;
        }

        /// <summary>
        /// Reduces HTML to text: script, iframe and frameset elements are removed together
        /// with their content, every other tag is removed, all space characters are removed,
        /// and finally each "&amp;nbsp;" entity is turned into a single space.
        /// Returns the input unchanged when it is null or empty.
        /// </summary>
        public static string checkStr(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return html;
            }

            html = StripMarkup(html);
            html = html.Replace("&nbsp;", " ");
            return html;
        }

        /// <summary>
        /// Shared core of checkStr and checkStr2. Expects a non-null string.
        /// </summary>
        private static string StripMarkup(string html)
        {
            html = scriptBlockReg.Replace(html, "");   // script elements and their content
            html = iframeBlockReg.Replace(html, "");   // iframe elements and their content
            html = framesetBlockReg.Replace(html, ""); // frameset elements and their content

            // Every tag is dropped here, so no attribute-level filtering (href/javascript,
            // event handlers) and no per-tag pass (img, p, strong) is needed beforehand.
            // Filters of that kind that match across tag boundaries can only damage
            // the visible text.
            html = anyTagReg.Replace(html, "");

            html = html.Replace(" ", "");
            return html;
        }

        /// <summary>
        /// Converts HTML to plain text: script, style and select elements are removed with
        /// their content, remaining tags and "&amp;nbsp;" entities are removed, and every run
        /// of whitespace is collapsed to a single space.
        /// Returns the input unchanged when it is null or empty.
        /// </summary>
        public static string HtmlToText(string str)
        {
            if (string.IsNullOrEmpty(str))
            {
                return str;
            }

            string text = str;
            text = textScriptReg.Replace(text, "");
            text = textStyleReg.Replace(text, "");
            text = textSelectReg.Replace(text, "");
            text = textTagOrNbspReg.Replace(text, "");
            text = whitespaceRunReg.Replace(text, " ");
            return text;
        }

        /// <summary>
        /// Searches <paramref name="html"/> for every tag called <paramref name="tagName"/>
        /// whose attribute <paramref name="attrName"/> has the quoted value <paramref name="attrValue"/>.
        /// Example: FindTagByAttr(html, "div", "class", "demo") returns every div whose class is demo.
        /// </summary>
        /// <remarks>
        /// The three arguments are inserted into a regular expression as they are, without
        /// escaping, so regex metacharacters in them are interpreted as pattern syntax.
        /// </remarks>
        public static List<HtmlTag> FindTagByAttr(String html, String tagName, String attrName, String attrValue)
        {
            String format = String.Format(@"<{0}\s[^<>]*{1}\s*=\s*(\x27|\x22){2}(\x27|\x22)[^<>]*>", tagName, attrName, attrValue);
            return FindTag(html, tagName, format);
        }

        /// <summary>
        /// Searches <paramref name="html"/> for tags whose opening tag matches the regular
        /// expression <paramref name="format"/> and pairs each one with its closing tag,
        /// counting nested tags of the same name.
        /// </summary>
        /// <param name="html">Markup to search; null or empty yields an empty list.</param>
        /// <param name="name">Tag name; it is inserted unescaped into the pattern that finds
        /// nested opening and closing tags.</param>
        /// <param name="format">Regular expression (case-insensitive) for the opening tag.</param>
        /// <returns>
        /// The tags in document order. The search resumes after the closing tag of each
        /// result, so tags nested inside a result are not listed separately. Void elements
        /// such as img or br are returned with empty inner HTML. For any other element the
        /// search stops at the first opening tag that has no matching closing tag.
        /// </returns>
        public static List<HtmlTag> FindTag(String html, String name, String format)
        {
            List<HtmlTag> tags = new List<HtmlTag>();
            if (String.IsNullOrEmpty(html))
            {
                return tags;
            }

            Regex reg = new Regex(format, RegexOptions.IgnoreCase);
            Regex tagReg = new Regex(String.Format(@"<(\/|)({0})(\s[^<>]*|)>", name), RegexOptions.IgnoreCase);
            bool isVoid = IsVoidElement(name);
            int start = 0;

            while (start <= html.Length)
            {
                Match match = reg.Match(html, start);
                if (!match.Success)
                {
                    break;
                }

                int contentStart = match.Index + match.Length;

                if (isVoid)
                {
                    // A void element has no content and no closing tag to look for.
                    tags.Add(new HtmlTag(name, match.Value, String.Empty));
                    // Always move forward, even if the pattern matched an empty string.
                    start = match.Length > 0 ? contentStart : contentStart + 1;
                    continue;
                }

                // Walk over the following opening/closing tags of the same name until the
                // nesting depth returns to zero.
                int depth = 1;
                int cursor = contentStart;
                Match closing = null;
                while (depth > 0)
                {
                    Match tagMatch = tagReg.Match(html, cursor);
                    if (!tagMatch.Success)
                    {
                        break;
                    }

                    cursor = tagMatch.Index + tagMatch.Length;
                    if (tagMatch.Groups[1].Value == "/")
                    {
                        depth--;
                    }
                    else
                    {
                        depth++;
                    }

                    if (depth == 0)
                    {
                        closing = tagMatch;
                    }
                }

                if (closing == null)
                {
                    // Unbalanced markup: nothing further can be paired reliably.
                    break;
                }

                tags.Add(new HtmlTag(name, match.Value, html.Substring(contentStart, closing.Index - contentStart)));
                start = cursor;
            }

            return tags;
        }

        /// <summary>Tells whether <paramref name="name"/> is an HTML void element (case-insensitive).</summary>
        private static bool IsVoidElement(String name)
        {
            foreach (String voidName in voidElements)
            {
                if (String.Equals(voidName, name, StringComparison.OrdinalIgnoreCase))
                {
                    return true;
                }
            }
            return false;
        }
    }
}