1. 程式人生 > >C# html解析器 ,解析HTML的各種工具比較。

C# HTML解析器:解析HTML的各種工具比較。

引用:

http://www.cnblogs.com/gaoweipeng/archive/2009/09/02/1558279.html

介紹了兩種不錯的HTML解析器的方法。

第一種方法:HTML正則表示式的方法。

     參見:http://www.cnblogs.com/gaoweipeng/archive/2009/09/02/1558279.html

     或者直接解析標籤,以前我曾經使用DELPHI就是直接解析,C#也有。

參考網頁:

http://www.codeproject.com/Articles/57176/Parsing-HTML-Tags-in-C

第二種方法:採用.NET自帶的WebBrowser結合HtmlDocument進行解析。

    此方法的問題在於依賴WebBrowser控制項。

原始碼:

using System;
using System.Collections;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.Net;
using System.Web;
using System.IO;

namespace IntrospectionHtml
{
    public partial class Form1 : Form
    {

        /// <summary>
        /// A <see cref="TreeNode"/> that carries the <see cref="HtmlElement"/> it was built from
        /// (if any) together with a category tag.
        /// </summary>
        public class TreeNodeHtmlElm : TreeNode
        {
            /// <summary>Maximum number of characters of an element's outer text used as the node label.</summary>
            private const int MaxLabelLength = 100;

            /// <summary>Category of a node.</summary>
            public enum TypeNode
            {
                Html,
                Form,
                Link,
                Image,
                Css
            }

            /// <summary>The wrapped HTML element; null for nodes created from a plain label.</summary>
            public HtmlElement mHtmlElement;
            private TypeNode type;

            /// <summary>Gets or sets the category of this node.</summary>
            public TypeNode Type
            {
                get { return type; }
                set { type = value; }
            }

            /// <summary>Creates a node with a fixed label and no associated HTML element.</summary>
            /// <param name="elm">Label shown in the tree.</param>
            /// <param name="t">Category of the node.</param>
            public TreeNodeHtmlElm(string elm, TypeNode t)
                : base()
            {
                type = t;
                Text = elm;
                mHtmlElement = null;
            }

            /// <summary>Creates a node for an HTML element with an explicit label.</summary>
            /// <param name="elm">Element the node represents.</param>
            /// <param name="txt">Label shown in the tree.</param>
            /// <param name="t">Category of the node.</param>
            public TreeNodeHtmlElm(HtmlElement elm, string txt, TypeNode t)
                : base()
            {
                type = t;
                Text = txt;
                mHtmlElement = elm;
            }

            /// <summary>
            /// Creates a node for an HTML element and derives the label from the element itself:
            /// the outer text truncated to <see cref="MaxLabelLength"/> characters, or the outer
            /// HTML when the element has no text. The label is empty if reading the element fails.
            /// </summary>
            /// <param name="elm">Element the node represents.</param>
            /// <param name="t">Category of the node.</param>
            public TreeNodeHtmlElm(HtmlElement elm, TypeNode t)
                : base()
            {
                type = t;
                mHtmlElement = elm;
                try
                {
                    // Read the property once instead of on every comparison.
                    string outerText = elm.OuterText;

                    if (string.IsNullOrEmpty(outerText))
                    {
                        Text = elm.OuterHtml;
                    }
                    else if (outerText.Length > MaxLabelLength)
                    {
                        Text = outerText.Substring(0, MaxLabelLength);
                    }
                    else
                    {
                        Text = outerText;
                    }
                }
                catch (Exception)
                {
                    // Best effort: a node with an empty label is preferable to aborting the
                    // construction of the whole tree (this also covers a null element).
                    Text = "";
                }
            }
        }

        // Root "CSS" node of the tree; assigned in webBrowser1_DocumentCompleted and
        // read by SaveTreeNodeHtml.
        private TreeNodeHtmlElm nodeCss;

        /// <summary>
        /// Builds the form, points the second browser at a blank page and sets the
        /// initial splitter position of splitContainer1.
        /// </summary>
        public Form1()
        {
            const string blankPage = "about:blank";
            const int initialSplitterDistance = 50;

            InitializeComponent();

            webBrowser2.Navigate(blankPage);
            splitContainer1.SplitterDistance = initialSplitterDistance;
        }

        /// <summary>
        /// Recursively adds one tree node per descendant of <paramref name="hElmFather"/>
        /// beneath <paramref name="t"/>, mirroring the element hierarchy.
        /// </summary>
        /// <param name="hElmFather">Element whose children are added; a null element adds nothing.</param>
        /// <param name="t">Tree node that receives the new child nodes.</param>
        /// <param name="type">Category assigned to every created node.</param>
        private void FillTree(HtmlElement hElmFather, TreeNodeHtmlElm t,TreeNodeHtmlElm.TypeNode type)
        {
            // Callers pass HtmlDocument.Body, which is null for documents without a body.
            if (hElmFather == null)
            {
                return;
            }

            foreach (HtmlElement hElm in hElmFather.Children)
            {
                TreeNodeHtmlElm node = new TreeNodeHtmlElm(hElm, type);
                t.Nodes.Add(node);

                // Recursing into a childless element is a no-op, so no Count check is needed.
                FillTree(hElm, node, type);
            }
        }

        
        /// <summary>
        /// Passes every form of the document to FillTree so that the elements inside
        /// each form are added beneath <paramref name="t"/>.
        /// </summary>
        /// <param name="doc">Document whose Forms collection is enumerated.</param>
        /// <param name="t">Tree node that receives the new child nodes.</param>
        private void FillTreeForm(HtmlDocument doc, TreeNodeHtmlElm t)
        {
            foreach (HtmlElement formElement in doc.Forms)
            {
                FillTree(formElement, t, TreeNodeHtmlElm.TypeNode.Form);
            }
        }

        /// <summary>
        /// Adds one node per distinct link target (href attribute) found in the document's
        /// Links collection. Only the first element with a given href gets a node.
        /// </summary>
        /// <param name="doc">Document whose Links collection is enumerated.</param>
        /// <param name="t">Tree node that receives the new child nodes.</param>
        private void FillTreeLink(HtmlDocument doc, TreeNodeHtmlElm t)
        {
            // Hash set instead of List.IndexOf: duplicate detection stays O(1) per link.
            HashSet<string> seenHrefs = new HashSet<string>();

            foreach (HtmlElement e in doc.Links)
            {
                string href = e.GetAttribute("href");

                // Skip duplicates: Add returns false when the href was already seen.
                if (seenHrefs.Add(href))
                {
                    TreeNodeHtmlElm node = new TreeNodeHtmlElm(e, href, TreeNodeHtmlElm.TypeNode.Link);
                    t.Nodes.Add(node);
                }
            }
        }

        /// <summary>
        /// Adds one node per distinct image source (src attribute) found in the document's
        /// Images collection. Only the first element with a given src gets a node.
        /// </summary>
        /// <param name="doc">Document whose Images collection is enumerated.</param>
        /// <param name="t">Tree node that receives the new child nodes.</param>
        private void FillTreeImage(HtmlDocument doc, TreeNodeHtmlElm t)
        {
            // Hash set instead of List.IndexOf: duplicate detection stays O(1) per image.
            HashSet<string> seenSources = new HashSet<string>();

            foreach (HtmlElement e in doc.Images)
            {
                string src = e.GetAttribute("src");

                // Skip duplicates: Add returns false when the src was already seen.
                if (seenSources.Add(src))
                {
                    TreeNodeHtmlElm node = new TreeNodeHtmlElm(e, src, TreeNodeHtmlElm.TypeNode.Image);
                    t.Nodes.Add(node);
                }
            }
        }

        /// <summary>
        /// Adds one node per <c>link</c> element whose <c>rel</c> attribute is "stylesheet",
        /// labelled with the element's href.
        /// </summary>
        /// <param name="doc">Document whose All collection is scanned.</param>
        /// <param name="t">Tree node that receives the new child nodes.</param>
        private void FillTreeCss(HtmlDocument doc, TreeNodeHtmlElm t)
        {
            foreach (HtmlElement e in doc.All)
            {
                // Ordinal, case-insensitive comparison: culture-sensitive ToLower() maps "I" to a
                // dotless i under Turkish cultures, so "LINK".ToLower() would never equal "link".
                // The static Equals also tolerates null values.
                if (!string.Equals(e.TagName, "link", StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                if (!string.Equals(e.GetAttribute("rel"), "stylesheet", StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                TreeNodeHtmlElm node = new TreeNodeHtmlElm(e, e.GetAttribute("href"), TreeNodeHtmlElm.TypeNode.Css);
                t.Nodes.Add(node);
            }
        }

        /// <summary>
        /// Renders the inner HTML of the element behind the selected tree node in the
        /// preview browser (webBrowser2). Does nothing when no node is selected, when the
        /// node has no element, or when the preview document is not available.
        /// </summary>
        private void ShowInHtmlPreview()
        {
            TreeNodeHtmlElm tn = (TreeNodeHtmlElm)(treeView1.SelectedNode);

            // Nodes created from a plain label carry no element, so there is nothing to preview.
            if (tn == null || tn.mHtmlElement == null)
            {
                return;
            }

            try
            {
                HtmlDocument previewDocument = webBrowser2.Document;
                if (previewDocument == null || previewDocument.Body == null)
                {
                    return;
                }

                previewDocument.Body.InnerHtml = "<html><body>" + tn.mHtmlElement.InnerHtml + "</body></html>";
            }
            catch (Exception exp)
            {
                // The preview is best effort; keep the UI alive but leave a trace for debugging.
                System.Diagnostics.Debug.WriteLine("ShowInHtmlPreview failed: " + exp);
            }
        }

        /// <summary>
        /// Writes the inner HTML of the element behind the selected tree node to a file,
        /// wrapped in a minimal html/body skeleton and preceded by one stylesheet link per
        /// child of the CSS node.
        /// </summary>
        /// <param name="filename">Path of the file to create or overwrite.</param>
        private void SaveTreeNodeHtml(string filename)
        {
            // TODO (from the original author): move this into its own object.
            TreeNodeHtmlElm tn = (TreeNodeHtmlElm)(treeView1.SelectedNode);

            // Nodes created from a plain label carry no element; bail out before the file is
            // created instead of failing halfway through writing it.
            if (tn == null || tn.mHtmlElement == null)
            {
                return;
            }

            // Write to the path given by the caller, and let 'using' close the writer even
            // when a write fails.
            using (StreamWriter sw = new StreamWriter(filename))
            {
                // Previously a single WriteLine with reversed "\n\r" line endings.
                sw.WriteLine("<html>");
                sw.WriteLine("<body>");

                // TODO (from the original author): add the CSS to the HTML and also copy the file.
                if (nodeCss != null)
                {
                    foreach (TreeNode e in nodeCss.Nodes)
                    {
                        sw.WriteLine("<link rel=\"stylesheet\" href=\"" + e.Text + "\" type=\"text/css\" media=\"screen\" />");
                    }
                }

                sw.WriteLine(tn.mHtmlElement.InnerHtml);
                sw.WriteLine("</body></html>");
            }
        }
        

        /// <summary>
        /// Navigates both browser controls to the address typed in textBox1 and
        /// collapses the second panel of splitContainer3.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            string address = textBox1.Text;

            webBrowser1.Navigate(address);
            webBrowser2.Navigate(address);

            splitContainer3.Panel2Collapsed = true;
        }

        /// <summary>
        /// Expands the second panel of splitContainer3, shows the selected node's HTML
        /// element in the property grid and refreshes the HTML preview.
        /// </summary>
        private void treeView1_AfterSelect(object sender, TreeViewEventArgs e)
        {
            splitContainer3.Panel2Collapsed = false;

            propertyGrid1.SelectedObject = ((TreeNodeHtmlElm)treeView1.SelectedNode).mHtmlElement;

            ShowInHtmlPreview();
        }

        /// <summary>Toggles the collapsed state of the second panel of splitContainer3.</summary>
        private void buttonColapsePropertyGrid_Click(object sender, EventArgs e)
        {
            bool currentlyCollapsed = splitContainer3.Panel2Collapsed;
            splitContainer3.Panel2Collapsed = !currentlyCollapsed;
        }

        /// <summary>
        /// Asks for a file name and saves the selected node's HTML to it. Does nothing
        /// when no node is selected or the dialog is not confirmed.
        /// </summary>
        private void saveToolStripMenuItem_Click(object sender, EventArgs e)
        {
            bool hasSelection = treeView1.SelectedNode != null;

            // Short-circuit: the dialog is only shown when a node is selected.
            if (hasSelection && saveFileDialog1.ShowDialog() == DialogResult.OK)
            {
                SaveTreeNodeHtml(saveFileDialog1.FileName);
            }
        }

        /// <summary>
        /// Click handler that refreshes the HTML preview for the currently selected tree
        /// node by delegating to ShowInHtmlPreview.
        /// </summary>
        private void showInHtmlToolStripMenuItem_Click(object sender, EventArgs e)
        {
            ShowInHtmlPreview();
        }

        /// <summary>
        /// Rebuilds the tree view from the document loaded in webBrowser1: one root node
        /// each for the body hierarchy, the forms, the links, the images and the stylesheets.
        /// </summary>
        private void webBrowser1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
        {
            HtmlDocument doc = webBrowser1.Document;

            // Suspend repainting while the tree is rebuilt; large pages add many nodes.
            treeView1.BeginUpdate();
            try
            {
                treeView1.Nodes.Clear();

                if (doc == null)
                {
                    return;
                }

                TreeNodeHtmlElm nodeBody = AddCategoryNode("body", TreeNodeHtmlElm.TypeNode.Html);
                // Body is null for documents without a body element.
                if (doc.Body != null)
                {
                    FillTree(doc.Body, nodeBody, TreeNodeHtmlElm.TypeNode.Html);
                }

                FillTreeForm(doc, AddCategoryNode("forms", TreeNodeHtmlElm.TypeNode.Form));
                FillTreeLink(doc, AddCategoryNode("Links", TreeNodeHtmlElm.TypeNode.Link));
                FillTreeImage(doc, AddCategoryNode("Images", TreeNodeHtmlElm.TypeNode.Image));

                // Kept in a field because SaveTreeNodeHtml reads the stylesheet list from it.
                nodeCss = AddCategoryNode("CSS", TreeNodeHtmlElm.TypeNode.Css);
                FillTreeCss(doc, nodeCss);
            }
            finally
            {
                treeView1.EndUpdate();
            }
        }

        /// <summary>Creates a labelled root node of the given category and appends it to treeView1.</summary>
        private TreeNodeHtmlElm AddCategoryNode(string label, TreeNodeHtmlElm.TypeNode category)
        {
            TreeNodeHtmlElm node = new TreeNodeHtmlElm(label, category);
            treeView1.Nodes.Add(node);
            return node;
        }

        /// <summary>
        /// Reuses the button1 click handler, forwarding the same sender and event arguments.
        /// NOTE(review): presumably wired to textBox1's Validated event in the designer
        /// file, which is not visible here; confirm.
        /// </summary>
        private void textBox1_Validated(object sender, EventArgs e)
        {
            button1_Click(sender, e);
        }

        /// <summary>
        /// Enables the context menu only while a tree node is selected. The switch on the
        /// node category currently performs no work for any category.
        /// </summary>
        private void contextMenuStrip1_Opening(object sender, CancelEventArgs e)
        {
            bool hasSelection = treeView1.SelectedNode != null;
            contextMenuStrip1.Enabled = hasSelection;

            if (!hasSelection)
            {
                return;
            }

            TreeNodeHtmlElm selected = (TreeNodeHtmlElm)treeView1.SelectedNode;
            switch (selected.Type)
            {
                // Placeholder: no category customises the menu.
                case TreeNodeHtmlElm.TypeNode.Html:
                case TreeNodeHtmlElm.TypeNode.Form:
                case TreeNodeHtmlElm.TypeNode.Css:
                case TreeNodeHtmlElm.TypeNode.Image:
                case TreeNodeHtmlElm.TypeNode.Link:
                    break;
            }
        }


    }
}

第三種方法:第三方開源元件。

    Winista.Htmlparser.Net,原始碼參考引用的地址。

http://www.cnblogs.com/gaoweipeng/archive/2009/09/02/1558279.html

    HtmlAgilityPack。

至於怎麼下載,請自行用百度或谷歌搜尋。

我們比較一下。

    Winista.HTMLParser比較龐大,還要引用CSharp.zip包,功能強大,提供了HTTP協議支援。

    HtmlAgilityPack比較輕量,而且解析速度比Winista.HTMLParser快5倍左右,這個可能是輕量的好處。

我試用同樣一個頁面134的HTML頁面。

    Winista.HTMLParser:大約39000毫秒。

    HTMLAgilityPack:大約7800毫秒.

但是,HTMLAgilityPack對於中文支援差一些,BUG還不少,希望逐步改進。

如果只是做網路爬蟲、依網址抓取並解析網頁,HtmlAgilityPack比Winista.HTMLParser好很多,而且足夠用了;至於壓縮功能,沒有必要吧。

強烈推薦HtmlAgilityPack.