1. 程式人生 > >.net使用abot爬蟲簡單例子

.net使用abot爬蟲簡單例子

ldoc edt pan cat style 指定 gles ted 應該

Abot 是一個開源的 .NET 爬蟲框架,速度快,易於使用和擴展。項目的地址是 https://code.google.com/p/abot/(Google Code 已停止服務,項目現托管於 https://github.com/sjdirect/abot)。

爬取的html解析,我們使用AngleSharp,項目的地址:https://github.com/AngleSharp/AngleSharp

首先我們需要配置abot

 private static readonly Uri FeedUrl = new Uri("https://www.jd.com/allSort.aspx");// URL to crawl; JD.com's product category index is used as the example

 /// <summary>
        /// Builds a <c>PoliteWebCrawler</c> with an explicit, in-code configuration,
        /// registers the crawl-decision delegates and subscribes the page event handlers.
        /// </summary>
        /// <returns>The configured crawler, ready to have <c>Crawl</c> called on it.</returns>
        public static IWebCrawler GetManuallyConfiguredWebCrawler()
        {
            // Crawl settings; see the Abot CrawlConfiguration source/docs for the exact meaning of each option.
            CrawlConfiguration config = new CrawlConfiguration();
            config.MaxConcurrentThreads = System.Environment.ProcessorCount;
            config.MaxPagesToCrawl = 1000;
            config.IsExternalPageCrawlingEnabled = false;
            config.IsUriRecrawlingEnabled = false;
            config.IsExternalPageLinksCrawlingEnabled = false;
            config.IsRespectRobotsDotTextEnabled = false;
            config.DownloadableContentTypes = "text/html, text/plain";
            config.MinCrawlDelayPerDomainMilliSeconds = 1000;
            // NOTE(review): 0 presumably means "no limit" for the two settings below - confirm against the Abot docs.
            config.CrawlTimeoutSeconds = 0;
            config.MaxPagesToCrawlPerDomain = 0;

            // Passing null for the remaining constructor arguments leaves those components to the crawler itself.
            var crawler = new PoliteWebCrawler(config, null, null, null, null, null, null, null, null);

            // Decision delegates consulted before a page is crawled, downloaded, or has its links followed.
            crawler.ShouldCrawlPage(ShouldCrawlPage);
            crawler.ShouldDownloadPageContent(ShouldDownloadPageContent);
            crawler.ShouldCrawlPageLinks(ShouldCrawlPageLinks);

            // Page events. Only the first two are handled in this example.
            crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;         // a single page is about to be crawled
            crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompletedAsync;  // a single page has been crawled
            // crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;           // the page was not allowed to be crawled
            // crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed; // the page's links were not allowed to be crawled

            return crawler;
        }

爬蟲中主要有4個事件:頁面爬取開始、頁面爬取結束、頁面不允許爬取、頁面中的鏈接不允許爬取。

以下是示例

 // Handler for the crawler's PageCrawlStartingAsync event (a single page is about to be crawled).
        public static void crawler_ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e)
        {
            // The page about to be crawled is read here but not used further in this example.
            var upcomingPage = e.PageToCrawl;
        }
        /// <summary>
        /// Handler for the crawler's PageCrawlCompletedAsync event (a single page has been crawled).
        /// When the crawled page is <c>FeedUrl</c>, walks the three-level category markup
        /// with AngleSharp and appends the extracted text to "fake.txt".
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data carrying the crawled page.</param>
        public static void crawler_ProcessPageCrawlCompletedAsync(object sender, PageCrawlCompletedArgs e)
        {
            // Only the category index page is parsed; every other page is ignored.
            if (e.CrawledPage.Uri != FeedUrl)
            {
                return;
            }

            // Parse the downloaded HTML with AngleSharp. QuerySelector returns null when nothing
            // matches, so each lookup is checked before it is dereferenced.
            var container = e.CrawledPage.AngleSharpHtmlDocument.QuerySelector(".category-items");
            if (container == null)
            {
                return;
            }

            StringBuilder sb = new StringBuilder();
            foreach (var column in container.Children)
            {
                foreach (var category in column.QuerySelectorAll(".category-item"))
                {
                    var firstLevel = category.QuerySelector(".item-title span");
                    var secondLevelList = category.QuerySelector(".items");
                    if (firstLevel == null || secondLevelList == null)
                    {
                        continue; // markup does not have the expected shape; skip this category
                    }

                    sb.Append("\r\n" + firstLevel.Text() + "\r\n");

                    foreach (var second in secondLevelList.Children)
                    {
                        var secondLink = second.QuerySelector("dt a");
                        var thirdLevelList = second.QuerySelector("dd");
                        if (secondLink == null || thirdLevelList == null)
                        {
                            continue;
                        }

                        sb.Append(secondLink.Text() + "\t");

                        int lengthBeforeThirdLevel = sb.Length;
                        foreach (var third in thirdLevelList.Children)
                        {
                            sb.Append(third.Text() + ",");
                        }

                        // Drop the trailing comma, but only if one was written; otherwise the
                        // tab separator appended above would be removed instead.
                        if (sb.Length > lengthBeforeThirdLevel)
                        {
                            sb.Remove(sb.Length - 1, 1);
                        }
                    }
                }
            }

            // The relative path resolves against the process working directory. The original author
            // notes this is C:\Program Files (x86)\IIS Express when hosted there, which may require
            // running Visual Studio as administrator.
            System.IO.File.AppendAllText("fake.txt", sb.ToString());
        }

        #region Crawl decision delegates

        /// <summary>
        /// Synchronous delegate registered with the crawler to decide whether a page should be crawled.
        /// </summary>
        /// <param name="pageToCrawl">The candidate page.</param>
        /// <param name="crawlContext">The current crawl context (unused).</param>
        /// <returns>Allow for a retry, the root page, or <c>FeedUrl</c>; otherwise a refusal with a reason.</returns>
        public static CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
        {
            if (pageToCrawl.IsRetry || pageToCrawl.IsRoot || FeedUrl == pageToCrawl.Uri)
            {
                return new CrawlDecision { Allow = true };
            }

            // Allow = false tells the crawler not to crawl the page.
            return new CrawlDecision { Allow = false, Reason = "Not match uri" };
        }

        /// <summary>
        /// Synchronous delegate registered with the crawler to decide whether a page's content should be downloaded.
        /// </summary>
        /// <param name="pageToCrawl">The candidate page.</param>
        /// <param name="crawlContext">The current crawl context (unused).</param>
        /// <returns>Allow for the root page, a retry, or <c>FeedUrl</c>; otherwise a refusal with a reason.</returns>
        private static CrawlDecision ShouldDownloadPageContent(PageToCrawl pageToCrawl, CrawlContext crawlContext)
        {
            if (pageToCrawl.IsRoot || pageToCrawl.IsRetry || FeedUrl == pageToCrawl.Uri)
            {
                return new CrawlDecision { Allow = true };
            }

            return new CrawlDecision { Allow = false, Reason = "Not match uri" };
        }

        /// <summary>
        /// Synchronous delegate registered with the crawler to decide whether the links found on a crawled page should be followed.
        /// </summary>
        /// <param name="crawledPage">The page that has just been crawled.</param>
        /// <param name="crawlContext">The current crawl context (unused).</param>
        /// <returns>A refusal for non-internal pages; allow for the root page, a retry, or <c>FeedUrl</c>; otherwise a refusal with a reason.</returns>
        private static CrawlDecision ShouldCrawlPageLinks(CrawledPage crawledPage, CrawlContext crawlContext)
        {
            if (!crawledPage.IsInternal)
            {
                return new CrawlDecision { Allow = false, Reason = "We dont crawl links of external pages" };
            }

            if (crawledPage.IsRoot || crawledPage.IsRetry || crawledPage.Uri == FeedUrl)
            {
                return new CrawlDecision { Allow = true };
            }

            return new CrawlDecision { Allow = false, Reason = "We only crawl links of pagination pages" };
        }

        #endregion

接下來就是測試

        // Test action: runs the crawl against FeedUrl and writes any crawl error to the response.
        public ActionResult Index()
        {
            IWebCrawler webCrawler = GetManuallyConfiguredWebCrawler();
            var crawlResult = webCrawler.Crawl(FeedUrl);

            // Echo the result's ErrorException to the response before rendering the view.
            Response.Write(crawlResult.ErrorException);
            return View();
        }

.net使用abot爬蟲簡單例子