1. 程式人生 > Java網路爬蟲crawler4j學習筆記:SAX解析工具類

Java網路爬蟲crawler4j學習筆記 SAX解析工具類

ExtractedUrlAnchorPair 類

package edu.uci.ics.crawler4j.parser;

/**
 * One hyperlink extracted from HTML text, split into its parts: the link target
 * ({@code href}), the anchor text ({@code anchor}) and the name of the HTML tag the
 * link came from ({@code tag}).
 *
 * <p>All three parts start out as {@code null} and are filled in through the setters.
 */
public class ExtractedUrlAnchorPair {

  /** Link target of the hyperlink. */
  private String href;

  /** Anchor text of the hyperlink. */
  private String anchor;

  /** Name of the HTML tag the hyperlink was taken from. */
  private String tag;

  /**
   * @param href the link target to store
   */
  public void setHref(String href) {
    this.href = href;
  }

  /**
   * @param anchor the anchor text to store
   */
  public void setAnchor(String anchor) {
    this.anchor = anchor;
  }

  /**
   * @param tag the HTML tag name to store
   */
  public void setTag(String tag) {
    this.tag = tag;
  }

  /**
   * @return the stored link target, or {@code null} if none was set
   */
  public String getHref() {
    return this.href;
  }

  /**
   * @return the stored anchor text, or {@code null} if none was set
   */
  public String getAnchor() {
    return this.anchor;
  }

  /**
   * @return the stored HTML tag name, or {@code null} if none was set
   */
  public String getTag() {
    return this.tag;
  }
}

HtmlContentHandler 類

HtmlContentHandler 類使用 SAX 來解析網頁(參見 SAX 解析示例)。你也可以在其中額外增加自己的 SAX 處理邏輯,來抽取你想要的內容。

package edu.uci.ics.crawler4j.parser;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

// SAX parser API
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

public
class HtmlContentHandler extends DefaultHandler { private final int MAX_ANCHOR_LENGTH = 100; private enum Element { A, AREA, LINK, IFRAME, FRAME, EMBED, IMG, BASE, META, BODY } // 建立string型別的標籤名到其對應的Element之間的對映關係 private static class HtmlFactory { private static Map<String, Element> name2Element; static { name2Element = new HashMap<>(); for (Element element : Element.values()) { //建立字串到enum變數的對映 name2Element.put(element.toString().toLowerCase(), element); } } public static Element getElement(String name) { return name2Element.get(name); } } private String base; // 當前網頁的base標籤的href的值,是當前頁面的basePath private String metaRefresh; // meta標籤,http-equiv=refresh private String metaLocation; // meta標籤,http-equiv="location" private Map<String, String> metaTags = new HashMap<>(); // 網頁的meta標籤鍵值對(可能有多個) private boolean isWithinBodyElement; // 是不是在body標籤之內 private StringBuilder bodyText; // body標籤內包含的文字內容 private List<ExtractedUrlAnchorPair> outgoingUrls; // 外鏈集合 private ExtractedUrlAnchorPair curUrl = null; // 當前處理的url所生成的ExtractedUrlAnchorPair物件 private boolean anchorFlag = false; // 是否有anchor private StringBuilder anchorText = new StringBuilder(); // 錨文字 public HtmlContentHandler() { isWithinBodyElement = false; bodyText = new StringBuilder(); outgoingUrls = new ArrayList<>(); } @Override public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException { // 根據字串,獲取對應的element Element element = HtmlFactory.getElement(localName); // 如果是a,area,link標籤,得到標籤中的href超連結地址,並加入到outgoingUrls集合中。並設定anchorFlag為true,表示有錨文字。 // 如<a href="example.com">A sample anchor</a>中,anchor='A sample anchor',href=‘example.com’,tag='a' if (element == Element.A || element == Element.AREA || element == Element.LINK) { String href = attributes.getValue("href"); if (href != null) { anchorFlag = true; addToOutgoingUrls(href, localName); } } else if (element == Element.IMG) { // 如果為img標籤,則獲取其中的src,加入到外鏈集合 String imgSrc = attributes.getValue("src"); if (imgSrc != 
null) { addToOutgoingUrls(imgSrc, localName); } } else if (element == Element.IFRAME || element == Element.FRAME || element == Element.EMBED) { String src = attributes.getValue("src"); // 如果為iframe,frame,embed標籤,獲取src,加入到外鏈集合 if (src != null) { addToOutgoingUrls(src, localName); } } else if (element == Element.BASE) { // 只考慮第一個base標籤,其中的href屬性表示base路徑(詳見HTML base標籤) if (base != null) { // We only consider the first occurrence of the Base element. String href = attributes.getValue("href"); if (href != null) { base = href; } } } else if (element == Element.META) { String equiv = attributes.getValue("http-equiv"); if (equiv == null) { // This condition covers several cases of XHTML meta equiv = attributes.getValue("name"); // 使用name屬性(詳見HTML meta標籤) } String content = attributes.getValue("content"); if (equiv != null && content != null) { equiv = equiv.toLowerCase(); metaTags.put(equiv, content); // http-equiv="refresh" content="0;URL=http://foo.bar/..." if (equiv.equals("refresh") && (metaRefresh == null)) { // refresh跳轉 int pos = content.toLowerCase().indexOf("url="); if (pos != -1) { metaRefresh = content.substring(pos + 4); // 跳轉的目的地址 } addToOutgoingUrls(metaRefresh, localName); } // http-equiv="location" content="http://foo.bar/..." 
if (equiv.equals("location") && (metaLocation == null)) { // location重定向 metaLocation = content; // 重定向目的地址 //addToOutgoingUrls(metaRefresh, localName); // 原版本這裡出錯,github上已改正 addToOutgoingUrls(metaLocation, localName); } } } else if (element == Element.BODY) { // body標籤 isWithinBodyElement = true; } } // 設定外鏈的tag,href,並加入到外鏈集合中 private void addToOutgoingUrls(String href, String tag) { curUrl = new ExtractedUrlAnchorPair(); curUrl.setHref(href); curUrl.setTag(tag); outgoingUrls.add(curUrl); } @Override public void endElement(String uri, String localName, String qName) throws SAXException { Element element = HtmlFactory.getElement(localName); // 包含anchor的標籤結束了 if (element == Element.A || element == Element.AREA || element == Element.LINK) { anchorFlag = false; // 設定當前curUrl的錨文字內容 if (curUrl != null) { String anchor = anchorText.toString().replaceAll("\n", " ").replaceAll("\t", " ").trim(); if (!anchor.isEmpty()) { //超過最大長度限制時,進行擷取 if (anchor.length() > MAX_ANCHOR_LENGTH) { anchor = anchor.substring(0, MAX_ANCHOR_LENGTH) + "..."; } curUrl.setTag(localName); curUrl.setAnchor(anchor); } // 清空,以供下次使用 anchorText.delete(0, anchorText.length()); } curUrl = null; } else if (element == Element.BODY) { // body標籤結束 isWithinBodyElement = false; } } @Override public void characters(char ch[], int start, int length) throws SAXException { if (isWithinBodyElement) { if (bodyText.length() > 0) { bodyText.append(' '); } // 記錄下body中的內容 bodyText.append(ch, start, length); //如果有錨文字,記錄下來 if (anchorFlag) { anchorText.append(new String(ch, start, length)); } } } public String getBodyText() { return bodyText.toString(); } public List<ExtractedUrlAnchorPair> getOutgoingUrls() { return outgoingUrls; } public String getBaseUrl() { return base; } public Map<String, String> getMetaTags() { return metaTags; } }