1. 程式人生 > >Java網路爬蟲crawler4j學習筆記 Parser 類

Java網路爬蟲crawler4j學習筆記 Parser 類

簡介

Parser類負責將從伺服器得到的byte[]資料(儲存在Page物件裡)進行解析,按照binary,text,html的型別,分別呼叫相應的ParseData類。這裡有個容易混淆的點:類BinaryParseData,TextParseData,HtmlParseData的命名有點不好,它們表示的意思是parse之後得到的關於網頁的規範化的Data,而不是動名詞結構(parse data)。

原始碼

package edu.uci.ics.crawler4j.parser;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import
java.io.InputStream; import java.io.OutputStream; import java.io.PrintStream; import java.io.UnsupportedEncodingException; import java.util.HashSet; import java.util.Set; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerConfigurationException; import
javax.xml.transform.sax.SAXTransformerFactory; import javax.xml.transform.sax.TransformerHandler; import javax.xml.transform.stream.StreamResult; import edu.uci.ics.crawler4j.url.WebURL; // Tika是一個內容分析工具,自帶全面的parser工具類, // 能解析基本所有常見格式的檔案,得到檔案的metadata,content等內容,返回格式化資訊 import org.apache.tika.metadata.Metadata; import
org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.slf4j.Logger; import org.slf4j.LoggerFactory; //二進位制檔案解析類 public class BinaryParseData implements ParseData { private static final Logger logger = LoggerFactory.getLogger(BinaryParseData.class); private static final String DEFAULT_ENCODING = "UTF-8"; // 預設編碼utf-8 private static final String DEFAULT_OUTPUT_FORMAT = "html"; //預設輸出格式html // Creates an auto-detecting parser instance using the default Tika configuration. private static final Parser AUTO_DETECT_PARSER = new AutoDetectParser(); private static final SAXTransformerFactory SAX_TRANSFORMER_FACTORY = (SAXTransformerFactory) SAXTransformerFactory.newInstance(); // Parse context. Used to pass context information to Tika parsers. private final ParseContext context = new ParseContext(); // 頁面的所有外鏈 private Set<WebURL> outgoingUrls = new HashSet<>(); // 從二進位制資料中得到的html內容 private String html = null; public BinaryParseData() { context.set(Parser.class, AUTO_DETECT_PARSER); } public void setBinaryContent(byte[] data) { InputStream inputStream = new ByteArrayInputStream(data); ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); try { TransformerHandler handler = getTransformerHandler(outputStream, DEFAULT_OUTPUT_FORMAT, DEFAULT_ENCODING); AUTO_DETECT_PARSER.parse(inputStream, handler, new Metadata(), context); // Hacking the following line to remove Tika's inserted DocType String htmlContent = new String(outputStream.toByteArray(), DEFAULT_ENCODING).replace("http://www.w3.org/1999/xhtml", ""); setHtml(htmlContent); } catch (Exception e) { logger.error("Error parsing file", e); } } /** * Returns a transformer handler that serializes incoming SAX events to * XHTML or HTML (depending the given method) using the given output encoding. 
* * @param encoding output encoding, or <code>null</code> for the platform default */ private static TransformerHandler getTransformerHandler(OutputStream out, String method, String encoding) throws TransformerConfigurationException { TransformerHandler transformerHandler = SAX_TRANSFORMER_FACTORY.newTransformerHandler(); Transformer transformer = transformerHandler.getTransformer(); transformer.setOutputProperty(OutputKeys.METHOD, method); // html // the Transformer may add additional whitespace when outputting the result tree; transformer.setOutputProperty(OutputKeys.INDENT, "yes"); if (encoding != null) { transformer.setOutputProperty(OutputKeys.ENCODING, encoding); } transformerHandler.setResult(new StreamResult(new PrintStream(out))); return transformerHandler; } /** @return Parsed binary content or null */ public String getHtml() { return html; } public void setHtml(String html) { this.html = html; } @Override public Set<WebURL> getOutgoingUrls() { return outgoingUrls; } @Override public void setOutgoingUrls(Set<WebURL> outgoingUrls) { this.outgoingUrls = outgoingUrls; } @Override public String toString() { return (html == null || html.isEmpty()) ? "No data parsed yet" : getHtml(); } }