1. 程式人生 > 【垂直搜尋引擎搭建14】HtmlParser中Filter方法(URL網路地址)

【垂直搜尋引擎搭建14】HtmlParser中Filter方法(URL網路地址)

1、TagNameFilter

import java.io.IOException;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * Demo of {@link TagNameFilter}: prints the nodes of a page that the filter
 * constructed with the tag name "div" accepts.
 */
public class TagNameFilterDemo {

    /**
     * Builds a {@link Parser} for {@code url}, extracts every node accepted by
     * {@code new TagNameFilter("div")}, and writes each node's
     * {@code toString()} to standard output, one per line.
     *
     * @param url resource string handed to the {@link Parser} constructor
     * @throws IOException declared in the signature; kept so existing callers
     *         compile unchanged
     * @throws ParserException propagated from the parser calls
     */
    public static void getContent(String url) throws IOException, ParserException {
        Parser pageParser = new Parser(url);
        NodeFilter divOnly = new TagNameFilter("div");
        NodeList matches = pageParser.extractAllNodesThatMatch(divOnly);

        // Nothing to print when the parser hands back no list at all.
        if (matches == null) {
            return;
        }

        for (int idx = 0; idx < matches.size(); idx++) {
            Node current = matches.elementAt(idx);
            System.out.println(current.toString());
        }
    }

    /**
     * Runs the demo against a fixed article URL.
     *
     * @param args ignored
     * @throws ParserException propagated from {@link #getContent(String)}
     * @throws IOException propagated from {@link #getContent(String)}
     */
    public static void main(String[] args) throws ParserException, IOException {
        getContent("http://yemacaijing.baijia.baidu.com/article/598342");
    }
}

2、HasChildFilter

import java.io.IOException;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.HasChildFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;


/**
 * Demo of {@link HasChildFilter}: prints the nodes accepted by a
 * {@code HasChildFilter} that wraps a {@code TagNameFilter("div")}.
 */
public class HasChildFilterDemo {

    /**
     * Builds a {@link Parser} for {@code url}, extracts every node accepted by
     * the composed filter, and prints each one to standard output.
     *
     * @param url resource string handed to the {@link Parser} constructor
     * @throws ParserException propagated from the parser calls
     * @throws IOException declared in the signature; kept so existing callers
     *         compile unchanged
     */
    public static void getContent(String url) throws ParserException, IOException {
        Parser pageParser = new Parser(url);
        // Outer filter wraps the inner "div" tag-name filter.
        NodeFilter hasDivChild = new HasChildFilter(new TagNameFilter("div"));
        printNodes(pageParser.extractAllNodesThatMatch(hasDivChild));
    }

    /** Prints {@code toString()} of every node in {@code found}; does nothing for {@code null}. */
    private static void printNodes(NodeList found) {
        if (found == null) {
            return;
        }
        for (int pos = 0; pos < found.size(); pos++) {
            Node item = found.elementAt(pos);
            String text = item.toString();
            System.out.println(text);
        }
    }

    /**
     * Runs the demo against a fixed article URL.
     *
     * @param args ignored
     * @throws ParserException propagated from {@link #getContent(String)}
     * @throws IOException propagated from {@link #getContent(String)}
     */
    public static void main(String[] args) throws ParserException, IOException {
        getContent("http://yemacaijing.baijia.baidu.com/article/598342");
    }

}

3、HasAttributeFilter


import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;


/**
 * Demo of {@link HasAttributeFilter}: prints the nodes accepted by a filter
 * constructed with the attribute name "href".
 */
public class HasAttributeFilterDemo {

    /**
     * Builds a {@link Parser} for {@code url}, extracts every node accepted by
     * {@code new HasAttributeFilter("href")}, and writes each node's
     * {@code toString()} to standard output, one per line.
     *
     * @param url resource string handed to the {@link Parser} constructor
     * @throws ParserException propagated from the parser calls
     */
    public static void getContent(String url) throws ParserException {
        Parser pageParser = new Parser(url);
        NodeFilter withHref = new HasAttributeFilter("href");
        NodeList hits = pageParser.extractAllNodesThatMatch(withHref);

        // A null result means there is nothing to walk.
        if (hits == null) {
            return;
        }

        int cursor = 0;
        while (cursor < hits.size()) {
            Node hit = hits.elementAt(cursor);
            System.out.println(hit.toString());
            cursor++;
        }
    }

    /**
     * Runs the demo against a fixed article URL.
     *
     * @param args ignored
     * @throws ParserException propagated from {@link #getContent(String)}
     */
    public static void main(String[] args) throws ParserException {
        getContent("http://yemacaijing.baijia.baidu.com/article/598342");
    }

}