1. 程式人生 > 爬蟲基礎之Jsoup解析HTML

爬蟲基礎之Jsoup解析HTML

Jsoup的Maven座標

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.7.2</version>
</dependency>

Jsoup解析HTML得到Document的幾種方式:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.IOException;

/**
 * Walks through several ways of obtaining a Jsoup {@link Document} from HTML
 * and prints a small piece of each result to standard output.
 */
public class JsoupDom {

    /** Complete (but empty-bodied) HTML page used for the string-parsing example. */
    private static final String SAMPLE_PAGE = "<!DOCTYPE html>\n" +
            "<html lang=\"en\">\n" +
            "<head>\n" +
            "    <meta charset=\"UTF-8\">\n" +
            "    <title>Title</title>\n" +
            "</head>\n" +
            "<body>\n" +
            "\n" +
            "</body>\n" +
            "</html>";

    /** Bare anchor markup used for the body-fragment example. */
    private static final String SAMPLE_FRAGMENT = "<a href='#'>連線</a>";

    /**
     * Runs each example in turn: string parsing, URL fetching, then
     * body-fragment parsing.
     *
     * @param args command-line arguments (unused)
     * @throws IOException if fetching the remote page fails
     */
    public static void main(String[] args) throws IOException {
        printTitleFromString();
        printCityHeadingsFromUrl();
        // Way 3: build a Document from a local file with an explicit charset.
        // Left disabled in this demo ("html_path" is presumably a placeholder path):
        //   Document fromFile = Jsoup.parse(new File("html_path"), "UTF-8");
        printTextFromBodyFragment();
    }

    /** Way 1: parse an in-memory HTML string and print the document title. */
    private static void printTitleFromString() {
        Document parsedPage = Jsoup.parse(SAMPLE_PAGE);
        String pageTitle = parsedPage.title();
        System.out.println(pageTitle);
    }

    /**
     * Way 2: load a Document from a URL, select the {@code .city h3} elements,
     * and print the selection followed by its text, separated by {@code ",,,"}.
     *
     * @throws IOException if the connection or download fails
     */
    private static void printCityHeadingsFromUrl() throws IOException {
        Document remotePage = Jsoup.connect("http://www.bingosoft.net").get();
        Elements cityHeadings = remotePage.select(".city h3");
        System.out.println(cityHeadings + ",,," + cityHeadings.text());
    }

    /** Way 4: parse an HTML body fragment and print the resulting document's text. */
    private static void printTextFromBodyFragment() {
        Document fragmentDoc = Jsoup.parseBodyFragment(SAMPLE_FRAGMENT);
        System.out.println(fragmentDoc.text());
    }
}