1. 程式人生 > HttpClient&Jsoup爬蟲的簡單應用

HttpClient&Jsoup爬蟲的簡單應用

target utf-8 gpo art t對象 設置 int sel 發送

  詳細的介紹已經有很多前輩總結,引用一下該篇文章:https://blog.csdn.net/zhuwukai/article/details/78644484

  下面是一個代碼的示例:

package com.http.client;

import java.io.IOException;

import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet; import org.apache.http.conn.params.ConnRouteParams; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.params.CoreConnectionPNames; import org.apache.http.util.EntityUtils; import org.apache.log4j.Logger; /** * * @author oo * @date 2018-04-04
*/ public class MyHttpClient { private static Logger logger = Logger.getLogger(MyHttpClient.class); /** * 需求:使用httpclient 爬取 網站數據 * * @param args */ public static void main(String[] args) { // 創建HttpClient 對象 HttpClient hclient = new DefaultHttpClient();
// 設置響應時間 傳輸源碼時間 代理服務器(設置代理服務器的目的是:防止爬數據被封ip) hclient.getParams().setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 20000) .setParameter(CoreConnectionPNames.SO_TIMEOUT, 20000) .setParameter(ConnRouteParams.DEFAULT_PROXY, new HttpHost("111.155.116.237", 8123)); HttpGet hGet = new HttpGet("http://www.itcast.cn/"); String content = ""; try { // 向網站發送請求,獲取網頁源碼 HttpResponse execute = hclient.execute(hGet); // EntityUtils工具類把網頁實體轉換成字符串 content = EntityUtils.toString(execute.getEntity(), "utf-8"); } catch (ClientProtocolException e) { e.printStackTrace(); logger.error("********ClientProtocolException" + e); } catch (IOException e) { e.printStackTrace(); logger.error("********IOException" + e); } System.out.println(content); } }

  使用Jsoup進行請求:

package com.http.client;

import java.io.IOException;

import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Demo crawler: fetches and parses a page directly with Jsoup.
 */
public class MyJsoup {
    private static Logger logger = Logger.getLogger(MyJsoup.class);

    /**
     * Requests the page with Jsoup and prints every anchor's text and href.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            // Jsoup issues the GET request and parses the response in one step.
            Document document = Jsoup.connect("http://www.itcast.cn").get();
            // Collect all <a> elements on the page.
            Elements elements = document.getElementsByTag("a");
            String val = elements.text();
            System.out.println(val);

            // Print each link's text together with its target URL.
            for (Element element : elements) {
                System.out.println(element.text() + ":" + element.attr("href"));
            }
        } catch (IOException e) {
            // Pass the throwable as the second argument so log4j records the
            // full stack trace (string concatenation only logs e.toString()).
            logger.error("***********IOException: 連接失敗", e);
        }
    }
}

  HttpClient 結合Jsoup:

 1 package com.http.client;
 2 
 3 import java.io.IOException;
 4 
 5 import org.apache.http.HttpResponse;
 6 import org.apache.http.client.ClientProtocolException;
 7 import org.apache.http.client.HttpClient;
 8 import org.apache.http.client.methods.HttpGet;
 9 import org.apache.http.impl.client.DefaultHttpClient;
10 import org.apache.http.util.EntityUtils;
11 import org.jsoup.Jsoup;
12 import org.jsoup.nodes.Document;
13 import org.jsoup.nodes.Element;
14 import org.jsoup.select.Elements;
15 
16 public class HttpCLientAndJsoup {
17 
18     public static void main(String[] args) throws ClientProtocolException, IOException {
19         // 創建HttpClient對象
20         HttpClient hClient = new DefaultHttpClient();
21         // 爬蟲URL大部分都是get請求,創建get請求對象
22         HttpGet hget = new HttpGet("http://www.itcast.cn/");
23         // 向網站發送請求,獲取網頁源碼
24         HttpResponse response = hClient.execute(hget);
25         // EntityUtils工具類把網頁實體轉換成字符串
26         String content = EntityUtils.toString(response.getEntity(), "utf-8");
27         // Jsoup負責解析網頁
28         Document doc = Jsoup.parse(content);
29         // 使用元素選擇器選擇網頁內容
30         Elements elements = doc.select("div.salary_con li");
31         // System.out.println(elements.text());
32         for (Element element : elements) {
33             String text = element.text();
34             System.out.println(text);
35         }
36 
37     }
38 
39 }

HttpClient&Jsoup爬蟲的簡單應用