1. 程式人生 > >java爬蟲-0020,httpclient獲取原始碼

java爬蟲-0020,httpclient獲取原始碼

1、匯入httpclient依賴

<dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.3</version>
        </dependency>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient-cache</artifactId>
            <version>4.3</version>
        </dependency>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpmime</artifactId>
            <version>4.3</version>
        </dependency>

2、封裝請求物件

/**
 * Static helper that wraps a single shared Apache HttpClient instance and
 * exposes simple GET / POST calls plus a raw-stream GET.
 *
 * <p>All requests use the same 5 second timeouts. I/O failures are not
 * propagated: they are printed and the caller receives an empty map (or
 * {@code null} for {@link #getAuthCode}).
 *
 * <p>Originally created by rongyaowen on 2018/10/4.
 */
public class Request {
    /** Timeout in milliseconds used for connection lease, connect and socket read. */
    private static final int TIMEOUT_MILLIS = 5000;

    /** Request configuration shared by every request issued through this class. */
    private static final RequestConfig REQUEST_CONFIG = RequestConfig.custom()
            .setConnectionRequestTimeout(TIMEOUT_MILLIS)
            .setConnectTimeout(TIMEOUT_MILLIS)
            .setSocketTimeout(TIMEOUT_MILLIS)
            .build();

    private static final CloseableHttpClient closeableHttpClient = HttpClientBuilder.create().build();

    /**
     * Sends a GET request.
     *
     * @param url          target URL
     * @param headerParams request headers (name to value); may be {@code null} or empty
     * @return a map holding the response body under {@code P.REQUEST.RES_BODY} and the
     *         response headers under {@code P.REQUEST.HEADERS}; empty if the request failed
     */
    public static Map<String, Object> get(String url, Map<String, Object> headerParams) {
        HttpGet httpGet = new HttpGet(url);
        Map<String, Object> logMap = new HashMap<>();
        logMap.put("請求連結", url);
        return response(httpGet, headerParams, logMap);
    }

    /**
     * Sends a POST request.
     *
     * <p>The body is taken from {@code requestParams.get(P.REQUEST.REQUEST_PARAMS)} and the
     * optional content type from {@code requestParams.get(P.REQUEST.CONTENT_TYPE)}. When no
     * non-empty body is supplied the request is sent without an entity.
     *
     * @param url           target URL
     * @param headerParams  request headers (name to value); may be {@code null} or empty
     * @param requestParams request data; may be {@code null} or empty
     * @return a map holding the response body under {@code P.REQUEST.RES_BODY} and the
     *         response headers under {@code P.REQUEST.HEADERS}; empty if the request failed
     */
    public static Map<String, Object> post(String url, Map<String, Object> headerParams, Map<String, Object> requestParams) {
        HttpPost httpPost = new HttpPost(url);
        String body = stringParam(requestParams, P.REQUEST.REQUEST_PARAMS);
        if (!StringUtils.isEmpty(body)) {
            // Encode explicitly as UTF-8: the single-argument StringEntity constructor falls
            // back to ISO-8859-1, which cannot represent non-Latin request data.
            StringEntity entity = new StringEntity(body, java.nio.charset.StandardCharsets.UTF_8);
            String contentType = stringParam(requestParams, P.REQUEST.CONTENT_TYPE);
            if (!StringUtils.isEmpty(contentType)) {
                // e.g. form-encoded data
                entity.setContentType(contentType);
            }
            httpPost.setEntity(entity);
        }
        Map<String, Object> logMap = new HashMap<>();
        logMap.put("請求連結", url);
        logMap.put("請求引數", requestParams);

        return response(httpPost, headerParams, logMap);
    }

    /**
     * Executes the request, logs it and collects the response.
     *
     * @param httpRequestBase request to execute
     * @param headerParams    request headers; may be {@code null} or empty
     * @param logMap          log entries collected so far; extended and written here
     * @return map with response body and headers, or an empty map if an I/O error occurred
     */
    private static Map<String, Object> response(HttpRequestBase httpRequestBase, Map<String, Object> headerParams, Map<String, Object> logMap) {
        Map<String, Object> resMap = new HashMap<>();
        httpRequestBase.setConfig(REQUEST_CONFIG);
        applyHeaders(httpRequestBase, headerParams);

        HttpResponse httpResponse = null;
        try {
            httpResponse = closeableHttpClient.execute(httpRequestBase);
            int statusCode = httpResponse.getStatusLine().getStatusCode();
            logMap.put("請求頭", headerParams);
            logMap.put("狀態碼", statusCode);
            logMap.put("請求方法", httpRequestBase.getMethod());
            LogUtil.debug(LogUtil.mapToStr(logMap));

            // A response may legitimately carry no entity (e.g. 204); EntityUtils.toString
            // rejects null, so fall back to an empty body.
            HttpEntity entity = httpResponse.getEntity();
            String resBody = entity == null ? "" : EntityUtils.toString(entity, "utf-8");
            Header[] headers = httpResponse.getAllHeaders();

            resMap.put(P.REQUEST.RES_BODY, resBody);
            resMap.put(P.REQUEST.HEADERS, headers);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Always hand the connection back to the pool, even if something failed
            // before the body was fully read.
            if (httpResponse != null) {
                EntityUtils.consumeQuietly(httpResponse.getEntity());
            }
        }
        return resMap;
    }

    /**
     * Sends a GET request and returns the raw response stream.
     *
     * <p>The caller owns the returned stream and must close it; the underlying connection
     * stays in use until then.
     *
     * @param url          target URL
     * @param headerParams request headers; may be {@code null} or empty
     * @return the response content stream when the status is 200 and a body is present,
     *         otherwise {@code null}
     */
    public static InputStream getAuthCode(String url, Map<String, Object> headerParams) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(REQUEST_CONFIG);
        applyHeaders(httpGet, headerParams);

        HttpResponse httpResponse = null;
        boolean streamHandedOff = false;
        try {
            httpResponse = closeableHttpClient.execute(httpGet);
            int statusCode = httpResponse.getStatusLine().getStatusCode();
            Map<String, Object> logMap = new HashMap<>();
            logMap.put("請求連結", url);
            logMap.put("請求頭", headerParams);
            logMap.put("請求方法", httpGet.getMethod());
            logMap.put("請求狀態", statusCode);
            LogUtil.debug(LogUtil.mapToStr(logMap));

            HttpEntity entity = httpResponse.getEntity();
            if (statusCode == HttpStatus.SC_OK && entity != null) {
                InputStream content = entity.getContent();
                streamHandedOff = true;
                return content;
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Unless the stream was handed to the caller, discard the body so the
            // pooled connection is released instead of leaking.
            if (!streamHandedOff && httpResponse != null) {
                EntityUtils.consumeQuietly(httpResponse.getEntity());
            }
        }
        return null;
    }

    /** Adds every non-null header from {@code headerParams} to the request. */
    private static void applyHeaders(HttpRequestBase request, Map<String, Object> headerParams) {
        if (headerParams == null || headerParams.isEmpty()) {
            return;
        }
        for (Map.Entry<String, Object> entry : headerParams.entrySet()) {
            Object value = entry.getValue();
            if (value != null) {
                request.addHeader(entry.getKey(), value.toString());
            }
        }
    }

    /** Returns the string form of {@code params.get(key)}, or {@code null} when absent. */
    private static String stringParam(Map<String, Object> params, Object key) {
        if (params == null) {
            return null;
        }
        Object value = params.get(key);
        return value == null ? null : value.toString();
    }
}

3、獲取豆瓣未登入主頁內容

首先在谷歌的開發者工具中,拿到User-Agent的頭資訊(沒有這個資訊,會被伺服器判定為爬蟲)

4、模擬傳送請求獲取主頁內容

 /**
     * First crawler example: requests the Douban home page with a User-Agent
     * header set and prints the returned response body.
     */
    @Test
    public void crawlerClient_01() {
        Map<String, Object> headers = new HashMap<>();
        headers.put(P.REQUEST.USER_AGENT, P.USER_AGENT);

        Map<String, Object> response = Request.get("https://www.douban.com", headers);
        Object body = response.get(P.REQUEST.RES_BODY);
        System.out.println(body);
    }

5、效果展示