1. 程式人生 > >Java之請求傳送工具類(HttpClientUtils,爬蟲)-yellowcong

Java之請求傳送工具類(HttpClientUtils,爬蟲)-yellowcong

以前做過一段時間的爬蟲,所以寫了這個用 Java 傳送請求的工具類。這個工具偽裝成百度去爬取推酷的資料:當時推酷有 IP 訪問限制,一旦被識別為爬蟲就不讓訪問了,所以我偽裝成了百度,之後就可以隨意爬取推酷的資料了。當時爬了 1GB 多的文字資料,圖片資料大概有 15GB 左右;然而我根本沒有用到這些資料,只是爬下來了而已……

pom.xml依賴

<dependency>
    <groupId>commons-httpclient</groupId>
    <artifactId>commons-httpclient</artifactId>
    <version>3.1</version>
</dependency>

請求傳送工具

package com.yellowcong.utils;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Map;

import org.apache.commons.httpclient.Credentials;
import
org.apache.commons.httpclient.DefaultHttpMethodRetryHandler; import org.apache.commons.httpclient.Header; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.HttpException; import org.apache.commons.httpclient.HttpStatus; import org.apache.commons.httpclient.UsernamePasswordCredentials; import
org.apache.commons.httpclient.auth.AuthScope; import org.apache.commons.httpclient.cookie.CookiePolicy; import org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.httpclient.methods.PostMethod; import org.apache.commons.httpclient.params.HttpMethodParams; import org.apache.http.HttpResponse; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpOptions; import org.apache.http.client.methods.HttpPost; import org.apache.http.entity.ContentType; import org.apache.http.entity.StringEntity; import org.apache.http.entity.mime.MultipartEntity; import org.apache.http.entity.mime.content.FileBody; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.lucene.analysis.ReusableAnalyzerBase; /** * 這個工具包是用來 做代理服務 爬去資料的,結果好多代理資料沒有 * 2015-10 爬取 推酷資料 來做服務 * 通過這個工具來來新增代理,來處理資料 * @author yellowcong * * * --------------------------------------------------- * 2016-8-9 更新,添加了setGet()中添加了,設定編碼,解決獲取的網頁亂碼問題 * */ public class HttpClientUtils { private static int timeout = 50000; /** * 通過url來獲取我們的GetMethod * @param url * @return */ public static GetMethod setGetMethod(String url) { // TODO Auto-generated method stub /* 2.生成 GetMethod 物件並設定引數 */ GetMethod getMethod = null; try{ //可能會在查詢的時候出現異常,我們簡單的丟去 getMethod = new GetMethod(url); // 設定 get 請求超時 5s getMethod.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, timeout); // 設定請求重試處理 getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler()); //Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1 //Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3 //Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12 //Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; WOW64; Trident/4.0; SLCC1) //Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727) //Mozilla/5.0 (Windows; U; 
Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13 //設定USER_AGENT getMethod.getParams().setParameter(HttpMethodParams.USER_AGENT,"Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12"); }catch (Exception e){ throw new RuntimeException("-------------------------請求協議存在問題-----------------------"); } return getMethod; } /** * * @param host 要訪問的主機 * @param proxyIP 代理ip * @param proxyPort 代理埠 * @return * @throws Exception */ public static int testProxy(String host,String proxyIP,int proxyPort){ int code = 0; try { //獲取到HttpClient HttpClient httpClient = new HttpClient(); httpClient.getHostConfiguration().setHost(host); //設定超時 5000 毫秒的時間 httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(timeout); //設定代理 httpClient.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY); //設定代理 ip 和埠 httpClient.getHostConfiguration().setProxy(proxyIP, proxyPort); //設定代理的使用者和密碼 Credentials defaultcreds = new UsernamePasswordCredentials("", ""); httpClient.getState().setProxyCredentials(new AuthScope(proxyIP, proxyPort, null), defaultcreds); //獲取GetMethod GetMethod method = setGetMethod(host); if(method != null){ code = httpClient.executeMethod(method); //獲取請求的資料 } } catch (Exception e) { // TODO Auto-generated catch block //throw new RuntimeException("-------------"+proxyIP+":"+proxyPort+"\t 無效----------"); } return code; } /** * 獲取到我們的HttpClient * @param url * @return */ private static HttpClient getHttpClient(String url){ HttpClient httpClient = new HttpClient(); httpClient.getHostConfiguration().setHost(url); //設定超時 5000 毫秒的時間 httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(50000); return httpClient; } /** * 設定我們帶有代理的HttpClieantProxy * @param url * @param proxyIP * @param proxyPort * @return */ private static HttpClient setHttpClientProxy(String url,String proxyIP,int proxyPort){ HttpClient httpClient = getHttpClient(url); //設定代理 
httpClient.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY); //設定代理 ip 和埠 httpClient.getHostConfiguration().setProxy(proxyIP, proxyPort); //設定代理的使用者和密碼 Credentials defaultcreds = new UsernamePasswordCredentials("", ""); httpClient.getState().setProxyCredentials(new AuthScope(proxyIP, proxyPort, null), defaultcreds); //反悔 return httpClient; } /** * 傳送Get請求 * @param url * @return */ public static String sendGet(String url){ return sendGet(url,false); } /** * * @param url * @param isProxy * @param encoding * @return */ public static String sendGet(String url,boolean isProxy,String encoding){ String content = null; HttpClient client = null; try { if(isProxy){ //當是代理的時候,獲取資料 //ProxyHttps porxy = ProxyUtils.getRandomPropertisProxy(); //106.38.194.199:80 // 好用 //client = setHttpClientProxy(url, porxy.getIp(),Integer.parseInt(porxy.getPort())); //System.out.println("-------------------使用代理"+porxy.getIp()+":"+porxy.getPort()); //209.66.193.186 ,s client = setHttpClientProxy(url,"121.14.138.56",81); }else{ client = getHttpClient(url); } GetMethod method = HttpClientUtils.setGetMethod(url); if(method != null){ content = dealHtml(client, method,encoding); } } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } return content; } /** * 傳送get請求 待遇proxy * @param url * @param isProxy * @return */ public static String sendGet(String url,boolean isProxy){ return HttpClientUtils.sendGet(url, isProxy, "UTF-8"); } /** * 獲取網頁的資料 * @param url 網頁的地址 * @param encoding 網頁資料的編碼方式 * @return */ public static String sendGet(String url,String encoding){ return HttpClientUtils.sendGet(url, false,encoding); } /** * 處理網頁 * @param client * @param method * @param encoding * @return */ public static String dealHtml(HttpClient client,GetMethod method,String encoding){ String content = null; try { //執行資料 int code = client.executeMethod(method); if(code == 200){ //當數請求成功 Header header = method.getResponseHeader("Content-Type"); if(header != null){ String 
applicationType = header.getValue(); if(applicationType != null){ //當是網頁的情況 if(applicationType.indexOf("html") != -1 || applicationType.indexOf("json") != -1){ content = FileUtils.copyInput2String(method.getResponseBodyAsStream(),encoding); } } } }else if ((code == HttpStatus.SC_MOVED_TEMPORARILY) || (code == HttpStatus.SC_MOVED_PERMANENTLY) || (code == HttpStatus.SC_SEE_OTHER) || (code == HttpStatus.SC_TEMPORARY_REDIRECT)) { //System.err.println("------------------------請求失敗: " + method.getStatusLine()); return null; //當我們的ip被限制的情況 }else if(code == HttpStatus.SC_FORBIDDEN){ } } catch (HttpException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return content; } /** * 傳送json資料到伺服器 * @param url * @param json * @return */ public static String postJson(String url,String json){ //使用DefaultHttpClient 這個物件才可以獲取到Json String str = null; try { DefaultHttpClient client = new DefaultHttpClient(); //新增json HttpPost post =new HttpPost(url); StringEntity entity = new StringEntity(json,ContentType.create("application/json", "utf-8")); post.setEntity(entity); //返回的資料 HttpResponse response = client.execute(post); int code = response.getStatusLine().getStatusCode(); if(code >=200 && code <300){ InputStream in = response.getEntity().getContent(); str = FileUtils.copyInput2String(in); } } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } return str; } /** * 下載檔案 * @param url 下載的路徑 * @return */ public static InputStream downLoad(String url){ InputStream in = null; try { DefaultHttpClient client = new DefaultHttpClient(); HttpGet get = new HttpGet(url); HttpResponse response = client.execute(get); int code = response.getStatusLine().getStatusCode(); if(code >=200 && code<300){ in = response.getEntity().getContent(); } } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } return in; } /** * 下載檔案 * @param url 下載的路徑 * @return */ public 
static InputStream downLoadByPost(String url){ InputStream in = null; try { DefaultHttpClient client = new DefaultHttpClient(); HttpPost post = new HttpPost(url); HttpResponse response = client.execute(post); int code = response.getStatusLine().getStatusCode(); if(code >=200 && code<300){ in = response.getEntity().getContent(); } } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } return in; } /** * 傳送檔案到摸個地方 * @param url 路徑 * @param field 檔案的欄位 * @param file 檔案物件 * @return */ public static String upload(String url,String field,File file){ String result = null; try { DefaultHttpClient client = new DefaultHttpClient(); HttpPost post = new HttpPost(url); //這個上傳的MultipartEntity 是httpmime中的 MultipartEntity entity = new MultipartEntity(); FileBody fileBody = new FileBody(file); entity.addPart(field, fileBody); post.setEntity(entity); HttpResponse response = client.execute(post); int code = response.getStatusLine().getStatusCode(); if(code >=200 && code<300){ InputStream in = response.getEntity().getContent(); //將InputStream 資料轉化為String result =FileUtils.copyInput2String(in); } } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } return result; } /** * 通過post來提交資料,沒有帶引數 * @param url 請求的地址 * @return */ public static String post(String url){ return HttpClientUtils.post(url,null); } /** * 通過post來提交資料,帶引數的方法 * @param url 請求地址 * @param params 引數 * @return */ public static String post(String url,Map<String,String> params){ String str = null; try { HttpClient client = new HttpClient(); PostMethod method = new PostMethod(url); //設定請求頭的樣式 method.setRequestHeader("Content-Type","application/x-www-form-urlencoded;charset=utf-8"); if(params != null && params.size() >0){ for(Map.Entry<String,String> entry:params.entrySet()){ method.setParameter(entry.getKey(),entry.getValue()); } } int code = client.executeMethod(method); if(code >=200 && code <300){ InputStream in = method.getResponseBodyAsStream(); str = 
FileUtils.copyInput2String(in); } } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } return str; } }