1. 程式人生 > >使用HTTPURLConnection模擬登陸,爬取網頁內容

使用 HttpURLConnection 模擬登入,爬取網頁內容

如果你需要爬取某些網頁的內容,但這些網站需要先登入,就得由程式額外完成登入的步驟,才能爬取我們需要的網頁內容。任何登入頁面本質上都是向伺服器傳送請求;只要我們能夠模擬出這個請求,登入自然不在話下。通過 Fiddler 抓取瀏覽器登入時送出的請求資訊(URL、表單引數、標頭),就能很輕鬆地模擬出向伺服器傳送的請求。下面我們使用 HttpURLConnection 進行模擬登入,並爬取我們需要的網頁內容。


import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map.Entry;
import java.util.zip.GZIPInputStream;
import java.util.zip.InflaterInputStream;

/**
 * Demonstrates a form login over {@link HttpURLConnection}: POSTs the
 * credentials, then prints the response body followed by every response header.
 *
 * <p>The login URL and the credentials are placeholders ({@code ***}) and must be
 * replaced with the values captured from a real browser session (e.g. with Fiddler).
 */
public class INotesPost {

	/** Charset used for the form body, and for the response when the server declares none. */
	private static final Charset DEFAULT_CHARSET = StandardCharsets.UTF_8;

	/**
	 * Sends the login request and dumps the response to standard output.
	 *
	 * @param args unused
	 * @throws Exception if the URL is malformed or any network I/O fails
	 */
	public static void main(String[] args) throws Exception {
		String surl = "***?login";
		URL url = new URL(surl);
		HttpURLConnection connection = (HttpURLConnection) url.openConnection();

		try {
			connection.setDoOutput(true);
			connection.setDoInput(true);
			connection.setRequestMethod("POST");
			connection.setUseCaches(false);
			connection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
			connection.setRequestProperty("User-Agent","Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.1.4322; .NET4.0C; .NET4.0E)");
			connection.setRequestProperty("Accept-Language","zh-CN");
			// Advertising compression means the reply may arrive compressed;
			// readBody() undoes it based on the Content-Encoding response header.
			connection.setRequestProperty("Accept-Encoding","gzip, deflate");

			// The form field names and values (username, password) can be captured with Fiddler.
			try (OutputStreamWriter out = new OutputStreamWriter(
					connection.getOutputStream(), DEFAULT_CHARSET)) {
				out.write("username=***&password=***");
			}

			// getInputStream() completes the exchange; no explicit connect() is needed
			// because getOutputStream() has already opened the connection.
			String body;
			try (InputStream in = connection.getInputStream()) {
				body = readBody(in, connection.getContentEncoding(),
						charsetOf(connection.getContentType()));
			}

			System.out.println(body);
			for (Entry<String, List<String>> header : connection.getHeaderFields().entrySet()) {
				System.out.println(header.getKey() + " " + header.getValue());
			}
		} finally {
			connection.disconnect();
		}
	}

	/**
	 * Reads a response body to the end, undoing gzip/deflate compression first.
	 *
	 * <p>Lines are concatenated without their terminators. The stream is closed
	 * once the body has been read. Package-private so it can be exercised without
	 * a network connection.
	 *
	 * @param raw             the response stream as delivered by the connection
	 * @param contentEncoding value of the {@code Content-Encoding} response header, may be {@code null}
	 * @param charset         charset used to decode the (decompressed) bytes
	 * @return the decoded body with line terminators removed
	 * @throws IOException if reading or decompressing fails
	 */
	static String readBody(InputStream raw, String contentEncoding, Charset charset) throws IOException {
		InputStream decoded = raw;
		if (contentEncoding != null) {
			String coding = contentEncoding.trim();
			if ("gzip".equalsIgnoreCase(coding) || "x-gzip".equalsIgnoreCase(coding)) {
				decoded = new GZIPInputStream(raw);
			} else if ("deflate".equalsIgnoreCase(coding)) {
				// HTTP "deflate" is the zlib-wrapped format, which InflaterInputStream expects.
				decoded = new InflaterInputStream(raw);
			}
		}

		StringBuilder text = new StringBuilder();
		try (BufferedReader reader = new BufferedReader(new InputStreamReader(decoded, charset))) {
			for (String line = reader.readLine(); line != null; line = reader.readLine()) {
				text.append(line);
			}
		}
		return text.toString();
	}

	/**
	 * Extracts the {@code charset} parameter from a {@code Content-Type} header value.
	 *
	 * @param contentType header value such as {@code text/html; charset=utf-8}, may be {@code null}
	 * @return the declared charset, or UTF-8 when it is absent, malformed or unsupported
	 */
	static Charset charsetOf(String contentType) {
		if (contentType == null) {
			return DEFAULT_CHARSET;
		}
		final String key = "charset=";
		for (String parameter : contentType.split(";")) {
			String trimmed = parameter.trim();
			if (!trimmed.regionMatches(true, 0, key, 0, key.length())) {
				continue;
			}
			String name = trimmed.substring(key.length()).trim();
			if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
				name = name.substring(1, name.length() - 1);
			}
			try {
				return Charset.forName(name);
			} catch (IllegalArgumentException unknownOrMalformed) {
				// Covers both illegal and unsupported charset names: fall back rather than fail.
				return DEFAULT_CHARSET;
			}
		}
		return DEFAULT_CHARSET;
	}
}

在模擬登入的時候,我們其實可以通過 Fiddler 來抓取網頁提交的引數,並直接將其中的 Cookie 寫到 Connection 的 RequestProperty 中。

Fiddler抓取登入引數

將抓取到的引數直接填入 Connection 的 RequestProperty,就能輕鬆抓取網頁內容。注意 setRequestProperty 對同一個鍵重複呼叫時會覆蓋先前的值,所以多個 Cookie 必須以「; 」串接後放進同一個 Cookie 標頭。如果我們抓取的頁面內容是中文的,還要注意頁面的 charset,並在讀取頁面返回的串流時指定對應的編碼來解碼:

// Decode the response bytes as UTF-8, then buffer the decoded characters.
InputStreamReader utf8Reader = new InputStreamReader(urlStream, "utf-8");
BufferedReader bufferedReader = new BufferedReader(utf8Reader);

下面是一段相對完整的程式碼

		// Fetch a page by replaying cookies captured from a browser session and
		// collect the response body (line terminators dropped) into `total`.
		String s = "****";

		url = new URL(s);
		HttpURLConnection resumeConnection = (HttpURLConnection) url.openConnection();

		resumeConnection.setRequestProperty("Accept-Charset","utf-8");
		resumeConnection.setRequestProperty("Content-Type","text/html;utf-8");

		// setRequestProperty() REPLACES any earlier value for the same key, so calling
		// it once per cookie would send only the last one. Both cookies therefore go
		// into a single Cookie header, separated by "; ".
		// NOTE(review): these look like captured session tokens - confirm they are
		// expired or replace them with placeholders before publishing.
		String attachmentAuthCookie = "AttachmentAuth=77u/PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0idXRmLTgiPz48U1A+MCMuZnxpaWNwfDAwMDgyMzkwNSwwIy5mfGlpY3B8MDAwODIzOTA1LDEzMDc2NjgxMDQzODc3NDA0OCxUcnVlLEV0eHBYWVlYVHNYQ0hYR3hjRmZjdWowOXV6ekRXc01Hd0FLUzVkaFNmcEErcWo4S3pGTUYvYVRYZFJnWitSRW1pVmR4N0xKVzdoOUhzMitUamY5Z0E2VHY4a2hxeHNTQXlVRmhmQ1pwelBUOFBWQmc0NXI2cHo4eGZxZkEyNzAyOUo0eFBrcU9MM0dWNm1IVGdVNEZFT3E1OVIzSHA3dmZrS0tHR1YxNVJpTllKcXF1dUVCMmhlU1lGT0VLUjlBMitEQ00rMVlwdXBVTEJ0UGdWYk5lODBobEtydUttc1MyWWkrSmpXMFozTVVyRHJzN1VkU1VxNmdrYmo0dTB4OWNrTXRFZXJ1cUlZbDROb3N2UWhpSmNRTlVGcm9kNkVXaWhBL0tjUVpaZlY1UFJBREtjalZIYmx3dnRXMkIwZ1VPMVM3REJFa0VzOS9GQUViVzM2bnhJQT09LGh0dHA6Ly9vYS5zZGMuaWNiYzo4Mi9zdG9yYWdlL2F0dGFjaG1lbnQyLzIwMTUtMDUvZTY1Yjc3ZjUtNGZkMC00NDI2LWE1OWYtMjQxNTAxYWE0MjI1L+mZhOS7tjIu6L2v5Lu25byA5Y+R5Lit5b+D6K665paH5L2T5L6L6KaB5rGCLmRvYzwvU1A+";
		String portalAuthCookie = "PortalAuth=77u/PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0idXRmLTgiPz48U1A+MCMuZnxpaWNwfDAwMDgyMzkwNSwwIy5mfGlpY3B8MDAwODIzOTA1LDEzMDc2NzA1NzM3NTI3MDY4NCxUcnVlLFFldU1Fa2xDelI0bEZaTTJkbVVtZGxPVmhsUVdwQWMzQlk2TCtWdlVOb1ZsRjVHZ1BMRVhMTTAwcHBKWW5WTGZLYzFPTTh2aGRydmRIVWVLR3JOb255dWpTS2lMeEhyQUlBbmtYZTVBTWlFVGpFMlF4bzRjWVRKeEhjNU5ScEhMSWJOWHdWckFTWHhuNUd5bURST0xTK2d3cUFWbThFUllPM3J1enR4aGgwT1VrTDJGMGkrUDdWcHViRm84blFrTXp4MFNyMXdtQzE3UEJkcGpGVU1nOW8xRkJoeHhzWElDdHhLVEpVSHRGMmpDNmNKS285bGJtTXZJZnlwR0k1VGpLd29TTUpaenhyb1BkQ3VOVW13Wk01T0ZEUExSK1lqajVCRitJSFc1enV0UlpXM08wWHhNaldIWk1nWHhncjF0dUc1b3E3RlRwOGhCMFVCWjAydDlGQT09LGh0dHA6Ly9vYS5zZGMuaWNiYy88L1NQPg==";
		resumeConnection.setRequestProperty("Cookie", attachmentAuthCookie + "; " + portalAuthCookie);
		resumeConnection.connect();

		InputStream urlStream = resumeConnection.getInputStream();

		// Decode the body as UTF-8 so non-ASCII page content is read correctly.
		BufferedReader bufferedReader = new BufferedReader(

		new InputStreamReader(urlStream,"utf-8"));

		String ss = null;
		StringBuilder total = new StringBuilder();
		try {
			while ((ss = bufferedReader.readLine()) != null) {
				total.append(ss);
			}
		} finally {
			// Release the stream and the connection even when reading fails.
			bufferedReader.close();
			resumeConnection.disconnect();
		}
		
//		System.out.println(total.toString());