1. 程式人生 > >Java 獲得網頁原始碼和模擬瀏覽器請求(個人總結)

Java 獲得網頁原始碼和模擬瀏覽器請求(個人總結)

這裡總結一下我所知道的幾種用 Java 獲取網頁原始碼的方式。

1:GetSourceCode.java

package kalision;


import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;


/**
 * Downloads a web page over HTTP and prints its source to standard output,
 * one line at a time.
 */
public class GetSourceCode {

  /**
   * Fetches the page at the hard-coded URL and echoes every line of the
   * response body to {@code System.out}.
   *
   * @param args ignored
   * @throws IOException if the connection cannot be opened or the response
   *     cannot be read
   */
  public static void main(String[] args) throws IOException {
    URL myurl = new URL("http://www.baidu.com"); // page whose source is fetched

    HttpURLConnection huc = (HttpURLConnection) myurl.openConnection();
    try {
      // Decode explicitly instead of relying on the platform default charset,
      // so the output is the same on every machine. A page served in another
      // encoding (e.g. GBK) needs that charset name here instead.
      // try-with-resources closes the reader and the underlying response
      // stream even when reading fails part-way through.
      try (BufferedReader in =
          new BufferedReader(new InputStreamReader(huc.getInputStream(), "UTF-8"))) {
        String line;
        while ((line = in.readLine()) != null) {
          System.out.println(line);
        }
      }
    } finally {
      // Release the connection whether or not the read succeeded.
      huc.disconnect();
    }
  }
}

或者

2.test1.java

package kalision;


import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.URL;
import java.net.URLConnection;

/**
 * Fetches a page over HTTP, saves a copy of its non-empty lines to
 * {@code ../index.html} and prints the collected text to standard output.
 */
public class test1 {

	/**
	 * Downloads the hard-coded URL, pipes it through {@link #pipe} and prints
	 * the result. Any failure is reported via its stack trace.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		try {
			URL url = new URL("http://train.qunar.com/stationToStation.htm?fromStation=%E6%B5%8E%E5%8D%97&toStation=%E7%83%9F%E5%8F%B0&date=2012-01-08");

			// url.openStream() opens its own connection; pipe() closes the
			// stream it is handed, so nothing else needs cleaning up here.
			String content = pipe(url.openStream(), "utf-8");

			System.out.println(content);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Reads {@code in} line by line, skipping empty lines. Every kept line is
	 * appended (without a line separator) to the returned string and also
	 * written, one per line, to {@code ../index.html} encoded as utf-8.
	 * The input stream is always closed before this method returns.
	 *
	 * @param in      stream to read; closed by this method
	 * @param charset name of the charset used to decode {@code in};
	 *                {@code null} or empty means {@code "utf-8"}
	 * @return the concatenation of all non-empty lines read from {@code in}
	 * @throws IOException if the charset is unsupported, the output file
	 *                     cannot be opened, or reading fails
	 */
	static String pipe(InputStream in, String charset) throws IOException {
		if (charset == null || "".equals(charset)) {
			charset = "utf-8";
		}
		StringBuilder s = new StringBuilder();
		// Declaring the raw stream as the first resource guarantees it is closed
		// even if constructing the reader or opening the output file throws.
		try (InputStream source = in;
				BufferedReader bReader = new BufferedReader(new InputStreamReader(source, charset));
				PrintWriter pw = new PrintWriter(
						new OutputStreamWriter(new FileOutputStream("../index.html"), "utf-8"))) {
			String rLine;
			while ((rLine = bReader.readLine()) != null) {
				if (rLine.length() > 0) {
					s.append(rLine);
					pw.println(rLine);
				}
			}
			// Closing the writer flushes it; no per-line flush is required.
		}
		return s.toString();
	}
}
注意:

如果把得到的原始碼儲存成檔案後開啟時出現亂碼,那是編碼問題;可以嘗試把原始檔頭部宣告的編碼改為 GBK 等。


以上兩種方式都可以得到頁面的原始碼。

對於帶有請求引數的頁面,例如 test1 類中的 url(一個以 get 方式提交、帶有引數的請求連結),返回的原始碼可能沒有我們想要的資料。

據個人瞭解,這種頁面的資料大多放在另一個頁面,返回的原始碼再透過 js 動態地從那個頁面取得資料並載入到本頁面中。

可以用 firebug 等工具抓到那個頁面,再分析、解析出所需的動態資料。

當然,個人感覺這種做法並不推薦,也不容易實現。

對於上面講到的 get 方式提交,可以直接在 url 後面新增引數。下面則是以 post 方式提交資料併發出請求的例子:

1.Test.java

import java.util.Properties;


/**
 * Small driver that issues POST requests through the {@code HtmlPost} helper
 * (defined elsewhere in the project) and prints the responses to standard error.
 */
public class Test {

	/**
	 * Posts the raw body {@code "XML"} to a fixed URL via
	 * {@code HtmlPost.requestPost} and prints the response decoded as utf-8.
	 *
	 * @throws Exception if the request or the decoding fails
	 */
	public static void testRequestPostStringByteArray() throws Exception {
		Properties requestProperties = new Properties();

		// Browser-like request header.
		// NOTE(review): these properties are built but never handed to HtmlPost,
		// so the User-Agent below is not actually sent — confirm whether HtmlPost
		// offers an overload that accepts request properties.
		requestProperties.setProperty(
				"User-Agent",
				"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; TencentTraveler ; .NET CLR 1.1.4322)");

		// Encode the body explicitly so it does not depend on the platform
		// default charset; the response below is decoded with the same charset.
		byte[] b = HtmlPost.requestPost("http://train.qunar.com/stationToStation.htm?fromStation=%E5%8C%97%E4%BA%AC&toStation=%E4%B8%8A%E6%B5%B7&date=2012-01-01",
				"XML".getBytes("utf-8"));
		System.err.println(new String(b, "utf-8"));
	}

	/**
	 * Posts a fixed set of form fields to the 12306 ticket-query page via
	 * {@code HtmlPost.requestPostForm} and prints the response decoded as utf-8.
	 *
	 * @throws Exception if the request or the decoding fails
	 */
	public static void testRequestPostForm() throws Exception {
		Properties formProperties = new Properties();

		formProperties.setProperty("ictN", "5924");
		formProperties.setProperty("fdl", "");
		formProperties.setProperty("lx", "00");
		formProperties.setProperty("nyear3", "2011");
		formProperties.setProperty("nyear3_new_value", "true");
		formProperties.setProperty("nmonth3", "12");
		formProperties.setProperty("nmonth3_new_value", "true");
		formProperties.setProperty("nday3", "27");
		formProperties.setProperty("nday3_new_value", "false");
		formProperties.setProperty("startStation_ticketLeft", "6d4e53e80482a0b7");
		formProperties.setProperty("startStation_ticketLeft_new_value", "true");
		formProperties.setProperty("arriveStation_ticketLeft", "53174e1300e781a2");
		formProperties.setProperty("arriveStation_ticketLeft_new_value", "true");
		formProperties.setProperty("trainCode", "");
		formProperties.setProperty("trainCode_new_value", "true");
		formProperties.setProperty("rFlag", "1");
		formProperties.setProperty("name_ckball", "value_ckball");
		formProperties.setProperty("tFlagDC", "DC");
		formProperties.setProperty("tFlagZ", "Z");
		formProperties.setProperty("tFlagT", "T");
		formProperties.setProperty("tFlagK", "K");
		formProperties.setProperty("tFlagPK", "PK");
		formProperties.setProperty("tFlagPKE", "PKE");
		formProperties.setProperty("tFlagLK", "LK");
		// NOTE(review): presumably a captcha value captured from a live session;
		// a hard-coded one is unlikely to stay valid — confirm.
		formProperties.setProperty("randCode", "BYHJ");

		byte[] b = HtmlPost.requestPostForm(
				"http://dynamic.12306.cn/TrainQuery/iframeLeftTicketByStation.jsp",
				formProperties);

		System.err.println(new String(b, "utf-8"));
	}

	/**
	 * Runs the form-post example; failures are reported via their stack trace.
	 * Call {@link #testRequestPostStringByteArray()} instead to try the
	 * raw-body variant.
	 *
	 * @param args ignored
	 */
	public static void main(String args[]) {
		try {
			testRequestPostForm();
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

}