程式人生 > 利用java-maven程式爬取西刺網頁的ip代理

利用java-maven程式爬取西刺網頁的ip代理

主要程式碼:
package com.itquwei.spider;

import java.io.IOException;
import java.nio.charset.Charset;

import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.itquwei.spider.dao.IpInfoDao;
import com.itquwei.spider.pojo.IpInfo;

/**
 * Crawler that walks the paginated proxy listing at
 * {@code http://www.xicidaili.com/nt/<page>}, parses every table row into an
 * {@link IpInfo} and stores it through {@link IpInfoDao}.
 */
public class XCSpider {

	/** CSS selector for the rows of the {@code #ip_list} table that carry a class attribute. */
	private static final String ROW_SELECTOR = "#ip_list  tr[class]";

	/** Highest cell index read from a row; rows with fewer cells are skipped. */
	private static final int LAST_CELL_INDEX = 9;

	private static IpInfoDao dao = new IpInfoDao();

	/**
	 * Crawls pages 1 through 663 in order, printing the page number after each
	 * page has been processed.
	 *
	 * @param args unused
	 * @throws Exception if fetching or storing a page fails, or the sleep is interrupted
	 */
	public static void main(String[] args) throws Exception {

		for (int page = 1; page < 664; page++) {
			paging(page);
			// Fixed progress message: the original printed "第一<n>頁".
			System.out.println("第" + page + "頁");
			// Pause 5 seconds between pages (presumably to throttle requests to the site).
			Thread.sleep(5000);
		}

	}

	/**
	 * Downloads one listing page and saves every proxy row found on it.
	 *
	 * <p>Previously only the first row of each page was stored, because the
	 * parser returned from inside its row loop.
	 *
	 * @param page 1-based page number appended to the listing URL
	 * @throws IOException if the HTTP request fails
	 * @throws ClientProtocolException on an HTTP protocol error
	 */
	public static void paging(int page) throws IOException,
			ClientProtocolException {
		String url = "http://www.xicidaili.com/nt/" + page;
		String html = getIndex(url);

		Document doc = Jsoup.parse(html);
		for (Element row : doc.select(ROW_SELECTOR)) {
			IpInfo ipInfo = parseRow(row);
			if (ipInfo != null) {
				dao.saveIpInfo(ipInfo);
			}
		}

	}

	/**
	 * Extracts the first usable proxy entry from a listing page.
	 *
	 * @param html HTML of a listing page
	 * @return the first row that could be parsed, or {@code null} when the page
	 *         contains no such row
	 */
	public static IpInfo getIpInfo(String html) {
		Document doc = Jsoup.parse(html);

		for (Element row : doc.select(ROW_SELECTOR)) {
			IpInfo info = parseRow(row);
			if (info != null) {
				return info;
			}
		}

		return null;
	}

	/**
	 * Converts one table row into an {@link IpInfo}.
	 *
	 * @param row a {@code tr} element of the proxy table
	 * @return the parsed entry, or {@code null} if the row has too few cells
	 */
	private static IpInfo parseRow(Element row) {
		Elements tds = row.select("tr td");
		// Guard against short/malformed rows instead of failing with
		// IndexOutOfBoundsException and aborting the whole crawl.
		if (tds.size() <= LAST_CELL_INDEX) {
			return null;
		}

		IpInfo info = new IpInfo();
		// IP address
		info.setIp(tds.get(1).text());
		// Port
		info.setPort(tds.get(2).text());
		// Server location (text of the link inside the cell)
		info.setAddress(tds.get(3).select("a").text());
		// Status
		info.setStatus(tds.get(4).text());
		// Type
		info.setType(tds.get(5).text());
		// Time alive
		info.setLiveTime(tds.get(8).text());
		// Time of last verification
		info.setTestTime(tds.get(9).text());
		return info;
	}

	/**
	 * Fetches a page with an HTTP GET and returns its body decoded as UTF-8.
	 *
	 * <p>The client and the response are closed before returning, so no
	 * connections are leaked across the many pages fetched by {@link #main}.
	 *
	 * @param url address to fetch
	 * @return the response body, or an empty string when the response has no entity
	 * @throws IOException if the request or reading the body fails
	 * @throws ClientProtocolException on an HTTP protocol error
	 */
	public static String getIndex(String url) throws IOException,
			ClientProtocolException {
		HttpGet httpGet = new HttpGet(url);
		// Connect timeout 20000 ms, socket (read) timeout 30000 ms.
		RequestConfig config = RequestConfig.custom().setConnectTimeout(20000)
				.setSocketTimeout(30000).build();
		httpGet.setConfig(config);
		// Send a browser User-Agent string with the request.
		httpGet.setHeader(
				"User-Agent",
				"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36");

		// try-with-resources closes the response first, then the client.
		try (CloseableHttpClient httpClient = HttpClients.createDefault();
				CloseableHttpResponse res = httpClient.execute(httpGet)) {
			HttpEntity entity = res.getEntity();
			if (entity == null) {
				return "";
			}
			return EntityUtils.toString(entity, Charset.forName("utf-8"));
		}
	}

}

pojo程式碼:

package com.itquwei.spider.pojo;

/**
 * Plain data holder for one proxy entry. All values are kept as the raw
 * strings they were set with; no validation or conversion is performed.
 */
public class IpInfo {

	private String ip;
	private String port;
	private String address;
	private String status;
	private String type;
	private String liveTime;
	private String testTime;

	/** @return the IP address */
	public String getIp() {
		return this.ip;
	}

	/** @param ip the IP address */
	public void setIp(String ip) {
		this.ip = ip;
	}

	/** @return the port */
	public String getPort() {
		return this.port;
	}

	/** @param port the port */
	public void setPort(String port) {
		this.port = port;
	}

	/** @return the address */
	public String getAddress() {
		return this.address;
	}

	/** @param address the address */
	public void setAddress(String address) {
		this.address = address;
	}

	/** @return the status */
	public String getStatus() {
		return this.status;
	}

	/** @param status the status */
	public void setStatus(String status) {
		this.status = status;
	}

	/** @return the type */
	public String getType() {
		return this.type;
	}

	/** @param type the type */
	public void setType(String type) {
		this.type = type;
	}

	/** @return the live time */
	public String getLiveTime() {
		return this.liveTime;
	}

	/** @param liveTime the live time */
	public void setLiveTime(String liveTime) {
		this.liveTime = liveTime;
	}

	/** @return the test time */
	public String getTestTime() {
		return this.testTime;
	}

	/** @param testTime the test time */
	public void setTestTime(String testTime) {
		this.testTime = testTime;
	}

	/**
	 * Renders all fields as {@code IpInfo [ip=..., port=..., ...]}; unset
	 * fields appear as {@code null}.
	 */
	@Override
	public String toString() {
		StringBuilder text = new StringBuilder("IpInfo [");
		text.append("ip=").append(this.ip);
		text.append(", port=").append(this.port);
		text.append(", address=").append(this.address);
		text.append(", status=").append(this.status);
		text.append(", type=").append(this.type);
		text.append(", liveTime=").append(this.liveTime);
		text.append(", testTime=").append(this.testTime);
		text.append("]");
		return text.toString();
	}

}

dao程式碼:連線資料庫用的

package com.itquwei.spider.dao;

import org.springframework.jdbc.core.JdbcTemplate;

import com.itquwei.spider.pojo.IpInfo;
import com.mchange.v2.c3p0.ComboPooledDataSource;

/**
 * DAO that writes {@link IpInfo} entries into the {@code xc_ipInfo} table.
 * It is itself a {@link JdbcTemplate} backed by a c3p0 pooled data source
 * configured in the constructor.
 */
public class IpInfoDao extends JdbcTemplate {

	/**
	 * Creates the pooled data source for the local {@code spider} MySQL
	 * database and installs it on this template.
	 */
	public IpInfoDao() {
		ComboPooledDataSource dataSource = new ComboPooledDataSource();
		// The URL property must be spelled "characterEncoding"; the original
		// "characterEnconding" is not a known property name, so the UTF-8
		// setting was never applied.
		dataSource
				.setJdbcUrl("jdbc:mysql://localhost:3306/spider?characterEncoding=utf-8");
		// NOTE(review): credentials are hard-coded; consider external configuration.
		dataSource.setUser("root");
		dataSource.setPassword("root");
		setDataSource(dataSource);
	}

	/**
	 * Inserts one proxy entry as a new row.
	 *
	 * @param info entry whose seven fields are bound, in column order, to the
	 *             parameterized insert statement
	 */
	public void saveIpInfo(IpInfo info) {
		String sql = "insert into xc_ipInfo (ip,port,address,status,type,liveTime,testTime) values(?,?,?,?,?,?,?);";
		update(sql, info.getIp(), info.getPort(), info.getAddress(),
				info.getStatus(), info.getType(), info.getLiveTime(),
				info.getTestTime());
	}

}


結果: