1. 程式人生 > java爬蟲 京東商品頁 簡單案例

java爬蟲 京東商品頁 簡單案例

要爬的資料



資料庫表結構

資料庫建表語句
-- Disable foreign-key checks while the table is dropped and recreated.
SET FOREIGN_KEY_CHECKS=0;

-- ----------------------------
-- Table structure for `spider`
-- Holds the data extracted from crawled product pages.
-- ----------------------------
DROP TABLE IF EXISTS `spider`;
CREATE TABLE `spider` (
  -- Surrogate primary key, generated by AUTO_INCREMENT.
  `id` int(10) NOT NULL AUTO_INCREMENT,
  -- Product ID (presumably the numeric ID taken from the item URL).
  `goods_id` varchar(20) DEFAULT NULL,
  -- URL of the crawled product page.
  `data_url` varchar(300) DEFAULT NULL,
  -- URL of the product image.
  `pic_url` varchar(300) DEFAULT NULL,
  -- Product title (presumably the product name shown on the page).
  `title` varchar(300) DEFAULT NULL,
  -- Price kept as text, not as a numeric type.
  `price` varchar(10) DEFAULT NULL,
  -- Product specification parameters as free-form text.
  `param` text,
  -- Timestamp column; the name must be back-quoted in statements because
  -- CURRENT_TIME is a reserved word in MySQL.
  `current_time` datetime DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=0 DEFAULT CHARSET=utf8;

專案的包結構


pom.xml 檔案中的jar包依賴
<dependencies>
	        <dependency>
			<groupId>junit</groupId>
			<artifactId>junit</artifactId>
			<version>4.12</version>
			<scope>test</scope>
		</dependency>
		<dependency>
			<groupId>org.apache.httpcomponents</groupId>
			<artifactId>httpclient</artifactId>
			<version>4.4</version>
		</dependency>
		<dependency>
			<groupId>net.sourceforge.htmlcleaner</groupId>
			<artifactId>htmlcleaner</artifactId>
			<version>2.16</version>
		</dependency>
		<dependency>
			<groupId>org.json</groupId>
			<artifactId>json</artifactId>
			<version>20160212</version>
		</dependency>
		<dependency>
			<groupId>mysql</groupId>
			<artifactId>mysql-connector-java</artifactId>
			<version>5.1.38</version>
		</dependency>
		<dependency>
			<groupId>commons-dbutils</groupId>
			<artifactId>commons-dbutils</artifactId>
			<version>1.6</version>
		</dependency>
</dependencies>

編寫實體類
import java.util.HashMap;
import java.util.Map;

/**
 * Page entity.
 *
 * <p>Carries everything known about one crawled product page: the URL it came
 * from, the raw page source, and the fields extracted from that source.
 */
public class Page {

	/** Raw source of the downloaded page. */
	private String content;
	/** URL of the product page. */
	private String dataUrl;
	/** Product ID. */
	private String goodId;
	/** Product name. */
	private String goodName;
	/** URL of the product image. */
	private String picUrl;
	/** Product price. */
	private String price;
	/** Product specification parameters, keyed by name. */
	private final Map<String, String> param = new HashMap<String, String>();

	// ---- raw page ----

	public String getContent() {
		return this.content;
	}

	public void setContent(String content) {
		this.content = content;
	}

	public String getDataUrl() {
		return this.dataUrl;
	}

	public void setDataUrl(String dataUrl) {
		this.dataUrl = dataUrl;
	}

	// ---- extracted fields ----

	public String getGoodId() {
		return this.goodId;
	}

	public void setGoodId(String goodId) {
		this.goodId = goodId;
	}

	public String getGoodName() {
		return this.goodName;
	}

	public void setGoodName(String goodName) {
		this.goodName = goodName;
	}

	public String getPicUrl() {
		return this.picUrl;
	}

	public void setPicUrl(String picUrl) {
		this.picUrl = picUrl;
	}

	public String getPrice() {
		return this.price;
	}

	public void setPrice(String price) {
		this.price = price;
	}

	/**
	 * Returns the live parameter map (not a copy), so changes made through
	 * {@link #setParam(String, String)} are visible in it.
	 */
	public Map<String, String> getParam() {
		return this.param;
	}

	/**
	 * Adds or replaces a single entry in the parameter map.
	 *
	 * @param key   parameter name
	 * @param value parameter value
	 */
	public void setParam(String key, String value) {
		this.param.put(key, value);
	}
}

spider類
import cn.crxy.maven.Spider.domain.Page;
import cn.crxy.maven.Spider.download.Downloadable;
import cn.crxy.maven.Spider.process.Processable;
import cn.crxy.maven.Spider.store.Storeable;

/**
 * Facade over the three crawl stages. Each stage is delegated to a pluggable
 * collaborator that must be injected through its setter before the matching
 * operation is called.
 */
public class Spider {

	private Downloadable downloadable;
	private Processable processable;
	private Storeable storeable;

	// ---- collaborator wiring ----

	public Downloadable getDownloadable() {
		return this.downloadable;
	}

	public void setDownloadable(Downloadable downloadable) {
		this.downloadable = downloadable;
	}

	public Processable getProcessable() {
		return this.processable;
	}

	public void setProcessable(Processable processable) {
		this.processable = processable;
	}

	public Storeable getStoreable() {
		return this.storeable;
	}

	public void setStoreable(Storeable storeable) {
		this.storeable = storeable;
	}

	// ---- crawl stages ----

	/** Downloads the page source for {@code url} via the configured downloader. */
	public Page download(String url) {
		Page downloaded = this.downloadable.download(url);
		return downloaded;
	}

	/** Parses the page source via the configured processor. */
	public void process(Page page) {
		this.processable.process(page);
	}

	/** Persists the parsed data via the configured store. */
	public void store(Page page) {
		this.storeable.store(page);
	}
}

Downloadable介面類
import cn.crxy.maven.Spider.domain.Page;

/**
 * Download stage of the crawler: turns a URL into a {@link Page}.
 */
public interface Downloadable {

	/**
	 * Downloads the resource at the given URL.
	 *
	 * @param url address of the page to download
	 * @return the page produced for that URL
	 */
	Page download(String url);
}

DownloadImpl實現類
import cn.crxy.maven.Spider.domain.Page;
import cn.crxy.maven.Spider.utils.PageUtil;

/**
 * {@link Downloadable} implementation that fetches the page body through
 * {@link PageUtil#getContent(String)}.
 */
public class DownloadImpl implements Downloadable {

	/**
	 * Builds a new {@link Page} holding the content fetched for {@code url}
	 * together with the URL itself.
	 *
	 * @param url address of the page to download
	 * @return a page with its content and data URL set
	 */
	public Page download(String url) {
		Page result = new Page();
		result.setContent(PageUtil.getContent(url)); // fetch the body for this URL
		result.setDataUrl(url);
		return result;
	}
}

PageUtil頁面工具類
import java.io.IOException;

import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

/**
 * Fetches the content behind a URL.
 */
public class PageUtil {

	/**
	 * Performs an HTTP GET on {@code url} and returns the response body as a string.
	 *
	 * <p>The HTTP client and the response are always closed before this method
	 * returns, so no connection is leaked regardless of the outcome.
	 *
	 * @param url address to request
	 * @return the response body, or {@code null} when the request fails or the
	 *         response carries no entity
	 */
	public static String getContent(String url){
		// The builder produces an HTTP client, which plays the role of a browser here.
		HttpClientBuilder custom = HttpClients.custom();
		// Wrap the URL in a GET request.
		HttpGet httpGet = new HttpGet(url);
		String content = null;
		try {
			CloseableHttpClient client = custom.build();
			try {
				// Execute the request; the result is wrapped in the response object.
				CloseableHttpResponse response = client.execute(httpGet);
				try {
					HttpEntity entity = response.getEntity();
					// A response may legitimately have no entity; leave content as null then.
					if (entity != null) {
						content = EntityUtils.toString(entity);
					}
				} finally {
					// Releases the underlying connection.
					response.close();
				}
			} finally {
				client.close();
			}
		} catch (ClientProtocolException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}
		return content;
	}
}

Processable.java
import cn.crxy.maven.Spider.domain.Page;

/**
 * Parse stage of the crawler: works on a downloaded {@link Page}.
 */
public interface Processable {

	/**
	 * Processes the given page.
	 *
	 * @param page the page to process
	 */
	void process(Page page);
}

ProcessImpl.java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;
import org.json.JSONArray;
import org.json.JSONObject;

import cn.crxy.maven.Spider.domain.Page;
import cn.crxy.maven.Spider.utils.HtmlUtil;
import cn.crxy.maven.Spider.utils.PageUtil;

public class ProcessImpl implements Processable {

	public void process(Page page) {

		HtmlCleaner htmlCleaner = new HtmlCleaner();
		TagNode rootNode = htmlCleaner.clean(page.getContent());
		try {
			String goodName = HtmlUtil.getText(rootNode, "//*[@id='name']/h1");// 得到商品名稱
			page.setGoodName(goodName);

			String picUrl = HtmlUtil.getAttributeByName(rootNode, "//*[@id='spec-n1']/img","src");// 獲取商品圖片url
			page.setPicUrl("http:"+picUrl);

			// 獲取商品號
			String url = page.getDataUrl();
			Pattern compile = Pattern.compile("http://item.jd.com/([0-9]+).html");
			Matcher matcher = compile.matcher(url);
			String goodid = null;
			if (matcher.find()) {
				goodid = matcher.group(1);
				page.setGoodId(goodid);
			}

			// 獲取商品價格
			// 得到價格的json格式[{"id":"J_1593512","p":"17988.00","m":"17989.00"}]
			String pricejson = PageUtil
					.getContent("http://p.3.cn/prices/get?skuid=J_" + goodid);
			JSONArray jsonArray = new JSONArray(pricejson);
			JSONObject jsonObject = jsonArray.getJSONObject(0);
			String price = jsonObject.getString("p");
			page.setPrice(price);

			// 獲取規格引數
			// *[@id="product-detail-2"]
			// *[@id="product-detail-2"]/table/tbody/tr[1]/th
			Object[] evaluateXPath = rootNode
					.evaluateXPath("//*[@id='product-detail-2']/table/tbody/tr");
			JSONArray jsonArray2 = new JSONArray();
			if(evaluateXPath != null && evaluateXPath.length > 0){
				for(Object object : evaluateXPath){
					TagNode tagnode = (TagNode) object;
					if(!"".equals(tagnode.getText().toString().trim())){//有資料
						
						Object[] evaluateXPath2 = tagnode.evaluateXPath("/th");
						JSONObject jsonObject2 = new JSONObject();
						if(evaluateXPath2.length>0){
							TagNode tagNode2 = (TagNode) evaluateXPath2[0];
							jsonObject2.put("name", tagNode2.getText().toString());
							jsonObject2.put("value", "");
						}else {
							
							Object[] evaluateXPath3 = tagnode.evaluateXPath("/td");
							TagNode tagNode1 = (TagNode) evaluateXPath3[0];
							TagNode tagNode2 = (TagNode) evaluateXPath3[1];
							jsonObject2.put("name", tagNode1.getText().toString());
							jsonObject2.put("value", tagNode2.getText().toString());
						}
						jsonArray2.put(jsonObject2);
					}
				}
			}
			page.setParam("spec",jsonArray2.toString());
		} catch (XPatherException e) {
			e.printStackTrace();
		}
	}
}

ProcessImpl.java程式碼中的幾個注意點:
獲取商品名稱、圖片URL的xpath路徑



在京東商品頁面獲取商品價格的方式


  
得到如下的連結地址:
http://p.3.cn/prices/get?type=1&area=1_72_4137&pdtk=&pduid=1112434089&pdpin=&pdbp=0&skuid=J_1593512&callback=cnp
對連結進行處理(只保留 skuid 引數,即 http://p.3.cn/prices/get?skuid=J_1593512)後得到如下結果
  
商品引數規格的Xpath


Storeable.java
package cn.crxy.maven.Spider.store;

import cn.crxy.maven.Spider.domain.Page;

/**
 * Storage stage of the crawler: persists a processed {@link Page}.
 */
public interface Storeable {

	/**
	 * Stores the given page.
	 *
	 * @param page the page to store
	 */
	void store(Page page);
}

StoreImpl.java
package cn.crxy.maven.Spider.store;

import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Map;

import cn.crxy.maven.Spider.domain.Page;
import cn.crxy.maven.Spider.utils.MyDBUtils;

/**
 * {@link Storeable} implementation that writes a page to the database through
 * {@link MyDBUtils}.
 */
public class StoreImpl implements Storeable {

	/**
	 * Inserts one row built from the page's fields, its "spec" parameter and
	 * the current time formatted as {@code yyyy-MM-dd HH:mm:ss}.
	 *
	 * @param page the processed page to persist
	 */
	public void store(Page page) {
		Map<String, String> params = page.getParam();
		String timestamp = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date());

		// Argument order must follow the placeholder order of INSERT_LOG.
		MyDBUtils.update(
				MyDBUtils.INSERT_LOG,
				page.getGoodId(),
				page.getDataUrl(),
				page.getPicUrl(),
				page.getGoodName(),
				page.getPrice(),
				params.get("spec"),
				timestamp);
	}

}

MyDBUtils.java
package cn.crxy.maven.Spider.utils;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.dbutils.BasicRowProcessor;
import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.dbutils.handlers.ArrayListHandler;

/**
 * Static JDBC helper for the crawler: registers the MySQL driver once and
 * runs queries and updates, each on its own short-lived connection.
 */
public class MyDBUtils {
	private static final String className = "com.mysql.jdbc.Driver";
	private static final String url = "jdbc:mysql://localhost:3306/spider?"
			+ "useUnicode=true&characterEncoding=utf-8";
	// NOTE(review): credentials are hard-coded; consider moving them to external configuration.
	private static final String user = "root";
	private static final String password = "1234";
	private static final QueryRunner queryRunner = new QueryRunner();

	/**
	 * Insert statement for one crawled product. The table and column names
	 * match the `spider` table definition (goods_id, title, ...); the
	 * placeholders are, in order: product ID, page URL, image URL, product
	 * name, price, specification JSON, timestamp.
	 */
	public static final String INSERT_LOG = "INSERT INTO spider(goods_id,"
			+ "data_url,pic_url,title,price,param,`current_time`) "
			+ "VALUES(?,?,?,?,?,?,?)";

	// Utility class: not meant to be instantiated.
	private MyDBUtils() {
	}

	static {// Register the JDBC driver as soon as this class is first used.
		try {
			Class.forName(className);
		} catch (Exception e) {
			e.printStackTrace();
			// Keep the cause so the failure can be diagnosed.
			throw new RuntimeException(e);
		}
	}

	/**
	 * Runs a query and returns the first column of every row as a string.
	 *
	 * @param sql the query to execute
	 * @return one entry per row ({@code null} for an SQL NULL); empty when the
	 *         query returns no rows or fails
	 */
	public static List<String> executeQuerySql(String sql) {
		List<String> result = new ArrayList<String>();
		try {
			Connection connection = getConnection();
			try {
				List<Object[]> rows = queryRunner.query(connection, sql,
						new ArrayListHandler());
				for (Object[] row : rows) {
					if (row.length > 0) {
						result.add(row[0] == null ? null : row[0].toString());
					}
				}
			} finally {
				// QueryRunner does not close a connection that is passed in.
				connection.close();
			}
		} catch (SQLException e) {
			e.printStackTrace();
		}
		return result;
	}

	/**
	 * Executes an INSERT, UPDATE or DELETE statement.
	 *
	 * @param sql    the statement, with {@code ?} placeholders
	 * @param params values bound to the placeholders, in order
	 */
	public static void update(String sql, Object... params) {
		try {
			Connection connection = getConnection();
			try {
				queryRunner.update(connection, sql, params);
			} finally {
				// Close even when the statement fails, so the connection is not leaked.
				connection.close();
			}
		} catch (SQLException e) {
			e.printStackTrace();
		}
	}

	// Opens a new connection with the configured URL and credentials.
	private static Connection getConnection() throws SQLException {
		return DriverManager.getConnection(url, user, password);
	}
}
HtmlUtils.java
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;

/**
 * XPath convenience helpers on top of HtmlCleaner's {@link TagNode}.
 */
public class HtmlUtils {

	/**
	 * Returns the text of the first node matched by an XPath expression.
	 *
	 * @param tagNode node the expression is evaluated against
	 * @param xpath   XPath expression selecting the wanted tag
	 * @return the text of the first match, or {@code null} when nothing
	 *         matches or the expression cannot be evaluated
	 */
	public static String getText(TagNode tagNode,String xpath){
		try {
			TagNode match = firstMatch(tagNode, xpath);
			if (match == null) {
				return null;
			}
			return match.getText().toString();
		} catch (XPatherException e) {
			e.printStackTrace();
			return null;
		}
	}

	/**
	 * Returns the value of an attribute on the first node matched by an XPath expression.
	 *
	 * @param tagNode node the expression is evaluated against
	 * @param xpath   XPath expression selecting the wanted tag
	 * @param attr    name of the attribute to read
	 * @return the attribute value of the first match, or {@code null} when
	 *         nothing matches or the expression cannot be evaluated
	 */
	public static String getAttributeByName(TagNode tagNode,String xpath,String attr){
		try {
			TagNode match = firstMatch(tagNode, xpath);
			if (match == null) {
				return null;
			}
			return match.getAttributeByName(attr);
		} catch (XPatherException e) {
			e.printStackTrace();
			return null;
		}
	}

	// Evaluates the expression and returns its first result as a TagNode, or null when there is none.
	private static TagNode firstMatch(TagNode tagNode, String xpath) throws XPatherException {
		Object[] matches = tagNode.evaluateXPath(xpath);
		if (matches == null || matches.length == 0) {
			return null;
		}
		return (TagNode) matches[0];
	}

}



在src/test/java資料夾下面的包中新建test類 TestSpider.java
package cn.crxy.maven.Spider;

import org.junit.Test;

import cn.crxy.maven.Spider.domain.Page;
import cn.crxy.maven.Spider.download.DownloadImpl;
import cn.crxy.maven.Spider.process.ProcessImpl;
import cn.crxy.maven.Spider.store.StoreImpl;

/**
 * Runs the whole crawl once against a single JD item page: download, parse, store.
 */
public class TestSpider {

	@Test
	public void test1() throws Exception {
		final String itemUrl = "http://item.jd.com/1593512.html";

		// Wire the concrete implementation of each stage into the spider.
		Spider crawler = new Spider();
		crawler.setDownloadable(new DownloadImpl());
		crawler.setProcessable(new ProcessImpl());
		crawler.setStoreable(new StoreImpl());

		// Run the three stages in order on the same page object.
		Page downloaded = crawler.download(itemUrl);
		crawler.process(downloaded);
		crawler.store(downloaded);
	}
}

執行test測試方法,在資料庫中插入了資料