1. 程式人生 > Spring Boot + JSoup 抓取京東商品資訊

Spring Boot + JSoup 抓取京東商品資訊

需求分析

  1. 匯入京東商品URL列表
  2. 生成京東商品資訊並輸出到excel表

思路

  1. 讀取excel獲取URL列表
  2. 訪問url並獲得HTML原始碼
  3. 提取對應的商品資訊欄位
  4. 輸出到excel

搭建框架

  1. 建立Spring Boot工程

    這裡寫圖片描述

    這裡寫圖片描述

  2. 選擇依賴

    這裡寫圖片描述

  3. 配置pom.xml

    <?xml version="1.0" encoding="UTF-8"?>
    <project xmlns="http://maven.apache.org/POM/4.0.0"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
        <modelVersion>4.0.0</modelVersion>

        <groupId>com.landawang</groupId>
        <artifactId>operation-web</artifactId>
        <version>0.0.1-SNAPSHOT</version>
        <packaging>jar</packaging>

        <name>operation-web</name>
        <description>Demo project for Spring Boot</description>

        <parent>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-parent</artifactId>
            <version>2.0.3.RELEASE</version>
            <relativePath/> <!-- lookup parent from repository -->
        </parent>

        <properties>
            <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
            <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
            <java.version>10</java.version>
        </properties>

        <dependencies>
            <dependency>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-starter-web</artifactId>
            </dependency>
            <dependency>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-devtools</artifactId>
                <scope>runtime</scope>
            </dependency>
            <dependency>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-starter-test</artifactId>
                <scope>test</scope>
            </dependency>
            <!-- thymeleaf -->
            <dependency>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-starter-thymeleaf</artifactId>
            </dependency>
            <dependency>
                <groupId>org.apache.poi</groupId>
                <artifactId>poi-ooxml</artifactId>
                <version>3.10-FINAL</version>
            </dependency>
            <dependency>
                <groupId>org.apache.httpcomponents</groupId>
                <artifactId>httpclient</artifactId>
                <version>4.5.1</version>
            </dependency>
            <dependency>
                <!-- jsoup HTML parser library @ https://jsoup.org/ -->
                <groupId>org.jsoup</groupId>
                <artifactId>jsoup</artifactId>
                <version>1.11.3</version>
            </dependency>
        </dependencies>

        <build>
            <plugins>
                <plugin>
                    <groupId>org.springframework.boot</groupId>
                    <artifactId>spring-boot-maven-plugin</artifactId>
                </plugin>
            </plugins>
        </build>
    </project>
  4. 重新命名application.properties為application.yml,修改內容

    server:
      port: 8080 # server port
    
    spring:
      thymeleaf:
        # Disable the cache during development; otherwise page changes do not show up immediately
        cache: false
        prefix: classpath:/templates/
        suffix: .html
  5. 建立resources/templates/index.html

    <!DOCTYPE html>
    <html lang="zh-Hant" xmlns:th="http://www.thymeleaf.org">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
        <title>生成商品資訊</title>
    </head>
    <body>

    <h3>請選擇excel檔案</h3>

    <!--
      Uploads the chosen workbook as multipart form data to /excel/upload.
      The field name "file" must match the @RequestParam("file") of the receiving controller.
      Only .xlsx is offered because the server parses the upload with XSSFWorkbook.
    -->
    <form action="/excel/upload" enctype="multipart/form-data" method="post">
        <input type="file" name="file" accept=".xlsx" required/>
        <input type="submit" value="匯入"/>
    </form>
    </body>
    </html>
  6. 建立PageController並訪問頁面

    /**
     * Created with IntelliJ IDEA.
     * Project: operation-web
     * Package: com.landawang.operationweb.controller
     * User: Wangxin
     * Date: 2018/7/12
     * Time: 16:46
     * Copyright © 2018年 黃旺鑫. All rights reserved.
     */
    
    
    package com.landawang.operationweb.controller;
    
    import org.springframework.stereotype.Controller;
    import org.springframework.web.bind.annotation.RequestMapping;
    
    /**
     * MVC controller that serves the page containing the Excel upload form.
     */
    @Controller
    public class PageController {

        /** Name of the view returned for the root path. */
        private static final String INDEX_VIEW = "index";

        /**
         * Handles requests mapped to {@code "/"}.
         *
         * @return the view name {@code "index"}
         */
        @RequestMapping("/")
        public String index() {
            return INDEX_VIEW;
        }
    }

提取商品資訊

  1. ExcelController.java,編寫實現邏輯

    /**
     * Created with IntelliJ IDEA.
     * Project: operation-web
     * Package: com.landawang.operationweb.controller
     * User: Wangxin
     * Date: 2018/7/12
     * Time: 17:26
     * Copyright © 2018年 黃旺鑫. All rights reserved.
     */
    
    
    package com.landawang.operationweb.controller;
    
    import org.apache.http.Consts;
    import org.apache.http.HttpEntity;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.util.EntityUtils;
    import org.apache.poi.xssf.usermodel.*;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    import org.openxmlformats.schemas.spreadsheetml.x2006.main.STCellType;
    import org.springframework.web.bind.annotation.PostMapping;
    import org.springframework.web.bind.annotation.RequestMapping;
    import org.springframework.web.bind.annotation.RequestParam;
    import org.springframework.web.bind.annotation.RestController;
    import org.springframework.web.multipart.MultipartFile;
    
    import javax.servlet.http.HttpServletResponse;
    import java.io.IOException;
    import java.io.InputStream;
    import java.util.Date;
    import java.util.HashMap;
    import java.util.Map;
    
    
    /**
     * Accepts an uploaded .xlsx workbook whose first sheet holds one product page URL per
     * row (first column), downloads every page, extracts the product code, name and
     * category with jsoup, and streams the result back as a new .xlsx download.
     */
    @RestController
    @RequestMapping("/excel")
    public class ExcelController {

        /** Browser-like User-Agent sent with every page request. */
        private static final String USER_AGENT =
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0";

        /** HTTP status code that marks a page as successfully fetched. */
        private static final int HTTP_OK = 200;

        /**
         * Handles the upload: builds the result workbook and writes it to the response
         * as an attachment named after the current timestamp.
         *
         * <p>If anything fails before the response is committed, the failure is printed
         * and a 500 error is sent instead of an empty 200 response.
         *
         * @param response servlet response the generated workbook is written to
         * @param file     uploaded .xlsx file containing the URL list
         */
        @PostMapping("/upload")
        public void uploadExcel(HttpServletResponse response, @RequestParam("file") MultipartFile file) {
            try {
                // Build the complete workbook first so that a failure while scraping
                // can still be reported with a proper error status.
                XSSFWorkbook newWorkbook = buildProductWorkbook(file);

                response.setContentType("application/octet-stream");
                // Name the download after the current timestamp.
                String fileName = System.currentTimeMillis() + ".xlsx";
                response.setHeader("Content-disposition", "attachment;filename=" + fileName);

                newWorkbook.write(response.getOutputStream());
            } catch (Exception e) {
                // Request boundary: nothing above this method can handle the failure.
                reportFailure(response, e);
            }
        }

        /**
         * Reads the URL list from the uploaded workbook and produces the output workbook.
         * Output row {@code i + 1} corresponds to input row {@code i}; row 0 is the header.
         *
         * @param file uploaded .xlsx file
         * @return workbook with one header row and one row per input row
         * @throws IOException if the upload cannot be read or the HTTP client cannot be closed
         */
        private static XSSFWorkbook buildProductWorkbook(MultipartFile file) throws IOException {
            // One HTTP client is shared by all requests and closed together with the upload stream.
            try (InputStream upload = file.getInputStream();
                 CloseableHttpClient httpClient = HttpClients.createDefault()) {

                XSSFSheet sheet = new XSSFWorkbook(upload).getSheetAt(0);

                XSSFWorkbook newWorkbook = new XSSFWorkbook();
                XSSFSheet newSheet = newWorkbook.createSheet();
                writeCells(newSheet.createRow(0), "商品編碼", "商品名稱", "商品分類");
                newSheet.setColumnWidth(0, 2560);
                newSheet.setColumnWidth(1, 25600);
                newSheet.setColumnWidth(2, 5120);

                // getLastRowNum() is the 0-based index of the last row, so it is inclusive.
                for (int i = 0; i <= sheet.getLastRowNum(); i++) {
                    String url = readUrl(sheet.getRow(i));
                    XSSFRow newRow = newSheet.createRow(i + 1);

                    if (url.contains("http")) {
                        Map<String, String> data = fetchOrNull(httpClient, url);
                        if (data != null) {
                            writeCells(newRow, data.get("sku"), data.get("name"), data.get("cat"));
                        }
                    }

                    // Debug output
                    System.out.println("\n內容是:" + url);
                }

                return newWorkbook;
            }
        }

        /**
         * Returns the trimmed text of the first cell of a row, or an empty string when
         * the row or cell is missing or the cell does not hold text.
         */
        private static String readUrl(XSSFRow row) {
            XSSFCell cell = (row == null) ? null : row.getCell(0);
            if (cell == null) {
                return "";
            }
            try {
                String value = cell.getStringCellValue();
                return (value == null) ? "" : value.trim();
            } catch (IllegalStateException notText) {
                // Numeric, boolean and error cells cannot contain a URL; treat them as empty.
                return "";
            }
        }

        /**
         * Writes the given values into consecutive cells of the row, starting at column 0.
         * The cell type is left to {@code setCellValue(String)}.
         */
        private static void writeCells(XSSFRow row, String... values) {
            for (int column = 0; column < values.length; column++) {
                row.createCell(column).setCellValue(values[column]);
            }
        }

        /**
         * Fetches the product information for one URL; a URL that cannot be requested
         * leaves its output row blank instead of aborting the whole export.
         *
         * @return the extracted fields, or {@code null} if the page could not be fetched
         */
        private static Map<String, String> fetchOrNull(CloseableHttpClient httpClient, String url) {
            try {
                return getProductInfo(httpClient, url);
            } catch (IOException | IllegalArgumentException e) {
                // IllegalArgumentException: the cell text is not a valid URI.
                System.err.println("Failed to fetch " + url + ": " + e);
                return null;
            }
        }

        /**
         * Downloads a product page and extracts its fields.
         *
         * @param httpClient client used to execute the request
         * @param url        product page URL
         * @return the extracted fields, or {@code null} when the status is not 200 or the
         *         response has no body
         * @throws IOException if the request fails or the body cannot be read
         */
        private static Map<String, String> getProductInfo(CloseableHttpClient httpClient, String url)
                throws IOException {
            HttpGet httpGet = new HttpGet(url);
            // Pretend to be a regular browser.
            httpGet.setHeader("User-Agent", USER_AGENT);

            // try-with-resources releases the connection even if reading the status or body fails.
            try (CloseableHttpResponse pageResponse = httpClient.execute(httpGet)) {
                HttpEntity entity = pageResponse.getEntity();
                if (pageResponse.getStatusLine().getStatusCode() != HTTP_OK || entity == null) {
                    // Drain the body so the connection can be reused.
                    EntityUtils.consume(entity);
                    return null;
                }
                return getData(EntityUtils.toString(entity, Consts.UTF_8));
            }
        }

        /**
         * Parses a product page and extracts its fields.
         *
         * @param html HTML source of the product page
         * @return map with the keys {@code "name"}, {@code "sku"} and {@code "cat"}; a field
         *         that is not found on the page maps to an empty string
         */
        private static Map<String, String> getData(String html) {
            Document doc = Jsoup.parse(html);

            // Title
            String name = doc.select("div[class=sku-name]").text();
            // data-sku value of the first link that carries the attribute
            String sku = doc.select("a[data-sku]").attr("data-sku");
            // Category
            String cat = doc.select("a[clstag=shangpin|keycount|product|mbNav-1]").text();

            Map<String, String> data = new HashMap<>();
            data.put("name", name);
            data.put("sku", sku);
            data.put("cat", cat);

            System.out.print(sku + "---------" + cat + "---------" + name);

            return data;
        }

        /**
         * Prints the failure and, when the response has not been committed yet, replaces
         * whatever was prepared so far with a 500 error.
         */
        private static void reportFailure(HttpServletResponse response, Exception cause) {
            cause.printStackTrace();
            if (response.isCommitted()) {
                return;
            }
            try {
                // Drop the download headers that may already have been set.
                response.reset();
                response.sendError(HttpServletResponse.SC_INTERNAL_SERVER_ERROR,
                        "Failed to generate the product workbook");
            } catch (IOException sendFailure) {
                sendFailure.printStackTrace();
            }
        }

    }

測試

  1. 準備excel表
    這裡寫圖片描述

  2. 獲取時間戳命名的excel表
    這裡寫圖片描述