1. 程式人生 > >爬蟲記錄(6)——爬蟲實戰:爬取知乎網站內容,儲存到資料庫,並匯出到Excel

爬蟲記錄(6)——爬蟲實戰:爬取知乎網站內容,儲存到資料庫,並匯出到Excel

前面幾篇文章我們介紹了用相關的爬蟲方法爬取網站內容和網站的圖片,並儲存到資料庫中。

今天呢,我們來一次實戰練習:爬取知乎網站根話題top的幾個問題和答案,然後儲存到資料庫中,最後把資料庫中的所有內容再匯出到Excel中。我們還是繼續之前的程式碼,同樣的程式碼就不貼出來了,如果有不瞭解的同學,可以檢視之前的文章,或者文章末尾附有git倉庫地址,可以自己下載檢視所有程式碼。

1、ExcelUtils Excel匯出工具類

package com.dyw.crawler.util;

import org.apache.poi.hssf.usermodel.*;
import org.apache
.poi.hssf.util.HSSFColor; import java.io.IOException; import java.io.OutputStream; import java.lang.reflect.Field; import java.lang.reflect.Method; import java.text.SimpleDateFormat; import java.util.Collection; import java.util.Date; import java.util.Iterator; import java.util.regex.Matcher; import java.util
.regex.Pattern; /** * excel 工具類 * Created by dyw on 2017/9/14. */ public class ExcelUtils<T> { public void exportExcel(String title, Collection<T> dataset, OutputStream out) { exportExcel(title, null, dataset, out, "yyyy-MM-dd"); } public void exportExcel(String title, String[] headers, Collection<T> dataset, OutputStream out
) { exportExcel(title, headers, dataset, out, "yyyy-MM-dd"); } /** * 這是一個通用的方法,利用了JAVA的反射機制,可以將放置在JAVA集合中並且符號一定條件的資料以EXCEL 的形式輸出到指定IO裝置上 * * @param title 表格標題名 * @param headers 表格屬性列名陣列 * @param dataset 需要顯示的資料集合,集合中一定要放置符合javabean風格的類的物件。此方法支援的 * javabean屬性的資料型別有基本資料型別及String,Date,byte[](圖片資料) * @param out 與輸出裝置關聯的流物件,可以將EXCEL文件匯出到本地檔案或者網路中 * @param pattern 如果有時間資料,設定輸出格式。預設為"yyy-MM-dd" */ public void exportExcel(String title, String[] headers, Collection<T> dataset, OutputStream out, String pattern) { // 宣告一個工作薄 HSSFWorkbook workbook = new HSSFWorkbook(); // 生成一個表格 HSSFSheet sheet = workbook.createSheet(title); // 設定表格預設列寬度為15個位元組 sheet.setDefaultColumnWidth((short) 15); // 生成一個樣式 HSSFCellStyle style = workbook.createCellStyle(); // 設定這些樣式 style.setFillForegroundColor(HSSFColor.SKY_BLUE.index); style.setFillPattern(HSSFCellStyle.SOLID_FOREGROUND); style.setBorderBottom(HSSFCellStyle.BORDER_THIN); style.setBorderLeft(HSSFCellStyle.BORDER_THIN); style.setBorderRight(HSSFCellStyle.BORDER_THIN); style.setBorderTop(HSSFCellStyle.BORDER_THIN); style.setAlignment(HSSFCellStyle.ALIGN_CENTER); // 生成一個字型 HSSFFont font = workbook.createFont(); font.setColor(HSSFColor.VIOLET.index); font.setFontHeightInPoints((short) 12); font.setBoldweight(HSSFFont.BOLDWEIGHT_BOLD); // 把字型應用到當前的樣式 style.setFont(font); // 生成並設定另一個樣式 HSSFCellStyle style2 = workbook.createCellStyle(); style2.setFillForegroundColor(HSSFColor.LIGHT_YELLOW.index); style2.setFillPattern(HSSFCellStyle.SOLID_FOREGROUND); style2.setBorderBottom(HSSFCellStyle.BORDER_THIN); style2.setBorderLeft(HSSFCellStyle.BORDER_THIN); style2.setBorderRight(HSSFCellStyle.BORDER_THIN); style2.setBorderTop(HSSFCellStyle.BORDER_THIN); style2.setAlignment(HSSFCellStyle.ALIGN_CENTER); style2.setVerticalAlignment(HSSFCellStyle.VERTICAL_CENTER); // 生成另一個字型 HSSFFont font2 = workbook.createFont(); font2.setBoldweight(HSSFFont.BOLDWEIGHT_NORMAL); // 把字型應用到當前的樣式 style2.setFont(font2); // 宣告一個畫圖的頂級管理器 HSSFPatriarch patriarch = 
sheet.createDrawingPatriarch(); // 定義註釋的大小和位置,詳見文件 HSSFComment comment = patriarch.createComment(new HSSFClientAnchor(0, 0, 0, 0, (short) 4, 2, (short) 6, 5)); // 設定註釋內容 comment.setString(new HSSFRichTextString("可以在POI中添加註釋!")); // 設定註釋作者,當滑鼠移動到單元格上是可以在狀態列中看到該內容. comment.setAuthor("leno"); // 產生表格標題行 HSSFRow row = sheet.createRow(0); for (short i = 0; i < headers.length; i++) { HSSFCell cell = row.createCell(i); cell.setCellStyle(style); HSSFRichTextString text = new HSSFRichTextString(headers[i]); cell.setCellValue(text); } // 遍歷集合資料,產生資料行 Iterator<T> it = dataset.iterator(); int index = 0; while (it.hasNext()) { index++; row = sheet.createRow(index); T t = (T) it.next(); // 利用反射,根據javabean屬性的先後順序,動態呼叫getXxx()方法得到屬性值 Field[] fields = t.getClass().getDeclaredFields(); for (short i = 0; i < fields.length; i++) { HSSFCell cell = row.createCell(i); cell.setCellStyle(style2); Field field = fields[i]; String fieldName = field.getName(); String getMethodName = "get" + fieldName.substring(0, 1).toUpperCase() + fieldName.substring(1); try { Class tCls = t.getClass(); Method getMethod = tCls.getMethod(getMethodName, new Class[]{}); Object value = getMethod.invoke(t, new Object[]{}); // 判斷值的型別後進行強制型別轉換 String textValue = null; // if (value instanceof Integer) { // int intValue = (Integer) value; // cell.setCellValue(intValue); // } else if (value instanceof Float) { // float fValue = (Float) value; // textValue = new HSSFRichTextString( // String.valueOf(fValue)); // cell.setCellValue(textValue); // } else if (value instanceof Double) { // double dValue = (Double) value; // textValue = new HSSFRichTextString( // String.valueOf(dValue)); // cell.setCellValue(textValue); // } else if (value instanceof Long) { // long longValue = (Long) value; // cell.setCellValue(longValue); // } if (value instanceof Boolean) { boolean bValue = (Boolean) value; textValue = "男"; if (!bValue) { textValue = "女"; } } else if (value instanceof Date) { Date date = (Date) value; SimpleDateFormat sdf 
= new SimpleDateFormat(pattern); textValue = sdf.format(date); } else if (value instanceof byte[]) { // 有圖片時,設定行高為60px; row.setHeightInPoints(60); // 設定圖片所在列寬度為80px,注意這裡單位的一個換算 sheet.setColumnWidth(i, (short) (35.7 * 80)); // sheet.autoSizeColumn(i); byte[] bsValue = (byte[]) value; HSSFClientAnchor anchor = new HSSFClientAnchor(0, 0, 1023, 255, (short) 6, index, (short) 6, index); anchor.setAnchorType(2); patriarch.createPicture(anchor, workbook.addPicture( bsValue, HSSFWorkbook.PICTURE_TYPE_JPEG)); } else { // 其它資料型別都當作字串簡單處理 if (null == value) { textValue = ""; } else { textValue = value.toString(); } } // 如果不是圖片資料,就利用正則表示式判斷textValue是否全部由數字組成 if (textValue != null) { Pattern p = Pattern.compile("^//d+(//.//d+)?$"); Matcher matcher = p.matcher(textValue); if (matcher.matches()) { // 是數字當作double處理 cell.setCellValue(Double.parseDouble(textValue)); } else { HSSFRichTextString richString = new HSSFRichTextString( textValue); HSSFFont font3 = workbook.createFont(); font3.setColor(HSSFColor.BLUE.index); richString.applyFont(font3); cell.setCellValue(richString); } } } catch (Exception e) { e.printStackTrace(); } } } try { workbook.write(out); } catch (IOException e) { e.printStackTrace(); } } }

2、RegularCollection 正則表示式集合類

因為提取網站內容時,不同的內容會涉及到不同的正則表示式,所以我們這裡把所有的正則表示式提取出來,放到一個單獨的類中。

package com.dyw.crawler.file;

/**
 * Central collection of the regular expressions used by the crawler.
 * Keeping all site-specific patterns here keeps them out of the crawling logic.
 * Created by dyw on 2017/9/14.
 */
public class RegularCollection {

    // Matches <img> tags; group 1 captures the src attribute value.
    public static final String IMGURL_REG = "<img.*src=(.*?)[^>]*?>";
    // Matches href="..." attributes; group 1 captures the link target.
    public static final String AURL_REG = "href=\"(.*?)\"";
    // Matches absolute image URLs (scheme://...) ending in png|jpg|bmp|gif.
    public static final String IMGSRC_REG = "[a-zA-z]+://[^\\s]*(?:png|jpg|bmp|gif)";
    // Matches relative image paths (no scheme) ending in png|jpg|bmp|gif.
    public static final String IMGSRC_REG1 = "/[^\\s]*(?:png|jpg|bmp|gif)";

    /* ************************** Zhihu patterns ************************** */
    // Extracts question link fragments ("_link ... target") from the topic page.
    public static final String ZHIHU_QUESTION_link = "_link.*target";
    // Extracts a question URI (path ending in 8 digits, e.g. /question/12345678).
    public static final String ZHIHU_QUESTION_URI = "/.*[0-9]{8}";
    // Question title. Sample match: Header-title">有哪些值得一提的生活竅門?<
    // The caller strips the wrapper with substring(14, length - 1).
    public static final String ZHIHU_TITLE = "Header-title.*?<";
    // Current answer count. Sample match: List-headerText"><span>344 個回答</
    // The caller strips the wrapper with substring(23, ...).
    public static final String ZHIHU_ANSWER = "List-headerText.*?</";
    // Follower count and view count (two matches). Sample: NumberBoard-value">4894</
    // The caller strips the wrapper with substring(19, ...).
    public static final String ZHIHU_CONCERN = "NumberBoard-value.*?</";
    // Answer body, from the rich-text span up to the timestamp div;
    // the caller strips the wrapper with fixed substring offsets.
    public static final String ZHIHU_ANSWER_CONTENT = "CopyrightRichText-richText\" itemprop.*?<div class=\"ContentItem-time\">";
    // Answer up-vote count; the caller strips the wrapper with fixed substring offsets.
    public static final String ZHIHU_LIKE_COUNT = "AnswerItem-extraInfo.*?</button>";
}

3、ExcelTitleConllection excel 標題集合類

package com.dyw.crawler.file;

/**
 * Excel column-header labels for the Zhihu export.
 * 21 entries: title, content, follower count, view count, answer count,
 * then (answer, like count, comment count) for answers one through five,
 * and finally the crawl time — matching the zhihu table's column order.
 * Created by dyw on 2017/9/17.
 */
public class ExcelTitleConllection {

    public static final String[] ZHIHUTITLE = {"標題", "內容", "關注者數", "瀏覽數", "答案數",
            "答案一", "點贊數一","評論數一",
            "答案二", "點贊數二","評論數二",
            "答案三", "點贊數三","評論數三",
            "答案四", "點贊數四","評論數四",
            "答案五", "點贊數五","評論數五","爬取時間"};
}

4、URICollection 需要爬取的URI的集合

package com.dyw.crawler.file;

/**
 * Collection of the URLs the crawler targets.
 * Created by dyw on 2017/9/14.
 */
public class URICollection {

    /* ************************** Zhihu URLs ************************** */
    /**
     * Zhihu site base URL (question URIs are appended to this).
     */
    public static final String ZHIHU = "https://www.zhihu.com";
    /**
     * Zhihu root-topic "hot" page URL — the page whose top questions are crawled.
     */
    public static final String ZHIHUTOPIC = "https://www.zhihu.com/topic/19776749/hot";
}

5、main主方法

package com.dyw.crawler.project;

import com.dyw.crawler.file.ExcelTitleConllection;
import com.dyw.crawler.file.RegularCollection;
import com.dyw.crawler.file.URICollection;
import com.dyw.crawler.model.Zhihu;
import com.dyw.crawler.util.ConnectionPool;
import com.dyw.crawler.util.CrawlerUtils;
import com.dyw.crawler.util.ExcelUtils;
import com.dyw.crawler.util.RegularUtils;

import java.io.FileOutputStream;
import java.io.OutputStream;
import java.sql.Connection;
import java.sql.Date;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Crawls the top questions of Zhihu's root-topic "hot" page — title, answer
 * count, follower/view counts, and the first two answers with their up-vote
 * counts — stores each question in the database, then exports the whole
 * zhihu table to an Excel file.
 * Created by dyw on 2017/9/12.
 */
public class Project5 {

    public static void main(String[] args) throws Exception {

        ConnectionPool connectionPool = new ConnectionPool();
        connectionPool.createPool();
        // Page to crawl (the root-topic hot list).
        String dataUrl = URICollection.ZHIHUTOPIC;
        try {
            Map<String, String> map = new HashMap<>();
            // Paste the cookie captured from a logged-in browser session here
            // (programmatic login did not return the complete cookie).
            map.put("Cookie", "z_c0=");
            String html = CrawlerUtils.get(dataUrl, map);
            // Extract the question links from the topic page, then reduce them to URIs.
            List<String> list = RegularUtils.match(RegularCollection.ZHIHU_QUESTION_link, html);
            List<String> uriLists = RegularUtils.match(RegularCollection.ZHIHU_QUESTION_URI, list);

            uriLists.forEach(uri -> {
                String url = URICollection.ZHIHU + uri;
                try {
                    // Question detail page.
                    String detailHtml = CrawlerUtils.get(url, map);
                    // Title — the substring offsets strip the HTML wrapper
                    // matched by the pattern (see RegularCollection).
                    List<String> match = RegularUtils.match(RegularCollection.ZHIHU_TITLE, detailHtml);
                    String title = match.get(0);
                    title = title.substring(14, title.length() - 1);
                    // Answer count.
                    List<String> match1 = RegularUtils.match(RegularCollection.ZHIHU_ANSWER, detailHtml);
                    String answerCount = match1.get(0);
                    answerCount = answerCount.substring(23, answerCount.length() - 2);
                    // Follower count (first match) and view count (second match).
                    List<String> match2 = RegularUtils.match(RegularCollection.ZHIHU_CONCERN, detailHtml);
                    String concern = match2.get(0);
                    concern = concern.substring(19, concern.length() - 2);
                    String browsed = match2.get(1);
                    browsed = browsed.substring(19, browsed.length() - 2);
                    // First two answer bodies.
                    List<String> match3 = RegularUtils.match(RegularCollection.ZHIHU_ANSWER_CONTENT, detailHtml);
                    String answer1 = match3.get(0);
                    answer1 = answer1.substring(44, answer1.length() - 40);
                    String answer2 = match3.get(1);
                    answer2 = answer2.substring(44, answer2.length() - 40);
                    // Up-vote counts for those two answers.
                    List<String> match4 = RegularUtils.match(RegularCollection.ZHIHU_LIKE_COUNT, detailHtml);
                    String like1 = match4.get(0);
                    like1 = like1.substring(94, like1.length() - 9);
                    String like2 = match4.get(1);
                    like2 = like2.substring(94, like2.length() - 9);

                    Connection conn = connectionPool.getConnection();
                    try {
                        // Comment counts and answers 3-5 are not crawled; stored as "".
                        Zhihu zhihu = new Zhihu(title, "", concern, browsed, answerCount, answer1, like1, "", answer2, like2, "", "", "", "", "", "", "", "", "", "");
                        executeInsert(conn, zhihu);
                    } finally {
                        // Always return the connection, even if the insert fails
                        // (the original leaked it on exception).
                        connectionPool.returnConnection(conn);
                    }
                } catch (Exception e) {
                    // Best effort per question: a bad page is logged and skipped.
                    e.printStackTrace();
                }
            });
        } catch (Exception e) {
            e.printStackTrace();
        }
        // Export the whole zhihu table to Excel. try-with-resources closes the
        // stream even when the export throws (the original leaked it then).
        try (OutputStream out = new FileOutputStream("C:\\Users\\dyw\\Desktop\\crawler\\a.xls")) {
            Connection conn1 = connectionPool.getConnection();
            List<Zhihu> list;
            try {
                list = executeQuery(conn1);
            } finally {
                connectionPool.returnConnection(conn1);
            }
            ExcelUtils<Zhihu> ex = new ExcelUtils<>();
            ex.exportExcel("知乎根話題top5問題及答案", ExcelTitleConllection.ZHIHUTITLE, list, out);
        }
    }

    /**
     * Inserts one crawled question as a row of the zhihu table.
     *
     * @param conn  pooled JDBC connection (caller returns it to the pool)
     * @param zhihu crawled question entity
     * @throws Exception if the insert fails
     */
    private static void executeInsert(Connection conn, Zhihu zhihu) throws Exception {
        String insertSql = "insert into zhihu (title,content,concern,browsed,answer_count, answer1,like1,comment1, answer2,like2,comment2, answer3,like3,comment3, answer4,like4,comment4, answer5,like5,comment5, crawler_date) " +
                "values (?,?,?,?,?, ?,?,?, ?,?,?, ?,?,?, ?,?,?, ?,?,?, ?)";
        // try-with-resources closes the statement even when executeUpdate throws
        // (the original leaked it on exception).
        try (PreparedStatement preparedStatement = conn.prepareStatement(insertSql)) {
            // Parameters 1-20 are the string columns, in table-column order.
            String[] values = {
                    zhihu.getTitle(), zhihu.getContent(), zhihu.getConcern(),
                    zhihu.getBrowsed(), zhihu.getAnswerCount(),
                    zhihu.getAnswer1(), zhihu.getLike1(), zhihu.getComment1(),
                    zhihu.getAnswer2(), zhihu.getLike2(), zhihu.getComment2(),
                    zhihu.getAnswer3(), zhihu.getLike3(), zhihu.getComment3(),
                    zhihu.getAnswer4(), zhihu.getLike4(), zhihu.getComment4(),
                    zhihu.getAnswer5(), zhihu.getLike5(), zhihu.getComment5(),
            };
            for (int i = 0; i < values.length; i++) {
                preparedStatement.setString(i + 1, values[i]);
            }
            // Parameter 21: crawl timestamp (now).
            preparedStatement.setDate(21, new Date(System.currentTimeMillis()));
            preparedStatement.executeUpdate();
        }
    }

    /**
     * Loads every row of the zhihu table. Only the first answer block is read
     * back (columns 2-9); the remaining entity fields are filled with "".
     * (Renamed from the original's misspelled {@code excueteQuery}.)
     *
     * @param conn pooled JDBC connection (caller returns it to the pool)
     * @return all stored questions
     * @throws Exception if the query fails
     */
    private static List<Zhihu> executeQuery(Connection conn) throws Exception {
        List<Zhihu> list = new ArrayList<>();
        String querySql = "select * from zhihu ";
        // try-with-resources closes both statement and result set
        // (the original closed neither).
        try (PreparedStatement preparedStatement = conn.prepareStatement(querySql);
             ResultSet resultSet = preparedStatement.executeQuery()) {
            while (resultSet.next()) {
                Zhihu zhihu = new Zhihu(
                        resultSet.getString(2),   // title
                        resultSet.getString(3),   // content
                        resultSet.getString(4),   // concern (followers)
                        resultSet.getString(5),   // browsed (views)
                        resultSet.getString(6),   // answer count
                        resultSet.getString(7),   // answer1
                        resultSet.getString(8),   // like1
                        resultSet.getString(9),   // comment1
                        "", "", "", "", "", "", "", "", "", "", "", "");
                list.add(zhihu);
            }
        }
        return list;
    }

}

6、Zhihu 知乎實體

package com.dyw.crawler.model;

import java.sql.Date;

/**
 * Zhihu question entity: one crawled question together with up to five
 * answers (text, up-vote count, comment count for each) and the crawl time.
 *
 * <p>NOTE: keep the field declaration order stable — ExcelUtils builds the
 * exported Excel columns by reflecting over {@code getDeclaredFields()}, so
 * reordering fields reorders the columns.
 * Created by dyw on 2017/9/17.
 */
public class Zhihu {
    // question title
    private String title;
    // question body
    private String content;
    // follower count
    private String concern;
    // view count
    private String browsed;
    // answer count at crawl time
    private String answerCount;
    // answer 1: text / up-votes / comment count
    private String answer1;
    private String like1;
    private String comment1;
    // answer 2
    private String answer2;
    private String like2;
    private String comment2;
    // answer 3
    private String answer3;
    private String like3;
    private String comment3;
    // answer 4
    private String answer4;
    private String like4;
    private String comment4;
    // answer 5
    private String answer5;
    private String like5;
    private String comment5;
    // crawl timestamp (set separately; not a constructor argument)
    private Date crawler_date;

    /** No-argument constructor. */
    public Zhihu() {
    }

    /**
     * Builds a fully populated entity; {@code crawler_date} stays null until
     * set explicitly.
     */
    public Zhihu(String title, String content, String concern, String browsed, String answerCount, String answer1, String like1, String comment1, String answer2, String like2, String comment2, String answer3, String like3, String comment3, String answer4, String like4, String comment4, String answer5, String like5, String comment5) {
        this.title = title;
        this.content = content;
        this.concern = concern;
        this.browsed = browsed;
        this.answerCount = answerCount;
        this.answer1 = answer1;
        this.like1 = like1;
        this.comment1 = comment1;
        this.answer2 = answer2;
        this.like2 = like2;
        this.comment2 = comment2;
        this.answer3 = answer3;
        this.like3 = like3;
        this.comment3 = comment3;
        this.answer4 = answer4;
        this.like4 = like4;
        this.comment4 = comment4;
        this.answer5 = answer5;
        this.like5 = like5;
        this.comment5 = comment5;
    }

    // ---- accessors, in field declaration order ----

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getConcern() {
        return concern;
    }

    public void setConcern(String concern) {
        this.concern = concern;
    }

    public String getBrowsed() {
        return browsed;
    }

    public void setBrowsed(String browsed) {
        this.browsed = browsed;
    }

    public String getAnswerCount() {
        return answerCount;
    }

    public void setAnswerCount(String answerCount) {
        this.answerCount = answerCount;
    }

    public String getAnswer1() {
        return answer1;
    }

    public void setAnswer1(String answer1) {
        this.answer1 = answer1;
    }

    public String getLike1() {
        return like1;
    }

    public void setLike1(String like1) {
        this.like1 = like1;
    }

    public String getComment1() {
        return comment1;
    }

    public void setComment1(String comment1) {
        this.comment1 = comment1;
    }

    public String getAnswer2() {
        return answer2;
    }

    public void setAnswer2(String answer2) {
        this.answer2 = answer2;
    }

    public String getLike2() {
        return like2;
    }

    public void setLike2(String like2) {
        this.like2 = like2;
    }

    public String getComment2() {
        return comment2;
    }

    public void setComment2(String comment2) {
        this.comment2 = comment2;
    }

    public String getAnswer3() {
        return answer3;
    }

    public void setAnswer3(String answer3) {
        this.answer3 = answer3;
    }

    public String getLike3() {
        return like3;
    }

    public void setLike3(String like3) {
        this.like3 = like3;
    }

    public String getComment3() {
        return comment3;
    }

    public void setComment3(String comment3) {
        this.comment3 = comment3;
    }

    public String getAnswer4() {
        return answer4;
    }

    public void setAnswer4(String answer4) {
        this.answer4 = answer4;
    }

    public String getLike4() {
        return like4;
    }

    public void setLike4(String like4) {
        this.like4 = like4;
    }

    public String getComment4() {
        return comment4;
    }

    public void setComment4(String comment4) {
        this.comment4 = comment4;
    }

    public String getAnswer5() {
        return answer5;
    }

    public void setAnswer5(String answer5) {
        this.answer5 = answer5;
    }

    public String getLike5() {
        return like5;
    }

    public void setLike5(String like5) {
        this.like5 = like5;
    }

    public String getComment5() {
        return comment5;
    }

    public void setComment5(String comment5) {
        this.comment5 = comment5;
    }

    public Date getCrawler_date() {
        return crawler_date;
    }

    public void setCrawler_date(Date crawler_date) {
        this.crawler_date = crawler_date;
    }
}

7、結果

表結構:

這裡寫圖片描述

結果:

這裡寫圖片描述

這裡寫圖片描述

注:

在用httpclient進行模擬登入的時候,我遇到一個問題:獲取到的cookie並不是瀏覽器登入時獲取到的完整cookie,所以我就暫時先用瀏覽器中的cookie直接進行頁面爬取。因為時間有限,先沒有去處理這個問題,如果您知道如何修改,請告知一二,我在這裡先謝謝了。

還有其中我沒有獲取答案的 評論數,這個大家可以自己寫相應的正則,應該不是什麼問題。

還有就是答案只獲取了2個,表中我是設計了5個,剩下的3個可以先不用考慮。

如果對本文中方法有不瞭解的可以看之前的爬蟲記錄系列文章,有具體程式碼。

如果有什麼程式碼修改的建議,請給我留言唄! ☺☺☺

歡迎加入 扣扣群 371322638 一起學習研究爬蟲技術