1. 程式人生 > >python 爬蟲例項(三)

python 爬蟲例項(三)

問題描述

爬取部落格園首頁(URL:https://home.cnblogs.com/blog/page/1/ )的資料,之後寫入自己的 Excel 檔案

 

環境:

OS:Windows 10

python:3.7

 

程式碼

import requests
import os
from bs4 import BeautifulSoup
import xlwt
import xlrd
from xlutils.copy import copy
import threading
import datetime

class BlogHome:
    """Scrape cnblogs home blog listing pages into an existing Excel workbook.

    Each listing page is downloaded with ``requests``, parsed with
    BeautifulSoup, and one worksheet row is written per ``post_block``
    element found on the page.
    """

    # Workbook that all_page() opens, fills in and saves back, and that
    # writeExcel() overwrites.
    EXCEL_PATH = r"C:\Users\peiqiang\Desktop\aaa.xls"
    # First worksheet row index passed to getdata().
    FIRST_DATA_ROW = 4
    # Seconds to wait on the HTTP request before requests raises a timeout,
    # so a stalled connection cannot hang the run forever.
    REQUEST_TIMEOUT = 10
    # (console label, CSS class) of the per-post fields written after the
    # user and title columns, in column order.
    _FIELD_CLASSES = (
        ("評論個數:", "post_comment"),
        ("読込個數:", "post_view"),
        ("発表日付:", "postdate"),
        ("詳細取得:", "entry_summary"),
    )

    def __init__(self):
        # URL template; "{}" is replaced by the page number.
        self.url = "https://home.cnblogs.com/blog/page/{}/"
        # Directory created by mkdir().
        self.path = r"C:\pythonProject\Blog"

    def _page_url(self, param):
        """Return the listing URL for page *param*."""
        return self.url.format(param)

    def request(self, param):
        """Download listing page *param* and return the response body as text.

        Raises whatever ``requests.get`` raises, including a timeout error
        after ``REQUEST_TIMEOUT`` seconds.
        """
        # Fetch the formatted URL; requesting self.url would send the raw
        # template (with a literal "{}") for every page.
        r = requests.get(self._page_url(param), timeout=self.REQUEST_TIMEOUT)
        return r.text

    def all_page(self, maxpage):
        """Scrape pages 1 .. maxpage - 1 and save the rows to ``EXCEL_PATH``.

        The workbook must already exist: it is opened with xlrd, copied into
        a writable workbook with ``xlutils.copy.copy``, filled from
        ``FIRST_DATA_ROW`` onwards on its first sheet, and saved back to the
        same path once all pages have been processed.
        """
        wbk = xlrd.open_workbook(self.EXCEL_PATH, formatting_info=True)
        wb_copy = copy(wbk)
        sheet = wb_copy.get_sheet(0)
        row = self.FIRST_DATA_ROW
        # NOTE: range() stops before maxpage, so page number maxpage itself
        # is not fetched.
        for page in range(1, maxpage):
            # "with" releases the module-level semaphore even when the fetch
            # or the parse raises.
            with thread_lock:
                row = self.getdata(self.request(page), sheet, row)

        wb_copy.save(self.EXCEL_PATH)
        print("書き込みました")

    def _emit(self, sheet, row, col, label, value):
        """Print *value* under *label* and write it to cell (row, col)."""
        print(label, value)
        sheet.write(row, col, value)

    def getdata(self, req, sheet, row):
        """Write one worksheet row per post found in the page text *req*.

        Columns 1-6 receive user, title, comment count, view count, post
        date and summary. Returns the index of the next unused row.

        A post that lacks one of the expected elements makes the attribute
        access on the missing element raise.
        """
        # NOTE(review): the page is parsed with the "xml" parser although it
        # is fetched as a web page - confirm this is intended.
        soup = BeautifulSoup(req, "xml")
        for post in soup.find_all(class_="post_block"):
            links = post.find(class_="entry_title").find_all("a")
            # First link holds the user name wrapped in square brackets.
            user = links[0].string.replace("[", "").replace("]", "")
            self._emit(sheet, row, 1, "user:", user)
            self._emit(sheet, row, 2, "title:", links[1].string)
            for col, (label, css_class) in enumerate(self._FIELD_CLASSES, start=3):
                self._emit(sheet, row, col, label, post.find(class_=css_class).string)
            row += 1
        return row

    def writeExcel(self, row, col, data):
        """Create a new workbook holding only *data* at (row, col) and save it.

        The file at ``EXCEL_PATH`` is overwritten with a workbook that has a
        single "Data" sheet containing just this one cell.
        """
        wbk = xlwt.Workbook()
        sheet = wbk.add_sheet("Data", cell_overwrite_ok=True)
        sheet.write(row, col, data)
        wbk.save(self.EXCEL_PATH)
        print("書き込みました")

    def mkdir(self):
        """Create the directory ``self.path`` if nothing exists at that path.

        Returns True when the directory was created, False when the path
        already existed.
        """
        path = self.path.strip()
        if os.path.exists(path):
            print(path, '資料夾已經存在了,不再建立')
            return False

        print('建立名字叫做', path, '的資料夾')
        os.makedirs(path)
        print('建立成功!')
        return True

    def getBlog(self):
        """Run ``all_page(10)`` and print the start time, end time and duration."""
        startTime = datetime.datetime.now()
        print("開始", startTime)
        self.all_page(10)
        endTime = datetime.datetime.now()
        print("実行時間:", (endTime - startTime).seconds)
        print("開始", startTime)
        print("終了", endTime)


# Module-level semaphore (initial value 10) that BlogHome.all_page acquires
# around each page it processes.
thread_lock = threading.BoundedSemaphore(10)

# Script entry: build the scraper and run the whole scrape.
blogHome = BlogHome()
blogHome.getBlog()

  執行上面的程式碼

Excel上面的資料