程式人生 > python爬取小說(三)資料儲存

python爬取小說(三)資料儲存

由於時間關係,我們先把每章的內容儲存到資料庫。
需要用到sqlite
接著上一篇,在原基礎上修改程式碼如下:

# -*- coding: utf-8 -*-
import urllib.request
import bs4
import re
import sqlite3
import time
print ('連線資料庫……')
cx = sqlite3.connect('PaChong.db')
# #在該資料庫下建立表
# 建立書籍基本資訊表
cx.execute('''CREATE TABLE book_info(
       id INTEGER PRIMARY KEY   AUTOINCREMENT,
       title             verchar(128)     not null,
       img             verchar(512)      null,
       auther         verchar(64)      null,
       type             verchar(128)      null,
       status             verchar(64)      null,
       num             int      null,
       updatatime             datetime      null,
       newchapter             verchar(512)      null,
       authsummery             verchar(1024)      null,
       summery            verchar(1024)      null,
       notipurl             verchar(512)      null);
       '''
) # 建立章節內容表 cx.execute('''CREATE TABLE book_chapter( id INTEGER PRIMARY KEY AUTOINCREMENT, book_id int null , chapter_no int null , chapter_name verchar(128) null, chapter_url verchar(512) null, chapter_content text null); '''
) print("Table created successfully") print("資料庫連線完成") def getHtml(url): user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36" headers = {"User-Agent":user_agent} request = urllib.request.Request(url,headers=headers) response = urllib.request.urlopen(request) html = response.read() return
html # 爬取整個網頁 def parse(url): html_doc = getHtml(url) sp = bs4.BeautifulSoup(html_doc, 'html.parser', from_encoding="utf-8") return sp # 爬取書籍基本資訊 def get_book_baseinfo(url): # class = "info"資訊獲取 info = parse(url).find('div',class_ = 'info') book_info = {} if info: # print(info) book_info['title'] = '' book_info['img'] = '' # 標題 book_info['title'] = info.find('h2').string # book_info['title'] = title # 圖片連結 img = info.find('div',class_ = 'cover') for im in img.children: # 圖片地址想要訪問,顯然需要拼接 book_info['img'] = 'http://www.biqukan.com' + im.attrs['src'] # 基本資訊儲存 ifo = info.find('div',class_ = 'small') bkinfo = [] for b in ifo: for v in b.children: t = v.string if t: bkinfo.append(''.join(t)) # 將:後面的資訊連起來 spv = [] cv = '' for v in bkinfo: if v.find(':') >= 0: if cv: spv.append(cv) cv = v else: cv += v spv.append(cv) # 基本資訊轉成字典 for element in spv: its = [v.strip() for v in element.split(':')] if len(its) != 2: continue nm = its[0].lower() # 統一成小寫 if type(nm).__name__ == 'unicode': nm = nm.encode('utf-8') vu = its[1] book_info[nm] = vu # 發現這裡獲取到的字典鍵與後面將要獲取的鍵重複了,所以這裡改一下 book_info['auther'] = book_info.pop('作者') #簡介獲取(與基本資訊的獲取方式一致) intro = info.find('div',class_ = 'intro') bkurl = [] for b in intro: t = b.string if t: bkurl.append(''.join(t)) bkjj = [] cvx = '' for w in bkurl: if w.find(':') >= 0: if cvx: bkjj.append(cvx) cvx = w else: cvx += w bkjj.append(cvx) for ele in bkjj: itis = [n.strip() for n in ele.split(':')] if len(itis) != 2: continue summ = itis[0].lower() # 統一成小寫 if type(summ).__name__ == 'unicode': summ = summ.encode('utf-8') vux = itis[1] book_info[summ] = vux # 由於我們後面建立的資料表字段使用英文,為方便起見,這裡用字典名對映轉換 # 英文 # book_en = ["title", "img", "type", "status","num", "updatatime", "newchapter", "auther","summery", "authsummery","notipurl"] # 中文 # book_cn = ["書名", "圖片連結", "分類", "狀態", "字數", "更新時間", "最新章節", "作者", "簡介", "作者介紹","無彈窗推薦地址"] # 將字典的key與資料庫中的欄位對應,這裡用book_dict列表儲存 # book_dict = dict(zip(book_cn,book_en)) # 使用笨辦法將字典的key轉成英文狀態,這樣方便資料庫儲存 book_info['type'] = 
book_info.pop('分類') book_info['status'] = book_info.pop('狀態') book_info['num'] = book_info.pop('字數') book_info['updatatime'] = book_info.pop('更新時間') book_info['newchapter'] = book_info.pop('最新章節') book_info['authsummery'] = book_info.pop('作者') book_info['summery'] = book_info.pop('簡介') book_info['notipurl'] = book_info.pop('無彈窗推薦地址') return book_info # 獲取書籍目錄 def get_book_dir(url): books_dir = [] name = parse(url).find('div', class_='listmain') if name: dd_items = name.find('dl') dt_num = 0 for n in dd_items.children: ename = str(n.name).strip() if ename == 'dt': dt_num += 1 if ename != 'dd': continue Catalog_info = {} if dt_num == 2: durls = n.find_all('a')[0] Catalog_info['chapter_name'] = (durls.get_text()) Catalog_info['chapter_url'] = 'http://www.biqukan.com' + durls.get('href') books_dir.append(Catalog_info) # print(books_dir) return books_dir # 獲取章節內容 def get_charpter_text(curl): text = parse(curl).find('div', class_='showtxt') if text: cont = text.get_text() cont = [str(cont).strip().replace('\r \xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0', '').replace('\u3000\u3000', '')] c = " ".join(cont) ctext = ' '.join(re.findall(r'^.*?html', c)) return ctext else: return '' #資料儲存 def SqlExec(conn,sql): try: cur = conn.cursor() cur.execute(sql) conn.commit() except Exception as e: print('exec sql error[%s]' % sql) print(Exception, e) cur = None return cur # 獲取書籍章節內容 併入庫 def get_book(burl): # 目錄 book = get_book_dir(burl) if not book: print('獲取資料目錄失敗:', burl) return book for d in book: curl = d['chapter_url'] try: ctext = get_charpter_text(curl) d['chapter_content'] = ctext sql = 'insert into book_chapter(' + 'book_id' + ',' + 'chapter_no' + ','+ ','.join(d.keys()) + ')' i = 1 sql += " values('" + str(i) + "'" + "," + "'" + str(i) + "'" + "," + "'" + "','".join(d.values()) + "');" # 呼叫資料庫函式 if SqlExec(cx, sql): print('正在插入...【{}】'.format(d['chapter_name'])) else: print(sql) except Exception as err: d['chapter_content'] = 'get failed' return book # 書籍基本資訊入庫 def 
insert_baseinfo(burl): baseinfo = get_book_baseinfo(burl) if not baseinfo: print("獲取基本資訊失敗") return baseinfo sql = 'insert into book_info(' + ','.join(baseinfo.keys()) + ')' sql += " values('" + "','".join(baseinfo.values()) + "');" # 呼叫資料庫函式 if SqlExec(cx, sql): print('正在插入...書籍【{}】'.format(baseinfo['title'])) else: print(sql) if __name__ == '__main__': # 一本書的所有章節爬完之後,才會爬取下一本書的內容 url = 'http://www.biqukan.com/1_1094/' insert_baseinfo(url) get_book(url)

結果展示:
這裡寫圖片描述

資料庫展示:
這裡寫圖片描述

這裡寫圖片描述

如果你想爬取多本書的資訊和內容,那就組裝一下url

# Crawl several books in sequence: one book's chapters are fully stored
# before the next book is started.
if __name__ == '__main__':
    for i in range(1090,1100):
        # Book index pages follow the pattern /1_<id>/ on biqukan.com.
        url = 'http://www.biqukan.com/1_' + str(i) + '/'
        insert_baseinfo(url)
        get_book(url)

結果如下:
這裡寫圖片描述

下一篇會將資料在前端展示。