1. 程式人生 > 爬取小說網站整站小說內容 -《狗嗨默示錄》-

爬取小說網站整站小說內容 -《狗嗨默示錄》-

exception chap color row con print 動漫 pri value

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import urllib.request
import re
import MySQLdb
import socket


# Site root; the crawler prefixes it to the relative links it scrapes.
domain = "http://www.quanshuwang.com"

# Request headers sent with every page fetch (a desktop Chrome User-Agent).
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
    ),
}

# Timeout, in seconds, applied to every HTTP request.
REQUEST_TIMEOUT = 30

# Category id (the number in the /map/<id>.html URL) -> category name that is
# stored alongside each novel.
SORT_NAMES = {
    1: "玄幻魔法",
    2: "武俠修真",
    3: "歷史軍事",
    4: "女頻言情",
    5: "偵探推理",
    6: "網絡動漫",
    7: "科幻小說",
    8: "恐怖靈異",
    9: "美文同人",
}

# Patterns are compiled once here instead of on every call.
_TYPE_LINK_RE = re.compile(r'<a href="(/book/.+?)" target="_blank">(.+?)</a>')
_CHAPTER_LINK_RE = re.compile(r'<li><a href="(.+?)" title="(.+?)">(.+?)</a></li>')
_CONTENT_RE = re.compile(
    r'style5\(\);</script>(.*?)<script type="text/javascript">style6\(\)',
    re.S,  # the captured chapter body spans multiple lines
)


def _fetch_html(url, errors="strict"):
    """Download *url* and return its body decoded as GBK.

    Args:
        url: Absolute URL to request.
        errors: Error handler passed to ``bytes.decode``.

    Returns:
        The decoded page source.
    """
    # Pass the headers through the constructor rather than assigning
    # ``req.headers``, so the shared module-level dict is never aliased.
    req = urllib.request.Request(url, headers=headers)
    # ``with`` closes the response even if reading or decoding fails.
    with urllib.request.urlopen(req, timeout=REQUEST_TIMEOUT) as res:
        return res.read().decode("gbk", errors)


# Fetch the category list.
def getTypeList(type):
    """Return the novel links found on one category map page.

    Args:
        type: Category id substituted into ``/map/<id>.html``. (The name
            shadows the builtin but is kept for caller compatibility.)

    Returns:
        A list of ``(href, link_text)`` tuples, one per matched
        ``/book/...`` anchor.
    """
    html = _fetch_html(domain + "/map/%s.html" % type)
    return _TYPE_LINK_RE.findall(html)


def getNovelList(href):
    """Return the chapter links found on a novel's index page.

    Args:
        href: Site-relative path of the index page; appended to ``domain``.

    Returns:
        A list of ``(href, title_attribute, link_text)`` tuples.
    """
    html = _fetch_html(domain + href)
    return _CHAPTER_LINK_RE.findall(html)


def getNovelContent(url):
    """Return the chapter body extracted from one chapter page.

    Args:
        url: Site-relative path of the chapter page; appended to ``domain``.

    Returns:
        The text between the page's ``style5();`` and ``style6()`` script
        markers.

    Raises:
        IndexError: If the markers are not found in the page.
    """
    page_url = domain + url
    # Undecodable bytes are dropped rather than failing the whole chapter.
    html = _fetch_html(page_url, errors="ignore")
    print(page_url)
    match = _CONTENT_RE.search(html)
    if match is None:
        # Same exception type the original ``findall(...)[0]`` produced.
        raise IndexError("chapter body not found in %s" % page_url)
    return match.group(1)


class Sql(object):
    """Insert helpers for the ``novel`` and ``chapter`` tables."""

    # The connection is opened when this class body executes and is shared by
    # every instance.
    # NOTE(review): port, user and password were redacted ("x") in the
    # published source. 3306 is MySQL's default port -- confirm and replace
    # the placeholder credentials before running.
    conn = MySQLdb.connect(
        host="localhost",
        port=3306,
        user="x",
        password="x",
        db="novel",
        charset="utf8",
    )

    def addnovels(self, sort, novelname):
        """Insert a novel row and return its auto-generated row id.

        Args:
            sort: Category name.
            novelname: Novel title.

        Returns:
            The cursor's ``lastrowid`` after the insert.
        """
        cur = self.conn.cursor()
        try:
            # Parameterized: the driver escapes the values, so quotes in
            # scraped text can neither break nor inject into the statement.
            cur.execute(
                "insert into novel(sort,novelname) values(%s,%s)",
                (sort, novelname),
            )
            lastrowid = cur.lastrowid
        finally:
            cur.close()
        self.conn.commit()
        return lastrowid

    def addchapters(self, novelid, chaptername, content):
        """Insert one chapter row belonging to *novelid*.

        Args:
            novelid: Row id of the owning novel.
            chaptername: Chapter title.
            content: Chapter body as scraped from the page.
        """
        cur = self.conn.cursor()
        try:
            cur.execute(
                "insert into chapter(novelid,chaptername,content) values(%s,%s,%s)",
                (novelid, chaptername, content),
            )
        finally:
            cur.close()
        self.conn.commit()


mysql = Sql()


def main():
    """Crawl every category and store each novel and its chapters."""
    # sorted() keeps the 1..9 visiting order regardless of dict ordering.
    for type_id, sort in sorted(SORT_NAMES.items()):
        for href, novelname in getTypeList(type_id):
            lastrowid = mysql.addnovels(sort, novelname)
            # The link text (third field) is used as the chapter title; the
            # title attribute is not needed.
            for url, _title_attr, title in getNovelList(href):
                try:
                    print("正在爬取------------%s 《%s》 %s" % (sort, novelname, title))
                    # Chapter pages sit next to the novel's index.html.
                    content = getNovelContent(href.replace("index.html", url))
                    mysql.addchapters(
                        novelid=lastrowid, chaptername=title, content=content
                    )
                except Exception as e:
                    # Best-effort crawl: report the failure and move on to
                    # the next chapter.
                    print("連接中斷,發生錯誤:%s !!!!" % e)


if __name__ == "__main__":
    main()

爬取小說網站整站小說內容 -《狗嗨默示錄》-