程式人生 > 使用lxml的css選擇器用法爬取奇書網並保存到mongoDB中

使用lxml的css選擇器用法爬取奇書網並保存到mongoDB中

referer 最新 shu auth style ret bre last tail

import requests
from lxml import etree
from fake_useragent import UserAgent
import pymongo
class QiShuSpider(object):
    """Crawl novel metadata from www.qisuu.la (category ``sort01``) and
    insert one document per novel into the module-level MongoDB
    ``collection`` created in the ``__main__`` block below."""

    def __init__(self):
        # Entry page: first listing page of category sort01.
        self.base_url = "https://www.qisuu.la/soft/sort01/"
        self.headers = {
            "User-Agent": UserAgent().random,
            # BUG FIX: the original value contained an embedded newline
            # ("www.qisuu.la\n"), which is an invalid HTTP header value.
            "HOST": "www.qisuu.la",
            "Referer": "https://www.qisuu.la",
        }

    def get_index_code(self):
        """Fetch the index page and return the list of ``<option>``
        elements (one per paginated listing page).

        Returns an empty list after five failed connection attempts
        (the original fell through returning ``None``, which crashed
        the caller when it tried to iterate the result)."""
        retry_link_count = 0  # how many reconnect attempts have been made
        while True:
            try:
                response = requests.get(self.base_url, headers=self.headers)
            except Exception as e:
                print("連接奇書網失敗,原因是:", e)
                print("正在嘗試第{}次重連....".format(retry_link_count))
                retry_link_count += 1
                if retry_link_count >= 5:
                    print("嘗試連接次數已經達到五次,停止連接")
                    return []
            else:
                html_obj = etree.HTML(response.text)
                # Every paging link lives in a <select><option value="..."> tag.
                option_list = html_obj.cssselect("select>option")
                return option_list

    def get_every_page_code(self):
        """Walk every listing page and crawl each novel linked from it."""
        option_list = self.get_index_code()
        for option in option_list:
            value = option.get("value")
            # ``value`` is a site-relative path; build the absolute URL.
            base_url = "https://www.qisuu.la" + value
            print("正在爬取{}鏈接".format(base_url))
            response = requests.get(base_url, headers=self.headers).text
            html_obj = etree.HTML(response)
            # One <a> per novel inside the list box.
            a_list = html_obj.cssselect(".listBox li>a")
            for a in a_list:
                novel_href = a.get("href")
                # Detail-page hrefs are also site-relative.
                novel_url = "https://www.qisuu.la" + novel_href
                print("正在爬取鏈接為{}的小說".format(novel_url))
                self.parse_every_novel(novel_url)

    def parse_every_novel(self, novel_url):
        """Parse a single novel detail page and insert its metadata into
        MongoDB via the module-level ``collection``."""
        response = requests.get(novel_url, headers=self.headers)
        # Force UTF-8 so the Chinese text decodes correctly regardless of
        # what charset the server advertises.
        response.encoding = "utf-8"
        html_obj = etree.HTML(response.text)
        novel_name = html_obj.cssselect(".detail_right>h1")[0].text
        # The eight <li> items hold, in page order: clicks, file size,
        # type, update time, status, author, runtime environment; the
        # 8th wraps the latest-chapter title in an <a>.
        click_num = html_obj.cssselect(".detail_right>ul>li:nth-child(1)")[0].text
        novel_size = html_obj.cssselect(".detail_right>ul>li:nth-child(2)")[0].text
        novel_type = html_obj.cssselect(".detail_right>ul>li:nth-child(3)")[0].text
        update_time = html_obj.cssselect(".detail_right>ul>li:nth-child(4)")[0].text
        novel_status = html_obj.cssselect(".detail_right>ul>li:nth-child(5)")[0].text
        novel_author = html_obj.cssselect(".detail_right>ul>li:nth-child(6)")[0].text
        novel_run_envir = html_obj.cssselect(".detail_right>ul>li:nth-child(7)")[0].text
        novel_lasted_chapter = html_obj.cssselect(".detail_right>ul>li:nth-child(8)>a")[0].text
        dict_novel = {
            "小說名稱": novel_name,
            "點擊次數": click_num,
            "小說大小": novel_size,
            "小說類型": novel_type,
            "更新時間": update_time,
            "小說狀態": novel_status,
            "小說作者": novel_author,
            "小說運行環境": novel_run_envir,
            "小說最新章節": novel_lasted_chapter,
        }
        # ``collection`` is the module-level handle created in __main__.
        collection.insert_one(dict_novel)

    def start_spider(self):
        """Public entry point: start the full crawl."""
        self.get_every_page_code()


if __name__ == "__main__":
    # BUG FIX: the original compared against the bare name ``__main__``
    # (a NameError at import time); it must be the string "__main__".
    client = pymongo.MongoClient(host="localhost", port=27017)
    db = client.novel
    collection = db.novel
    spider = QiShuSpider()
    spider.start_spider()

使用lxml的css選擇器用法爬取奇書網並保存到mongoDB中