1. 程式人生 > >爬蟲,爬取句子迷《龍族》

爬蟲,爬取句子迷《龍族》

chrom ide win true res spa rom request file

踩了很多坑,主要是 Python 2 的編碼問題,以及對正則表達式不熟。

直接上腳本

# -*- coding: gbk -*-
import io
import re
import time
import urllib2



class spider(object):
    """Scrape the Longzu (龍族) quote pages on juzimi.com into a text file.

    Attributes:
        enable: True until ``do_work`` has finished a full run.
        page: Number of the next listing page to fetch.
        output_path: File that extracted sentences are appended to.
    """

    BASE_URL = "http://www.juzimi.com/article/113093?page="
    USER_AGENT = (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
    )
    # Raw string: the backslashes are literal path separators, not escapes.
    DEFAULT_OUTPUT_PATH = r"F:\py\longzu.txt"
    TOTAL_PAGES = 33
    DELAY_SECONDS = 5
    # Each sentence is the text of an anchor carrying class "xlistju".
    # re.S lets "." span the line breaks inside a sentence.
    _SENTENCE_RE = re.compile(r'class="xlistju">(.*?)</a>', re.S)

    def __init__(self, output_path=None):
        """Create a scraper.

        Args:
            output_path: Optional file to append sentences to. Defaults to
                ``DEFAULT_OUTPUT_PATH``.
        """
        self.enable = True
        self.page = 1
        self.output_path = (
            self.DEFAULT_OUTPUT_PATH if output_path is None else output_path
        )

    def load_page(self, page):
        """Download one listing page and return the sentences found on it.

        Args:
            page: Page number appended to ``BASE_URL``.

        Returns:
            A list of text fragments captured by the sentence pattern (they
            may still contain ``<br/>`` tags).
        """
        url = self.BASE_URL + str(page)
        headers = {"User-Agent": self.USER_AGENT}
        req = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(req)
        try:
            html = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()
        new_html = html.decode("utf-8")
        # Extract the sentences with the regular expression.
        return self._SENTENCE_RE.findall(new_html)

    def deal_one_page(self, item_list, page):
        """Strip ``<br>`` tags from each sentence and append it to the file.

        Args:
            item_list: Sentences as returned by ``load_page``.
            page: Page number, used only for the progress output.
        """
        print("第%d頁" % page)
        for item in item_list:
            item = item.replace("<br />", "").replace("<br/>", "")
            self.write_to_file(item)
        print("%d" % page)

    def write_to_file(self, txt):
        """Append ``txt`` plus a blank line to ``output_path`` as UTF-8.

        Args:
            txt: Text to write. Byte strings are decoded as UTF-8 first.
        """
        if isinstance(txt, bytes):
            txt = txt.decode("utf-8")
        # The context manager closes the file even if a write raises.
        with io.open(self.output_path, "a", encoding="utf-8") as f:
            f.write(txt)
            f.write(u"\n\n")

    def do_work(self):
        """Fetch ``TOTAL_PAGES`` pages starting at ``self.page``.

        Pauses ``DELAY_SECONDS`` after every page. Does nothing if a previous
        run has already completed (``self.enable`` is False).
        """
        if not self.enable:
            return
        for _ in range(self.TOTAL_PAGES):
            item_list = self.load_page(self.page)
            self.deal_one_page(item_list, self.page)
            time.sleep(self.DELAY_SECONDS)
            self.page += 1
        print("結束")
        self.enable = False


if __name__ == "__main__":
    longspider = spider()
    longspider.do_work()

爬蟲,爬取句子迷《龍族》