1. 程式人生 > 使用Python寫的一個爬蟲【任務佇列版本】

使用Python寫的一個爬蟲【任務佇列版本】

#!/usr/bin/python
# -*- coding: UTF-8 -*-

import urllib
from pyquery import PyQuery as pq
import codecs
import Queue



class Fetcher:
    """Queue-driven crawler that walks an index page and saves linked pages.

    Every queue entry is a ``(url, kind)`` tuple. ``kind == 0`` marks the
    index page, whose links are harvested and enqueued with ``kind == 1``;
    any other kind is treated as a content page whose ``#BookText`` element
    is appended to ``aa.txt``.
    """

    def __init__(self):
        """Create the task queue and seed it with the index page."""
        self.q = Queue.Queue()
        self.q.put(("http://www.7dsw.com/toplastupdate/1.html", 0))

    def work(self,):
        """Drain the task queue, dispatching each fetched page by its kind."""
        while not self.q.empty():
            url, tp = self.q.get()
            page = self.getPage(url)
            if tp == 0:
                self.getCapUrl(page)
            else:
                self.getContent(page)

    def getPage(self, url):
        """Download ``url`` and return its body decoded from GBK.

        Errors from ``urllib.urlopen`` and ``UnicodeDecodeError`` from the
        strict GBK decode are not caught here and propagate to the caller.
        """
        # Single-argument parenthesised print emits the same text as the
        # print statement, and keeps the line valid on newer interpreters.
        print('fetch page...')
        resp = urllib.urlopen(url)
        try:
            page = resp.read()
        finally:
            # Release the connection even when the read fails.
            resp.close()
        return page.decode('gbk')

    def getCapUrl(self, page):
        """Enqueue the links found at odd positions of ``#newscontent ul a``.

        Only anchors 1, 3, 5, ... are taken, matching the original stepping.
        NOTE(review): presumably the anchors alternate between a book link
        and a chapter link -- confirm against the page markup.
        """
        doc = pq(page)
        wanted = doc('#newscontent ul a')
        # A stepped range yields nothing when fewer than two anchors match,
        # so an empty or short listing no longer raises IndexError.
        for i in range(1, len(wanted), 2):
            u = wanted.eq(i).attr("href")
            if not u:
                # An anchor without an href cannot be fetched later; skip it
                # instead of queueing an unusable task.
                continue
            print(u)  # the chapter link we got
            self.q.put((u, 1))

    def saveFile(self, filename, data):
        """Append ``data`` and a separator line to ``filename`` as UTF-8."""
        # The context manager closes the file even if a write raises.
        with codecs.open(filename, 'a', 'utf-8') as fp:
            fp.write(data)
            fp.write("\r\n------------------------\r\n")

    def getContent(self, page):
        """Extract the text of ``#BookText`` from ``page`` and append it to aa.txt."""
        doc = pq(page)
        wanted = doc('#BookText')
        self.saveFile("aa.txt", wanted.text())


# Script entry: start crawling as soon as the file is executed.
f = Fetcher()
f.work()