
Simple Python 3 crawler notes: using CSS selectors

I studied writing crawlers in Python and, worried I would forget it all later, am posting the code here as a simple record.
If I dig deeper later, I will flesh out what these programs can do.
CSS selectors require the cssselect module (a pip install is enough) as well as the lxml module; a minimal usage sketch follows below.
You can also use selenium for simple browser emulation, or PyQt4/PySide to simulate browser actions,
and there is the Scrapy crawler framework.
Image processing may also come up: the PIL or Pillow module can solve simple captchas, and the cookielib module (http.cookiejar in Python 3) deals with cookies.
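
As a quick illustration of the CSS selector workflow mentioned above, here is a minimal sketch of lxml plus cssselect (the HTML snippet and selector are made up for the example):

import lxml.html  # needs both lxml and cssselect installed (pip install lxml cssselect)

html = '<div id="main"><ul><li class="item">A</li><li class="item">B</li></ul></div>'
tree = lxml.html.fromstring(html)
for li in tree.cssselect('div#main li.item'):  # CSS selector syntax
    print(li.text_content())  # prints A, then B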

Environment: Windows + Python 3.5.2

File: downloading.py

#!/usr/bin/env python
# -*- coding:utf-8 -*-
'''downloading.py

A module collecting the pieces needed to download static web pages.
It contains a downloader class (Downloader) and a rate-limiting class (Throttle).

Defaults:
    DEFAULT_AGENT = 'wswp'   -- user agent
    DEFAULT_DELAY = 5        -- delay of 5 s between requests
    DEFAULT_RETRIES = 1      -- number of download retries
    DEFAULT_TIMEOUT = 60     -- timeout limit
'''
from urllib import request, error, parse
from datetime import datetime
import random
import time
import socket

DEFAULT_AGENT = 'wswp'
DEFAULT_DELAY = 5
DEFAULT_RETRIES = 1
DEFAULT_TIMEOUT = 60


class Downloader(object):
    """Class that downloads pages.

    Attributes:
        self.throttle = Throttle(delay)
        self.user_agent = user_agent    -- user agent
        self.proxies = proxies          -- proxies to download through, default None
        self.num_retries = num_retries  -- number of download retries
        self.opener = opener            -- request opener, default None
        self.cache = cache              -- download cache, off by default (None);
                                           you must supply your own cache class

    Methods: download() and the special method __call__().
    If caching and throttling are not needed, call download() directly;
    calling the instance itself goes through __call__().
    """

    def __init__(self, delay=DEFAULT_DELAY, timeout=DEFAULT_TIMEOUT,
                 user_agent=DEFAULT_AGENT, num_retries=DEFAULT_RETRIES,
                 proxies=None, opener=None, cache=None):
        socket.setdefaulttimeout(timeout)
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.opener = opener
        self.cache = cache

    def __call__(self, url):
        """Special method invoked when the instance is called like a function.

        Takes a url, downloads it with the default parameters and returns the
        downloaded html as a bytes object. Implements the cache check before
        download and the throttling (5 s by default).
        """
        result = None
        if self.cache:  # check whether a cache was configured
            try:
                result = self.cache[url]
            except KeyError:
                # url is not available in cache
                pass
            else:
                # the url is cached; check whether the earlier download hit a
                # server error
                if self.num_retries > 0 and 500 <= result['code'] < 600:
                    # server error so ignore result from cache and re-download
                    result = None
        if result is None:
            # result was not loaded from cache so still need to download,
            # then add it to the cache
            self.throttle.wait(url)  # download delay, 5 s by default
            proxy = random.choice(self.proxies) if self.proxies else None
            headers = {'User-agent': self.user_agent}
            result = self.download(url, headers, proxy=proxy,
                                   num_retries=self.num_retries)
            if self.cache:
                # save result to cache
                self.cache[url] = result
        return result['html']

    def download(self, url, headers, num_retries, proxy, data=None):
        """Parameters: url, headers, num_retries, proxy, data=None

        Download the url; return a dict of the html and the HTTP status code:
        {'html': html, 'code': code}
        """
        print('Downloading:', url)
        requ = request.Request(url, data, headers or {})
        # the request opener supports proxies and GET/POST, downloads the page
        # and gives access to the HTTP status code
        opener = self.opener or request.build_opener()
        if proxy:
            proxy_params = {parse.urlparse(url).scheme: proxy}
            opener.add_handler(request.ProxyHandler(proxy_params))
        try:
            response = opener.open(requ)
            html = response.read()
            code = response.code
        except error.URLError as e:
            print('Download error:', str(e))
            html = ''
            if hasattr(e, 'code'):
                code = e.code
                if num_retries > 0 and 500 <= code < 600:
                    # retry 5XX HTTP errors
                    return self.download(url, headers, num_retries - 1,
                                         proxy, data)
            else:
                code = None
        return {'html': html, 'code': code}


class Throttle:
    """Throttle downloading by sleeping between requests to same domain

    Adds a delay between two downloads to the same domain.
    """

    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        """Delay if have accessed this domain recently
        """
        domain = parse.urlsplit(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()
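
Before moving on, a quick sketch of how the module above is meant to be used (example.com is just a placeholder URL, and the plain dict passed as cache is a hypothetical stand-in, since the module ships no cache class of its own):

from downloading import Downloader

D = Downloader(delay=5, num_retries=1)
html = D('http://example.com')  # __call__: throttled, cache-aware download
print(html[:100])

# the cache only needs __getitem__/__setitem__, so a plain dict serves as a
# minimal in-memory cache for testing
D2 = Downloader(cache={})
D2('http://example.com')  # downloaded, then stored in the dict
D2('http://example.com')  # served from the dict, no second request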

File: ajax_crawler.py


import lxml.html
import urllib
from downloading import Downloader


def main():
    D = Downloader()
    # note: there is an encoding problem here; see the sketch after main()
    parameters = {'fname': '\xc1\xa2\xd6\xbe\xbd\xf0\xb6\xee', 'lname': '呵呵'}
    data = urllib.parse.urlencode(parameters)
    print(type(data))

    # Downloader.download(self, url, headers, num_retries, proxy, data=None)
    htmldict = D.download(
        'http://www.w3school.com.cn/ajax/demo_post2.asp',
        {'User-agent': 'seagent'}, 1, None, data=data.encode('gb2312'))
    html = htmldict['html']
    print(html)
    # import chardet
    # print(chardet.detect(html))
    html = html.decode('gb2312')
    print(html)
    tree = lxml.html.fromstring(html)
    p = tree.cssselect('p')[0]  # first <p> element, selected with CSS syntax
    print(p.text_content())
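
One way around the encoding problem flagged in main(), a sketch I have not verified against this particular page: in Python 3, urllib.parse.urlencode takes an encoding parameter, so the form values can be written as ordinary strings and percent-encoded as GB2312 directly:

from urllib import parse

parameters = {'fname': '你好', 'lname': '呵呵'}  # hypothetical form values
data = parse.urlencode(parameters, encoding='gb2312')  # e.g. 'fname=%C4%E3%BA%C3&...'
body = data.encode('ascii')  # percent-encoded, so plain ASCII bytes for the POST body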


def main1():
    D = Downloader()
    # the AJAX endpoint, found by inspecting the page
    url = 'http://www.w3school.com.cn/tiy/loadtext.asp?f=ajax_async_false'
    html = D(url)
    print(html.decode('gb2312'))
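
The commented-out chardet lines in main() hint at a more robust way to deal with unknown page encodings than hard-coding 'gb2312'. A minimal sketch (chardet is a separate pip install; decode_html is a hypothetical helper, not part of the original code):

import chardet

def decode_html(raw):
    """Guess the charset of the raw bytes and decode accordingly."""
    guess = chardet.detect(raw)  # e.g. {'encoding': 'GB2312', 'confidence': 0.99, ...}
    return raw.decode(guess['encoding'] or 'utf-8', errors='replace')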



if __name__ == '__main__':
    main1()