1. 程式人生 > >python爬蟲-通過bs4和xpath分析html程式碼

python爬蟲-通過bs4和xpath分析html程式碼

我覺得原作者用 xpath 分析程式碼的寫法不夠理想,下面是我重新改善後的版本。

一、用lxml模組分析程式碼

#!/usr/bin/env python
#-*- coding:utf-8 -*-

import requests
import time,os
from lxml import etree

def get_Page(url,headers):
    """Fetch *url* with the given request headers.

    Returns the response body as text when the server answers with
    HTTP 200, otherwise ``None``.
    """
    resp = requests.get(url, headers=headers)
    return resp.text if resp.status_code == 200 else None

def parse_Page(html,headers):
    """Extract captcha images from the page HTML and save them to ``qcode/``.

    Every ``div`` with class ``captcha_images_left`` or
    ``captcha_images_right`` is scanned; each ``h3`` title is paired with the
    image ``src`` at the same position and the image is saved as
    ``<title>.jpg``.

    Args:
        html: Page source as text. ``None`` or an empty string (for example
            when the page fetch failed) makes the function return at once.
        headers: Request headers sent with every image download.

    Side effects:
        Creates the ``qcode`` directory if it is missing and changes the
        process working directory into it; prints a progress line per image.
    """
    if not html:
        # Nothing to parse; avoid handing None to the HTML parser.
        return

    html_lxml = etree.HTML(html)

    # "|" in XPath selects the union of several paths.
    datas = html_lxml.xpath('.//div[@class="captcha_images_left"]|.//div[@class="captcha_images_right"]')

    # Create the folder that stores the captcha images and work inside it.
    file = 'qcode'
    os.makedirs(file, exist_ok=True)
    os.chdir(file)

    # First pass: collect every (file name -> image URL) pair.
    item = {}
    for data in datas:
        names = data.xpath('.//h3')           # captcha titles (list of elements)
        srcs = data.xpath('.//div/img/@src')  # captcha image paths (list of str)
        # zip() stops at the shorter list, so a missing image cannot raise
        # IndexError.
        for h3, src in zip(names, srcs):
            if not h3.text:
                # An <h3> without direct text cannot supply a file name.
                continue
            item[h3.text + '.jpg'] = 'https://captcha.com/' + src

    # Second pass: download each image exactly once. Doing this inside the
    # loop above would re-fetch all earlier images for every later div.
    count = 0
    for imgname, imgurl in item.items():
        response = requests.get(imgurl, headers=headers)
        if response.status_code != 200:
            continue
        with open(imgname, 'wb') as f:
            f.write(response.content)
        count += 1
        print('儲存第{}張驗證碼成功'.format(count))
        time.sleep(1)  # be polite to the server between downloads


def main():
    """Fetch the captcha examples page and save the images found on it."""
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'}
    url = 'https://captcha.com/captcha-examples.html?cst=corg'
    # The same headers are used for the page request and the image requests.
    parse_Page(get_Page(url, headers), headers)


if __name__ == '__main__':
    main()

二、用bs4模組分析程式碼

#!/usr/bin/env python
#-*- coding:utf-8 -*-

import requests
import time,os
from urllib.request import urlretrieve
from bs4 import BeautifulSoup

def get_Page(url,headers):
    """Download *url* and return its text, or ``None`` unless the status is 200."""
    reply = requests.get(url, headers=headers)
    if reply.status_code != 200:
        return None
    return reply.text

def parse_Page(html, headers=None):
    """Parse the captcha page with BeautifulSoup and save the images to ``qcode/``.

    Args:
        html: Page source as text. ``None`` (for example when the page fetch
            failed) makes the function return without doing anything.
        headers: Optional request headers for the image downloads. When
            omitted, a module-level ``headers`` variable is used if one
            exists, which is what this function relied on before the
            parameter was added.

    Side effects:
        Creates the ``qcode`` directory if it is missing and changes the
        process working directory into it.
    """
    if html is None:
        return
    if headers is None:
        # Backward compatibility: existing callers pass only `html` and
        # define `headers` at module level.
        headers = globals().get('headers')

    soup = BeautifulSoup(html.encode(), 'html.parser', from_encoding='utf-8')
    data_left = soup.select('#main .captcha_images_left')
    data_right = soup.select('#main .captcha_images_right')

    # Create the folder that stores the captcha images and work inside it.
    file = 'qcode'
    os.makedirs(file, exist_ok=True)
    os.chdir(file)

    # Visit every left block, then every right block. Zipping the two lists
    # would silently drop blocks whenever the columns differ in length.
    data = {}
    for block in list(data_left) + list(data_right):
        names = block.select('h3')    # all <h3> tags in this block
        images = block.select('img')  # all <img> tags in this block
        for tag_h, tag_img in zip(names, images):
            data[str(tag_h.text).strip() + '.jpg'] = 'https://captcha.com/' + tag_img['src']

    for imgname, imgurl in data.items():
        response = requests.get(imgurl, headers=headers)
        if response.status_code == 200:
            # Save the bytes already fetched instead of downloading the same
            # URL a second time without the request headers.
            with open(imgname, 'wb') as f:
                f.write(response.content)
            time.sleep(1)  # be polite to the server between downloads

if __name__ == '__main__':
    # Script entry point: fetch the captcha examples page, then parse it and
    # save the images.
    url = 'https://captcha.com/captcha-examples.html?cst=corg'
    # NOTE: parse_Page is not passed these headers below; it picks up this
    # module-level `headers` name for its image requests, so do not rename it.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'}
    html = get_Page(url, headers)
    parse_Page(html)

改善多執行緒爬蟲,以下是python3的程式碼

#!/usr/bin/env python
#-*- coding:utf-8 -*-
# author:Mr Yang

import requests
import time,os
from urllib.request import urlretrieve
from bs4 import BeautifulSoup
import threading, queue

def get_Page(url,headers):
    """Request *url* with *headers*; return the body text on HTTP 200, else ``None``."""
    page = requests.get(url, headers=headers)
    succeeded = page.status_code == 200
    return page.text if succeeded else None

def parse_Page(html,urlQueue):
    """Parse the captcha page and enqueue one download job per image.

    Each job put on *urlQueue* is a single-entry dict mapping the target
    file name (``<title>.jpg``) to the absolute image URL.

    Args:
        html: Page source as text. ``None`` (for example when the page fetch
            failed) makes the function return without enqueuing anything.
        urlQueue: Queue that receives the download jobs.

    Side effects:
        Creates the ``qcode`` directory if it is missing and changes the
        process working directory into it.
    """
    if html is None:
        return

    soup = BeautifulSoup(html.encode(), 'html.parser', from_encoding='utf-8')
    data_left = soup.select('#main .captcha_images_left')
    data_right = soup.select('#main .captcha_images_right')

    # Create the folder that stores the captcha images and work inside it.
    file = 'qcode'
    os.makedirs(file, exist_ok=True)
    os.chdir(file)

    # Visit every left block, then every right block. Zipping the two lists
    # would silently drop blocks whenever the columns differ in length.
    for block in list(data_left) + list(data_right):
        names = block.select('h3')    # all <h3> tags in this block
        images = block.select('img')  # all <img> tags in this block
        for tag_h, tag_img in zip(names, images):
            urlQueue.put({str(tag_h.text).strip() + '.jpg':'https://captcha.com/' + tag_img['src']})

def dowloadimg(urlQueue,headers):
    """Worker loop: take download jobs off *urlQueue* until it is empty.

    Args:
        urlQueue: Queue of dicts mapping a target file name to an image URL.
        headers: Request headers sent with every image download.

    Each image that answers with HTTP 200 is written to its file name in the
    current working directory. The function returns once the queue has no
    more jobs.
    """
    while True:
        try:
            data = urlQueue.get_nowait()  # non-blocking read
        except queue.Empty:
            # No jobs left: this worker is done.
            return

        for imgname, imgurl in data.items():
            response = requests.get(imgurl, headers=headers)
            if response.status_code == 200:
                # Save the bytes already fetched instead of downloading the
                # same URL a second time without the request headers.
                with open(imgname, 'wb') as f:
                    f.write(response.content)
                time.sleep(1)  # be polite to the server between downloads

if __name__ == '__main__':
    # Script entry point: fill the job queue from the page, then let a fixed
    # number of worker threads drain it.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'}
    url = 'https://captcha.com/captcha-examples.html?cst=corg'

    urlQueue = queue.Queue()
    parse_Page(get_Page(url, headers), urlQueue)

    threadNum = 7
    for _ in range(threadNum):
        worker = threading.Thread(target=dowloadimg, args=(urlQueue, headers))
        worker.start()