學會用python網路爬蟲爬取鬥圖網的表情包，聊微信再也不怕鬥圖了

學會用python網路爬蟲爬取鬥圖網的表情包,聊微信再也不怕鬥圖了

最近總是有人跟我鬥圖,想了想17年中旬時在網上看過一篇關於爬取鬥圖網表情包的py程式碼,但是剛想爬的時候發現網頁結構發生了變化,而且鬥圖網還插入了很多廣告,變化其實挺大的,所以臨時寫了一個爬蟲,簡單的爬取了鬥圖網的表情包。從這連結上看,page表示的是第幾頁,我只爬取了500多頁(很奇怪白天明明看到一共有一千多頁的,為啥晚上就只有548頁?),純屬娛樂,表情包夠用就行。
（原文此處為網頁截圖，文字版中已省略）
重點還是在於解析網頁,頁面上每一欄都是一組圖,這組圖有一個連結指向,所以我只要提取到這個連結,再開啟這個連結,然後在新的網頁上提取表情圖片,下載下來就行了。解析網頁使用了python的xpath,剩下的就是數學思維了,迴圈巢狀和判斷什麼的。
原始碼截圖如下(使用的是python3):

#coding=utf8
import sys
# Legacy Python 2 idiom for forcing the interpreter's default string encoding.
# Under Python 3, sys.getdefaultencoding() is always 'utf-8', so this branch
# never executes there (reload/setdefaultencoding no longer exist in Py3 and
# would raise NameError/AttributeError if ever reached) -- effectively dead code.
defaultencoding = 'utf-8'
if sys.getdefaultencoding() != defaultencoding:
    reload(sys)
    sys.setdefaultencoding(defaultencoding)
import requests
from lxml import etree
from urllib import request
from bs4 import BeautifulSoup
import time

# Browser-like request headers so doutula.com serves normal HTML pages.
# NOTE(review): the User-Agent string is missing its closing ')' -- kept
# byte-identical to the original since the site accepted it.
headers = {
    "User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0",
    "Host": "www.doutula.com",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "close",
    "Upgrade-Insecure-Requests": "1",
}


def get_link(page):
    """Fetch index page *page* and return the article-detail URLs on it.

    Two anchor classes are collected (the ad-tagged 'tg-article' entries
    first, then the plain entries), matching the site's list markup.
    """
    url = "http://www.doutula.com/article/list/?page=" + str(page)
    response = requests.get(url, headers=headers)
    selector = etree.HTML(response.text)
    link_list = []
    link_list.extend(selector.xpath('//a[@class="list-group-item random_list tg-article"]/@href'))
    link_list.extend(selector.xpath('//a[@class="list-group-item random_list"]/@href'))
    return link_list


j = 1  # global running number used to name the downloaded image files


def get_img(link_list):
    """Download every emoticon image from each detail page in *link_list*.

    Images are written to F:\\image\\<j>.<ext>, where the extension is
    guessed from the URL's last three characters (gif/png, else jpg).
    """
    global j
    img_url = []  # one list of image URLs per emoticon table
    for url in link_list:
        response = requests.get(url, headers=headers)
        time.sleep(1)  # throttle so the site is not hammered
        html = response.text
        selector = etree.HTML(html)
        soup = BeautifulSoup(html, "lxml")
        tb = soup.find_all("table")  # one <table> per emoticon group
        # XPath positions are 1-based.  A for-loop replaces the original
        # while/try, which only advanced the index inside the try and so
        # could spin forever if the xpath call ever raised.
        for i in range(1, len(tb) + 1):
            try:
                img_url.append(selector.xpath(
                    "//table[%d]/tbody[1]/tr[1]/td[1]/a/img/@src" % i))
            except Exception as e:
                print(str(e))
    for img in img_url:
        for image_link in img:
            print(image_link)
            ext = image_link[-3:]
            if ext == "gif":
                request.urlretrieve(image_link, 'F:\\image\\%s.gif' % str(j))
            elif ext == "png":
                request.urlretrieve(image_link, 'F:\\image\\%s.png' % str(j))
            else:
                request.urlretrieve(image_link, 'F:\\image\\%s.jpg' % str(j))
            time.sleep(1)
            j += 1


# Crawl index pages 1..4; report a failing page and continue with the next.
for page in range(1, 5):
    try:
        get_img(get_link(page))
    except Exception as e:
        print(str(e))

（原文此處為兩張執行結果截圖，文字版中已省略）
後面覺得直接使用數字名字的效果不太好,還是需要給圖片命個名字,這樣好搜尋自己需要什麼樣子的表情。
所以修改了原始碼。先上效果圖:
（原文此處為效果截圖，文字版中已省略）
原始碼如下:

#coding=utf8
import sys
# Legacy Python 2 idiom for forcing the interpreter's default string encoding.
# Under Python 3, sys.getdefaultencoding() is always 'utf-8', so this branch
# never executes there (reload/setdefaultencoding no longer exist in Py3 and
# would raise NameError/AttributeError if ever reached) -- effectively dead code.
defaultencoding = 'utf-8'
if sys.getdefaultencoding() != defaultencoding:
    reload(sys)
    sys.setdefaultencoding(defaultencoding)
import requests
from lxml import etree
from urllib import request
from bs4 import BeautifulSoup
import time

# Browser-like request headers sent with every crawl request so the site
# treats the scraper as an ordinary desktop visitor.
headers = {
    "User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0",
    "Host": "www.doutula.com",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "close",
    "Upgrade-Insecure-Requests": "1",
}

def get_link(page):
    """Fetch index page *page* and return the article-detail URLs on it.

    Two anchor classes are collected (the ad-tagged 'tg-article' entries
    first, then the plain entries), preserving the original ordering.
    """
    url = "http://www.doutula.com/article/list/?page=" + str(page)
    response = requests.get(url, headers=headers)
    selector = etree.HTML(response.text)
    link_list = []
    # list.extend replaces the original element-by-element append loops.
    link_list.extend(selector.xpath('//a[@class="list-group-item random_list tg-article"]/@href'))
    link_list.extend(selector.xpath('//a[@class="list-group-item random_list"]/@href'))
    return link_list
j = 1  # global counter of downloaded images, used in the progress message


def get_img(link_list):
    """Download every emoticon image from each detail page in *link_list*.

    Each image is saved to F:\\image\\ under its alt text; the extension
    (.gif/.png/.jpg) is guessed from the URL's last three characters.
    """
    global j
    img_url = []   # per-table lists of image URLs
    img_name = []  # parallel per-table lists of alt texts
    for url in link_list:
        response = requests.get(url, headers=headers)
        time.sleep(1)  # throttle so the site is not hammered
        html = response.text
        selector = etree.HTML(html)
        soup = BeautifulSoup(html, "lxml")
        tb = soup.find_all("table")  # one <table> per emoticon group
        # XPath positions are 1-based.  A for-loop replaces the original
        # while/try, which only advanced the index inside the try and so
        # could spin forever if the xpath call ever raised.
        for i in range(1, len(tb) + 1):
            try:
                img_url.append(selector.xpath(
                    "//table[%d]/tbody[1]/tr[1]/td[1]/a/img/@src" % i))
                img_name.append(selector.xpath(
                    "//table[%d]/tbody[1]/tr[1]/td[1]/a/img/@alt" % i))
            except Exception as e:
                print(str(e))
    # zip/enumerate replace the original list.index() lookups: index()
    # returns the FIRST match, so duplicate URL sublists (or duplicate
    # links within one sublist) were paired with the wrong alt text.
    for links, names in zip(img_url, img_name):
        for image_id, image_link in enumerate(links):
            print("下載第%d張表情包:" % j + image_link)
            # NOTE(review): alt text may contain characters that are not
            # valid in Windows file names -- confirm before relying on it.
            name = str(names[image_id])
            ext = image_link[-3:]
            if ext == "gif":
                request.urlretrieve(image_link, 'F:\\image\\%s.gif' % name)
            elif ext == "png":
                request.urlretrieve(image_link, 'F:\\image\\%s.png' % name)
            else:
                request.urlretrieve(image_link, 'F:\\image\\%s.jpg' % name)
            time.sleep(1)
            j += 1
# Crawl index pages 4 through 10; a failure on any single page is
# reported and the crawl simply moves on to the next one.
for page_number in range(4, 11):
    try:
        detail_links = get_link(page_number)
        get_img(detail_links)
    except Exception as err:
        print(str(err))