學會用python網路爬蟲爬取鬥圖網的表情包,聊微信再也不怕鬥圖了
阿新 • • 發佈:2019-01-06
最近總是有人跟我鬥圖,想了想17年中旬時在網上看過一篇關於爬取鬥圖網表情包的py程式碼,但是剛想爬的時候發現網頁結構發生了變化,而且鬥圖網還插入了很多廣告,變化其實挺大的,所以臨時寫了一個爬蟲,簡單的爬取了鬥圖網的表情包。從這連結上看,page表示的是第幾頁,我只爬取了500多頁(很奇怪白天明明看到一共有一千多頁的,為啥晚上就只有548頁?),純屬娛樂,表情包夠用就行。
重點還是在於解析網頁,頁面上每一欄都是一組圖,這組圖有一個連結指向,所以我只要提取到這個連結,再開啟這個連結,然後在新的網頁上提取表情圖片,下載下來就行了。解析網頁使用了python的xpath,剩下的就是數學思維了,迴圈巢狀和判斷什麼的。
原始碼截圖如下(使用的是python3):
# coding=utf8
# NOTE(review): the original carried a Python 2 `reload(sys)` /
# `sys.setdefaultencoding('utf-8')` hack here. Under Python 3 (which this
# script targets) the default encoding is already utf-8, so the guard never
# fired — and `reload` is not a builtin in Python 3, so the block would have
# raised NameError had it ever run. It was dead/broken code and is removed.
import sys
import time
from urllib import request

import requests
from bs4 import BeautifulSoup
from lxml import etree
# Browser-like request headers so doutula.com serves the normal page markup.
headers = {
    "User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0",
    "Host": "www.doutula.com",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "close",
    "Upgrade-Insecure-Requests": "1",
}
def get_link(page):
    """Return the article URLs listed on page *page* of doutula's list view.

    Articles appear under two anchor class variants (promoted "tg-article"
    entries and regular ones); both are collected, promoted links first,
    matching the original ordering.
    """
    url = "http://www.doutula.com/article/list/?page=" + str(page)
    response = requests.get(url, headers=headers)
    selector = etree.HTML(response.text)
    # xpath() already returns plain lists of hrefs — concatenate them directly
    # instead of copying element by element.
    link_list = selector.xpath('//a[@class="list-group-item random_list tg-article"]/@href')
    link_list += selector.xpath('//a[@class="list-group-item random_list"]/@href')
    return link_list
j = 1  # global running counter used to number the saved image files


def get_img(link_list):
    """Download every expression image from each article page in *link_list*.

    Images are saved to F:\\image\\<j>.<ext> (directory must already exist —
    TODO confirm), where <j> is the global counter and <ext> is gif/png/jpg
    based on the URL suffix.

    Fixes over the original:
    - ``//table[%d]`` matches tables that are the i-th table child of *their
      parent*, not the i-th table in the document; ``(//table)[%d]`` is the
      correct document-positional form.
    - The collected URL list was accumulated across pages and the download
      loop ran inside the per-page loop, so earlier pages' images were
      re-downloaded for every later page; downloads now happen once per page.
    - Dropped a try/except around xpath() whose increment sat inside the try
      (an exception would have looped forever on the same index).
    """
    global j
    for url in link_list:
        response = requests.get(url, headers=headers)
        time.sleep(1)  # throttle: be polite to the server
        html = response.text
        selector = etree.HTML(html)
        # BeautifulSoup is used only to count the tables on the page.
        table_count = len(BeautifulSoup(html, "lxml").find_all("table"))
        for i in range(1, table_count + 1):
            for image_link in selector.xpath("(//table)[%d]/tbody[1]/tr[1]/td[1]/a/img/@src" % i):
                print(image_link)
                if image_link.endswith("gif"):
                    ext = "gif"
                elif image_link.endswith("png"):
                    ext = "png"
                else:
                    ext = "jpg"
                request.urlretrieve(image_link, 'F:\\image\\%s.%s' % (j, ext))
                time.sleep(1)  # throttle between downloads
                j += 1
# Crawl list pages 1 through 4; a failure on one page is reported and skipped
# so the remaining pages are still processed.
for page_number in range(1, 5):
    try:
        links = get_link(page_number)
        get_img(links)
    except Exception as error:
        print(str(error))
後面覺得直接使用數字名字的效果不太好,還是需要給圖片命個名字,這樣好搜尋自己需要什麼樣子的表情。
所以修改了原始碼。先上效果圖:
原始碼如下:
# coding=utf8
# NOTE(review): the original carried a Python 2 `reload(sys)` /
# `sys.setdefaultencoding('utf-8')` hack here. Under Python 3 (which this
# script targets) the default encoding is already utf-8, so the guard never
# fired — and `reload` is not a builtin in Python 3, so the block would have
# raised NameError had it ever run. It was dead/broken code and is removed.
import sys
import time
from urllib import request

import requests
from bs4 import BeautifulSoup
from lxml import etree
# Browser-like request headers so doutula.com serves the normal page markup.
headers = {
    "User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0",
    "Host": "www.doutula.com",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "close",
    "Upgrade-Insecure-Requests": "1",
}
def get_link(page):
    """Return the article URLs listed on page *page* of doutula's list view.

    Articles appear under two anchor class variants (promoted "tg-article"
    entries and regular ones); both are collected, promoted links first,
    matching the original ordering.
    """
    url = "http://www.doutula.com/article/list/?page=" + str(page)
    response = requests.get(url, headers=headers)
    selector = etree.HTML(response.text)
    # xpath() already returns plain lists of hrefs — concatenate them directly
    # instead of copying element by element.
    link_list = selector.xpath('//a[@class="list-group-item random_list tg-article"]/@href')
    link_list += selector.xpath('//a[@class="list-group-item random_list"]/@href')
    return link_list
j = 1  # global running counter, used in the progress message


def get_img(link_list):
    """Download every expression image from each article page in *link_list*,
    naming each file after the image's ``alt`` text.

    Images are saved to F:\\image\\<alt>.<ext> (directory must already exist —
    TODO confirm; alt text containing characters illegal in Windows filenames
    will still make urlretrieve fail — NOTE(review): consider sanitizing).

    Fixes over the original:
    - ``//table[%d]`` matches tables that are the i-th table child of *their
      parent*, not the i-th table in the document; ``(//table)[%d]`` is the
      correct document-positional form.
    - ``img_url.index(img)`` / ``img.index(image_link)`` return the *first*
      occurrence, so duplicate pages or duplicate image URLs were paired with
      the wrong alt names (and each lookup was O(n)). Pairing src with alt via
      ``zip`` per table removes both problems, and guards against src/alt
      lists of different lengths.
    - URL lists were accumulated across pages with the download loop inside
      the per-page loop, re-downloading earlier pages' images for every later
      page; downloads now happen once per page.
    """
    global j
    for url in link_list:
        response = requests.get(url, headers=headers)
        time.sleep(1)  # throttle: be polite to the server
        html = response.text
        selector = etree.HTML(html)
        # BeautifulSoup is used only to count the tables on the page.
        table_count = len(BeautifulSoup(html, "lxml").find_all("table"))
        for i in range(1, table_count + 1):
            srcs = selector.xpath("(//table)[%d]/tbody[1]/tr[1]/td[1]/a/img/@src" % i)
            alts = selector.xpath("(//table)[%d]/tbody[1]/tr[1]/td[1]/a/img/@alt" % i)
            for image_link, image_name in zip(srcs, alts):
                print("下載第%d張表情包:" % j + image_link)
                if image_link.endswith("gif"):
                    ext = "gif"
                elif image_link.endswith("png"):
                    ext = "png"
                else:
                    ext = "jpg"
                request.urlretrieve(image_link, 'F:\\image\\%s.%s' % (image_name, ext))
                time.sleep(1)  # throttle between downloads
                j += 1
# Crawl list pages 4 through 10; a failure on one page is reported and skipped
# so the remaining pages are still processed.
for page_number in range(4, 11):
    try:
        links = get_link(page_number)
        get_img(links)
    except Exception as error:
        print(str(error))