1. 程式人生 > >python_爬取【搜狗圖片】

python_爬取【搜狗圖片】

1.利用python抓取網站上的圖片,對於學習python及對網頁資料分析處理很有幫助,也可以學習一些web方面的知識,我嘗試使用【搜狗圖片】搜尋到的圖片作為抓取物件,抓取【搜狗圖片】主頁各個標題欄的圖片,以及【其他】輸入圖片型別的圖片,使用tkinter完成了一個簡單的UI介面。

2.一般抓取網頁圖片,需要先訪問頁面,然後提取原始碼,依次解析各個圖片URL,然後直接下載即可,這些網上的教程很多,在此不再贅述。但是對於一些圖片較多的頁面,往往使用動態載入的方式呈現圖片,也就是我們抓取的頁面原始碼中,並沒有各個圖片的URL,這就需要分析頁面結構,找到頁面圖片真正的URL資源地址,才能完成下載。

3.例如在【搜狗圖片】搜尋美女,然後點進圖片,查詢該圖片的URL地址:

【請求URL】:

【頁面原始碼】:找不到對應的圖片URL。

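發現【標題類】的圖片都集中在如下URL:http://pic.sogou.com/pics/channel/getAllRecomPicByTag.jsp?category=<分類名稱>&tag=%E5%85%A8%E9%83%A8&start=0&len=<下載個數>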

而通過【搜尋】得到的圖片URL集中在:http://pic.sogou.com/pics?query=<搜尋關鍵字>&did=1&mode=1&start=0&len=<下載個數>&reqType=ajax

這樣,就可以很清楚的得到各個圖片URL的地址,爬取圖片了。

4.原始碼:

#-*- encoding=UTF-8 -*-
import urllib.request,socket,re,sys,os
from urllib.request import urlopen
import time
from tkinter import *
import webbrowser
from bs4 import BeautifulSoup
import requests
import json
import urllib

############################## Constants ##############################
# Sogou Images home page: scraped for category titles and opened by the "visit site" button.
sougou_url="http://pic.sogou.com/"
### URLs and download settings
# Directory prefix for saved images; file names are appended by plain string
# concatenation, so the trailing slash matters.
download_pics_path="C:/bz2018/"
# Default number of images requested per category/keyword.
download_pics_num=10
# Not referenced anywhere else in the visible source.
download_success = ""
# JSON keys that may hold an image URL; the download functions try them in this order.
sougou_pics_tag=["pic_url","thumbUrl","bthumbUrl","ori_pic_url"]
# Category endpoint, assembled as: start + <category> + mid + <count>.
sougou_url_pics_start="http://pic.sogou.com/pics/channel/getAllRecomPicByTag.jsp?category="
# The tag value is the percent-encoded UTF-8 text "全部".
sougou_url_pics_mid="&tag=%E5%85%A8%E9%83%A8&start=0&len="
# Keyword-search endpoint, assembled as: start_other + <keyword> + mid_other + <count> + stop_other.
sougou_url_pics_start_other="http://pic.sogou.com/pics?query="
sougou_url_pics_mid_other="&did=1&mode=1&start=0&len="
sougou_url_pics_stop_other="&reqType=ajax"
### Title scraping: markers get_title uses to cut category titles out of the home-page HTML
title_key_start="a class=\"nav-tab\" href=\"/pics/"
title_key_stop="<"
### tkinter
# Window title.
window_name="搜狗圖片下載器"
# Passed to root.geometry().
window_size="500x500"
# Background colour of the main frame and default background of write_line labels.
frm_bg="white"
# Default column span for write_line labels and the full-width buttons.
real_columnspan=4
# Heading text shown above the category check buttons.
label_type_str="---------------------------------------------圖片型別---------------------------------------------"
############################## Constants ##############################

real_url_arr = [] ### category names / keywords used to build the request URLs

##############################函式區##############################
### Fetch the category titles shown on the page; returns them as a list
def get_title(url):
    """Return the category titles found in the navigation tabs of ``url``.

    The page is fetched with ``urlopen`` and parsed with BeautifulSoup. Each
    top-level node is rendered back to text and cut at every occurrence of
    ``title_key_start``; the title is the text that follows the closing
    ``">`` of the anchor tag, up to the next ``title_key_stop``.

    Args:
        url: Address of the page to scrape.

    Returns:
        A list of title strings in document order (empty when no tab matches).

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    # Close the HTTP response deterministically instead of leaking it.
    with urlopen(url) as response:
        # Name the parser explicitly: without it BeautifulSoup picks whatever
        # parser happens to be installed and emits a warning.
        sougou_html = BeautifulSoup(response.read(), "html.parser")
    title_key = []
    for ihtml in sougou_html:
        data1 = str(ihtml).split(title_key_start)
        # data1[0] is the text before the first marker, so it never holds a
        # title; a node without any marker yields an empty slice here.
        for jhtml in data1[1:]:
            data2 = jhtml.split(title_key_stop)[0]
            data3 = data2.split("\">")
            if len(data3) == 2:
                title_key.append(data3[1])
    return title_key

### Fetch a category's image list and download it; returns the number of failed downloads
def get_pics(url,path):
    """Download every image listed under ``all_items`` in the JSON at ``url``.

    For each item the keys in ``sougou_pics_tag`` are tried in order until one
    download succeeds. Files are named after the item's title (or a running
    number when the title is empty) with a ``.jpg`` suffix.

    Args:
        url: Category endpoint returning JSON with an ``all_items`` list.
        path: Target directory; created if it does not exist.

    Returns:
        The number of items for which no URL could be downloaded.

    Raises:
        requests.RequestException: If the JSON listing cannot be fetched.
        ValueError: If the response body is not valid JSON.
        KeyError: If the JSON has no ``all_items`` entry.
    """
    # makedirs also creates missing parent directories and tolerates an
    # already existing directory.
    os.makedirs(path, exist_ok=True)
    # Without a timeout a stalled server would freeze the UI forever.
    pics_str = requests.get(url, timeout=10)
    pics_dict = json.loads(pics_str.text)
    pics_dict_items = pics_dict['all_items']
    i_item=0
    fail_count=0
    for item in pics_dict_items:
        downloaded = False
        for itag in sougou_pics_tag:
            try:
                pic_url = item.get(itag)
                if not pic_url:
                    # Key absent or empty: try the next candidate key.
                    continue
                pic_title = item.get('title')
                untitled = not pic_title
                if untitled:
                    pic_title = str(i_item)
                # Titles come from the web and may contain characters that are
                # illegal in file names (or act as path separators).
                file_name = re.sub(r'[\\/:*?"<>|\r\n\t]', "_", str(pic_title))
                urllib.request.urlretrieve(pic_url, os.path.join(path, file_name + '.jpg'))
                downloaded = True
                if untitled:
                    i_item = i_item + 1
                print(file_name+": download complete!")
            except Exception as err:
                # Deliberately broad: one broken item must not abort the batch,
                # but unlike a bare except this lets KeyboardInterrupt through.
                if not downloaded:
                    print("download fail! " + ascii(err))
            if downloaded:
                break
        if not downloaded:
            fail_count=fail_count+1
    return fail_count

def get_pics_other(url,path):
    """Download every image listed under ``items`` in the search JSON at ``url``.

    For each item the keys in ``sougou_pics_tag`` are tried in order until one
    download succeeds. Files are named ``<title><running number>.jpg`` so that
    results sharing a title do not overwrite each other.

    Args:
        url: Keyword-search endpoint returning JSON with an ``items`` list.
        path: Target directory; created if it does not exist.

    Returns:
        The number of items for which no URL could be downloaded.

    Raises:
        requests.RequestException: If the JSON listing cannot be fetched.
        ValueError: If the response body is not valid JSON.
        KeyError: If the JSON has no ``items`` entry.
    """
    # Create the target directory here as well, so a keyword-only run does not
    # fail every download when the folder is missing.
    os.makedirs(path, exist_ok=True)
    # Without a timeout a stalled server would freeze the UI forever.
    pics_str = requests.get(url, timeout=10)
    pics_dict = json.loads(pics_str.text)
    pics_dict_items = pics_dict['items']
    i_item=0
    fail_count=0
    for item in pics_dict_items:
        downloaded = False
        for itag in sougou_pics_tag:
            try:
                pic_url = item.get(itag)
                if not pic_url:
                    # Key absent or empty: try the next candidate key.
                    continue
                pic_title = str(item.get('title') or "") + str(i_item)
                i_item = i_item + 1
                # Titles come from the web and may contain characters that are
                # illegal in file names (or act as path separators).
                file_name = re.sub(r'[\\/:*?"<>|\r\n\t]', "_", pic_title)
                urllib.request.urlretrieve(pic_url, os.path.join(path, file_name + '.jpg'))
                downloaded = True
                print(file_name+": download complete!")
            except Exception as err:
                # Deliberately broad: one broken item must not abort the batch,
                # but unlike a bare except this lets KeyboardInterrupt through.
                if not downloaded:
                    print("download fail! " + ascii(err))
            if downloaded:
                break
        if not downloaded:
            fail_count=fail_count+1
    return fail_count

def url_get_othertype():
    """Add the keyword typed into the free-text entry to ``real_url_arr``.

    Nothing happens when the entry is empty. After adding, the global list is
    rebuilt through a set so it holds no duplicates (order is not preserved).
    """
    global real_url_arr
    typed_keyword = PhotoType.get()
    if typed_keyword == "":
        return
    real_url_arr.append(typed_keyword)
    real_url_arr = list(set(real_url_arr))

def url_get_phototype(all_type):
    """Rebuild ``real_url_arr`` from the current state of the UI.

    The list is reset, the free-text keyword (if any) is added, and then the
    label of every ticked category check button is appended. Duplicates are
    dropped at the end via a set, so order is not preserved.

    Note: the "其他" entry is removed from ``all_type`` in place, i.e. the
    caller's list is modified. Positions in ``all_type`` are matched against
    ``CheckType`` by index.
    """
    global real_url_arr
    real_url_arr = []
    url_get_othertype()
    try:
        all_type.remove("其他")
    except ValueError:
        # "其他" was already removed by an earlier call.
        pass
    for position, type_name in enumerate(all_type):
        if CheckType[position].get() != 1:
            continue
        real_url_arr.append(typeBtn[type_name]['text'])
    real_url_arr = list(set(real_url_arr))

def other_type():
    """Enable or disable the free-text entry to follow the "其他" check button.

    When the box is ticked (value 1) the entry becomes editable; otherwise it
    is disabled and its text is cleared.
    """
    if OtherType.get() != 1:
        type.configure(state="disabled")
        PhotoType.set("")
        return
    type.configure(state="normal")

def get_full_url(all_type):
    """Download the selected categories/keywords and report the outcome in the UI.

    Reads the requested image count from the entry field (keeping the current
    ``download_pics_num`` when the field is empty), builds one request URL per
    selected name and delegates to ``get_pics`` (known categories) or
    ``get_pics_other`` (free-text keywords). The result label shows the
    planned total first and the number of successful downloads at the end.

    Args:
        all_type: List of category titles shown in the UI; forwarded to
            ``url_get_phototype``.
    """
    global download_pics_num
    # Function-scope import keeps this block independent of the file header.
    from urllib.parse import quote

    down_result["text"] = ""
    url_get_phototype(all_type)
    requested = download_num_str.get().strip()
    if requested != "":
        # Reject non-numeric and non-positive input with a visible message
        # instead of letting int() raise inside the Tk callback.
        try:
            requested_num = int(requested)
        except ValueError:
            requested_num = 0
        if requested_num <= 0:
            down_result["text"] = "下載個數必須是正整數"
            return
        download_pics_num = requested_num
    total = len(real_url_arr) * download_pics_num
    down_result["text"] = "準備下載: " + str(total) + "張照片"
    # The downloads below block the Tk event loop; flush pending redraws so
    # the "preparing" text is actually painted before they start.
    down_result.update_idletasks()
    fail_num = 0
    for iurl in real_url_arr:
        # Percent-encode the name so characters such as '&', '#' or spaces
        # cannot corrupt the query string.
        encoded = quote(iurl, safe="")
        try:
            if iurl in photo_type:
                tmp_url = sougou_url_pics_start + encoded + sougou_url_pics_mid + str(download_pics_num)
                fail_num = fail_num + get_pics(tmp_url, download_pics_path)
            else:
                tmp_url = (sougou_url_pics_start_other + encoded + sougou_url_pics_mid_other
                           + str(download_pics_num) + sougou_url_pics_stop_other)
                time.sleep(1)
                fail_num = fail_num + get_pics_other(tmp_url, download_pics_path)
        except (requests.RequestException, OSError, ValueError, KeyError) as err:
            # The listing could not be fetched or parsed: count the whole
            # request as failed and carry on with the remaining names.
            print("download fail! " + ascii(err))
            fail_num = fail_num + download_pics_num
    down_result["text"] ="成功下載: " + str(total-fail_num) + "張照片"

### tkinter helper: a label that occupies one grid row
def write_line(row,text="",column=0,columnspan=real_columnspan,bg=frm_bg):
    """Create a label inside ``frm``, place it with grid() and return it.

    Called with only ``row`` it produces an empty label, which the layout
    code uses as a spacer row.
    """
    line_label = Label(frm, bg=bg, text=text)
    line_label.grid(column=column, columnspan=columnspan, row=row)
    return line_label

### Open a web page in the browser
def callback(url=sougou_url):
    """Open ``url`` (the Sogou Images home page by default) via ``webbrowser.open_new``."""
    webbrowser.open_new(url=url)

##############################函式區##############################

############################## UI section ##############################
# Main window: title and size come from the constants, resizing is disabled.
root = Tk()
root.title(window_name)
root.geometry(window_size)
root.resizable(False, False)

# One frame fills the whole window and hosts every widget through grid().
frm = Frame(root, bg=frm_bg)
frm.pack(fill=BOTH, expand=YES)

# real_row always holds the next free grid row.
real_row = 0

# spacer row
write_line(real_row)
real_row += 1

# button that opens the Sogou Images site in the browser
Button(
    frm,
    command=callback,
    text="點選進入搜狗圖片官網",
).grid(row=real_row, column=0, columnspan=real_columnspan, sticky=N)
real_row += 1

# spacer row
write_line(real_row)
real_row += 1

# heading for the image-type section
write_line(real_row, label_type_str)
real_row += 1

# spacer row
write_line(real_row)
real_row += 1

### Category check buttons (four per row) followed by the free-text "其他" entry
try:
    photo_type=get_title(sougou_url)
except OSError as err:
    # urllib's network errors derive from OSError. Without the category list
    # the window is still usable through the free-text "其他" entry, so keep
    # going instead of dying before the UI appears.
    print("get title fail! " + ascii(err))
    photo_type=[]
photo_type.append("其他")
typeBtn={}      # category title -> its Checkbutton widget
CheckType=[]    # one IntVar per category, in the same order as photo_type
real_column=0
for itype in photo_type:
    if itype == "其他":
        # The "其他" check button is pinned to column 1 and its entry starts at
        # column 2; begin a fresh row when those cells are already occupied by
        # category buttons, otherwise the widgets would be stacked on top of
        # each other.
        if real_column > 1:
            real_column = 0
            real_row = real_row + 1
        OtherType = IntVar()
        PhotoType = StringVar()
        # NOTE: the name shadows the builtin `type`; it is kept because
        # other_type() refers to the entry widget by this name.
        type = Entry(frm, textvariable=PhotoType, width=9, state='disabled')  # free-text entry, enabled by the check button
        Checkbutton(frm, text="其他", variable=OtherType, onvalue=1, offvalue=2, command=other_type).grid(row=real_row, column=1)
        type.grid(row=real_row, column=2, columnspan=4, sticky=W, padx=40, ipadx=60)  # place the entry next to the check button
    else:
        CheckType.append(IntVar())
        typeBtn[itype]=Checkbutton(frm, text=itype, variable=CheckType[-1], command=lambda: url_get_phototype(photo_type))
        typeBtn[itype].grid(row=real_row, column=real_column)
    real_column=real_column+1
    # wrap to the next grid row after four columns
    if real_column == 4:
        real_column = 0
        real_row = real_row + 1
real_row=real_row+1

# spacer row
write_line(real_row)
real_row += 1

# "number of downloads" label and its entry field
lab1 = Label(frm, text="下載個數:")
lab1.grid(column=0, row=real_row)
download_num_str = StringVar()
download_num = Entry(frm, textvariable=download_num_str, width=10)
download_num.grid(column=1, row=real_row, sticky=W)
real_row += 1

# spacer row
write_line(real_row)
real_row += 1

# button that starts the download for the current selection
Button(
    frm,
    command=lambda: get_full_url(photo_type),
    text="獲取照片",
).grid(row=real_row, column=0, columnspan=4, sticky=N)
real_row += 1

# spacer row
write_line(real_row)
real_row += 1

# result label; get_full_url writes the progress/result text into it
down_result = write_line(real_row)
real_row += 1

# spacer row
write_line(real_row)
real_row += 1

# button that leaves the main loop
Button(
    frm,
    command=root.quit,
    text="退出程式",
).grid(row=real_row, column=0, columnspan=4, sticky=N)
real_row += 1

mainloop()
############################## UI section ##############################
執行結果:圖片