1. 程式人生 > >python爬蟲--下載煎蛋網妹子圖到本地

python爬蟲--下載煎蛋網妹子圖到本地

eve 元素 download down find .get fault log arc

 1 #coding:utf-8
 2 """
 3 下載煎蛋妹子到本地,通過selenium、正則表達式、phantomjs、Beautifulsoup實現
 4 """
 5 
 6 import re
 7 import os
 8 
 9 from selenium import webdriver
10 from selenium.webdriver.support.wait import WebDriverWait
11 from selenium.webdriver.support import expected_conditions as EC
12 from selenium.webdriver.common.by import
By 13 from selenium.common.exceptions import TimeoutException 14 from bs4 import BeautifulSoup 15 from urllib import urlretrieve 16 17 #解決谷歌瀏覽器正受到自動測試軟件的控制 18 # options = webdriver.ChromeOptions() 19 # options.add_argument(‘disable-infobars‘) 20 21 url = http://jandan.net/ooxx 22 # driver = webdriver.Chrome(chrome_options=options)
23 driver = webdriver.PhantomJS() 24 wait = WebDriverWait(driver, 30) 25 26 #下載的煎蛋妹子保存的文件夾 27 img_save_file = images 28 29 #獲取總頁數。打開煎蛋網-妹子圖默認頁面可以獲取到總頁數 30 def get_default_page_num(): 31 try: 32 driver.get(url) 33 page_element = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,
.current-comment-page))) 34 return page_element.text 35 except TimeoutException: 36 get_default_page_num() 37 38 #獲取圖片的url 39 def get_img_url(page_number): 40 img_url_list = [] 41 url = rhttp://jandan.net/ooxx/page-+ str(page_number) + r#comments 42 print url 43 # url = ‘http://www.baidu.com‘ 44 html = driver.get(url) 45 try: 46 driver.get(url) 47 wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, #comments > ol img))) 48 except TimeoutException: 49 print "打開頁面失敗,重新加載該頁面" 50 get_img_url(page_number) 51 52 #獲取頁面html元素 53 html = driver.page_source 54 #通過BeautifulSoup解析 55 soup = BeautifulSoup(html, html.parser) 56 #找出所有為img的標簽 57 imgs = soup.find_all(img) 58 #gif圖片需要獲取ora_src屬性,才是完整的gif圖片。has_attr 判斷是否有某個屬性,attrs可以獲取屬性值 59 for img in imgs: 60 if img.has_attr(org_src): 61 img_url = img.attrs[org_src] 62 else: 63 img_url = img.attrs[src] 64 img_url_list.append(img_url) 65 return img_url_list 66 67 #下載圖片,通過urllib的urlretrieve實現 68 def download_img(img_url): 69 img_name = img_url.split(/)[-1] 70 img_save_path = img_save_file + / +img_name 71 urlretrieve(img_url, img_save_file + / + img_name) 72 73 #創建圖片存儲所在的文件夾 74 def add_img_save_file(img_save_file): 75 if os.path.exists(img_save_file): 76 pass 77 else: 78 os.makedirs(img_save_file) 79 80 def main(): 81 add_img_save_file(img_save_file) 82 #通過正則表達式提取當前的頁數 83 partner = re.compile(r(\d+)) 84 content = get_default_page_num() 85 total_pages = partner.search(content).group() 86 87 for i in range(1, int(total_pages) + 1): 88 print "正在下載第" + str(i) + 的圖片,url為:, 89 img_url_list = get_img_url(str(i)) 90 for img_url in img_url_list: 91 download_img(img_url) 92 93 if __name__ == __main__: 94 main()

python爬蟲--下載煎蛋網妹子圖到本地