python爬蟲--下載煎蛋網妹子圖到本地
阿新 • 發佈:2017-12-28
標籤:元素、download、find、get、log
#coding:utf-8
"""
Download jandan.net "ooxx" (girl) images to a local folder.

Implemented with selenium + PhantomJS (page rendering), BeautifulSoup
(HTML parsing), a regular expression (total-page extraction) and
urlretrieve (file download).
"""

import re
import os

from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By  # original read "importBy" (syntax error)
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
try:
    from urllib.request import urlretrieve  # Python 3
except ImportError:
    from urllib import urlretrieve          # Python 2 fallback (original import)

# Workaround for Chrome's "browser is being controlled by automated test
# software" infobar — kept for reference, PhantomJS is used below.
# options = webdriver.ChromeOptions()
# options.add_argument('disable-infobars')

url = 'http://jandan.net/ooxx'
# driver = webdriver.Chrome(chrome_options=options)
driver = webdriver.PhantomJS()
wait = WebDriverWait(driver, 30)

# Folder where the downloaded images are saved.
img_save_file = 'images'


def get_default_page_num():
    """Open the default gallery page and return the total-page element text.

    Returns the text of the ".current-comment-page" element (e.g. "[482]").
    Retries on timeout; the original dropped the retry's return value,
    which made the caller crash on ``search(None)``.
    """
    try:
        driver.get(url)
        page_element = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '.current-comment-page')))
        return page_element.text
    except TimeoutException:
        return get_default_page_num()


def get_img_url(page_number):
    """Return the list of image URLs found on gallery page *page_number*."""
    img_url_list = []
    page_url = 'http://jandan.net/ooxx/page-' + str(page_number) + '#comments'
    print(page_url)
    try:
        # Original issued driver.get() twice (once before the try) — once is enough.
        driver.get(page_url)
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#comments > ol img')))
    except TimeoutException:
        print("打開頁面失敗,重新加載該頁面")
        # Original ignored the retry's result and fell through to parse the
        # failed page; return the retry's list instead.
        return get_img_url(page_number)

    # Parse the rendered page with BeautifulSoup.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # GIF images carry the full animation in the 'org_src' attribute;
    # static images only have 'src'.
    for img in soup.find_all('img'):
        if img.has_attr('org_src'):
            img_url_list.append(img.attrs['org_src'])
        else:
            img_url_list.append(img.attrs['src'])
    return img_url_list


def download_img(img_url):
    """Download *img_url* into the save folder, keeping its URL basename."""
    img_name = img_url.split('/')[-1]
    # Original built an unused ``img_save_path`` variable; join once and use it.
    urlretrieve(img_url, os.path.join(img_save_file, img_name))


def add_img_save_file(img_save_file):
    """Create the image folder if it does not already exist."""
    if not os.path.exists(img_save_file):
        os.makedirs(img_save_file)


def main():
    add_img_save_file(img_save_file)
    # Extract the leading digits from the "current/total pages" element text.
    partner = re.compile(r'(\d+)')
    content = get_default_page_num()
    total_pages = partner.search(content).group()

    for i in range(1, int(total_pages) + 1):
        print("正在下載第" + str(i) + '的圖片,url為:')
        for img_url in get_img_url(str(i)):
            download_img(img_url)


if __name__ == '__main__':
    main()
python爬蟲--下載煎蛋網妹子圖到本地