1. 程式人生 > >python+selenium的一個小蜘蛛

python+selenium的一個小蜘蛛

用 selenium 玩的是精準打擊。
這是爬取指定漢字的筆順、拼音、聲音的小爬蟲。速度慢,但是指向很靈活,只需要調整 yourtxt.txt 裡面的
檔案內容即可(每行結尾放一個要查詢的漢字)。

#coding:utf-8
"""Small Selenium spider for https://hanyu.baidu.com/.

For every line of ``yourtxt.txt`` the last non-blank character is looked up
on the site.  Its stroke-order animation is saved to ``./image/<char>.gif``,
its pronunciation audio to ``./mp3/<char>.mp3`` and a ``<char>,<pinyin>``
line is appended to ``pinyi.txt``.
"""
import os
import re
from time import sleep
from urllib.request import urlretrieve

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait

address = 'https://hanyu.baidu.com/'
# Placeholders kept from the original script; nothing below reads them.
shz = ' '
url = ' '

op = webdriver.FirefoxOptions()
op.add_argument("--headless")  # equivalent to op.set_headless()
op.add_argument("--disable-gpu")  # disable GPU acceleration
# `options=` replaces the deprecated `firefox_options=` keyword.
driver = webdriver.Firefox(options=op)


def getvalue(shz):
    """Search for the character *shz* and store its data.

    Uses the module-level ``driver``, which must currently show the site's
    start page.  Downloads the stroke-order GIF and the pronunciation MP3,
    appends ``shz,pinyin`` to ``pinyi.txt`` and navigates back to the start
    page.

    Raises:
        SystemExit: if the search box or search button cannot be operated
            (the page layout is then assumed to be unusable for any input).
        Exception: any Selenium / download error on the result page is
            propagated to the caller.
    """
    try:
        search_box = driver.find_element(By.ID, 'kw')
        search_box.clear()
        search_box.send_keys(shz)
        driver.find_element(By.ID, 'su').click()
        print(shz)
    except WebDriverException:
        print('程式出現錯誤,請除錯解決後執行')
        # Non-zero status: this is an error exit, not a normal termination.
        raise SystemExit(1)

    # Some queries land on an intermediate result list first; when it is
    # present, follow its first link.  Absence is the normal case, so this
    # step is deliberately best-effort.
    try:
        driver.find_element(By.ID, 'pc--body')
        driver.find_element(
            By.XPATH, r'//*[@id="data-container"]/div[1]/div[1]/a').click()
        print('有夾層')
    except WebDriverException:
        print('無夾層')

    # Wait for the detail panel by locator so the wait really polls until the
    # element appears, instead of failing on an immediate lookup.
    WebDriverWait(driver, 10).until(
        ec.visibility_of_element_located((By.ID, 'pc-word-body')))

    # urlretrieve does not create missing target directories.
    os.makedirs('./image', exist_ok=True)
    os.makedirs('./mp3', exist_ok=True)

    # Stroke-order animation, saved as a GIF named after the character.
    url = driver.find_element(
        By.XPATH, r'//*[@id="word_bishun"]').get_attribute('src')
    urlretrieve(url, './image/' + shz + '.gif')
    print(url)

    # Pronunciation audio, saved as an MP3 named after the character.
    url = driver.find_element(
        By.XPATH, r'//*[@id="pinyin"]/span/a').get_attribute('url')
    urlretrieve(url, './mp3/' + shz + '.mp3')
    print(url)

    pinyin = driver.find_element(
        By.XPATH, r'//*[@id="pinyin"]/span/b').text
    print(pinyin)

    driver.back()
    with open('pinyi.txt', 'a+', encoding='UTF-8-sig') as f:
        f.write(shz + ',' + pinyin + '\n')
    # NOTE(review): the expected title is written in Traditional Chinese
    # here; confirm it matches the live page title.
    WebDriverWait(driver, 10, 0.2).until(ec.title_contains('百度漢語'))
    print('抓取成功')


# main
txt = []
try:
    driver.get(address)
    WebDriverWait(driver, 10, 0.2).until(ec.title_contains('百度漢語'))

    with open('yourtxt.txt', 'r', encoding='UTF-8-sig') as f:
        txt = f.readlines()

    for number, line in enumerate(txt, start=1):
        print(number)
        entry = line.strip()
        if not entry:
            continue  # blank line: nothing to look up
        try:
            # The last visible character of the line is the one to query;
            # stripping first also handles a final line without a newline.
            getvalue(entry[-1])
        except Exception as e:
            # One failed character must not stop the remaining ones.
            print(e)
finally:
    # Always shut the browser down, including on SystemExit from getvalue().
    driver.quit()