百度搜索結果爬蟲
阿新 • • 發佈:2018-12-26
程式碼如下:
# Baidu search-result scraper: prompt for a query, fetch the first results
# page, and print the title, abstract and link of up to ten results.
import requests
from lxml import etree

# Maximum number of results to print (the original script assumed exactly 10).
MAX_RESULTS = 10

# Fetch the whole results page.
words = input("輸入搜尋內容:")
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    ),
}
# `wd` carries the search query. Per the original author's note, `pn` would
# select which page of results to fetch; it is not sent here.
params = {'wd': words}
response = requests.get(
    'https://www.baidu.com/s',
    headers=headers,
    params=params,
    timeout=10,  # without a timeout, requests may block indefinitely
)
result = response.text

# Extract the data.
html = etree.HTML(result, parser=etree.HTMLParser())
# Result titles; per the original note, this class filter excludes ads.
titles = html.xpath(
    "//h3[@class='t' or @class='t c-title-en' or @class='t c-gap-bottom-small']"
)
# Result abstracts (summaries).
abstracts = html.xpath(
    "//div[@class='c-abstract' or @class='c-abstract c-abstract-en']"
)
# Full URL of each result.
links = html.xpath("//div[@class='f13']/a[@class='c-showurl']/@href")

# Walk the three lists in lockstep. zip() stops at the shortest one, so a page
# with fewer than MAX_RESULTS titles, abstracts or links no longer raises
# IndexError as the fixed `range(10)` indexing did.
# NOTE(review): the three XPath queries run independently, so if one result
# lacks an abstract or link the lists may be misaligned — confirm on live pages.
for title_node, abstract_node, link in zip(
    titles[:MAX_RESULTS], abstracts[:MAX_RESULTS], links[:MAX_RESULTS]
):
    title = title_node.xpath("string()")
    # text() yields a list of the element's direct text nodes; it is printed
    # as a list, exactly as the original script did.
    abstract = abstract_node.xpath("text()")
    print('=' * 3)
    print(title)
    print(abstract)
    print(link)
    print('=' * 3)
結果顯示如下: