Python爬蟲爬取百度搜索內容介面-xpath
阿新 • • 發佈:2019-01-07
百度爬蟲搜尋介面1.0版
百度爬蟲搜尋介面1.0版:
- 通過百度關鍵字遍歷到一級頁面的url
- 通過百度關鍵字遍歷到一級頁面的title標題
- 通過百度關鍵字遍歷到一級頁面的text文字
爬取思路
拼接url
經過測試,初始時拼接url,只需要加入keyword引數即可,為了便於之後如果有需要擴充的引數進行擴充套件,可以寫在不同的函式裡,進行功能解耦,通過params的方式進行引數的傳遞,及拼接url;
具體方法可以參照相關連結(原文所附連結在轉載時已遺失)。
解析url得到響應的response
通過headers等引數來請求得到響應,中間加上RequestException的異常處理,因為返回的是文字text資料,所以返回response.text。如果是動態載入的xhr資料,則返回的資料則用response.json;實現如下:
def get_page(url): try: headers = { 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36', 'accept-language': 'zh-CN,zh;q=0.9', 'cache-control': 'max-age=0', 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8' } response = requests.get(url=url,headers=headers) # 更改編碼方式,否則會出現亂碼的情況 response.encoding = "utf-8" print(response.status_code) # print(response.text) if response.status_code == 200: return response.text return None except RequestException: return None
解析單頁的response
首先我們要清楚我們要抓取的內容有url,title,以及abstract text文字的內容。
根據div欄中的id號,我們可以找到我們想要的資料,但是要注意其中的文字內容包括帶圖片的,以及純文字,這兩種內容的xpath不同。
完整程式碼實現如下
import requests
import urllib.parse
from requests.exceptions import RequestException
from urllib.parse import urljoin
from lxml import etree
import re
import json


# Baidu search scraping interface.

def format_url(url, params: dict = None) -> str:
    """Append *params* as a URL-encoded query string to *url*."""
    query_str = urllib.parse.urlencode(params)
    return f'{url}?{query_str}'


def get_url(keyword):
    """Build the Baidu search URL for *keyword* (passed as the `wd` parameter)."""
    params = {
        'wd': str(keyword)
    }
    url = "https://www.baidu.com/s"
    return format_url(url, params)


def get_page(url):
    """Fetch *url* and return its HTML text, or None on any failure.

    Forces UTF-8 decoding (Baidu pages otherwise decode garbled) and
    returns the body only on HTTP 200; request errors yield None.
    """
    try:
        headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
        }
        response = requests.get(url=url, headers=headers)
        # Force UTF-8 so the Chinese text does not come back garbled.
        response.encoding = "utf-8"
        print(response.status_code)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_page(url, page):
    """Crawl up to *page* result pages starting at *url*.

    Generator yielding one dict per search result with keys
    'title', 'sub_url' and 'abstract'. Stops early when a page
    cannot be fetched or no next-page link exists.
    """
    for i in range(1, int(page) + 1):
        print("正在爬取第{}頁....".format(i))
        # Page 1 of Baidu results has one fewer pagination link than
        # later pages (no "previous" link), hence the different flag.
        flag = 10 if i == 1 else 11
        html = get_page(url)
        if html is None:
            # Fetch failed; etree.HTML(None) would raise, so stop here.
            print("無更多頁面!~")
            return
        content = etree.HTML(html)
        for j in range(1, flag):
            # Result blocks carry sequential numeric ids across pages.
            result_id = (i - 1) * 10 + j
            # Reset per result so a miss does not leak the previous
            # result's title/url/abstract into this record.
            title = ""
            sub_url = ""
            abstract = ""
            res_title = content.xpath('//*[@id="%d"]/h3/a' % result_id)
            if res_title:
                title = res_title[0].xpath('string(.)')
            href = content.xpath('//*[@id="%d"]/h3/a/@href' % result_id)
            if href:
                sub_url = href[0]
            # Plain-text abstracts and image-bearing abstracts live at
            # different xpath locations; try both.
            res_abstract = content.xpath('//*[@id="%d"]/div[@class="c-abstract"]' % result_id)
            if not res_abstract:
                res_abstract = content.xpath('//*[@id="%d"]/div/div[2]/div[@class="c-abstract"]' % result_id)
            if res_abstract:
                abstract = res_abstract[0].xpath('string(.)')
            # Yield inside the per-result loop so every result is
            # emitted, not just the last one on each page.
            yield {'title': title, 'sub_url': sub_url, 'abstract': abstract}
        # Follow the "next page" pagination link; it is relative, so
        # resolve it against the current URL.
        rel_url = content.xpath('//*[@id="page"]/a[{}]/@href'.format(flag))
        if rel_url:
            url = urljoin(url, rel_url[0])
        else:
            print("無更多頁面!~")
            return


def main():
    """Prompt for a keyword and page count, crawl, print and save results."""
    keyword = input("輸入關鍵字:")
    page = input("輸入查詢頁數:")
    url = get_url(keyword)
    results = []
    for result in parse_page(url, page):
        print(result)
        results.append(result)
    # Dump all results once as a single JSON array (concatenating
    # separate json.dumps payloads would produce an unparseable file),
    # and use `with` so the handle is always closed.
    with open("data.json", 'w', encoding='utf-8') as file:
        json.dump(results, file, indent=2, ensure_ascii=False)


if __name__ == '__main__':
    main()