爬蟲自學之路(二) requests小技巧
阿新 • • 發佈:2018-12-18
2 RequestsCookieJar轉換為cookie字典
requests.utils.dict_from_cookiejar(resp.cookies)
3 cookie字典轉換為RequestsCookieJar
requests.utils.cookiejar_from_dict({"BDORZ":"27315"})
4 URL編碼
requests.utils.quote("編碼") # '%E7%B7%A8%E7%A2%BC'
5 URL解碼
requests.utils.unquote("%E7%BC%96%E7%A0%81") # '编码'(即「編碼」的簡體字形)
6 不驗證HTTPS
requests.get("https://www.12306.cn/mormhweb/", verify=False)
7 設定超時時間
requests.get("https://www.baidu.cn/", timeout=5)
8 retrying模組
import requests
from retrying import retry

# Headers sent with every request: an iPhone Safari User-Agent and a
# Douban mobile Referer.
headers = {
    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1",
    "Referer": "https://m.douban.com/movie/",
}

# Seconds to wait for the server before giving up on a single attempt.
_TIMEOUT = 3


@retry(stop_max_attempt_number=3)
def _parse_url(url, method, data, proxies):
    """Send one request and return the decoded response body.

    The ``retry`` decorator re-invokes this function when it raises, up to
    three attempts in total; the last exception propagates to the caller.

    Args:
        url: Absolute URL including the scheme (e.g. ``https://``).
        method: ``"POST"`` sends a POST with ``data``; anything else sends GET.
        data: Form payload for POST requests; ignored for GET.
        proxies: Proxy mapping handed to requests, or ``None`` for no proxy.

    Returns:
        The response body decoded with ``bytes.decode()`` (UTF-8 by default).

    Raises:
        requests.HTTPError: If the status code is not 200.
        requests.RequestException: On connection errors, timeouts, bad URLs.
    """
    print("*" * 20)  # visible marker: one line is printed per attempt
    if method == "POST":
        # The timeout applies here too, so a stalled POST cannot hang forever.
        response = requests.post(
            url, data=data, headers=headers, timeout=_TIMEOUT, proxies=proxies
        )
    else:
        response = requests.get(
            url, headers=headers, timeout=_TIMEOUT, proxies=proxies
        )
    # Raise explicitly instead of using ``assert``, which is stripped under
    # ``python -O`` and would silently disable both the check and the retry.
    if response.status_code != 200:
        raise requests.HTTPError(
            "unexpected status code %s for %s" % (response.status_code, url),
            response=response,
        )
    return response.content.decode()


def parse_url(url, method="GET", data=None, proxies=None):
    """Fetch ``url`` and return its body as text, or ``None`` on failure.

    Args:
        url: Absolute URL including the scheme.
        method: HTTP method, ``"GET"`` (default) or ``"POST"``.
        data: Form payload used when ``method`` is ``"POST"``.
        proxies: Optional proxy mapping; ``None`` means no proxy.

    Returns:
        The decoded response body, or ``None`` if every attempt failed.
    """
    try:
        return _parse_url(url, method, data, proxies)
    except Exception:
        # Best-effort API: report failure as None. ``Exception`` (not a bare
        # ``except``) lets KeyboardInterrupt / SystemExit propagate.
        return None


if __name__ == '__main__':
    # The scheme is required; a bare "www.baidu.com" raises MissingSchema.
    url = "https://www.baidu.com"
    print(parse_url(url))
9 urldecode 對字串進行url解碼 發起一個get請求時,會在url後攜帶引數