1. 程式人生 > >第一個爬蟲小程式(攜帶登陸後的cookie)

第一個爬蟲小程式(攜帶登入後的 cookie)

import requests
class TiebaSpider:
    """Download a fixed set of Baidu Tieba list pages and save them as HTML files."""

    def __init__(self, tieba_name):
        """Store the forum name and build the request configuration.

        Args:
            tieba_name: Forum name; used as the ``kw`` query value and as the
                prefix of every saved file name.
        """
        self.tieba_name = tieba_name
        # The URL template must be built from the constructor argument; the
        # previous code referenced an undefined name and raised NameError.
        self.url_base = "https://tieba.baidu.com/f?kw=" + tieba_name + "&ie=utf-8&pn={}"
        self.headers = {"User-Agent": "WSF"}

    def make_url_lists(self):
        """Return the list of URLs to download (``pn`` = 1 through 10)."""
        # NOTE(review): Tieba's ``pn`` may be a post offset rather than a page
        # number -- confirm whether these values address distinct pages.
        return [self.url_base.format(i) for i in range(1, 11)]

    def download_url(self, url_str):
        """Fetch ``url_str`` with ``requests.get`` and return the raw body bytes."""
        result = requests.get(url_str, headers=self.headers)
        return result.content

    def save(self, result, page_num):
        """Write the downloaded bytes to ``<tieba_name>-第<page_num>頁.html``.

        Args:
            result: Page content as bytes (the file is opened in binary mode).
            page_num: 1-based page number used in the file name.
        """
        file_path = "{}-第{}頁.html".format(self.tieba_name, page_num)
        with open(file_path, "wb") as f:
            f.write(result)

    def run(self):
        """Download every URL from ``make_url_lists`` and save each page in order."""
        # enumerate supplies the 1-based page number directly, avoiding a
        # linear ``list.index`` lookup on every iteration.
        for p_num, url_str in enumerate(self.make_url_lists(), start=1):
            result_str = self.download_url(url_str)
            self.save(result_str, p_num)
            		
if __name__ == '__main__':
    # When run as a script, crawl the forum named below and save its pages.
    TiebaSpider("薛之謙").run()
	

理解 session 和 cookie

session:當用戶訪問http-server時,會生成一個sessionID(唯一標識),在一定訪問週期內可用,在瀏覽網頁時會將記錄儲存在cookie中,下次訪問有快取記錄.

session:伺服器端為每個使用者生成並儲存的一個字串,作為該使用者的唯一標識,用來識別客戶端的每次訪問(如健身中心的會員卡)。

cookie 儲存在客戶機的資料,其中含有sessionID,傳送給伺服器後表明使用者身份.

import lxml.html

import requests
import re

def parse_form(html):
    """Collect the named ``<input>`` fields found inside forms of an HTML page.

    Args:
        html: HTML markup to parse.

    Returns:
        A dict mapping each input's ``name`` attribute to its ``value``
        attribute. Inputs without a name are skipped; when a name occurs
        more than once, the last occurrence wins.
    """
    document = lxml.html.fromstring(html)
    return {
        field.get('name'): field.get('value')
        for field in document.cssselect('form input')
        if field.get('name')
    }

def get_cookie():
    """Log in through a ``requests`` session and save the resulting page.

    Steps: fetch the login page, collect its form inputs with ``parse_form``,
    print the cookies the session holds at that point, submit the form with
    the credentials filled in, then request the login URL once more with the
    same session and write that response to ``login1.html``.
    """
    login_url = 'http://example.webscraping.com/places/default/user/login?_next=/places/default/index'

    s = requests.session()
    result = s.get(login_url)
    # Start from the form's own inputs so every field the page defines is
    # posted back together with the credentials.
    post_data = parse_form(result.text)
    print(s.cookies.get_dict())

    # NOTE(review): the address below is a redaction placeholder left by the
    # page this code was copied from -- replace it with a real account e-mail.
    post_data['email'] = '[email protected]'
    # NOTE(review): hard-coded credential; consider reading it from the
    # environment or a config file instead.
    post_data['password'] = '2336517498'
    s.post(login_url, post_data)

    # The session carries the cookies set by the login request above.
    # NOTE(review): this is a POST without a body to the login URL; a GET of
    # the target page may have been intended -- confirm.
    rs = s.post(login_url)
    # Explicit encoding so non-ASCII page text is written the same way on
    # every platform.
    with open('login1.html', 'w', encoding='utf-8') as f:
        f.write(rs.text)


if __name__ == '__main__':
    get_cookie()