Python 爬蟲系列(2.1 - requests 庫的基本使用)
一、基本認識
1、傳送一個get請求
import requests
if __name__ == "__main__":
    # Issue a plain GET request against the httpbin echo endpoint and keep
    # the resulting response object for later inspection.
    response = requests.get(url='http://httpbin.org/get')
2、關於請求返回資料的常見屬性與方法
import requests
if __name__ == "__main__":
    # Send a GET request to the httpbin echo endpoint.
    response = requests.get('http://httpbin.org/get')
    # Set the encoding used when the body is decoded for response.text.
    response.encoding = 'utf-8'

    # Print the commonly used parts of the response, one per line.
    print(response.text)         # body decoded to text
    print(response.json())       # body parsed as JSON
    print(response.headers)      # response headers
    print(response.status_code)  # HTTP status code
    print(response.url)          # URL of the response
    print(response.cookies)      # cookies attached to the response

    # Raw, undecoded body: use this when response.text comes out garbled
    # or when downloading binary files.
    print(response.content)
3、關於其他的請求方式
# Other request styles: each line below calls the requests helper named after
# one HTTP method against an httpbin URL and rebinds `response` to the result.
# POST
response = requests.post('http://httpbin.org/post')
# PUT
response = requests.put('http://httpbin.org/put')
# DELETE
response = requests.delete('http://httpbin.org/delete')
# HEAD
response = requests.head('http://httpbin.org/get')
# OPTIONS
response = requests.options('http://httpbin.org/get')
1、直接在url地址後面拼接引數
import requests
if __name__ == "__main__":
    # Request headers carrying a browser-like User-Agent string.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'}
    # The query parameters are written straight into the URL after '?',
    # so no separate parameter dict is needed for this variant.
    response = requests.get('http://httpbin.org/get?name=june&password=123456', headers=headers)
    # Set the encoding used when the body is decoded for response.text.
    response.encoding = 'utf-8'
    print(response.text)
2、使用params傳遞引數
import requests
if __name__ == "__main__":
    # Query parameters, handed to requests through `params` instead of being
    # written into the URL by hand.
    data = {'name': 'june', 'password': 123456}
    # Request headers carrying a browser-like User-Agent string.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36',
    }
    response = requests.get(url='http://httpbin.org/get', params=data, headers=headers)
    # Decode the body as UTF-8 before printing it.
    response.encoding = 'utf-8'
    print(response.text)
1、需要下載的伯樂在線(jobbole)文章標題
2、書寫邏輯程式碼
import re
import requests
if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36',
    }
    url = 'http://python.jobbole.com/category/guide/'
    # The timeout keeps the script from hanging forever on a silent server.
    response = requests.get(url=url, headers=headers, timeout=10)
    print(response.status_code)

    # Non-greedy match from a "<div ... post-thumb" block up to its closing
    # </a>, capturing the value of the first title="..." attribute inside it.
    # re.S lets '.' match newlines too, so one match may span several lines.
    pattern = re.compile(r'<div.*?post-thumb.*?title="(.*?)".*?</a>', re.S)
    result_list = pattern.findall(response.text)

    # Append one stripped title per line; the with-block closes the file even
    # if a write raises.
    with open('jobbole1.txt', 'a+', encoding='utf8') as f:
        f.writelines(item.strip() + '\n' for item in result_list)
3、解說正則表示式
`.*?` 表示非貪婪地匹配任意字元(儘可能少地匹配)
`re.S` 使 `.` 也能匹配換行符,即匹配包括換行在內的全部字元
1、導包
import re
import os
import shutil
import requests
2、定義一個下載圖片的類
class DownPic(object):
    """Download the images referenced in the post-thumb blocks of a page.

    Creating an instance deletes and recreates the local ``demo`` directory.
    ``download()`` fetches ``self.url``, extracts the ``src`` URLs with a
    regular expression and saves every ``png``/``jpg`` file into ``demo``.
    """

    # Non-greedy match from a "<div ... post-thumb" block up to its closing
    # </a>, capturing the first src="..." value; re.S lets '.' span newlines.
    # Compiled once here instead of on every pattern() call.
    _SRC_PATTERN = re.compile(r'<div.*?post-thumb.*?src="(.*?)".*?</a>', re.S)
    # Text after the last '.' of a URL that download() accepts.
    _IMAGE_EXTENSIONS = ('png', 'jpg')
    # Seconds to wait on a request before giving up instead of hanging.
    _TIMEOUT = 10

    def __init__(self):
        self.url = 'http://python.jobbole.com/category/guide/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36',
        }
        self.create_dir()

    def create_dir(self):
        """Recreate an empty ``demo`` directory in the working directory."""
        # Remove any previous output first so every run starts clean.
        if os.path.exists('demo'):
            shutil.rmtree('demo')
        os.makedirs('demo')

    def get_html(self):
        """Fetch ``self.url`` with ``self.headers`` and return the body text."""
        response = requests.get(
            url=self.url, headers=self.headers, timeout=self._TIMEOUT
        )
        return response.text

    def pattern(self):
        """Return the list of ``src`` values found in the fetched page."""
        return self._SRC_PATTERN.findall(self.get_html())

    def download(self):
        """Save every png/jpg image found by ``pattern()`` into ``demo``.

        Downloading is best-effort: a failure on one image is printed and the
        loop moves on to the next one.
        """
        for item in self.pattern():
            # Strip before inspecting the URL so surrounding whitespace in the
            # attribute value cannot hide an otherwise valid extension.
            img_url = item.strip()
            if img_url.rsplit('.', 1)[-1] not in self._IMAGE_EXTENSIONS:
                continue
            try:
                # The request lives inside the try so that a network error on
                # one image does not abort the remaining downloads.
                resp = requests.get(img_url, timeout=self._TIMEOUT)
                # Do not store an HTTP error page under an image file name.
                resp.raise_for_status()
                file_name = img_url.rsplit('/', 1)[-1]
                with open(os.path.join('demo', file_name), 'wb') as f:
                    f.write(resp.content)
            except Exception as e:
                print(e)
3、呼叫
if __name__ == "__main__":
    # Constructing the downloader resets the demo directory; download() then
    # fetches the page and stores the images it finds.
    downloader = DownPic()
    downloader.download()
1、格式
# General form of a POST call: request headers go in `headers` and the
# payload to submit goes in `data` (both are defined by the surrounding script).
response = requests.post('http://httpbin.org/post', headers=headers, data=data)
2、傳送資料到伺服器端
import requests
if __name__ == "__main__":
    # Fields to submit with the request.
    data = {'email': '[email protected]', 'password': 123456}
    # Request headers carrying a browser-like User-Agent string.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36',
    }
    # Send the data with a POST request.
    response = requests.post(url='https://httpbin.org/post', data=data, headers=headers)
    # Decode the body as UTF-8 before printing it.
    response.encoding = 'utf-8'
    print(response.text)
import requests
if __name__ == "__main__":
    # Form fields sent with the search request.
    data = dict(first='true', pn='1', kd='python')
    # Headers sent with the request: a browser-like User-Agent plus a Referer
    # pointing at the job-list page.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36',
        'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
    }
    url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
    response = requests.post(url, data=data, headers=headers)
    # Print the response body parsed as JSON.
    print(response.json())