
What is requests in web scraping?


A basic GET request and the common response attributes:

import requests

response = requests.get(url='http://www.baidu.com')

print(response.text)         # response body as text
print(response.headers)      # response headers
print(response.status_code)  # HTTP status code
print(response.encoding)     # encoding of the response
print(response.cookies)      # cookies sent by the server

GET request with parameters

data = {
    'name': 'abc',
    # ...
}

response = requests.get(url='http://www.baidu.com', params=data)
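The dict passed as params is URL-encoded into the query string automatically; response.url shows the final URL that was actually requested:

print(response.url)  # e.g. http://www.baidu.com/?name=abc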

Parsing JSON

import requests

# the target must actually return JSON: httpbin.org/get does, while an HTML
# page such as www.baidu.com would make response.json() raise a decode error
response = requests.get(url='http://httpbin.org/get')
print(response.json())
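response.json() returns the decoded Python object, so fields can be read directly; the 'url' key below is what httpbin.org/get happens to include in its reply:

data = response.json()
print(data['url'])  # http://httpbin.org/get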

Fetching binary data

import requests

response = requests.get(url='http://www.baidu.com')
print(response.content)  # raw bytes of the response body
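response.content is the right attribute for saving non-text resources. A minimal sketch, assuming a placeholder image URL, that writes the bytes straight to a file:

import requests

img_url = 'http://www.example.com/logo.png'  # placeholder image URL
response = requests.get(url=img_url)
with open('logo.png', 'wb') as f:  # 'wb': write in binary mode
    f.write(response.content)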

Advanced usage

File upload

import requests

files = {
    'file': open('XXX', 'rb')  # 'XXX' stands for the path of the file to upload
}
response = requests.post(url='http://www.baidu.com', files=files)
print(response.content)
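baidu.com will not accept an arbitrary upload; httpbin.org/post echoes the posted files back, which makes it a convenient test target. A sketch using the same placeholder path:

import requests

with open('XXX', 'rb') as fp:  # 'XXX' again stands for a real file path
    response = requests.post('http://httpbin.org/post', files={'file': fp})
print(response.text)  # the echoed request, including the uploaded file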

Session persistence (simulating login)

import requests

s = requests.Session()
s.get('http://httpbin.org/cookies/set/number/123456789')  # the server sets a cookie
response = s.get('http://httpbin.org/cookies')             # the same session sends it back
print(response.text)

{
  "cookies": {
    "number": "123456789"
  }
}
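Two plain requests.get calls, by contrast, share nothing: the cookie set by the first request is not sent with the second.

import requests

requests.get('http://httpbin.org/cookies/set/number/123456789')
response = requests.get('http://httpbin.org/cookies')
print(response.text)  # {"cookies": {}}, the cookie did not survive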

Certificate verification

import requests
import urllib3

url = 'https://www.biqudu.com/43_43821/2520338.html'
urllib3.disable_warnings()  # after disabling verification, also silence the warning
response = requests.get(url=url, verify=False)  # skip SSL certificate verification
print(response.text)
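Instead of switching verification off, verify can also point at a CA bundle; the path below is a placeholder:

import requests

# hypothetical path to a PEM bundle containing the site's CA certificate
response = requests.get(url='https://www.biqudu.com/43_43821/2520338.html',
                        verify='/path/to/ca_bundle.pem')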

Proxy authentication

import requests

url = 'https://www.biqudu.com/43_43821/2520338.html'
proxies = {
    'http': 'http://127.0.0.2',
    'https': 'http://user:[email protected]',  # proxy with username and password
}

response = requests.get(url=url, proxies=proxies)
print(response.text)
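SOCKS proxies are supported as well once the optional dependency is installed (pip install requests[socks]); host and port below are placeholders:

proxies = {
    'http': 'socks5://user:[email protected]:1080',
    'https': 'socks5://user:[email protected]:1080',
}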

Handling request timeouts

import requests
from requests.exceptions import Timeout  # covers both connect and read timeouts

url = 'https://www.taobao.com'
try:
    response = requests.get(url=url, timeout=0.1)  # limit how long the request may take
    print(response.status_code)
except Timeout:
    print('request timed out')
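timeout also accepts a (connect, read) tuple, so the two phases can be limited separately:

# allow 3 seconds to establish the connection and 7 seconds to read the response
response = requests.get('https://www.taobao.com', timeout=(3, 7))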

Authentication setup

# some sites demand username/password authentication the moment they are opened

import requests
from requests.auth import HTTPBasicAuth

url = 'https://www.taobao.com'

response = requests.get(url=url, auth=HTTPBasicAuth('user', 'pwd'))
# the tuple shorthand auth=('user', 'pwd') is equivalent
print(response.status_code)
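For endpoints behind HTTP digest authentication, requests ships HTTPDigestAuth with the same calling pattern:

import requests
from requests.auth import HTTPDigestAuth

response = requests.get('https://www.taobao.com', auth=HTTPDigestAuth('user', 'pwd'))
print(response.status_code)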

1. Biquge novels (beginner-level scraping of text content)

Scrape the Biquge novel site: the overall ranking list

1. Request the initial URL and fetch the page source
2. Parse the page source to extract the text content
3. Write all chapter titles of each novel into a txt file

from lxml import etree
import requests

url = 'http://www.biqiuge.com/paihangbang'

response = requests.get(url)
response.encoding = response.apparent_encoding  # guess the charset from the body

html = etree.HTML(response.text)
info = html.xpath("//div[@class='block bd'][1]/ul[@class='tli']/li/a")
for i in info:
    title = i.xpath("./text()")[0]
    urls = i.xpath("./@href")[0]
    urls1 = 'http://www.biqiuge.com' + urls

    with open(title + '.txt', 'w+', encoding='utf-8') as f:
        response1 = requests.get(url=urls1)
        response1.encoding = response1.apparent_encoding
        html1 = etree.HTML(response1.text)
        chapters = html1.xpath("//div[@class='listmain']/dl/dd/a/text()")[6:]
        for chapter in chapters:
            f.write(chapter.strip() + '\n')
        print(title + "------ written successfully")

------------------------------------------------------
Check whether a path exists, and create it automatically if not:

import os

# title, title1, title2 and contents are assumed to come from the parsing code above
if not os.path.exists(title):
    os.mkdir(title)

path = os.path.join(title, title1)

if not os.path.exists(path):
    os.mkdir(path)

with open(os.path.join(path, title2 + '.txt'), 'w+', encoding='utf-8') as f:
    for con in contents:
        f.write(con.strip() + '\n')
    print(title + '---' + title1 + '---' + title2 + '--- written successfully')
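os.makedirs can create the whole nested path in one call, and exist_ok=True makes the existence checks unnecessary:

import os

os.makedirs(os.path.join(title, title1), exist_ok=True)  # creates both levels if missing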

2. Cui Qingcai's blog (scraping by forging header information)

from lxml import etree
import requests

n = 0
with open('cuijincai.txt', 'w+', encoding='utf-8') as f:
    for i in range(1, 10):
        url = 'https://cuiqingcai.com/category/technique/python/page/' + str(i)
        # the site loads its pages dynamically; these page URLs can be found
        # under F12 / Network / XHR
        headers = {
            'Referer': 'https://cuiqingcai.com/category/technique/python',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
        }
        # some sites have anti-scraping checks; setting request headers gets past them
        response = requests.get(url=url, headers=headers)
        html = etree.HTML(response.text)
        all_div = html.xpath("//article[@class='excerpt']")

        for div in all_div:
            title = div.xpath("./header/h2/a/text()")[0]  # title under the current node
            author = div.xpath("./p[@class='auth-span']/span[@class='muted'][1]/a/text()")[0]
            time = div.xpath("./p[@class='auth-span']/span[@class='muted'][2]/text()")[0]
            liulanshu = div.xpath("./p[@class='auth-span']/span[@class='muted'][3]/text()")[0]  # view count
            pinlun = div.xpath("./p[@class='auth-span']/span[@class='muted'][4]/a/text()")[0]   # comment count
            like = div.xpath("./p[@class='auth-span']/span[@class='muted'][5]/a[@id='Addlike']/span[@class='count']/text()")[0] + ' likes'
            n += 1
            f.write("No.{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(n, title, author, time, liulanshu, pinlun, like))

User-Agent, UA for short, is a special string header that lets the server identify the operating system and version the client is using, the CPU type, the browser and its version, the browser rendering engine, the browser language, browser plugins, and so on.

HTTP Referer is part of the header. When a browser sends a request to a web server, it generally carries a Referer that tells the server which page the request came from, and the server can use this information in its processing.
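Without a custom User-Agent, requests announces itself plainly, which is exactly what anti-scraping checks look for. The library exposes the default string it sends:

import requests

print(requests.utils.default_user_agent())  # e.g. python-requests/2.x.y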

https://www.liaoxuefeng.com has anti-scraping measures in place; it can be scraped by setting header information as shown above.
