利用python requests庫模擬登陸知乎

阿新 • • 發佈：2019-02-18

當初搜模擬登陸的時候在知乎上也找到一些內容。

以下是程式碼

import requests
import time
import json
import os
import re
import sys
import subprocess
from bs4 import BeautifulSoup as BS


class ZhiHuClient(object):

    """Helper for talking to Zhihu that owns a single requests Session.

    Written 2015.11.11.

    Usage:

    client = ZhiHuClient()

    # Call login() once on first use; it writes the cookie file.
    # Later runs can skip this step.
    client.login("username", "password")

    # Use the session for any further requests (see the requests docs).
    session = client.getSession()
    """

    # Account-type names: they fill the {0} slot of loginURL and are also
    # the form field name that carries the username.
    TYPE_PHONE_NUM = "phone_num"
    TYPE_EMAIL = "email"
    loginURL = r"http://www.zhihu.com/login/{0}"
    homeURL = r"http://www.zhihu.com"
    captchaURL = r"http://www.zhihu.com/captcha.gif"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Host": "www.zhihu.com",
        "Upgrade-Insecure-Requests": "1",
    }

    # Both files live next to the script (sys.path[0]).
    captchaFile = os.path.join(sys.path[0], "captcha.gif")
    cookieFile = os.path.join(sys.path[0], "cookie")

    def __init__(self):
        """Create the session and, if a cookie file exists, reuse it."""
        os.chdir(sys.path[0])  # make the script's directory the working directory

        self.__session = requests.Session()
        # The class attribute is reached through self so that renaming the
        # class later does not break this line.  The session gets its own
        # copy: assigning the class dict itself would let any later change
        # to session.headers leak into every other ZhiHuClient instance.
        self.__session.headers = requests.structures.CaseInsensitiveDict(self.headers)
        # Reuse a previously saved cookie when there is one.
        self.__cookie = self.__loadCookie()
        if self.__cookie:
            print("檢測到cookie檔案，直接使用cookie登入")
            self.__session.cookies.update(self.__cookie)
            soup = BS(self.open(r"http://www.zhihu.com/").text, "html.parser")
            nameTag = soup.find("span", class_="name")
            if nameTag is not None:
                print("已登陸賬號： %s" % nameTag.getText())
            else:
                # find() returns None when the page has no such element
                # (for example when the saved cookie is no longer valid);
                # report it instead of crashing in the constructor.
                print("cookie已失效，請呼叫login方法重新登入！")
        else:
            print("沒有找到cookie檔案，請呼叫login方法登入一次！")

    # Login
    def login(self, username, password):
        """Log in interactively, asking the user to read a captcha.

        Loops until the server answers with ``r == 0`` and then saves the
        session cookies to ``cookieFile``.

        Server response on a wrong captcha (shape):
        {'errcode': 1991829, 'r': 1, 'data': {'captcha': <message>}, 'msg': <message>}
        Server response on success (shape):
        {'r': 0, 'msg': <message>}
        """
        self.__username = username
        self.__password = password
        usernameType = self.__getUsernameType()
        self.__loginURL = self.loginURL.format(usernameType)
        # Fetch any page to obtain the _xsrf token the login form needs.
        html = self.open(self.homeURL).text
        soup = BS(html, "html.parser")
        _xsrf = soup.find("input", {"name": "_xsrf"})["value"]
        while True:
            # Download a fresh captcha image for every attempt.
            captcha = self.open(self.captchaURL).content
            with open(self.captchaFile, "wb") as output:
                output.write(captcha)
            # The captcha is read by a human.
            print("=" * 50)
            print("已開啟驗證碼圖片，請識別！")
            try:
                self.__showCaptcha()
                captcha = input("請輸入驗證碼：")
            finally:
                # Remove the image even if the prompt is interrupted.
                os.remove(self.captchaFile)
            # Send the POST request.
            data = {
                "_xsrf": _xsrf,
                "password": self.__password,
                "remember_me": "true",
                usernameType: self.__username,
                "captcha": captcha
            }
            res = self.__session.post(self.__loginURL, data=data)
            print("=" * 50)
            # print(res.text) # dump the raw response, for debugging
            result = res.json()  # parse the body once
            if result["r"] == 0:
                print("登入成功")
                self.__saveCookie()
                break
            else:
                print("登入失敗")
                print("錯誤資訊 --->", result["msg"])

    def __showCaptcha(self):
        """Try to open the captcha image in an external viewer.

        On Windows the file path is handed to the shell exactly as before.
        On macOS and other platforms the ``open`` / ``xdg-open`` launchers
        are used.  If the launcher cannot be started the path is printed so
        the user can open the image by hand.
        """
        try:
            if sys.platform.startswith("win"):
                subprocess.call(self.captchaFile, shell=True)
            elif sys.platform == "darwin":
                subprocess.call(["open", self.captchaFile])
            else:
                subprocess.call(["xdg-open", self.captchaFile])
        except OSError:
            print("無法自動開啟驗證碼圖片，請手動開啟：", self.captchaFile)

    def __getUsernameType(self):
        """Return the account type for the stored username.

        According to the original author's testing, the site treats an
        all-digit username as phone_num and anything else as email.
        """
        if self.__username.isdigit():
            return self.TYPE_PHONE_NUM
        return self.TYPE_EMAIL

    def __saveCookie(self):
        """Serialize the session cookies (as a dict) to ``cookieFile`` as JSON."""
        # Collect the cookies first so a failure here cannot leave behind a
        # truncated cookie file.
        cookies = self.__session.cookies.get_dict()
        with open(self.cookieFile, "w") as output:
            json.dump(cookies, output)
        print("=" * 50)
        print("已在同目錄下生成cookie檔案：", self.cookieFile)

    def __loadCookie(self):
        """Return the dict stored in ``cookieFile``.

        Returns None when the file does not exist or does not contain
        valid JSON.
        """
        if os.path.exists(self.cookieFile):
            print("=" * 50)
            with open(self.cookieFile, "r") as f:
                try:
                    return json.load(f)
                except ValueError:
                    # json.JSONDecodeError is a ValueError; an empty or
                    # damaged file is treated like a missing one.
                    print("cookie檔案已損壞，將忽略：", self.cookieFile)
        return None

    def open(self, url, delay=0, timeout=10):
        """GET ``url`` through the session and return the Response.

        ``delay`` seconds are slept first when it is non-zero; ``timeout``
        is passed to the request.
        """
        if delay:
            time.sleep(delay)
        return self.__session.get(url, timeout=timeout)

    def getSession(self):
        """Return the underlying requests Session."""
        return self.__session

if __name__ == '__main__':
    # Demo: log in, then fetch the home page through the shared session.
    client = ZhiHuClient()

    # login() has to be called once on the first run to create the cookie
    # file; later runs can skip this step.
    client.login('xxxxxx','xxxxxxxx')
    # client.login("username", "password")

    # Use this session for any further requests (see the requests docs).
    session = client.getSession()
    r = session.get('http://www.zhihu.com')
    # The response is bound to `r`; the original printed the undefined
    # name `s`, which raised NameError.
    print(r.text)

來自知乎：

這模擬登陸的程式碼可以作為參考。

最後是關於獲取天氣預報的爬蟲程式碼：

import urllib.request
import re
def GetHtmlCode(url, encoding='gbk'):
    """Download ``url`` and return its body decoded as text.

    Args:
        url: address to open with ``urllib.request.urlopen``.
        encoding: codec used to decode the body; defaults to ``'gbk'``,
            which is what the original code always used.

    Returns:
        The decoded page as ``str``.
    """
    # The with-block closes the response even when read() or decode()
    # raises, which the explicit close() call did not.
    with urllib.request.urlopen(url) as page:
        return page.read().decode(encoding)

def FindGXUrl(homePage):
    """Return the absolute ``.htm`` URL of the province link in ``homePage``.

    The anchor is located by its link text first, then the URL is pulled
    out of that anchor's markup.  Raises AttributeError (from calling
    ``.group()`` on None) when either pattern does not match.
    """
    anchor_pattern = re.compile(r'<a href="[\S]+" rel="[\S]+">江蘇</a>')
    url_pattern = re.compile(r'http://[\w\./]+\.htm')

    anchor_html = anchor_pattern.search(homePage).group()
    return url_pattern.search(anchor_html).group()

def FindNNUrl(GXPage):
    """Return the site-relative ``.htm`` path of the city link in ``GXPage``.

    The anchor is located by its link text first, then the path (starting
    at the first ``/``) is pulled out of that anchor's markup.  Raises
    AttributeError (from calling ``.group()`` on None) when either pattern
    does not match.
    """
    anchor_pattern = re.compile(r'<a href="[\S]+?" title="[\S]+?">南京</a>')
    path_pattern = re.compile(r'/[\S]+?\.htm')

    anchor_html = anchor_pattern.search(GXPage).group()
    return path_pattern.search(anchor_html).group()

def GetWeatherBlockList(WeatherPage):
    """Return every ``<li class="week-detail-now" >...</li>`` block in the page.

    Each block is matched non-greedily up to the first ``</li>`` and may
    span several lines.  An empty list is returned when nothing matches.
    """
    block_pattern = re.compile(r'<li class="week-detail-now" >[\s\S]+?</li>')
    return block_pattern.findall(WeatherPage)

class Weather:
    """One day's forecast: date, day/night conditions and low/high temperature."""

    # Class-level defaults; every instance overwrites them in __init__.
    date = daytime = nighttime = temperatureL = temperatureH = ''

    def __init__(self, d, dT, nT, tL, tH):
        """Store the date, daytime text, nighttime text, low and high temperature."""
        (self.date, self.daytime, self.nighttime,
         self.temperatureL, self.temperatureH) = d, dT, nT, tL, tH

    def print(self):
        """Write the forecast to standard output."""
        fields = (
            self.date,
            self.daytime,
            self.nighttime,
            self.temperatureL,
            self.temperatureH,
        )
        print('\n%s：白天：%s，夜間：%s，\n最低溫度：%sC，最高溫度：%sC\n' % fields)



def MakeWeatherInfo(block):
    """Parse one forecast ``<li>`` block into a ``Weather`` object.

    Args:
        block: HTML text of a single forecast block.

    Returns:
        ``Weather(date, daytime, nighttime, low, high)`` where every field
        is a string taken from the markup.

    Raises:
        AttributeError: a required pattern is missing from ``block``
            (``re.search`` returned None).
        IndexError: fewer than two numbers were found in the temperature
            range.
    """
    # Date, two digits each for month and day.
    dA_re=r'[\d]{2}月[\d]{2}日'
    dA=re.search(dA_re,block).group()

    # Daytime condition: match the whole <b> element, then strip the label
    # markup and the closing tag so only the condition text remains.
    dT_re=r'<b><font class="gray">白天：</font>.{1,6}</b>'
    dT=re.search(dT_re,block).group()
    dT=re.sub(r'<b>.+</font>','',dT)
    dT=re.sub(r'</b>','',dT)

    # Nighttime condition, same approach.
    nT_re=r'<b><font class="gray">夜間：</font>.{1,6}</b>'
    nT=re.search(nT_re,block).group()
    nT=re.sub(r'<b>.+</font>','',nT)
    nT=re.sub(r'</b>','',nT)

    # Temperature range: the "blue" value is passed as the low, the "red"
    # value as the high.
    t_re=r'<font class="blue">.{0,4}</font>～<font class="red">.{0,4}</font>'
    t=re.search(t_re,block).group()
    # Keep an optional leading minus sign; matching digits only turned
    # sub-zero temperatures such as -3 into 3.
    t=re.findall(r'-?[\d]+',t)
    return Weather(dA,dT,nT,t[0],t[1])
# Script body: runs at import time and performs three HTTP requests in
# sequence, drilling down home page -> province page -> city page.
homePage=GetHtmlCode("http://tianqi.2345.com/")
# Absolute URL of the province page, taken from the home page.
gx_url=FindGXUrl(homePage)
GXPage=GetHtmlCode(gx_url)
# FindNNUrl returns a site-relative path, so the host is prepended here.
nn_url_suffix=FindNNUrl(GXPage)
nn_url='http://tianqi.2345.com'+nn_url_suffix

NNPage=GetHtmlCode(nn_url)
weatherList=GetWeatherBlockList(NNPage)  # every "week-detail-now" <li> block found on the city page

# Only the first two blocks are used; IndexError is raised here if the
# page yielded fewer than two.
weather1=MakeWeatherInfo(weatherList[0])
weather2=MakeWeatherInfo(weatherList[1])
weather1.print()
weather2.print()

思路很簡單。但主要也是正則表示式的書寫，還是得勤加練習才對。

用Python實現爬蟲的確非常簡單。但是剛接觸scrapy之類的框架時，就發現在Python3下連安裝都會遇到各種error，累覺不愛。

現在都是單執行緒。以後能做多執行緒和分散式爬蟲的時候再回來補充吧。

之後幾個月打算研究django，但是這估計也是個很大的坑呢233333.還得學習SQL語言balabala..挑戰性很足。

如果學到什麼東西再往部落格裡放吧。記錄一下學習的過程。

大學實在是太枯燥了。也許是我不太喜歡社交呢233333。

利用python requests庫模擬登陸知乎

當初搜模擬登陸的時候在知乎上也找到一些內容。以下是程式碼 import requests import time import json import os import re import sys import subprocess from bs4 import Be

Python爬蟲之模擬登陸知乎

在chrome瀏覽器下抓取登陸過程的包（注意把Preserve log勾上）：表單的結構主要包括_xsrf, password, phone_num 我們要找到_xsrf的值，重新載入zhihu.

python爬蟲模擬登陸知乎網

自從暑假學了大概一個月左右的爬蟲，開學之後就沒怎麼搞爬蟲了，當時也就學到scrapy框架就放下了，大致瞭解了一下框架，不是太理解，但是在這之前本人的爬蟲水平也僅僅侷限於爬取簡單頁面，爬取動態頁面也稍微瞭解下，但是一直沒有學模擬登陸，因為當時怎麼也搞不懂模擬登陸是

利用Python requests庫從網上下載txt檔案時多出一個CR的處理

問題描述讀1 的Reading word lists小節時，發現需要從thinkpython2/code/words.txt上下載words.txt檔案。我不想利用複製-貼上的方法構造該檔案，想到之前學過的爬蟲技術，於是寫下如下程式碼： import requests r =

利用Python requests庫實現cas認證

1.準備工作-背景知識 1.1 requests庫簡介： python有很多可以用來測試介面的模組，個人覺得，requests庫是最好用的，在Robot Framwork裡，它的測試庫requestsLibrary，也是基於requests寫的。 1.1.1 安裝：作為第三方模組，使用前，需要安裝，最簡單

Scrapy 模擬登陸知乎--抓取熱點話題

折騰了將近兩天，中間數次想要放棄，還好硬著頭皮搞下去了，在此分享出來，希望有同等需求的各位能少走一些彎路。原始碼放在了github上，歡迎前往檢視。若是幫你解決了問題，或者給了你啟發，不要吝嗇給加一星。工具準備在開始之前，請確保 scrpay 正確安裝，手頭有一款簡潔

使用OKHttp模擬登陸知乎，兼談OKHttp中Cookie的使用！

本文主要是想和大家探討技術，讓大家學會Cookie的使用，切勿做違法之事！很多Android初學者在剛開始學習的時候，或多或少都想自己搞個應用出來，把自己學的十八般武藝全都用在這個APP上，其實這個想法很好，專案驅動學習，效率更高，這是大學老師教給我的。可是一個APP，如果純

模擬登陸知乎，2016/10/23可用

登入這東西，目前理解的還是比較淺，就是說想辦法搞到cookie就好。最簡單就是自己用瀏覽器登入上，從開發者工具裡把cookie複製。。。今天說的呢，既然是模擬登陸，就少不了賬號密碼這些環節。首先開啟知乎https://www.zhihu.com/#signin，開發者工

【scrapy】模擬登陸知乎

這個網上有個通用的教程，然而為這個教程已經花費了太多時間進行除錯，和知乎上的朋友交流，很多人也是這個地方遇到了問題，最後的結果。。是放棄了crawlspider。。先貼下這個連結。。。http://ju.outofmemory.cn/entry/105646 謹慎。。

用selenium模擬登陸知乎賬號，處理登陸介面隨機出現驗證碼視窗的問題

import requests from selenium import webdriver from bs4 import BeautifulSoup import time while True: #option = webdriver.Chr

python爬蟲系列(2.3-requests庫模擬使用者登入)

一、模擬登入拉鉤網 import re import requests class LoginLaGou(object): """ 模擬登入拉鉤網 """

Python 爬蟲-模擬登入知乎-爬取拉勾網職位資訊

用Python寫爬蟲是很方便的,最近看了xlzd.me的文章，他的文章寫的很到位，提供了很好的思路。因為他的文章部分程式碼省略了。下面是基於他的文章的三個程式碼片段: 基於Python3,Python2的話需要修改下input輸入函式和print的用法。爬取豆瓣電影top250 爬取拉勾網職位資訊模擬

Python3 模擬登入知乎（requests）

# -*- coding: utf-8 -*- """ 知乎登入分為兩種登入一是手機登入 API : https://www.zhihu.com/login/phone_num 二是郵箱登入 API : https://www.zhihu.c

[Python]網路爬蟲（三）：使用cookiejar管理cookie 以及模擬登入知乎

大家好哈，上一節我們研究了一下爬蟲的異常處理問題，那麼接下來我們一起來看一下Cookie的使用。為什麼要使用Cookie呢？ Cookie，指某些網站為了辨別使用者身份、進行session跟蹤而儲存在使用者本地終端上的資料（通常經過加密）比如說有些網站需要登入後才

Python 模擬登入知乎

前言前天看到一個爬取了知乎50多萬評論的帖子，羨慕的同時也想自己來嘗試一下。看看能不能獲取一些有價值的資訊。必備知識點下面簡單的來談談我對常見的防爬蟲的一些技巧的理解。 headers 現在很多伺服器都對爬蟲進行了限制，有一個

python--python3爬蟲之模擬登入知乎

程式碼在python3環境下測試通過： from bs4 import BeautifulSoup import requests url = 'http://www.zhihu.com' login_url = url+'/login/email' captcha_

20170717_python爬蟲之requests+cookie模擬登陸

ssi alert 之前 lose net .html .net 裝載 onos 在成功登陸之前,失敗了十幾次。完全找不到是什麽原因導致被網站判斷cookie是無效的。直到用了firefox的httpfox之後才發現cookie裏還有一個ASP.NET_SessionI

selenium 模擬登入知乎和微博

sleep https epo element select selenium clas .com -c pip install selenium __author__ = ‘admin‘ __date__ = 2017 / 11 / 3 from selenium im

python requests庫學習筆記（下）

mail 接收緩存 nbsp 0.10 基本 eat agen 維基百科 1.請求異常處理請求異常類型：請求超時處理（timeout）：實現代碼： import requestsfrom requests import exceptions #引

利用python requests庫模擬登陸知乎

相關推薦