python--- bs4和requests模組

阿新 • • 發佈：2018-11-08

1.bs4模組

bs4庫是解析、遍歷、維護“標籤樹”的功能庫。通俗一點說就是：bs4庫把html原始碼重新進行了格式化，從而方便我們對其中的節點、標籤、屬性等進行操作。

獲取標籤內容

from bs4 import  BeautifulSoup

# Build the soup inside a `with` block so the HTML file handle is closed
# once parsing is done instead of being leaked.
with open('westos.html') as html_file:
    soup = BeautifulSoup(html_file, 'html.parser')
# Attribute-style access (soup.<tag>) returns only the first matching tag.
print(soup.title)
print(type(soup.title))
print(soup.p)

在這裡插入圖片描述

獲取標籤屬性

# Show every attribute of the first <p> tag at once.
print(soup.p.attrs)
# Look up individual attributes of that tag by name.
for attr_name in ('id', 'class', 'style'):
    print(soup.p[attr_name])

# Item assignment on a tag rewrites the attribute in the parsed tree.
soup.p['id'] = 'modifyid'
first_paragraph = soup.p
print(first_paragraph)
print(type(first_paragraph))

獲取標籤的文字內容

from bs4 import  BeautifulSoup

# Build the soup inside a `with` block so the HTML file handle is closed
# once parsing is done instead of being leaked.
with open('westos.html') as html_file:
    soup = BeautifulSoup(html_file, 'html.parser')
# List what the <title> tag object offers, then read its text and name.
print(dir(soup.title))
print(soup.title.text)
print(soup.title.string)
print(soup.title.name)
# Tags can be chained to walk down the tree: <head> -> <title>.
print(soup.head.title.string)

操作子節點

# Build the soup inside a `with` block so the HTML file handle is closed
# once parsing is done instead of being leaked.
with open('westos.html') as html_file:
    soup = BeautifulSoup(html_file, 'html.parser')
# .contents is the list of direct children; .children iterates over the
# same nodes lazily, so printing it shows an iterator object.
print(soup.head.contents)
print(soup.head.children)
for el in soup.head.children:
    print('--->', el)

面向物件的匹配

import re
from bs4 import BeautifulSoup

# Imports come first so that `re` is defined before its first use below.
# Build the soup inside a `with` block so the HTML file handle is closed
# once parsing is done instead of being leaked.
with open('westos.html') as html_file:
    soup = BeautifulSoup(html_file, 'html.parser')

# Find every tag with a given name:
# res1 = soup.find_all('p')
# print(res1)
# A compiled regex is matched against tag names with search(), so r'd+'
# selects every tag whose name contains a "d".
res1 = soup.find_all(re.compile(r'd+'))
print(res1)

# Compiling a pattern once lets it be reused for faster repeated lookups:
# pattern = r'd.+'
# pattern = re.compile(pattern)
# print(re.findall(pattern, 'dog hello d'))

# Narrow the search by attribute value (exact string or regex).
print(soup.find_all('p', id='test1'))
print(soup.find_all('p', id=re.compile(r'test\d{1}')))
# `class` is a Python keyword, hence the `class_` spelling.
print(soup.find_all('p', class_="class1"))
print(soup.find_all('p', class_=re.compile(r'class\d{1}')))
# A list of filters matches tags that satisfy any one of them.
print(soup.find_all(['p', 'div']))
print(soup.find_all([re.compile('^d'), re.compile('p')]))


# Match on text content. `string=` is the current name of the argument
# that older Beautiful Soup releases called `text=`.
print(soup.find_all(string='文章標題'))
print(soup.find_all(string=re.compile('標題')))
print(soup.find_all(string=[re.compile('標題'), 'Title']))

CSS匹配

import re
from bs4 import  BeautifulSoup
# Build the soup inside a `with` block so the HTML file handle is closed
# once parsing is done instead of being leaked.
with open('westos.html') as html_file:
    soup = BeautifulSoup(html_file, 'html.parser')
# Common CSS selectors: tag (div), class (.class1), id (#idname),
# attribute (p[type="text"]).
# Tag selector
res1 = soup.select("p")
print(res1)
# Class selector
res2 = soup.select(".class2")
print(res2)
# Id selector
res3 = soup.select("#test1")
print(res3)
# Attribute selectors: match on a value, or on mere presence of the attribute.
print(soup.select("p[id='test1']"))
# The attribute name must not be quoted; "p['class']" is not valid CSS.
print(soup.select("p[class]"))

2.獲取豆瓣最新電影的id號和電影名稱

import  requests
from bs4 import BeautifulSoup

url = "https://movie.douban.com/cinema/nowplaying/xian/"
# 1) Fetch the page. Without a timeout requests may wait indefinitely, and
#    raise_for_status() stops the script on an HTTP error instead of
#    silently parsing an error page.
# NOTE(review): some sites reject the default python-requests User-Agent;
# confirm whether a browser-like `headers=` is needed for this URL.
response = requests.get(url, timeout=10)
response.raise_for_status()
content = response.text
# 2) Parse the page and pull out the id and title of each movie.
soup = BeautifulSoup(content, 'lxml')
# First locate every <li class="list-item">; each one describes a movie.
nowplaying_movie_list = soup.find_all('li', class_='list-item')
# Build one dict per movie from the attributes carried on the <li> tag:
# [{'title': ..., 'id': ..., 'actors': ..., 'director': ...}, ...]
# item['data-title'] reads the value of the tag's data-title attribute.
movies_info = [
    {
        'title': item['data-title'],
        'id': item['id'],
        'actors': item['data-actors'],
        'director': item['data-director'],
    }
    for item in nowplaying_movie_list
]

print(movies_info)

在這裡插入圖片描述

4.獲取指定電影的影評資訊

# 目標:
#      1). 爬取某一頁的評論資訊；
#      2).爬取某個電影的前10頁評論資訊；
#      3). 獲取所有電影的評論資訊;
import threading

import requests
from bs4 import  BeautifulSoup
# #      1). 爬取某一頁的評論資訊；
def getOnePageComment(id, pageNum):
    """Fetch one page of short comments for a movie and store their text.

    Args:
        id: Douban subject id of the movie, interpolated into the URL.
        pageNum: 1-based page number; each page holds 20 comments.

    Returns:
        None. The concatenated comment text of the page is appended to the
        module-level ``comments`` string.
    """
    # 1) Derive the `start` offset from the page number:
    #    page 1: .../subject/26425063/comments?start=0&limit=20&sort=new_score&status=P
    #    page 2: .../subject/26425063/comments?start=20&limit=20&sort=new_score&status=P
    #    page 3: .../subject/26425063/comments?start=40&limit=20&sort=new_score&status=P
    start = (pageNum - 1) * 20
    url = "https://movie.douban.com/subject/%s/comments?start=%s&limit=20&sort=new_score&status=P" %(id, start)
    # 2) Download the comment page; the timeout keeps a stalled request
    #    from blocking its thread forever.
    content = requests.get(url, timeout=10).text
    # 3) Parse it. Every comment sits in a <span class="short"> tag.
    soup = BeautifulSoup(content, 'lxml')
    commentsList = soup.find_all('span', class_='short')
    # Join the text of all comment tags on this page into one string.
    pageComments = "".join(commentTag.text for commentTag in commentsList)
    print("%s page" %(pageNum))
    global comments
    # `comments += ...` is a read-modify-write on shared state; the lock
    # prevents concurrent threads from losing each other's update.
    with comments_lock:
        comments += pageComments


# 2) Fetch the first 10 comment pages of one movie.
id = '26425063'
comments = ''
# Guards `comments`, which every worker thread appends to.
comments_lock = threading.Lock()
threads = []
# One thread per page, pages 1..10. Pages are appended in the order the
# threads finish, which is not necessarily page order.
for pageNum in range(1, 11):
    t = threading.Thread(target=getOnePageComment, args=(id, pageNum))
    threads.append(t)
    t.start()
# Wait for every worker before the main thread writes the result.
for thread in threads:
    thread.join()
print("執行結束")
with open("%s.txt" %(id), 'w') as f:
    f.write(comments)

在這裡插入圖片描述

5.資料清洗

import re
import wordcloud
import jieba

# 1. Clean the scraped comments: keep only Chinese or English text and drop
#    everything else (commas, full stops, emoticons, ...).
with open('./doc/26425063.txt') as f:
    comments = f.read()
# The regex picks out runs of characters in U+4E00..U+9FA5 or runs of
# ASCII letters; whatever lies between those runs is discarded.
pattern = re.compile(r'([\u4e00-\u9fa5]+|[a-zA-Z]+)')
deal_comments = pattern.findall(comments)
# Glue the surviving fragments back together with no separator.
newComments = ''.join(deal_comments)
print(newComments)

6.詞雲分析

import jieba
import  wordcloud
import  numpy as np
from PIL import Image

text = "馬雲曾公開表態稱對錢沒興趣稱其從來沒碰過錢上了微博熱搜"

# 2) The default split yields '微博熱', '搜', which is wrong; individual
#    words can be emphasised one at a time:
# jieba.suggest_freq(('微博'),True)
# jieba.suggest_freq(('熱搜'),True)
# Here every word listed in the user dictionary file is emphasised instead.
jieba.load_userdict('./doc/newWord')
# 1) Segment the Chinese text; lcut returns a list, cut returns a generator.
result = jieba.lcut(text)
print("切分結果:", result)

# 4) Draw the word cloud.
wc_options = {
    'background_color': 'snow',
    'font_path': './font/msyh.ttf',  # font used when the data is Chinese
    'min_font_size': 5,    # smallest font size in the image
    'max_font_size': 15,   # largest font size in the image
    'width': 200,          # width of the generated image
}
wc = wordcloud.WordCloud(**wc_options)
joined_words = ",".join(result)
wc.generate(joined_words)
wc.to_file('./doc/douban.png')

7.電影評論詞雲分析

import jieba
import  wordcloud
import  numpy as np
# 在python2中處理影象，Image； python3中如果處理影象， 千萬不要安裝Image, 安裝pillow
from PIL import Image

# 1) Read the saved comments and segment them; lcut returns a list, cut
#    returns a generator. The `with` block closes the file after reading.
with open('./doc/26425063.txt') as comment_file:
    result = jieba.lcut(comment_file.read())

# 2) Open the mask picture and convert it to an array while the image file
#    is still open; the `with` block then releases the file handle.
with Image.open('./doc/mao.jpg') as imageObj:
    cloud_mask = np.array(imageObj)

# 4) Draw the word cloud.
# NOTE(review): wordcloud documents that width/height are ignored when a
# mask is supplied -- confirm whether `width` has any effect here.
wc = wordcloud.WordCloud(
    mask = cloud_mask,
    background_color='black',
    font_path='./font/msyh.ttf',    # font used when the data is Chinese
    min_font_size=5,    # smallest font size in the image
    max_font_size=50,   # largest font size in the image
    width=500,  # width of the generated image
)
wc.generate(",".join(result))
wc.to_file('./doc/douban.png')

python--- bs4和requests模組

1.bs4模組 bs4庫是解析、遍歷、維護、“標籤樹“的功能庫。通俗一點說就是： bs4庫把html原始碼重新進行了格式化，從而方便我們對其中的節點、標籤、屬性等進行操作。獲取標籤內容 from bs4 import BeautifulSoup # 構造物件

利用python wxpy和requests寫一個自動應答微信機器人例項

在做測試的過程中，同事們經常需要獲取一個賬戶的token和個人資訊，我自己利用spring boot寫了一個介面，但是對於APP測試同學來說不是很方便，因為需要複製這個token到APP裡面去，所以我做了一個微信自動應答的機器人，來實現這個需求。思路如下：利用wxpy拿到對方發來的資訊，然後簡

Python中的requests模組

Python中的Requests模組 Requests模組是一個用於網路訪問的模組，類似的模組有urllib，urllib2，httplib，httplib2等，但由於其訪問http時的人性化，便於操作，深受人們喜歡。在爬蟲中常使用的模組：獲取網頁內容的----- urlli

python - 怎樣使用 requests 模組傳送http請求

最近在學python自動化，怎樣用python發起一個http請求呢？通過了解 request 模組可以幫助我們發起http請求步驟：　　1.首先import 下 request 模組　　2.然後看請求的方式，選擇對應的請求方法　　3.接受返回的報文資訊例子：get 方法　　imp

python3 urllib和requests模組

urllib模組是python自帶的，直接呼叫就好，用法如下： 1 #處理get請求，不傳data，則為get請求 2 import urllib 3 from urllib.request import urlopen 4 from urllib.parse

python 手動給requests模組新增urlretrieve下載檔案方法！

requests模組的前代是urllib模組，傳入引數headers、cookie、data什麼的肯定是requests好使，但是卻沒有urllib.request.urlretrieve這個方法，urlretrieve(url, filename=None,reportho

孤荷凌寒自學python第六十七天初步瞭解Python爬蟲初識requests模組

孤荷凌寒自學python第六十七天初步瞭解Python爬蟲初識requests模組（完整學習過程螢幕記錄視訊地址在文末）從今天起開始正式學習Python的爬蟲。今天已經初步瞭解了兩個主要的模組： requests BeautifulSoup 一

Python爬蟲——利用requests模組爬取妹子圖

近期學了下python爬蟲，利用requests模組爬取了妹子圖上的圖片，給單身狗們發波福利，哈哈！順便記錄一下第一次發部落格。話不多說，進入正題開發環境 python 3.6 涉及到的庫 requests lxml 先上一波爬取的截圖

python,perl和R模組的安裝

Python，Perl和R，作為生物資訊的主要語言，應用十分廣泛；主要原因就是這三種語言有豐富的包，這些包可以運用特定的方法，實現特定的功能！一.Python python 是這三種的支援面向物件最好的語言，第三方包也是最多的。 1.Python包安裝方式一 pip in

python 包和匯入模組

包包的構成三要素：目錄資料夾，xxx.py，__init__.py。 import包時，程式會先執行__init__.py檔案，用於初始化包的屬性和方法。若__init__.py為空時，預設初始化包內所有的屬性和方法。匯入模組如下檔案結構模組間

Python爬蟲之requests模組

獲取響應資訊 import requests response = requests.get('http://www.baidu.com') print(response.status_code) # 狀態碼 print(response.url) # 請求url print(respon

python-requests資料驅動延伸 python-requests模組的講解和應用

在 python-requests模組的講解和應用基礎上進行資料驅動的延伸 task_01_requests.py #-*- coding:utf-8 -*- #task_01_requests.py # 1：利用requests模組，編寫一個可以完成http

Python爬蟲【urllib3模組】和【requests模組】

前面介紹了urllib為啥還要引入urllib3模組？原因是：urllib3是比urllib更好用的API。需要自行安裝。在Pycharm的Terminal中輸入：pip install urllib3。例：urllib3中的PoolManager()模組使用ur

python網路程式設計requests和selenium模組

import requests #需要命令列下pip install requests安裝 req = requests.get("http://httpbin.org/get", headers = {"User-Agent" : "ua"}, proxies = {"http" : "i

Python中requests庫模組和lxml模組安裝問題（windows下）

1.requests模組安裝：第一次匯入requests模組，會報ImportError: No module named requests的錯。這就是沒有成功匯入requests模組。 2.lxml模組安裝問題： 1.進入http://www.lfd

Python使用lxml模塊和Requests模塊抓取HTML頁面的教程

有時 oms 世界 tel 4.0 取出 itl imp syntax Web抓取Web站點使用HTML描述，這意味著每個web頁面是一個結構化的文檔。有時從中獲取數據同時保持它的結構是有用的。web站點不總是以容易處理的格式，如 csv 或者 json 提供它們的數據

25-3 requests模組的cookie和代理操作

一.基於requests模組的cookie操作引言：有些時候，我們在使用爬蟲程式去爬取一些使用者相關資訊的資料（爬取張三“人人網”個人主頁資料）時，如果使用之前requests模組常規操作時，往往達不到我們想要的目的，例如： 1 #!/usr/bin/env python 2 #

python bs4模組 BeautifulSoup 學習筆記

bs4 模組的 BeautifulSoup 可以用來爬取html頁面的內容，配合requests庫可以寫簡單的爬蟲。 1、利用requests請求html頁面，獲取HTML頁面內容 import requests from bs4 import BeautifulSoup

python - requests模組

requests模組 # requests 學習 # 安裝 # pip install requests # 匯入模組 import requests # 訪問網頁 # res = request.get('URL') # res = requests.get('http://127.0.0.1

[py]python的time和datetime模組獲取星期幾

import time import datetime #今天星期幾 today=int(time.strftime("%w")) print today #某個日期星期幾 anyday=datetime.datetime(2013,05,13).strftime("%w") print anyday

python--- bs4和requests模組

1.bs4模組

2.獲取豆瓣最新電影的id號和電影名稱

4.獲取指定電影的影評資訊

5.資料清洗

6.詞雲分析

7.電影評論詞雲分析

相關推薦