Python爬蟲應用視頻課程——筆記
阿新 • • 發佈:2018-09-05
視頻課程鏈接:http://edu.51cto.com/course/14870.html
爬蟲,主講:湯小洋
一、爬蟲簡介
1. 爬蟲是什麽?
? 爬蟲,稱為網頁蜘蛛或網絡機器人,用於自動獲(爬)取互聯網上的信息,本質上就是一段代碼
? 任何一門高級開發語言都可以實現爬蟲,並不只有Python
2. 實現原理
? 通過代碼,模擬瀏覽器向服務器發送HTTP或HTTPS請求,然後對服務器響應的結果進行處理,從中獲取想要的數據
? 三步走:
- 獲取數據:發送請求並接收響應結果
- 處理數據:對響應結果進行處理,篩選出有效數據
- 存儲數據:將有效數據存儲起來
二、基本用法
1. 獲取數據
? 使用urllib模塊模擬瀏覽器發送請求
# Fetch data: request the 51job search-result page and save it to index.html.
def get_data():
    """Download the job-listing page and persist the raw HTML for the parsing step."""
    url = 'https://search.51job.com/list/070200,000000,0000,00,9,99,java%25E5%25BC%2580%25E5%258F%2591,2,1.html'
    # Send a browser User-Agent so the server returns the normal page.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    req = request.Request(url, headers=headers)
    response = request.urlopen(req)  # HTTPResponse
    if response.getcode() == 200:  # only proceed on HTTP 200
        data = response.read()  # bytes
        data = str(data, encoding='gbk')  # the site serves GBK-encoded HTML
        # Write the page to disk so parse_data() can work offline.
        with open('index.html', mode='w', encoding='gbk') as f:
            f.write(data)
2. 處理數據
? 三種方式:
-
字符串解析
使用字符串+正則表達式
-
使用XPath
XPath是一門在XML文檔中查找信息的語言,用來在XML文檔中對元素和屬性進行遍歷。
使用Chrome瀏覽器的開發人員工具,獲取XPath
-
使用第三方模塊BeautifulSoup
Beautiful Soup 是一個可以從HTML或XML文件中提取數據的Python庫
安裝
pip install beautifulsoup4
# Parse data: extract job postings from the saved index.html with BeautifulSoup.
def parse_data():
    """Return a list of job dicts (title/company/addr/salary/pubDate) parsed from index.html."""
    with open('index.html', mode='r', encoding='gbk') as f:
        html = f.read()
    # Use the stdlib 'html.parser' backend so no extra parser install is needed.
    bs = BeautifulSoup(html, 'html.parser')
    # Each posting is a .el row inside #resultList; the first row is the table header.
    divs = bs.select('#resultList .el')
    result = []
    for div in divs[1:]:
        row = {
            'title': div.select('.t1')[0].get_text(strip=True),
            'company': div.select('.t2')[0].get_text(strip=True),
            'addr': div.select('.t3')[0].get_text(strip=True),
            'salary': div.select('.t4')[0].get_text(strip=True),
            'pubDate': div.select('.t5')[0].get_text(strip=True),
        }
        result.append(row)
    return result
3. 存儲數據
3.1 存儲MySQL
# Store the scraped job rows into MySQL.
def save_to_mysql(data):
    """Bulk-insert job dicts into table t_job via named-parameter executemany."""
    config = {
        'host': 'localhost',
        'port': 3306,
        'user': 'root',
        'password': '',
        'database': 'python',
        'charset': 'utf8'
    }
    conn = pymysql.connect(**config)
    try:
        cursor = conn.cursor()
        sql = '''
        insert into t_job
        (title, company, addr, salary, pubDate)
        values
        (%(title)s,%(company)s,%(addr)s,%(salary)s,%(pubDate)s)
        '''
        # executemany maps each dict in data onto the named placeholders.
        cursor.executemany(sql, data)
        conn.commit()
        cursor.close()
    finally:
        # Always release the connection, even if the insert fails.
        conn.close()
3.2 存儲到Excel
? 使用openpyxl模塊操作Excel
? 安裝openpyxl:pip install openpyxl
? 工作簿Workbook
? 工作表Sheet
? 單元格Cell
# Store the scraped job rows into an Excel workbook via openpyxl.
def save_to_excel(data):
    """Write job dicts to 51job.xlsx, one row per job, with a header row."""
    # Create the workbook and insert a sheet at position 0.
    book = Workbook()
    sheet = book.create_sheet('南京Java招聘信息', 0)
    # Header row, then one row per job dict.
    sheet.append(['職位名', '公司名', '工作地點', '薪資', '發布時間'])
    for item in data:
        row = [item['title'], item['company'], item['addr'], item['salary'], item['pubDate']]
        sheet.append(row)
    # Persist to disk.
    book.save('51job.xlsx')
3.3 存儲到Redis
? 安裝redis庫:pip install redis
# Store the scraped job rows into a Redis list.
def save_to_redis(data):
    """Push each job dict onto the 'jobs' list on the Redis server."""
    config = {
        'host': '192.168.2.30',
        'port': 6379,
        'charset': 'utf8'
    }
    r = redis.Redis(**config)
    for item in data:
        # NOTE(review): modern redis-py rejects dict values; item may need
        # serializing first (e.g. json.dumps) — confirm against the redis-py version used.
        r.lpush('jobs', item)
# Read the stored job rows back from Redis.
def read_from_redis():
    """Print the full contents of the 'jobs' list."""
    config = {
        'host': '192.168.2.30',
        'port': 6379,
        'charset': 'utf8',
        'decode_responses': True  # decode bytes to str when reading
    }
    r = redis.Redis(**config)
    # lrange 0..-1 returns the whole list.
    print(r.lrange('jobs', 0, -1))
三、處理JSON數據
from urllib import request
import json
def get_data():
    """Fetch the Douban 'hot movies' JSON feed; return the raw bytes on HTTP 200."""
    url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=400&page_start=0'
    # Browser User-Agent so the endpoint serves the normal JSON response.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    req = request.Request(url, headers=headers)
    response = request.urlopen(req)
    if response.getcode() == 200:
        # bytes; implicitly returns None on any other status code
        return response.read()
def parse_data(html):
    """Decode the Douban JSON payload and print each movie's title and rating."""
    # json.loads accepts both str and bytes.
    data = json.loads(html)
    movies = data['subjects']
    for movie in movies:
        print(movie['title'], movie['rate'])
if __name__ == '__main__':
    # Fetch the feed and print title/rate for every movie.
    parse_data(get_data())
四、爬蟲應用
? 步驟:
- 獲取數據
- 處理數據
- 存儲數據
- 數據可視化
1. 電影評論數據分析
from urllib import request
import json
from datetime import datetime, timedelta
import time
# Fetch data: GET the given Maoyan comments URL; return the raw bytes on HTTP 200.
def get_data(url):
    # Mobile User-Agent: the m.maoyan.com endpoint expects a phone client.
    headers = {
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
    }
    req = request.Request(url, headers=headers)
    response = request.urlopen(req)
    if response.getcode() == 200:
        return response.read()
# Parse data: keep the fields we need from each comment in the 'cmts' array.
def parse_data(html):
    """Return a list of comment dicts extracted from the Maoyan JSON payload."""
    data = json.loads(html)['cmts']
    comments = []
    for item in data:
        comment = {
            'id': item['id'],
            'nickName': item['nickName'],
            'cityName': item.get('cityName', ''),  # cityName can be absent
            'content': item['content'].replace('\n', ' '),  # keep one comment per output line
            'score': item['score'],
            'startTime': item['startTime']
        }
        comments.append(comment)
    return comments
# Store data: page backwards through comments by startTime, appending to comments.txt.
def save_to_txt():
    """Crawl comments from now back to the release date and append them as CSV-ish lines."""
    start_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # current time
    end_time = '2018-08-10 00:00:00'  # stop once we have paged back to this date
    # '%Y-%m-%d %H:%M:%S' strings compare correctly lexicographically.
    while start_time > end_time:
        url = 'http://m.maoyan.com/mmdb/comments/movie/1203084.json?_v_=yes&offset=0&startTime=' + start_time.replace(
            ' ', '%20')
        try:
            html = get_data(url)
        except Exception:
            # Transient network error: back off briefly and retry once.
            time.sleep(1)
            html = get_data(url)
        else:
            time.sleep(0.1)  # be polite between successful requests
        comments = parse_data(html)
        print(comments)
        # Resume from the last (15th) comment's timestamp, minus 1s to avoid duplicates.
        start_time = comments[14]['startTime']
        start_time = datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S') - timedelta(seconds=1)
        start_time = datetime.strftime(start_time, '%Y-%m-%d %H:%M:%S')
        # Open the file once per page instead of once per comment.
        with open('comments.txt', mode='a', encoding='utf-8') as f:
            for item in comments:
                f.write(str(item['id']) + ',' + item['nickName'] + ',' + item['cityName'] + ',' + item[
                    'content'] + ',' + str(item['score']) + ',' + item['startTime'] + '\n')
if __name__ == '__main__':
    # Single-page example kept for reference:
    # url = 'http://m.maoyan.com/mmdb/comments/movie/1203084.json?_v_=yes&offset=15&startTime=2018-09-01%2011%3A10%3A00'
    # comments = parse_data(get_data(url))
    # print(comments)
    save_to_txt()
2. 數據可視化
? pyecharts類庫
2.1 粉絲位置分布
from collections import Counter
from pyecharts import Geo
import json
from pyecharts import Bar
def render():
    """Render the fan-location geo map and the top-20 city bar chart from comments.txt."""
    # Collect every non-empty city name (3rd comma-separated field).
    cities = []
    with open('comments.txt', mode='r', encoding='utf-8') as f:
        rows = f.readlines()
        for row in rows:
            city = row.split(',')[2]
            if city != '':
                cities.append(city)
    # Normalize names against pyecharts' coordinate file; drops unresolvable cities.
    handle(cities)
    # Count occurrences per city, e.g. [('南京', 25), ('北京', 59), ...].
    data = Counter(cities).most_common()
    # Geo scatter map of all cities.
    geo = Geo(
        "《一出好戲》粉絲位置分布",
        "數據來源:貓眼",
        title_color="#fff",
        title_pos="center",
        width=1200,
        height=600,
        background_color="#404a59",
    )
    attr, value = geo.cast(data)
    geo.add(
        "",
        attr,
        value,
        visual_range=[0, 3500],
        visual_text_color="#fff",
        symbol_size=15,
        is_visualmap=True,
    )
    geo.render('粉絲位置分布.html')
    # Bar chart of the 20 most common cities.
    cities_top20 = Counter(cities).most_common(20)
    bar = Bar("《一出好戲》粉絲來源排行榜TOP20", '數據來源:貓眼', title_pos='center', width=1200, height=600)
    attr, value = bar.cast(cities_top20)
    bar.add("", attr, value)
    bar.render('粉絲來源排行榜-柱狀圖.html')
# Fix up city names that cannot be found in pyecharts' coordinate file.
def handle(cities):
    """Alias abbreviated/renamed city names in the coordinate file; drop unknown cities in place.

    Bug fix vs. the original: it detected "no match" with `count == len(data)`,
    which misfires when the matching key happens to be the LAST key in the file;
    an explicit found flag is used instead.
    """
    coord_path = ('C:/Users/User/PycharmProjects/python-spider/venv/Lib/'
                  'site-packages/pyecharts/datasets/city_coordinates.json')
    with open(coord_path, mode='r', encoding='utf-8') as f:
        data = json.loads(f.read())  # str -> dict
    data_new = data.copy()  # work on a copy so iteration over data stays safe
    for city in set(cities):
        found = False
        for k in data:
            if k == city:
                found = True
                break
            if k.startswith(city):  # abbreviated names, e.g. 南京市 written as 南京
                data_new[city] = data[k]
                found = True
                break
            if len(city) >= 3 and k.startswith(city[0:-1]):  # renamed districts, e.g. 溧水縣 -> 溧水區
                data_new[city] = data[k]
                found = True
                break
        if not found:
            # No coordinates at all: remove every occurrence of this city.
            while city in cities:
                cities.remove(city)
    # Overwrite the coordinate file; ensure_ascii=False keeps Chinese readable.
    with open(coord_path, mode='w', encoding='utf-8') as f:
        f.write(json.dumps(data_new, ensure_ascii=False))
if __name__ == '__main__':
    # Build both visualizations from comments.txt.
    render()
2.2 評價星級
from pyecharts import Pie

# Collect every score (5th comma-separated field) from the saved comments.
rates = []
with open('comments.txt', mode='r', encoding='utf-8') as f:
    rows = f.readlines()
    for row in rows:
        rates.append(row.split(',')[4])

# Star buckets: each star level combines a whole score and the half-score below it.
attr = ['五星', '四星', '三星', '二星', '一星']
value = [
    rates.count('5') + rates.count('4.5'),
    rates.count('4') + rates.count('3.5'),
    rates.count('3') + rates.count('2.5'),
    rates.count('2') + rates.count('1.5'),
    rates.count('1') + rates.count('0.5')
]

# Render the star-rating distribution as a pie chart.
pie = Pie("《一出好戲》評分星級", title_pos='center', width=900)
pie.add("", attr, value, is_label_show=True, is_legend_show=False)
pie.render('電影評分-餅圖.html')
2.3 詞雲圖
? jieba(結巴)是一個強大的分詞庫,完美支持中文分詞
? Matplotlib 是一個Python的 2D繪圖庫,可以生成繪圖,直方圖,功率譜,條形圖,錯誤圖,散點圖等
? wordcloud基於Python的詞雲生成類庫,很好用,而且功能強大
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

# Collect every comment body (4th comma-separated field) from the saved comments.
comments = []
with open('comments.txt', mode='r', encoding='utf-8') as f:
    rows = f.readlines()
    for row in rows:
        comment = row.split(',')[3]
        if comment != '':
            comments.append(comment)

# Segment the Chinese text with jieba, then space-join for WordCloud's input format.
comment_after_split = jieba.cut(str(comments), cut_all=False)
words = ' '.join(comment_after_split)

# Words to exclude from the cloud.
stopwords = STOPWORDS.copy()
stopwords.add('電影')
stopwords.add('一出')
stopwords.add('好戲')
stopwords.add('有點')

# Background image defines the cloud's shape (mask).
bg_image = plt.imread('love.jpg')

# font_path must point to a font with CJK glyphs, or Chinese renders as boxes.
wc = WordCloud(width=1024, height=768, background_color='white', mask=bg_image, stopwords=stopwords, max_font_size=400,
               random_state=50, font_path='STKAITI.TTF')
wc.generate_from_text(words)

# Display without axes, then save the rendered image.
plt.imshow(wc)
plt.axis('off')
plt.show()
wc.to_file('詞雲圖.jpg')
Python爬蟲應用視頻課程——筆記