
Crawling Dianping shop data with Python and the Scrapy framework~

Sharing my journey of scraping data with Python, starting from zero. I hope it helps others who are also starting from scratch~~ keep at it!

First, my development environment:

Computer: macOS Sierra 10.12.6; Editor: PyCharm + Terminal

My Mac ships with Python 2.7, so I downloaded Python 3.6 and used that version for this write-up, adding the new Python to the PATH environment variable. It normally comes bundled with pip. Open a terminal, cd to the directory containing pip, and run pip install scrapy

Open a terminal, cd to the directory where you want to create the project, and run scrapy startproject Test


This automatically generates a set of files in that directory; from here on, you only need to modify a few of them.

Open the project in PyCharm; the directory structure looks roughly like this:


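This is a sketch of the standard layout that scrapy startproject generates (details may vary slightly by Scrapy version):

Test/
    scrapy.cfg          # deploy configuration
    Test/               # the project's Python module
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py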
The root directory carries the project name you chose, and it contains a spiders folder with an __init__.py inside.

The files under the root directory are __init__.py, items.py, middlewares.py, pipelines.py, and settings.py.

cd into the Test directory and run scrapy genspider ShopSpider "dianping.com"

This generates a ShopSpider.py file under the spiders directory.
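For reference, the skeleton that genspider produces should look roughly like this (the exact template varies with the Scrapy version):

# -*- coding: utf-8 -*-
import scrapy


class ShopspiderSpider(scrapy.Spider):
    name = 'ShopSpider'
    allowed_domains = ['dianping.com']
    start_urls = ['http://dianping.com/']

    def parse(self, response):
        pass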


With all the files in place, visit the site you want to crawl and inspect the tag structure of its page source.

Based on the data you want to scrape, modify items.py:

import scrapy


class TestItem(scrapy.Item):
    # shop name
    shop_name = scrapy.Field()
    # cover image
    shop_img = scrapy.Field()
    # star rating
    shop_star = scrapy.Field()
    # number of reviews
    shop_evaluation = scrapy.Field()
    # average price per person
    shop_price = scrapy.Field()
    # cuisine type
    shop_type = scrapy.Field()
    # address 1
    shop_address1 = scrapy.Field()
    # detailed address
    shop_address2 = scrapy.Field()
    # recommended dish 1
    shop_food1 = scrapy.Field()
    # recommended dish 2
    shop_food2 = scrapy.Field()
    # recommended dish 3
    shop_food3 = scrapy.Field()
    # taste score
    shop_sweet = scrapy.Field()
    # environment score
    shop_environment = scrapy.Field()
    # service score
    shop_server = scrapy.Field()

Modify the spider file ShopSpider.py:

# -*- coding: utf-8 -*-
import scrapy
from Test.items import TestItem


class ShopSpider(scrapy.Spider):
    """
    Purpose: crawl Dianping Shenyang food shop data
    """
    # spider name
    name = 'ShopSpider'
    # allowed domains
    allowed_domains = ['dianping.com']
    # base url
    url = 'http://www.dianping.com/shenyang/ch10/g2714p'
    offset = 1
    # the url to start crawling from
    start_urls = [url + str(offset)]

    def parse(self, response):
        for each in response.xpath("//div[@class='shop-list J_shop-list shop-all-list']/ul/li"):
            # initialize the item object
            item = TestItem()
            item['shop_name'] = each.xpath(".//img/@title").extract()[0]

            # split the image url
            imgorl = each.xpath(".//img/@src").extract()[0]
            img = imgorl.split('%')[0]
            item['shop_img'] = img

            item['shop_star'] = each.xpath(".//div[@class='comment']/span/@title").extract()[0]

            # review count and average price: count loop passes to find both values under the same tag
            price_tag = 0
            for price in each.xpath(".//div[@class='comment']"):
                for p in price.xpath(".//a/b/text()"):
                    if price_tag == 0:
                        # when the review count is missing, the first value found contains '¥'
                        # and is therefore the price; otherwise it is the review count
                        ep = price.xpath(".//a/b/text()").extract()[0]
                        if '¥' in ep:
                            item['shop_price'] = ep
                        else:
                            item['shop_evaluation'] = ep
                        price_tag += 1
                    elif price_tag == 1:
                        item['shop_price'] = price.xpath(".//a/b/text()").extract()[1]
                        price_tag += 1

            # shop type and address; shop_address1 may be missing, so check
            at_tag = 0
            for at in each.xpath(".//div[@class='tag-addr']"):
                for att in at.xpath(".//a/span[@class='tag']/text()"):
                    if at_tag == 0:
                        item['shop_type'] = at.xpath(".//a/span[@class='tag']/text()").extract()[0]
                        at_tag += 1
                    elif at_tag == 1:
                        item['shop_address1'] = at.xpath(".//a/span[@class='tag']/text()").extract()[1]
                        at_tag += 1

            # detailed address
            item['shop_address2'] = each.xpath(".//div[@class='tag-addr']/span[@class='addr']/text()").extract()[0]

            # recommended dishes: check how many there are
            food_tag = 0
            for food in each.xpath(".//div[@class='recommend']"):
                for f in food.xpath(".//a/text()"):
                    if food_tag == 0:
                        item['shop_food1'] = food.xpath(".//a/text()").extract()[0]
                        food_tag += 1
                    elif food_tag == 1:
                        item['shop_food2'] = food.xpath(".//a/text()").extract()[1]
                        food_tag += 1
                    elif food_tag == 2:
                        item['shop_food3'] = food.xpath(".//a/text()").extract()[2]
                        food_tag += 1

            # the remaining scores
            score_tag = 0
            for score in each.xpath(".//span[@class='comment-list']"):
                for s in score.xpath(".//span/b/text()"):
                    if score_tag == 0:
                        item['shop_sweet'] = score.xpath(".//span/b/text()").extract()[0]
                        score_tag += 1
                    elif score_tag == 1:
                        item['shop_environment'] = score.xpath(".//span/b/text()").extract()[1]
                        score_tag += 1
                    elif score_tag == 2:
                        item['shop_server'] = score.xpath(".//span/b/text()").extract()[2]
                        score_tag += 1

            yield item

        if self.offset < 50:
            self.offset += 1
            # after one page has been processed, send the request for the next page:
            # increment self.offset, append it to the base url, and let the
            # callback self.parse handle the response
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)

I hit a few problems along the way and worked through them bit by bit with the help of Baidu~ the main steps are commented.

Modify pipelines.py:

import json


class TestPipeline(object):
    """
    Purpose: save the item data
    """
    def __init__(self):
        # open the output file
        self.filename = open("shuiguoshengxian.json", "w")

    def process_item(self, item, spider):
        # convert each received item to JSON
        text = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        self.filename.write(text)
        return item

    def close_spider(self, spider):
        # close the file
        self.filename.close()

The filename in the __init__ method is the name of the JSON file to be written.
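One caveat: because each written line ends with a comma, the file as a whole is not valid JSON; the upload script further below compensates by stripping the trailing ',\n' from every line. If you would rather write standard JSON Lines (one valid object per line), a minimal variant of process_item looks like this:

    def process_item(self, item, spider):
        # one valid JSON object per line, no trailing comma
        text = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.filename.write(text)
        return item

The reading side then becomes a plain json.loads(line).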

Modify the settings.py file:

DEFAULT_REQUEST_HEADERS = {
    # 'User-Agent': "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;",
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

'''
    Fake a user agent to avoid 403 responses
'''
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
ITEM_PIPELINES = {
    'Test.pipelines.TestPipeline': 300,
}

'''
    Keep 403 responses from crashing the spider
'''
HTTPERROR_ALLOWED_CODES = [403]

The important bit is setting USER_AGENT, which prevents 403 access-denied errors.

In the terminal, run scrapy crawl ShopSpider

If the crawl succeeds, you will find a .json file; open it to see the scraped data.
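Incidentally, Scrapy can also export items without a custom pipeline via its built-in feed export, e.g. scrapy crawl ShopSpider -o shops.json; here the custom pipeline is kept so we control the output format ourselves.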

Something like this:

{"shop_name": "張福光九九草莓採摘園", "shop_img": "http://p0.meituan.net/deal/cbb3476245a7a22becae0835e072a031325900.png", "shop_star": "五星商戶", "shop_evaluation": "22", "shop_price": "¥122", "shop_type": "水果生鮮", "shop_address1": "甦家屯區", "shop_address2": "來勝村", "shop_sweet": "9.1", "shop_environment": "9.1", "shop_server": "9.2"},
{"shop_name": "糖糖水果撈", "shop_img": "http://p0.meituan.net/waimaipoi/cc0c567369d52a43f9607a8f2734ad7033647.jpg", "shop_star": "準五星商戶", "shop_evaluation": "13", "shop_price": "¥22", "shop_type": "水果生鮮", "shop_address1": "和平區", "shop_address2": "南京南街228-36號6門", "shop_sweet": "8.7", "shop_environment": "8.7", "shop_server": "8.7"},
{"shop_name": "奉鮮果切水果撈(渾南店)", "shop_img": "http://p0.meituan.net/deal/571c8808dead876be5b84a640128b12297393.jpg", "shop_star": "四星商戶", "shop_evaluation": "11", "shop_type": "水果生鮮", "shop_address1": "渾南區", "shop_address2": "渾南新區夾河街A-20號10門", "shop_sweet": "7.9", "shop_environment": "7.9", "shop_server": "8.0"},

Next, write a Python script that creates the database table, ready to store the crawled data:

# -*- coding: utf-8 -*-
import pymysql

serverIp = "database server IP"
userName = "login user name"
password = "login password"
databaseName = "database name"
# open the database connection
db = pymysql.connect(host=serverIp, user=userName, passwd=password, db=databaseName)

# use cursor() to create a cursor object
cursor = db.cursor()

# CREATE TABLE statement; mind the column length limits
sql = """CREATE TABLE shuiguoshengxian (
shop_id  INT PRIMARY KEY auto_increment,
shop_name  VARCHAR(50),
shop_img VARCHAR(150),
shop_star VARCHAR(10),
shop_evaluation INT,
shop_price INT,
shop_type VARCHAR(10),
shop_address1 VARCHAR(15),
shop_address2 VARCHAR(50),
shop_food1 VARCHAR(20),
shop_food2 VARCHAR(20),
shop_food3 VARCHAR(20),
shop_sweet FLOAT,
shop_environment FLOAT,
shop_server FLOAT)"""
# run the SQL with execute()
cursor.execute(sql)

# fetchone() fetches a single row, e.g. for a version check:
# data = cursor.fetchone()
# print("Database version : %s " % data)
cursor.close()

# close the database connection
db.close()
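Note that running this script a second time will fail because the table already exists; changing the statement to CREATE TABLE IF NOT EXISTS shuiguoshengxian (...) avoids that.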

And a Python script that uploads the JSON data to the database:

# -*- coding: utf-8 -*-
import json
import pymysql

serverIp = "database server IP"
userName = "login user name"
password = "login password"
databaseName = "database name"
# open the database connection; note the final argument, charset='utf8'
db = pymysql.connect(host=serverIp, user=userName, passwd=password, db=databaseName, port=3306, charset="utf8")

# use cursor() to create a cursor object
cursor = db.cursor()

data = []
with open('shuiguoshengxian.json') as f:
    for line in f:
        # json.loads needs valid JSON, so strip the trailing ',\n' from each line
        data.append(json.loads(line[0:-2]))


for item in data:
    # get() falls back to a default value when the key is missing
    # escape any single quotes inside the string
    shop_name_str = item.get('shop_name', "").replace("'", "\\'")
    shop_img_str = item.get('shop_img', '')
    shop_star_str = item.get('shop_star', '')
    shop_evaluation_str = item.get('shop_evaluation', 0)
    shop_price_stro = item.get('shop_price', '0')

    if shop_price_stro != '0':
        # strip the leading '¥'
        shop_price_str = shop_price_stro[1:]
    else:
        shop_price_str = 0
    shop_type_str = item.get('shop_type', '')
    shop_address1_str = item.get('shop_address1', '')
    shop_address2_str = item.get('shop_address2', '')
    shop_food1_str = item.get('shop_food1', '')
    shop_food2_str = item.get('shop_food2', '')
    shop_food3_str = item.get('shop_food3', '')
    shop_sweet_str = item.get('shop_sweet', 0.0)
    shop_environment_str = item.get('shop_environment', 0.0)
    shop_server_str = item.get('shop_server', 0.0)

    sql = "INSERT INTO shuiguoshengxian(shop_name, shop_img, shop_star, shop_evaluation, shop_price, shop_type, shop_address1, shop_address2, shop_food1, shop_food2, shop_food3, shop_sweet, shop_environment, shop_server) VALUES "
    sql = sql + "('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s');\r\n" % (shop_name_str, shop_img_str, shop_star_str, shop_evaluation_str, shop_price_str, shop_type_str, shop_address1_str, shop_address2_str, shop_food1_str, shop_food2_str, shop_food3_str, shop_sweet_str, shop_environment_str, shop_server_str)
    # sql = "UPDATE shops SET shop_price = '%s' WHERE shop_name = '%s';" % (shop_price_str, shop_name_str)
    cursor.execute(sql)

db.commit()
cursor.close()
db.close()

print("success")
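As an aside, a safer pattern (just a sketch) is to let pymysql parameterize the query via the second argument of cursor.execute(); it quotes and escapes each value itself, making the manual single-quote escaping above unnecessary:

sql = ("INSERT INTO shuiguoshengxian(shop_name, shop_img, shop_star, shop_evaluation, "
       "shop_price, shop_type, shop_address1, shop_address2, shop_food1, shop_food2, "
       "shop_food3, shop_sweet, shop_environment, shop_server) "
       "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
# pymysql escapes and quotes each value for us
cursor.execute(sql, (shop_name_str, shop_img_str, shop_star_str, shop_evaluation_str,
                     shop_price_str, shop_type_str, shop_address1_str, shop_address2_str,
                     shop_food1_str, shop_food2_str, shop_food3_str, shop_sweet_str,
                     shop_environment_str, shop_server_str))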

Note that the JSON file must sit in the project root directory, because the script opens it with a relative path:

with open('shuiguoshengxian.json') as f:

If it lives somewhere else, pass the full path instead.

OK, that's the whole workflow.

This walkthrough isn't exhaustive, so newcomers will probably have plenty of questions; feel free to ask, and I'll answer what I can. Critique from the experts is also welcome~~ The code does what I wanted, but it is surely full of holes, and I'd be grateful for pointers. Thanks.

~~~ Onwards together.