Python利用scrapy框架,爬取大眾點評部分商鋪資料~
分享一下,自己從0開始,用python爬取資料的歷程。希望可以幫到一起從0開始的小夥伴~~加油。
首先,我的開發環境是:
電腦:macOS Sierra 10.12.6 編譯器:PyCharm + 終端
我的電腦自帶的Python版本為2.7,我下載了一個Python3.6。使用3.6版本的來進行本次的編寫, 將新下載的Python配置到環境變數裡。一般他會自帶pip。開啟終端,cd到pip所在目錄,終端輸入 pip install scrapy
開啟終端,cd到你想建立的專案目錄下,終端輸入 scrapy startproject Test
就會在該目錄下自動生成一些檔案,接下來只要修改其中的一些檔案就可以了。
使用PyCharm開啟,先截圖一下目錄結構:
根目錄就是你建立的專案名,然後會有一個spiders資料夾,裡面會有__init__.py
根目錄下的檔案,__init__.py , items.py , middlewares.py , pipelines.py , settings.py
cd到Test目錄下,終端輸入 scrapy genspider ShopSpider "dianping.com"
會在Test目錄下生成一個ShopSpider.py檔案。
檔案都建立好了。去想要爬的網站看一下它原始碼的標籤結構。
根據想要爬的資料,修改items.py檔案
# items.py — declares one Field per value scraped for each shop.
# (Reformatted: the original was collapsed onto one line, fusing the
# inline comments with the field definitions and commenting them out.)
import scrapy


class TestItem(scrapy.Item):
    """Container for one Dianping shop's scraped data."""
    # shop name
    shop_name = scrapy.Field()
    # cover image URL
    shop_img = scrapy.Field()
    # star rating (e.g. "五星商戶")
    shop_star = scrapy.Field()
    # number of reviews
    shop_evaluation = scrapy.Field()
    # average price per person
    shop_price = scrapy.Field()
    # cuisine / shop category
    shop_type = scrapy.Field()
    # district-level address
    shop_address1 = scrapy.Field()
    # detailed street address
    shop_address2 = scrapy.Field()
    # recommended dishes 1-3
    shop_food1 = scrapy.Field()
    shop_food2 = scrapy.Field()
    shop_food3 = scrapy.Field()
    # taste score
    shop_sweet = scrapy.Field()
    # environment score
    shop_environment = scrapy.Field()
    # service score
    shop_server = scrapy.Field()
修改爬蟲檔案ShopSpider.py
# -*- coding: utf-8 -*-
# ShopSpider.py — crawls Shenyang food-shop listing pages on dianping.com.
import scrapy

from Test.items import TestItem


class ShopSpider(scrapy.Spider):
    """Scrapes shop name, image, rating, price, address, recommended
    dishes and sub-scores from the Dianping Shenyang food category,
    paging through listing pages 1..50."""

    # Spider name used by `scrapy crawl ShopSpider`.
    name = 'ShopSpider'
    # Restrict the crawl to dianping.com.
    allowed_domains = ['dianping.com']
    # Base listing URL; the page number is appended to it.
    url = 'http://www.dianping.com/shenyang/ch10/g2714p'
    offset = 1
    # First page to fetch.
    start_urls = [url + str(offset)]

    def parse(self, response):
        """Extract one TestItem per shop <li>, then request the next page."""
        for each in response.xpath(
                "//div[@class='shop-list J_shop-list shop-all-list']/ul/li"):
            # BUG FIX: the original instantiated TencentItem, which is not
            # defined anywhere — the imported item class is TestItem.
            item = TestItem()
            item['shop_name'] = each.xpath(".//img/@title").extract()[0]
            # Drop the resize parameters ('%...') from the image URL so the
            # full-size image is stored.
            imgorl = each.xpath(".//img/@src").extract()[0]
            item['shop_img'] = imgorl.split('%')[0]
            item['shop_star'] = each.xpath(
                ".//div[@class='comment']/span/@title").extract()[0]
            # Review count and average price share identical <a><b> markup.
            # Walk the matches in order: if the first value contains '¥' it
            # is the price (review count missing); otherwise it is the
            # review count and the second value is the price.
            price_tag = 0
            for price in each.xpath(".//div[@class='comment']"):
                for p in price.xpath(".//a/b/text()"):
                    if price_tag == 0:
                        ep = price.xpath(".//a/b/text()").extract()[0]
                        if '¥' in ep:
                            item['shop_price'] = ep
                        else:
                            item['shop_evaluation'] = ep
                        price_tag += 1
                    elif price_tag == 1:
                        item['shop_price'] = price.xpath(
                            ".//a/b/text()").extract()[1]
                        price_tag += 1
            # Category and district use the same tag markup; the district
            # (shop_address1) may be absent, so count matches positionally.
            at_tag = 0
            for at in each.xpath(".//div[@class='tag-addr']"):
                for att in at.xpath(".//a/span[@class='tag']/text()"):
                    if at_tag == 0:
                        item['shop_type'] = at.xpath(
                            ".//a/span[@class='tag']/text()").extract()[0]
                        at_tag += 1
                    elif at_tag == 1:
                        item['shop_address1'] = at.xpath(
                            ".//a/span[@class='tag']/text()").extract()[1]
                        at_tag += 1
            # Detailed street address.
            item['shop_address2'] = each.xpath(
                ".//div[@class='tag-addr']/span[@class='addr']/text()"
            ).extract()[0]
            # Up to three recommended dishes; only fill the slots that exist.
            food_tag = 0
            for food in each.xpath(".//div[@class='recommend']"):
                for f in food.xpath(".//a/text()"):
                    if food_tag == 0:
                        item['shop_food1'] = food.xpath(
                            ".//a/text()").extract()[0]
                        food_tag += 1
                    elif food_tag == 1:
                        item['shop_food2'] = food.xpath(
                            ".//a/text()").extract()[1]
                        food_tag += 1
                    elif food_tag == 2:
                        item['shop_food3'] = food.xpath(
                            ".//a/text()").extract()[2]
                        food_tag += 1
            # Taste / environment / service sub-scores, in display order.
            score_tag = 0
            for score in each.xpath(".//span[@class='comment-list']"):
                for s in score.xpath(".//span/b/text()"):
                    if score_tag == 0:
                        item['shop_sweet'] = score.xpath(
                            ".//span/b/text()").extract()[0]
                        score_tag += 1
                    elif score_tag == 1:
                        item['shop_environment'] = score.xpath(
                            ".//span/b/text()").extract()[1]
                        score_tag += 1
                    elif score_tag == 2:
                        item['shop_server'] = score.xpath(
                            ".//span/b/text()").extract()[2]
                        score_tag += 1
            yield item

        # After the page is processed, request the next listing page
        # (pages 2..50) and feed it back through this same callback.
        if self.offset < 50:
            self.offset += 1
            yield scrapy.Request(self.url + str(self.offset),
                                 callback=self.parse)
其中遇到了一些問題,都是通過百度一點點補全的~寫了主要的註釋。
修改pipelines.py
# pipelines.py — persists scraped items as JSON lines.
import json


class TestPipeline(object):
    """Writes every item to shuiguoshengxian.json, one JSON object
    (followed by ',\\n') per line."""

    def __init__(self):
        # BUG FIX: open with an explicit utf-8 encoding — the writer below
        # emits raw Chinese text (ensure_ascii=False), which would crash or
        # be mangled under a non-UTF-8 platform default encoding.
        self.filename = open("shuiguoshengxian.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # Serialize the item; keep non-ASCII characters readable.
        text = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        self.filename.write(text)
        return item

    def close_spider(self, spider):
        # Close the output file when the spider finishes.
        self.filename.close()
__init__方法中的檔名就是你要輸出的json檔名。
修改settings.py檔案
# settings.py — headers sent with every request.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

# Fake a real desktop-browser identity so dianping.com does not reject
# the crawler with HTTP 403.
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
)
# Enable the JSON-writing pipeline.
# BUG FIX: the original pointed at 'Tencent.pipelines.TencentPipeline',
# but this project is `Test` and the pipeline class is `TestPipeline` —
# the wrong path would make Scrapy fail to load the pipeline.
ITEM_PIPELINES = {
    'Test.pipelines.TestPipeline': 300,
}

# Let 403 responses reach the spider instead of aborting the crawl.
HTTPERROR_ALLOWED_CODES = [403]
要注意的就是USER_AGENT的設定。防止拒絕訪問403錯誤。
終端輸入 scrapy crawl ShopSpider
爬取成功,就可以看到一個.json檔案了,開啟就可以看到其中爬到的資料。
類似於:
{"shop_name": "張福光九九草莓採摘園", "shop_img": "http://p0.meituan.net/deal/cbb3476245a7a22becae0835e072a031325900.png", "shop_star": "五星商戶", "shop_evaluation": "22", "shop_price": "¥122", "shop_type": "水果生鮮", "shop_address1": "甦家屯區", "shop_address2": "來勝村", "shop_sweet": "9.1", "shop_environment": "9.1", "shop_server": "9.2"}, {"shop_name": "糖糖水果撈", "shop_img": "http://p0.meituan.net/waimaipoi/cc0c567369d52a43f9607a8f2734ad7033647.jpg", "shop_star": "準五星商戶", "shop_evaluation": "13", "shop_price": "¥22", "shop_type": "水果生鮮", "shop_address1": "和平區", "shop_address2": "南京南街228-36號6門", "shop_sweet": "8.7", "shop_environment": "8.7", "shop_server": "8.7"}, {"shop_name": "奉鮮果切水果撈(渾南店)", "shop_img": "http://p0.meituan.net/deal/571c8808dead876be5b84a640128b12297393.jpg", "shop_star": "四星商戶", "shop_evaluation": "11", "shop_type": "水果生鮮", "shop_address1": "渾南區", "shop_address2": "渾南新區夾河街A-20號10門", "shop_sweet": "7.9", "shop_environment": "7.9", "shop_server": "8.0"},
寫一個建立資料庫表的py,準備將爬到的資料存在資料庫裡。
# -*- coding: utf-8 -*-
# One-off script: create the `shuiguoshengxian` table that will hold the
# crawled shop data.
import pymysql

serverIp = "資料庫ip地址"       # database host
userName = "登入使用者名稱"     # login user
password = "登入密碼"           # login password
databaseName = "資料庫名"       # database name

# Connect with keyword arguments and an explicit charset, consistent with
# the upload script, so non-ASCII shop names round-trip correctly.
db = pymysql.connect(host=serverIp, user=userName, passwd=password,
                     db=databaseName, charset="utf8")
# Cursor object used to run the DDL statement.
cursor = db.cursor()

# Column lengths are sized for the values observed in the crawl output.
sql = """CREATE TABLE shuiguoshengxian (
    shop_id INT PRIMARY KEY auto_increment,
    shop_name VARCHAR(50),
    shop_img VARCHAR(150),
    shop_star VARCHAR(10),
    shop_evaluation INT,
    shop_price INT,
    shop_type VARCHAR(10),
    shop_address1 VARCHAR(15),
    shop_address2 VARCHAR(50),
    shop_food1 VARCHAR(20),
    shop_food2 VARCHAR(20),
    shop_food3 VARCHAR(20),
    shop_sweet FLOAT,
    shop_environment FLOAT,
    shop_server FLOAT)"""

try:
    cursor.execute(sql)
finally:
    # Release the cursor and connection even if CREATE TABLE fails
    # (the original leaked both on any execute() error).
    cursor.close()
    db.close()
寫一個上傳json資料到資料庫的py
# -*- coding: utf-8 -*-
# One-off script: load the crawled JSON lines into the MySQL table.
import json

import pymysql

serverIp = "資料庫ip地址"       # database host
userName = "登入使用者名稱"     # login user
password = "登入密碼"           # login password
databaseName = "資料庫名"       # database name

# charset='utf8' is required so Chinese shop names are stored intact.
db = pymysql.connect(host=serverIp, user=userName, passwd=password,
                     db=databaseName, port=3306, charset="utf8")
cursor = db.cursor()

data = []
with open('shuiguoshengxian.json', encoding='utf-8') as f:
    for line in f:
        # Each line ends with ',\n' (written by the pipeline); strip both
        # characters so the remainder is valid JSON.
        data.append(json.loads(line[0:-2]))

# BUG FIX: the original built the INSERT with %-string formatting, which
# is SQL-injectable and broke on quotes in shop names (hence its manual
# "'" escaping). Parameterized placeholders let pymysql escape every
# value itself. (Also: the original shadowed the builtin `str` and called
# the redundant f.close() inside the `with` block.)
insert_sql = (
    "INSERT INTO shuiguoshengxian(shop_name, shop_img, shop_star, "
    "shop_evaluation, shop_price, shop_type, shop_address1, shop_address2, "
    "shop_food1, shop_food2, shop_food3, shop_sweet, shop_environment, "
    "shop_server) VALUES "
    "(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
)

for item in data:
    # get() supplies a default whenever the crawl missed a field.
    shop_price_stro = item.get('shop_price', '0')
    if shop_price_stro != '0':
        # Drop the leading '¥' so the INT column stores a plain number.
        shop_price_str = shop_price_stro[1:]
    else:
        shop_price_str = 0
    cursor.execute(insert_sql, (
        item.get('shop_name', ''),
        item.get('shop_img', ''),
        item.get('shop_star', ''),
        item.get('shop_evaluation', 0),
        shop_price_str,
        item.get('shop_type', ''),
        item.get('shop_address1', ''),
        item.get('shop_address2', ''),
        item.get('shop_food1', ''),
        item.get('shop_food2', ''),
        item.get('shop_food3', ''),
        item.get('shop_sweet', 0.0),
        item.get('shop_environment', 0.0),
        item.get('shop_server', 0.0),
    ))

cursor.close()
# Commit once after all rows are inserted, then close the connection.
db.commit()
db.close()
print("success")
注意,要把json檔案放在專案根目錄下,因為
with open('shuiguoshengxian.json') as f:
如果在別的路徑,可以填具體路徑。
OK,整個流程就是這樣。
說了一通,並不詳細,如果新人看到了,可能有很多疑問,歡迎提問,我會的都會解答的。也歡迎大神來批評~~寫的程式碼雖然實現了想要的功能效果,肯定漏洞百出,希望得到批評指點,謝謝。
~~~與君共勉。