1. 程式人生 > >Pyspider例項之抓取資料並儲存到MySQL資料庫

Pyspider例項之抓取資料並儲存到MySQL資料庫

本次主要是在Pyspider例項之抓取小米眾籌產品的基礎上修改的,
本來想直接在之前那篇文章修改的,但是感覺有點長了,所以決定另外寫一篇。
閒話少說,直接進入正題:
1、在Pyspider的指令碼開頭引入:

from pyspider.database.mysql.mysqldb import SQL

2、重寫on_result方法:

def on_result(self, result):
    """Store one crawl result as a row of ``t_dream_xm_project``.

    Empty results, and results whose ``original_id`` is missing or empty,
    are skipped. Every key of ``result`` is passed to ``SQL.insert`` as a
    column name, so the keys must match the table's columns.

    :param result: dict returned by a crawl callback, or None.
    """
    # .get() instead of indexing: a non-empty result that lacks the key is
    # skipped like any other unusable result instead of raising KeyError.
    if not result or not result.get('original_id'):
        return
    sql = SQL()
    sql.insert('t_dream_xm_project', **result)

3、編寫資料庫指令碼(檔名為mysqldb.py,放入/usr/lib/python2.7/site-packages/pyspider/database/mysql/下,這樣才能對應上面的import路徑pyspider.database.mysql.mysqldb):

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from six import itervalues
import MySQLdb

class SQL(object):
    """Small MySQLdb wrapper that inserts rows built from keyword arguments.

    A connection is attempted when the object is constructed.
    ``self.connection`` is True only when that attempt succeeded; when it
    failed, ``self.conn`` and ``self.cursor`` are None and ``insert``
    returns False without touching the database.
    """

    def __init__(self):
        """Open the connection using the settings below."""
        # Connection settings: replace the placeholder values with real ones.
        hosts = '資料庫地址'
        username = '資料庫使用者名稱'
        password = '資料庫密碼'
        database = '資料庫名'
        charsets = 'utf8'

        self.connection = False
        # Defined up front so the attributes exist even if connecting fails.
        self.conn = None
        self.cursor = None
        try:
            self.conn = MySQLdb.connect(host=hosts, user=username,
                                        passwd=password, db=database,
                                        charset=charsets)
            self.cursor = self.conn.cursor()
            self.cursor.execute("set names " + charsets)
            self.connection = True
        except Exception as e:
            # Best effort: report the failure and leave the object in the
            # "not connected" state instead of raising.
            print("Cannot Connect To Mysql!\n%s" % (e,))

    def escape(self, string):
        """Return ``string`` formatted as text, unchanged.

        No quoting is applied, so table and column names are placed into
        the statement verbatim and must come from trusted code.
        """
        return '%s' % string

    def insert(self, tablename=None, **values):
        """Insert one row into ``tablename`` and commit.

        :param tablename: name of the target table.
        :param values: column name -> value pairs; the values are sent as
            query parameters. With no values, a row made only of column
            defaults is written.
        :return: True when the statement was executed and committed,
            False when there is no connection or the statement failed.
        """
        if not self.connection:
            return False

        tablename = self.escape(tablename)
        if values:
            # Walk the dict once so column names and parameters are
            # guaranteed to be in the same order.
            columns = list(values)
            _keys = ",".join(self.escape(k) for k in columns)
            _values = ",".join(['%s'] * len(columns))
            sql_query = "insert into %s (%s) values (%s)" % (tablename, _keys, _values)
            params = [values[k] for k in columns]
        else:
            # MySQL has no "DEFAULT VALUES" clause; an empty column list
            # with an empty value list is its all-defaults form.
            sql_query = "replace into %s () values ()" % tablename
            params = None

        try:
            if params is not None:
                self.cursor.execute(sql_query, params)
            else:
                self.cursor.execute(sql_query)
            self.conn.commit()
            return True
        except Exception as e:
            print("An Error Occured: %s" % (e,))
            return False

說明:這裡使用的是MySQLdb驅動,所以需要安裝MySQLdb
4、在MySQL中新建資料庫以及對應的表(本例的表名為t_dream_xm_project),表的欄位名稱必須和Pyspider指令碼中detail_page方法return的字典鍵名一一對應。
OK,完成這項步驟就可以啟動伺服器進行測試了。
完整的Pyspider指令碼:

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2017-07-14 10:36:36
# Project: xiaomi

from pyspider.libs.base_handler import *
from pyspider.database.mysql.mysqldb import SQL
import urllib
import time
import json

class Handler(BaseHandler):
    """Crawl Xiaomi crowdfunding products and store each one in MySQL.

    Flow: ``on_start`` requests the product list, ``index_page`` requests
    the detail of every listed product, ``detail_page`` dumps the raw
    response to a file and returns the fields to store, and ``on_result``
    writes those fields to the ``t_dream_xm_project`` table.
    """

    # Request attributes shared by every crawl issued by this handler
    crawl_config = {
        'headers' : {'Connection':'keep-alive','Accept-Encoding':'gzip, deflate, br','Accept-Language':'zh-CN,zh;q=0.8','content-type':'application/x-www-form-urlencoded','Referer':'//home.mi.com/crowdfundinglist?id=78&title=%E4%BC%97%E7%AD%B9','User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}
    }

    @every(minutes=24 * 60)
    def on_start(self):
        """Request the product list; the response goes to ``index_page``."""
        # Already URL-encoded JSON payload asking for the home list
        param = 'data=%7B%22HomeList%22%3A%7B%22model%22%3A%22Homepage%22%2C%22action%22%3A%22BuildHome%22%2C%22parameters%22%3A%7B%22id%22%3A12%7D%7D%7D'
        self.crawl('https://home.mi.com/app/shopv3/pipe',method="GET",params=param,callback=self.index_page)

    @config(age=60 * 60)
    def index_page(self, response):
        """Request the detail of every product found in the list response."""
        for each in response.json['result']['HomeList']['data']:
            gid = each['gid']
            # One request body asks for detail, comments and activity of
            # the same product, so gid is substituted three times.
            detailparm = "{\"detail\":{\"model\":\"Shopv2\",\"action\":\"getDetail\",\"parameters\":{\"gid\":\"%s\"}},\"comment\":{\"model\":\"Comment\",\"action\":\"getList\",\"parameters\":{\"goods_id\":\"%s\",\"orderby\":\"1\",\"pageindex\":\"0\",\"pagesize\":3}},\"activity\":{\"model\":\"Activity\",\"action\":\"getAct\",\"parameters\":{\"gid\":\"%s\"}}}" % (gid,gid,gid)
            detailreq = "data=" + urllib.quote(detailparm)
            detailurl = "https://home.mi.com/app/shop/pipe?gid=%s" % gid
            self.crawl(detailurl,method='POST',data=detailreq,callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        """Dump the raw detail response to a file and return the row to store.

        The returned dict's keys are used as column names by ``on_result``.
        """
        payload = response.json
        # Serialised once, only for the on-disk copy
        resultjsonstr = json.dumps(payload)
        result = payload['result']['detail']['data']['good']
        gid = result['gid'].encode('utf-8')

        # Keep a copy of the raw response on disk; the context manager
        # closes the file even if the write fails.
        with open("/tmp/xiaomi/%s.txt" % gid,'w') as resultfile:
            resultfile.write(resultjsonstr)

        # Fields to be saved to the MySQL database
        return {
            "original_id": gid,
            "project_name": result['name'].encode('utf-8'),
            "project_desc": result['summary'].encode('utf-8'),
            "curr_money": result['saled'].encode('utf-8'),
            # ctime is parsed as a number of seconds and formatted in local time
            "begin_date": time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(float(result['ctime'].encode('utf-8'))))
        }

    def on_result(self,result):
        """Insert a callback's result into ``t_dream_xm_project``.

        Empty results and results without a usable ``original_id`` are
        skipped.
        """
        # .get() instead of indexing: a result lacking the key is skipped
        # instead of raising KeyError.
        if not result or not result.get('original_id'):
            return
        sql = SQL()
        sql.insert('t_dream_xm_project',**result)