
Python Baidu Yun (Baidu Pan) Search Engine: Source Code and Installation Tutorial

Runtime Environment
Before you start, you need to install:

•PHP 5.3.7+
•MySQL
•Python 2.7
•xunsearch (the search engine)
Get the Source Code
Via SSH:

git clone [email protected]:k1995/BaiduyunSpider.git

Via HTTPS:

git clone https://github.com/k1995/BaiduyunSpider

Or download it manually:

https://github.com/k1995/BaiduyunSpider/archive/master.zip

After downloading, the project's directory structure looks roughly like this:

--- indexer/  # indexer
--- spider/   # crawler
--- sql/
--- web/      # website
    --- application/
        --- config/ # configuration
                --- config.php
                --- database.php # database settings
                ...
        ...
    --- static/ # static assets: css|js|fonts
    --- system/
    --- index.php
    ...
Deployment
Create the database
Create a database named pan with the character set utf-8, then import the SQL file from the sql/ directory to create the tables.
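For example, from the project root (assuming the MySQL root account; the exact .sql filename under sql/ may differ):

mysql -u root -p -e "CREATE DATABASE pan DEFAULT CHARACTER SET utf8"
mysql -u root -p pan < sql/pan.sql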

Deploy the website
Both nginx and apache are supported.

For apache, mod_rewrite must be enabled.
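As a sketch (not taken from the project itself), a commonly used CodeIgniter-style .htaccess for the web/ directory looks like this:

RewriteEngine On
RewriteCond %{REQUEST_FILENAME} !-f
RewriteCond %{REQUEST_FILENAME} !-d
RewriteRule ^(.*)$ index.php/$1 [L]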

The nginx configuration is as follows:

location /
{  
    index index.php;
    try_files $uri $uri/ /index.php/$uri;
}

location ~ [^/]\.php(/|$)
{
    fastcgi_pass  127.0.0.1:9000;
    fastcgi_index index.php;
    include fastcgi.conf;
    include pathinfo.conf;
}
Modify the configuration files
In config.php, set the site title, description, and related information.

In database.php, set the database username, password, and other connection details.
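As a rough sketch (the exact keys depend on the CodeIgniter version bundled with the project), the relevant entries in database.php look something like:

$db['default']['hostname'] = '127.0.0.1';
$db['default']['username'] = 'root';
$db['default']['password'] = 'your-mysql-password';
$db['default']['database'] = 'pan';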

The website is built on the CodeIgniter framework. If you run into problems installing, deploying, or doing further development, refer to the official CodeIgniter documentation.


Start the crawler
Go into the spider/ directory and update the database settings in spider.py.

If this is your first deployment, run the following command to seed the database:

python spider.py --seed-user

This fetches information about users with popular Baidu Pan shares; the crawler then uses them as its starting points.

Then run:

python spider.py

The crawler is now running.
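Note that this runs in the foreground; to keep the crawler running after you log out you can, for example, start it with nohup python spider.py & or inside a screen/tmux session.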

Install xunsearch
The project currently uses xunsearch as its search engine; it may be replaced with elasticsearch later.

For the installation procedure, see the guide below (you do not need to install the PHP SDK; it is already bundled into web/):

http://xunsearch.com/doc/php/guide/start.installation

Index the data
The crawler and the website are now in place, but search does not work yet. The final step is building the index.

Go into the indexer/ directory and, in indexer.php, replace $prefix with the root path of your web directory:

require '$prefix/application/helpers/xs/lib/XS.php';
Also update the database username and password in that file.

Then run:

./indexer.php
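If the script is not marked executable, run chmod +x indexer.php first, or invoke it as php indexer.php instead.

The full source of spider/spider.py follows: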

# -*- coding: utf-8 -*-
import urllib2,re,argparse,json,time
import MySQLdb as mdb
import metautils,traceback,Queue,socket
import random
"""
/*
 *--------------------------------------------   
 *
 *  
 *    
 *  GitHub repository: https://github.com/k1995/BaiduyunSpider
 * 
 *  Demo: http://www.11bt.net/ *
 *   
 * ----------------------------------------*/
"""

DB_HOST='127.0.0.1'
DB_PORT='3306'
DB_USER='root'
# MySQL password
DB_PASS='123123'
# database name
DB_NAME='pan'
SPIDER_INTERVAL=1

ERR_NO=0     # OK
ERR_REFUSE=1 # refused because the crawler is fetching too fast
ERR_EX=2     # unknown error


proxy_list = [
               {'http':"x.x.x.x:8080"},
               {'http':"x.x.x.x:8081"},
               {'http':"x.x.x.x:8082"},
               {'http':"x.x.x.x:8083"},
               {'http':"x.x.x.x:8084"},
               {'http':"x.x.x.x:8085"},
               {'http':"x.x.x.x:8086"},
               {'http':"x.x.x.x:8087"},
               {'http':"x.x.x.x:8088"},
               {'http':"x.x.x.x:8089"}
                ]
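# NOTE: the x.x.x.x entries above are placeholders; replace them with working
# HTTP proxies before running, or adapt getHtml() to skip the proxy handler.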




def getHtml(url,ref=None,reget=5):
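 # Fetch url through a randomly chosen proxy and User-Agent.
 # ref is an optional Referer value; reget is the number of retries left on failure.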
 try:

  uas = [
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9b4) Gecko/2008030317 Firefox/3.0b4",
    "Mozilla/5.0 (Windows; U; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; BIDUBrowser 7.6)",
    "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; Win64; x64; Trident/7.0; Touch; LCJB; rv:11.0) like Gecko",
    ]
  proxy_ip =random.choice(proxy_list)

  ua=random.choice(uas)

  print proxy_ip
  print ua
  proxy_support = urllib2.ProxyHandler(proxy_ip)
  opener = urllib2.build_opener(proxy_support,urllib2.HTTPHandler)
  urllib2.install_opener(opener)

  request = urllib2.Request(url)
  time.sleep(5)
  request.add_header('User-Agent', ua)
  if ref:
   request.add_header('Referer',ref)
  page = urllib2.urlopen(request,timeout=30)
  html = page.read()
 except:
  if reget>=1:
   #if fetching fails, retry (up to 5 retries in total)

   print 'getHtml error,reget...%d'%(6-reget)
   time.sleep(20)
   return getHtml(url,ref,reget-1)
  else:
   print 'request url:'+url
   print 'failed to fetch html'
   exit()
 else:
  return html

class Db(object):
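 # Thin wrapper around MySQLdb: connects lazily on the first execute() and
 # reconnects automatically if the connection has been lost.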
 def __init__(self):
  self.dbconn=None
  self.dbcurr=None

 def check_conn(self):
  try:
   self.dbconn.ping()
  except:
   return False
  else:
   return True

 def conn(self):
  self.dbconn=mdb.connect(DB_HOST, DB_USER, DB_PASS,DB_NAME, charset='utf8')
  self.dbconn.autocommit(False)
  self.dbcurr = self.dbconn.cursor()

 def fetchone(self):
  return self.dbcurr.fetchone()

 def fetchall(self):
  return self.dbcurr.fetchall()

 def execute(self, sql, args=None, flag=False):
  if not self.dbconn:
   #first time connecting to the database
   self.conn()
  try:
   if args:
    rs=self.dbcurr.execute(sql,args)
   else:
    rs=self.dbcurr.execute(sql)
   return rs
  except Exception, e:
   if self.check_conn():
    print 'execute error'
    traceback.print_exc()
   else:
    print 'reconnect mysql'
    self.conn()
    if args:
     rs=self.dbcurr.execute(sql,args)
    else:
     rs=self.dbcurr.execute(sql)
    return rs
 
 def commit(self):
  self.dbconn.commit()

 def rollback(self):
  self.dbconn.rollback()
 
 def close(self):
  self.dbconn.close()
  self.dbcurr.close()
 def last_row_id(self):
  return self.dbcurr.lastrowid

class BaiduPanSpider(object):
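 # Crawls Baidu Pan share users: fetches their public shares and follow lists,
 # writing the results into the share_users, share_file and spider_list tables.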
 def __init__(self):
  self.db=Db()
  self.files=[]
  self.got_files_count=0
  self.got_follow_count=0
  self.while_count=0
  self.spider_queue=Queue.Queue(maxsize=20)
  self.status='stop'
  self.errno=ERR_NO
  self.file_type_t={'video':0,'image':1,'document':2,'music':3,'package':4,'software':5,'torrent':6,'other':-1}

 def getShareUser(self,uk):
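  # Fetch the share/fans/follow counters for user uk via the share/count API.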
  url='http://pan.baidu.com/share/count?uk=%d&channel=chunlei&clienttype=0&web=1'%uk
  follows_json=json.loads(getHtml(url,uk))
  if follows_json['errno']!=0:
   if follows_json['errno']==-55:
    self.errno=ERR_REFUSE
   else:
    self.errno=ERR_EX
   return False
  return {
   'pubshare_cnt':follows_json['pubshare_cnt'],
   'fans':follows_json['fans'],
   'follow':follows_json['follow'],
   # 'album' is an assumed field name for the album count in this API response
   'album':follows_json.get('album')
  }

 def getHotUser(self):
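  # Fetch the list of currently "hot" share users; used by seedUsers() to seed the database.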
  url='http://pan.baidu.com/pcloud/friend/gethotuserlist?type=1&from=feed&start=0&limit=24&channel=chunlei&clienttype=0&web=1'
  follows_json=json.loads(getHtml(url))
  if follows_json['errno']!=0:
   print u'failed to fetch hot users'
   return False
  returns=[]
  count=0

  for item in follows_json['hotuser_list']:
   count=count+1
   hot_uname=item['hot_uname'].encode('utf-8')
   hot_uk=item['hot_uk']
   avatar_url=item['avatar_url'].encode('utf-8')
   intro=item['intro'].encode('utf-8')
   follow_count=item['follow_count']
   fans_count=item['fans_count']
   pubshare_count=item['pubshare_count']
   album_count=item['album_count']
   returns.append({'hot_uname':hot_uname,'hot_uk':hot_uk,'avatar_url':avatar_url,'intro':intro,'follow_count':follow_count,'fans_count':fans_count,'pubshare_count':pubshare_count,'album_count':album_count})
  
  if count==0:
   print "got no hot users"
   return False
  else:
    print "successfully fetched hot users: %d"%count
  return returns

 def getFans(self,uk,start=0,limit=24):
  #query_uk: user ID
  #limit: maximum number of entries per page
  #start: paging offset
  follows_url='http://pan.baidu.com/pcloud/friend/getfanslist?query_uk=%d&limit=%d&start=%d'%(uk,limit,start)
  follows_json=json.loads(getHtml(follows_url,uk))
  if follows_json['errno']!=0:
   print u'failed to fetch fans'
   return False
  total_count=follows_json['total_count']
  returns=[]
  count=0

  for item in follows_json['fans_list']:
   count=count+1
   fans_uname=item['fans_uname'].encode('utf-8')
   fans_uk=item['fans_uk']
   avatar_url=item['avatar_url'].encode('utf-8')
   intro=item['intro'].encode('utf-8')
   follow_count=item['follow_count']
   fans_count=item['fans_count']
   pubshare_count=item['pubshare_count']
   album_count=item['album_count']
   returns.append({'fans_uname':fans_uname,'fans_uk':fans_uk,'avatar_url':avatar_url,'intro':intro,'follow_count':follow_count,'fans_count':fans_count,'pubshare_count':pubshare_count,'album_count':album_count})

  return (total_count,count,returns)

 def getFollows(self,uk,start=0,limit=24):
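  # Fetch one page of the users that uk follows; returns (total_count, count, items), or False on error.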
  follows_url='http://pan.baidu.com/pcloud/friend/getfollowlist?query_uk=%d&limit=%d&start=%d&bdstoken=d82467db8b1f5741daf1d965d1509181&channel=chunlei&clienttype=0&web=1'%(uk,limit,start)
  ref='http://pan.baidu.com/pcloud/friendpage?type=follow&uk=%d&self=1'%uk
  follows_json=json.loads(getHtml(follows_url,ref))
  if follows_json['errno']!=0:
   print 'getFollows errno:%d'%follows_json['errno']
   print 'request_url:'+follows_url
   if follows_json['errno']==-55:
    self.errno=ERR_REFUSE
   else:
    self.errno=ERR_EX
   return False
  total_count=follows_json['total_count']
  returns=[]
  count=0
  if(total_count>0):
   for item in follows_json['follow_list']:
    count=count+1
    returns.append({
     'follow_uname':item['follow_uname'].encode('utf-8'),
     'follow_uk':item['follow_uk'],
     'avatar_url':item['avatar_url'].encode('utf-8'),
     'intro':item['intro'].encode('utf-8'),
     'follow_count':item['follow_count'],
     'fans_count':item['fans_count'],
     'pubshare_count':item['pubshare_count'],
     'album_count':item['album_count']
    })
  
  return (total_count,count,returns)

 def getShareLists(self,uk,start=0,limit=60):
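  # Fetch one page of uk's public shares; returns (total_count, count, items), or False on error.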
  sharelists_url='http://pan.baidu.com/pcloud/feed/getsharelist?category=0&auth_type=1&request_location=share_home&start=%d&limit=%d&query_uk=%d&channel=chunlei&clienttype=0&web=1'%(start,limit,uk)
  ref='http://pan.baidu.com/share/home?uk=%d&view=share'%uk
  listhtm=getHtml(sharelists_url,ref)
  print(sharelists_url)
  sharelists_json=json.loads(listhtm)
  if(sharelists_json['errno']!=0):
   print 'getShareLists errno:%d'%sharelists_json['errno']
   print 'request_url:'+sharelists_url
   if sharelists_json['errno']==-55:
    self.errno=ERR_REFUSE
   else:
    self.errno=ERR_EX
   return False
  total_count=sharelists_json['total_count']
  returns=[]
  count=0
  if total_count>0:
   for item in sharelists_json['records']:
    count=count+1
    feed_type=item['feed_type']
     isdir=0
     size=0
     md5=''
     album_id=''
     shorturl=''
     username='' # avoid a NameError below when the record has no 'username' field
    if feed_type=='share':
     if item['filecount']==1:
      filelist=item['filelist']
      isdir=filelist[0]['isdir']
      size=filelist[0]['size']
      md5=filelist[0]['md5']
     else:
      isdir=1
    elif feed_type=='album':
     album_id=item['album_id']
     isdir=2

    if item.has_key('shorturl'):
     shorturl=item['shorturl']
    if item.has_key('username'):
     username=item['username'].encode('utf-8')
    if feed_type=='share' or feed_type=='album':
     returns.append({
      'title':item['title'].encode('utf-8'),
      'username':username,
      'shorturl':shorturl,
      'shareid':item['source_id'],
      'feed_time':item['feed_time']//1000,#share timestamp (seconds)
      'dCnt':item['dCnt'],
      'isdir':isdir,
      'size':size,
      'md5':md5,
      'uk':uk,
      'feed_type':feed_type
     })
  return (total_count,count,returns)

 def getAlbum(self,uk,start=0,limit=60):
  url='http://pan.baidu.com/pcloud/album/getlist?start=%d&limit=%d&query_uk=%d&channel=chunlei&clienttype=0&web=1&bdstoken=d82467db8b1f5741daf1d965d1509181'%(start,limit,uk)
  album_json=json.loads(getHtml(url,uk))
  total_count=album_json['count']
  returns=[]
  count=0

  for item in album_json['album_list']:
   count=count+1
   title=item['title'].encode('utf-8')
   album_id=item['album_id']
   create_time=item['create_time']
   update_time=item['update_time']
   filecount=item['filecount']
   desc=item['desc']
   returns.append({'title':title,'album_id':album_id,'create_time':create_time,'desc':desc,'update_time':update_time,'filecount':filecount,'uk':uk})
  
   if count==0:
    print "got no albums"
    return False
   else:
    print "successfully fetched albums: %d"%count

  if (start+count)<total_count:
   start=start+limit
   returns=returns+self.getAlbum(uk,start)
  return returns

 def seedUsers(self):
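  # Seed the database: insert the hot share users into share_users and queue them in spider_list.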
  hot_usrs=self.getHotUser()
  if not hot_usrs:
   return
  try:
   for user in hot_usrs:
    time_stamp=int(time.time())
    if user['pubshare_count']>0:
     self.db.execute("INSERT INTO share_users (uk,user_name,avatar_url,intro,follow_count,album_count,\
      fens_count,pubshare_count,last_visited,create_time,weight) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",(
       user['hot_uk'],user['hot_uname'],user['avatar_url'],user['intro'],user['follow_count'],
       user['album_count'],user['fans_count'],user['pubshare_count'],time_stamp,time_stamp,5
      )
     )
     uid=self.db.last_row_id()
     self.db.execute("INSERT INTO spider_list (uk,uid) VALUES(%s,%s)",(user['hot_uk'],uid))
  except:
   traceback.print_exc()
   self.db.rollback()
  else:
   self.db.commit()
 
 def startSpider(self):
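  # Main crawl loop: take pending sharers from spider_list, fetch their files and
  # follow lists, and queue any newly discovered sharers until the queue is empty.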
  if self.spider_queue.empty():
   fetched_users=self.db.execute('SELECT * from spider_list ORDER BY weight DESC limit 0,20')
   if fetched_users<=0:
    print 'nothing to spider,spider_list is empty'
    return False
   self.status='start'
   self.errno=ERR_NO
   fetchall=self.db.fetchall()
    #add the pending sharers fetched from the database to the crawl queue
   for item in fetchall:
    self.spider_queue.put({
    'sid':item[0],
    'uk':item[1],
    'file_fetched':item[2],
    'follow_fetched':item[3],
    'follow_done':item[4],
    'file_done':item[5],
    'weight':item[6],
    'uid':item[7]
   })
   self.got_follow_count=0
   self.got_files_count=0
   self.while_count=0
  
  while not self.spider_queue.empty():
   self.while_count+=1
   share_user=self.spider_queue.get()
    #crawl this sharer's file list
    if not share_user['file_done']:
     print 'uk:%d now spidering files, %d files fetched'%(share_user['uk'],share_user['file_fetched'])
    rs=self.getShareLists(share_user['uk'],share_user['file_fetched'])
    #print(rs)
    if not rs:
      print 'uk:%d failed to fetch files, try again later...'%share_user['uk']
     return True
    total_count,fetched_count,file_list=rs
    total_fetched=share_user['file_fetched']+fetched_count
    print 'fetched_file_count:%d'%fetched_count
    if total_fetched>=total_count or total_count==0:
     share_user['file_done']=1 #all of this sharer's files have been fetched
    if total_count==0:
     self.db.execute("UPDATE spider_list set file_done=%s WHERE sid=%s",(1,share_user['sid']))
     self.db.commit()
    else:
     try:
      files_count=0
      for file in file_list:
       files_count+=1
       ext=''
       file_type=''
       file_type_i=-1
       if file['isdir']==0 and file['feed_type']=='share':
        ext = metautils.get_extension(file['title']).lower()
        file_type = metautils.get_category(ext)
        file_type_i=self.file_type_t[file_type]
       time_stamp=int(time.time())
       self.db.execute("INSERT INTO share_file (title,uk,user_name,shareid,shorturl,isdir,size,md5,ext,feed_time,create_time,file_type,uid,feed_type) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",(file['title'],file['uk'],file['username'],file['shareid'],file['shorturl'],file['isdir'],file['size'],file['md5'],ext,file['feed_time'],time_stamp,file_type_i,share_user['uid'],file['feed_type'])
       )
     except:
      share_user['file_done']=0
      self.db.rollback()
      traceback.print_exc()
      return False
     else:
      self.db.execute("UPDATE spider_list set file_fetched=%s,file_done=%s WHERE sid=%s",(total_fetched,share_user['file_done'],share_user['sid']))
      self.db.execute("UPDATE share_users set fetched=%s WHERE uid=%s",(total_fetched,share_user['uid']))
      share_user['file_fetched']=total_fetched
      self.got_files_count+=files_count
      self.db.commit()
      
    #after the files are done, crawl the follow list
   if share_user['follow_done']==0 and share_user['file_done']==1:
     print 'uk:%d now spidering follows, %d follows fetched'%(share_user['uk'],share_user['follow_fetched'])
    rs=self.getFollows(share_user['uk'],share_user['follow_fetched'])
    if not rs:
      print 'failed to fetch follows, try again later...'
     return
    total_count,fetched_count,follow_list=rs
    total_fetched=share_user['follow_fetched']+fetched_count
    print 'fetched_follow_count:%d'%fetched_count
    if total_fetched>=total_count or total_count==0:
     share_user['follow_done']=1
    if total_count==0:
     self.db.execute("DELETE FROM spider_list WHERE sid=%s",(share_user['sid'],))
     self.db.commit()
    else:
     try:
      follow_count=0
      for follow in follow_list:
       follow_count+=1
       #check whether this user is already in the table
       if self.db.execute('SELECT * FROM share_users WHERE uk=%s',(follow['follow_uk'],))>0:
        print 'uk:%d is already in the share_users table'%follow['follow_uk']
        continue
       time_stamp=int(time.time())
       self.db.execute("INSERT INTO share_users (uk,user_name,avatar_url,intro,follow_count,album_count,\
        fens_count,pubshare_count,last_visited,create_time,weight) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",(
         follow['follow_uk'],follow['follow_uname'],follow['avatar_url'],follow['intro'],follow['follow_count'],
         follow['album_count'],follow['fans_count'],follow['pubshare_count'],time_stamp,time_stamp,5
        )
       )
       #add the newly discovered sharer to the crawl list
       self.db.execute("INSERT INTO spider_list (uk,uid) VALUES(%s,%s)",(follow['follow_uk'],self.db.last_row_id()))
     except:
      share_user['follow_done']=0
      self.db.rollback()
      traceback.print_exc()
      return False
     else:
      if share_user['follow_done']==1:
       #all follows fetched: this sharer is done, remove it from the pending list
       print 'delete follow fetched sid:%d from spider_list'%share_user['sid']
       self.db.execute("DELETE FROM spider_list WHERE sid=%s",(share_user['sid'],))
      else:
       self.db.execute("UPDATE spider_list set follow_fetched=%s,follow_done=%s WHERE sid=%s",(total_fetched,share_user['follow_done'],share_user['sid']))
      share_user['follow_fetched']=total_fetched
      self.got_follow_count+=follow_count
      self.db.commit()
    #if the follow list is not finished, this sharer is not fully crawled yet: re-queue it and keep going
   if share_user['follow_done']==0:
    self.spider_queue.put(share_user)
   else:
    print '%d has done'%share_user['uk']
    del share_user
   time.sleep(SPIDER_INTERVAL)
  
  print '-----------------Done------------------'
  print 'while_count:%d'%self.while_count
  print 'got_follow_count:%d'%self.got_follow_count
  print 'got_files_count:%d'%self.got_files_count
  return True

 def stop(self):
  pass

if __name__ == "__main__":
 parser = argparse.ArgumentParser()
 parser.add_argument("--seed-user", help="get seed user", action="store_true")
 args = parser.parse_args()
 
 spider=BaiduPanSpider()
 
 # seed the database with hot users
 if args.seed_user:
  spider.seedUsers()
 else:
  while(1):
   print 'start spider...'
   result=spider.startSpider()
   if not result:
     print 'The spider was refused; retrying automatically in 5 minutes...'
    time.sleep(60*5)
   else:
     print 'one work queue is done'
    time.sleep(1)