
Programming Collective Intelligence: Searching and Ranking, a Python Implementation

This post walks through building a simple search engine. The steps are:

  • Crawling: start from one page or a small set of pages and follow the links inside each page to reach other pages, recursing until a given depth or a given number of pages is reached.
  • Indexing: build data tables that record the position of every word in every document. The documents themselves do not have to be stored in the database; the index only needs a reference to where each document lives.
  • Querying and ranking: apply a suitable ranking method and return a sorted list of pages.

Crawling:
Start from an initial page, fetch and parse its contents, and record the "page - word - position" relationships in the wordlocation table. Then look for links in the page; for each one, convert its relative path to an absolute URL and store it as a URL for the next level of the crawl. At the same time, store the "source URL - destination URL - link text" relationship in the link table (the link-text words go into the linkwords table).
That completes one level. The crawl then moves on to the next level using the newly added URLs.
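
As a rough illustration, here is a minimal, database-free sketch of that breadth-first loop; it only collects URLs level by level, while the crawler class in the full code below also indexes each page and records the links:

import urllib2
from urlparse import urljoin
from bs4 import BeautifulSoup

def collect_urls(pages, depth=2):
    seen = set(pages)
    for _ in range(depth):
        newpages = set()
        for page in pages:
            try:
                soup = BeautifulSoup(urllib2.urlopen(page).read(), "html5lib")
            except Exception:
                continue  # skip pages that cannot be fetched
            for link in soup('a'):
                if 'href' not in dict(link.attrs):
                    continue
                # convert the relative path to an absolute URL and drop any #fragment
                url = urljoin(page, link['href']).split('#')[0]
                if url.startswith('http') and url not in seen:
                    seen.add(url)
                    newpages.add(url)
        pages = newpages  # the newly found URLs seed the next level
    return seen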

Querying:
The user enters a query string, which is split into words. From those words an SQL statement is built that finds the pages containing every word in the string, and the matches are recorded. An example of the generated statement is shown below.
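
For a two-word query such as "football game", the statement is a self-join of the wordlocation table with one alias per query word (the wordid values here stand for whatever rowids the two words happen to have in wordlist):

select w0.urlid, w0.location, w1.location
from wordlocation w0, wordlocation w1
where w0.wordid=234 and w0.urlid=w1.urlid and w1.wordid=1432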

Ranking:
1. Word frequency: the more often the query words appear in a page, the higher the score.
2. Document location: the closer to the beginning of the document a word appears, the higher the score.
3. Word distance: the closer together the query words appear in a page, the better.
4. Inbound links: count the links that point at a page; the more, the better.
5. PageRank: the computation is in the code and is quite simple. Note that PageRank values do not have to be generated at query time; once crawling is finished and the database is built, the PageRank of every page can be computed, stored, and refreshed periodically.
Computation: give every page an arbitrary initial PageRank and iterate the calculation a number of times. With each iteration the PageRank of every page moves closer to its true value; a short sketch follows below.
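
A minimal sketch of one such iteration, assuming the link graph is held in plain dicts rather than in the database (the real calculatepagerank method in the code below reads and writes the pagerank table instead):

# pr maps urlid -> current score, inlinks maps urlid -> pages linking to it,
# outcount maps urlid -> number of links found on that page
def pagerank_step(pr, inlinks, outcount, damping=0.85):
    new_pr = {}
    for url in pr:
        score = 1 - damping  # the 0.15 minimum that every page receives
        for linker in inlinks.get(url, ()):
            score += damping * pr[linker] / float(outcount[linker])
        new_pr[url] = score
    return new_pr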

Combining the scores:
Depending on what the search engine should emphasize, the scores produced by the methods above are combined. The scores are best normalized before they are combined, so that the different metrics are on a comparable scale; each metric is then given a weight and the final score is the weighted sum. A sketch is shown below.
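
Here is a condensed sketch of the normalization and weighted sum, assuming each metric already returns a {urlid: score} dictionary (the full logic lives in normalizescores and getscoredlist in the code below):

import collections

def normalize(scores, small_is_better=False):
    vsmall = 0.00001  # guard against division by zero
    if small_is_better:
        minscore = min(scores.values())
        return dict((u, float(minscore) / max(vsmall, v)) for u, v in scores.items())
    maxscore = max(max(scores.values()), vsmall)
    return dict((u, float(v) / maxscore) for u, v in scores.items())

def combine(weighted_metrics):
    # weighted_metrics: list of (weight, {urlid: normalized score}) pairs
    total = collections.defaultdict(float)
    for weight, scores in weighted_metrics:
        for url, s in scores.items():
            total[url] += weight * s
    return total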

The full code is given below:

# -*- coding: utf-8 -*-
__author__ = 'Bai Chenjia'

import urllib2
# import BeautifulSoup
from bs4 import BeautifulSoup
from urlparse import urljoin
import sys
from sqlite3 import dbapi2 as sqlite
import re
import collections

reload(sys)
sys.setdefaultencoding('utf8')

ignorewords = set(['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it'])


class crawler:
    # initialize the crawler with the name of the database
    def __init__(self, dbname):
        self.con = sqlite.connect(dbname)

    def __del__(self):
        self.con.close()

    def dbcommit(self):
        self.con.commit()

    # Helper function: get the id of an entry and, if the entry does not exist
    # yet, add it to the database.
    # table is the table name, field the column name, value the value to look up
    def getentryid(self, table, field, value, createnew=True):
        cur = self.con.execute(
            "select rowid from %s where %s='%s'" % (table, field, value))
        # fetch a single result row
        res = cur.fetchone()
        # if res is None the value is not in the table yet, so insert it
        if res is None:
            cur = self.con.execute(
                "insert into %s (%s) values ('%s')" % (table, field, value))
            return cur.lastrowid
        # otherwise return the existing rowid
        else:
            return res[0]

    # Index a page, given its url and the soup parsed from it.
    # gettextonly and separatewords extract the individual words, then every
    # word is associated with the page, storing its position in the document.
    def addtoindex(self, url, soup):
        if self.isindexed(url):
            return
        print "Indexing " + url
        # get the individual words
        text = self.gettextonly(soup)
        words = self.separatewords(text)
        # get the id of this url
        urlid = self.getentryid('urllist', 'url', url)
        # associate each word with this url
        for i in range(len(words)):
            word = words[i]
            if word in ignorewords:  # skip stop words
                continue
            # get the word id, creating a new entry if needed
            wordid = self.getentryid('wordlist', 'word', word)
            # store the urlid-wordid-location triple in the wordlocation table
            self.con.execute(
                "insert into wordlocation(urlid,wordid,location) values (%d,%d,%d)"
                % (urlid, wordid, i))

    # Extract the text from an HTML page (without tags) as one long string
    def gettextonly(self, soup):
        v = soup.string
        if v is None:
            c = soup.contents
            resulttext = ''
            for t in c:
                # recurse into the child nodes
                subtext = self.gettextonly(t)
                resulttext += subtext + '\n'
            return resulttext
        else:
            return v.strip()

    # Split the string produced by gettextonly into a list of separate words,
    # using any non-word character as the delimiter, so they can be indexed
    def separatewords(self, text):
        # regular expression that splits on non-word characters
        splitter = re.compile('\\W+')
        # split the text into words and return the list
        return [s.lower() for s in splitter.split(text) if s != '']

    # Return True if the url has already been indexed
    def isindexed(self, url):
        # look up the url record; fetchone returns None if it does not exist
        u = self.con.execute(
            "select rowid from urllist where url='%s'" % url).fetchone()
        if u is not None:
            # check whether it has actually been crawled
            v = self.con.execute(
                "select * from wordlocation where urlid=%d" % u[0]).fetchone()
            if v is not None:
                return True
        return False

    # Add a link between two pages
    def addlinkref(self, urlFrom, urlTo, linkText):
        words = self.separatewords(linkText)
        fromid = self.getentryid('urllist', 'url', urlFrom)
        toid = self.getentryid('urllist', 'url', urlTo)
        if fromid == toid:
            return
        cur = self.con.execute(
            "insert into link(fromid,toid) values (%d,%d)" % (fromid, toid))
        linkid = cur.lastrowid
        for word in words:
            if word in ignorewords:
                continue
            wordid = self.getentryid('wordlist', 'word', word)
            self.con.execute(
                "insert into linkwords(linkid,wordid) values (%d,%d)" % (linkid, wordid))

    # Starting from a small set of pages, do a breadth-first crawl up to the
    # given depth, indexing the pages along the way
    def crawl(self, pages, depth=2):
        for i in range(depth):
            # use a set so that duplicate urls are not added
            newpages = set()
            for page in pages:
                print "page:", page
                try:
                    c = urllib2.urlopen(page)
                except:
                    print "Could not open %s" % page
                    continue
                soup = BeautifulSoup(c.read(), "html5lib")
                self.addtoindex(page, soup)
                # find the link tags in the parsed page
                links = soup('a')
                for link in links:
                    # if the tag links to another url, extract it and index it
                    if 'href' in dict(link.attrs):
                        # convert the relative path of the link into an absolute url
                        url = urljoin(page, link['href'])
                        # print "url:", url
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]  # remove the fragment part
                        if url[0:4] == 'http' and not self.isindexed(url):
                            newpages.add(url)
                        linkText = self.gettextonly(link)
                        self.addlinkref(page, url, linkText)
                # commit the changes to the database
                self.dbcommit()
            # the newly found urls become the pages for the next level
            pages = newpages
        print "end.."
    # Create the database tables and indices
    def createindextables(self):
        self.con.execute('create table urllist(url)')
        self.con.execute('create table wordlist(word)')
        self.con.execute('create table wordlocation(urlid,wordid,location)')
        self.con.execute('create table link(fromid integer,toid integer)')
        self.con.execute('create table linkwords(wordid,linkid)')
        self.con.execute('create index wordidx on wordlist(word)')
        self.con.execute('create index urlidx on urllist(url)')
        self.con.execute('create index wordurlidx on wordlocation(wordid)')
        self.con.execute('create index urltoidx on link(toid)')
        self.con.execute('create index urlfromidx on link(fromid)')
        self.dbcommit()

    # Build the pagerank table: compute the PageRank of every page and store it
    def calculatepagerank(self, iteration=20):
        # drop any previous pagerank table; it holds urlid (the primary key) and score
        self.con.execute('drop table if exists pagerank')
        self.con.execute('create table pagerank(urlid primary key, score)')
        # initialize the PageRank of every url to 1
        self.con.execute('insert into pagerank select rowid, 1.0 from urllist')
        self.dbcommit()
        print "Updating the pagerank table..."
        for i in range(iteration):
            print "iteration ", i
            for (urlid,) in self.con.execute('select rowid from urllist'):
                pr = 0.15
                # loop over every page that links to this one
                for (linker,) in self.con.execute(
                        'select distinct fromid from link where toid=%d' % urlid):
                    # PageRank of the linking page
                    linkingpr = self.con.execute(
                        'select score from pagerank where urlid=%d' % linker).fetchone()[0]
                    # total number of links on the linking page
                    linkingcount = self.con.execute(
                        'select count(*) from link where fromid=%d' % linker).fetchone()[0]
                    pr += 0.85 * (linkingpr / linkingcount)
                # write the new PageRank value back to the table
                self.con.execute(
                    'update pagerank set score=%f where urlid=%d' % (pr, urlid))
            self.dbcommit()


"""
The crawler class above crawls the pages and builds the database.
The searcher class below performs the search.
"""


class searcher:
    def __init__(self, dbname):
        self.con = sqlite.connect(dbname)

    def __del__(self):
        self.con.close()

    # Joint query on the database: split the query q into words and find the
    # pages that contain every one of them.
    # For the query "football game" the generated statement looks like
    # select w0.urlid,w0.location,w1.location from
    # wordlocation w0,wordlocation w1 where w0.wordid=234
    # and w0.urlid=w1.urlid and w1.wordid=1432
    def getmatchrows(self, q):
        # strings used to build the query
        fieldlist = 'w0.urlid'
        tablelist = ''
        clauselist = ''
        wordids = []
        # split the query into words
        words = q.split(' ')
        tablenumber = 0
        # build the joint query
        for word in words:
            # get the id of the word
            wordrow = self.con.execute(
                "select rowid from wordlist where word='%s'" % word).fetchone()
            if wordrow is not None:
                wordid = wordrow[0]
                wordids.append(wordid)
                if tablenumber > 0:
                    tablelist += ','
                    clauselist += ' and '
                    clauselist += 'w%d.urlid=w%d.urlid and ' % (
                        tablenumber - 1, tablenumber)
                fieldlist += ',w%d.location' % tablenumber
                tablelist += 'wordlocation w%d' % tablenumber
                clauselist += 'w%d.wordid=%d' % (tablenumber, wordid)
                tablenumber += 1
        # run the query and collect the results
        if clauselist != '':
            fullquery = 'select %s from %s where %s' % (
                fieldlist, tablelist, clauselist)
            print fullquery
            cur = self.con.execute(fullquery)
            rows = [row for row in cur]
            # rows holds (urlid, location, ...) tuples, wordids the ids of the query words
            return rows, wordids
        else:
            print "not found"

    # rows is the result of getmatchrows: (urlid, location, ...) tuples;
    # wordids holds the ids of the query words
    def getscoredlist(self, rows, wordids):
        # initialize the (url, score) dictionary with all scores set to 0
        totalscores = dict([(row[0], 0) for row in rows])
        # weights of the different scoring methods
        weights = [(1.0, self.locationscore(rows)),
                   (1.0, self.frequencyscore(rows)),
                   (2.0, self.pagerankscore(rows)),
                   (1.0, self.distancescore(rows)),
                   (1.0, self.inboundlinkscore(rows)),
                   (2.0, self.linktextscore(rows, wordids)),
                   (5.0, self.nnscore(rows, wordids))]
        # weight is a number, scores a dictionary holding the score of every url for that method
        for weight, scores in weights:
            if scores is None:
                print "weight = ", weight
                print scores is None
            for url in totalscores:
                try:
                    totalscores[url] += weight * scores[url]
                except:
                    pass
                    # print "url=",url,"scores=",len(scores.items())
        return totalscores

    # Get the url from urllist by rowid
    def geturlname(self, urlid):
        res = self.con.execute(
            "select url from urllist where rowid = %d" % urlid).fetchone()[0]
        return res

    # Produce the final search results: getmatchrows retrieves the candidates,
    # getscoredlist scores and sorts them, then the top 10 are printed with their real urls
    def query(self, q):
        rows, wordids = self.getmatchrows(q)
        scores = self.getscoredlist(rows, wordids)
        rankedscores = sorted([(score, url) for (url, score) in scores.items()],
                              key=lambda x: x[0], reverse=True)
        for score, urlid in rankedscores[0:10]:
            print score, self.geturlname(urlid)

    # Some scoring methods treat larger values as better, others smaller values,
    # so the scores are normalized into the range 0-1
    def normalizescores(self, scores, smallIsBetter=0):
        vsmall = 0.00001
        if smallIsBetter:
            # scores.values() returns the values of the dictionary as a list
            minscore = min(scores.values())
            res = []
            for u, l in scores.items():
                temp = float(minscore) / max(vsmall, l)
                res.append((u, temp))
            return dict(res)
        else:
            maxscore = max(max(scores.values()), vsmall)
            res = []
            for u, c in scores.items():
                temp = float(c) / maxscore
                res.append((u, temp))
            return dict(res)

    # Score documents by the position of the query words: the topic of a page is
    # likely to appear near its beginning, so earlier occurrences score higher
    def locationscore(self, rows):
        if len(rows) == 0:
            return collections.defaultdict(int)
        # default dictionary
        locations = collections.defaultdict(int)
        for row in rows:
            # sum of the positions of all query words
            loc = sum(row[1:])
            locations[row[0]] = loc
        return self.normalizescores(locations, smallIsBetter=1)

    # Score documents by word frequency
    def frequencyscore(self, rows):
        if len(rows) == 0:
            # default value: every key maps to 0
            return collections.defaultdict(int)
        # initialize the counts
        counts = dict([(row[0], 0) for row in rows])
        # count the occurrences
        for row in rows:
            counts[row[0]] += 1
        # normalize
        return self.normalizescores(counts, smallIsBetter=0)

    # The closer together the query words appear in a page, the higher the score
    def distancescore(self, rows):
        if len(rows) == 0:
            return collections.defaultdict(int)
        dis = collections.defaultdict(int)
        for row in rows:
            # distance between the query words in this page
            temp = sum([abs(row[i] - row[i - 1]) for i in range(2, len(row))])
            dis[row[0]] = temp
            # print "dis = ", dis.items()[:]
        return self.normalizescores(dis, smallIsBetter=1)

    # The importance of a page is determined by the number of other pages that
    # link to it, where every inbound link carries the same weight
    def inboundlinkscore(self, rows):
        if len(rows) == 0:
            return collections.defaultdict(int)
        uniqueurls = set([row[0] for row in rows])
        bound_list = [(u, self.con.execute(
            'select count(*) from link where toid=%d' % u).fetchone()[0]) for u in uniqueurls]
        bound_dict = collections.defaultdict(int)
        for u1, v in bound_list:
            bound_dict[u1] = v
        return self.normalizescores(bound_dict, smallIsBetter=0)

    # Score documents by PageRank: read the values from the pagerank table and normalize them
    def pagerankscore(self, rows):
        if len(rows) == 0:
            return collections.defaultdict(int)
        pagescore = [(row[0], self.con.execute(
            'select score from pagerank where urlid=%d' % row[0]).fetchone()[0]) for row in rows]
        pagerank_dict = dict(pagescore)
        return self.normalizescores(pagerank_dict, smallIsBetter=0)

    # Link-text scoring: a page receives the PageRank of every page that links to
    # it with a query word in the link text
    def linktextscore(self, rows, wordids):
        linkscores = dict([(row[0], 0) for row in rows])
        for wordid in wordids:
            cur = self.con.execute(
                'select link.fromid,link.toid from linkwords,link '
                'where wordid=%d and linkwords.linkid=link.rowid' % wordid)
            for (fromid, toid) in cur:
                if toid in linkscores.keys():
                    pr = self.con.execute(
                        'select score from pagerank where urlid=%d' % fromid).fetchone()[0]
                    linkscores[toid] += pr
        maxscore = max(linkscores.values())
        normalizedscores = dict([(u, float(l) / maxscore)
                                 for (u, l) in linkscores.items()])
        return normalizedscores

    # Neural-network scoring (not implemented here)
    def nnscore(self, rows, wordids):
        return collections.defaultdict(int)


if __name__ == "__main__":
    # run the crawler on the given pages
    # pagelist = ["http://www.bbc.com/sport/football/35622621"]
    # new_crawler = crawler(dbname='')
    # new_crawler.crawl(pagelist)

    # this statement may only be run once: it creates the database tables
    # newcrawler.createindextables()

    # crawl and index the pages
    # newcrawler = crawler('searchindex.db')
    # pages = ['http://www.bbc.com/sport/football/35622621']
    # newcrawler.crawl(pages)

    # update the pagerank table
    # newcrawler = crawler('searchindex.db')
    # newcrawler.calculatepagerank()

    # inspect the database: print the pages with the highest pagerank values
    # newcrawler = crawler('searchindex.db')
    # a = newcrawler.con.execute('select * from pagerank order by score desc').fetchall()[0:10]
    # e = searcher('searchindex.db')
    # for i in range(10):
    #     print e.geturlname(a[i][0]), "score=", a[i][1]

    # e = searcher('searchindex.db')
    # result = e.getmatchrows('football game')
    # print "result = ", result[:]

    # call query to print the final ranked pages according to the weights
    e = searcher('searchindex.db')
    e.query('football game')

    # test function: normalizescores
    # testscore = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}
    # e = searcher('searchindex.db')
    # res = e.normalizescores(testscore, 0)
    # print res