程式人生 > 【Python】倒排索引

【Python】倒排索引

程式碼連結

預處理

word stemming

一個單詞可能有不同的形式,在英語中比如動詞的主被動、單複數等,例如 live、lives、lived。
雖然英文的處理看起來已經很複雜了,但實際上中文的處理要複雜得多。

stop words

比如a、the這種詞在處理的時候沒有實際意義。在這裡處理的時候先對詞頻進行統計,人為界定停詞,簡單的全部替換為空格。但是這種方式並不適用於所有的情況,對於比如,To be or not to be,這種就很難處理。

具體實現

Index.txt 記錄所出現的檔案
這裡將建立倒排索引分為三步

thefile.txt 所有出現過的詞(詞頻由高到低)
stop_word.txt 停詞
data.pkl 所建立的索引

1 count.py 確定停詞
2 index.py 建立倒排索引
3 query.py 用於查詢

這裡在建立倒排索引的時候只記錄了出現的檔名,並沒有記錄在檔案中出現的位置。

圖為count.py生成的詞頻統計

這裡寫圖片描述

count.py

#-*- coding:utf-8 -*-
'''
@author birdy qian
'''
import sys
from nltk import *                                                                                          #import natural-language-toolkit
from operator import itemgetter #for sort def output_count(fdist): #output the relative information #vocabulary =fdist.items() vocabulary =fdist.items() #get all the vocabulary
vocabulary=sorted(vocabulary, key=itemgetter(1),reverse=True) #sort the vocabulary in decreasing order print vocabulary[:250] #print top 250 vocabulary and its count on the screen print 'drawing plot.....' #show process fdist.plot(120 , cumulative=False) #print the plot #output in file file_object = open('thefile.txt', 'w') #prepare the file for writing for j in vocabulary: file_object.write( j[0] + ' ') #put put all the vocabulary in decreasing order file_object.close( ) #close the file def pre_file(filename): print("read file %s.txt....."%filename) #show process content = open( str(filename) + '.txt', "r").read() content = content.lower() for ch in '!"#$%&()*+,-./:;<=>[email protected][\\]^_‘{|}~' : #cancel the punction content = content.replace(ch, " ") plurals = content.split() #split the file at '\n' or ' ' stemmer = PorterStemmer() #prepare for stemming singles = [stemmer.stem(plural) for plural in plurals] #handling stemming return singles #main function def main(): print "read index....." #show process input = open('index.txt', 'r') #titles that need to be handled all_the_file =input.read( ) file=all_the_file.split() input.close() #close the file fdist1=FreqDist() #create a new dist for x in range( 0, len(file) ): #print file[x] txt = pre_file( file[x] ) #pre handing the txt for words in txt : words =words.decode('utf-8').encode(sys.getfilesystemencoding()) #change string typt from utf-8 to gbk fdist1[words] +=1 #add it to the dist output_count(fdist1) #runfile if __name__ == '__main__': main()

index.py

#-*- coding:utf-8 -*-
'''
@author birdy qian
'''

import sys
import pickle                   
from nltk import *                                                                                          #import natural-language-toolkit
from operator import itemgetter                                                                 #for sort


STOPWORDS = []                                                                                          #grobal variable

def output_index(result):
    """Persist the inverted index *result* (word -> list of titles) to 'data.pkl'."""
    with open('data.pkl', 'wb') as output:
        pickle.dump(result, output)         # default pickle protocol, matches query.py's load


def pre_file(filename):
    """Read <filename>.txt and return its stemmed tokens with stop words removed.

    Lower-cases the text, replaces punctuation with spaces, splits on
    whitespace, drops STOPWORDS tokens, and Porter-stems what remains.
    """
    global STOPWORDS
    print("read file %s.txt....." % filename)                       # show progress
    with open(str(filename) + '.txt', "r") as handle:               # close the file deterministically
        content = handle.read().lower()

    # NOTE(review): the published punctuation string was corrupted by the blog
    # platform ('?@' became '[email protected]', the quote became '��'); restored here.
    for ch in '!"#$%&()*+,-./:;<=>?@[\\]^_‘{|}~':
        content = content.replace(ch, " ")

    plurals = content.split()                                       # split at whitespace

    # Filter stop words as whole tokens. The original replaced them as raw
    # substrings, which also destroyed the interiors of longer words
    # (e.g. stop word 'a' turned 'cat' into 'c t').
    stopset = set(STOPWORDS)
    plurals = [word for word in plurals if word not in stopset]

    stemmer = PorterStemmer()                                       # prepare for stemming
    singles = [stemmer.stem(plural) for plural in plurals]          # handle stemming

    return singles

def readfile(filename):
    """Return the whitespace-separated tokens of the file *filename*."""
    with open(filename, 'r') as handle:         # closed automatically
        text = handle.read()
    return text.split()                         # splits at '\n' or ' '



#main function
def main():
    """Build the inverted index and pickle it to data.pkl.

    For every title listed in index.txt, reads and stems <title>.txt and
    records which titles contain each word (file names only — positions
    within the file are not recorded).
    """
    global STOPWORDS
    print("read index.....")                                            # show progress
    file = readfile('index.txt')
    print("read stopwords.....")
    STOPWORDS = readfile('stop_word.txt')
    # NOTE(review): the original also read thefile.txt into an unused local
    # ('word'); that dead read has been removed.

    result = {}                                                         # word -> list of titles containing it

    for x in range(0, len(file)):
        txt = pre_file(file[x])                                         # file[x] is the title
        txt = {}.fromkeys(txt).keys()                                   # de-duplicate words within one file

        for words in txt:
            # Python 2 only: re-encode utf-8 bytes to the filesystem encoding (gbk)
            words = words.decode('utf-8').encode(sys.getfilesystemencoding())
            # Append this title to the word's posting list, creating it on first sight
            result.setdefault(words, []).append(file[x])

    output_index(result)



#runfile
if __name__ == '__main__': 
    main()

query.py

#-*- coding:utf-8 -*-
'''
@author birdy qian
'''
import os 
import sys
import pprint, pickle
from nltk import PorterStemmer

def readfile(filename):
    """Read *filename* and return its contents split at whitespace."""
    with open(filename, 'r') as src:
        return src.read().split()       # split at '\n' or ' '; file closed by 'with'

def getdata():
    """Load and return the pickled inverted index from 'data.pkl'."""
    with open('data.pkl', 'rb') as pkl_file:    # index is saved in 'data.pkl'
        return pickle.load(pkl_file)            # unpickle the word -> titles mapping

def output( result ):
    """Print query results ten records at a time.

    *result* is None (single unknown term), [] (empty intersection), or a
    list of titles. For more than ten records the user is prompted after
    each page; any input other than 'N' stops the paging.
    Python 2 only: relies on print statements, raw_input, and integer
    division in 'len(result) / 10'.
    """
    #print result
    if result == None:                                              #if the words is not in the index (one word return None)
        print None
        return
    if len(result) == 0 :                                           #if the words is not in the index (more than one words return [] )
        print None
        return 

    if len(result) < 10 :                                               #if the records is less than 10
        print result

    else:                                                                   #if the records is more than 10
        print 'get '+ str(len(result)) + ' records'                                                                         #the record number
        for i in range( 0 , len(result) / 10 +1):                       # integer division (Python 2)
            print '10 records start from ' +str(i*10+1)

            if 10 * i + 9 < len(result) :                                                                                           #print from 10 * i to 10 * i + 10
                print result[ 10 * i : 10 * i + 10 ]
            else:                                                                                                                           #print from 10 * i to end
                print result[ 10 * i :  len(result) ]
                break
            getstr = raw_input("Enter 'N' for next ten records & other input to quit : ")
            if getstr != 'N':
                break



#main function
def _clean_query(raw, stopwords, stemmer):
    """Lower-case *raw*, drop stop-word tokens, and return the stemmed tokens.

    Stop words are removed as whole tokens; the original replaced them as raw
    substrings, which also mangled the interiors of longer words.
    """
    tokens = raw.lower().split()
    stopset = set(stopwords)
    return [stemmer.stem(tok) for tok in tokens if tok not in stopset]


# main function
def main():
    """Interactive query loop: AND together the posting lists of each query term."""
    data_list = getdata()                                               # read the pickled index
    STOPWORDS = readfile('stop_word.txt')
    stemmer = PorterStemmer()                                           # prepare for stemming

    while True:
        get_str = raw_input("Enter your query('\\'to quit): ")
        if get_str == '\\':                                             # leave the loop
            break

        query_list = _clean_query(get_str, STOPWORDS, stemmer)
        while not query_list:                                           # everything was a stop word
            get_str = raw_input("Please enter more information: ")
            query_list = _clean_query(get_str, STOPWORDS, stemmer)

        result = []
        for k in range(0, len(query_list)):
            # Missing term -> empty posting list. The original passed None
            # into set.intersection and crashed on unknown terms.
            postings = data_list.get(query_list[k]) or []
            if k == 0:                                                  # first term seeds the result
                result = postings
            else:                                                       # later terms narrow it
                result = list(set(result).intersection(postings))
        output(result)


#runfile
if __name__ == '__main__': 
    main()