
[Search Engine] Building a smart search engine with whoosh + unsupervised clustering

Because this interface needs a lot of custom behavior and the underlying algorithms are built in-house, no other third-party search libraries are used: only whoosh, with the remaining algorithms written from scratch.

# -*- coding: utf-8 -*-
from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in, open_dir
from whoosh.query import And, Term, Or
from jieba.analyse import ChineseAnalyzer
import pymysql
from gensim import corpora, models
from gensim.similarities.docsim import Similarity
import datetime
import jieba
import os
import glob
import jieba.posseg as psg

# Load the custom HR vocabulary so jieba segments domain terms correctly.
jieba.load_userdict('..//..//spo//HR專業詞彙.txt')

def get_joblist():
    # Fetch all jobs that have not expired and have passed the manual check.
    db = pymysql.connect(host='131.42.33.12', user='rxxt', password='52xxkk',
                         database='unxxkkao', port=3306, charset='utf8')
    sql_job = ("SELECT jobName, workPlace, jobDescript, un2co_job.id_job "
               "FROM un2co_job WHERE enddate >= %s "
               "AND un2co_job.id_job IN "
               "(SELECT id_job FROM un2co_natural_job_check WHERE natural_check = 1)")
    cursor = db.cursor()
    cursor.execute(sql_job, (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),))
    joblist = cursor.fetchall()
    db.close()
    return joblist

def update_index():
    # A marker file named <date>.xkk records when the models were last built;
    # skip rebuilding if a marker from today already exists.
    filename = glob.glob('*.xkk')
    print("filename:", filename)
    if len(filename) > 0:
        now = datetime.datetime.now()
        before = datetime.datetime.strptime(filename[0][:-4], "%Y-%m-%d")
        a = now - before
        if a.days <= 0:
            print('Current models are still fresh, no rebuild needed:', a.days)
            return
    # Build and save the LSI model.
    joblist = get_joblist()
    jobinfolist = [list(jieba.cut(job[0] + job[1] + job[2])) for job in joblist]

    search_dictionary = corpora.Dictionary(jobinfolist)
    search_corpus = [search_dictionary.doc2bow(text) for text in jobinfolist]
    search_tfidf_model = models.TfidfModel(search_corpus)

    search_corpus_tfidf = [search_tfidf_model[doc] for doc in search_corpus]
    search_lsi = models.LsiModel(search_corpus_tfidf, id2word=search_dictionary,
                                 num_topics=21 + 80)

    search_dictionary.save('search_lsi_index//search_lsi_index.dict')
    search_tfidf_model.save('search_lsi_index//search_lsi_index.tfidf')
    search_lsi.save('search_lsi_index//search_lsi_index.lsi')

    corpus_lsi = [search_lsi[doc] for doc in search_corpus]
    # num_features must be at least the LSI dimensionality (21 + 80 topics here).
    sim = Similarity('search_lsi_index//Similarity-Lsi-index', corpus_lsi,
                     num_features=200, num_best=30)
    sim.save('search_lsi_index//Similarity-Lsi-index.sim')
    # Build and save the whoosh index.
    schema = Schema(jobid=ID(stored=True),
                    jobcontent=TEXT(stored=True, analyzer=ChineseAnalyzer()),
                    joblocation=TEXT(stored=True, analyzer=ChineseAnalyzer()),
                    jobname=TEXT(stored=True, analyzer=ChineseAnalyzer()))

    index = create_in("search_whoosh_index", schema)
    writer = index.writer()
    for job in joblist:
        writer.add_document(jobid=str(job[-1]), jobcontent=job[2],
                            joblocation=job[1], jobname=job[0])
    writer.commit()

    # Remove stale markers, then write today's marker so the next run can skip rebuilding.
    for old_marker in glob.glob('*.xkk'):
        os.remove(old_marker)
    with open('{0}.xkk'.format(str(datetime.datetime.now())[:10]), 'w') as f:
        f.write(str(datetime.datetime.now()) + ' model generation finished')

    print('Model generation finished at: ' + str(datetime.datetime.now()))
    return joblist
    
def get_index():
    return open_dir('search_whoosh_index')

def get_whoosh_result(user):
    # POS-tag the query; words whose flag contains 'location' or 'job'
    # (custom flags from the user dictionary) are routed to the matching fields.
    peg_rs = list(psg.cut(user))
    location_Term = [Term('joblocation', w.word) for w in peg_rs if 'location' in w.flag]
    job_Term = [Term('jobcontent', w.word) for w in peg_rs if 'job' in w.flag]
    jobname_Term = [Term('jobname', w.word) for w in peg_rs if 'job' in w.flag]

    Term_list = [location_Term, job_Term, jobname_Term]
    with get_index().searcher() as searcher:
        # OR the terms within each field, then AND the non-empty fields together.
        myquery = And([Or(term) for term in Term_list if len(term) > 0])
        result = searcher.search(myquery, terms=True, limit=100)
        return [recommend['jobid'] for recommend in result]



def get_lsi_result(joblist, user):
    # Reuse the saved models if they exist; otherwise rebuild them from joblist.
    if len(os.listdir('search_lsi_index//')) > 3:
        search_dictionary = corpora.Dictionary.load('search_lsi_index//search_lsi_index.dict')
        search_tfidf_model = models.TfidfModel.load('search_lsi_index//search_lsi_index.tfidf')
        search_lsi = models.LsiModel.load('search_lsi_index//search_lsi_index.lsi')
        print("Loading saved LSI models")
        search_similarity_lsi = Similarity.load('search_lsi_index//Similarity-Lsi-index.sim')
    else:
        jobinfolist = [list(jieba.cut(job[0] + job[1] + job[2])) for job in joblist]

        search_dictionary = corpora.Dictionary(jobinfolist)
        search_corpus = [search_dictionary.doc2bow(text) for text in jobinfolist]
        search_tfidf_model = models.TfidfModel(search_corpus)

        search_corpus_tfidf = [search_tfidf_model[doc] for doc in search_corpus]
        search_lsi = models.LsiModel(search_corpus_tfidf, id2word=search_dictionary,
                                     num_topics=21 + 80)

        search_dictionary.save('search_lsi_index//search_lsi_index.dict')
        search_tfidf_model.save('search_lsi_index//search_lsi_index.tfidf')
        search_lsi.save('search_lsi_index//search_lsi_index.lsi')

        search_corpus_lsi = [search_lsi[doc] for doc in search_corpus]
        search_similarity_lsi = Similarity('search_lsi_index//Similarity-Lsi-index',
                                           search_corpus_lsi, num_features=200, num_best=30)
        search_similarity_lsi.save('search_lsi_index//Similarity-Lsi-index.sim')

    # Run the query through the same pipeline: tokenize -> bow -> tf-idf -> LSI,
    # then look up the most similar jobs and return their ids.
    test_cut_raw_1 = list(jieba.cut(user))
    test_corpus_3 = search_dictionary.doc2bow(test_cut_raw_1)
    test_corpus_tfidf_3 = search_tfidf_model[test_corpus_3]
    test_corpus_lsi_3 = search_lsi[test_corpus_tfidf_3]

    return [str(joblist[i[0]][3]) for i in search_similarity_lsi[test_corpus_lsi_3]]




if __name__ == '__main__':
    user = '上海'
    joblist = get_joblist()
    update_index()

    WHOOSH_Recommend = get_whoosh_result(user)
    LSI_Recommend = get_lsi_result(joblist, user)

    # Merge both result sets; the set union removes duplicate job ids.
    Result_Recommend = set(WHOOSH_Recommend) | set(LSI_Recommend)
    print(Result_Recommend)

# Compound whoosh queries can also be nested, e.g.:
# Or([Term("content", "render"), And([Term("title", "shade"), Term("keyword", "animate")])])

In fact, some search functionality could also be built on top of doc2vec.
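A minimal sketch of what that doc2vec variant could look like with gensim's Doc2Vec (gensim 4.x API; the helper names build_doc2vec_model/get_doc2vec_result and all hyperparameters are illustrative assumptions, not from the original post):

# Hypothetical doc2vec-based retrieval, sketched under the same data layout
# as get_joblist() above: job = (jobName, workPlace, jobDescript, id_job).
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import jieba

def build_doc2vec_model(joblist):
    # Tag each job posting with its job id so similarity hits map back to jobs.
    documents = [TaggedDocument(words=list(jieba.cut(job[0] + job[1] + job[2])),
                                tags=[str(job[-1])])
                 for job in joblist]
    # Hyperparameters are illustrative; tune them on real data.
    return Doc2Vec(documents, vector_size=100, window=5, min_count=2, epochs=20)

def get_doc2vec_result(model, user, topn=30):
    # Infer a vector for the query and return the ids of the most similar jobs.
    query_vec = model.infer_vector(list(jieba.cut(user)))
    return [jobid for jobid, score in model.dv.most_similar([query_vec], topn=topn)]

The returned ids could then be merged with the whoosh and LSI results by the same set union used in __main__ above.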