【搜尋引擎】用whoosh+無監督聚類搭建一套智慧搜尋引擎
阿新 • 發佈:2018-11-29
因本介面自定義功能較多,且底層演算法自己搭建,故不再使用其他三方庫,只用whoosh,其他演算法自己寫。
# -*- coding: utf-8 -*-
"""Job search engine: Whoosh keyword index + gensim LSI semantic index.

Pulls approved, unexpired jobs from MySQL, builds (or reuses) a Whoosh
full-text index and an LSI similarity index over them, then answers a
free-text query by merging the exact-match (Whoosh) and semantic (LSI)
result sets.
"""
import datetime
import glob
import os

import jieba
import jieba.posseg as psg
import pymysql
from gensim import corpora, models
from gensim.similarities.docsim import Similarity
from jieba.analyse import ChineseAnalyzer
from whoosh.fields import ID, TEXT, Schema
from whoosh.index import create_in, open_dir
from whoosh.query import And, Or, Term

# Domain dictionary so jieba segments HR terminology as single tokens.
jieba.load_userdict('..//..//spo//HR專業詞彙.txt')

# Number of LSI topics (the original hard-coded 21+80). This is also the
# dimensionality of the vectors stored in the Similarity index, so the two
# MUST agree (the original passed num_features=200, a mismatch).
LSI_NUM_TOPICS = 101


def get_joblist():
    """Fetch all approved, not-yet-expired job rows.

    Returns:
        Tuple of rows ``(jobName, workPlace, jobDescript, id_job)``.
    """
    # NOTE(review): credentials are hard-coded; move them to config/env.
    db = pymysql.connect('131.42.33.12', 'rxxt', '52xxkk', 'unxxkkao',
                         port=3306, charset='utf8')
    try:
        # Parameterized query instead of string concatenation (safe quoting,
        # no SQL injection); result set is identical to the original.
        sql_job = ("SELECT jobName,workPlace,jobDescript,un2co_job.id_job "
                   "FROM un2co_job WHERE enddate>=%s "
                   "AND un2co_job.id_job in("
                   "SELECT id_job FROM un2co_natural_job_check "
                   "WHERE natural_check=1)")
        now_str = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        cursor = db.cursor()
        cursor.execute(sql_job, (now_str,))
        return cursor.fetchall()
    finally:
        db.close()  # the original leaked the connection


def _build_lsi_index(joblist):
    """Build and persist the dictionary/TF-IDF/LSI/Similarity models.

    This code was duplicated verbatim in ``update_index`` and
    ``get_lsi_result``; it is factored out so the two cannot drift apart.

    Returns:
        ``(dictionary, tfidf_model, lsi_model, similarity_index)``.
    """
    jobinfolist = [list(jieba.cut(job[0] + job[1] + job[2]))
                   for job in joblist]
    dictionary = corpora.Dictionary(jobinfolist)
    corpus = [dictionary.doc2bow(text) for text in jobinfolist]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = [tfidf[doc] for doc in corpus]
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary,
                          num_topics=LSI_NUM_TOPICS)
    dictionary.save('search_lsi_index//search_lsi_index.dict')
    tfidf.save('search_lsi_index//search_lsi_index.tfidf')
    lsi.save('search_lsi_index//search_lsi_index.lsi')
    corpus_lsi = [lsi[doc] for doc in corpus]
    # BUG FIX: num_features must equal the LSI topic count (was 200 vs 101);
    # otherwise query vectors and indexed vectors disagree in dimensionality.
    sim = Similarity('search_lsi_index//Similarity-Lsi-index', corpus_lsi,
                     num_features=LSI_NUM_TOPICS, num_best=30)
    sim.save('search_lsi_index//Similarity-Lsi-index.sim')
    return dictionary, tfidf, lsi, sim


def update_index():
    """Rebuild both indexes unless a same-day ``*.xkk`` marker exists.

    A ``YYYY-MM-DD.xkk`` marker file records the last build date; if it is
    from today the models are considered fresh and nothing is rebuilt.

    Returns:
        The job rows used for the rebuild, or ``None`` when up to date.
    """
    markers = glob.glob('*.xkk')
    print("filename:", markers)
    if markers:
        built = datetime.datetime.strptime(markers[0][:-4], "%Y-%m-%d")
        age = datetime.datetime.now() - built
        if age.days <= 0:
            print('當前模型未過期,無需重新生成:', age.days)
            return
        # BUG FIX: drop stale markers — the original never deleted them, so
        # after the first rebuild glob() returned several files and [0] was
        # an arbitrary (alphabetical) date.
        for old in markers:
            os.remove(old)

    joblist = get_joblist()

    # LSI model generation and persistence.
    _build_lsi_index(joblist)

    # Whoosh keyword index generation and persistence.
    schema = Schema(jobid=ID(stored=True),
                    jobcontent=TEXT(stored=True, analyzer=ChineseAnalyzer()),
                    joblocation=TEXT(stored=True, analyzer=ChineseAnalyzer()),
                    jobname=TEXT(stored=True, analyzer=ChineseAnalyzer()))
    # create_in requires the directory to exist already.
    if not os.path.exists("search_whoosh_index"):
        os.makedirs("search_whoosh_index")
    index = create_in("search_whoosh_index", schema)
    writer = index.writer()
    for job in joblist:
        writer.add_document(jobid=str(job[-1]), jobcontent=job[2],
                            joblocation=job[1], jobname=job[0])
    writer.commit()

    # Write today's marker so the next run can skip the rebuild.
    with open('{0}.xkk'.format(str(datetime.datetime.now())[:10]), 'w') as f:
        f.write(str(datetime.datetime.now()) + '完成模型生成')
    print('模型生成結束,時間:' + str(datetime.datetime.now()))
    return joblist


def get_index():
    """Open the persisted Whoosh index directory."""
    return open_dir('search_whoosh_index')


def get_whoosh_result(user):
    """Exact keyword search over the Whoosh index.

    POS-tags the query with jieba; words flagged as locations go to the
    ``joblocation`` field, words flagged as job terms to ``jobcontent`` and
    ``jobname``. Groups are OR-ed internally and AND-ed together.

    Returns:
        List of matching job-id strings (at most 100).
    """
    tagged = list(psg.cut(user))
    location_terms = [Term('joblocation', w.word) for w in tagged
                      if 'location' in w.flag]
    content_terms = [Term('jobcontent', w.word) for w in tagged
                     if 'job' in w.flag]
    name_terms = [Term('jobname', w.word) for w in tagged
                  if 'job' in w.flag]
    term_groups = [location_terms, content_terms, name_terms]
    print(name_terms)
    with get_index().searcher() as searcher:
        myquery = And([Or(group) for group in term_groups if group])
        result = searcher.search(myquery, terms=True, limit=100)
        return [hit['jobid'] for hit in result]


def get_lsi_result(joblist, user):
    """Semantic search: project the query into LSI space, rank by similarity.

    Loads the persisted models when present (more than 3 files in the index
    directory), otherwise rebuilds them from ``joblist``.

    Returns:
        List of job-id strings for the 30 most similar jobs.
    """
    if len(os.listdir('search_lsi_index//')) > 3:
        dictionary = corpora.Dictionary.load(
            'search_lsi_index//search_lsi_index.dict')
        tfidf = models.TfidfModel.load(
            'search_lsi_index//search_lsi_index.tfidf')
        lsi = models.LsiModel.load(
            'search_lsi_index//search_lsi_index.lsi')
        print("LSI開始載入了")
        sim = Similarity.load('search_lsi_index//Similarity-Lsi-index.sim')
    else:
        dictionary, tfidf, lsi, sim = _build_lsi_index(joblist)

    query_bow = dictionary.doc2bow(list(jieba.cut(user)))  # bow vector
    query_tfidf = tfidf[query_bow]                         # tf-idf weights
    query_lsi = lsi[query_tfidf]                           # lsi projection
    # sim[...] yields (doc_index, score) pairs; column 3 of a job row is id_job.
    return [str(joblist[i[0]][3]) for i in sim[query_lsi]]


if __name__ == '__main__':
    user = '上海'
    joblist = get_joblist()
    update_index()
    whoosh_ids = get_whoosh_result(user)
    lsi_ids = get_lsi_result(joblist, user)
    # Union of exact and semantic matches (deduplicated).
    result_ids = set(whoosh_ids) | set(lsi_ids)
    print(result_ids)
其實還可以基於doc2vec進行一些搜尋操作。