1. 程式人生 > >elasticsearch核心知識---52.倒排索引組成結構以及實現TF-IDF演算法

elasticsearch核心知識---52.倒排索引組成結構以及實現TF-IDF演算法

首先,用 Java 簡易地實現 TF-IDF 演算法:

package matrixOnto.Ja_9_10_va;

import com.google.common.base.Preconditions;
import org.nutz.lang.Strings;

import java.util.List;
import java.util.Objects;
import java.util.Optional;

/**
 * A simplified illustration of the TF-IDF scoring algorithm, written while studying
 * how Elasticsearch scores documents.
 *
 * <p>The calculation has four parts:
 * <ul>
 *   <li>TF (term frequency): how often a term occurs in one document.</li>
 *   <li>DF (document frequency): how many documents contain the term.</li>
 *   <li>IDF (inverse document frequency): a logarithmic weight that shrinks as DF grows.</li>
 *   <li>TF-IDF = TF * IDF: the final score.</li>
 * </ul>
 *
 * <p>Term matching is case-insensitive in every step so that TF and DF always agree on
 * what counts as an occurrence.
 */
public class TF_IDF_Cal {

    /**
     * Computes the term frequency of {@code term} in a single document: the number of
     * tokens equal to the term (ignoring case) divided by the total number of tokens.
     *
     * @param doc  the tokenized document; must be non-null and non-empty
     * @param term the term to look for; must be non-null
     * @return the fraction of tokens in {@code doc} that match {@code term}
     * @throws IllegalArgumentException if {@code doc} is empty
     * @throws NullPointerException     if {@code doc} or {@code term} is null
     */
    private double tf(List<String> doc, String term) {
        if (doc.isEmpty()) {
            throw new IllegalArgumentException("inver doc size must gt 0");
        }
        Objects.requireNonNull(term, "term can not be null");
        // term is known to be non-null, so using it as the receiver tolerates null tokens.
        long matches = doc.stream().filter(term::equalsIgnoreCase).count();
        return (double) matches / doc.size();
    }

    /**
     * Computes the document frequency of {@code term}: the number of documents that
     * contain at least one token equal to the term (ignoring case, consistent with
     * {@link #tf(List, String)}).
     *
     * @param docs the corpus of tokenized documents; may be null or empty
     * @param term the term to look for; may be null or empty
     * @return the number of matching documents, or 0 when the corpus or the term is
     *         null or empty
     */
    private int df(List<List<String>> docs, String term) {
        if (docs == null || docs.isEmpty() || term == null || term.isEmpty()) {
            return 0;
        }
        return (int) docs.stream()
                .filter(doc -> doc != null && doc.stream().anyMatch(term::equalsIgnoreCase))
                .count();
    }

    /**
     * Computes the inverse document frequency of {@code term} as
     * {@code ln(N / df + 1)}, where {@code N} is the corpus size and {@code df} is the
     * document frequency of the term.
     *
     * <p>When no document contains the term (including a null or empty corpus or term)
     * the result is 0 rather than infinity, so that the product with a zero term
     * frequency is 0 instead of {@code NaN}.
     *
     * @param docs the corpus of tokenized documents; may be null or empty
     * @param term the term to weigh; may be null or empty
     * @return the inverse document frequency, or 0 when the term occurs in no document
     */
    private double idf(List<List<String>> docs, String term) {
        int containing = df(docs, term);
        if (containing == 0) {
            // Avoids division by zero; also covers a null corpus, for which df returns 0.
            return 0.0;
        }
        return Math.log(docs.size() / (double) containing + 1);
    }

    /**
     * Computes the TF-IDF score of {@code term} for one document within a corpus.
     *
     * @param doc  the tokenized document being scored; must be non-null and non-empty
     * @param docs the corpus used to derive the inverse document frequency
     * @param term the term to score; must be non-null
     * @return {@code tf(doc, term) * idf(docs, term)}
     * @throws IllegalArgumentException if {@code doc} is empty
     * @throws NullPointerException     if {@code doc} or {@code term} is null
     */
    private double tfIdf(List<String> doc, List<List<String>> docs, String term) {
        return tf(doc, term) * idf(docs, term);
    }

    /**
     * Demo entry point: prints TF, DF, IDF and TF-IDF for a term over a small corpus,
     * then scores every document of a second corpus against one term.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        TF_IDF_Cal cal = new TF_IDF_Cal();
        List<String> doc1 = List.of("人工", "智慧", "成為", "網際網路", "大會", "焦點");
        List<String> doc2 = List.of("谷歌", "推出", "開源", "人工", "智慧", "系統", "工具");
        List<String> doc3 = List.of("網際網路", "的", "未來", "在", "人工", "智慧");
        List<String> doc4 = List.of("谷歌", "開源", "機器", "學習", "工具");
        List<List<String>> documents = List.of(doc1, doc2, doc3, doc4);

        // tf returns a primitive double, so it can be printed directly.
        System.out.println(cal.tf(doc2, "谷歌"));
        System.out.println("TF result:" + cal.tf(doc2, "谷歌"));
        System.out.println("DF result:" + cal.df(documents, "谷歌"));
        System.out.println("IDF result:" + cal.idf(documents, "谷歌"));
        double tfIdfScore = cal.tfIdf(doc2, documents, "谷歌");
        System.out.println("TF-IDF(谷歌)" + tfIdfScore);

        List<String> synDoc1 = List.of("我", "有", "一個", "一", "個", "西紅柿", "番茄");
        List<String> synDoc2 = List.of("西紅柿", "番茄", "番", "茄", "炒蛋", "炒", "蛋", "飯");
        List<String> synDoc3 = List.of("西紅柿", "番茄", "雞蛋", "雞", "蛋", "面");
        List<List<String>> synDocuments = List.of(synDoc1, synDoc2, synDoc3);

        for (List<String> d : synDocuments) {
            double result = cal.tfIdf(d, synDocuments, "西紅柿");
            System.out.println("Doc=[" + d + "] .......Score is:[" + result + "]");
        }
    }

}

倒排索引的結構   下面的結構主要為了計算評分

(1)包含這個關鍵詞的document list
(2)包含這個關鍵詞的所有document的數量:DF(document frequency),IDF(inverse document frequency)即由此值計算而來
(3)這個關鍵詞在每個document中出現的次數:TF(term frequency)
(4)這個關鍵詞在這個document中的次序
(5)每個document的長度:length norm
(6)包含這個關鍵詞的所有document的平均長度

| word  | doc1 | doc2 |
|-------|------|------|
| dog   | *    | *    |
| hello | *    |      |
| you   |      | *    |


倒排索引不可變的好處


(1)不需要鎖,提升併發能力,避免鎖的問題
(2)資料不變,一直儲存在os cache中,只要cache記憶體足夠
(3)filter cache一直駐留在記憶體,因為資料不變
(4)可以壓縮,節省cpu和io開銷


倒排索引不可變的壞處:每次都要重新構建整個索引