1. 程式人生 > python處理文字使用n-gram方法

python處理文字使用n-gram方法

# Tokenizer function: splits each query into overlapping character 3-grams (trigrams).
def get_ngrams(query, n=3):
    """Return every overlapping character n-gram of ``query``, in order.

    ``query`` is converted with ``str()`` first, so non-string values
    (numbers, ``None``, ...) are tokenized by their string representation.

    Args:
        query: Value to tokenize.
        n: Length of each gram; defaults to 3 (trigrams).

    Returns:
        A list of substrings of length ``n`` taken at every start offset.
        The list is empty when the text is shorter than ``n``.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")
    text = str(query)
    # A text of length L holds L - n + 1 windows; stopping at L - n would
    # silently drop the final n-gram.
    return [text[i:i + n] for i in range(len(text) - n + 1)]


#by zgd
def get_ngrams_zgd(input):
    """Count the 3-grams found in ``input``.

    Each window of three consecutive items is joined with single spaces to
    form a key; the returned dict maps every key to the number of times that
    window occurs. Keys appear in order of first occurrence. Inputs with
    fewer than three items produce an empty dict.
    """
    width = 3
    counts = {}
    last_start = len(input) - width
    start = 0
    while start <= last_start:
        key = " ".join(input[start:start + width])
        counts[key] = counts.get(key, 0) + 1
        start += 1
    return counts