1. 程式人生 > >第八章8.3自然語言處理-庫的基本運用

第八章8.3自然語言處理-庫的基本運用

#!/usr/bin/env python
# _*_ coding:utf-8 _*_
#下載nltk的data
# import nltk
# nltk.download()

#建立text物件
# from nltk import word_tokenize
# from nltk import Text
# tokens=word_tokenize("here is some not very interesting text")
# text=Text(tokens)

#統計詞頻
# from nltk import FreqDist
# from nltk.book import *
# #統計書籍中不重複的單詞與總單詞的資料之比
# #len(set(text6))/len(text6)
#
# #統計出現頻率最高的前十個單詞
# fdist=FreqDist(text6)
# fdist.most_common(10)
# #檢視某個單詞的頻率
# fdist["Grail"]

#建立並搜尋2-ngram模型
# from nltk import ngrams
# from nltk.book import *
# bigrams=ngrams(text6,2)
# bigramsDict=FreqDist(bigrams)
# bigramsDict[("Sir","Robin")]

#nltk進行詞性分析
# from nltk import word_tokenize
# from nltk import pos_tag
# text=word_tokenize("the dust was thick so he had to dust")
# pos_tag(text)


# Print every sentence of the sample text in which "google" is used as a noun.
from nltk import word_tokenize,sent_tokenize,pos_tag

# The space after the first period matters: the Punkt sentence tokenizer only
# looks for boundaries followed by whitespace, so "world.I" would leave the
# whole text as a single sentence and the per-sentence filter would be moot.
sentences = sent_tokenize("Google is one of the best companies in the world. I constantly google myself to see what i am up to")
# Penn Treebank noun tags: singular, plural, proper singular, proper plural.
nouns = ['NN', 'NNS', 'NNP', 'NNPS']
for sentence in sentences:
    # Cheap substring check first so sentences without the word are never tagged.
    if "google" not in sentence.lower():
        continue
    tagged_words = pos_tag(word_tokenize(sentence))
    # any() stops at the first hit, so a sentence is printed at most once even
    # when it contains several noun uses of the word.
    if any(token.lower() == 'google' and tag in nouns for token, tag in tagged_words):
        print(sentence)