第八章 8.3 自然語言處理——庫的基本運用
阿新 • 發佈:2018-12-19
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
"""Chapter 8.3: basic usage of the NLTK natural-language-processing library.

Tutorial script: the earlier examples are kept as comments; only the final
part-of-speech example at the bottom actually runs.
"""

# Download the nltk data packages (run once, opens a downloader UI):
# import nltk
# nltk.download()

# Build a Text object from a tokenized string.
# NOTE: the original had a typo "from nlkt import ..." — module is "nltk".
# from nltk import word_tokenize
# from nltk import Text
# tokens = word_tokenize("here is some not very interesting text")
# text = Text(tokens)

# Word-frequency statistics:
# from nltk import FreqDist
# from nltk.book import *
# # Ratio of distinct words to total words in a book:
# # len(set(text6)) / len(text6)
# # Ten most frequent words:
# fdist = FreqDist(text6)
# fdist.most_common(10)
# # Frequency of one specific word:
# fdist["Grail"]

# Build and query a bigram (2-gram) model.
# NOTE: nltk.bigrams takes only the token sequence (no "2" argument), and
# assigning the result back to the name "bigrams" would shadow the import.
# from nltk import bigrams
# from nltk.book import *
# bigramList = bigrams(text6)
# bigramsDict = FreqDist(bigramList)
# bigramsDict[("Sir", "Robin")]

# Part-of-speech tagging with nltk:
# from nltk import word_tokenize
# from nltk import pos_tag
# text = word_tokenize("the dust was thick so he had to dust")
# pos_tag(text)

# Active example: print the sentences in which "google"/"Google" is used
# as a noun (rather than as the verb "to google").
from nltk import word_tokenize, sent_tokenize, pos_tag

# NOTE(review): the sample text lacks a space after the first period
# ("world.I"), so sent_tokenize may keep it as one sentence — left as-is
# to preserve the original data.
sentences = sent_tokenize("Google is one of the best companies in the world.I constantly google myself to see what i am up to")
# Penn Treebank noun tags: common/proper, singular/plural.
nouns = ['NN', 'NNS', 'NNP', 'NNPS']
for sentence in sentences:
    if "google" in sentence.lower():
        # Tokenize the sentence and tag each token with its part of speech.
        tagged_words = pos_tag(word_tokenize(sentence))
        for word, tag in tagged_words:
            # Matches once per noun occurrence of "google"; a sentence with
            # several such occurrences is printed several times (original
            # behavior, preserved).
            if word.lower() == 'google' and tag in nouns:
                print(sentence)