python自然語言處理-讀書筆記7
阿新 • • 發佈:2018-11-11
# -*- coding:utf-8 -*-
# __author__ = 'lipzhang'
"""Study notes: categorizing and tagging words (NLTK book, chapter 5).

The worked examples from the chapter are kept below as commented-out
snippets; uncomment a section to run it.  The examples need the Brown
corpus and the standard tokenizer/tagger models to be downloaded first
(``nltk.download(...)``) -- TODO confirm which models your NLTK version
requires.
"""
import nltk
from nltk.corpus import brown

# --- Using a part-of-speech tagger ------------------------------------------
# text = nltk.word_tokenize("And now for something completely different")
# print(nltk.pos_tag(text))
# Here "and" is CC (coordinating conjunction); "now" and "completely" are RB
# (adverbs); "for" is IN (preposition); "something" is NN (noun); and
# "different" is JJ (adjective).
# nltk.help.upenn_tagset('RB')
#
# text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
# print(text.similar('woman'))
# text.similar() takes a word w, finds all contexts w1 w w2 it occurs in, and
# then returns the other words w' that occur in the same contexts (w1 w' w2).

# --- Tagged corpora: representing tagged tokens -----------------------------
# tagged_token = nltk.tag.str2tuple('fly/NN')
# print( tagged_token)
#
# We can build a list of tagged tokens directly from a string: first split the
# string into word/tag items, then convert each one into a (word, tag) tuple.
# sent = ''' The/AT grand/JJ jury/NN commented/VBD on/IN a/AT number/NN of/IN other/AP topics/NNS ,/, AMONG/IN them/PPO the/AT Atlanta/NP and/CC Fulton/NP-tl County/NN-tl purchasing/VBG departments/NNS which/WDT it/PP said/VBD ``/`` ARE/BER well/QL operated/VBN and/CC follow/VB generally/R accepted/VBN practices/NNS which/WDT inure/VB to/IN the/AT best/JJT interest/NN of/IN both/ABX governments/NNS ''/'' ./. '''
# print([nltk.tag.str2tuple(t) for t in sent.split()])

# --- Reading tagged corpora -------------------------------------------------
# print(nltk.corpus.brown.tagged_words(tagset = 'universal'))

# --- The simplified part-of-speech tagset -----------------------------------
# Tag  Meaning             Examples
# ADJ  adjective           new, good, high, special, big, local
# ADV  adverb              really, already, still, early, now
# CNJ  conjunction         and, or, but, if, while, although
# DET  determiner          the, a, some, most, every, no
# EX   existential         there, there's
# FW   foreign word        dolce, ersatz, esprit, quo, maitre
# MOD  modal verb          will, can, would, may, must, should
# N    noun                year, home, costs, time, education
# NP   proper noun         Alison, Africa, April, Washington
# NUM  number              twenty-four, fourth, 1991, 14:24
# PRO  pronoun             he, their, her, its, my, I, us
# P    preposition         on, of, at, with, by, into, under
# TO   the word to         to
# UH   interjection        ah, bang, ha, whee, hmpf, oops
# V    verb                is, has, get, do, make, see, run
# VD   past tense          said, took, told, made, asked
# VG   present participle  making, going, playing, working
# VN   past participle     given, taken, begun, sung
# WH   wh determiner       who, which, when, what, where, how

# Nouns: typically refer to people, places, things or concepts.
# Verbs: describe events and actions.
# Adjectives and adverbs: adjectives describe nouns, adverbs describe verbs.

# Find the five most frequent words for each tag starting with a given prefix
# (e.g. the most frequent noun tags).
# def findtags(tag_prefix, tagged_text):
#     cfd = nltk.ConditionalFreqDist((tag, word) for (word, tag) in tagged_text if tag.startswith(tag_prefix))
#     return dict((tag, list(cfd[tag].keys())[:5]) for tag in cfd.conditions())
# tagdict = findtags('NN', nltk.corpus.brown.tagged_words(categories='news'))
# for tag in sorted(tagdict):
#     print(tag, tagdict[tag])

# Use POS tags to search for three-word phrases of the form VERB + "to" + VERB.
# def process(sentence):
#     for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sentence):
#         if (t1.startswith('V') and t2 == 'TO' and t3.startswith('V')):
#             print(w1, w2, w3)
# for tagged_sent in brown.tagged_sents():
#     print(process(tagged_sent))

# --- Mapping words to properties with Python dictionaries -------------------
# In POS tagging every word maps to a tag, so a word->property mapping is the
# natural data structure.  Python's defaultdict (also exposed as
# nltk.defaultdict) returns a default value instead of raising KeyError when a
# missing key is looked up; both keys and values may be arbitrarily complex.
# counts = nltk.defaultdict(int)
# for (word, tag) in brown.tagged_words(categories='news'):
#     counts[tag] += 1
# print(counts['NN'])
# print(list(counts))
#
# Incrementally updating a dictionary, then sorting by value:
# from operator import itemgetter
# print(sorted(counts.items(), key=itemgetter(1), reverse=True))
# print([t for t, c in sorted(counts.items(), key=itemgetter(1), reverse=True)])

# --- Summary of common dictionary methods and idioms ------------------------
# Example                        Description
# d = {}                         create an empty dictionary and assign it to d
# d[key] = value                 assign a value to a given dictionary key
# d.keys()                       the list of keys of the dictionary
# list(d)                        the list of keys of the dictionary
# sorted(d)                      the keys of the dictionary, sorted
# key in d                       test whether a particular key is in the dictionary
# for key in d                   iterate over the keys of the dictionary
# d.values()                     the list of values in the dictionary
# dict([(k1,v1), (k2,v2), ...])  create a dictionary from a list of key-value pairs
# d1.update(d2)                  add all items of d2 to d1
# defaultdict(int)               a dictionary whose default value is 0

# --- Automatic tagging: the default tagger ----------------------------------
# brown_tagged_sents = brown.tagged_sents(categories='news')
# brown_sents = brown.sents(categories='news')
# raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
# tokens = nltk.word_tokenize(raw)
# default_tagger = nltk.DefaultTagger('NN')  # tags every single token as NN
# print(default_tagger.tag(tokens))
# print(default_tagger.evaluate(brown_tagged_sents))

# --- The regular-expression tagger ------------------------------------------
# patterns = [
#     (r'.*ing$', 'VBG'),                # gerunds
#     (r'.*ed$', 'VBD'),                 # simple past
#     (r'.*es$', 'VBZ'),                 # 3rd singular present
#     (r'.*ould$', 'MD'),                # modals
#     (r'.*\'s$', 'NN$'),                # possessive nouns
#     (r'.*s$', 'NNS'),                  # plural nouns
#     (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
#     (r'.*', 'NN')                      # nouns (default)
# ]
# regexp_tagger = nltk.RegexpTagger(patterns)
# print(regexp_tagger.tag(brown_sents[3]))
# print(regexp_tagger.evaluate(brown_tagged_sents))

# --- The lookup tagger ------------------------------------------------------
# Take the 100 most frequent words and store each one's most likely tag; use
# that table as the model of a "lookup tagger" (UnigramTagger in NLTK).
# fd = nltk.FreqDist(brown.words(categories='news'))
# cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
# most_freq_words = list(fd.keys())[:100]
# likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
# The lookup table is consulted first; when it cannot decide a token's tag we
# fall back to the default tagger.  This process is called "backoff", and it is
# implemented by passing the default tagger as a parameter of the lookup tagger.
# baseline_tagger = nltk.UnigramTagger(model=likely_tags,backoff=nltk.DefaultTagger('NN'))
# print(baseline_tagger.evaluate(brown_tagged_sents))
# sent = brown.sents(categories='news')[3]
# print(baseline_tagger.tag(sent))