# 程式人生 > python自然語言處理-讀書筆記7
# (Life of a Programmer — Python Natural Language Processing, reading notes 7)

# -*- coding:utf-8 -*-
# __author__ = 'lipzhang'

# Categorizing and tagging words (NLTK book, chapter 5 — reading notes).
# Most examples below are kept commented out; uncomment a section to run it.

# Using a part-of-speech tagger
import nltk
# text = nltk.word_tokenize("And now for something completely different")
# print(nltk.pos_tag(text))  # here: "and" is CC (coordinating conjunction); "now" and "completely" are RB (adverbs); "for" is IN (preposition); "something" is NN (noun); "different" is JJ (adjective).
# nltk.help.upenn_tagset('RB')
#
# text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
# print(text.similar('woman'))  # text.similar() takes every context w1 w w2 of the word w, then finds all words w' that occur in the same contexts, i.e. w1 w' w2.

# Tagged corpora
# Representing tagged tokens
# tagged_token = nltk.tag.str2tuple('fly/NN')
# print( tagged_token)
# sent = '''  The/AT grand/JJ jury/NN commented/VBD on/IN a/AT number/NN of/IN  other/AP topics/NNS ,/, AMONG/IN them/PPO the/AT Atlanta/NP and/CC  Fulton/NP-tl County/NN-tl purchasing/VBG departments/NNS which/WDT it/PP  said/VBD ``/`` ARE/BER well/QL operated/VBN and/CC follow/VB generally/R  accepted/VBN practices/NNS which/WDT inure/VB to/IN the/AT best/JJT  interest/NN of/IN both/ABX governments/NNS ''/'' ./. '''
# print([nltk.tag.str2tuple(t) for t in sent.split()])  # a list of tagged tokens can be built directly from a string: first split the string so each word/tag piece is separate, then convert each piece into a tuple.
# Reading tagged corpora
# print(nltk.corpus.brown.tagged_words(tagset = 'universal'))
# Simplified part-of-speech tagset
# Tag	Meaning	Examples
# ADJ	adjective	new, good, high, special, big, local
# ADV	adverb	really, already, still, early, now
# CNJ	conjunction	and, or, but, if, while, although
# DET	determiner	the, a, some, most, every, no
# EX	existential	there, there’s
# FW	foreign word	dolce, ersatz, esprit, quo, maitre
# MOD	modal verb	will, can, would, may, must, should
# N	    noun	year, home, costs, time, education
# NP	proper noun	Alison, Africa, April, Washington
# NUM	number	twenty-four, fourth, 1991, 14:24
# PRO	pronoun	he, their, her, its, my, I, us
# P	    preposition	on, of, at, with, by, into, under
# TO	the word to	to
# UH	interjection	ah, bang, ha, whee, hmpf, oops
# V	    verb	is, has, get, do, make, see, run
# VD	past tense	said, took, told, made, asked
# VG	present participle	making, going, playing, working
# VN	past participle	given, taken, begun, sung
# WH	wh determiner	who, which, when, what, where, how
from nltk.corpus import brown
# Nouns: typically refer to people, places, things and concepts
# Verbs: used to describe events and actions
# Adjectives and Adverbs: adjectives describe nouns, adverbs describe verbs
# def findtags(tag_prefix, tagged_text):  # find the most frequent words for each noun tag
#     cfd = nltk.ConditionalFreqDist((tag, word) for (word, tag) in tagged_text if tag.startswith(tag_prefix))
#     return dict((tag, list(cfd[tag].keys())[:5]) for tag in cfd.conditions())
# tagdict = findtags('NN', nltk.corpus.brown.tagged_words(categories='news'))
# for tag in sorted(tagdict):
#     print(tag, tagdict[tag])
# def process(sentence):  # use POS tags to search for three-word (verb-to-verb) phrases.
#     for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sentence):
#         if (t1.startswith('V') and t2 == 'TO' and t3.startswith('V')):
#             print(w1, w2, w3)
# for tagged_sent in brown.tagged_sents():
#      print(process(tagged_sent))

# Mapping words to properties using Python dictionaries
# In POS tagging every word corresponds to a tag, so building a word-to-property mapping is natural. Python's defaultdict (also exposed as nltk.defaultdict) makes looking up a missing key return a default value instead of raising an exception; both keys and values may be arbitrarily complex objects.
# counts = nltk.defaultdict(int)
# for (word, tag) in brown.tagged_words(categories='news'):
#     counts[tag] += 1
# print(counts['NN'])
# print(list(counts))
# from operator import itemgetter  # incrementally update a dictionary, then sort it by value.
# print(sorted(counts.items(), key=itemgetter(1), reverse=True))
# print([t for t, c in sorted(counts.items(), key=itemgetter(1), reverse=True)])
# Python dictionary methods: a summary of common methods and dictionary idioms
# Example   Description
# d = {}  create an empty dictionary and assign it to d
# d[key] = value  assign a value to a given dictionary key
# d.keys()  the keys of the dictionary
# list(d)  a list of the dictionary's keys
# sorted(d)  the dictionary's keys, sorted
# key in d  test whether a particular key is in the dictionary
# for key in d  iterate over the dictionary's keys
# d.values()  the values in the dictionary
# dict([(k1,v1), (k2,v2), ...])  build a dictionary from a list of key-value pairs
# d1.update(d2)  add all items from d2 into d1
# defaultdict(int)  a dictionary whose default value is 0

# Automatic tagging
# The default tagger
# NOTE(review): duplicate import — `brown` is already imported above; kept as-is to leave live code untouched.
from nltk.corpus import brown
# brown_tagged_sents = brown.tagged_sents(categories='news')
# brown_sents = brown.sents(categories='news')
# raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
# tokens = nltk.word_tokenize(raw)
# default_tagger = nltk.DefaultTagger('NN')  # a tagger that tags every word as NN
# print(default_tagger.tag(tokens))
# print(default_tagger.evaluate(brown_tagged_sents))
# # The regular-expression tagger
# patterns = [
#      (r'.*ing$', 'VBG'),               # gerunds
#      (r'.*ed$', 'VBD'),                # simple past
#      (r'.*es$', 'VBZ'),                # 3rd singular present
#      (r'.*ould$', 'MD'),               # modals
#      (r'.*\'s$', 'NN$'),               # possessive nouns
#      (r'.*s$', 'NNS'),                 # plural nouns
#      (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
#      (r'.*', 'NN')  # nouns (default)
#  ]
# regexp_tagger = nltk.RegexpTagger(patterns)
# print(regexp_tagger.tag(brown_sents[3]))
# print(regexp_tagger.evaluate(brown_tagged_sents))
# # The lookup tagger: find the 100 most frequent words and store their most likely tag — use that table as the model of a "lookup tagger" (UnigramTagger in NLTK).
# fd = nltk.FreqDist(brown.words(categories='news'))
# cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
# most_freq_words = list(fd.keys())[:100]
# NOTE(review): on Python 3, list(fd.keys())[:100] is insertion-ordered, not frequency-ordered; [w for w, _ in fd.most_common(100)] would match the book's intent — confirm before relying on the reported accuracy.
# likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
# baseline_tagger = nltk.UnigramTagger(model=likely_tags,backoff=nltk.DefaultTagger('NN'))  # first consult the lookup table; when it cannot decide a token's tag, fall back to the default tagger — this process is called backoff, implemented by passing the default tagger as the backoff argument of the lookup tagger.
# print(baseline_tagger.evaluate(brown_tagged_sents))
# sent = brown.sents(categories='news')[3]
# print(baseline_tagger.tag(sent))