
Python Natural Language Processing - Reading Notes 3

# -*- coding:utf-8 -*-
# __author__ = 'lipzhang'
import nltk
from nltk.corpus import gutenberg  # Project Gutenberg corpus
from nltk.corpus import webtext    # web text corpus
from nltk.corpus import nps_chat   # NPS chat corpus
from nltk.corpus import brown      # Brown Corpus
from nltk.corpus import reuters    # Reuters Corpus
print(gutenberg.fileids())
emma = gutenberg.words('austen-emma.txt')
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))  # raw() gives the file contents without any linguistic processing; e.g. len(gutenberg.raw('blake-poems.txt')) tells us how many characters the text contains, including the spaces between words.
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))  # sents() divides the text into sentences, where each sentence is a list of words.
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    print(int(num_chars / num_words), int(num_words / num_sents), int(num_words / num_vocab), fileid)  # three statistics: average word length, average sentence length, and the average number of times each vocabulary item appears in the text.
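# As a quick illustration of the structure sents() returns (a list of word lists),
# this sketch finds the longest sentence in 'shakespeare-macbeth.txt', one of the
# Gutenberg file ids printed above (an illustrative aside, not part of the original notes).
macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
longest_len = max(len(s) for s in macbeth_sentences)
print([s for s in macbeth_sentences if len(s) == longest_len])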
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65])
chatroom = nps_chat.posts('10-19-20s_706posts.xml')  # posts() returns each chat post as a list of words
print(chatroom[123])

print(brown.categories())
print(brown.words(categories='news'))
print(brown.words(fileids=['cg22']))
print(brown.sents(categories=['news', 'editorial', 'reviews']))

news_text = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in news_text])  # frequency distribution over the lowercased words of the 'news' category
modals = ['can', 'could', 'may', 'might', 'must', 'will']  # count how often each modal verb occurs
for m in modals:
    print(m + ':', fdist[m])
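# The raw counts above depend on the size of the corpus section; a minimal sketch
# (my own addition) that reports each modal as a proportion of all news tokens,
# using the standard FreqDist.freq() method:
for m in modals:
    print(m + ':', round(fdist.freq(m), 5))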

cfd = nltk.ConditionalFreqDist((genre, word) for genre in brown.categories() for word in brown.words(categories=genre))  # conditional frequency distribution over (genre, word) pairs, used below to tabulate modal counts per genre
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd.tabulate(conditions=genres, samples=modals)
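# Each condition of the ConditionalFreqDist is itself a FreqDist, so the table above
# can also be queried directly; a small sketch of what tabulate() summarizes
# (conditions() and per-condition indexing are standard NLTK API):
print(cfd.conditions()[:5])   # some of the genres used as conditions
print(cfd['news']['must'])    # count of 'must' in the news genre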

# print(reuters.fileids())
# print(reuters.categories())
# print(reuters.categories('training/9865'))
# print(reuters.fileids(['barley', 'corn']))
# print(reuters.words('training/9865')[:14])

from nltk.corpus import inaugural  # Inaugural Address Corpus
print(inaugural.fileids())
print([fileid[:4] for fileid in inaugural.fileids()])
cfd = nltk.ConditionalFreqDist((target, fileid[:4]) for fileid in inaugural.fileids() for w in inaugural.words(fileid) for target in ['america', 'citizen'] if w.lower().startswith(target))
cfd.plot()  # conditional frequency distribution plot: counts of all words in the inaugural corpus that start with 'america' or 'citizen'. Each speech is counted separately, so the plot shows how usage evolves over time. Counts are not normalized for document length.

from nltk.corpus import udhr  # Universal Declaration of Human Rights corpus
languages = ['Chickasaw', 'English', 'German_Deutsch','Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
cfd = nltk.ConditionalFreqDist((lang, len(word)) for lang in languages for word in udhr.words(lang + '-Latin1'))
cfd.plot(cumulative=True)  # cumulative word-length distributions for six translations of the Universal Declaration of Human Rights; words of 5 or fewer letters make up about 80% of the Ibibio text, 60% of the German text, and 25% of the Inuktitut text.
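# The same distribution can be printed as a table rather than plotted; a minimal sketch
# using tabulate() (the English and German conditions and the 0-9 length range are
# chosen here just for illustration):
cfd.tabulate(conditions=['English', 'German_Deutsch'], samples=range(10), cumulative=True)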