1. 程式人生 > >python自然語言處理 第二章(上)

python自然語言處理 第二章(上)

古騰堡語料庫

import nltk
nltk.corpus.gutenberg.fileids()
Out[78]: 
[u'austen-emma.txt',
 u'austen-persuasion.txt',
 u'austen-sense.txt',
 u'bible-kjv.txt',
 u'blake-poems.txt',
 u'bryant-stories.txt',
 u'burgess-busterbrown.txt',
 u'carroll-alice.txt',
 u'chesterton-ball.txt',
 u'chesterton-brown.txt',
 u'chesterton-thursday.txt',
 u'edgeworth-parents.txt',
 u'melville-moby_dick.txt',
 u'milton-paradise.txt',
 u'shakespeare-caesar.txt',
 u'shakespeare-hamlet.txt',
 u'shakespeare-macbeth.txt',
 u'whitman-leaves.txt']

# 挑選第一個簡·奧斯丁的《愛瑪》,並賦予一個簡短的名字emma
emma = nltk.corpus.gutenberg.words('austen-emma.txt')
len(emma)
Out[81]: 192427
# 另一種輸入方式
from nltk.corpus import gutenberg
gutenberg.fileids()
Out[84]: 
[u'austen-emma.txt',
 u'austen-persuasion.txt',
 u'austen-sense.txt',
 u'bible-kjv.txt',
 u'blake-poems.txt',
 u'bryant-stories.txt',
 u'burgess-busterbrown.txt',
 u'carroll-alice.txt',
 u'chesterton-ball.txt',
 u'chesterton-brown.txt',
 u'chesterton-thursday.txt',
 u'edgeworth-parents.txt',
 u'melville-moby_dick.txt',
 u'milton-paradise.txt',
 u'shakespeare-caesar.txt',
 u'shakespeare-hamlet.txt',
 u'shakespeare-macbeth.txt',
 u'whitman-leaves.txt']
emma = gutenberg.words('austen-emma.txt')
# 通過遍歷前面列出的gutenberg檔案識別符號連結串列相應的fileid,計算統計每個文字
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid)) # 文字中出現的詞彙個數,包括詞之間的空格
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid)) # 把文字劃分為句子,每個句子是一個詞連結串列
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    print int(num_chars/num_words),int(num_words/num_sents),int(num_words/num_vocab),fileid
4 24 26 austen-emma.txt
4 26 16 austen-persuasion.txt
4 28 22 austen-sense.txt
4 33 79 bible-kjv.txt
4 19 5 blake-poems.txt
4 19 14 bryant-stories.txt
4 17 12 burgess-busterbrown.txt
4 20 12 carroll-alice.txt
4 20 11 chesterton-ball.txt
4 22 11 chesterton-brown.txt
4 18 10 chesterton-thursday.txt
4 20 24 edgeworth-parents.txt
4 25 15 melville-moby_dick.txt
4 52 10 milton-paradise.txt
4 11 8 shakespeare-caesar.txt
4 12 7 shakespeare-hamlet.txt
4 12 6 shakespeare-macbeth.txt
4 36 12 whitman-leaves.txt
# 得到三個統計量:平均詞長、平均句子長度、文字中每個詞出現的平均次數
macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')

macbeth_sentences

]: [[u'[', u'The', u'Tragedie', u'of', u'Macbeth', u'by', u'William', u'Shakespeare', u'1603', u']'], [u'Actus', u'Primus', u'.'], ...]

macbeth_sentences[1037]
Out[90]: 
[u'Good',
 u'night',
 u',',
 u'and',
 u'better',
 u'health',
 u'Attend',
 u'his',
 u'Maiesty']

longest_len = max([len(s) for s in macbeth_sentences])

[s for s in macbeth_sentences if len(s) == longest_len]
Out[92]: 
[[u'Doubtfull',
  u'it',
  u'stood',
  u',',
  u'As',
  u'two',
  u'spent',
  u'Swimmers',
  u',',
  u'that',
  u'doe',
  u'cling',
  u'together',
  u',',
  u'And',
  u'choake',
  u'their',
  u'Art',
  u':',
  u'The',
  u'mercilesse',
  u'Macdonwald',
  u'(',
  u'Worthie',
  u'to',
  u'be',
  u'a',
  u'Rebell',
  u',',
  u'for',
  u'to',
  u'that',
  u'The',
  u'multiplying',
  u'Villanies',
  u'of',
  u'Nature',
  u'Doe',
  u'swarme',
  u'vpon',
  u'him',
  u')',
  u'from',
  u'the',
  u'Westerne',
  u'Isles',
  u'Of',
  u'Kernes',
  u'and',
  u'Gallowgrosses',
  u'is',
  u'supply',
  u"'",
  u'd',
  u',',
  u'And',
  u'Fortune',
  u'on',
  u'his',
  u'damned',
  u'Quarry',
  u'smiling',
  u',',
  u'Shew',
  u"'",
  u'd',
  u'like',
  u'a',
  u'Rebells',
  u'Whore',
  u':',
  u'but',
  u'all',
  u"'",
  u's',
  u'too',
  u'weake',
  u':',
  u'For',
  u'braue',
  u'Macbeth',
  u'(',
  u'well',
  u'hee',
  u'deserues',
  u'that',
  u'Name',
  u')',
  u'Disdayning',
  u'Fortune',
  u',',
  u'with',
  u'his',
  u'brandisht',
  u'Steele',
  u',',
  u'Which',
  u'smoak',
  u"'",
  u'd',
  u'with',
  u'bloody',
  u'execution',
  u'(',
  u'Like',
  u'Valours',
  u'Minion',
  u')',
  u'caru',
  u"'",
  u'd',
  u'out',
  u'his',
  u'passage',
  u',',
  u'Till',
  u'hee',
  u'fac',
  u"'",
  u'd',
  u'the',
  u'Slaue',
  u':',
  u'Which',
  u'neu',
  u"'",
  u'r',
  u'shooke',
  u'hands',
  u',',
  u'nor',
  u'bad',
  u'farwell',
  u'to',
  u'him',
  u',',
  u'Till',
  u'he',
  u'vnseam',
  u"'",
  u'd',
  u'him',
  u'from',
  u'the',
  u'Naue',
  u'toth',
  u"'",
  u'Chops',
  u',',
  u'And',
  u'fix',
  u"'",
  u'd',
  u'his',
  u'Head',
  u'vpon',
  u'our',
  u'Battlements']]
# 網路和聊天文字
from nltk.corpus import webtext
for fileid in webtext.fileids():
    print fileid,webtext.raw(fileid)[:50]
firefox.txt Cookie Manager: "Don't allow sites that set remove
grail.txt SCENE 1: [wind] [clop clop clop] 
KING ARTHUR: Who
overheard.txt White guy: So, do you have any plans for this even
pirates.txt PIRATES OF THE CARRIBEAN: DEAD MAN'S CHEST, by Ted
singles.txt 25 SEXY MALE, seeks attrac older single lady, for 
wine.txt Lovely delicate, fragrant Rhone wine. Polished lea
布朗語料庫
from nltk.corpus import brown

brown.categories()
Out[97]: 
[u'adventure',
 u'belles_lettres',
 u'editorial',
 u'fiction',
 u'government',
 u'hobbies',
 u'humor',
 u'learned',
 u'lore',
 u'mystery',
 u'news',
 u'religion',
 u'reviews',
 u'romance',
 u'science_fiction']

brown.words(categories = 'news')
Out[98]: [u'The', u'Fulton', u'County', u'Grand', u'Jury', ...]

brown.words(fileids = ['cg22'])
Out[99]: [u'Does', u'our', u'society', u'have', u'a', ...]

brown.words(categories = ['new', 'editorial', 'reviews'])
Out[100]: [u'Assembly', u'session', u'brought', u'much', u'good', ...]
#比較不同文體中的情態動詞用法,第一步產生特定的文體計數

news_text = brown.words(categories = 'news')

fdist = nltk.FreqDist([w.lower() for w in news_text])

modals = ['can', 'could', 'may', 'might', 'must', 'will']

for m in modals:
    print m+':',fdist[m]



In can: 94[106]: 
could: 87
may: 93
might: 38
must: 53
will: 389

>>> import nltk
>>> from nltk.corpus import brown
>>> cfd = nltk.ConditionalFreqDist((genre, word)
for genre in brown.categories()
for word in brown.words(categories = genre))
>>> genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
>>> modals = ['can', 'could', 'may', 'might', 'must', 'will']
>>> cfd.tabulate(conditions=genres, samples=modals)
                  can could   may might  must  will 
           news    93    86    66    38    50   389 
       religion    82    59    78    12    54    71 
        hobbies   268    58   131    22    83   264 
science_fiction    16    49     4    12     8    16 
        romance    74   193    11    51    45    43 
          humor    16    30     8     8     9    13 
>>> cfd.plot(conditions=genres, samples=modals)

路透社語義庫

#路透社語料庫
from nltk.corpus import reuters
reuters.fileids()# 包含10788個新聞文件,共計130萬字
reuters.categories()# 90個主題,分為訓練和測試
reuters.categories('training/9865')
Out[128]: [u'barley', u'corn', u'grain', u'wheat']
reuters.categories('training/9865')
Out[128]: [u'barley', u'corn', u'grain', u'wheat']

reuters.categories(['training/9865', 'training/9880'])
Out[129]: [u'barley', u'corn', u'grain', u'money-fx', u'wheat']

reuters.fileids('barley')
Out[130]: 
[u'test/15618',
 u'test/15649',
 u'test/15676',
 u'test/15728',
 u'test/15871',
 u'test/15875',
 u'test/15952',
 u'test/17767',
 u'test/17769',
 u'test/18024',
 u'test/18263',
 u'test/18908',
 u'test/19275',
 u'test/19668',
 u'training/10175',
 u'training/1067',
 u'training/11208',
 u'training/11316',
 u'training/11885',
 u'training/12428',
 u'training/13099',
 u'training/13744',
 u'training/13795',
 u'training/13852',
 u'training/13856',
 u'training/1652',
 u'training/1970',
 u'training/2044',
 u'training/2171',
 u'training/2172',
 u'training/2191',
 u'training/2217',
 u'training/2232',
 u'training/3132',
 u'training/3324',
 u'training/395',
 u'training/4280',
 u'training/4296',
 u'training/5',
 u'training/501',
 u'training/5467',
 u'training/5610',
 u'training/5640',
 u'training/6626',
 u'training/7205',
 u'training/7579',
 u'training/8213',
 u'training/8257',
 u'training/8759',
 u'training/9865',
 u'training/9958']
reuters.fileids(['barley','corn'])
Out[131]: 
[u'test/14832',
 u'test/14858',
 u'test/15033',
 u'test/15043',
 u'test/15106',
 u'test/15287',
 u'test/15341',
 u'test/15618',
 u'test/15648',
 u'test/15649',
 u'test/15676',
 u'test/15686',
 u'test/15720',
 u'test/15728',
 u'test/15845',
 u'test/15856',
 u'test/15860',
 u'test/15863',
 u'test/15871',
 u'test/15875',
 u'test/15877',
 u'test/15890',
 u'test/15904',
 u'test/15906',
 u'test/15910',
 u'test/15911',
 u'test/15917',
 u'test/15952',
 u'test/15999',
 u'test/16012',
 u'test/16071',
 u'test/16099',
 u'test/16147',
 u'test/16525',
 u'test/16624',
 u'test/16751',
 u'test/16765',
 u'test/17503',
 u'test/17509',
 u'test/17722',
 u'test/17767',
 u'test/17769',
 u'test/18024',
 u'test/18035',
 u'test/18263',
 u'test/18482',
 u'test/18614',
 u'test/18908',
 u'test/18954',
 u'test/18973',
 u'test/19165',
 u'test/19275',
 u'test/19668',
 u'test/19721',
 u'test/19821',
 u'test/20018',
 u'test/20366',
 u'test/20637',
 u'test/20645',
 u'test/20649',
 u'test/20723',
 u'test/20763',
 u'test/21091',
 u'test/21243',
 u'test/21493',
 u'training/10120',
 u'training/10139',
 u'training/10172',
 u'training/10175',
 u'training/10319',
 u'training/10339',
 u'training/10487',
 u'training/10489',
 u'training/10519',
 u'training/1067',
 u'training/10701',
 u'training/10882',
 u'training/10956',
 u'training/11012',
 u'training/11085',
 u'training/11091',
 u'training/11208',
 u'training/11269',
 u'training/1131',
 u'training/11316',
 u'training/11392',
 u'training/11436',
 u'training/11607',
 u'training/11612',
 u'training/11729',
 u'training/11739',
 u'training/11769',
 u'training/11885',
 u'training/11936',
 u'training/11939',
 u'training/11964',
 u'training/12002',
 u'training/12052',
 u'training/12055',
 u'training/1215',
 u'training/12160',
 u'training/12311',
 u'training/12323',
 u'training/12372',
 u'training/12417',
 u'training/12428',
 u'training/12436',
 u'training/12500',
 u'training/12583',
 u'training/12587',
 u'training/1268',
 u'training/1273',
 u'training/12872',
 u'training/13099',
 u'training/13173',
 u'training/13179',
 u'training/1369',
 u'training/13744',
 u'training/13795',
 u'training/1385',
 u'training/13852',
 u'training/13856',
 u'training/1395',
 u'training/1399',
 u'training/14483',
 u'training/1582',
 u'training/1652',
 u'training/1777',
 u'training/1843',
 u'training/193',
 u'training/1952',
 u'training/197',
 u'training/1970',
 u'training/2044',
 u'training/2171',
 u'training/2172',
 u'training/2183',
 u'training/2191',
 u'training/2217',
 u'training/2232',
 u'training/2264',
 u'training/235',
 u'training/2382',
 u'training/2436',
 u'training/2456',
 u'training/2595',
 u'training/2599',
 u'training/2617',
 u'training/2727',
 u'training/2741',
 u'training/2749',
 u'training/2777',
 u'training/2848',
 u'training/2913',
 u'training/2922',
 u'training/2947',
 u'training/3132',
 u'training/3138',
 u'training/3191',
 u'training/327',
 u'training/3282',
 u'training/3299',
 u'training/3306',
 u'training/3324',
 u'training/3330',
 u'training/3337',
 u'training/3358',
 u'training/3401',
 u'training/3429',
 u'training/3847',
 u'training/3855',
 u'training/3881',
 u'training/3949',
 u'training/395',
 u'training/3979',
 u'training/3981',
 u'training/4047',
 u'training/4133',
 u'training/4280',
 u'training/4289',
 u'training/4296',
 u'training/4382',
 u'training/4490',
 u'training/4599',
 u'training/4825',
 u'training/4905',
 u'training/4939',
 u'training/4988',
 u'training/5',
 u'training/5003',
 u'training/501',
 u'training/5017',
 u'training/5033',
 u'training/5109',
 u'training/516',
 u'training/5185',
 u'training/5338',
 u'training/5467',
 u'training/5518',
 u'training/5531',
 u'training/5606',
 u'training/5610',
 u'training/5636',
 u'training/5637',
 u'training/5640',
 u'training/57',
 u'training/5847',
 u'training/5933',
 u'training/6',
 u'training/6142',
 u'training/6221',
 u'training/6236',
 u'training/6239',
 u'training/6259',
 u'training/6269',
 u'training/6386',
 u'training/6585',
 u'training/6588',
 u'training/6626',
 u'training/6735',
 u'training/6890',
 u'training/6897',
 u'training/694',
 u'training/7062',
 u'training/7205',
 u'training/7215',
 u'training/7336',
 u'training/7387',
 u'training/7389',
 u'training/7390',
 u'training/7395',
 u'training/7579',
 u'training/7700',
 u'training/7792',
 u'training/7917',
 u'training/7934',
 u'training/7943',
 u'training/8004',
 u'training/8140',
 u'training/8161',
 u'training/8166',
 u'training/8213',
 u'training/8257',
 u'training/8273',
 u'training/8400',
 u'training/8443',
 u'training/8446',
 u'training/8535',
 u'training/855',
 u'training/8759',
 u'training/8941',
 u'training/8983',
 u'training/8993',
 u'training/9058',
 u'training/9093',
 u'training/9094',
 u'training/934',
 u'training/9470',
 u'training/9521',
 u'training/9667',
 u'training/97',
 u'training/9865',
 u'training/9958',
 u'training/9989']
reuters.words('training/9865')[:10]
Out[132]: 
[u'FRENCH',
 u'FREE',
 u'MARKET',
 u'CEREAL',
 u'EXPORT',
 u'BIDS',
 u'DETAILED',
 u'French',
 u'operators',
 u'have']
就職演說語料庫
#就職演說語料庫
from nltk.corpus import inaugural
inaugural.fileids()# 包含55個文字,每個文字是一個總統的演講
[fileid[:4] for fileid in inaugural.fileids()]# 提取前4個字元為年份
#將詞彙轉成小寫,用startwith()檢驗是否從目標詞彙america或citizen開始
cdf = nltk.ConditionalFreqDist(
                               (target,fileid[:4])
                               for fileid in inaugural.fileids()
                               for w in inaugural.words(fileid)
                               for target in ['america', 'citizen']
                               if w.lower().startswith(target)
                               )
cdf.plot()

標註文字語料庫。。。。

其他語料庫。。。

載入自己的語料庫【不明白為什麼中文出不來啊!!!!!!!!!!!!!!!!!!!!!!!】

from nltk.corpus import PlaintextCorpusReader
corpus_root = r'e:\myyuliaoku'
file_pattern = r'.*\.txt'
wordlists = PlaintextCorpusReader(corpus_root, file_pattern)
wordlists.fileids()
wordlists.raw('laojiumenciku.txt')
words = wordlists.words()
words[:20]
nltk.FreqDist(wordlists.words("laojiumenciku.txt")).plot()# 不明白為什麼中文出不來,txt文件編碼已經改為utf-8
len(wordlists.sents())

from nltk.corpus import BracketParseCorpusReader
corpus_root = r'e:\myyuliaoku'
file_pattern = r'.*\.txt'
ptb = BracketParseCorpusReader(corpus_root, file_pattern,encoding = 'UTF-8')   #初始化讀取器:語料庫目錄和要載入檔案的格式,預設utf8格式的編碼
ptb.fileids()   #至此,可以看到目錄下的所有檔名,例如C000008/1001.txt,則成功了
ptb.raw("laojiumenciku.txt") 
條件頻率密度