
Python Natural Language Processing: Study Notes, Part 3

Chapter 3: Processing Raw Text

1 Accessing Text from the Web and from Disk

# The English translation of <<Crime and Punishment>> (not yet tested??)

import nltk
from urllib.request import urlopen  # Python 2: from urllib import urlopen

url='http://www.gutenberg.org/files/2554/2554.txt'

raw=urlopen(url).read().decode('utf8')  # in Python 3, urlopen() returns bytes

type(raw)

len(raw)

raw[:75]

# Tokenization (not yet tested??)

tokens=nltk.word_tokenize(raw)

type(tokens)

len(tokens)

tokens[:10]

# Slicing an nltk.Text

text=nltk.Text(tokens)

type(text)

text[1020:1060]

text.collocations()

# Manually pick out the content, dropping the surrounding metadata

raw.find('PART I')  # get the index of the substring

raw.rfind("End of Project Gutenberg's Crime")

raw=raw[5303:1157681]

raw.find('PART I')

Dealing with HTML (not yet tested??)

url='http://news.bbc.co.uk/2/hi/health/2284783.stm'

html=urlopen(url).read()

html[:60]

# Tokenize the HTML

raw=nltk.clean_html(html)  # removed in NLTK 3.0; see the BeautifulSoup sketch below

tokens=nltk.word_tokenize(raw)

tokens

# Keep only the tokens of interest

tokens=tokens[96:399]

text=nltk.Text(tokens)

text.concordance('gene')
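Note that nltk.clean_html() was removed in NLTK 3.0. A minimal sketch of the modern replacement using BeautifulSoup (assuming the third-party bs4 package is installed):

from urllib.request import urlopen
from bs4 import BeautifulSoup  # pip install beautifulsoup4

html = urlopen('http://news.bbc.co.uk/2/hi/health/2284783.stm').read().decode('utf8')
raw = BeautifulSoup(html, 'html.parser').get_text()  # strip all markup, keep the text
tokens = nltk.word_tokenize(raw)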

Processing search engine results

Processing RSS feeds (not yet tested)

import feedparser

llog=feedparser.parse('http://languagelog.ldc.upenn.edu/nll/?feed=atom')

llog['feed']['title']

len(llog.entries)

post=llog.entries[2]

post.title

content=post.content[0].value

content[0:70]

nltk.word_tokenize(nltk.clean_html(content))

nltk.word_tokenize(nltk.clean_html(llog.entries[2].content[0].value))

Reading local files

# The file is not found

>>> f=open('document.txt')

Traceback (most recent call last):

 File "<input>", line 1, in <module>

FileNotFoundError: [Errno 2] No such file or directory: 'document.txt'

# Check the current directory, then create document.txt there

>>> import os

>>> os.listdir('.')

['.idea', 'One', 'Two']

# Reopen the file and read its contents

>>> f=open('document.txt')

>>> f.read()

'this is my time\nTime flies like an arrow.\nFruit flies like a  banana.\n'

# Read the file one line at a time

>>> f=open('document.txt','rU')

>>> for line in f:

...    print(line.strip())  # strip the trailing newline

...

this is my time

Time flies like an arrow.

Fruit flies like a  banana.

# Opening a file from the NLTK data package by name

>>> path=nltk.data.find('corpora/gutenberg/melville-moby_dick.txt')

>>> raw=open(path,'rU').read()

Extracting text from PDF, MS Word and other binary formats
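The book recommends third-party libraries for these formats. A minimal sketch using the pypdf package (my assumption; example.pdf is a hypothetical file name):

from pypdf import PdfReader  # pip install pypdf

reader = PdfReader('example.pdf')            # hypothetical input file
raw = '\n'.join(page.extract_text() for page in reader.pages)  # text of every page
tokens = nltk.word_tokenize(raw)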

# Capturing user input

>>> s=input('Enter some text')

Enter some text>? On an exceptionally hot evening early in july

>>> print('You typed',len(nltk.word_tokenize(s)),'words')

You typed 8 words

The NLP pipeline

>>> raw=open('document.txt').read()

>>> type(raw)

<class 'str'>

#分詞

>>> tokens=nltk.word_tokenize(raw)

>>> type(tokens)

<class 'list'>

>>> words=[w.lower() for w in tokens]

>>> type(words)

<class 'list'>

>>> vocab=sorted(set(words))

>>> type(vocab)

<class 'list'>

# You can append an element to a list, but not to a string

>>> vocab.append('blog')

>>> raw.append('blog')

Traceback (most recent call last):

 File "<input>", line 1, in <module>

AttributeError: 'str' object has no attribute 'append'

# string + string and list + list both work, but a list cannot be added to a string

>>> query='Who knows?'

>>> beatles=['john','paul','george','ringo']

>>> query+beatles

Traceback (most recent call last):

 File "<input>", line 1, in <module>

TypeError: Can't convert 'list' object to str implicitly

2 Strings: Text Processing at the Lowest Level

# Basic operations with strings


monty = 'Monty python'
print(monty)

circus = "Monty python's Flying Circus"
print(circus)

circus = 'Monty python\'s Flying Circus'
print(circus)

# Long strings can be continued across lines with a backslash or with parentheses

>>> couplet="Shall I compare thee to a Summer's day?"\

... "Thou are more lovely and more temperate:"

>>> print(couplet)

Shall I compare thee to a Summer's day?Thou are more lovely and more temperate:

>>> couplet=("Shall I compare thee to a Summer's day?"

... "Thou are more lovely and more temperate:")

>>> print(couplet)

Shall I compare thee to a Summer's day?Thou are more lovely and more temperate:

# Printed this way the string has no line break; use triple quotes to keep it

>>> couplet='''Shall I compare thee to a Summer's day?

... Thou are more lovely and more temperate:'''

>>> print(couplet)

Shall I compare thee to a Summer's day?

Thou are more lovely and more temperate:

>>> couplet="""Shall I compare thee to a Summer's day?

... Thou are more lovely and more temperate:"""

>>> print(couplet)

Shall I compare thee to a Summer's day?

Thou are more lovely and more temperate:

# String concatenation

>>> 'very'+'very'+'very'

'veryveryvery'

>>> 'very'*3

'veryveryvery'

# Print strings with print()

# Accessing individual characters

>>> monty='Monty python'

>>> monty[0]

'M'

# Non-negative indexes count from the start; negative indexes count from the end

>>> monty[-1]

'n'

# Avoid printing a trailing newline ??
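In Python 3 this is controlled by print()'s end parameter, for example:

>>> for char in monty:
...     print(char, end='')   # suppress the trailing newline
...
Monty python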

# Lowercase the characters and filter out non-alphabetic ones

>>> import nltk

>>> from nltk.corpus importgutenberg

>>> raw=gutenberg.raw('melville-moby_dick.txt')

>>> fdist=nltk.FreqDist(ch.lower() for ch in raw if ch.isalpha())

>>> fdist.keys()

dict_keys(['s', 'z', 'r', 'h', 'a', 'i','n', 'b', 't', 'j', 'o', 'e', 'c', 'm', 'x', 'y', 'g', 'd', 'q', 'v', 'w', 'f','k', 'p', 'u', 'l'])

# Accessing substrings works like list slicing

>>> monty='Monty python'

>>> monty[6:10]

'pyth'

# Using negative indexes

>>> monty[-12:-7]

'Monty'

# Slicing from the start and to the end of the string

>>> monty[:5]

'Monty'

>>> monty[6:]

'python'

# Testing whether a string contains a substring

>>> phrase='And now for something completely different'

>>> if 'thing' in phrase:

...    print("found 'thing'")

...

found 'thing'

# Use find() to locate the position of a substring

>>> monty.find('python')

6

# See the full range of string methods

>>> help(str)

Help on class str in module builtins:

class str(object)

 | str(object='') -> str

 |  str(bytes_or_buffer[, encoding[, errors]]) -> str

 | 

 | Create a new string object from the given object. If encoding or

 | errors is specified, then the object must expose a data buffer

 |  that will be decoded using the given encoding and error handler.

 | Otherwise, returns the result of object.__str__() (if defined)

 |  or repr(object).

 | encoding defaults to sys.getdefaultencoding().

 | errors defaults to 'strict'.

 | 

 | Methods defined here:

 | 

 | __add__(self, value, /)  # a special (double-underscore) method

 |     Return self+value.

 | 

 | __contains__(self, key, /)

 |     Return key in self.

 | 

 | __eq__(self, value, /)

 |     Return self==value.

 | 

 | __format__(...)

 |     S.__format__(format_spec) -> str

 |     

 |     Return a formatted version of S as described by format_spec.

 | 

 | __ge__(self, value, /)

 |     Return self>=value.

 | 

 | __getattribute__(self, name, /)

 |     Return getattr(self, name).

 | 

 | __getitem__(self, key, /)

 |     Return self[key].

 | 

 | __getnewargs__(...)

 | 

 | __gt__(self, value, /)

 |     Return self>value.

 | 

 | __hash__(self, /)

 |     Return hash(self).

 | 

 | __iter__(self, /)

 |     Implement iter(self).

 | 

 | __le__(self, value, /)

 |     Return self<=value.

 | 

 | __len__(self, /)

 |     Return len(self).

 | 

 | __lt__(self, value, /)

 |     Return self<value.

 | 

 | __mod__(self, value, /)

 |     Return self%value.

 | 

 | __mul__(self, value, /)

 |     Return self*value.

 | 

 | __ne__(self, value, /)

 |     Return self!=value.

 | 

 | __new__(*args, **kwargs) from builtins.type

 |     Create and return a new object. See help(type) for accurate signature.

 | 

 | __repr__(self, /)

 |     Return repr(self).

 | 

 | __rmod__(self, value, /)

 |     Return value%self.

 | 

 | __rmul__(self, value, /)

 |     Return self*value.

 | 

 | __sizeof__(...)

 |     S.__sizeof__() -> size of S in memory, in bytes

 | 

 | __str__(self, /)

 |     Return str(self).

 | 

 | capitalize(...)

 |     S.capitalize() -> str

 |     

 |     Return a capitalized version of S, i.e. make the first character

 |     have upper case and the rest lower case.

 | 

 | casefold(...)

 |     S.casefold() -> str

 |     

 |     Return a version of S suitable for caseless comparisons.

 | 

 | center(...)

 |     S.center(width[, fillchar]) -> str

 |     

 |     Return S centered in a string of length width. Padding is

 |     done using the specified fill character (default is a space)

 | 

 | count(...)  # count occurrences of a substring

 |     S.count(sub[, start[, end]]) -> int

 |     

 |     Return the number of non-overlapping occurrences of substring sub in

 |     string S[start:end].  Optional arguments start and end are

 |     interpreted as in slice notation.

 | 

 | encode(...)

 |     S.encode(encoding='utf-8', errors='strict') -> bytes

 |     

 |     Encode S using the codec registered for encoding. Default encoding

 |     is 'utf-8'. errors may be given to set a different error

 |     handling scheme. Default is 'strict' meaning that encoding errors raise

 |     a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and

 |     'xmlcharrefreplace' as well as any other name registered with

 |     codecs.register_error that can handle UnicodeEncodeErrors.

 | 

 | endswith(...)  # does the string end with the given suffix?

 |     S.endswith(suffix[, start[, end]]) -> bool

 |     

 |     Return True if S ends with the specified suffix, False otherwise.

 |     With optional start, test S beginning at that position.

 |     With optional end, stop comparing S at that position.

 |     suffix can also be a tuple of strings to try.

 | 

 | expandtabs(...)

 |     S.expandtabs(tabsize=8) -> str

 |     

 |     Return a copy of S where all tab characters are expanded using spaces.

 |     If tabsize is not given, a tab size of 8 characters is assumed.

 | 

 | find(...)  # index of the first occurrence of a substring

 |     S.find(sub[, start[, end]]) -> int

 |     

 |     Return the lowest index in S where substring sub is found,

 |     such that sub is contained within S[start:end].  Optional

 |     arguments start and end are interpreted as in slice notation.

 |     

 |     Return -1 on failure.

 | 

 | format(...)  # format the string

 |     S.format(*args, **kwargs) -> str

 |     

 |     Return a formatted version of S, using substitutions from args and kwargs.

 |     The substitutions are identified by braces ('{' and '}').

 | 

 | format_map(...)

 |     S.format_map(mapping) -> str

 |     

 |     Return a formatted version of S, using substitutions from mapping.

 |     The substitutions are identified by braces ('{' and '}').

 | 

 | index(...)

 |     S.index(sub[, start[, end]]) -> int

 |     

 |     Like S.find() but raise ValueError when the substring is not found.

 | 

 | isalnum(...)  # are all characters alphanumeric?

 |     S.isalnum() -> bool

 |     

 |     Return True if all characters in S are alphanumeric

 |      and there is at least one character in S,False otherwise.

 | 

 | isalpha(...)  # are all characters alphabetic?

 |     S.isalpha() -> bool

 |     

 |     Return True if all characters in S are alphabetic

 |     and there is at least one character in S, False otherwise.

 | 

 | isdecimal(...)

 |     S.isdecimal() -> bool

 |     

 |     Return True if there are only decimal characters in S,

 |     False otherwise.

 | 

 | isdigit(...)

 |     S.isdigit() -> bool

 |     

 |     Return True if all characters in S are digits

 |     and there is at least one character in S, False otherwise.

 | 

 | isidentifier(...)

 |     S.isidentifier() -> bool

 |     

 |     Return True if S is a valid identifier according

 |     to the language definition.

 |     

 |     Use keyword.iskeyword() to test for reserved identifiers

 |     such as "def" and "class".

 | 

 | islower(...)  # are all cased characters lowercase?

 |     S.islower() -> bool

 |     

 |     Return True if all cased characters in S are lowercase and there is

 |     at least one cased character in S, False otherwise.

 | 

 | isnumeric(...)

 |     S.isnumeric() -> bool

 |     

 |     Return True if there are only numeric characters in S,

 |     False otherwise.

 | 

 | isprintable(...)

 |     S.isprintable() -> bool

 |     

 |     Return True if all characters in S are considered

 |     printable in repr() or S is empty, False otherwise.

 | 

 | isspace(...)

 |     S.isspace() -> bool

 |     

 |     Return True if all characters in S are whitespace

 |     and there is at least one character in S, False otherwise.

 | 

 | istitle(...)

 |     S.istitle() -> bool

 |     

 |     Return True if S is a titlecased string and there is at least one

 |     character in S, i.e. upper- and titlecase characters may only

 |     follow uncased characters and lowercase characters only cased ones.

 |     Return False otherwise.

 | 

 | isupper(...)  # are all cased characters uppercase?

 |     S.isupper() -> bool

 |     

 |     Return True if all cased characters in S are uppercase and there is

 |     at least one cased character in S, False otherwise.

 | 

 | join(...)  # concatenate strings, with S as the separator

 |     S.join(iterable) -> str

 |     

 |     Return a string which is the concatenation of the strings in the

 |     iterable.  The separator between elements is S.

 | 

 | ljust(...)

 |     S.ljust(width[, fillchar]) -> str

 |     

 |     Return S left-justified in a Unicode string of length width. Padding is

 |     done using the specified fill character (default is a space).

 | 

 | lower(...)

 |     S.lower() -> str

 |     

 |     Return a copy of the string S converted to lowercase.

 | 

 | lstrip(...)

 |     S.lstrip([chars]) -> str

 |     

 |     Return a copy of the string S with leading whitespace removed.

 |     If chars is given and not None, remove characters in chars instead.

 | 

 | partition(...)

 |     S.partition(sep) -> (head, sep, tail)

 |     

 |     Search for the separator sep in S, and return the part before it,

 |     the separator itself, and the part after it.  If the separator is not

 |     found, return S and two empty strings.

 | 

 | replace(...)  # replace occurrences of a substring

 |     S.replace(old, new[, count]) -> str

 |     

 |     Return a copy of S with all occurrences of substring

 |     old replaced by new.  If the optional argument count is

 |     given, only the first count occurrences are replaced.

 | 

 | rfind(...)  # find, searching from the end of the string

 |     S.rfind(sub[, start[, end]]) -> int

 |     

 |     Return the highest index in S where substring sub is found,

 |     such that sub is contained within S[start:end].  Optional

 |     arguments start and end are interpreted as in slice notation.

 |     

 |     Return -1 on failure.

 | 

 | rindex(...)

 |     S.rindex(sub[, start[, end]]) -> int

 |     

 |     Like S.rfind() but raise ValueError when the substring is not found.

 | 

 | rjust(...)

 |     S.rjust(width[, fillchar]) -> str

 |     

 |     Return S right-justified in a string of length width. Padding is

 |     done using the specified fill character (default is a space).

 | 

 | rpartition(...)

 |     S.rpartition(sep) -> (head, sep, tail)

 |     

 |     Search for the separator sep in S, starting at the end of S, and return

 |     the part before it, the separator itself, and the part after it.  If the

 |     separator is not found, return two empty strings and S.

 | 

 | rsplit(...)

 |     S.rsplit(sep=None, maxsplit=-1) -> list of strings

 |     

 |     Return a list of the words in S, using sep as the

 |     delimiter string, starting at the end of the string and

 |     working to the front.  If maxsplitis given, at most maxsplit

 |     splits are done. If sep is not specified, any whitespace string

 |     is a separator.

 | 

 | rstrip(...)

 |     S.rstrip([chars]) -> str

 |     

 |     Return a copy of the string S with trailing whitespace removed.

 |     If chars is given and not None, remove characters in chars instead.

 | 

 | split(...)

 |     S.split(sep=None, maxsplit=-1) -> list of strings

 |     

 |     Return a list of the words in S, using sep as the

 |     delimiter string.  If maxsplit isgiven, at most maxsplit

 |     splits are done. If sep is not specified or is None, any

 |     whitespace string is a separator and empty strings are

 |     removed from the result.

 | 

 | splitlines(...)  # split into a list of lines

 |     S.splitlines([keepends]) -> list of strings

 |     

 |     Return a list of the lines in S, breaking at line boundaries.

 |     Line breaks are not included in the resulting list unless keepends

 |     is given and true.

 | 

 | startswith(...)

 |     S.startswith(prefix[, start[, end]]) -> bool

 |     

 |     Return True if S starts with the specified prefix, False otherwise.

 |     With optional start, test S beginning at that position.

 |     With optional end, stop comparing S at that position.

 |     prefix can also be a tuple of strings to try.

 | 

 | strip(...)  # return a copy with leading and trailing whitespace removed

 |     S.strip([chars]) -> str

 |     

 |     Return a copy of the string S with leading and trailing

 |     whitespace removed.

 |     If chars is given and not None, remove characters in chars instead.

 | 

 | swapcase(...)

 |     S.swapcase() -> str

 |     

 |     Return a copy of S with uppercase characters converted to lowercase

 |     and vice versa.

 | 

 | title(...)

 |     S.title() -> str

 |     

 |     Return a titlecased version of S, i.e. words start with title case

 |     characters, all remaining cased characters have lower case.

 | 

 | translate(...)

 |     S.translate(table) -> str

 |     

 |     Return a copy of the string S in which each character has been mapped

 |     through the given translation table. The table must implement

 |     lookup/indexing via __getitem__, for instance a dictionary or list,

 |     mapping Unicode ordinals to Unicode ordinals, strings, or None. If

 |     this operation raises LookupError, the character is left untouched.

 |     Characters mapped to None are deleted.

 | 

 | upper(...)

 |     S.upper() -> str

 |     

 |     Return a copy of S converted to uppercase.

 | 

 | zfill(...)

 |     S.zfill(width) -> str

 |     

 |     Pad a numeric string S with zeros on the left, to fill a field

 |     of the specified width. The string S is never truncated.

 | 

 | ----------------------------------------------------------------------

 | Static methods defined here:

 | 

 | maketrans(x, y=None, z=None, /)

 |     Return a translation table usable for str.translate().

 |     

 |     If there is only one argument, it must be a dictionary mapping Unicode

 |     ordinals (integers) or characters to Unicode ordinals, strings or None.

 |     Character keys will be then converted to ordinals.

 |     If there are two arguments, they must be strings of equal length, and

 |     in the resulting dictionary, each character in x will be mapped to the

 |     character at the same position in y. If there is a third argument, it

 |     must be a string, whose characters will be mapped to None in the result.

Differences between lists and strings

# A string and a list cannot be concatenated together

# Strings are immutable; a list's elements can be modified in place

# Lists are flexible: their elements can be paragraphs, sentences, phrases, words or characters (see the demonstration below)
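A small demonstration of the mutability difference (not from the book):

>>> beatles=['john','paul','george','ringo']
>>> beatles[0]='JOHN'   # a list element can be reassigned
>>> beatles
['JOHN', 'paul', 'george', 'ringo']
>>> query='Who knows?'
>>> query[0]='w'        # a string cannot be modified in place
Traceback (most recent call last):
  File "<input>", line 1, in <module>
TypeError: 'str' object does not support item assignment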

3 Text Processing with Unicode

Extracting encoded text from files

>>> path=nltk.data.find('corpora/unicode_samples/polish-lat2.txt')

>>> import codecs

# codecs reads encoded data into Unicode strings, and writes Unicode strings out in a given encoding

>>> f=codecs.open(path,encoding='latin2')  # latin2 is also known as ISO-8859-2

>>> for line in f:

...    line=line.strip()

# unicode_escape is a dummy encoding that converts all non-ASCII characters into \uXXXX escapes

# code points below 256 are shown as two-digit \xXX escapes

...    print(line.encode('unicode_escape'))

...

b'"Berlinka" to skarb kultury i sztuki niemieckiej. Przewiezione przez'

b'Niemc\\xf3w pod koniec II wojny \\u015bwiatowej na Dolny \\u015al\\u0105sk, zosta\\u0142y'

b'odnalezione po 1945 r. na terytorium Polski. Trafi\\u0142y do Biblioteki'

b'Jagiello\\u0144skiej w Krakowie, obejmuj\\u0105 ponad 500 tys. zabytkowych'

b'archiwali\\xf3w, m.in. manuskrypty Goethego, Mozarta, Beethovena, Bacha.'

# Finding the integer ordinal of a character

>>> ord('a')

97

>>> a=u'\u0061'

>>> a

'a'

>>> print(a)

a

>>> nacute=u'\u0144'

>>> nacute

'ń'

>>> nacute_utf=nacute.encode('utf-8')

>>> print(repr(nacute_utf))

b'\xc5\x84'

# For characters in the third line that are outside the ASCII range, print their UTF-8 escape values

>>> import unicodedata

>>> lines=codecs.open(path,encoding='latin2').readlines()

>>> line=lines[2]

>>> print(line.encode('unicode_escape'))

b'Niemc\\xf3w pod koniec II wojny \\u015bwiatowej na Dolny \\u015al\\u0105sk, zosta\\u0142y\\n'

>>> for c in line:

...     if ord(c)>127:

...        print('%rU+%04x%s' % (c.encode('utf-8'),ord(c),unicodedata.name(c)))

...

b'\xc3\xb3'U+00f3LATIN SMALL LETTER O WITH ACUTE

b'\xc5\x9b'U+015bLATIN SMALL LETTER S WITH ACUTE

b'\xc5\x9a'U+015aLATIN CAPITAL LETTER S WITH ACUTE

b'\xc4\x85'U+0105LATIN SMALL LETTER A WITH OGONEK

b'\xc5\x82'U+0142LATIN SMALL LETTER L WITH STROKE

Python string functions and the re module accept Unicode strings

>>> line.find(u'zosta\u0142y')

54

>>> line=line.lower()

>>> print(line.encode('unicode_escape'))

b'niemc\\xf3w pod koniec ii wojny \\u015bwiatowej na dolny \\u015bl\\u0105sk, zosta\\u0142y\\n'

>>> import re

>>> m=re.search(u'\u015b\w*',line)

>>> m.group()

'światowej'

NLTK tokenizers accept Unicode input

>>> nltk.word_tokenize(line)

['niemców', 'pod', 'koniec', 'ii', 'wojny', 'światowej', 'na', 'dolny', 'śląsk', ',', 'zostały']

Using your local encoding in Python

# Add this on the first or second line of the file

# -*- coding: <coding> -*-

The coding can be, for example, latin-1, big5 or utf-8.
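For example, a source file that embeds Polish characters could start like this (a minimal sketch):

# -*- coding: utf-8 -*-
nacute = 'ń'            # the file itself is saved as UTF-8
print(nacute)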

4 Using Regular Expressions to Detect Word Patterns

Using basic metacharacters

>>> import nltk

>>> import re

>>> wordlist=[w for w in nltk.corpus.words.words('en') if w.islower()]

# Find words ending in ed; re.search(p, s) tests whether pattern p occurs anywhere in string s

>>> [w for w in wordlist if re.search('ed$',w)]

['abaissed', 'abandoned', 'abased','abashed', 'abatised', 'abed', 'aborted', 'abridged', 'abscessed',

...

 'younghearted', 'zagged', 'zed', 'zeed','zigzagged', 'zonated', 'zoned']

# . matches any single character; ^ anchors the start, $ the end; ? makes the preceding character optional

# third character j, sixth character t

>>> [w for w in wordlist if re.search('^..j..t..$',w)]

['abjectly', 'adjuster', 'dejected','dejectly', 'injector', 'majestic', 'objectee', 'objector', 'rejecter','rejector', 'unjilted', 'unjolted', 'unjustly']

Ranges and closures

# Words that the T9 key sequence 4653 could produce on a phone keypad

>>> [w for w in wordlist if re.search('^[ghi][mno][jlk][def]$',w)]

['gold', 'golf', 'hold', 'hole']

# + means one or more of the preceding item (compare *, which means zero or more)

>>> chat_words=sorted(set(w for w in nltk.corpus.nps_chat.words()))

>>> [w for w in chat_words if re.search('^m+i+n+e+$',w)]

['miiiiiiiiiiiiinnnnnnnnnnneeeeeeeeee','miiiiiinnnnnnnnnneeeeeeee', 'mine', 'mmmmmmmmiiiiiiiiinnnnnnnnneeeeeeee']

>>> [w for w in chat_words if re.search('^[ha]+$',w)]

['a', 'aaaaaaaaaaaaaaaaa', 'aaahhhh', 'ah','ahah', 'ahahah', 'ahh', 'ahhahahaha', 'ahhh', 'ahhhh', 'ahhhhhh','ahhhhhhhhhhhhhh', 'h', 'ha', 'haaa', 'hah', 'haha', 'hahaaa', 'hahah','hahaha', 'hahahaa', 'hahahah', 'hahahaha', 'hahahahaaa', 'hahahahahaha','hahahahahahaha', 'hahahahahahahahahahahahahahahaha', 'hahahhahah','hahhahahaha']

# Match any character other than a vowel

[^aeiouAEIOU]

# Words consisting entirely of vowels

^[aeiouAEIOU]+$

# \. matches a literal period

>>> wsj=sorted(set(nltk.corpus.treebank.words()))

>>> [w for w in wsj if re.search('^[0-9]+\.[0-9]+$',w)]

['0.0085', '0.05', '0.1', '0.16', '0.2','0.25', '0.28', '0.3', '0.4', '0.5', '0.50', '0.54', '0.56', '0.60', '0.7',

...

'9.8', '9.82', '9.9', '92.9', '93.3','93.9', '94.2', '94.8', '95.09', '96.4', '98.3', '99.1', '99.3']

>>> [w for w in wsj if re.search('^[A-Z]+\$$',w)]

['C$', 'US$']

# {3,5} means at least 3 and at most 5 repetitions of the preceding item

>>> [w for w in wsj if re.search('^[0-9]+-[a-z]{3,5}$',w)]

['10-day', '10-lap', '10-year','100-share', '12-point', '12-year', '14-hour', '15-day', '150-point','190-point', '20-point', '20-stock', '21-month', '237-seat', '240-page','27-year', '30-day', '30-point', '30-share', '30-year', '300-day', '36-day','36-store', '42-year', '50-state', '500-stock', '52-week', '69-point','84-month', '87-store', '90-day']

>>> [w for w in wsj if re.search('^[a-z]{5,}-[a-z]{2,3}-[a-z]{,6}$',w)]

['black-and-white', 'bread-and-butter','father-in-law', 'machine-gun-toting', 'savings-and-loan']

# | matches one of the specified alternatives

>>> [w for w in wsj if re.search('(ed|ing)$',w)]

['62%-owned', 'Absorbed', 'According','Adopting', 'Advanced', 'Advancing', 'Alfred', 'Allied',

...

 'yielded', 'yielding', 'yttrium-containing','zoomed']

Common regular expression operators:

.          matches any single character

^abc       matches abc at the start of a string

abc$       matches abc at the end of a string

[abc]      matches one character from the set

[A-Z0-9]   matches one character from the range

ed|ing|s   matches one of the given strings

*          zero or more of the preceding item

+          one or more of the preceding item

?          zero or one of the preceding item (i.e. optional)

{n}        exactly n repeats (n non-negative)

{n,}       at least n repeats

{,n}       no more than n repeats

{m,n}      at least m and no more than n repeats

a(b|c)+    parentheses delimit the scope of the operators
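A few of these operators in action (a small illustration, not from the book):

>>> re.findall(r'^abc', 'abcdef')            # anchored at the start
['abc']
>>> re.findall(r'[aeiou]{2}', 'aerial')      # exactly two vowels in a row
['ae', 'ia']
>>> re.findall(r'a(?:b|c)+', 'abcbcx')       # parentheses delimit the scope of +
['abcbc']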

5 Useful Applications of Regular Expressions

Extracting word pieces

# Find all the vowels in a word, then count them

>>> word='supercalifragilisticexpialidocious'

>>> re.findall(r'[aeiou]',word)

['u', 'e', 'a', 'i', 'a', 'i', 'i', 'i','e', 'i', 'a', 'i', 'o', 'i', 'o', 'u']

>>> len(re.findall(r'[aeiou]',word))

16

# Find all sequences of two or more vowels and compute their relative frequency

>>> wsj=sorted(set(nltk.corpus.treebank.words()))

>>> fd=nltk.FreqDist(vs for word in wsj

...    for vs in re.findall(r'[aeiou]{2,}',word))

>>> fd.items()

dict_items([('eei', 2), ('aia', 1),('aiia', 1), ('au', 106), ('ao', 6), ('eo', 39), ('ioa', 1), ('ia', 253),('uu', 1), ('ui', 95), ('oa', 59), ('iai', 1), ('ueui', 1), ('ae', 11), ('ei',86), ('ai', 261), ('eou', 5), ('ou', 329), ('ee', 217), ('uo', 8), ('iou', 27),('ie', 331), ('uie', 3), ('iu', 14), ('aii', 1), ('iao', 1), ('eu', 18),('ooi', 1), ('ue', 105), ('oui', 6), ('oei', 1), ('ieu', 3), ('oi', 65), ('io',549), ('uou', 5), ('ea', 476), ('oo', 174), ('ua', 109), ('eau', 10), ('oe',15), ('eea', 1), ('aa', 3), ('uee', 4)])

Doing more with word pieces

# Keep initial vowel sequences, final vowel sequences, and all consonants; drop other vowels

>>> regexp=r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'

>>> def compress(word):

...    pieces=re.findall(regexp,word)

...    return ''.join(pieces)

>>> english_udhr=nltk.corpus.udhr.words('English-Latin1')

>>> print(nltk.tokenwrap(compress(w) for w in english_udhr[:75]))

Unvrsl Dclrtn f Hmn Rghts Prmbl Whrs rcgntn f th nhrnt dgnty nd f th ql

nd nlnbl rghts f ll mmbrs f th hmn fmly s th fndtn f frdm , jstc nd pc

n th wrld , Whrs dsrgrd nd cntmpt fr hmn rghts hv rsltd n brbrs cts

whch hv trgd th cnscnc f mnknd , nd th dvnt f a wrld n whch hmn bngs

shll njy frdm f spch nd

>>> rotokas_words=nltk.corpus.toolbox.words('rotokas.dic')

>>> cvs=[cv for w in rotokas_words for cv in re.findall(r'[ptksvr][aeiou]',w)]

>>> cfd=nltk.ConditionalFreqDist(cvs)

>>> cfd.tabulate()

     a    e    i    o    u

k  418  148   94  420  173

p   83   31  105   34   51

r  187   63   84   89   79

s    0    0  100    2    1

t   47    8    0  148   37

v   93   27  105   48   49

Finding word stems

# Simply strip off anything that looks like a suffix

>>> def stem(word):

...    for suffix in ['ing','ly','ed','ious','ies','ive','es','s','ment']:

...        if word.endswith(suffix):

...             return word[:-len(suffix)]

...    return word
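A couple of example calls (added for illustration):

>>> stem('processing')
'process'
>>> stem('government')
'govern'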

# Using regular expressions to extract the stem

>>> re.findall(r'^.*(ing|ly|ed|ious|ies|ive|es|s|ment)$','processing')

['ing']

>>> re.findall(r'^.*(?:ing|ly|ed|ious|ies|ive|es|s|ment)$','processing')

['processing']

>>> re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$','processing')

[('process', 'ing')]

# Non-greedy

>>> re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$','processes')

[('process', 'es')]

# Greedy

>>> re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$','processes')

[('processe', 's')]

>>> def stem(word):

...    regexp=r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'

...    stem,suffix=re.findall(regexp,word)[0]

...    return stem

...

>>> raw="""DENNIS: Listen, strange women lying in ponds distributing swords

... is no bassis for a system of government.Supreme executive power derives from

... a mandate form masses,not from some farcical aquatic ceremony."""

>>> tokens=nltk.word_tokenize(raw)

>>> [stem(t) for t in tokens]

['DENNIS', ':', 'Listen', ',', 'strange', 'women', 'ly', 'in', 'pond', 'distribut', 'sword', 'i', 'no', 'bassi', 'for', 'a', 'system', 'of', 'government.Supreme', 'execut', 'power', 'deriv', 'from', 'a', 'mandate', 'form', 'mass', ',', 'not', 'from', 'some', 'farcical', 'aquatic', 'ceremony', '.']

Searching tokenized text

# Find all phrases of the form "a <word> man"

>>> from nltk.corpus importgutenberg,nps_chat

>>> moby=nltk.Text(gutenberg.words('melville-moby_dick.txt'))

>>> moby.findall(r'<a>(<.*>)<man>')

monied; nervous; dangerous; white; white; white; pious; queer; good;

mature; white; Cape; great; wise; wise; butterless; white; fiendish;

pale; furious; better; certain; complete; dismasted; younger; brave;

brave; brave; brave

# Find three-word phrases ending in "bro"

>>> chat=nltk.Text(nps_chat.words())

>>> chat.findall(r'<.*><.*><bro>')

you rule bro; telling you bro; u twizted bro

# Find sequences of three or more words starting with the letter l

>>> chat.findall(r'<l.*>{3,}')

lol lol lol; lmao lol lol; lol lol lol; la la la la la; la la la; la

la la; lovely lol lol love; lol lol lol.; la la la; la la la

# Search the corpus for "x and other ys"

>>> from nltk.corpus import brown

>>> hobbies_learned=nltk.Text(brown.words(categories=['hobbies','learned']))

>>> hobbies_learned.findall(r'<\w*> <and> <other> <\w*s>')

speed and other activities; water and other liquids; tomb and other

landmarks; Statues and other monuments; pearls and other jewels;

charts and other items; roads and other features; figures and other

objects; military and other areas; demands and other factors;

abstracts and other compilations; iron and other metals

6 Normalizing Text

Stemmers (Porter and Lancaster)

>>> import nltk

>>> raw="""DENNIS: Listen, strange women lying in ponds distributing swords

... is no bassis for a system of government.Supreme executive power derives from

... a mandate form masses,not from some farcical aquatic ceremony."""

>>> tokens=nltk.word_tokenize(raw)

# Create the stemmers

>>> porter=nltk.PorterStemmer()

>>> lancaster=nltk.LancasterStemmer()

# Stem the tokens

>>> [porter.stem(t) for t in tokens]

['DENNI', ':', 'Listen', ',', 'strang','women', 'lie', 'in', 'pond', 'distribut', 'sword', 'is', 'no', 'bassi', 'for','a', 'system', 'of', 'government.Suprem', 'execut', 'power', 'deriv', 'from','a', 'mandat', 'form', 'mass', ',', 'not', 'from', 'some', 'farcic', 'aquat','ceremoni', '.']

>>> [lancaster.stem(t) for t in tokens]

['den', ':', 'list', ',', 'strange', 'wom','lying', 'in', 'pond', 'distribut', 'sword', 'is', 'no', 'bass', 'for', 'a','system', 'of', 'government.supreme', 'execut', 'pow', 'der', 'from', 'a','mand', 'form', 'mass', ',', 'not', 'from', 'som', 'farc', 'aqu', 'ceremony','.']

# Indexing a text with a stemmer

>>> import nltk

class IndexedText(object):

   def __init__(self, stemmer, text):

       self._text = text

       self._stemmer = stemmer

        self._index = nltk.Index((self._stem(word), i) for (i, word) in enumerate(text))

   def concordance(self, word, width=40):

       key = self._stem(word)

       wc = int(width / 4)

       for i in self._index[key]:

           lcontext = ' '.join(self._text[i - wc:i])

           rcontext = ' '.join(self._text[i:i + wc])

           ldisplay = '%*s' % (width, lcontext[-width:])

           rdisplay = '%-*s' % (width, rcontext[:width])

           print(ldisplay, rdisplay)

   def _stem(self, word):

       return self._stemmer.stem(word).lower()

porter = nltk.PorterStemmer()

grail = nltk.corpus.webtext.words('grail.txt')

text = IndexedText(porter, grail)

text.concordance('lie')

# The output:

r king ! DENNIS : Listen , strange women lying in ponds distributing swords is no

 beat a very brave retreat . ROBIN : All lies ! MINSTREL : [ singing ] Bravest of

      Nay . Nay . Come . Come . You may lie here . Oh , but you are wounded !

doctors immediately ! No , no , please ! Lie down . [ clap clap ] PIGLET : Well

ere is much danger , for beyond the cave lies the Gorge of Eternal Peril , which

  you . Oh ... TIM : To the north there lies a cave -- the cave of Caerbannog --

h it and lived ! Bones of full fifty men lie strewn about its lair . So , brave k

not stop our fight ' til each one of you lies dead , and the Holy Grail returns t

Lemmatization (the WordNet lemmatizer)

>>> import nltk

>>> raw="""DENNIS: Listen, strange women lying in ponds distributing swords

... is no bassis for a system of government.Supreme executive power derives from

... a mandate form masses,not from some farcical aquatic ceremony."""

>>> tokens=nltk.word_tokenize(raw)

# Useful when compiling the vocabulary of a text, or when you want a list of valid lemmas

>>> wnl=nltk.WordNetLemmatizer()

>>> [wnl.lemmatize(t) for t in tokens]

['DENNIS', ':', 'Listen', ',', 'strange','woman', 'lying', 'in', 'pond', 'distributing', 'sword', 'is', 'no', 'bassis','for', 'a', 'system', 'of', 'government.Supreme', 'executive', 'power','derives', 'from', 'a', 'mandate', 'form', 'mass', ',', 'not', 'from', 'some','farcical', 'aquatic', 'ceremony', '.']
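lemmatize() treats its input as a noun by default; passing a part-of-speech tag gives better results (an extra illustration, not from the book):

>>> wnl.lemmatize('lying', 'v')   # 'v' marks a verb
'lie'
>>> wnl.lemmatize('women')        # nouns are the default
'woman'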

7 Regular Expressions for Tokenizing Text

Simple approaches to tokenization

# Splitting text on whitespace

>>> import nltk

>>> import re

>>> raw="""'When I'm a Duchess,' she said herself,(not in a very hopefultone

... though),'I won't have any peper in my kitchen AT ALL.Soup does very

... well tithout--Maybe it's always peper that makes people hot-tempered,'..."""

# Splitting on single spaces only; pieces may still contain \n

>>> re.split(r' ',raw)

["'When", "I'm", 'a', "Duchess,'", 'she', 'said', 'herself,(not', 'in', 'a', 'very', "hopefultone\nthough),'I", "won't", 'have', 'any', 'peper', 'in', 'my', 'kitchen', 'AT', 'ALL.Soup', 'does', 'very\nwell', 'tithout--Maybe', "it's", 'always', 'peper', 'that', 'makes', 'people', "hot-tempered,'..."]

# '[ \t\n]+' matches one or more spaces, tabs or newlines

>>> re.split(r'[ \t\n]+',raw)

["'When", "I'm", 'a', "Duchess,'", 'she', 'said', 'herself,(not', 'in', 'a', 'very', 'hopefultone', "though),'I", "won't", 'have', 'any', 'peper', 'in', 'my', 'kitchen', 'AT', 'ALL.Soup', 'does', 'very', 'well', 'tithout--Maybe', "it's", 'always', 'peper', 'that', 'makes', 'people', "hot-tempered,'..."]

# \W matches any character other than letters, digits and underscore

>>> re.split(r'\W+',raw)

['', 'When', 'I', 'm', 'a', 'Duchess','she', 'said', 'herself', 'not', 'in', 'a', 'very', 'hopefultone', 'though','I', 'won', 't', 'have', 'any', 'peper', 'in', 'my', 'kitchen', 'AT', 'ALL','Soup', 'does', 'very', 'well', 'tithout', 'Maybe', 'it', 's', 'always','peper', 'that', 'makes', 'people', 'hot', 'tempered', '']

>>> re.findall(r'\w+|\S\w*',raw)

["'When", 'I', "'m",'a', 'Duchess', ',', "'", 'she', 'said', 'herself', ',', '(not','in', 'a', 'very', 'hopefultone', 'though', ')', ',', "'I", 'won',"'t", 'have', 'any', 'peper', 'in', 'my', 'kitchen', 'AT', 'ALL','.Soup', 'does', 'very', 'well', 'tithout', '-', '-Maybe', 'it',"'s", 'always', 'peper', 'that', 'makes', 'people', 'hot','-tempered', ',', "'", '.', '.', '.']

>>> print(re.findall(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*",raw))

["'", 'When', "I'm", 'a', 'Duchess', ',', "'", 'she', 'said', 'herself', ',', '(', 'not', 'in', 'a', 'very', 'hopefultone', 'though', ')', ',', "'", 'I', "won't", 'have', 'any', 'peper', 'in', 'my', 'kitchen', 'AT', 'ALL', '.', 'Soup', 'does', 'very', 'well', 'tithout', '--', 'Maybe', "it's", 'always', 'peper', 'that', 'makes', 'people', 'hot-tempered', ',', "'", '...']

Regular expression symbols:

\b    word boundary (zero width)

\d    any decimal digit ([0-9])

\D    any non-digit character ([^0-9])

\s    any whitespace character ([ \t\n\r\f\v])

\S    any non-whitespace character ([^ \t\n\r\f\v])

\W    any non-word character ([^a-zA-Z0-9_])

\t    the tab character

\n    the newline character
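A quick illustration of some of these symbols (not from the book):

>>> re.findall(r'\d+', 'That poster costs $12.40')   # runs of digits
['12', '40']
>>> re.split(r'\s+', 'one\ttwo  three\nfour')        # split on any whitespace
['one', 'two', 'three', 'four']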

The NLTK regular expression tokenizer: nltk.regexp_tokenize()

>>> text='That U.S.A. poster-print costs $12.40...'

>>> pattern=r'''(?x)      # (?x) allows verbose, commented regexps

... ([A-Z]\.)+            # abbreviations, e.g. U.S.A.

... | \w+(-\w+)*          # words with optional internal hyphens

... | \$?\d+(\.\d+)?%?    # currency and percentages, e.g. $12.40, 82%

... | \.\.\.              # ellipsis

... '''

>>> nltk.regexp_tokenize(text, pattern)

Further issues with tokenization

8 Segmentation

Sentence segmentation

# Average number of words per sentence

>>> len(nltk.corpus.brown.words())/len(nltk.corpus.brown.sents())

20.250994070456922

# An example of sentence segmentation

>>> import pprint

>>> sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')

>>> text=nltk.corpus.gutenberg.raw('chesterton-thursday.txt')

>>> sents=sent_tokenizer.tokenize(text)

>>> pp=pprint.PrettyPrinter(indent=4)

>>> pp.pprint(sents[171:181])

[  'In the wild events which were to follow this girl had no\n'

   'part at all; he never saw her again until all his tale was over.',

   'And yet, in some indescribable way, she kept recurring like a\n'

   'motive in music through all his mad adventures afterwards, and the\n'

   'glory of her strange hair ran like a red thread through those dark\n'

   'and ill-drawn tapestries of the night.',

   'For what followed was so\n'

   'improbable, that it might well have been a dream.',

   'When Syme went out into the starlit street, he found it for the\n'

   'moment empty.',

   'Then he realised (in some odd way) that the silence\n'

   'was rather a living silence than a dead one.',

   'Directly outside the\n'

   'door stood a street lamp, whose gleam gilded the leaves of the tree\n'

   'that bent out over the fence behind him.',

   'About a foot from the\n'

   'lamp-post stood a figure almost as rigid and motionless as the\n'

   'lamp-post itself.',

   'The tall hat and long frock coat were black; the\n'

   'face, in an abrupt shadow, was almost as dark.',

   'Only a fringe of\n'

   'fiery hair against the light, and also something aggressive in the\n'

   'attitude, proclaimed that it was the poet Gregory.',

   'He had something\n'

   'of the look of a masked bravo waiting sword in hand for his foe.']
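In current NLTK versions the same Punkt model is exposed through a convenience function, so the example above can be shortened:

>>> sents=nltk.sent_tokenize(text)   # loads the English Punkt tokenizer internally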

Word segmentation

>>> text='doyouseethekittyseethedoggydoyoulikethekittylikethedoggy'

>>> seg1='0000000000000001000000000010000000000000000100000000000'

>>> seg2='0100100100100001001001000010100100010010000100010010000'

>>> def segment(text,segs):

...     words=[]

...     last=0

...     for i in range(len(segs)):

...         if segs[i]=='1':

...             words.append(text[last:i+1])

...             last=i+1

...     words.append(text[last:])

...     return words

...

>>> segment(text,seg1)

['doyouseethekitty', 'seethedoggy','doyoulikethekitty', 'likethedoggy']

>>> segment(text,seg2)

['do', 'you', 'see', 'the', 'kitty', 'see','the', 'doggy', 'do', 'you', 'like', 'the', 'kitty', 'like', 'the', 'doggy']

# Compute the cost of storing the lexicon plus reconstructing the source text

>>> seg3='0000100100000011001000000110000100010000001100010000001'

>>> def evaluate(text,segs):

...     words=segment(text,segs)

...     text_size=len(words)

...     lexicon_size=len(''.join(list(set(words))))

...     return text_size+lexicon_size

...

>>> segment(text,seg3)

['doyou', 'see', 'thekitt', 'y', 'see','thedogg', 'y', 'doyou', 'like', 'thekitt', 'y', 'like', 'thedogg', 'y']

>>> evaluate(text,seg1)

63

>>> evaluate(text,seg2)

47

>>> evaluate(text,seg3)

46

# Non-deterministic search using simulated annealing

from random import randint
def flip(segs, pos):
    return segs[:pos] + str(1 - int(segs[pos])) + segs[pos + 1:]


def flip_n(segs, n):
    for i in range(n):
        segs = flip(segs, randint(0, len(segs) - 1))
    return segs


def anneal(text, segs, iterations, cooling_rate):
    temperature = float(len(segs))
    while temperature > 0.5:
        best_segs, best = segs, evaluate(text, segs)
        for i in range(iterations):
            guess = flip_n(segs, int(round(temperature)))
            score = evaluate(text, guess)
            if score < best:
                best, best_segs = score, guess
        score, segs = best, best_segs
        temperature = temperature / cooling_rate
        print(evaluate(text, segs), segment(text, segs))
    print()

    return segs

>>> anneal(text, seg1, 5000, 1.2)

63 ['doyouseethekitty', 'seethedoggy','doyoulikethekitty', 'likethedoggy']

63 ['doyouseethekitty', 'seethedoggy','doyoulikethekitty', 'likethedoggy']

63 ['doyouseethekitty', 'seethedoggy','doyoulikethekitty', 'likethedoggy']

63 ['doyouseethekitty', 'seethedoggy','doyoulikethekitty', 'likethedoggy']

63 ['doyouseethekitty', 'seethedoggy','doyoulikethekitty', 'likethedoggy']

63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty','likethedoggy']

60 ['doyousee', 'thekitty', 'seethedoggy','doyou', 'l', 'ike', 'thekitty', 'l', 'ike', 'thedoggy']

60 ['doyousee', 'thekitty', 'seethedoggy','doyou', 'l', 'ike', 'thekitty', 'l', 'ike', 'thedoggy']

60 ['doyousee', 'thekitty', 'seethedoggy','doyou', 'l', 'ike', 'thekitty', 'l', 'ike', 'thedoggy']

60 ['doyousee', 'thekitty', 'seethedoggy','doyou', 'l', 'ike', 'thekitty', 'l', 'ike', 'thedoggy']

58 ['doyo', 'u', 'see', 'thekitty', 'see','thedoggy', 'doyo', 'ul', 'ike', 'thekitty', 'l', 'i', 'k', 'e', 'thedoggy']

58 ['doyo', 'u', 'see', 'thekitty', 'see','thedoggy', 'doyo', 'ul', 'ike', 'thekitty', 'l', 'i', 'k', 'e', 'thedoggy']

54 ['doyo', 'u', 'see', 'thekitty', 'see','thedoggy', 'doyo', 'u', 'l', 'ike', 'thekitty', 'l', 'ik', 'e', 'thedoggy']

51 ['doyo', 'u', 'see', 'thekitty', 'see','t', 'hedoggy', 'doyo', 'u', 'l', 'ike', 'thekitty', 'l', 'ike', 't','hedoggy']

48 ['doyo', 'u', 'see', 'thekitty', 'see','t', 'hedoggy', 'doyo', 'u', 'like', 'thekitty', 'like', 't', 'hedoggy']

45 ['doyou', 'see', 'thekitty', 'see', 't','hedoggy', 'doyou', 'like', 'thekitty', 'like', 't', 'hedoggy']

45 ['doyou', 'see', 'thekitty', 'see', 't','hedoggy', 'doyou', 'like', 'thekitty', 'like', 't', 'hedoggy']

42 ['doyou', 'see', 'thekitty', 'see','thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']

42 ['doyou', 'see', 'thekitty', 'see','thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']

42 ['doyou', 'see', 'thekitty', 'see','thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']

42 ['doyou', 'see', 'thekitty', 'see','thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']

42 ['doyou', 'see', 'thekitty', 'see','thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']

42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy','doyou', 'like', 'thekitty', 'like', 'thedoggy']

42 ['doyou', 'see', 'thekitty', 'see','thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']

42 ['doyou', 'see', 'thekitty', 'see','thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']

42 ['doyou', 'see', 'thekitty', 'see','thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']

'0000100100000001001000000010000100010000000100010000000'

9 Formatting: From Lists to Strings

From lists to strings

>>> silly=['We','called','him','Tortoise','because','he','taught','us','.']

>>> ' '.join(silly)

'We called him Tortoise because he taught us .'

>>> ";".join(silly)

'We;called;him;Tortoise;because;he;taught;us;.'

>>> "".join(silly)

'WecalledhimTortoisebecausehetaughtus.'
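split() is the inverse of join(), so the original list can be recovered:

>>> 'We called him Tortoise because he taught us .'.split()
['We', 'called', 'him', 'Tortoise', 'because', 'he', 'taught', 'us', '.']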

Strings and formats

# Two ways to display the contents of an object

>>> word='cat'

>>> sentence="""hello

... world"""

>>> print(word)

cat

>>> print(sentence)

hello

world

>>> word

'cat'

>>> sentence

'hello\nworld'

>>> import nltk

>>> fdist=nltk.FreqDist(['dog','cat','dog','cat','dog','snake','dog','cat'])

>>> for word in fdist:

...    print(word,'->',fdist[word],';',)

...

dog -> 4 ;

snake -> 1 ;

cat -> 3 ;

# Using string formatting expressions

>>> for word in fdist:

...    print('%s->%d' % (word,fdist[word]),)

...

dog->4

snake->1

cat->3
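The same loop written with str.format(), the newer alternative to % expressions (an extra illustration):

>>> for word in fdist:
...    print('{}->{}'.format(word, fdist[word]))
...
dog->4
snake->1
cat->3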

Lining things up

# A frequency table of modal verbs in different sections of the Brown Corpus

>>> from nltk.corpus import brown

>>> def tabulate(cfdist, words, categories):

    print('%-16s' % 'Category', end='')   # end='' keeps the header row on one line

    for word in words:

        print('%6s' % word, end='')

    print()

    for category in categories:

        print('%-16s' % category, end='')

        for word in words:

            print('%6d' % cfdist[category][word], end='')

        print()

>>> cfd = nltk.ConditionalFreqDist(

    (genre, word)

    for genre in brown.categories()

    for word in brown.words(categories=genre))

genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']

modals = ['can', 'could', 'may', 'might', 'must', 'will']

tabulate(cfd, modals, genres)

Category           can could   may might  must  will

news                93    86    66    38    50   389

religion            82    59    78    12    54    71

hobbies            268    58   131    22    83   264

science_fiction     16    49     4    12     8    16

romance             74   193    11    51    45    43

humor               16    30     8     8     9    13

# Specifying the width of a field using a variable

>>> print('%*s' % (15, 'Monty Python'))

   Monty Python

Writing results to a file

# Write the words to output.txt

>>> output_file=open('output.txt','w')

>>> words=set(nltk.corpus.genesis.words('english-kjv.txt'))

>>> for word in sorted(words):

...    output_file.write(word+'\n')

>>> len(words)

2789

>>> str(len(words))

'2789'

>>> output_file.write(str(len(words))+"\n")

5

>>> output_file.close()

Text wrapping

>>> saying=['After','all','is','said','and','done',',','more','is','said','than','done']

>>> for word in saying:

...    print(word,"("+str(len(word))+")",)

...

After (5)

all (3)

is (2)

said (4)

and (3)

done (4)

, (1)

more (4)

is (2)

said (4)

than (4)

done (4)

>>> from textwrap import fill

>>> format='%s (%d),'

>>> pieces=[format % (word, len(word)) for word in saying]

>>> output=' '.join(pieces)

>>> wrapped=fill(output)

>>> print(wrapped)

After (5), all (3), is (2), said (4), and (3), done (4), , (1), more

(4), is (2), said (4), than (4), done (4),
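fill() wraps to 70 columns by default; a width argument changes that (an extra illustration, output computed by hand):

>>> print(fill(output, width=30))
After (5), all (3), is (2),
said (4), and (3), done (4), ,
more (4), is (2), said (4),
than (4), done (4),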