Hands-Natural-language-processing-python 1: NLTK
阿新 • • 發佈:2018-12-23
基本用法:
>>> from nltk.tokenize import word_tokenize as wtoken >>> wtoken(samples_tw[20]) >>> from nltk.stem import PorterStemmer >>> stemming = PorterStemmer() >>> stemming.stem('enjoying') 'enjoy' >>> stemming.stem('enjoys') 'enjoy' >>> stemming.stem('enjoyable') 'enjoy' >>> from nltk.corpus import stopwords >>> sw_l = stopwords.words('english') >>> sw_l[20:40] ['himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this'] >>> example_text = "This is an example sentence to test stopwords" >>> example_text_without_stopwords = [word for word in example_text.split() if word not in sw_l] >>> example_text_without_stopwords ['This', 'example', 'sentence', 'test', 'stopwords']
>>> from nltk.corpus import webtext >>> webtext_sentences = webtext.sents('firefox.txt') >>> webtext_words = webtext.words('firefox.txt') >>> len(webtext_sentences) 1142 >>> len(webtext_words) 102457 >>> vocabulary = set(webtext_words) >>> len(vocabulary) 8296 >>> frequency_dist = nltk.FreqDist(webtext_words) >>> sorted(frequency_dist, key=frequency_dist.__getitem__, reverse=True)[0:30] ['.', 'in', 'to', '"', 'the', "'", 'not', '-', 'when', 'on', 'a', 'is', 't', 'and', 'of', '(', 'page', 'for', 'with', ')', 'window', 'Firefox', 'does', 'from', 'open', ':', 'menu', 'should', 'bar', 'tab'] >>> large_words = dict([(k,v) for k,v in frequency_dist.items() if len(k)>3]) >>> frequency_dist = nltk.FreqDist(large_words) >>> frequency_dist.plot(50, cumulative=False)
wcloud = WordCloud().generate_from_frequencies(frequency_dist)
import matplotlib.pyplot as plt
plt.imshow(wcloud, interpolation='bilinear')
<matplotlib.image.AxesImage object at 0x000000000DED65F8>
plt.axis('off')
(-0.5, 399.5, 199.5, -0.5)
plt.show()