第八章 自然語言處理-概括資料-資料清洗加去掉常規詞語
阿新 • • 發佈:2018-12-19
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
"""Print the 2-grams of a remote text file that contain no common words.

The script downloads a text file, normalises it into a list of words,
counts every run of two consecutive words, discards each 2-gram in which
at least one word is a common English word, and prints the remaining
(2-gram, count) pairs ordered from most to least frequent.
"""
import operator
import re
import string
from collections import OrderedDict
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Words treated as "common". Stored as a frozenset so that each membership
# test in isCommon() is a hash lookup and the collection is built only once.
_COMMON_WORDS = frozenset([
    "the", "be", "and", "of", "a", "in", "to", "have", "it", "i", "that",
    "for", "you", "he", "with", "on", "do", "say", "this", "they", "is",
    "an", "at", "but", "we", "his", "from", "that", "not", "by", "she",
    "or", "as", "what", "go", "their", "can", "who", "get", "if", "would",
    "her", "all", "my", "make", "about", "know", "will", "as", "up", "one",
    "time", "has", "been", "there", "year", "so", "think", "when", "which",
    "them", "some", "me", "people", "take", "out", "into", "just", "see",
    "him", "your", "come", "could", "now", "than", "like", "other", "how",
    "then", "its", "our", "two", "more", "these", "want", "way", "look",
    "first", "also", "new", "because", "day", "more", "use", "no", "man",
    "find", "here", "thing", "give", "many", "well",
])


def cleanInput(input):
    """Split raw text into a list of cleaned, non-empty words.

    The parameter keeps its historical name ``input`` (which shadows the
    builtin) so that existing keyword callers continue to work.

    Args:
        input: The raw text as a ``str``.

    Returns:
        A list of ``str`` tokens in their original order and letter case.
        Runs of newlines and of spaces are collapsed to one space,
        bracketed digit groups such as ``[12]`` are removed, non-ASCII
        characters are dropped, and leading/trailing punctuation is
        stripped from every token.
    """
    text = re.sub(r'\n+', " ", input)
    text = re.sub(r'\[[0-9]*\]', "", text)
    text = re.sub(r' +', " ", text)
    # Round-trip through bytes to silently discard every non-ASCII character.
    text = text.encode("UTF-8").decode("ascii", "ignore")
    stripped = (token.strip(string.punctuation) for token in text.split(' '))
    # Every non-empty token is kept, single letters included; tokens that
    # consisted solely of punctuation become empty and are dropped.
    return [word for word in stripped if word]


def ngrams(input, n):
    """Count how often each run of ``n`` consecutive words occurs.

    Args:
        input: The raw text as a ``str``; it is cleaned with cleanInput().
        n: Number of consecutive words per n-gram.

    Returns:
        A ``dict`` mapping each space-joined n-gram to its occurrence
        count, in order of first appearance. Empty when the text has
        fewer than ``n`` words.
    """
    words = cleanInput(input)
    output = {}
    for start in range(len(words) - n + 1):
        gram = " ".join(words[start:start + n])
        output[gram] = output.get(gram, 0) + 1
    return output


def isCommon(ngram):
    """Return True if ``ngram``, compared case-insensitively, is a common word.

    Args:
        ngram: A single word as a ``str``.

    Returns:
        ``True`` when the lower-cased word is in the common-word set,
        ``False`` otherwise.
    """
    return ngram.lower() in _COMMON_WORDS


# The context manager closes the HTTP response even if reading or decoding
# fails.
with urlopen("https://pythonscraping.com/files/inaugurationSpeech.txt") as response:
    content = str(response.read(), 'utf-8')

# Stored under its own name so the ngrams() function is not overwritten by
# its result.
ngramCounts = ngrams(content, 2)

# Keep only the 2-grams in which no word is a common word.
nagramsPicked = {
    phrase: count
    for phrase, count in ngramCounts.items()
    if not any(isCommon(word) for word in phrase.split(" "))
}

# Sort the *filtered* counts; sorting the unfiltered dict would throw the
# common-word filtering away.
sortedNagrams = sorted(nagramsPicked.items(),
                       key=operator.itemgetter(1), reverse=True)
print(sortedNagrams)