第七章 7.1 資料清洗--將從網站上爬取的資料進行清洗然後轉為 2-grams 序列輸出
阿新 • 發佈:2018-12-19
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Scrape a Wikipedia page and print its 2-gram frequency table.

Fetches the "Python (programming language)" article body, strips markup
noise and citation markers from the text, and counts every adjacent pair
of words, printing the pairs sorted by frequency (descending).
"""
import re
import string
from collections import Counter, OrderedDict
from urllib.request import urlopen


def cleanInput(input):
    """Normalize raw page text into a list of ASCII word tokens.

    Collapses newlines and runs of spaces, removes ``[12]``-style
    citation markers, drops non-ASCII characters, and strips surrounding
    punctuation from each whitespace-separated token. Empty tokens are
    discarded.

    :param input: raw text extracted from the page
    :return: list of cleaned word tokens (order preserved)
    """
    # NOTE(review): parameter name `input` shadows the builtin; kept
    # unchanged for interface compatibility with existing callers.
    text = re.sub(r'\n+', " ", input)       # newlines -> single spaces
    text = re.sub(r'\[[0-9]*\]', "", text)  # drop [n] citation markers
    text = re.sub(r' +', " ", text)         # collapse runs of spaces
    # Round-trip through bytes to discard any non-ASCII characters.
    text = bytes(text, "UTF-8").decode("ascii", "ignore")
    tokens = []
    for raw in text.split(' '):
        token = raw.strip(string.punctuation)
        # Original condition `len(item) > 0 or item.lower() in ('a','t')`
        # reduces to "non-empty": the or-clause was dead code (an empty
        # string can never equal 'a' or 't').
        if token:
            tokens.append(token)
    return tokens


def ngrams(input, n):
    """Return (ngram, count) pairs for every n-word window in *input*.

    Each n-gram key is the ``str()`` of the token-list slice, matching
    the original output format. Order of the pairs is unspecified.

    :param input: raw text to tokenize and window over
    :param n: window size (e.g. 2 for bigrams)
    :return: list of ``(ngram_string, count)`` tuples
    """
    tokens = cleanInput(input)
    grams = [str(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
    # Counter replaces the original O(n^2) set + list.count() loop.
    return list(Counter(grams).items())


def main():
    """Fetch the article, compute bigram counts, and print the report."""
    # Third-party import kept local so the module can be imported
    # (e.g. for cleanInput/ngrams) without bs4 installed.
    from bs4 import BeautifulSoup

    html = urlopen("https://en.wikipedia.org/wiki/Python_(programming_language)")
    bsObj = BeautifulSoup(html, "html.parser")
    content = bsObj.find("div", {"id": "mw-content-text"}).get_text()
    counts = OrderedDict(sorted(ngrams(content, 2), key=lambda t: t[1], reverse=True))
    print(counts)
    print("2-ngrams count is " + str(len(counts)))


if __name__ == "__main__":
    main()