1. 程式人生 > 階段作業1:完整的中英文詞頻統計+補交上次作業

階段作業1:完整的中英文詞頻統計+補交上次作業

# Make-up assignment: word-frequency count for an English song lyric.
from collections import Counter

cc = ('''Counting stars Lately I've been, I've been losing sleep   
Dreaming 'bout the things that we could be   
But baby I've been, I've been prayin' hard     
Said no more counting dollars   We'll be counting stars   
Yeah, we'll be counting stars   I see this life Like a swinging vine  
 Swing my heart across the line   In my face is flashing signs   Seek it out and ye shall find
  Old, but I'm not that old   Young, but I'm not that bold   And I don't think the world is sold  
 I'm just doing what we're told   I, feel something so right   But doing the wrong thing   
I, feel something so wrong   But doing the right thing   I could lie, could lie, could lie  
 everything that kills me makes me feel alive   Lately I've been, I've been losing sleep  
 Dreaming 'bout the things that we could be   Baby I've been, I've been prayin' hard  
 Said no more counting dollars   We'll be counting stars   Lately I've been, I've been losing sleep   
Dreaming 'bout the things that we could be   Baby I've been, I've been prayin' hard   Said no more counting dollars  
 We'll be, we'll be counting stars   I feel the love And I feel it burn   Down this river every turn  
 Hope is a four letter word   Make that money   Watch it burn   Old, but I'm not that old  
 Young, but I'm not that bold   And I don't think the world is sold   I'm just doing what we're told  
 I, feel something so wrong   But doing the right thing   I could lie, could lie, could lie  
 Everything that drowns me makes me wanna fly   Lately I've been, I've been losing sleep  
 Dreaming 'bout the things that we could be   Baby I've been, I've been prayin' hard
  Said no more counting dollars   We'll be counting stars   Lately I've been, I've been losing sleep  
 Dreaming 'bout the things that we could be   Baby I've been, I've been prayin' hard  
 Said no more counting dollars   We'll be, we'll be counting stars   Take that money And watch it burn   Sink in the river
''')

# Turn sentence punctuation into spaces so that "been," and "been" are counted
# as the same word. Apostrophes are kept: they belong to contractions such as
# "I've" and "we'll".
for ch in '.,':
    cc = cc.replace(ch, ' ')

# Split into words and report the number of words (not characters).
ccList = cc.split()
print(len(ccList), ccList)

# The set holds every distinct word exactly once.
ccSet = set(ccList)
print(ccSet)

# Count whole words from the token list. Counting on the raw string would
# match substrings too (e.g. "be" inside "been") and inflate the numbers.
strDict = dict(Counter(ccList))
for key in ccSet:
    print(key, strDict[key])

# (word, count) pairs; a set has no .items(), so they come from the dict.
wclist = list(strDict.items())
print(wclist)

# Sort by frequency, most frequent first.
wcList = list(strDict.items())
print(wcList)
wcList.sort(key=lambda x: x[1], reverse=True)
print(wcList)

# Print the TOP 20; slicing keeps this safe when fewer than 20 distinct
# words exist.
for pair in wcList[:20]:
    print(pair)


# Traversing a list: build it, append one element, drop the element at
# index 2, then print every remaining element on its own line.

cclist = ['wqdq', 'dqd', 'Awd', 313, '小四', 'dqd']
print(cclist)

# In-place extension with a one-element list appends that element.
cclist += ['gegeheh']
print(cclist)

# Remove the third element ('Awd').
del cclist[2]
print(cclist)

for element in cclist:
    print(element)

# Traversing a tuple.
# The value is bound to `tup` instead of `tuple` so that the built-in
# `tuple` type stays usable in the rest of the module.

tup = ('jtfjhrr', 'rqfw f2q', 800, 10)
print(tup[2])  # index access: third element
for item in tup:
    print(item)

# Traversing a dictionary: look up two keys, overwrite one of them, then
# walk over every key/value pair.

dic = {
    'fhehe': '4w6436',
    'jgdns': 7,
    '4w6436': 'First',
}

for name in ('fhehe', '4w6436'):
    print(name + ':', dic[name])

# The same key is assigned twice in a row; only the second value survives.
dic['4w6436'] = 8
dic['4w6436'] = "對接歐文機房的維護"

# The updated value is printed two times.
for _ in range(2):
    print('4w6436:', dic['4w6436'])

for key, value in dic.items():
    print(key, ':', value)

# Traversing a set: create it, add two members (printing after each add),
# remove one member, then print every member on its own line.

a = set((1, 2, 3, 6, 5))
print(a)

for extra in (4, 'uteru'):
    a.add(extra)
    print(a)

a.remove(5)
print(a)

for member in a:
    print(member)

  

 

 

# Current assignment: English word-frequency count for a text file.
from collections import Counter

# Read the whole file and lower-case it so that "The" and "the" are the same
# word. The context manager closes the file even if reading fails.
with open('ccc1015.txt', 'r', encoding='utf-8') as fo:
    strBig = fo.read().lower()
print(strBig)

# Pre-processing: punctuation and special symbols become spaces. Replacing
# with a space (not an empty string) keeps "a,b" as two words instead of
# gluing them into "ab".
sep = """.,:;!?"""
for ch in sep:
    strBig = strBig.replace(ch, ' ')
strlist = strBig.split()
print(len(strlist), strlist)

# Stop words to leave out of the statistics. They are lower-case because the
# text was lower-cased above (an upper-case 'I' could never match).
exclude = {'is', 'be', 'i', 'we', 'the', 'in'}
strSet = set(strlist) - exclude
print(len(strSet), strSet)

# Count every word in a single pass, then keep only the non-excluded words.
counts = Counter(strlist)
strDict = {word: counts[word] for word in strSet}
print(len(strDict), strDict)

# Sort by frequency, most frequent first.
wcList = list(strDict.items())
print(wcList)
wcList.sort(key=lambda x: x[1], reverse=True)
print(wcList)

# Print the TOP 20; slicing keeps this safe when fewer than 20 distinct
# words exist.
for pair in wcList[:20]:
    print(pair)




# Chinese version: word-frequency count for a Chinese text file.
from collections import Counter

import jieba  # Chinese word segmentation ("jieba")

# Read the text file; the context manager closes it even if reading fails.
with open('shengxu.txt', 'r', encoding='utf-8') as f:
    story = f.read()
print(story)

# Pre-processing: replace each listed punctuation mark with a space.
sep = ',。:“”?!'
for ch in sep:
    story = story.replace(ch, ' ')
print(story)  # printed once, after all replacements are done

# Chinese word segmentation with jieba (accurate mode).
cnStr = story
# Chinese has no spaces between words, so the words come from the jieba
# segmentation rather than from splitting on spaces. Whitespace-only tokens
# (spaces, newlines) are dropped so they are not counted as words.
strList = [word for word in jieba.cut(cnStr) if word.strip()]
print(len(strList), strList)

# Word-count dictionary: every distinct word mapped to its number of
# occurrences.
strSet = set(strList)
print(len(strSet), strSet)
strDict = dict(Counter(strList))

# Sort by frequency, most frequent first.
wcList = list(strDict.items())
wcList.sort(key=lambda x: x[1], reverse=True)

# Print the TOP 10; slicing keeps this safe when fewer than 10 distinct
# words exist.
for pair in wcList[:10]:
    print(pair)