1. 程式人生 > 字典統計詞頻

字典統計詞頻

import pandas as pd
import numpy as np

# Column B holds several values per row, encoded as a single comma-separated
# string such as '4,2,1', so each cell can later be broken apart with
# str.split(','). Storing real lists ([[...], [...], [...]]) would not work
# for that purpose, because list objects have no split() method.
temp = pd.DataFrame(
    data={
        'A': [1, 2, 3],
        'B': ['4,2,1', '5,3,2', '6,4,3'],
    },
    index=['a', 'b', 'c'],
)
print(temp)
# Printed frame:
#    A      B
# a  1  4,2,1
# b  2  5,3,2
# c  3  6,4,3


# Walk the selected columns row by row. iterrows() yields one
# (index label, row) pair per row, where the row is a pandas Series.
for index, row in temp.loc[:, ['A', 'B']].iterrows():
    # Emit, one item per line: the index label, the whole row Series,
    # the cell in column A and the cell in column B.
    print(index, row, row['A'], row['B'], sep='\n')
    # Output for the first row (label 'a'):
    # a
    # A        1
    # B    4,2,1
    # Name: a, dtype: object
    # 1
    # 4,2,1
    #
    # Rows 'b' and 'c' follow in the same layout:
    #   b -> A is 2, B is 5,3,2
    #   c -> A is 3, B is 6,4,3


# Word-frequency count
# Approach 1 (arguably the simpler one): a defaultdict, so a word that is
# seen for the first time starts from 0 without explicit initialisation.
from collections import defaultdict

# int() returns 0, which makes `int` the idiomatic default factory for counters.
back = defaultdict(int)
# Only column B is needed, so iterate over its cells directly rather than
# using iterrows() on rows whose index label and column A are never read.
for cell in temp['B']:
    # Cells are comma-separated, hence split(','). For whitespace-separated
    # text, call split() with no argument instead.
    for word in cell.split(','):
        # Reading back[word] before this line would give 0 (and insert the
        # key), because of the default factory.
        back[word] += 1
        print(back[word])  # running count of the current word
        # Successive output for the sample frame: 1 1 1 1 1 2 1 2 2
print(back)
# On Python 3.7+ (dicts keep insertion order):
# defaultdict(<class 'int'>, {'4': 2, '2': 2, '1': 1, '5': 1, '3': 2, '6': 1})


# Approach 2: a plain dict, which has no default value for missing keys.
back = {}
# Only column B is needed, so iterate over its cells directly rather than
# using iterrows() on rows whose index label and column A are never read.
for cell in temp['B']:
    for word in cell.split(','):
        # back[word] raises KeyError the first time a word is seen, so the
        # first occurrence has to be seeded with a count of 1.
        try:
            back[word] = back[word] + 1
        except KeyError:
            # Catch only the missing-key case; a bare `except:` would also
            # swallow unrelated errors such as KeyboardInterrupt.
            back[word] = 1
        print(back[word])  # running count of the current word
        # Successive output for the sample frame: 1 1 1 1 1 2 1 2 2
print(back)
# On Python 3.7+ (dicts keep insertion order):
# {'4': 2, '2': 2, '1': 1, '5': 1, '3': 2, '6': 1}