數據加載存儲和文件格式
阿新 • • 發佈:2017-08-22
讀取文本 == nan most module message col art mos
原文地址:
https://github.com/AsuraDong/Blog/blob/master/Articles/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0/%E6%95%B0%E6%8D%AE%E5%8A%A0%E8%BD%BD%E5%AD%98%E5%82%A8%E5%92%8C%E6%96%87%E4%BB%B6%E6%A0%BC%E5%BC%8F.md
1.讀取文本格式數據
import pandas as pd
import numpy as np
import sys
import pymysql
# 圖片:pandas解析函數
df = pd.read_csv(‘ex1.csv‘)
print(df)
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
df = pd.read_table(‘ex1.csv‘,sep=‘,‘) #可以使用read_table,但必須指定分隔符
# sep還可以是正則表達式
print(df)
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
df = pd.read_csv(‘ex2.csv‘,header = None)#不是每一個csv都有header
print(df)
0 1 2 3 4
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
df = pd.read_csv(‘ex2.csv‘,names=[‘a‘,‘b‘,‘c‘,‘d‘,‘names‘])#指定名字
print(df)
a b c d names
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
names=[‘a‘,‘b‘,‘c‘,‘d‘,‘names‘]
df = pd.read_csv(‘ex2.csv‘,names=names,index_col=‘names‘) #將names做成索引
print(df)
# names列的三個值(hello、world、foo)成為行索引,a、b、c、d各列保留對應的值
a b c d
names
hello 1 2 3 4
world 5 6 7 8
foo 9 10 11 12
df = pd.read_csv(‘csv_mindex.csv‘)
print(‘原始樣子:‘,‘\n‘,df)
df = pd.read_csv(‘csv_mindex.csv‘,index_col=[‘keys‘,‘key2‘])
#層次化索引.
#請注意keys和key2的順序:index_col中的先後順序決定了層次化索引的內外層級
print(df)
原始樣子:
keys key2 value1 value2
0 one a 1 2
1 one b 3 4
2 two a 9 10
3 two c 13 14
value1 value2
keys key2
one a 1 2
b 3 4
two a 9 10
c 13 14
df = pd.read_csv(‘ex4.csv‘)
print(‘原始樣子:‘,‘\n‘,df)
#skiprows指定要跳過的行號(從0開始計數),這裡跳過第0行和第2行
print()
df = pd.read_csv(‘ex4.csv‘,skiprows=[0,2])
print(df)
原始樣子:
# hey!
a b c d message
# just wanted to make things more difficult NaN NaN NaN NaN
1 2 NaN 4 hello
a b c d message
0 1 2 NaN 4 hello
pd.isnull(df)# 處理缺失值
df = pd.read_csv(‘ex4.csv‘,skiprows=[0,2],na_values=[‘hello‘])# 接收一組用於表示缺失值的字符串
print(df)
print(pd.isnull(df))
a b c d message
0 1 2 NaN 4 NaN
a b c d message
0 False False True False True
sentinels = {‘message‘:[‘foo‘,‘NA‘],‘d‘:[‘a‘,‘NaN‘]}# 用一個字典為各列指定不同的NA標記值
df = pd.read_csv(‘ex4.csv‘,skiprows=[0,2],na_values=sentinels)
print(df)
a b c d message
0 1 2 NaN 4 hello
# 圖片:read_table/csv參數
2.逐塊讀取文本文件
# nrows參數指定只讀取前幾行數據(表頭行不計入);ex1.csv只有3行數據,所以nrows=4時仍返回3行
pd.read_csv(‘ex1.csv‘,nrows=4)
<style> .dataframe thead tr:only-child th { text-align: right; }
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
a | b | c | d | message | |
---|---|---|---|---|---|
0 | 1 | 2 | 3 | 4 | hello |
1 | 5 | 6 | 7 | 8 | world |
2 | 9 | 10 | 11 | 12 | foo |
# chunksize 指定分塊讀取
chunks = pd.read_csv(‘ex1.csv‘,chunksize=2)
print(chunks)
<pandas.io.parsers.TextFileReader object at 0x0000007D7E4A39B0>
for chunk in chunks:
    print(chunk)
    print('='*10,)
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
==========
a b c d message
2 9 10 11 12 foo
==========
3.將數據寫出到文本格式
data = pd.read_csv(‘ex1.csv‘,nrows=3)
data.to_csv(‘ex1_1.csv‘) #to_csv寫入
data.to_csv(‘ex1_2.csv‘,sep=‘|‘)# 別的分隔符
data.to_csv(‘ex1_1.csv‘,na_rep=‘NULL‘)# 缺失值會被替換為na_rep
data.to_csv(sys.stdout,index=False,header=False)
# 行、列標籤被禁用(index=False、header=False,不會輸出)
# 輸出到控制臺
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
data.to_csv(sys.stdout,index=False,columns=[‘a‘,‘b‘])
a,b
1,2
5,6
9,10
data.to_csv(sys.stdout)
,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo
4.DataFrame
# 可以將json格式的數據傳給DataFrame
# 也可以將數據庫的rows傳給DataFrame
conn = pymysql.Connect(host=‘172.31.238.166‘,port=3306,user=‘luowang‘,passwd=‘root‘, charset=‘UTF8‘,db=‘dyx‘)
cursor=conn.cursor()
sql=‘select * from access_log‘;
cursor.execute(sql)
rows= cursor.fetchall()
print(cursor.description)
((‘aid‘, 3, None, 16, 16, 0, False), (‘site_id‘, 3, None, 16, 16, 0, False), (‘count‘, 3, None, 32, 32, 0, False))
# cursor.description中每個元組的第一個元素保存了列名
# pd.DataFrame(rows,columns=[i[0] for i in cursor.description])
pd.DataFrame(rows,columns=zip(*cursor.description)[0])
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-74-05969a36ac33> in <module>()
1 # cursor.description第一個保存了列的信息
2 # pd.DataFrame(rows,columns=[i[0] for i in cursor.description])
----> 3 pd.DataFrame(rows,columns=zip(*cursor.description)[0])
TypeError: ‘zip‘ object is not subscriptable
[i[0] for i in cursor.description]
[‘aid‘, ‘site_id‘, ‘count‘]
pd.DataFrame(list(rows),columns=[i[0] for i in cursor.description]) #rows必須是list類型
<style> .dataframe thead tr:only-child th { text-align: right; }
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
aid | site_id | count | |
---|---|---|---|
0 | 1 | 1 | 45 |
1 | 2 | 3 | 100 |
2 | 3 | 1 | 230 |
3 | 4 | 2 | 10 |
4 | 5 | 5 | 205 |
5 | 6 | 4 | 13 |
6 | 7 | 3 | 220 |
7 | 8 | 5 | 545 |
8 | 9 | 3 | 201 |
9 | 10 | 10 | 10 |
10 | 11 | 11 | 11 |
數據加載存儲和文件格式