Reading and Writing Data in Text Format
# read_csv reads the file into a DataFrame
import pandas as pd
df = pd.read_csv(r'/Users/faye/Desktop/examples/ex1.csv')
df
>>>
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
In [4]:
# read_table reads the file with an explicitly specified delimiter
pd.read_table(r'/Users/faye/Desktop/examples/ex1.csv', sep = ',')
/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:2: FutureWarning: read_table is deprecated, use read_csv instead.
Out[4]:
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
In [7]:
# The source file has no header row, so assign default column names
pd.read_csv(r'/Users/faye/Desktop/examples/ex2.csv',header = None)
Out[7]:
0 1 2 3 4
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
In [9]:
# Specify custom column names
pd.read_csv(r'/Users/faye/Desktop/examples/ex2.csv',names = ['a','b','c','d','message'])
Out[9]:
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
In [10]:
# index_col specifies which column to use as the index
pd.read_csv(r'/Users/faye/Desktop/examples/ex2.csv',names = ['a','b','c','d','message'],index_col = 'message')
Out[10]:
a b c d
message
hello 1 2 3 4
world 5 6 7 8
foo 9 10 11 12
In [12]:
# Combine multiple columns into a hierarchical index
parsed = pd.read_csv(r'/Users/faye/Desktop/examples/csv_mindex.csv',index_col = ['key1','key2'])
parsed
Out[12]:
value1 value2
key1 key2
one a 1 2
b 3 4
c 5 6
d 7 8
two a 9 10
b 11 12
c 13 14
d 15 16
In [13]:
# Pass the regular expression \s+ as the delimiter for read_table
result = pd.read_table(r'/Users/faye/Desktop/examples/ex3.txt',sep = r'\s+')
result
/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:2: FutureWarning: read_table is deprecated, use read_csv instead.
Out[13]:
A B C
aaa -0.264438 -1.026059 -0.619500
bbb 0.927272 0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382 1.100491
In [14]:
# skiprows skips the given rows of the file
pd.read_csv(r'/Users/faye/Desktop/examples/ex4.csv',skiprows = [0,2,3]) # skip the first, third, and fourth rows
Out[14]:
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
Common parameters of the read_csv/read_table functions:
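The original parameter reference table is not reproduced here. As a stand-in, a minimal sketch combining several of the options shown above, plus na_values (a real read_csv parameter for marking sentinel strings as missing); the file path simply reuses the examples directory from above:
# names implies header=None; na_values lists strings to treat as NaN
pd.read_csv(r'/Users/faye/Desktop/examples/ex2.csv',
            names = ['a','b','c','d','message'],
            index_col = 'message',
            na_values = ['NULL'])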
Reading Text Files in Pieces
# Make the pandas display settings more compact
pd.options.display.max_rows = 10
result = pd.read_csv(r'/Users/faye/Desktop/examples/ex6.csv')
result
>>>
one two three four key
0 0.467976 -0.038649 -0.295344 -1.824726 L
1 -0.358893 1.404453 0.704965 -0.200638 B
2 -0.501840 0.659254 -0.421691 -0.057688 G
3 0.204886 1.074134 1.388361 -0.982404 R
4 0.354628 -0.133116 0.283763 -0.837063 Q
... ... ... ... ... ...
9995 2.311896 -0.417070 -1.409599 -0.515821 L
9996 -0.479893 -0.650419 0.745152 -0.646038 E
9997 0.523331 0.787112 0.486066 1.093156 K
9998 -0.362559 0.598894 -1.843201 0.887292 G
9999 -0.096376 -1.012999 -0.657431 -0.573315 0
10000 rows × 5 columns
In [17]:
# nrows specifies how many rows to read
pd.read_csv(r'/Users/faye/Desktop/examples/ex6.csv', nrows =5)
Out[17]:
one two three four key
0 0.467976 -0.038649 -0.295344 -1.824726 L
1 -0.358893 1.404453 0.704965 -0.200638 B
2 -0.501840 0.659254 -0.421691 -0.057688 G
3 0.204886 1.074134 1.388361 -0.982404 R
4 0.354628 -0.133116 0.283763 -0.837063 Q
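To read the file in pieces rather than a fixed number of rows, read_csv accepts a chunksize, which yields an iterator of DataFrame chunks. A sketch that tallies the key column across chunks of ex6.csv:
# chunksize returns a TextFileReader that yields DataFrame pieces
chunker = pd.read_csv(r'/Users/faye/Desktop/examples/ex6.csv', chunksize = 1000)
tot = pd.Series([], dtype = float)
for piece in chunker:
    # accumulate per-key counts, treating keys absent from a chunk as 0
    tot = tot.add(piece['key'].value_counts(), fill_value = 0)
tot = tot.sort_values(ascending = False)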
Writing Data Out to Text Format
data = pd.read_csv(r'/Users/faye/Desktop/examples/ex5.csv')
data
>>>
something a b c d message
0 one 1 2 3.0 4 NaN
1 two 5 6 NaN 8 world
2 three 9 10 11.0 12 foo
In [21]:
# to_csv writes the data out to a comma-separated file
data.to_csv(r'/Users/faye/Desktop/examples/out.csv')
In [22]:
# Use a different delimiter
import sys
data.to_csv(sys.stdout,sep = '|')
|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo
In [23]:
# Mark missing values with the na_rep sentinel
data.to_csv(sys.stdout,na_rep = 'NULL')
,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo
In [24]:
# Disable the row and column labels
data.to_csv(sys.stdout,index = False,header = False)
one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo
In [25]:
# Write only a subset of the columns, in the specified order
data.to_csv(sys.stdout,index = False,columns = ['a','b','c'])
a,b,c
1,2,3.0
5,6,
9,10,11.0
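Series also has a to_csv method; a small sketch writing a date-indexed Series (the tseries.csv filename is illustrative):
import numpy as np
# build a 7-day date-indexed Series and write it out
dates = pd.date_range('1/1/2000', periods = 7)
ts = pd.Series(np.arange(7), index = dates)
ts.to_csv(r'/Users/faye/Desktop/examples/tseries.csv')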
JSON Data
# json.loads converts a JSON string to Python form
obj = """
{"name": "Wes",
"places_lived": ["United States", "Spain", "Germany"],
"pet": null,
"siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]},
{"name": "Katie", "age": 38,
"pets": ["Sixes", "Stache", "Cisco"]}]
} """
import json
result = json.loads(obj)
result
>>>
{'name': 'Wes',
'places_lived': ['United States', 'Spain', 'Germany'],
'pet': None,
'siblings': [{'name': 'Scott', 'age': 30, 'pets': ['Zeus', 'Zuko']},
{'name': 'Katie', 'age': 38, 'pets': ['Sixes', 'Stache', 'Cisco']}]}
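A list of dicts such as result['siblings'] can be passed straight to the DataFrame constructor, with columns selecting a subset of the fields; a short sketch:
# each dict becomes one row; columns keeps only the name and age fields
siblings = pd.DataFrame(result['siblings'], columns = ['name', 'age'])
siblings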
In [34]:
# json.dumps converts a Python object to JSON
asjson = json.dumps(result)
asjson
Out[34]:
'{"name": "Wes", "places_lived": ["United States", "Spain", "Germany"], "pet": null, "siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]}, {"name": "Katie", "age": 38, "pets": ["Sixes", "Stache", "Cisco"]}]}'
In [35]:
# pandas.read_json automatically converts JSON datasets in particular arrangements into a Series or DataFrame
data = pd.read_json(r'/Users/faye/Desktop/examples/example.json')
data
Out[35]:
a b c
0 1 2 3
1 4 5 6
2 7 8 9
In [36]:
# to_json exports data from pandas to JSON
data.to_json()
Out[36]:
'{"a":{"0":1,"1":4,"2":7},"b":{"0":2,"1":5,"2":8},"c":{"0":3,"1":6,"2":9}}'
XML and HTML: Web Scraping
# read_html can use lxml and Beautiful Soup to automatically parse tables in HTML files into DataFrames
tables = pd.read_html(r'/Users/faye/Desktop/examples/fdic_failed_bank_list.html')
tables
>>>
[ Bank Name City ST CERT \
0 Allied Bank Mulberry AR 91
1 The Woodbury Banking Company Woodbury GA 11297
2 First CornerStone Bank King of Prussia PA 35312
3 Trust Company Bank Memphis TN 9956
4 North Milwaukee State Bank Milwaukee WI 20364
.. ... ... .. ...
542 Superior Bank, FSB Hinsdale IL 32646
543 Malta National Bank Malta OH 6629
544 First Alliance Bank & Trust Co. Manchester NH 34264
545 National State Bank of Metropolis Metropolis IL 3815
546 Bank of Honolulu Honolulu HI 21029
Acquiring Institution Closing Date \
0 Today's Bank September 23, 2016
1 United Bank August 19, 2016
2 First-Citizens Bank & Trust Company May 6, 2016
3 The Bank of Fayette County April 29, 2016
4 First-Citizens Bank & Trust Company March 11, 2016
.. ... ...
542 Superior Federal, FSB July 27, 2001
543 North Valley Bank May 3, 2001
544 Southern New Hampshire Bank & Trust February 2, 2001
545 Banterra Bank of Marion December 14, 2000
546 Bank of the Orient October 13, 2000
Updated Date
0 November 17, 2016
1 November 17, 2016
2 September 6, 2016
3 September 6, 2016
4 June 16, 2016
.. ...
542 August 19, 2014
543 November 18, 2002
544 February 18, 2003
545 March 17, 2005
546 March 17, 2005
[547 rows x 7 columns]]
In [38]:
len(tables)
Out[38]:
1
In [40]:
failures = tables[0]
failures
Out[40]:
Bank Name City ST CERT Acquiring Institution Closing Date Updated Date
0 Allied Bank Mulberry AR 91 Today's Bank September 23, 2016 November 17, 2016
1 The Woodbury Banking Company Woodbury GA 11297 United Bank August 19, 2016 November 17, 2016
2 First CornerStone Bank King of Prussia PA 35312 First-Citizens Bank & Trust Company May 6, 2016 September 6, 2016
3 Trust Company Bank Memphis TN 9956 The Bank of Fayette County April 29, 2016 September 6, 2016
4 North Milwaukee State Bank Milwaukee WI 20364 First-Citizens Bank & Trust Company March 11, 2016 June 16, 2016
... ... ... ... ... ... ... ...
542 Superior Bank, FSB Hinsdale IL 32646 Superior Federal, FSB July 27, 2001 August 19, 2014
543 Malta National Bank Malta OH 6629 North Valley Bank May 3, 2001 November 18, 2002
544 First Alliance Bank & Trust Co. Manchester NH 34264 Southern New Hampshire Bank & Trust February 2, 2001 February 18, 2003
545 National State Bank of Metropolis Metropolis IL 3815 Banterra Bank of Marion December 14, 2000 March 17, 2005
546 Bank of Honolulu Honolulu HI 21029 Bank of the Orient October 13, 2000 March 17, 2005
547 rows × 7 columns
In [41]:
failures.head()
Out[41]:
Bank Name City ST CERT Acquiring Institution Closing Date Updated Date
0 Allied Bank Mulberry AR 91 Today's Bank September 23, 2016 November 17, 2016
1 The Woodbury Banking Company Woodbury GA 11297 United Bank August 19, 2016 November 17, 2016
2 First CornerStone Bank King of Prussia PA 35312 First-Citizens Bank & Trust Company May 6, 2016 September 6, 2016
3 Trust Company Bank Memphis TN 9956 The Bank of Fayette County April 29, 2016 September 6, 2016
4 North Milwaukee State Bank Milwaukee WI 20364 First-Citizens Bank & Trust Company March 11, 2016 June 16, 2016
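From here the Closing Date strings can be parsed with pd.to_datetime for a quick analysis, e.g. counting bank failures per year (a sketch, assuming the column layout shown above):
# parse the dates, then tally failures by calendar year
close_timestamps = pd.to_datetime(failures['Closing Date'])
close_timestamps.dt.year.value_counts()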
Binary Data Formats
# to_pickle saves the data to disk in pickle format (recommended only for short-term storage, since the format may not be stable across pandas versions)
frame = pd.read_csv(r'/Users/faye/Desktop/examples/ex1.csv')
frame
>>>
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
In [45]:
frame.to_pickle(r'/Users/faye/Desktop/examples/frame_pickle')
In [46]:
# read_pickle reads the pickled data back
pd.read_pickle(r'/Users/faye/Desktop/examples/frame_pickle')
Out[46]:
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
Reading Excel Files
pd.read_excel(r'/Users/faye/Desktop/examples/ex1.xlsx')
Out[47]:
Unnamed: 0 a b c d message
0 0 1 2 3 4 hello
1 1 5 6 7 8 world
2 2 9 10 11 12 foo
In [51]:
# to_excel writes the data to an Excel file
frame.to_excel(r'/Users/faye/Desktop/examples/ex3.xlsx')
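For more control over the output (such as the sheet name), the write can also go through an ExcelWriter; a minimal sketch using a context manager so the file is saved and closed automatically:
# writing via ExcelWriter lets you name the target sheet explicitly
with pd.ExcelWriter(r'/Users/faye/Desktop/examples/ex3.xlsx') as writer:
    frame.to_excel(writer, sheet_name = 'Sheet1')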
Interacting with Web APIs
import requests
url = 'https://api.github.com/repos/pandas-dev/pandas/issues'
resp = requests.get(url)
resp
Out[52]:
<Response [200]>
In [53]:
data = resp.json()
data[0]['title']
Out[53]:
'CI'
In [55]:
issues = pd.DataFrame(data, columns=['number', 'title', 'labels', 'state'])
issues
Out[55]:
number title labels state
0 27868 CI [] open
1 27867 CI: Failing to compile for Travis 3.7 build [] open
2 27866 Added missing space to error description [] open
3 27865 Cannot use .ix in IntervaIndex('pandas._libs.i... [{'id': 2822098, 'node_id': 'MDU6TGFiZWwyODIyM... open
4 27862 DOC: See also for DataFrame.iterrows() should ... [{'id': 134699, 'node_id': 'MDU6TGFiZWwxMzQ2OT... open
... ... ... ... ...
25 27827 BUG: Fixed groupby quantile for listlike q [{'id': 233160, 'node_id': 'MDU6TGFiZWwyMzMxNj... open
26 27826 BUG: Fix groupby quantile segfault [{'id': 233160, 'node_id': 'MDU6TGFiZWwyMzMxNj... open
27 27825 Dispatch pd.is_na for scalar extension value [{'id': 849023693, 'node_id': 'MDU6TGFiZWw4NDk... open
28 27824 Document DataFrame._constructor_sliced-like pr... [{'id': 35818298, 'node_id': 'MDU6TGFiZWwzNTgx... open
29 27820 DOC: clarify that read_parquet accepts a direc... [{'id': 134699, 'node_id': 'MDU6TGFiZWwxMzQ2OT... open
30 rows × 4 columns
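The raw labels column holds lists of GitHub label objects; a hedged sketch reducing them to readable names (assuming each label dict carries a 'name' field, as the GitHub API provides):
# map each list of label dicts to just the label names
issues['labels'].apply(lambda labels: [label['name'] for label in labels])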