pandas中的常用索引方式总结(一)–df[]方式
import pandas as pd
import numpy as np
1.Series对象
# Build four Series holding the same values 1..4 but labelled differently,
# so the later examples can contrast label-based and position-based access.
# Every Series is given its own freshly created array.
separator = "---------------------------"
s1 = pd.Series(data=np.array([1, 2, 3, 4]))  # no index given: default labels 0..3
s2 = pd.Series(data=np.array([1, 2, 3, 4]), index=list("abcd"))  # string labels
s3 = pd.Series(data=np.array([1, 2, 3, 4]), index=[4, 3, 2, 1])  # integer labels, descending
s4 = pd.Series(data=np.array([1, 2, 3, 4]), index=[1, 2, 3, 4])  # integer labels starting at 1
# A single print call joined by newlines emits the same text as printing
# each Series and each separator line one after another.
print(s1, separator, s2, separator, s3, separator, s4, sep="\n")
0 1
1 2
2 3
3 4
dtype: int32
---------------------------
a 1
b 2
c 3
d 4
dtype: int32
---------------------------
4 1
3 2
2 3
1 4
dtype: int32
---------------------------
1 1
2 2
3 3
4 4
dtype: int32
# Looking up a single label returns the scalar stored under 'a'.
a = s2['a']
print(a)
# Rebinding the name `a` only changes this variable; s2 is printed next to
# show that it is unchanged.
a = 2
print(s2)
1
a 1
b 2
c 3
d 4
dtype: int32
# Label slice: both endpoints are included, so 'a', 'b' and 'c' are selected.
b = s2['a':'c']
print(b)
# Assign through the slice result, then print the parent Series; in the
# recorded run the write is visible there (s2['a'] becomes 3).
# NOTE(review): with Copy-on-Write enabled the parent would stay unchanged -
# confirm against the pandas version in use.
b['a'] = 3
print(s2)
a 1
b 2
c 3
dtype: int32
a 3
b 2
c 3
d 4
dtype: int32
# Integer slice on a string-labelled Series: the bounds are positions, so this
# selects the first three elements and excludes the right end.
c = s2[0:3]
print(c)
# Write the first element explicitly by position. A bare c[0] relies on the
# integer-as-position fallback for non-integer indexes, which pandas has
# deprecated (a future version treats the key as a label); .iloc states the
# positional intent unambiguously.
c.iloc[0] = 9
# Print the parent to show whether the write through the slice reached it.
print(s2)
a 1
b 2
c 3
dtype: int32
a 9
b 2
c 3
d 4
dtype: int32
d = s3[0:3] # When labels and positions are both integers, a slice is taken by position (consistent with ordinary Python slicing).
print(d)
e = s3[4:3] # Because slices go by position, s3[4:3] is read as positions 4 to 3, so the result is empty.
print(e)
f = s3[4] # A single key without a slice, however, is treated as an index label.
print(f)
4 1
3 2
2 3
dtype: int32
Series([], dtype: int32)
1
# s4 is labelled 1..4, so a position and its label differ by one here.
# The slice is positional: positions 1 and 2, i.e. labels 2 and 3.
f = s4[1:3]
print(f)
# A single integer is a label lookup: label 1 holds the value 1.
h = s4[1]
print(h)
g = s4[0] # 0 is treated as a label; this Series has no label 0, so a KeyError is raised.
print(g)
2 2
3 3
dtype: int32
1
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-23-5bc245228087> in <module>()
5 print(h)
6
----> 7 g = s4[0] #由于是将0当成标签,但是这个Series中没有0标签,于是报错
8 print(g)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in __getitem__(self, key)
764 key = com._apply_if_callable(key, self)
765 try:
--> 766 result = self.index.get_value(self, key)
767
768 if not is_scalar(result):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_value(self, series, key)
3101 try:
3102 return self._engine.get_value(s, k,
-> 3103 tz=getattr(series.dtype, 'tz', None))
3104 except KeyError as e1:
3105 if len(self) > 0 and self.inferred_type in ['integer', 'boolean']:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_value()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_value()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
KeyError: 0
# Selecting with a list of labels returns the elements in the requested order.
i = s2[['b','d','a']]
print(i)
# Overwrite the first element explicitly by position. A bare i[0] relies on
# the integer-as-position fallback for non-integer indexes, which pandas has
# deprecated (a future version treats the key as a label); .iloc keeps the
# positional meaning explicit.
i.iloc[0] = 8
print(i)
# Print the parent last: the list-based selection produced a new object, so
# the write above is not expected to show up in s2.
print(s2)
b 2
d 4
a 1
dtype: int32
b 8
d 4
a 1
dtype: int32
a 1
b 2
c 3
d 4
dtype: int32
j = s3[[1,2,3]] # With integer labels, a list of integers is matched against the labels rather than the positions (the opposite of slicing).
print(j)
1 4
2 3
3 2
dtype: int32
综上,我们可以总结出以下几点结论(重点,敲黑板):
1.在pandas中,用df[]方式切片,如果是用标签切片,那么得到的结果是两端都封闭的,如果用下标切片,那么则仍然是左闭右开的
2.如果标签和下标都是自然数,那么单独索引([])时数字当做标签处理,而切片索引时([:])则当做下标处理(也可以把单独索引当做是花式索引的特例)
3.由于Series对象和DataFrame对象底层都是ndarray对象,因此,用df[]索引时,切片索引返回的仍然是原对象的视图,而花式索引和布尔索引返回的是新对象(注意:若启用了Copy-on-Write机制,对切片结果的修改将不再影响原对象)
4.在索引和下标同为数字时,花式索引会将括号中的数字优先当做是索引而非下标(这点与切片索引正好相反)
5.除非迫不得已,否则不要将索引和列设置成数字,以免引起误会
2.DataFrame对象
以上,关于df[]切片方式在Series中需要注意的地方基本上已经讲完,上述五点结论在DataFrame的df[]索引方式中也是成立的。但是,对于DataFrame对象,df[]切片方式又会有几个不同之处需要注意
# 3x4 frame filled row by row with 0..11; rows are labelled a-c and the
# columns m, n, p, l.
row_labels = ["a", "b", "c"]
column_labels = ["m", "n", "p", "l"]
cells = np.arange(12).reshape(3, 4)
df = pd.DataFrame(data=cells, index=row_labels, columns=column_labels)
print(df)
m n p l
a 0 1 2 3
b 4 5 6 7
c 8 9 10 11
首先,需要注意的是,df[]方式下的索引在中括号中只能接收一个索引表达式(不像numpy中数组有几维就可以有几个索引表达式)
# A single column label selects that column and returns it as a Series.
df1 = df["p"]
print(df1)
a 2
b 6
c 10
Name: p, dtype: int32
# df[] takes a single indexing expression. A row slice plus a column label
# forms a tuple key, and in the recorded run this call fails with
# "TypeError: unhashable type: 'slice'".
df2 = df["a":"b","m"]
print(df2)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-9-2dc8044c3f5f> in <module>()
----> 1 df2 = df["a":"b","m"]
2 print(df2)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2683 return self._getitem_multilevel(key)
2684 else:
-> 2685 return self._getitem_column(key)
2686
2687 def _getitem_column(self, key):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
2690 # get column
2691 if self.columns.is_unique:
-> 2692 return self._get_item_cache(key)
2693
2694 # duplicate columns & possible reduce dimensionality
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
2482 """Return the cached item, item represents a label indexer."""
2483 cache = self._item_cache
-> 2484 res = cache.get(item)
2485 if res is None:
2486 values = self._data.get(item)
TypeError: unhashable type: 'slice'
其次,当这个索引表达式为单个的标签时,则必须是列标签,也就是此时默认对列进行索引
# A single label inside df[] is looked up among the column labels.
df3 = df["n"]
print(df3)
a 1
b 5
c 9
Name: n, dtype: int32
# "a" is a row label, not a column label, so this lookup raises KeyError: 'a'.
df4 = df["a"]
print(df4)
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3062 try:
-> 3063 return self._engine.get_loc(key)
3064 except KeyError:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'a'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-11-90c519cf05b1> in <module>()
----> 1 df4 = df["a"]
2 print(df4)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2683 return self._getitem_multilevel(key)
2684 else:
-> 2685 return self._getitem_column(key)
2686
2687 def _getitem_column(self, key):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
2690 # get column
2691 if self.columns.is_unique:
-> 2692 return self._get_item_cache(key)
2693
2694 # duplicate columns & possible reduce dimensionality
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
2484 res = cache.get(item)
2485 if res is None:
-> 2486 values = self._data.get(item)
2487 res = self._box_item_values(item, values)
2488 cache[item] = res
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)
4113
4114 if not isna(item):
-> 4115 loc = self.items.get_loc(item)
4116 else:
4117 indexer = np.arange(len(self.items))[isna(self.items)]
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3063 return self._engine.get_loc(key)
3064 except KeyError:
-> 3065 return self._engine.get_loc(self._maybe_cast_indexer(key))
3066
3067 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'a'
当这个索引表达式是用切片形式时,那么这时必须是“对行”切片。如果对列切片,则得到的是个空DF对象。
# A slice inside df[] selects rows; with labels, both endpoints are included.
df5 = df["a":"c"]
print(df5)
m n p l
a 0 1 2 3
b 4 5 6 7
c 8 9 10 11
# The slice is applied to the row index, so slicing with column labels matches
# no rows and yields an empty DataFrame.
df6 =df["m":"p"]
print(df6)
Empty DataFrame
Columns: [m, n, p, l]
Index: []
以上就是DF中df[]索引方式下需要注意的三点,但是由于pandas对象中索引既可以用标签也可以用下标来表示,因此还有必要说说当索引表达式用下标来表达时的情况
# A single integer is treated as a column label; df has no column labelled 2,
# so this raises KeyError: 2.
df7 = df[2]
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3062 try:
-> 3063 return self._engine.get_loc(key)
3064 except KeyError:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 2
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-17-3ad20433a432> in <module>()
----> 1 df7 = df[2]
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2683 return self._getitem_multilevel(key)
2684 else:
-> 2685 return self._getitem_column(key)
2686
2687 def _getitem_column(self, key):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
2690 # get column
2691 if self.columns.is_unique:
-> 2692 return self._get_item_cache(key)
2693
2694 # duplicate columns & possible reduce dimensionality
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
2484 res = cache.get(item)
2485 if res is None:
-> 2486 values = self._data.get(item)
2487 res = self._box_item_values(item, values)
2488 cache[item] = res
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)
4113
4114 if not isna(item):
-> 4115 loc = self.items.get_loc(item)
4116 else:
4117 indexer = np.arange(len(self.items))[isna(self.items)]
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3063 return self._engine.get_loc(key)
3064 except KeyError:
-> 3065 return self._engine.get_loc(self._maybe_cast_indexer(key))
3066
3067 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 2
由于单独索引是将索引表达式优先当做标签使用(见上文),又因为在DF对象中单独索引默认为对列索引(见上文),因此,这时解释器就会去找例子中的df对象有没有一个列标签为2,结果没有,就报错了
# An integer slice inside df[] selects rows by position: positions 0, 1 and 2.
df8 = df[0:3]
print(df8)
m n p l
a 0 1 2 3
b 4 5 6 7
c 8 9 10 11
当切片索引时,解释器会优先把索引表达式中的数字当成下标使用(见上文),而在DF对象中进行切片索引是默认对行进行切片的(见上文),因此,解释器就会去寻找df对象中下标为0、1、2的行(即第1、2、3行),并且将其“索引”出来。