import numpy as np
import pandas as pd
df = pd.read_csv(r"C:\Users\zhoukaiwei\Desktop\joyful-pandas\data\learn_pandas.csv",
usecols = ['School', 'Grade', 'Name', 'Gender',
'Weight', 'Transfer'])
df['Name'].head()
0 Gaopeng Yang
1 Changqiang You
2 Mei Sun
3 Xiaojuan Sun
4 Gaojuan You
Name: Name, dtype: object
df[['Gender','Name']].head()#获得多个列
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
Gender |
Name |
0 |
Female |
Gaopeng Yang |
1 |
Male |
Changqiang You |
2 |
Male |
Mei Sun |
3 |
Female |
Xiaojuan Sun |
4 |
Male |
Gaojuan You |
df.Name.head()#获得单列,且列名中不包空和上面df['Name']是一样的
0 Gaopeng Yang
1 Changqiang You
2 Mei Sun
3 Xiaojuan Sun
4 Gaojuan You
Name: Name, dtype: object
A = pd.Series([1,2,3,4,5,6],index=['a','b','a','a','a','c'])
A['a']
#如果取出多个索引的对应元素,则可以使用 [items的列表] :
A[['c','b']]
A = pd.Series([1,2,3,4,5,6],index=['a','b','a','a','a','c'])
A['c': 'b': -1]#获得两个索引之间的元素
c 6
a 5
a 4
a 3
b 2
dtype: int64
前面讲到了对 DataFrame 的列进行选取,下面要讨论其行的选取。对于表而言,有两种索引器,一种是基于 元素 的 loc 索引器,另一种是基于 位置 的
iloc 索引器。loc 索引器的一般形式是 loc[*, ] ,其中第一个 * 代表行的选择,第二个 * 代表列的选择,如果省略第二个位置写作 loc[] ,这个 *
是指行的筛选。其中, * 的位置一共有五类合法对象,分别是:单个元素、元素列表、元素切片、布尔列表以及函数,下面将依次说明。
为了演示相应操作,先利用 set_index 方法把 Name 列设为索引,关于该函数的其他用法将在多级索引一章介绍。
import numpy as np
import pandas as pd
df = pd.read_csv(r"C:\Users\zhoukaiwei\Desktop\joyful-pandas\data\learn_pandas.csv",
usecols = ['School', 'Grade', 'Name', 'Gender',
'Weight', 'Transfer'])
df_A =df.set_index('Name')
df_A.head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
School |
Grade |
Gender |
Weight |
Transfer |
Name |
|
|
|
|
|
Gaopeng Yang |
Shanghai Jiao Tong University |
Freshman |
Female |
46.0 |
N |
Changqiang You |
Peking University |
Freshman |
Male |
70.0 |
N |
Mei Sun |
Shanghai Jiao Tong University |
Senior |
Male |
89.0 |
N |
Xiaojuan Sun |
Fudan University |
Sophomore |
Female |
41.0 |
N |
Gaojuan You |
Fudan University |
Sophomore |
Male |
74.0 |
N |
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
School |
Grade |
Gender |
Weight |
Transfer |
Name |
|
|
|
|
|
Qiang Sun |
Tsinghua University |
Junior |
Female |
53.0 |
N |
Qiang Sun |
Tsinghua University |
Sophomore |
Female |
40.0 |
N |
Qiang Sun |
Shanghai Jiao Tong University |
Junior |
Female |
NaN |
N |
School Shanghai Jiao Tong University
Grade Junior
Gender Female
Weight 53
Transfer N
Name: Quan Zhao, dtype: object
df_A.loc['Qiang Sun','School']
Name
Qiang Sun Tsinghua University
Qiang Sun Tsinghua University
Qiang Sun Shanghai Jiao Tong University
Name: School, dtype: object
#取出列表中所有元素值对应的行或列:
df_A.loc[['Qiang Sun','Quan Zhao'],['School','Gender']]
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
School |
Gender |
Name |
|
|
Qiang Sun |
Tsinghua University |
Female |
Qiang Sun |
Tsinghua University |
Female |
Qiang Sun |
Shanghai Jiao Tong University |
Female |
Quan Zhao |
Shanghai Jiao Tong University |
Female |
df_A.loc['Gaojuan You':'Gaoqiang Qian','School':'Gender']#使用切片
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
School |
Grade |
Gender |
Name |
|
|
|
Gaojuan You |
Fudan University |
Sophomore |
Male |
Xiaoli Qian |
Tsinghua University |
Freshman |
Female |
Qiang Chu |
Shanghai Jiao Tong University |
Freshman |
Female |
Gaoqiang Qian |
Tsinghua University |
Junior |
Female |
在实际的数据处理中,根据条件来筛选行是极其常见的,此处传入 loc 的布尔列表与 DataFrame
长度相同,且列表为 True 的位置所对应的行会被选中, False 则会被剔除。
df_A.loc[df_A.Weight>70].head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
School |
Grade |
Gender |
Weight |
Transfer |
Name |
|
|
|
|
|
Mei Sun |
Shanghai Jiao Tong University |
Senior |
Male |
89.0 |
N |
Gaojuan You |
Fudan University |
Sophomore |
Male |
74.0 |
N |
Xiaopeng Zhou |
Shanghai Jiao Tong University |
Freshman |
Male |
74.0 |
N |
Xiaofeng Sun |
Tsinghua University |
Senior |
Male |
71.0 |
N |
Qiang Zheng |
Shanghai Jiao Tong University |
Senior |
Male |
87.0 |
N |
df_A.loc[df_A.Grade.isin(['Freshman', 'Senior'])].head()#获得大一和大四的学生
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
School |
Grade |
Gender |
Weight |
Transfer |
Name |
|
|
|
|
|
Gaopeng Yang |
Shanghai Jiao Tong University |
Freshman |
Female |
46.0 |
N |
Changqiang You |
Peking University |
Freshman |
Male |
70.0 |
N |
Mei Sun |
Shanghai Jiao Tong University |
Senior |
Male |
89.0 |
N |
Xiaoli Qian |
Tsinghua University |
Freshman |
Female |
51.0 |
N |
Qiang Chu |
Shanghai Jiao Tong University |
Freshman |
Female |
52.0 |
N |
def condition(x):
condition_1_1 = x.School == 'Fudan University'
condition_1_2 = x.Grade == 'Senior'
condition_1_3 = x.Weight > 70
condition_1 = condition_1_1 & condition_1_2 & condition_1_3
condition_2_1 = x.School == 'Peking University'
condition_2_2 = x.Grade == 'Senior'
condition_2_3 = x.Weight > 80
condition_2 = condition_2_1 & (~condition_2_2) & condition_2_3
result = condition_1 | condition_2
return result
df_A.loc[condition]
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
School |
Grade |
Gender |
Weight |
Transfer |
Name |
|
|
|
|
|
Qiang Han |
Peking University |
Freshman |
Male |
87.0 |
N |
Chengpeng Zhou |
Fudan University |
Senior |
Male |
81.0 |
N |
Changpeng Zhao |
Peking University |
Freshman |
Male |
83.0 |
N |
Chengpeng Qian |
Fudan University |
Senior |
Male |
73.0 |
Y |
#由于函数无法返回如 start: end: step 的切片形式,故返回切片时要用 slice 对象进行包装:
df_A.loc[lambda x: slice('Gaojuan You','Gaoqiang Qian')]
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
School |
Grade |
Gender |
Weight |
Transfer |
Name |
|
|
|
|
|
Gaojuan You |
Fudan University |
Sophomore |
Male |
74.0 |
N |
Xiaoli Qian |
Tsinghua University |
Freshman |
Female |
51.0 |
N |
Qiang Chu |
Shanghai Jiao Tong University |
Freshman |
Female |
52.0 |
N |
Gaoqiang Qian |
Tsinghua University |
Junior |
Female |
50.0 |
N |
iloc 的使用与 loc 完全类似,只不过是针对位置进行筛选,在相应的 * 位置处一共也有五类
合法对象,分别是:整数、整数列表、整数切片、布尔列表以及函数,函数的返回值必须是前面
的四类合法对象中的一个,其输入同样也为 DataFrame 本身。
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
School |
Grade |
Name |
|
|
Gaopeng Yang |
Shanghai Jiao Tong University |
Freshman |
Changqiang You |
Peking University |
Freshman |
df_A.iloc[1: 4, 2:4] # 切片不包含结束端点
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
Gender |
Weight |
Name |
|
|
Changqiang You |
Male |
70.0 |
Mei Sun |
Male |
89.0 |
Xiaojuan Sun |
Female |
41.0 |
#在 pandas 中,支持把字符串形式的查询表达式传入 query 方法来查询数据,其表
#达式的执行结果必须返回布尔列表
df.query('((School == "Fudan University")&(Grade == "Senior")&(Weight > 70))')
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
School |
Grade |
Name |
Gender |
Weight |
Transfer |
66 |
Fudan University |
Senior |
Chengpeng Zhou |
Male |
81.0 |
N |
131 |
Fudan University |
Senior |
Chengpeng Qian |
Male |
73.0 |
Y |
df.query('Grade == ["Junior", "Senior"]').head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
School |
Grade |
Name |
Gender |
Weight |
Transfer |
2 |
Shanghai Jiao Tong University |
Senior |
Mei Sun |
Male |
89.0 |
N |
7 |
Tsinghua University |
Junior |
Gaoqiang Qian |
Female |
50.0 |
N |
9 |
Peking University |
Junior |
Juan Xu |
Female |
NaN |
N |
11 |
Tsinghua University |
Junior |
Xiaoquan Lv |
Female |
43.0 |
N |
12 |
Shanghai Jiao Tong University |
Senior |
Peng You |
Female |
48.0 |
NaN |
low, high =70, 80
df.query('Weight.between(@low, @high)').head()
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-73-7b9719bb38f0> in <module>
1 low, high =70, 80
----> 2 df.query('Weight.between(@low, @high)').head()
D:\Anaconda3\lib\site-packages\pandas\core\frame.py in query(self, expr, inplace, **kwargs)
3343 kwargs["level"] = kwargs.pop("level", 0) + 1
3344 kwargs["target"] = None
-> 3345 res = self.eval(expr, **kwargs)
3346
3347 try:
D:\Anaconda3\lib\site-packages\pandas\core\frame.py in eval(self, expr, inplace, **kwargs)
3473 kwargs["resolvers"] = kwargs.get("resolvers", ()) + tuple(resolvers)
3474
-> 3475 return _eval(expr, inplace=inplace, **kwargs)
3476
3477 def select_dtypes(self, include=None, exclude=None) -> "DataFrame":
D:\Anaconda3\lib\site-packages\pandas\core\computation\eval.py in eval(expr, parser, engine, truediv, local_dict, global_dict, resolvers, level, target, inplace)
344 eng = _engines[engine]
345 eng_inst = eng(parsed_expr)
--> 346 ret = eng_inst.evaluate()
347
348 if parsed_expr.assigner is None:
D:\Anaconda3\lib\site-packages\pandas\core\computation\engines.py in evaluate(self)
71
72 # make sure no names in resolvers and locals/globals clash
---> 73 res = self._evaluate()
74 return reconstruct_object(
75 self.result_type, res, self.aligned_axes, self.expr.terms.return_type
D:\Anaconda3\lib\site-packages\pandas\core\computation\engines.py in _evaluate(self)
111 env = self.expr.env
112 scope = env.full_scope
--> 113 _check_ne_builtin_clash(self.expr)
114 return ne.evaluate(s, local_dict=scope)
115
D:\Anaconda3\lib\site-packages\pandas\core\computation\engines.py in _check_ne_builtin_clash(expr)
27 Terms can contain
28 """
---> 29 names = expr.names
30 overlap = names & _ne_builtins
31
D:\Anaconda3\lib\site-packages\pandas\core\computation\expr.py in names(self)
812 """
813 if is_term(self.terms):
--> 814 return frozenset([self.terms.name])
815 return frozenset(term.name for term in com.flatten(self.terms))
816
D:\Anaconda3\lib\site-packages\pandas\core\generic.py in __hash__(self)
1667 def __hash__(self):
1668 raise TypeError(
-> 1669 f"{repr(type(self).__name__)} objects are mutable, "
1670 f"thus they cannot be hashed"
1671 )
TypeError: 'Series' objects are mutable, thus they cannot be hashed
df_A = pd.DataFrame({'id': list('abcde'),
'value': [10, 20, 30, 20, 20]})
df_A
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
id |
value |
0 |
a |
10 |
1 |
b |
20 |
2 |
c |
30 |
3 |
d |
20 |
4 |
e |
20 |
df_A.sample(3,replace = True,weights = df_A.value)
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
id |
value |
3 |
d |
20 |
2 |
c |
30 |
4 |
e |
20 |
import numpy as np
import pandas as pd
np.random.seed(0)
multi_index = pd.MultiIndex.from_product([list('ABCD'),
df.Gender.unique()], names=('School', 'Gender'))#重新建表
multi_column = pd.MultiIndex.from_product([['Height', 'Weight'],
df.Grade.unique()], names=('Indicator', 'Grade'))
df_multi = pd.DataFrame(np.c_[(np.random.randn(8,4)*5 + 163).tolist(),
(np.random.randn(8,4)*5 + 65).tolist()],
index = multi_index,
columns = multi_column).round(1)
df_multi
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead tr th {
text-align: left;
}
.dataframe thead tr:last-of-type th {
text-align: right;
}
</style>
|
Indicator |
Height |
Weight |
|
Grade |
Freshman |
Senior |
Sophomore |
Junior |
Freshman |
Senior |
Sophomore |
Junior |
School |
Gender |
|
|
|
|
|
|
|
|
A |
Female |
171.8 |
165.0 |
167.9 |
174.2 |
60.6 |
55.1 |
63.3 |
65.8 |
Male |
172.3 |
158.1 |
167.8 |
162.2 |
71.2 |
71.0 |
63.1 |
63.5 |
B |
Female |
162.5 |
165.1 |
163.7 |
170.3 |
59.8 |
57.9 |
56.5 |
74.8 |
Male |
166.8 |
163.6 |
165.2 |
164.7 |
62.5 |
62.8 |
58.7 |
68.9 |
C |
Female |
170.5 |
162.0 |
164.6 |
158.7 |
56.9 |
63.9 |
60.5 |
66.9 |
Male |
150.2 |
166.3 |
167.3 |
159.3 |
62.4 |
59.1 |
64.9 |
67.1 |
D |
Female |
174.3 |
155.7 |
163.2 |
162.1 |
65.3 |
66.5 |
61.8 |
63.2 |
Male |
170.7 |
170.3 |
163.8 |
164.9 |
61.6 |
63.2 |
60.9 |
56.4 |
FrozenList(['School', 'Gender'])
FrozenList(['Indicator', 'Grade'])
df_multi.index.get_level_values(0)#获得某一层的索引
Index(['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'], dtype='object', name='School')
熟悉了结构后,现在回到原表,将学校和年级设为索引,此时的行为多级索引,列为单级索引,
由于默认状态的列索引不含名字,因此对应于刚刚图中 Indicator 和 Grade 的索引名位置是空缺的。
df_multi = df.set_index(['School','Grade'])
df_multi.head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
|
Name |
Gender |
Weight |
Transfer |
School |
Grade |
|
|
|
|
Shanghai Jiao Tong University |
Freshman |
Gaopeng Yang |
Female |
46.0 |
N |
Peking University |
Freshman |
Changqiang You |
Male |
70.0 |
N |
Shanghai Jiao Tong University |
Senior |
Mei Sun |
Male |
89.0 |
N |
Fudan University |
Sophomore |
Xiaojuan Sun |
Female |
41.0 |
N |
Sophomore |
Gaojuan You |
Male |
74.0 |
N |
由于多级索引中的单个元素以元组为单位,因此之前在第一节介绍的 loc 和 iloc 方法完全可以照搬
,只需把标量的位置替换成对应的元组,不过在索引前最好对 MultiIndex 进行排序以避免性能警告:
df_multi = df_multi.sort_index()
df_multi.loc[('Fudan University','Junior')].head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
|
Name |
Gender |
Weight |
Transfer |
School |
Grade |
|
|
|
|
Fudan University |
Junior |
Yanli You |
Female |
48.0 |
N |
Junior |
Chunqiang Chu |
Male |
72.0 |
N |
Junior |
Changfeng Lv |
Male |
76.0 |
N |
Junior |
Yanjuan Lv |
Female |
49.0 |
NaN |
Junior |
Gaoqiang Zhou |
Female |
43.0 |
N |
df_multi.loc[[('Fudan University', 'Senior'),
('Shanghai Jiao Tong University', 'Freshman')]].head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
|
Name |
Gender |
Weight |
Transfer |
School |
Grade |
|
|
|
|
Fudan University |
Senior |
Chengpeng Zheng |
Female |
38.0 |
N |
Senior |
Feng Zhou |
Female |
47.0 |
N |
Senior |
Gaomei Lv |
Female |
34.0 |
N |
Senior |
Chunli Lv |
Female |
56.0 |
N |
Senior |
Chengpeng Zhou |
Male |
81.0 |
N |
前面介绍的方法,即使在索引不重复的时候,也只能对元组整体进行切片,而不能对每层进行切片,
也不允许将切片和布尔列表混合使用,引入 IndexSlice 对象就能解决这个问题。 Slice 对象一共
有两种形式,第一种为 loc[idx[,]] 型,第二种为 loc[idx[,],idx[,]] 型,
下面将进行介绍。为了方便演示,下面构造一个 索引不重复的 DataFrame :
np.random.seed(0)
L1,L2 = ['A','B','C'],['a','b','c']
mul_index1 = pd.MultiIndex.from_product([L1,L2],names=('Upper', 'Lower'))
L3,L4 = ['D','E','F'],['d','e','f']
mul_index2 = pd.MultiIndex.from_product([L3,L4],names=('Big', 'Small'))
df_ex = pd.DataFrame(np.random.randint(-9,10,(9,9)),index=mul_index1,columns=mul_index2)
df_ex
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead tr th {
text-align: left;
}
.dataframe thead tr:last-of-type th {
text-align: right;
}
</style>
|
Big |
D |
E |
F |
|
Small |
d |
e |
f |
d |
e |
f |
d |
e |
f |
Upper |
Lower |
|
|
|
|
|
|
|
|
|
A |
a |
3 |
6 |
-9 |
-6 |
-6 |
-2 |
0 |
9 |
-5 |
b |
-3 |
3 |
-8 |
-3 |
-2 |
5 |
8 |
-4 |
4 |
c |
-1 |
0 |
7 |
-4 |
6 |
6 |
-9 |
9 |
-6 |
B |
a |
8 |
5 |
-2 |
-9 |
-8 |
0 |
-9 |
1 |
-6 |
b |
2 |
9 |
-7 |
-9 |
-9 |
-5 |
-4 |
-3 |
-1 |
c |
8 |
6 |
-5 |
0 |
1 |
-8 |
-8 |
-2 |
0 |
C |
a |
-6 |
-3 |
2 |
5 |
9 |
-9 |
5 |
-6 |
3 |
b |
1 |
2 |
-5 |
-3 |
-5 |
6 |
-6 |
3 |
-5 |
c |
-1 |
5 |
6 |
-6 |
6 |
4 |
7 |
8 |
-4 |
index = pd.IndexSlice#定义slice对象
df_ex.loc[index['C':,('D','e'):]]#loc[idx[*,*]] 型
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead tr th {
text-align: left;
}
.dataframe thead tr:last-of-type th {
text-align: right;
}
</style>
|
Big |
D |
E |
F |
|
Small |
e |
f |
d |
e |
f |
d |
e |
f |
Upper |
Lower |
|
|
|
|
|
|
|
|
C |
a |
-3 |
2 |
5 |
9 |
-9 |
5 |
-6 |
3 |
b |
2 |
-5 |
-3 |
-5 |
6 |
-6 |
3 |
-5 |
c |
5 |
6 |
-6 |
6 |
4 |
7 |
8 |
-4 |
df_ex.loc[index[:'A', lambda x:x.sum()>0]] # 列和大于0
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead tr th {
text-align: left;
}
.dataframe thead tr:last-of-type th {
text-align: right;
}
</style>
|
Big |
D |
F |
|
Small |
d |
e |
e |
Upper |
Lower |
|
|
|
A |
a |
3 |
6 |
9 |
b |
-3 |
3 |
-4 |
c |
-1 |
0 |
9 |
df_ex.loc[index[:'A', 'b':], index['E':, 'e':]]#loc[idx[*,*],idx[*,*]] 型前一个 idx 指代的是行索引,后一个是列索引。
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead tr th {
text-align: left;
}
.dataframe thead tr:last-of-type th {
text-align: right;
}
</style>
|
Big |
E |
F |
|
Small |
e |
f |
e |
f |
Upper |
Lower |
|
|
|
|
A |
b |
-2 |
5 |
-4 |
4 |
c |
6 |
6 |
9 |
-6 |
前面提到了多级索引表的结构和切片,那么除了使用 set_index 之外,
如何自己构造多级索引呢?常用的有 from_tuples, from_arrays, from_product 三种方法,
它们都是 pd.MultiIndex 对象下的函数。
#from_tuples 指根据传入由元组组成的列表进行构造:
my_A = [('a','cat'),('a','dog'),('b','cat'),('b','dog')]
pd.MultiIndex.from_tuples(my_A,names = ['First','Second'])
MultiIndex([('a', 'cat'),
('a', 'dog'),
('b', 'cat'),
('b', 'dog')],
names=['First', 'Second'])
#from_arrays 根据传入列表中,对应层的列表进行构造:
my_array = [list('aabb'), ['cat', 'dog']*2]
pd.MultiIndex.from_arrays(my_array, names=['First','Second'])
MultiIndex([('a', 'cat'),
('a', 'dog'),
('b', 'cat'),
('b', 'dog')],
names=['First', 'Second'])
#from_product 指根据给定多个列表的笛卡尔积进行构造:
list1 = ['a','b']
list2 = ['cat','dog']
pd.MultiIndex.from_product([list1,list2],names = ['First','Second'])
MultiIndex([('a', 'cat'),
('a', 'dog'),
('b', 'cat'),
('b', 'dog')],
names=['First', 'Second'])
np.random.seed(0)
L1,L2,L3 = ['A','B'],['a','b'],['alpha','beta']
m_index1 = pd.MultiIndex.from_product([L1,L2,L3],names = ('Upper','Lower','Extra'))
L4,L5,L6 = ['C','D'],['c','d'],['cat','dog']
m_index2 = pd.MultiIndex.from_product([L4,L5,L6],names=('Big', 'Small', 'Other'))
df_ex = pd.DataFrame(np.random.randint(-9,10,(8,8)),index=m_index1,columns=m_index2)
df_ex
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead tr th {
text-align: left;
}
.dataframe thead tr:last-of-type th {
text-align: right;
}
</style>
|
|
Big |
C |
D |
|
|
Small |
c |
d |
c |
d |
|
|
Other |
cat |
dog |
cat |
dog |
cat |
dog |
cat |
dog |
Upper |
Lower |
Extra |
|
|
|
|
|
|
|
|
A |
a |
alpha |
3 |
6 |
-9 |
-6 |
-6 |
-2 |
0 |
9 |
beta |
-5 |
-3 |
3 |
-8 |
-3 |
-2 |
5 |
8 |
b |
alpha |
-4 |
4 |
-1 |
0 |
7 |
-4 |
6 |
6 |
beta |
-9 |
9 |
-6 |
8 |
5 |
-2 |
-9 |
-8 |
B |
a |
alpha |
0 |
-9 |
1 |
-6 |
2 |
9 |
-7 |
-9 |
beta |
-9 |
-5 |
-4 |
-3 |
-1 |
8 |
6 |
-5 |
b |
alpha |
0 |
1 |
-8 |
-8 |
-2 |
0 |
-6 |
-3 |
beta |
2 |
5 |
9 |
-9 |
5 |
-6 |
3 |
1 |
df_ex.swaplevel(0,2,axis=1).head() # 列索引的第一层和第三层交换
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead tr th {
text-align: left;
}
.dataframe thead tr:last-of-type th {
text-align: right;
}
</style>
|
|
Other |
cat |
dog |
cat |
dog |
cat |
dog |
cat |
dog |
|
|
Small |
c |
c |
d |
d |
c |
c |
d |
d |
|
|
Big |
C |
C |
C |
C |
D |
D |
D |
D |
Upper |
Lower |
Extra |
|
|
|
|
|
|
|
|
A |
a |
alpha |
3 |
6 |
-9 |
-6 |
-6 |
-2 |
0 |
9 |
beta |
-5 |
-3 |
3 |
-8 |
-3 |
-2 |
5 |
8 |
b |
alpha |
-4 |
4 |
-1 |
0 |
7 |
-4 |
6 |
6 |
beta |
-9 |
9 |
-6 |
8 |
5 |
-2 |
-9 |
-8 |
B |
a |
alpha |
0 |
-9 |
1 |
-6 |
2 |
9 |
-7 |
-9 |
df_ex.reorder_levels([2,0,1],axis=0).head() # 列表数字指代原来索引中的层
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead tr th {
text-align: left;
}
.dataframe thead tr:last-of-type th {
text-align: right;
}
</style>
|
|
Big |
C |
D |
|
|
Small |
c |
d |
c |
d |
|
|
Other |
cat |
dog |
cat |
dog |
cat |
dog |
cat |
dog |
Extra |
Upper |
Lower |
|
|
|
|
|
|
|
|
alpha |
A |
a |
3 |
6 |
-9 |
-6 |
-6 |
-2 |
0 |
9 |
beta |
A |
a |
-5 |
-3 |
3 |
-8 |
-3 |
-2 |
5 |
8 |
alpha |
A |
b |
-4 |
4 |
-1 |
0 |
7 |
-4 |
6 |
6 |
beta |
A |
b |
-9 |
9 |
-6 |
8 |
5 |
-2 |
-9 |
-8 |
alpha |
B |
a |
0 |
-9 |
1 |
-6 |
2 |
9 |
-7 |
-9 |
df_ex.droplevel(1,axis=1)#删除某一层的索引
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead tr th {
text-align: left;
}
.dataframe thead tr:last-of-type th {
text-align: right;
}
</style>
|
|
Big |
C |
D |
|
|
Other |
cat |
dog |
cat |
dog |
cat |
dog |
cat |
dog |
Upper |
Lower |
Extra |
|
|
|
|
|
|
|
|
A |
a |
alpha |
3 |
6 |
-9 |
-6 |
-6 |
-2 |
0 |
9 |
beta |
-5 |
-3 |
3 |
-8 |
-3 |
-2 |
5 |
8 |
b |
alpha |
-4 |
4 |
-1 |
0 |
7 |
-4 |
6 |
6 |
beta |
-9 |
9 |
-6 |
8 |
5 |
-2 |
-9 |
-8 |
B |
a |
alpha |
0 |
-9 |
1 |
-6 |
2 |
9 |
-7 |
-9 |
beta |
-9 |
-5 |
-4 |
-3 |
-1 |
8 |
6 |
-5 |
b |
alpha |
0 |
1 |
-8 |
-8 |
-2 |
0 |
-6 |
-3 |
beta |
2 |
5 |
9 |
-9 |
5 |
-6 |
3 |
1 |
df_ex.droplevel([0,1],axis=0)#删除列索引
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead tr th {
text-align: left;
}
.dataframe thead tr:last-of-type th {
text-align: right;
}
</style>
Big |
C |
D |
Small |
c |
d |
c |
d |
Other |
cat |
dog |
cat |
dog |
cat |
dog |
cat |
dog |
Extra |
|
|
|
|
|
|
|
|
alpha |
3 |
6 |
-9 |
-6 |
-6 |
-2 |
0 |
9 |
beta |
-5 |
-3 |
3 |
-8 |
-3 |
-2 |
5 |
8 |
alpha |
-4 |
4 |
-1 |
0 |
7 |
-4 |
6 |
6 |
beta |
-9 |
9 |
-6 |
8 |
5 |
-2 |
-9 |
-8 |
alpha |
0 |
-9 |
1 |
-6 |
2 |
9 |
-7 |
-9 |
beta |
-9 |
-5 |
-4 |
-3 |
-1 |
8 |
6 |
-5 |
alpha |
0 |
1 |
-8 |
-8 |
-2 |
0 |
-6 |
-3 |
beta |
2 |
5 |
9 |
-9 |
5 |
-6 |
3 |
1 |
df_ex.rename_axis(index = {'Uper':'Changed_row'},columns = {'Other':'Changed_Col'})
.head()#修改索引层名字rename_axis
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead tr th {
text-align: left;
}
.dataframe thead tr:last-of-type th {
text-align: right;
}
</style>
|
|
Big |
C |
D |
|
|
Small |
c |
d |
c |
d |
|
|
Changed_Col |
cat |
dog |
cat |
dog |
cat |
dog |
cat |
dog |
Upper |
Lower |
Extra |
|
|
|
|
|
|
|
|
A |
a |
alpha |
3 |
6 |
-9 |
-6 |
-6 |
-2 |
0 |
9 |
beta |
-5 |
-3 |
3 |
-8 |
-3 |
-2 |
5 |
8 |
b |
alpha |
-4 |
4 |
-1 |
0 |
7 |
-4 |
6 |
6 |
beta |
-9 |
9 |
-6 |
8 |
5 |
-2 |
-9 |
-8 |
B |
a |
alpha |
0 |
-9 |
1 |
-6 |
2 |
9 |
-7 |
-9 |
df_ex.rename(columns={'cat':'not_cat'},level=2).head()#修改索引值rename
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead tr th {
text-align: left;
}
.dataframe thead tr:last-of-type th {
text-align: right;
}
</style>
|
|
Big |
C |
D |
|
|
Small |
c |
d |
c |
d |
|
|
Other |
not_cat |
dog |
not_cat |
dog |
not_cat |
dog |
not_cat |
dog |
Upper |
Lower |
Extra |
|
|
|
|
|
|
|
|
A |
a |
alpha |
3 |
6 |
-9 |
-6 |
-6 |
-2 |
0 |
9 |
beta |
-5 |
-3 |
3 |
-8 |
-3 |
-2 |
5 |
8 |
b |
alpha |
-4 |
4 |
-1 |
0 |
7 |
-4 |
6 |
6 |
beta |
-9 |
9 |
-6 |
8 |
5 |
-2 |
-9 |
-8 |
B |
a |
alpha |
0 |
-9 |
1 |
-6 |
2 |
9 |
-7 |
-9 |
df_ex.rename(index=lambda x:str.upper(x),level=2).head()#通过函数修改索引值
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead tr th {
text-align: left;
}
.dataframe thead tr:last-of-type th {
text-align: right;
}
</style>
|
|
Big |
C |
D |
|
|
Small |
c |
d |
c |
d |
|
|
Other |
cat |
dog |
cat |
dog |
cat |
dog |
cat |
dog |
Upper |
Lower |
Extra |
|
|
|
|
|
|
|
|
A |
a |
ALPHA |
3 |
6 |
-9 |
-6 |
-6 |
-2 |
0 |
9 |
BETA |
-5 |
-3 |
3 |
-8 |
-3 |
-2 |
5 |
8 |
b |
ALPHA |
-4 |
4 |
-1 |
0 |
7 |
-4 |
6 |
6 |
BETA |
-9 |
9 |
-6 |
8 |
5 |
-2 |
-9 |
-8 |
B |
a |
ALPHA |
0 |
-9 |
1 |
-6 |
2 |
9 |
-7 |
-9 |
A = iter(list('abcdefgh'))
df_ex.rename(index=lambda x:next(A),level = 2)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-80-bc33686b6fdf> in <module>
----> 1 A = iter(list('abcdefgh'))
2 df_ex.rename(index=lambda x:next(A),level = 2)
TypeError: 'list' object is not callable
df_A = pd.DataFrame({"A":list('aacd'),"B":list('PQRT'),"C":[1,2,3,4]})
df_A
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-86-2fdf41795bad> in <module>
----> 1 df_A = pd.DataFrame({"A":list('aacd'),"B":list('PQRT'),"C":[1,2,3,4]})
2 df_A
TypeError: 'list' object is not callable
df_reindex = pd.DataFrame({"Weight":[60,70,80], "Height":[176,180,179]}, index=['1001','1003','1002'])
df_reindex
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
Weight |
Height |
1001 |
60 |
176 |
1003 |
70 |
180 |
1002 |
80 |
179 |
df_reindex.reindex(index=['1001','1002','1003','1004'],columns=['Weight','Gender'])
#增加一行的同时,去掉一列并新增一列
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
Weight |
Gender |
1001 |
60.0 |
NaN |
1002 |
80.0 |
NaN |
1003 |
70.0 |
NaN |
1004 |
NaN |
NaN |
df_existed = pd.DataFrame(index = ['1001','1002','1003','1004'],columns=['Weight','Gender'])
df_reindex.reindex_like(df_existed)
# reindex_like与reindex相似 ,其功能是仿照传入的表的索引来进行被调用表索引的变形。
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
Weight |
Gender |
1001 |
60.0 |
NaN |
1002 |
80.0 |
NaN |
1003 |
70.0 |
NaN |
1004 |
NaN |
NaN |
df_A = pd.DataFrame([[0,1],[1,2],[3,4]],
index = pd.Index(['a','b','a'],name='id1'))
df_B = pd.DataFrame([[4,5],[2,6],[7,1]],index = pd.Index(['b','b','c'],name='id2'))
id1,id2 = df_A.index.unique(),df_B.index.unique()
id1.intersection(id2)
Index(['b'], dtype='object')
Index(['a', 'b', 'c'], dtype='object')
Index(['a'], dtype='object')
id1.symmetric_difference(id2)
Index(['a', 'c'], dtype='object')
#若两张表需要做集合运算的列并没有被设置索引,一种办法是先转成索引,运算后再恢复,
#另一种方法是利用 isin 函数
df_a = df_A.reset_index()
df_b = df_B.reset_index()
df_a
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
id1 |
0 |
1 |
0 |
a |
0 |
1 |
1 |
b |
1 |
2 |
2 |
a |
3 |
4 |
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
id2 |
0 |
1 |
0 |
b |
4 |
5 |
1 |
b |
2 |
6 |
2 |
c |
7 |
1 |
df_a[df_a.id1.isin(df_b.id2)]
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
df = pd.read_csv(r'C:\Users\zhoukaiwei\Desktop\joyful-pandas\data/company.csv')
df.head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
EmployeeID |
birthdate_key |
age |
city_name |
department |
job_title |
gender |
0 |
1318 |
1/3/1954 |
61 |
Vancouver |
Executive |
CEO |
M |
1 |
1319 |
1/3/1957 |
58 |
Vancouver |
Executive |
VP Stores |
F |
2 |
1320 |
1/2/1955 |
60 |
Vancouver |
Executive |
Legal Counsel |
F |
3 |
1321 |
1/2/1959 |
56 |
Vancouver |
Executive |
VP Human Resources |
M |
4 |
1322 |
1/9/1958 |
57 |
Vancouver |
Executive |
VP Finance |
M |
分别只使用 query 和 loc 选出年龄不超过四十岁且工作部门为 Dairy 或 Bakery 的男性。
df.query('(age <= 40)&(department is in ['Diary','Bakery'])&(gender=='M')').head()
File "<ipython-input-108-aa2a95f52718>", line 1
df.query('(age <= 40)&(department is in ['Diary','Bakery'])&(gender=='M')').head()
^
SyntaxError: invalid syntax
选出员工 ID 号 为奇数所在行的第1、第3和倒数第2列。
df.iloc[(df.EmployeeID%2==1).values,[0,2,-2]].head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
EmployeeID |
age |
job_title |
1 |
1319 |
58 |
VP Stores |
3 |
1321 |
56 |
VP Human Resources |
5 |
1323 |
53 |
Exec Assistant, VP Stores |
6 |
1325 |
51 |
Exec Assistant, Legal Counsel |
8 |
1329 |
48 |
Store Manager |
按照以下步骤进行索引操作:
把后三列设为索引后交换内外两层
df.set_index(df.columns[-3:].tolist()).swaplevel(0,2,axis=0).head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
|
|
EmployeeID |
birthdate_key |
age |
city_name |
gender |
job_title |
department |
|
|
|
|
M |
CEO |
Executive |
1318 |
1/3/1954 |
61 |
Vancouver |
F |
VP Stores |
Executive |
1319 |
1/3/1957 |
58 |
Vancouver |
Legal Counsel |
Executive |
1320 |
1/2/1955 |
60 |
Vancouver |
M |
VP Human Resources |
Executive |
1321 |
1/2/1959 |
56 |
Vancouver |
VP Finance |
Executive |
1322 |
1/9/1958 |
57 |
Vancouver |
df.reset_index(level=1)#恢复中间一层
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-116-bd39f5c7cad0> in <module>
----> 1 df.reset_index(level=1)
D:\Anaconda3\lib\site-packages\pandas\core\frame.py in reset_index(self, level, drop, inplace, col_level, col_fill)
4818 if not isinstance(level, (tuple, list)):
4819 level = [level]
-> 4820 level = [self.index._get_level_number(lev) for lev in level]
4821 if len(level) < self.index.nlevels:
4822 new_index = self.index.droplevel(level)
D:\Anaconda3\lib\site-packages\pandas\core\frame.py in <listcomp>(.0)
4818 if not isinstance(level, (tuple, list)):
4819 level = [level]
-> 4820 level = [self.index._get_level_number(lev) for lev in level]
4821 if len(level) < self.index.nlevels:
4822 new_index = self.index.droplevel(level)
D:\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in _get_level_number(self, level)
1413
1414 def _get_level_number(self, level) -> int:
-> 1415 self._validate_index_level(level)
1416 return 0
1417
D:\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in _validate_index_level(self, level)
1405 elif level > 0:
1406 raise IndexError(
-> 1407 f"Too many levels: Index has only 1 level, not {level + 1}"
1408 )
1409 elif level != self.name:
IndexError: Too many levels: Index has only 1 level, not 2
修改外层索引名为 Gender
df.rename_axis(index={'gender':'Gender'}).head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
EmployeeID |
birthdate_key |
age |
city_name |
department |
job_title |
gender |
0 |
1318 |
1/3/1954 |
61 |
Vancouver |
Executive |
CEO |
M |
1 |
1319 |
1/3/1957 |
58 |
Vancouver |
Executive |
VP Stores |
F |
2 |
1320 |
1/2/1955 |
60 |
Vancouver |
Executive |
Legal Counsel |
F |
3 |
1321 |
1/2/1959 |
56 |
Vancouver |
Executive |
VP Human Resources |
M |
4 |
1322 |
1/9/1958 |
57 |
Vancouver |
Executive |
VP Finance |
M |
用下划线合并两层行索引