## 8-pandas聚合运算 原

eddy_linux

``````#encoding:utf8

import numpy as np
import pandas as pd

'''

'''

df = pd.DataFrame(
{
'key1':['a','a','b','b','a'],
'key2':['one','two','one','two','one'],
'data1':np.random.randint(1,10,5),
'data2':np.random.randint(1,10,5),
}
)
print(df)
'''
data1  data2 key1 key2
0      1      8    a  one
1      7      5    a  two
2      1      6    b  one
3      9      7    b  two
4      5      4    a  one
'''

#内置聚合函数
#求和
print(df.groupby('key1').sum())
'''
data1  data2
key1
a        21     19
b         6      4
'''
#求平均值
print(df.groupby('key1').mean())
'''
data1  data2
key1
a     3.333333    5.0
b     3.500000    4.0
'''
#求最小值
print(df.groupby('key1').min())
'''
data1  data2 key2
key1
a         1      3  one
b         4      1  one
'''
#求最大值
print(df.groupby('key1').max())
'''
data1  data2 key2
key1
a         9      9  two
b         4      4  two
'''
#展示一些列的聚合运算结果
print(df.groupby('key1').describe())
'''
data1     data2
key1
a    count  3.000000  3.000000
mean   4.666667  5.333333
std    0.577350  2.081666
min    4.000000  3.000000
25%    4.500000  4.500000
50%    5.000000  6.000000
75%    5.000000  6.500000
max    5.000000  7.000000

b    count  2.000000  2.000000
mean   6.500000  8.500000
std    3.535534  0.707107
min    4.000000  8.000000
25%    5.250000  8.250000
50%    6.500000  8.500000
75%    7.750000  8.750000
max    9.000000  9.000000
'''

#自定义聚合函数
grouped = df.groupby('key1')

#聚合结果最大值减去最小值
def peak_range(s):
return s.max() - s.min()
print(df)
print(grouped.agg(peak_range))
'''
data1  data2 key1 key2
0      7      2    a  one
1      2      8    a  two
2      7      8    b  one
3      7      2    b  two
4      1      7    a  one
data1  data2
key1
a         6      6
b         0      6

'''
#还可以传如多个聚合函数进行聚合运算
print(grouped.agg(['std','mean','sum',peak_range]))
'''
data1                         data2
std mean sum peak_range       std      mean sum peak_range
key1
a     1.732051  5.0  15          3  2.516611  6.666667  20          5
b     2.121320  2.5   5          3  2.121320  2.500000   5          3
'''
print(grouped.agg(['std','mean','sum',('range',peak_range)]))
'''
data1                         data2
std      mean sum range       std      mean sum range
key1
a     3.785939  3.666667  11     7  2.081666  7.333333  22     4
b     1.414214  5.000000  10     2  0.707107  1.500000   3     1
'''
#不同的列使用不同的聚合函数
d = {'data1':'mean','data2':'sum'}

print(grouped.agg(d))
'''
data1  data2
key1
a       5.0     19
b       5.5     17
'''
d = {'data1':['mean','sum'],'data2':['sum',('range',peak_range)]}
print(grouped.agg(d))
'''
data2           data1
sum range      mean sum
key1
a       14     7  5.333333  16
b        9     5  5.500000  11
'''
print(grouped.agg(d).reset_index())
'''
key1 data2       data1
sum range  mean sum
0    a    16     7     7  21
1    b    15     3     5  10
'''
#高级应用
df = pd.DataFrame(
{
'key1':['a','a','b','b','a'],
'key2':['one','two','one','two','one'],
'data1':np.random.randint(1,10,5),
'data2':np.random.randint(1,10,5),
}
)
k1_mean = df.groupby('key1').mean()
print(k1_mean)
'''
data1     data2
key1
a       5.0  3.333333
b       4.0  8.500000
'''
#首先修改聚合之后的列显示名称
print(k1_mean)
'''
mean_data1  mean_data2
key1
a       7.666667         6.0
b       8.000000         5.0
'''
#使用合并函数
print(pd.merge(df,k1_mean,left_on='key1',right_index=True))
'''
data1  data2 key1 key2  mean_data1  mean_data2
0      5      7    a  one         4.0    6.666667
1      2      8    a  two         4.0    6.666667
4      5      5    a  one         4.0    6.666667
2      6      8    b  one         7.5    8.000000
3      9      8    b  two         7.5    8.000000
'''
#另外的方式
print(k1_mean)
'''
mean_data1  mean_data2
0    6.666667         7.0
1    6.666667         7.0
2    5.500000         3.5
3    5.500000         3.5
4    6.666667         7.0
'''
#合并
df[k1_mean.columns] = k1_mean
print(df)
'''
data1  data2 key1 key2  mean_data1  mean_data2
0      9      3    a  one           7    1.666667
1      7      1    a  two           7    1.666667
2      7      5    b  one           7    6.500000
3      7      8    b  two           7    6.500000
4      5      1    a  one           7    1.666667
'''

df = pd.DataFrame(
np.random.randint(1,10,(5,5)),
columns=list('abcde'),
index=['Alice','Bob','Candy','Dark','Emily']
)
print(df)
'''
a  b  c  d  e
Alice  8  1  4  8  1
Bob    6  8  2  7  1
Candy  3  2  3  2  6
Dark   8  9  2  9  8
Emily  7  9  6  3  4
'''

def demean(s):
return s - s.mean()
key = ['one','one','two','one','two']
demeaned = df.groupby(key).transform(demean)
print(demeaned)
'''
a    b    c         d    e
Alice  1.0 -1.0 -3.0  0.666667  2.0
Bob    2.0  0.0  0.0  1.666667 -3.0
Candy -2.5  3.0  1.5  0.500000 -0.5
Dark  -3.0  1.0  3.0 -2.333333  1.0
Emily  2.5 -3.0 -1.5 -0.500000  0.5
'''
#误差
print(demeaned.groupby(key).mean())
'''
a             b             c    d    e
one  2.960595e-16  2.960595e-16  2.960595e-16  0.0  0.0
two  0.000000e+00  0.000000e+00  0.000000e+00  0.0  0.0
'''

#输出某一列排序的最大的前两行
df = pd.DataFrame(
{
'key1':['a','a','b','b','a','a','a','b','b','a'],
'key2':['one','two','one','two','one','one','two','one','two','one'],
'data1':np.random.randint(1,10,10),
'data2':np.random.randint(1,10,10),
}
)
print(df)
'''
data1  data2 key1 key2
0      1      2    a  one
1      5      8    a  two
2      7      9    b  one
3      8      3    b  two
4      9      8    a  one
5      7      8    a  one
6      6      7    a  two
7      2      7    b  one
8      7      8    b  two
9      4      4    a  one

'''
def top(g,n=2,column='data1'):
return g.sort_values(by=column,ascending=False)[:n]

print(df)
print(top(df))
print(top(df,n=3))
'''
data1  data2 key1 key2
0      1      4    a  one
1      3      5    a  two
2      1      6    b  one
3      8      7    b  two
4      7      7    a  one
5      6      4    a  one
6      4      1    a  two
7      4      8    b  one
8      4      3    b  two
9      6      4    a  one

data1  data2 key1 key2
3      8      7    b  two
4      7      7    a  one

data1  data2 key1 key2
3      8      7    b  two
4      7      7    a  one
5      6      4    a  one

'''
print(df)
print(df.groupby('key1').apply(top))
'''
data1  data2 key1 key2
0      5      9    a  one
1      5      2    a  two
2      4      9    b  one
3      9      3    b  two
4      3      1    a  one
5      7      5    a  one
6      3      4    a  two
7      4      3    b  one
8      4      6    b  two
9      9      6    a  one
data1  data2 key1 key2
key1
a    9      9      6    a  one
5      7      5    a  one
b    3      9      3    b  two
2      4      9    b  one
'''

#对于Nan值采用分组的平均值进行填充
states = [
'成都','贵州','重庆','西藏',
'广州','杭州','福州','海南'
]
group_key = ['East'] * 4 + ['West'] * 4
data = pd.Series(np.random.randn(8),index=states)
data[['重庆','福州','海南']]=np.NaN
print(data)
'''

'''
print(data.groupby(group_key).mean())
'''
East    1.641739
West   -0.894022
'''
def fill_mean(g):
return g.fillna(g.mean())

print(data.groupby(group_key).apply(fill_mean))
'''

'''

``````

### eddy_linux

（转载）Python数据分析之pandas学习

fjssharpsword
2017/11/28
0
0

pandas提供了一个灵活高效的groupby功能，它使你能以一种自然的方式对数据集进行切片、切块、摘要等操作。 根据一个或多个键（可以是函数、数组或DataFrame列名）拆分pandas对象。 计算分组摘...

Betty__
2016/10/03
92
0
Pandas系列6-DataFrame的分组与聚合

geekpy
07/23
0
0
pandas读取csv文件数据的方法及注意点

pandas是一个高效的数据分析工具。基于其高度抽象的数据结构（点击这里了解DataFrame数据结构），几乎可以对数据进行任何你想要的操作。 由于现实世界中数据源的格式非常多，pandas也支持了不...

schnauzer
07/17
0
0
Pandas初学者代码优化指南

dev_csdn
2017/11/21
0
0

1个开发如何撑起一个过亿用户的小程序

3分钟前
0
0
golang ssh包使用方法介绍

linuxprobe16
8分钟前
0
0
layer

Layui Layer在open弹出层中异步加载数据和form表单radio、checkbox、select不渲染，不可点击的解决办法 layer 实现弹窗提交信息 function confirmUpdateAward(i) { layer.open({ ...

mickelfeng
57分钟前
0
0
Spring boot中如何获取profiles环境

实现ApplicationContextAware @Componentpublic class QiNiuPropertiesConfig implements ApplicationContextAware { /// 获取当前环境public String getActiveProfile() { ret......

3
0

2
0