2018.03.28 python-pandas groupby使用

groupby 分组统计

1.根据某些条件将数据分组

2.对每个组独立应用函数

3.将结果合并到一个数据结构中

Dataframe在行或列上分组,将一个函数应用到各个分组并产生一个新值,然后函数执行结果被合并到最终的结果对象中

#分组
import numpy as np
import pandas as pd
df = pd.DataFrame({'A':['foo','bar','foo','bar','foo','bar','foo','foo'], 'B':['one','one','two','three','two','two','one','three'], 'C':np.random.randn(8), 'D':np.random.randn(8)}) print(df) print('------') print(df.groupby('A'),type(df.groupby('A'))) #直接分组得到的是groupby对象,是一个中间数据,没有进行计算 print(df.groupby('A').sum())#自动过滤字符串列 print(df.groupby('A').mean())#平均值 b = df.groupby(['A','B']).mean() print(b,type(b),'\n',b.columns) c = df.groupby(['A'])['D'].mean()#以A分组,取D列平均值 print(c,type(c),'\n')

结果:

A B C D

0 foo one 0.429615 -0.708782

1 bar one 0.891751 1.140575

2 foo two -0.261858 -0.516835

3 bar three 1.310361 0.269657

4 foo two 1.048076 1.374218

5 bar two -0.410148 1.061132

6 foo one -1.124137 -0.729367

7 foo three 0.289513 0.892714

------

<pandas.core.groupby.DataFrameGroupBy object at 0x000000000FBACA58> <class 'pandas.core.groupby.DataFrameGroupBy'>

C D

A

bar 1.791963 2.471364

foo 0.381208 0.311947

C D

A

bar 0.597321 0.823788

foo 0.076242 0.062389

C D

A B

bar one 0.891751 1.140575

three 1.310361 0.269657

two -0.410148 1.061132

foo one -0.347261 -0.719074

three 0.289513 0.892714

two 0.393109 0.428691 <class 'pandas.core.frame.DataFrame'>

Index(['C', 'D'], dtype='object')

A

bar 0.823788

foo 0.062389

Name: D, dtype: float64 <class 'pandas.core.series.Series'>

#分组 - 可迭代的对象
df = pd.DataFrame({'X':['A','B','A','B'],'Y':[1,3,4,2]})
print(df)
print(df.groupby('X'),type(df.groupby('X')))
print('-------')
print(list(df.groupby('X')),'->可迭代对象,直接生成list\n')
print(list(df.groupby('X'))[0],'->以元组的形式显示')
for n,g in df.groupby('X'):
    print(n)
    print(g)
    print('###')
print('--------')
#n是组名,g是分组后的DataFrame
print(df.groupby(['X']).get_group('A'),'\n')
print(df.groupby(['X']).get_group('B'),'\n')
#.get_group提取分组后的组

grouped = df.groupby(['X'])
print(grouped.groups)
print(grouped.groups['A'])#也可写 df.groupby('X').groups['A']
print('-------')
#.groups:将分组后的groups转化为dict
#可以字典索引方法来查看groups里的元素

sz = grouped.size()
print(sz,type(sz))
#.size() 查看分组后的长度
print('---------')
df = pd.DataFrame({'A':['foo','bar','foo','bar','foo','bar','foo','foo'],
                   'B':['one','one','two','three','two','two','one','three'],
                   'C':np.random.randn(8),
                   'D':np.random.randn(8)})
grouped = df.groupby(['A','B']).groups
print(df)
print(grouped)
print(grouped['foo','three'])

dic=dict({'A':[1,2,3],
       'B':[2,3,4]})
print(dic,type(dic))

结果:

X Y

0 A 1

1 B 3

2 A 4

3 B 2

<pandas.core.groupby.DataFrameGroupBy object at 0x000000000F889F60> <class 'pandas.core.groupby.DataFrameGroupBy'>

-------

[('A', X Y

0 A 1

2 A 4), ('B', X Y

1 B 3

3 B 2)] ->可迭代对象,直接生成list

('A', X Y

0 A 1

2 A 4) ->以元组的形式显示

A

X Y

0 A 1

2 A 4

###

B

X Y

1 B 3

3 B 2

###

--------

X Y

0 A 1

2 A 4

X Y

1 B 3

3 B 2

{'A': Int64Index([0, 2], dtype='int64'), 'B': Int64Index([1, 3], dtype='int64')}

Int64Index([0, 2], dtype='int64')

-------

X

A 2

B 2

dtype: int64 <class 'pandas.core.series.Series'>

---------

A B C D

0 foo one -0.881923 -0.825102

1 bar one -0.626412 -0.618638

2 foo two -1.741248 1.557698

3 bar three 1.076928 1.738265

4 foo two -0.954103 -0.741415

5 bar two 1.224841 -0.479472

6 foo one 0.680046 -0.476137

7 foo three -1.519952 -0.421738

{('bar', 'one'): Int64Index([1], dtype='int64'), ('bar', 'three'): Int64Index([3], dtype='int64'), ('bar', 'two'): Int64Index([5], dtype='int64'), ('foo', 'one'): Int64Index([0, 6], dtype='int64'), ('foo', 'three'): Int64Index([7], dtype='int64'), ('foo', 'two'): Int64Index([2, 4], dtype='int64')}

Int64Index([7], dtype='int64')

{'A': [1, 2, 3], 'B': [2, 3, 4]} <class 'dict'>

#其他轴上分组
df = pd.DataFrame({'data1':np.random.randn(2),
                   'data2':np.random.randn(2),
                   'key1':['a','b'],
                   'key2':['one','two']})
print(df)
print(df.dtypes)
print('--------')
for n,p in df.groupby(df.dtypes,axis=1):
    print(n)
    print(p)
    print('##')
#按照值类型分组,分为2组

结果:

data1 data2 key1 key2

0 0.813374 0.232957 a one

1 -0.213256 1.393156 b two

data1 float64

data2 float64

key1 object

key2 object

dtype: object

--------

float64

data1 data2

0 0.813374 0.232957

1 -0.213256 1.393156

##

object

key1 key2

0 a one

1 b two

##

#通过字典或者Series分组
df = pd.DataFrame(np.arange(16).reshape(4,4),
                 columns = ['a','b','c','d'])
print(df)
print('-------')

mapping = {'a':'one','b':'one','c':'two','d':'two','e':'three'}
print(mapping)
by_column = df.groupby(mapping,axis = 1)
print(by_column.sum())
print('---------')
#mapping中 a,b列对应为one,c,d列对应为two,以字典为分组

s=pd.Series(mapping)
print(s)
print(s.groupby(s).count())
#s中,index = a,b对应的是one;c,d对应的是two,以Series来分组

结果:

a b c d

0 0 1 2 3

1 4 5 6 7

2 8 9 10 11

3 12 13 14 15

-------

{'a': 'one', 'b': 'one', 'c': 'two', 'd': 'two', 'e': 'three'}

one two

0 1 5

1 9 13

2 17 21

3 25 29

---------

a one

b one

c two

d two

e three

dtype: object

one 2

three 1

two 2

dtype: int64