6-pandas索引
6-pandas索引
eddy_linux 发表于3个月前
6-pandas索引
  • 发表于 3个月前
  • 阅读 8
  • 收藏 1
  • 点赞 0
  • 评论 0

腾讯云 新注册用户 域名抢购1元起>>>   

#encoding:utf8



import numpy as np
import pandas as pd

# Build a five-element Series of random floats labelled 'a' through 'e'.
s = pd.Series(data=np.random.rand(5), index=['a', 'b', 'c', 'd', 'e'])
print(s)
# Sample output (the values are random and differ on every run):
# a    0.203759
# b    0.793161
# c    0.139707
# d    0.094600
# e    0.941411
# dtype: float64

# The row labels are exposed through the .index attribute.
print(s.index)
# Sample output:
# Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

# Give the row index a name; the name is printed above the labels.
s.index.names = ['alpha']
print(s)
# Sample output:
# alpha
# a    0.203759
# b    0.793161
# c    0.139707
# d    0.094600
# e    0.941411

# Build a 4x3 DataFrame of normally distributed random values with named
# columns; the rows receive the default integer index.
df = pd.DataFrame(data=np.random.randn(4, 3), columns=['one', 'two', 'three'])
print(df)
# Sample output (the values are random and differ on every run):
#         one       two     three
# 0  0.733709 -0.399455 -0.873937
# 1 -0.970276  0.814565  0.332324
# 2 -0.097845  0.443321 -1.039903
# 3 -1.440090 -0.165967 -1.305140

# Row index.
print(df.index)
# Sample output:
# RangeIndex(start=0, stop=4, step=1)

# Column index.
print(df.columns)
# Sample output:
# Index(['one', 'two', 'three'], dtype='object')

# Name both the row index and the column index.
df.index.names = ['row']
df.columns.names = ['col']
print(df)
# Sample output:
# col       one       two     three
# row
# 0    1.897495  1.459275  0.146160
# 1    1.166402  0.446709 -0.685580
# 2   -0.114267  1.552098  0.403563
# 3   -0.073791  0.593768 -0.446416

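# pandas index types
# There are many index classes:
#   pd.CategoricalIndex
#   pd.DatetimeIndex
#   pd.Float64Index   (removed in pandas 2.0; use pd.Index with a float dtype)
#   pd.Index
#   pd.Int64Index     (removed in pandas 2.0; use pd.Index with an integer dtype)
#   pd.MultiIndex
#   pd.PeriodIndex
#   pd.TimedeltaIndex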

# Duplicate index labels: 'a' and 'b' each appear twice.
s = pd.Series(data=np.arange(6), index=['a', 'b', 'c', 'b', 'd', 'a'])
print(s)
# Sample output:
# a    0
# b    1
# c    2
# b    3
# d    4
# a    5

# Selecting a duplicated label returns a Series with every matching row.
print(s.loc['a'])
# Sample output:
# a    0
# a    5

# Selecting a label that occurs once returns the scalar value itself.
print(s.loc['c'])
# Sample output:
# 2

# Check whether the index contains duplicate labels.
print(s.index.is_unique)
# Sample output:
# False

# List the distinct labels (each duplicated label appears only once).
print(s.index.unique())
# Sample output:
# ['a' 'b' 'c' 'd']

# Cleaning up duplicate labels depends on the actual requirements; one option
# is to group rows by their label and aggregate each group.

# Sum per label.
print(s.groupby(level=0).sum())
# Sample output:
# a    5
# b    4
# c    2
# d    4

# Mean per label.
print(s.groupby(level=0).mean())
# Sample output:
# a    2.5
# b    2.0
# c    2.0
# d    4.0

# Multi-level (hierarchical) index:
# represent higher-dimensional data in a two-dimensional layout.

# Two parallel lists: first-level labels and second-level labels.
a = [
    ['a', 'a', 'a', 'b', 'b', 'c', 'c'],
    [1, 2, 3, 1, 2, 2, 3],
]

# Pair the labels up position by position into (level1, level2) tuples.
t = list(zip(a[0], a[1]))
print(t)
# Sample output:
# [('a', 1), ('a', 2), ('a', 3), ('b', 1), ('b', 2), ('c', 2), ('c', 3)]

# Create the multi-level index from the tuples and name both levels.
index = pd.MultiIndex.from_tuples(t, names=['level1', 'level2'])
print(index)
# Sample output recorded by the original author (the repr format depends on
# the pandas version):
# MultiIndex(levels=[['a', 'b', 'c'], [1, 2, 3]],
#            labels=[[0, 0, 0, 1, 1, 2, 2], [0, 1, 2, 0, 1, 1, 2]],
#            names=['level1', 'level2'])

# A Series of seven random floats carried by the multi-level index.
s = pd.Series(data=np.random.rand(7), index=index)
print(s)
# Sample output (the values are random and differ on every run):
# a       1         0.721871
#         2         0.681721
#         3         0.400783
# b       1         0.807796
#         2         0.282922
# c       2         0.090593
#         3         0.352263

# Select by a first-level label; the result is indexed by the second level.
print(s.loc['b'])
# Sample output:
# level2
# 1    0.802142
# 2    0.710330
# dtype: float64

# Slice a range of first-level labels (both ends are included).
print(s.loc['b':'c'])
# Sample output:
# level1  level2
# b       1         0.432311
#         2         0.251192
# c       2         0.129710
#         3         0.626829

# Select several first-level labels at once.
print(s.loc[['a', 'c']])
# Sample output:
# level1  level2
# a       1         0.744435
#         2         0.229332
#         3         0.243126
# c       2         0.106514
#         3         0.985381

# Select second-level label 2 across every first-level label.
print(s.loc[:, 2])
# Sample output:
# level1
# a    0.736723
# b    0.650342
# c    0.981221

# Select first-level label 'a' together with second-level label 2.
# Note: an exception is raised if that label combination does not exist.
print(s.loc['a', 2])
# Sample output:
# 0.0155083268965

# Multi-level index on a DataFrame: two levels on the rows and two on the
# columns, filled with random integers in [1, 10).
df = pd.DataFrame(
    np.random.randint(1, 10, (4, 3)),
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    columns=[['one', 'one', 'two'], ['blue', 'red', 'blue']],
)
df.index.names = ['row-1', 'row-2']
df.columns.names = ['col-1', 'col-2']

print(df)
# Sample output (the values are random and differ on every run, so the
# tables recorded below do not match one another):
# col-1        one      two
# col-2       blue red blue
# row-1 row-2
# a     1        2   8    9
#       2        7   3    2
# b     1        4   2    6
#       2        8   4    5

# Select by the first row level.
print(df.loc['a'])
# Sample output:
# col-1  one      two
# col-2 blue red blue
# row-2
# 1        7   5    3
# 2        9   1    8

# Select by both row levels. The explicit ``(row_key, :)`` form makes it
# unambiguous that the tuple addresses a row, not a (row, column) pair.
print(df.loc[('a', 1), :])
# Sample output:
# col-1  col-2
# one    blue     6
#        red      9
# two    blue     6

# The selected row is a Series whose index is the column MultiIndex.
print(df.loc[('a', 1), :].index)
# Sample output recorded by the original author (the repr format depends on
# the pandas version):
# MultiIndex(levels=[['one', 'two'], ['blue', 'red']],
#            labels=[[0, 0, 1], [0, 1, 0]],
#            names=['col-1', 'col-2'])

# Swapping index levels: swaplevel returns a new frame, df is unchanged.
df2 = df.swaplevel('row-1', 'row-2')
print(df)
print(df2)
# Sample output:
# col-1        one      two
# col-2       blue red blue
# row-1 row-2
# a     1        6   7    8
#       2        5   1    6
# b     1        5   2    4
#       2        9   6    8
#
# col-1        one      two
# col-2       blue red blue
# row-2 row-1
# 1     a        6   7    8
# 2     a        5   1    6
# 1     b        5   2    4
# 2     b        9   6    8

# Sorting by index level.
# DataFrame.sortlevel() no longer exists in current pandas;
# sort_index(level=...) is the supported equivalent.

# Sort by the first level.
print(df2.sort_index(level=0))
# Sample output:
# col-1        one      two
# col-2       blue red blue
# row-2 row-1
# 1     a        6   7    3
#       b        1   8    1
# 2     a        6   2    4
#       b        4   5    9

# Sort by the second level.
print(df2.sort_index(level=1))
# Sample output:
# col-1        one      two
# col-2       blue red blue
# row-2 row-1
# 1     a        3   3    8
# 2     a        7   7    5
# 1     b        2   7    9
# 2     b        2   6    4

# Aggregating over index levels.
# DataFrame.sum(level=...) no longer exists in current pandas;
# groupby(level=...).sum() is the supported equivalent.
print(df)
# Sum grouped by the first level.
print(df.groupby(level=0).sum())
# Sample output:
# col-1        one      two
# col-2       blue red blue
# row-1 row-2
# a     1        8   5    7
#       2        6   1    1
# b     1        8   1    2
#       2        1   9    2
#
# col-1  one      two
# col-2 blue red blue
# row-1
# a       14   6    8
# b        9  10    4

# Sum grouped by the second level.
print(df)
print(df.groupby(level=1).sum())
# Sample output:
# col-1        one      two
# col-2       blue red blue
# row-1 row-2
# a     1        7   6    5
#       2        4   1    5
# b     1        4   4    2
#       2        6   5    1
#
# col-1  one      two
# col-2 blue red blue
# row-2
# 1       11  10    7
# 2       10   6    6
#
# Note the difference:
# summing by the first level adds up all rows under each first-level label;
# summing by the second level adds up the rows that share the same
# second-level label.

# A plain DataFrame whose 'c' and 'd' columns will be promoted to index levels.
df = pd.DataFrame(
    {
        'a': range(7),
        'b': range(7, 0, -1),
        'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
        'd': [0, 1, 2, 0, 1, 2, 3],
    }
)
print(df)
# Sample output:
#    a  b    c  d
# 0  0  7  one  0
# 1  1  6  one  1
# 2  2  5  one  2
# 3  3  4  two  0
# 4  4  3  two  1
# 5  5  2  two  2
# 6  6  1  two  3

# Turn a data column into the index. set_index returns a new frame and
# leaves df itself unchanged.
print(df.set_index('c'))
# Sample output:
#      a  b  d
# c
# one  0  7  0
# one  1  6  1
# one  2  5  2
# two  3  4  0
# two  4  3  1
# two  5  2  2
# two  6  1  3

# Two columns produce a two-level index.
print(df.set_index(['c', 'd']))
# Sample output:
#        a  b
# c   d
# one 0  0  7
#     1  1  6
#     2  2  5
# two 0  3  4
#     1  4  3
#     2  5  2
#     3  6  1

# Flatten a multi-level index back into ordinary columns.
df2 = df.set_index(['c', 'd'])
print(df2)
# Sample output:
#        a  b
# c   d
# one 0  0  7
#     1  1  6
#     2  2  5
# two 0  3  4
#     1  4  3
#     2  5  2
#     3  6  1

# reset_index() moves the index levels back in as the leading columns
# (c, d, a, b); sorting the column labels restores the a, b, c, d order.
# axis is passed by keyword because sort_index() rejects positional
# arguments in current pandas.
print(df2.reset_index().sort_index(axis='columns'))
# Sample output. The first table is df2.reset_index() on its own, shown for
# comparison; only the second table is printed by the statement above.
#      c  d  a  b
# 0  one  0  0  7
# 1  one  1  1  6
# 2  one  2  2  5
# 3  two  0  3  4
# 4  two  1  4  3
# 5  two  2  5  2
# 6  two  3  6  1
#
#    a  b    c  d
# 0  0  7  one  0
# 1  1  6  one  1
# 2  2  5  one  2
# 3  3  4  two  0
# 4  4  3  two  1
# 5  5  2  two  2
# 6  6  1  two  3


 

共有 人打赏支持
粉丝 19
博文 132
码字总数 185568
×
eddy_linux
如果觉得我的文章对您有用,请随意打赏。您的支持将鼓励我继续创作!
* 金额(元)
¥1 ¥5 ¥10 ¥20 其他金额
打赏人
留言
* 支付类型
微信扫码支付
打赏金额:
已支付成功
打赏金额: