4-pandas核心数据结构
4-pandas核心数据结构
eddy_linux 发表于3个月前
4-pandas核心数据结构
  • 发表于 3个月前
  • 阅读 3
  • 收藏 0
  • 点赞 0
  • 评论 0

腾讯云 技术升级10大核心产品年终让利>>>   

#encoding:utf8

'''
Core pandas data structures.

    Series: a one-dimensional labeled array that can hold any kind of data.
            s = pd.Series(data, index=index)
            index is a list used as the labels of the data; data may be of
            different types.
            Series features:
                ndarray-like object
                dict-like object
                label-aligned operations

    DataFrame: a two-dimensional array with row and column labels. Think of
              it as an Excel sheet or a SQL table; it can also be seen as a
              dict of Series objects. It is the most commonly used pandas
              data structure.
              df = pd.DataFrame(data, index=index, columns=columns)
              index: the row labels
              columns: the column labels
              data: a dict of 1-D numpy arrays, lists or Series;
                    a 2-D numpy array;
                    another DataFrame object
              DataFrame features:
                column selection, addition and deletion
                inserting new columns with the assign() method
                indexing and selection
                    select a column: df['col'] ----> Series
                    select a row by label: df.loc['label'] ----> Series
                    select a row by position: df.iloc[position] ----> Series
                    select several rows: df[2:10] ----> DataFrame
                    select rows with a boolean mask: df[mask] ----> DataFrame
                data alignment
                use with numpy functions

    Panel: a three-dimensional labeled array. The name "pandas" itself is
          derived from "panel". It is rarely used, but is still described
          here as one of the fundamental data structures.
          items: axis 0; each item is a DataFrame
          major_axis: axis 1; the row labels of each DataFrame
          minor_axis: axis 2; the column labels of each DataFrame
          NOTE(review): Panel was deprecated in pandas 0.20 and removed in
          pandas 0.25 -- confirm the pandas version this script targets.
'''
import pandas as pd
import numpy as np
# --- Creating a Series --------------------------------------------------------
# From a numpy array, with an explicit index.
s = pd.Series(np.random.randn(5), index=list('abcde'))
print(s)
# Sample output (values are random):
# a   -0.642983
# b   -1.354435
# c    0.460587
# d    0.747326
# e   -0.403708
# dtype: float64

# When no index is given, a default integer index is assigned.
s = pd.Series(np.random.randn(5))
print(s)
# Sample output (values are random):
# 0   -1.990919
# 1    1.659830
# 2    0.093845
# 3   -0.793828
# 4    0.058719
# dtype: float64

# From a dict.
d = dict(a=0, b=1, d=3)
s = pd.Series(d, index=['a', 'b', 'c', 'd'])
print(s)
# a    0.0
# b    1.0
# c    NaN
# d    3.0
# dtype: float64
# The index contains 'c' but the data does not, so 'c' is filled with NaN.

# From a scalar: every entry gets the same value.
s = pd.Series(5, index=['a', 'b', 'c', 'd', 'e'])
print(s)
# a    5
# b    5
# c    5
# d    5
# e    5
# dtype: int64


# --- Series features ----------------------------------------------------------
# 1. A Series behaves like an ndarray.
s = pd.Series(np.random.randn(5))
print(s)
# Sample output (values are random):
# 0   -1.012615
# 1   -1.101502
# 2   -0.429467
# 3    0.697508
# 4   -1.504106
# dtype: float64

print(s[0])
# Sample output:
# -1.012615

# The first three values.
print(s[:3])
# The third through the fifth value.
print(s[2:5])
# An arbitrary selection of entries.
print(s[[1, 3, 4]])

# A Series can also be passed straight to numpy functions.
for ufunc in (np.sin, np.exp):
    print(ufunc(s))

# 2. A Series behaves like a dict.
s = pd.Series(np.random.randn(5), index=list('abcde'))
print(s['a'])
s['b'] = 3
s['g'] = 10
# Looking up a missing key with s[key] raises an error;
# get() can be used instead for a key that may be absent.
print(s.get('f'))

# --- Label alignment ----------------------------------------------------------
s1 = pd.Series(np.random.randn(3), index=list('ace'))
s2 = pd.Series(np.random.randn(3), index=list('ade'))
print('{0}\n\n{1}'.format(s1, s2))
# Sample output (values are random):
# a    1.053980
# c   -0.112771
# e   -1.757399
# dtype: float64
#
# a    2.574253
# d   -1.000329
# e   -0.113808
# dtype: float64

print(s1 + s2)
# Sample output (from a different run):
# a    2.304986
# c         NaN
# d         NaN
# e   -0.892602
# dtype: float64
# The operands are aligned on their labels before the addition: the result
# carries every label from either Series, and a label that is missing from
# one of them ('c', 'd') yields NaN.

# --- DataFrame from a dict of Series ------------------------------------------
d = {
    'one': pd.Series([1, 2, 3], index=list('abc')),
    'two': pd.Series([1, 2, 3, 4], index=list('abcd')),
}
df = pd.DataFrame(d)
print(df)
#    one  two
# a  1.0    1
# b  2.0    2
# c  3.0    3
# d  NaN    4

# Restrict / reorder the rows with an explicit row index.
df = pd.DataFrame(d, index=['d', 'b', 'a'])
print(df)
#    one  two
# d  NaN    4
# b  2.0    2
# a  1.0    1

# Choose the columns explicitly; a column absent from the data is all NaN.
df = pd.DataFrame(d, columns=['two', 'three'])
print(df)
#    two three
# a    1   NaN
# b    2   NaN
# c    3   NaN
# d    4   NaN

# --- DataFrame from a dict of lists -------------------------------------------
d = {
    'one': [1, 2, 3, 4],
    'two': [12, 32, 41, 67],
}
df = pd.DataFrame(d)
print(df)
#    one  two
# 0    1   12
# 1    2   32
# 2    3   41
# 3    4   67
# Note: when the dict values are lists they must all have the same length,
# otherwise an error is raised. Series values need not have the same length.


# --- DataFrame from a list of tuples ------------------------------------------
data = [(1, 2.2, 'A'), (2, 3, 'B')]
df = pd.DataFrame(data)
print(df)
#    0    1  2
# 0  1  2.2  A
# 1  2  3.0  B
# No row or column labels were given, so default ones are assigned.

# --- DataFrame from a list of dicts -------------------------------------------
data = [
    dict(a=1, b=2),
    dict(a=5, b=10, c=20),
]
df = pd.DataFrame(data)
print(df)
#    a   b     c
# 0  1   2   NaN
# 1  5  10  20.0
# The dict keys automatically become the column labels.

# --- DataFrame from nested dicts keyed by tuples ------------------------------
# Tuple keys of the outer dict become the column labels and tuple keys of the
# inner dicts become the row labels (see the two-level headers printed below).
d = {
    ('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2},
    ('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4},
    ('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6},
    ('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8},
    ('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10},
}

df = pd.DataFrame(d)
print(df)
# Sample output:
#        a              b
#        a    b    c    a     b
# A B  4.0  1.0  5.0  8.0  10.0
#   C  3.0  2.0  6.0  7.0   NaN
#   D  NaN  NaN  NaN  NaN   9.0
# --- DataFrame from a Series --------------------------------------------------
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
df = pd.DataFrame(s)
print(df)
# Sample output (values are random):
#           0
# a  1.245420
# b -1.805637
# c -1.719812
# d -0.268262
# e  0.324174

# Keep only some of the rows and give the single column a label.
df = pd.DataFrame(s, index=['a', 'c', 'd'], columns=['A'])
print(df)
# Sample output (from a different run):
#           A
# a  0.175551
# c -1.375858
# d  0.600202
# --- DataFrame features: selecting, adding and deleting columns ---------------
# Sample outputs below come from separate runs, so the random values differ
# from one snippet to the next.
df = pd.DataFrame(np.random.randn(6, 4), columns='one two three four'.split())
print(df)
#         one       two     three      four
# 0 -0.883441  0.722139  0.008607 -1.845884
# 1  0.062726 -0.133322  1.251902 -0.332315
# 2  1.180431  0.973133 -0.480912 -0.080268
# 3  0.164558 -0.903407 -1.449404 -1.018378
# 4 -1.002809 -0.461247  2.521952 -1.218778
# 5  0.386091 -0.285654  1.479596 -0.409710

# Select a column.
print(df['one'])
# 0   -0.473044
# 1   -1.313548
# 2   -0.749416
# 3    0.620463
# 4   -0.362576
# 5   -0.410367

# Select a row.
print(df.loc[1])
# one     -1.313548
# two     -0.415634
# three   -0.624267
# four     1.234304

# Assign to a column: 'three' becomes the sum of 'one' and 'two'.
df['three'] = df['one'].add(df['two'])
print(df)
#         one       two     three      four
# 0 -0.122570  0.028326 -0.094244  0.328752
# 1  0.300773  1.763818  2.064590 -0.515548
# 2 -0.701206 -0.956082 -1.657288  0.278864
# 3  0.622287  0.295386  0.917673 -0.642725
# 4  0.215740 -1.051719 -0.835979 -1.007359
# 5 -0.104314  1.050029  0.945715  0.583706

# Delete a column (df.pop('three') is the alternative).
del df['three']
print(df)
#         one       two      four
# 0 -0.356041  0.257288 -0.773332
# 1 -0.351089  0.310845  0.743968
# 2  0.252796  0.072242 -0.422798
# 3  0.440768  0.004342 -0.076067
# 4  0.840860 -0.629879  1.252147
# 5  0.127443 -0.065718  1.393988

# Add columns; by default a new column is appended as the last one.
df['flag'] = df['one'].gt(0.2)
df['five'] = 5
print(df)
#         one       two      four   flag  five
# 0 -1.469445 -0.211843  0.389252  False     5
# 1  1.301886  0.953945  0.167914   True     5
# 2  0.433649  0.223498 -1.042002   True     5
# 3  1.603915 -1.301473  0.289294   True     5
# 4 -0.098793 -0.962284 -1.245439  False     5
# 5  0.179785  1.041043 -0.596245  False     5

# Insert a column labelled 'bar' before the current second column, holding
# df['one'] + df['two'].
df.insert(1, 'bar', df['one'].add(df['two']))
print(df)
#         one       bar       two      four   flag  five
# 0 -1.167471 -1.454080 -0.286609  0.809202  False     5
# 1  0.238044 -0.982485 -1.220529  0.599817   True     5
# 2 -0.026740  0.076688  0.103428  0.442741  False     5
# 3 -1.116262  0.165048  1.281309 -1.473095  False     5
# 4 -1.284309 -2.375614 -1.091305 -0.704744  False     5
# 5 -1.033716 -1.598951 -0.565234 -0.814488  False     5

# assign() does not touch the original data: it works on a copy, so the
# second print shows df without the 'eddy' column.
print(df.assign(eddy=df['one'].div(df['two'])))
print(df)
#         one       bar       two      four   flag  five       eddy
# 0 -1.841226 -2.319084 -0.477858 -0.115945  False     5   3.853080
# 1  0.954525 -0.721547 -1.676072 -0.211269   True     5  -0.569501
# 2  0.926548  0.877620 -0.048928 -1.575646   True     5 -18.936899
# 3  0.298663  1.894321  1.595657  0.061068   True     5   0.187173
# 4  1.058452  0.962337 -0.096116 -0.495765   True     5 -11.012270
# 5  0.172647  0.009734 -0.162913  0.501113  False     5  -1.059748
#         one       bar       two      four   flag  five
# 0 -1.841226 -2.319084 -0.477858 -0.115945  False     5
# 1  0.954525 -0.721547 -1.676072 -0.211269   True     5
# 2  0.926548  0.877620 -0.048928 -1.575646   True     5
# 3  0.298663  1.894321  1.595657  0.061068   True     5
# 4  1.058452  0.962337 -0.096116 -0.495765   True     5
# 5  0.172647  0.009734 -0.162913  0.501113  False     5

# --- Indexing and selection ---------------------------------------------------
# Sample outputs below come from separate runs, so the random values differ
# from one snippet to the next.
df = pd.DataFrame(
    np.random.randint(1, 10, size=(6, 4)),
    index=['a', 'b', 'c', 'd', 'e', 'f'],
    columns=['A', 'B', 'C', 'D'],
)
print(df)
#    A  B  C  D
# a  7  1  4  7
# b  7  3  6  4
# c  6  8  2  8
# d  4  7  5  7
# e  1  6  6  9
# f  7  4  3  3

# Select a column.
print(df['A'])
# a    6
# b    7
# c    6
# d    6
# e    2
# f    4

# Select a row by its label.
print(df.loc['a'])
# A    2
# B    6
# C    3
# D    5

# Select a row by its integer position (position 1 is the second row, 'b').
print(df)
print(df.iloc[1])
#    A  B  C  D
# a  6  3  2  1
# b  5  1  5  5
# c  9  1  5  8
# d  4  1  2  9
# e  6  4  4  9
# f  9  5  1  8
#
# A    5
# B    1
# C    5
# D    5

# Select several rows: the second through the fourth row.
print(df)
print(df[1:4])
#    A  B  C  D
# a  3  8  4  6
# b  2  7  6  4
# c  7  1  5  9
# d  5  4  8  9
# e  9  5  1  9
# f  8  1  7  9
#
#    A  B  C  D
# b  2  7  6  4
# c  7  1  5  9
# d  5  4  8  9

# The same selection written explicitly as a positional slice with iloc
# (the original author describes this form as the more efficient one).
print(df)
print(df.iloc[1:4])
#    A  B  C  D
# a  3  1  3  1
# b  3  5  8  3
# c  7  3  3  7
# d  2  5  1  9
# e  2  9  4  5
# f  6  1  7  7
#
#    A  B  C  D
# b  3  5  8  3
# c  7  3  3  7
# d  2  5  1  9

# Boolean selection: the rows whose 'A' value is greater than 4.
print(df[df['A'] > 4])
#    A  B  C  D
# b  9  4  4  7
# c  5  5  1  4
# e  9  1  5  2
# f  9  4  1  7

# --- Label alignment between DataFrames ---------------------------------------
# Sample outputs below come from separate runs, so the random values differ
# from one snippet to the next.
df1 = pd.DataFrame(np.random.randn(10, 4),
                   index=list('abcdefghij'), columns=list('ABCD'))
df2 = pd.DataFrame(np.random.randn(7, 3),
                   index=list('cdefghi'), columns=list('ABC'))
print(df1)
#           A         B         C         D
# a -0.475250  1.816780 -0.345222  0.599880
# b  1.842124  1.375863  1.050803  0.261957
# c  0.903875  0.332738 -0.414794  0.481178
# d  0.069099  0.554739 -0.864424  0.525776
# e  0.575736  1.592963  2.050674 -0.102034
# f -0.173357 -0.201238 -1.363390 -0.409137
# g -0.537125 -0.695808 -2.004893  0.186505
# h  2.153636 -0.989632 -0.557407 -0.271087
# i -0.635415 -0.294707  0.199212  0.188860
# j -0.521159 -0.939149 -1.627262  0.656986
print(df2)
#           A         B         C
# c  1.816506 -0.156875 -0.374446
# d -1.660415 -0.742605 -0.994546
# e  1.070771 -0.512518 -0.254673
# f -0.814818 -0.467923 -0.336066
# g -0.109751  0.711748  0.223611
# h  0.276906  0.114592  0.060898
# i  0.725123  0.653745 -1.304302

# Addition aligns on both row and column labels automatically; cells that
# exist in only one operand come out as NaN.
print(df1.add(df2))
#           A         B         C   D
# a       NaN       NaN       NaN NaN
# b       NaN       NaN       NaN NaN
# c -0.996902 -0.835830 -1.171030 NaN
# d -0.248686 -3.335871 -1.359282 NaN
# e -2.031014  1.477378 -1.086871 NaN
# f -0.475711  1.778047 -0.830834 NaN
# g  0.933408 -0.948903  1.098031 NaN
# h -1.358368  2.109962 -0.162035 NaN
# i  1.033558  2.602100 -0.822506 NaN
# j       NaN       NaN       NaN NaN

# Subtract the first row from every row (row 'a' becomes all zeros).
print(df1.sub(df1.iloc[0]))
#           A         B         C         D
# a  0.000000  0.000000  0.000000  0.000000
# b  1.648827 -0.631181 -0.765940  0.173203
# c -1.132634 -0.673565 -1.364718 -1.112644
# d  2.367267 -0.829438 -0.952571 -0.079665
# e -0.130107 -0.604533 -1.371570 -1.231660
# f -0.966749 -0.582235 -2.694483  0.117960
# g -0.450261 -1.784781 -1.680883  0.249425
# h -0.639796  0.282884 -0.968743 -0.944879
# i -0.130247 -1.117115 -0.626351 -0.781328
# j  0.826533 -1.531762  1.013906 -0.164497

# --- Using numpy functions on a DataFrame -------------------------------------
print(np.exp(df2))
#           A         B         C
# c  1.665900  0.177399  2.487602
# d  1.142773  0.189716  4.251170
# e  0.388794  0.763401  0.678560
# f  0.514500  1.722456  2.525163
# g  0.954387  0.236827  0.444355
# h  0.286272  0.547206  0.862291
# i  2.002238  6.573480  0.460405
print(np.sin(df2))
#           A         B         C
# c  0.397303  0.073660  0.266379
# d -0.842873  0.895713 -0.013407
# e -0.689216 -0.268336 -0.623929
# f -0.961239  0.636018 -0.697631
# g  0.963668 -0.599516  0.385500
# h  0.999467 -0.987262  0.908637
# i  0.005188  0.256560  0.999625

raw = df2.values
print(raw)
print(type(raw))
# [[-0.29429177 -0.33319518  0.51916513]
#  [ 0.31237605 -1.72148036 -1.32130143]
#  [ 1.06673122 -2.41777182  0.86887143]
#  [-0.48582326 -0.19638844 -0.17211501]
#  [ 2.41560869  0.71202002 -0.96272687]
#  [-0.1714937   0.25948967 -0.184975  ]
#  [ 0.7248686   0.19815541 -0.42017762]]
# <class 'numpy.ndarray'>
# The data of the DataFrame is exposed as a numpy array object.

# Explicit conversion to an array.
print(np.asarray(df2))
# [[-0.10337481  1.10982992 -0.99893912]
#  [ 0.56375586  0.33834332 -0.77602715]
#  [ 0.79749184 -0.17482679 -0.56520724]
#  [ 0.9123768   0.30936735  1.66618853]
#  [-0.18823813 -1.60572378 -1.07929157]
#  [ 1.11949436 -2.36540965 -0.27929168]
#  [ 0.63920646 -0.64467357 -0.13412456]]
# Equivalent to df2.values.

# --- Panel (three-dimensional labeled data) -----------------------------------
# pd.Panel was deprecated in pandas 0.20 and removed in pandas 0.25, so the
# three-dimensional data is held in a DataFrame with a two-level column index
# instead: the outer column level plays the role of Panel.items, the row
# index the role of major_axis and the inner column level the role of
# minor_axis. Each Panel operation of the original tutorial is reproduced on
# that layout. Sample outputs come from separate runs (values are random).
data = {
    'Items1': pd.DataFrame(np.random.randn(4, 3)),
    'Items2': pd.DataFrame(np.random.randn(4, 2)),
}

# Put the frames side by side, then reindex to the full items x minor product
# so that every item has the same minor labels (missing cells become NaN,
# exactly as Panel padded 'Items2' with a NaN column 2).
pn = pd.concat(data, axis=1)
pn = pn.reindex(columns=pd.MultiIndex.from_product(
    list(pn.columns.levels), names=['items', 'minor']))
pn.index.name = 'major'
print(pn)
# Dimensions: 2 (items) x 4 (major) x 3 (minor), shown as a 4 x 6 table.

# Select one item (was pn['Items1']).
print(pn['Items1'])
#           0         1         2
# 0 -1.526990 -0.070476 -0.403109
# 1  1.514263 -0.278211  1.023529
# 2  0.514959 -0.532971 -0.484037
# 3  0.409065 -0.576834 -0.820684

print(pn['Items2'])
#           0         1   2
# 0 -0.778019 -1.901650 NaN
# 1 -1.325655  0.018238 NaN
# 2 -1.117964 -1.111286 NaN
# 3 -0.354138 -0.627724 NaN

# Labels of the first dimension (was pn.items).
print(pn.columns.levels[0])
# Index(['Items1', 'Items2'], ...)

# Labels of the second dimension (was pn.major_axis).
print(pn.index)
# RangeIndex(start=0, stop=4, step=1, ...)

# Labels of the third dimension (was pn.minor_axis).
print(pn.columns.levels[1])
# The labels 0, 1, 2.

# Cross-section for each major label (was pn.major_xs(0) ... pn.major_xs(3)):
# a minor x items table per row of the wide frame.
for major in pn.index:
    print(pn.loc[major].unstack(level='items'))
#      Items1    Items2
# 0  0.554649 -1.718509
# 1 -1.142421  1.048699
# 2 -0.129635       NaN
#      Items1    Items2
# 0  0.023580 -0.046784
# 1  0.118717  0.603499
# 2  1.477423       NaN
#      Items1    Items2
# 0  0.178097  0.273654
# 1 -0.718359  0.441026
# 2 -0.787361       NaN
#      Items1    Items2
# 0  0.089366 -0.745780
# 1  0.090689  1.228318
# 2  0.397654       NaN
# The cross-section along the third dimension (was pn.minor_xs(k)) is
# pn.xs(k, axis=1, level='minor').

# Long format (was pn.to_frame()): still the same three-dimensional data,
# only presented as a (major, minor) x items table. Rows containing NaN are
# dropped, as Panel.to_frame() did by default.
long_frame = (
    pn.unstack()                 # Series indexed by (items, minor, major)
    .unstack(level='items')      # items back to columns; index (minor, major)
    .swaplevel()                 # index becomes (major, minor)
    .sort_index()
    .dropna()
)
print(long_frame)
#                Items1    Items2
# major minor
# 0     0     -0.175946 -0.920919
#       1      0.345892  0.472304
# 1     0     -0.756189  0.252012
#       1      0.046795  1.462241
# 2     0     -0.313086 -0.870141
#       1     -0.393559  0.921510
# 3     0     -0.147461  0.943085
#       1      1.701652 -0.062511






 

共有 人打赏支持
粉丝 19
博文 132
码字总数 185568
×
eddy_linux
如果觉得我的文章对您有用,请随意打赏。您的支持将鼓励我继续创作!
* 金额(元)
¥1 ¥5 ¥10 ¥20 其他金额
打赏人
留言
* 支付类型
微信扫码支付
打赏金额:
已支付成功
打赏金额: