pandas 基础

2018/11/18 21:48
阅读数 0

<!-- TOC -->

<!-- /TOC -->

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

创建一个Series ,同时让pandas自动生成索引列

s = pd.Series([1,3,5,np.nan,6,8])
# 查看s
s
0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

创建一个DataFrame数据框

### 创建一个DataFrame,可以传入一个numpy array,并且可以自己构建索引以及列标签
dates = pd.date_range('2018-11-01',periods=7)
#### 比如说生成一个时间序列,以20181101 为起始位置的,7个日期组成的时间序列,数据的类型为datetime64[ns]
dates
DatetimeIndex(['2018-11-01', '2018-11-02', '2018-11-03', '2018-11-04',
               '2018-11-05', '2018-11-06', '2018-11-07'],
              dtype='datetime64[ns]', freq='D')
df = pd.DataFrame(np.random.randn(7,4),index= dates,columns=list('ABCD'))
df
# 产生随机正态分布的数据,7行4列,分别对应的index的长度以及column的长度

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>A</th> <th>B</th> <th>C</th> <th>D</th> </tr> </thead> <tbody> <tr> <th>2018-11-01</th> <td>2.197094</td> <td>0.908913</td> <td>-0.648029</td> <td>-1.325547</td> </tr> <tr> <th>2018-11-02</th> <td>0.354662</td> <td>-1.224246</td> <td>-0.501209</td> <td>-1.490170</td> </tr> <tr> <th>2018-11-03</th> <td>-0.245834</td> <td>-1.049596</td> <td>2.366225</td> <td>0.637321</td> </tr> <tr> <th>2018-11-04</th> <td>-0.689940</td> <td>0.471282</td> <td>-1.417401</td> <td>0.268905</td> </tr> <tr> <th>2018-11-05</th> <td>-0.548041</td> <td>-0.841934</td> <td>0.573128</td> <td>-1.055175</td> </tr> <tr> <th>2018-11-06</th> <td>-0.691073</td> <td>0.933016</td> <td>1.857647</td> <td>0.775526</td> </tr> <tr> <th>2018-11-07</th> <td>0.467075</td> <td>0.362407</td> <td>2.319375</td> <td>-0.721314</td> </tr> </tbody> </table> </div>

### 同时也可以使用dict的形式创建DataFrame
df2 = pd.DataFrame({"A":1,
                   "B":"20181101",
                   'C':np.array([3]*4,dtype='int32'),
                   'D':pd.Categorical(['test','train','test','train']),
                   "E":1.5},
                  )
df2

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>A</th> <th>B</th> <th>C</th> <th>D</th> <th>E</th> </tr> </thead> <tbody> <tr> <th>0</th> <td>1</td> <td>20181101</td> <td>3</td> <td>test</td> <td>1.5</td> </tr> <tr> <th>1</th> <td>1</td> <td>20181101</td> <td>3</td> <td>train</td> <td>1.5</td> </tr> <tr> <th>2</th> <td>1</td> <td>20181101</td> <td>3</td> <td>test</td> <td>1.5</td> </tr> <tr> <th>3</th> <td>1</td> <td>20181101</td> <td>3</td> <td>train</td> <td>1.5</td> </tr> </tbody> </table> </div>

df2.dtypes
### 查看数据框中的数据类型,常见的数据类型还有时间类型以及float类型
A       int64
B      object
C       int32
D    category
E     float64
dtype: object

查看数据


# 比如说看前5行
df.head()

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>A</th> <th>B</th> <th>C</th> <th>D</th> </tr> </thead> <tbody> <tr> <th>2018-11-01</th> <td>2.197094</td> <td>0.908913</td> <td>-0.648029</td> <td>-1.325547</td> </tr> <tr> <th>2018-11-02</th> <td>0.354662</td> <td>-1.224246</td> <td>-0.501209</td> <td>-1.490170</td> </tr> <tr> <th>2018-11-03</th> <td>-0.245834</td> <td>-1.049596</td> <td>2.366225</td> <td>0.637321</td> </tr> <tr> <th>2018-11-04</th> <td>-0.689940</td> <td>0.471282</td> <td>-1.417401</td> <td>0.268905</td> </tr> <tr> <th>2018-11-05</th> <td>-0.548041</td> <td>-0.841934</td> <td>0.573128</td> <td>-1.055175</td> </tr> </tbody> </table> </div>

# 后4行
df.tail(4)

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>A</th> <th>B</th> <th>C</th> <th>D</th> </tr> </thead> <tbody> <tr> <th>2018-11-04</th> <td>-0.689940</td> <td>0.471282</td> <td>-1.417401</td> <td>0.268905</td> </tr> <tr> <th>2018-11-05</th> <td>-0.548041</td> <td>-0.841934</td> <td>0.573128</td> <td>-1.055175</td> </tr> <tr> <th>2018-11-06</th> <td>-0.691073</td> <td>0.933016</td> <td>1.857647</td> <td>0.775526</td> </tr> <tr> <th>2018-11-07</th> <td>0.467075</td> <td>0.362407</td> <td>2.319375</td> <td>-0.721314</td> </tr> </tbody> </table> </div>

# 查看DataFrame的索引
df.index
DatetimeIndex(['2018-11-01', '2018-11-02', '2018-11-03', '2018-11-04',
               '2018-11-05', '2018-11-06', '2018-11-07'],
              dtype='datetime64[ns]', freq='D')
# 查看DataFrame的列索引
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')
# 查看DataFrame的数据,将DataFrame转化为numpy array 的数据形式
df.values
array([[ 2.19709382,  0.90891281, -0.64802911, -1.32554721],
       [ 0.35466158, -1.22424591, -0.50120854, -1.49017025],
       [-0.24583358, -1.04959585,  2.36622453,  0.6373212 ],
       [-0.6899396 ,  0.47128154, -1.41740143,  0.26890482],
       [-0.54804068, -0.84193368,  0.57312781, -1.05517487],
       [-0.6910726 ,  0.93301611,  1.85764662,  0.77552552],
       [ 0.46707509,  0.36240665,  2.31937488, -0.721314  ]])

数据的简单统计

# 可以使用describe函数对DataFrame中的数值型数据进行统计
df.describe()

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>A</th> <th>B</th> <th>C</th> <th>D</th> </tr> </thead> <tbody> <tr> <th>count</th> <td>7.000000</td> <td>7.000000</td> <td>7.000000</td> <td>7.000000</td> </tr> <tr> <th>mean</th> <td>0.120563</td> <td>-0.062880</td> <td>0.649962</td> <td>-0.415779</td> </tr> <tr> <th>std</th> <td>1.031487</td> <td>0.942664</td> <td>1.553537</td> <td>0.955789</td> </tr> <tr> <th>min</th> <td>-0.691073</td> <td>-1.224246</td> <td>-1.417401</td> <td>-1.490170</td> </tr> <tr> <th>25%</th> <td>-0.618990</td> <td>-0.945765</td> <td>-0.574619</td> <td>-1.190361</td> </tr> <tr> <th>50%</th> <td>-0.245834</td> <td>0.362407</td> <td>0.573128</td> <td>-0.721314</td> </tr> <tr> <th>75%</th> <td>0.410868</td> <td>0.690097</td> <td>2.088511</td> <td>0.453113</td> </tr> <tr> <th>max</th> <td>2.197094</td> <td>0.933016</td> <td>2.366225</td> <td>0.775526</td> </tr> </tbody> </table> </div>

df2.describe()
### 对于其他的数据类型的数据describe函数会自动过滤掉

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>A</th> <th>C</th> <th>E</th> </tr> </thead> <tbody> <tr> <th>count</th> <td>4.0</td> <td>4.0</td> <td>4.0</td> </tr> <tr> <th>mean</th> <td>1.0</td> <td>3.0</td> <td>1.5</td> </tr> <tr> <th>std</th> <td>0.0</td> <td>0.0</td> <td>0.0</td> </tr> <tr> <th>min</th> <td>1.0</td> <td>3.0</td> <td>1.5</td> </tr> <tr> <th>25%</th> <td>1.0</td> <td>3.0</td> <td>1.5</td> </tr> <tr> <th>50%</th> <td>1.0</td> <td>3.0</td> <td>1.5</td> </tr> <tr> <th>75%</th> <td>1.0</td> <td>3.0</td> <td>1.5</td> </tr> <tr> <th>max</th> <td>1.0</td> <td>3.0</td> <td>1.5</td> </tr> </tbody> </table> </div>

### DataFrame 的转置,将列索引与行索引进行调换,行数据与列数据进行调换
df.T

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>2018-11-01 00:00:00</th> <th>2018-11-02 00:00:00</th> <th>2018-11-03 00:00:00</th> <th>2018-11-04 00:00:00</th> <th>2018-11-05 00:00:00</th> <th>2018-11-06 00:00:00</th> <th>2018-11-07 00:00:00</th> </tr> </thead> <tbody> <tr> <th>A</th> <td>2.197094</td> <td>0.354662</td> <td>-0.245834</td> <td>-0.689940</td> <td>-0.548041</td> <td>-0.691073</td> <td>0.467075</td> </tr> <tr> <th>B</th> <td>0.908913</td> <td>-1.224246</td> <td>-1.049596</td> <td>0.471282</td> <td>-0.841934</td> <td>0.933016</td> <td>0.362407</td> </tr> <tr> <th>C</th> <td>-0.648029</td> <td>-0.501209</td> <td>2.366225</td> <td>-1.417401</td> <td>0.573128</td> <td>1.857647</td> <td>2.319375</td> </tr> <tr> <th>D</th> <td>-1.325547</td> <td>-1.490170</td> <td>0.637321</td> <td>0.268905</td> <td>-1.055175</td> <td>0.775526</td> <td>-0.721314</td> </tr> </tbody> </table> </div>

df

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>A</th> <th>B</th> <th>C</th> <th>D</th> </tr> </thead> <tbody> <tr> <th>2018-11-01</th> <td>2.197094</td> <td>0.908913</td> <td>-0.648029</td> <td>-1.325547</td> </tr> <tr> <th>2018-11-02</th> <td>0.354662</td> <td>-1.224246</td> <td>-0.501209</td> <td>-1.490170</td> </tr> <tr> <th>2018-11-03</th> <td>-0.245834</td> <td>-1.049596</td> <td>2.366225</td> <td>0.637321</td> </tr> <tr> <th>2018-11-04</th> <td>-0.689940</td> <td>0.471282</td> <td>-1.417401</td> <td>0.268905</td> </tr> <tr> <th>2018-11-05</th> <td>-0.548041</td> <td>-0.841934</td> <td>0.573128</td> <td>-1.055175</td> </tr> <tr> <th>2018-11-06</th> <td>-0.691073</td> <td>0.933016</td> <td>1.857647</td> <td>0.775526</td> </tr> <tr> <th>2018-11-07</th> <td>0.467075</td> <td>0.362407</td> <td>2.319375</td> <td>-0.721314</td> </tr> </tbody> </table> </div>

数据的排序

df.sort_index(ascending=False)
### 降序,按照行索引(index)进行降序排序

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>A</th> <th>B</th> <th>C</th> <th>D</th> </tr> </thead> <tbody> <tr> <th>2018-11-07</th> <td>0.467075</td> <td>0.362407</td> <td>2.319375</td> <td>-0.721314</td> </tr> <tr> <th>2018-11-06</th> <td>-0.691073</td> <td>0.933016</td> <td>1.857647</td> <td>0.775526</td> </tr> <tr> <th>2018-11-05</th> <td>-0.548041</td> <td>-0.841934</td> <td>0.573128</td> <td>-1.055175</td> </tr> <tr> <th>2018-11-04</th> <td>-0.689940</td> <td>0.471282</td> <td>-1.417401</td> <td>0.268905</td> </tr> <tr> <th>2018-11-03</th> <td>-0.245834</td> <td>-1.049596</td> <td>2.366225</td> <td>0.637321</td> </tr> <tr> <th>2018-11-02</th> <td>0.354662</td> <td>-1.224246</td> <td>-0.501209</td> <td>-1.490170</td> </tr> <tr> <th>2018-11-01</th> <td>2.197094</td> <td>0.908913</td> <td>-0.648029</td> <td>-1.325547</td> </tr> </tbody> </table> </div>


print(df.sort_values(by=['B','A']))
#  默认是升序,可以选择多列排序,先按照B排序,再按照A排序:如果B中的数据一样,则按照A中的大小进行排序
df.sort_values(by='B')
                   A         B         C         D
2018-11-02  0.354662 -1.224246 -0.501209 -1.490170
2018-11-03 -0.245834 -1.049596  2.366225  0.637321
2018-11-05 -0.548041 -0.841934  0.573128 -1.055175
2018-11-07  0.467075  0.362407  2.319375 -0.721314
2018-11-04 -0.689940  0.471282 -1.417401  0.268905
2018-11-01  2.197094  0.908913 -0.648029 -1.325547
2018-11-06 -0.691073  0.933016  1.857647  0.775526

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>A</th> <th>B</th> <th>C</th> <th>D</th> </tr> </thead> <tbody> <tr> <th>2018-11-02</th> <td>0.354662</td> <td>-1.224246</td> <td>-0.501209</td> <td>-1.490170</td> </tr> <tr> <th>2018-11-03</th> <td>-0.245834</td> <td>-1.049596</td> <td>2.366225</td> <td>0.637321</td> </tr> <tr> <th>2018-11-05</th> <td>-0.548041</td> <td>-0.841934</td> <td>0.573128</td> <td>-1.055175</td> </tr> <tr> <th>2018-11-07</th> <td>0.467075</td> <td>0.362407</td> <td>2.319375</td> <td>-0.721314</td> </tr> <tr> <th>2018-11-04</th> <td>-0.689940</td> <td>0.471282</td> <td>-1.417401</td> <td>0.268905</td> </tr> <tr> <th>2018-11-01</th> <td>2.197094</td> <td>0.908913</td> <td>-0.648029</td> <td>-1.325547</td> </tr> <tr> <th>2018-11-06</th> <td>-0.691073</td> <td>0.933016</td> <td>1.857647</td> <td>0.775526</td> </tr> </tbody> </table> </div>

选择数据(类似于数据库中sql语句)

df['A']
# 取出单独的一列数据,等价于df.A
2018-11-01    2.197094
2018-11-02    0.354662
2018-11-03   -0.245834
2018-11-04   -0.689940
2018-11-05   -0.548041
2018-11-06   -0.691073
2018-11-07    0.467075
Freq: D, Name: A, dtype: float64
# 通过[]进行行选择切片
df[0:3]

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>A</th> <th>B</th> <th>C</th> <th>D</th> </tr> </thead> <tbody> <tr> <th>2018-11-01</th> <td>2.197094</td> <td>0.908913</td> <td>-0.648029</td> <td>-1.325547</td> </tr> <tr> <th>2018-11-02</th> <td>0.354662</td> <td>-1.224246</td> <td>-0.501209</td> <td>-1.490170</td> </tr> <tr> <th>2018-11-03</th> <td>-0.245834</td> <td>-1.049596</td> <td>2.366225</td> <td>0.637321</td> </tr> </tbody> </table> </div>

# 同时对于时间索引而言,可以直接使用日期字符串进行切片(起止日期都会被包含),比如
df['2018-11-01':'2018-11-04']

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>A</th> <th>B</th> <th>C</th> <th>D</th> </tr> </thead> <tbody> <tr> <th>2018-11-01</th> <td>2.197094</td> <td>0.908913</td> <td>-0.648029</td> <td>-1.325547</td> </tr> <tr> <th>2018-11-02</th> <td>0.354662</td> <td>-1.224246</td> <td>-0.501209</td> <td>-1.490170</td> </tr> <tr> <th>2018-11-03</th> <td>-0.245834</td> <td>-1.049596</td> <td>2.366225</td> <td>0.637321</td> </tr> <tr> <th>2018-11-04</th> <td>-0.689940</td> <td>0.471282</td> <td>-1.417401</td> <td>0.268905</td> </tr> </tbody> </table> </div>

另外可以使用标签来选择


df.loc['2018-11-01']
A    2.197094
B    0.908913
C   -0.648029
D   -1.325547
Name: 2018-11-01 00:00:00, dtype: float64
#### 通过标签在多个轴上进行选择
df.loc[:,["A","B"]] # 等价于df[["A","B"]]

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>A</th> <th>B</th> </tr> </thead> <tbody> <tr> <th>2018-11-01</th> <td>2.197094</td> <td>0.908913</td> </tr> <tr> <th>2018-11-02</th> <td>0.354662</td> <td>-1.224246</td> </tr> <tr> <th>2018-11-03</th> <td>-0.245834</td> <td>-1.049596</td> </tr> <tr> <th>2018-11-04</th> <td>-0.689940</td> <td>0.471282</td> </tr> <tr> <th>2018-11-05</th> <td>-0.548041</td> <td>-0.841934</td> </tr> <tr> <th>2018-11-06</th> <td>-0.691073</td> <td>0.933016</td> </tr> <tr> <th>2018-11-07</th> <td>0.467075</td> <td>0.362407</td> </tr> </tbody> </table> </div>

df.loc["2018-11-01":"2018-11-03",["A","B"]]

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>A</th> <th>B</th> </tr> </thead> <tbody> <tr> <th>2018-11-01</th> <td>2.197094</td> <td>0.908913</td> </tr> <tr> <th>2018-11-02</th> <td>0.354662</td> <td>-1.224246</td> </tr> <tr> <th>2018-11-03</th> <td>-0.245834</td> <td>-1.049596</td> </tr> </tbody> </table> </div>

#### 获得一个标量数据
df.loc['2018-11-01','A']
2.1970938156943904

通过位置获取数据

df.iloc[3]  # 获得第四行的数据
A   -0.689940
B    0.471282
C   -1.417401
D    0.268905
Name: 2018-11-04 00:00:00, dtype: float64
df.iloc[1:3,1:4]  #  与numpy中的ndarray类似

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>B</th> <th>C</th> <th>D</th> </tr> </thead> <tbody> <tr> <th>2018-11-02</th> <td>-1.224246</td> <td>-0.501209</td> <td>-1.490170</td> </tr> <tr> <th>2018-11-03</th> <td>-1.049596</td> <td>2.366225</td> <td>0.637321</td> </tr> </tbody> </table> </div>

# 可以选取不连续的行或者列进行取值
df.iloc[[1,3],[1,3]]

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>B</th> <th>D</th> </tr> </thead> <tbody> <tr> <th>2018-11-02</th> <td>-1.224246</td> <td>-1.490170</td> </tr> <tr> <th>2018-11-04</th> <td>0.471282</td> <td>0.268905</td> </tr> </tbody> </table> </div>

#  对行进行切片处理
df.iloc[1:3,:]

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>A</th> <th>B</th> <th>C</th> <th>D</th> </tr> </thead> <tbody> <tr> <th>2018-11-02</th> <td>0.354662</td> <td>-1.224246</td> <td>-0.501209</td> <td>-1.490170</td> </tr> <tr> <th>2018-11-03</th> <td>-0.245834</td> <td>-1.049596</td> <td>2.366225</td> <td>0.637321</td> </tr> </tbody> </table> </div>

# 对列进行切片
df.iloc[:,1:4]

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>B</th> <th>C</th> <th>D</th> </tr> </thead> <tbody> <tr> <th>2018-11-01</th> <td>0.908913</td> <td>-0.648029</td> <td>-1.325547</td> </tr> <tr> <th>2018-11-02</th> <td>-1.224246</td> <td>-0.501209</td> <td>-1.490170</td> </tr> <tr> <th>2018-11-03</th> <td>-1.049596</td> <td>2.366225</td> <td>0.637321</td> </tr> <tr> <th>2018-11-04</th> <td>0.471282</td> <td>-1.417401</td> <td>0.268905</td> </tr> <tr> <th>2018-11-05</th> <td>-0.841934</td> <td>0.573128</td> <td>-1.055175</td> </tr> <tr> <th>2018-11-06</th> <td>0.933016</td> <td>1.857647</td> <td>0.775526</td> </tr> <tr> <th>2018-11-07</th> <td>0.362407</td> <td>2.319375</td> <td>-0.721314</td> </tr> </tbody> </table> </div>

# 获取特定的值
df.iloc[1,3]
-1.4901702546027098

布尔值索引

# 使用单列的数据作为条件进行筛选
df[df.A>0]

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>A</th> <th>B</th> <th>C</th> <th>D</th> </tr> </thead> <tbody> <tr> <th>2018-11-01</th> <td>2.197094</td> <td>0.908913</td> <td>-0.648029</td> <td>-1.325547</td> </tr> <tr> <th>2018-11-02</th> <td>0.354662</td> <td>-1.224246</td> <td>-0.501209</td> <td>-1.490170</td> </tr> <tr> <th>2018-11-07</th> <td>0.467075</td> <td>0.362407</td> <td>2.319375</td> <td>-0.721314</td> </tr> </tbody> </table> </div>

 #很少用到,很少使用这种大范围的条件进行筛选
df[df>0] 

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>A</th> <th>B</th> <th>C</th> <th>D</th> </tr> </thead> <tbody> <tr> <th>2018-11-01</th> <td>2.197094</td> <td>0.908913</td> <td>NaN</td> <td>NaN</td> </tr> <tr> <th>2018-11-02</th> <td>0.354662</td> <td>NaN</td> <td>NaN</td> <td>NaN</td> </tr> <tr> <th>2018-11-03</th> <td>NaN</td> <td>NaN</td> <td>2.366225</td> <td>0.637321</td> </tr> <tr> <th>2018-11-04</th> <td>NaN</td> <td>0.471282</td> <td>NaN</td> <td>0.268905</td> </tr> <tr> <th>2018-11-05</th> <td>NaN</td> <td>NaN</td> <td>0.573128</td> <td>NaN</td> </tr> <tr> <th>2018-11-06</th> <td>NaN</td> <td>0.933016</td> <td>1.857647</td> <td>0.775526</td> </tr> <tr> <th>2018-11-07</th> <td>0.467075</td> <td>0.362407</td> <td>2.319375</td> <td>NaN</td> </tr> </tbody> </table> </div>

# 使用isin()方法过滤
df2.head()

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>A</th> <th>B</th> <th>C</th> <th>D</th> <th>E</th> </tr> </thead> <tbody> <tr> <th>0</th> <td>1</td> <td>20181101</td> <td>3</td> <td>test</td> <td>1.5</td> </tr> <tr> <th>1</th> <td>1</td> <td>20181101</td> <td>3</td> <td>train</td> <td>1.5</td> </tr> <tr> <th>2</th> <td>1</td> <td>20181101</td> <td>3</td> <td>test</td> <td>1.5</td> </tr> <tr> <th>3</th> <td>1</td> <td>20181101</td> <td>3</td> <td>train</td> <td>1.5</td> </tr> </tbody> </table> </div>

df2[df2['D'].isin(['test'])]

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>A</th> <th>B</th> <th>C</th> <th>D</th> <th>E</th> </tr> </thead> <tbody> <tr> <th>0</th> <td>1</td> <td>20181101</td> <td>3</td> <td>test</td> <td>1.5</td> </tr> <tr> <th>2</th> <td>1</td> <td>20181101</td> <td>3</td> <td>test</td> <td>1.5</td> </tr> </tbody> </table> </div>

设定数值(类似于sql update 或者add)

  • 设定一个新的列
df['E'] = [1,2,3,4,5,6,7]
df

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>A</th> <th>B</th> <th>C</th> <th>D</th> <th>E</th> </tr> </thead> <tbody> <tr> <th>2018-11-01</th> <td>2.197094</td> <td>0.908913</td> <td>-0.648029</td> <td>-1.325547</td> <td>1</td> </tr> <tr> <th>2018-11-02</th> <td>0.354662</td> <td>-1.224246</td> <td>-0.501209</td> <td>-1.490170</td> <td>2</td> </tr> <tr> <th>2018-11-03</th> <td>-0.245834</td> <td>-1.049596</td> <td>2.366225</td> <td>0.637321</td> <td>3</td> </tr> <tr> <th>2018-11-04</th> <td>-0.689940</td> <td>0.471282</td> <td>-1.417401</td> <td>0.268905</td> <td>4</td> </tr> <tr> <th>2018-11-05</th> <td>-0.548041</td> <td>-0.841934</td> <td>0.573128</td> <td>-1.055175</td> <td>5</td> </tr> <tr> <th>2018-11-06</th> <td>-0.691073</td> <td>0.933016</td> <td>1.857647</td> <td>0.775526</td> <td>6</td> </tr> <tr> <th>2018-11-07</th> <td>0.467075</td> <td>0.362407</td> <td>2.319375</td> <td>-0.721314</td> <td>7</td> </tr> </tbody> </table> </div>

  • 通过标签设定新的值
df.loc['2018-11-01','E']= 10  # 第一行,E列的数据修改为10
df

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>A</th> <th>B</th> <th>C</th> <th>D</th> <th>E</th> </tr> </thead> <tbody> <tr> <th>2018-11-01</th> <td>2.197094</td> <td>0.908913</td> <td>-0.648029</td> <td>-1.325547</td> <td>10</td> </tr> <tr> <th>2018-11-02</th> <td>0.354662</td> <td>-1.224246</td> <td>-0.501209</td> <td>-1.490170</td> <td>2</td> </tr> <tr> <th>2018-11-03</th> <td>-0.245834</td> <td>-1.049596</td> <td>2.366225</td> <td>0.637321</td> <td>3</td> </tr> <tr> <th>2018-11-04</th> <td>-0.689940</td> <td>0.471282</td> <td>-1.417401</td> <td>0.268905</td> <td>4</td> </tr> <tr> <th>2018-11-05</th> <td>-0.548041</td> <td>-0.841934</td> <td>0.573128</td> <td>-1.055175</td> <td>5</td> </tr> <tr> <th>2018-11-06</th> <td>-0.691073</td> <td>0.933016</td> <td>1.857647</td> <td>0.775526</td> <td>6</td> </tr> <tr> <th>2018-11-07</th> <td>0.467075</td> <td>0.362407</td> <td>2.319375</td> <td>-0.721314</td> <td>7</td> </tr> </tbody> </table> </div>

df.iloc[1,4]=5000  # 第二行第五列数据修改为5000
df

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>A</th> <th>B</th> <th>C</th> <th>D</th> <th>E</th> </tr> </thead> <tbody> <tr> <th>2018-11-01</th> <td>2.197094</td> <td>0.908913</td> <td>-0.648029</td> <td>-1.325547</td> <td>10</td> </tr> <tr> <th>2018-11-02</th> <td>0.354662</td> <td>-1.224246</td> <td>-0.501209</td> <td>-1.490170</td> <td>5000</td> </tr> <tr> <th>2018-11-03</th> <td>-0.245834</td> <td>-1.049596</td> <td>2.366225</td> <td>0.637321</td> <td>3</td> </tr> <tr> <th>2018-11-04</th> <td>-0.689940</td> <td>0.471282</td> <td>-1.417401</td> <td>0.268905</td> <td>4</td> </tr> <tr> <th>2018-11-05</th> <td>-0.548041</td> <td>-0.841934</td> <td>0.573128</td> <td>-1.055175</td> <td>5</td> </tr> <tr> <th>2018-11-06</th> <td>-0.691073</td> <td>0.933016</td> <td>1.857647</td> <td>0.775526</td> <td>6</td> </tr> <tr> <th>2018-11-07</th> <td>0.467075</td> <td>0.362407</td> <td>2.319375</td> <td>-0.721314</td> <td>7</td> </tr> </tbody> </table> </div>

df3 =df.copy()
df3[df3<0]= -df3
df3  # 都变成非负数

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>A</th> <th>B</th> <th>C</th> <th>D</th> <th>E</th> </tr> </thead> <tbody> <tr> <th>2018-11-01</th> <td>2.197094</td> <td>0.908913</td> <td>0.648029</td> <td>1.325547</td> <td>10</td> </tr> <tr> <th>2018-11-02</th> <td>0.354662</td> <td>1.224246</td> <td>0.501209</td> <td>1.490170</td> <td>5000</td> </tr> <tr> <th>2018-11-03</th> <td>0.245834</td> <td>1.049596</td> <td>2.366225</td> <td>0.637321</td> <td>3</td> </tr> <tr> <th>2018-11-04</th> <td>0.689940</td> <td>0.471282</td> <td>1.417401</td> <td>0.268905</td> <td>4</td> </tr> <tr> <th>2018-11-05</th> <td>0.548041</td> <td>0.841934</td> <td>0.573128</td> <td>1.055175</td> <td>5</td> </tr> <tr> <th>2018-11-06</th> <td>0.691073</td> <td>0.933016</td> <td>1.857647</td> <td>0.775526</td> <td>6</td> </tr> <tr> <th>2018-11-07</th> <td>0.467075</td> <td>0.362407</td> <td>2.319375</td> <td>0.721314</td> <td>7</td> </tr> </tbody> </table> </div>

缺失值处理

df

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>A</th> <th>B</th> <th>C</th> <th>D</th> <th>E</th> </tr> </thead> <tbody> <tr> <th>2018-11-01</th> <td>2.197094</td> <td>0.908913</td> <td>-0.648029</td> <td>-1.325547</td> <td>10</td> </tr> <tr> <th>2018-11-02</th> <td>0.354662</td> <td>-1.224246</td> <td>-0.501209</td> <td>-1.490170</td> <td>5000</td> </tr> <tr> <th>2018-11-03</th> <td>-0.245834</td> <td>-1.049596</td> <td>2.366225</td> <td>0.637321</td> <td>3</td> </tr> <tr> <th>2018-11-04</th> <td>-0.689940</td> <td>0.471282</td> <td>-1.417401</td> <td>0.268905</td> <td>4</td> </tr> <tr> <th>2018-11-05</th> <td>-0.548041</td> <td>-0.841934</td> <td>0.573128</td> <td>-1.055175</td> <td>5</td> </tr> <tr> <th>2018-11-06</th> <td>-0.691073</td> <td>0.933016</td> <td>1.857647</td> <td>0.775526</td> <td>6</td> </tr> <tr> <th>2018-11-07</th> <td>0.467075</td> <td>0.362407</td> <td>2.319375</td> <td>-0.721314</td> <td>7</td> </tr> </tbody> </table> </div>

df['E']=[1,np.nan,2,np.nan,4,np.nan,6]
df.loc['2018-11-01':'2018-11-03','D']=np.nan
df

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>A</th> <th>B</th> <th>C</th> <th>D</th> <th>E</th> </tr> </thead> <tbody> <tr> <th>2018-11-01</th> <td>2.197094</td> <td>0.908913</td> <td>-0.648029</td> <td>NaN</td> <td>1.0</td> </tr> <tr> <th>2018-11-02</th> <td>0.354662</td> <td>-1.224246</td> <td>-0.501209</td> <td>NaN</td> <td>NaN</td> </tr> <tr> <th>2018-11-03</th> <td>-0.245834</td> <td>-1.049596</td> <td>2.366225</td> <td>NaN</td> <td>2.0</td> </tr> <tr> <th>2018-11-04</th> <td>-0.689940</td> <td>0.471282</td> <td>-1.417401</td> <td>0.268905</td> <td>NaN</td> </tr> <tr> <th>2018-11-05</th> <td>-0.548041</td> <td>-0.841934</td> <td>0.573128</td> <td>-1.055175</td> <td>4.0</td> </tr> <tr> <th>2018-11-06</th> <td>-0.691073</td> <td>0.933016</td> <td>1.857647</td> <td>0.775526</td> <td>NaN</td> </tr> <tr> <th>2018-11-07</th> <td>0.467075</td> <td>0.362407</td> <td>2.319375</td> <td>-0.721314</td> <td>6.0</td> </tr> </tbody> </table> </div>

  • 去掉缺失值的行
df4 = df.copy()
df4.dropna(how='any')

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>A</th> <th>B</th> <th>C</th> <th>D</th> <th>E</th> </tr> </thead> <tbody> <tr> <th>2018-11-05</th> <td>-0.548041</td> <td>-0.841934</td> <td>0.573128</td> <td>-1.055175</td> <td>4.0</td> </tr> <tr> <th>2018-11-07</th> <td>0.467075</td> <td>0.362407</td> <td>2.319375</td> <td>-0.721314</td> <td>6.0</td> </tr> </tbody> </table> </div>

df4.dropna(how='all')
# """DataFrame.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)""" 
# axis 轴,0或者1,分别对应index或者columns
# how 方式,'any'表示只要存在缺失值就删除,'all'表示全部为缺失值才删除
# thresh 阈值,只保留至少含有thresh个非缺失值的行(或列)
# subset 只在哪些字段中检查缺失值
# inplace 是否直接在原数据框上进行修改

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>A</th> <th>B</th> <th>C</th> <th>D</th> <th>E</th> </tr> </thead> <tbody> <tr> <th>2018-11-01</th> <td>2.197094</td> <td>0.908913</td> <td>-0.648029</td> <td>NaN</td> <td>1.0</td> </tr> <tr> <th>2018-11-02</th> <td>0.354662</td> <td>-1.224246</td> <td>-0.501209</td> <td>NaN</td> <td>NaN</td> </tr> <tr> <th>2018-11-03</th> <td>-0.245834</td> <td>-1.049596</td> <td>2.366225</td> <td>NaN</td> <td>2.0</td> </tr> <tr> <th>2018-11-04</th> <td>-0.689940</td> <td>0.471282</td> <td>-1.417401</td> <td>0.268905</td> <td>NaN</td> </tr> <tr> <th>2018-11-05</th> <td>-0.548041</td> <td>-0.841934</td> <td>0.573128</td> <td>-1.055175</td> <td>4.0</td> </tr> <tr> <th>2018-11-06</th> <td>-0.691073</td> <td>0.933016</td> <td>1.857647</td> <td>0.775526</td> <td>NaN</td> </tr> <tr> <th>2018-11-07</th> <td>0.467075</td> <td>0.362407</td> <td>2.319375</td> <td>-0.721314</td> <td>6.0</td> </tr> </tbody> </table> </div>

  • 对缺失值进行填充
df4.fillna(1000)

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>A</th> <th>B</th> <th>C</th> <th>D</th> <th>E</th> </tr> </thead> <tbody> <tr> <th>2018-11-01</th> <td>2.197094</td> <td>0.908913</td> <td>-0.648029</td> <td>1000.000000</td> <td>1.0</td> </tr> <tr> <th>2018-11-02</th> <td>0.354662</td> <td>-1.224246</td> <td>-0.501209</td> <td>1000.000000</td> <td>1000.0</td> </tr> <tr> <th>2018-11-03</th> <td>-0.245834</td> <td>-1.049596</td> <td>2.366225</td> <td>1000.000000</td> <td>2.0</td> </tr> <tr> <th>2018-11-04</th> <td>-0.689940</td> <td>0.471282</td> <td>-1.417401</td> <td>0.268905</td> <td>1000.0</td> </tr> <tr> <th>2018-11-05</th> <td>-0.548041</td> <td>-0.841934</td> <td>0.573128</td> <td>-1.055175</td> <td>4.0</td> </tr> <tr> <th>2018-11-06</th> <td>-0.691073</td> <td>0.933016</td> <td>1.857647</td> <td>0.775526</td> <td>1000.0</td> </tr> <tr> <th>2018-11-07</th> <td>0.467075</td> <td>0.362407</td> <td>2.319375</td> <td>-0.721314</td> <td>6.0</td> </tr> </tbody> </table> </div>

  • 判断数据是否为缺失值,得到由布尔值组成的数据框
pd.isnull(df4)

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>A</th> <th>B</th> <th>C</th> <th>D</th> <th>E</th> </tr> </thead> <tbody> <tr> <th>2018-11-01</th> <td>False</td> <td>False</td> <td>False</td> <td>True</td> <td>False</td> </tr> <tr> <th>2018-11-02</th> <td>False</td> <td>False</td> <td>False</td> <td>True</td> <td>True</td> </tr> <tr> <th>2018-11-03</th> <td>False</td> <td>False</td> <td>False</td> <td>True</td> <td>False</td> </tr> <tr> <th>2018-11-04</th> <td>False</td> <td>False</td> <td>False</td> <td>False</td> <td>True</td> </tr> <tr> <th>2018-11-05</th> <td>False</td> <td>False</td> <td>False</td> <td>False</td> <td>False</td> </tr> <tr> <th>2018-11-06</th> <td>False</td> <td>False</td> <td>False</td> <td>False</td> <td>True</td> </tr> <tr> <th>2018-11-07</th> <td>False</td> <td>False</td> <td>False</td> <td>False</td> <td>False</td> </tr> </tbody> </table> </div>

数据操作

# 统计运算一般情况下都会排除缺失值(NaN 不参与计算)
df4.mean() 
#  默认是对列进行求平均,沿着行方向也就是axis=0
A    0.120563
B   -0.062880
C    0.649962
D   -0.183015
E    3.250000
dtype: float64
df4.mean(axis=1)
#  沿着列方向求每行的平均
2018-11-01    0.864494
2018-11-02   -0.456931
2018-11-03    0.767699
2018-11-04   -0.341789
2018-11-05    0.425596
2018-11-06    0.718779
2018-11-07    1.685509
Freq: D, dtype: float64
 # 对维度不同、需要对齐的对象进行运算时,Pandas 会自动地沿着指定的维度进行广播:
s = pd.Series([1,3,4,np.nan,6,7,8],index=dates)
s
2018-11-01    1.0
2018-11-02    3.0
2018-11-03    4.0
2018-11-04    NaN
2018-11-05    6.0
2018-11-06    7.0
2018-11-07    8.0
Freq: D, dtype: float64
df4.sub(s,axis='index')

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>A</th> <th>B</th> <th>C</th> <th>D</th> <th>E</th> </tr> </thead> <tbody> <tr> <th>2018-11-01</th> <td>1.197094</td> <td>-0.091087</td> <td>-1.648029</td> <td>NaN</td> <td>0.0</td> </tr> <tr> <th>2018-11-02</th> <td>-2.645338</td> <td>-4.224246</td> <td>-3.501209</td> <td>NaN</td> <td>NaN</td> </tr> <tr> <th>2018-11-03</th> <td>-4.245834</td> <td>-5.049596</td> <td>-1.633775</td> <td>NaN</td> <td>-2.0</td> </tr> <tr> <th>2018-11-04</th> <td>NaN</td> <td>NaN</td> <td>NaN</td> <td>NaN</td> <td>NaN</td> </tr> <tr> <th>2018-11-05</th> <td>-6.548041</td> <td>-6.841934</td> <td>-5.426872</td> <td>-7.055175</td> <td>-2.0</td> </tr> <tr> <th>2018-11-06</th> <td>-7.691073</td> <td>-6.066984</td> <td>-5.142353</td> <td>-6.224474</td> <td>NaN</td> </tr> <tr> <th>2018-11-07</th> <td>-7.532925</td> <td>-7.637593</td> <td>-5.680625</td> <td>-8.721314</td> <td>-2.0</td> </tr> </tbody> </table> </div>

df4

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>A</th> <th>B</th> <th>C</th> <th>D</th> <th>E</th> </tr> </thead> <tbody> <tr> <th>2018-11-01</th> <td>2.197094</td> <td>0.908913</td> <td>-0.648029</td> <td>NaN</td> <td>1.0</td> </tr> <tr> <th>2018-11-02</th> <td>0.354662</td> <td>-1.224246</td> <td>-0.501209</td> <td>NaN</td> <td>NaN</td> </tr> <tr> <th>2018-11-03</th> <td>-0.245834</td> <td>-1.049596</td> <td>2.366225</td> <td>NaN</td> <td>2.0</td> </tr> <tr> <th>2018-11-04</th> <td>-0.689940</td> <td>0.471282</td> <td>-1.417401</td> <td>0.268905</td> <td>NaN</td> </tr> <tr> <th>2018-11-05</th> <td>-0.548041</td> <td>-0.841934</td> <td>0.573128</td> <td>-1.055175</td> <td>4.0</td> </tr> <tr> <th>2018-11-06</th> <td>-0.691073</td> <td>0.933016</td> <td>1.857647</td> <td>0.775526</td> <td>NaN</td> </tr> <tr> <th>2018-11-07</th> <td>0.467075</td> <td>0.362407</td> <td>2.319375</td> <td>-0.721314</td> <td>6.0</td> </tr> </tbody> </table> </div>

df4.apply(np.cumsum)

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>A</th> <th>B</th> <th>C</th> <th>D</th> <th>E</th> </tr> </thead> <tbody> <tr> <th>2018-11-01</th> <td>2.197094</td> <td>0.908913</td> <td>-0.648029</td> <td>NaN</td> <td>1.0</td> </tr> <tr> <th>2018-11-02</th> <td>2.551755</td> <td>-0.315333</td> <td>-1.149238</td> <td>NaN</td> <td>NaN</td> </tr> <tr> <th>2018-11-03</th> <td>2.305922</td> <td>-1.364929</td> <td>1.216987</td> <td>NaN</td> <td>3.0</td> </tr> <tr> <th>2018-11-04</th> <td>1.615982</td> <td>-0.893647</td> <td>-0.200415</td> <td>0.268905</td> <td>NaN</td> </tr> <tr> <th>2018-11-05</th> <td>1.067942</td> <td>-1.735581</td> <td>0.372713</td> <td>-0.786270</td> <td>7.0</td> </tr> <tr> <th>2018-11-06</th> <td>0.376869</td> <td>-0.802565</td> <td>2.230360</td> <td>-0.010745</td> <td>NaN</td> </tr> <tr> <th>2018-11-07</th> <td>0.843944</td> <td>-0.440158</td> <td>4.549735</td> <td>-0.732059</td> <td>13.0</td> </tr> </tbody> </table> </div>

df4.apply(lambda x: x.max()-x.min())
A    2.888166
B    2.157262
C    3.783626
D    1.830700
E    5.000000
dtype: float64

统计个数与离散化

s = pd.Series(np.random.randint(0,7,size=15))
s
0     1
1     6
2     3
3     1
4     1
5     0
6     4
7     1
8     3
9     4
10    6
11    1
12    4
13    3
14    5
dtype: int32
s.value_counts()
# 统计元素的个数,并按照元素统计量进行排序,未出现的元素不会显示出来
1    5
4    3
3    3
6    2
5    1
0    1
dtype: int64
s.reindex(range(0,7))
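# 注意:这里是按索引 0~6 对原序列 s 重新索引,输出的是 s 的前 7 个元素,而不是个数统计;
# 若要按固定顺序输出元素的个数统计,应使用 s.value_counts().reindex(range(0,7)),
# 未出现的元素结果为 NaN(可加 fill_value=0 补零)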
0    1
1    6
2    3
3    1
4    1
5    0
6    4
dtype: int32
s.mode()
#  众数 
0    1
dtype: int32
  • 离散化
# 连续值转化为离散值,可以使用 cut 函数(bins based on values,按取值范围划分区间)
# 或 qcut 函数(bins based on sample quantiles,按样本分位数划分区间)
arr = np.random.randint(0,20,size=15)  # 在 [0, 20) 内按离散均匀分布抽取的随机整数(并非正态分布)
arr
array([ 3, 14, 10,  2,  2,  0, 17, 13,  7,  0, 15, 14,  4, 19,  9])
factor = pd.cut(arr,3)
factor
[(-0.019, 6.333], (12.667, 19.0], (6.333, 12.667], (-0.019, 6.333], (-0.019, 6.333], ..., (12.667, 19.0], (12.667, 19.0], (-0.019, 6.333], (12.667, 19.0], (6.333, 12.667]]
Length: 15
Categories (3, interval[float64]): [(-0.019, 6.333] < (6.333, 12.667] < (12.667, 19.0]]
pd.value_counts(factor)
(12.667, 19.0]     6
(-0.019, 6.333]    6
(6.333, 12.667]    3
dtype: int64
factor1 = pd.cut(arr,[-1,5,10,15,20])
pd.value_counts(factor1)
(-1, 5]     6
(10, 15]    4
(5, 10]     3
(15, 20]    2
dtype: int64
factor2 = pd.qcut(arr,[0,0.25,0.5,0.75,1])
pd.value_counts(factor2)
(9.0, 14.0]      4
(2.5, 9.0]       4
(-0.001, 2.5]    4
(14.0, 19.0]     3
dtype: int64
展开阅读全文
打赏
0
0 收藏
分享
加载中
更多评论
打赏
0 评论
0 收藏
0
分享
返回顶部
顶部