自定义函数方法

原创
2017/07/09 15:37
阅读数 98

使用 Pandas 的 apply() 函数对数据进行自定义操作

import pandas as pd
import numpy as np
# Read titanic_train.csv into a DataFrame
titanic_survival = pd.read_csv('titanic_train.csv')

# Print the first 5 rows
print(titanic_survival.head())
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
# Get the 'Age' column
# NaN stands for "not a number" and indicates a missing value
age = titanic_survival['Age']
print(age[:10])
0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5     NaN
6    54.0
7     2.0
8    27.0
9    14.0
Name: Age, dtype: float64
# Series.isnull() is the method form of pandas.isnull(): it produces a
# Series of True/False values, True wherever the age is missing
age_is_null = age.isnull()
print(age_is_null[:10])
0    False
1    False
2    False
3    False
4    False
5     True
6    False
7    False
8    False
9    False
Name: Age, dtype: bool
# Boolean-mask selection: keep only the entries whose age is missing
age_null_true = age.loc[age_is_null]
print(age_null_true[:10])
5    NaN
17   NaN
19   NaN
26   NaN
28   NaN
29   NaN
31   NaN
32   NaN
36   NaN
42   NaN
Name: Age, dtype: float64
# Count the passengers whose age is missing and print that number
age_null_count = age_null_true.shape[0]
print(age_null_count)
177
# Deliberately naive mean: the result of this is that mean_age is NaN,
# because any calculation that involves a null value also yields a null value
mean_age = sum(titanic_survival['Age']) / len(titanic_survival['Age'])
print(mean_age)
nan
# Filter out the missing ages before averaging. `~` inverts the boolean mask,
# which is the idiomatic way to select the entries that are not null
good_ages = titanic_survival['Age'][~age_is_null]
correct_mean_age = sum(good_ages) / len(good_ages)
print(correct_mean_age)
29.6991176471
# Missing data is so common that many pandas methods automatically filter
# for it; Series.mean() needs no manual null handling
correct_mean_age = titanic_survival['Age'].mean()
print(correct_mean_age)
29.69911764705882
# Mean fare for each passenger class, collected in a dict keyed by class
passgenger_class = [1, 2, 3]
fare = {
    pclass: titanic_survival.loc[titanic_survival['Pclass'] == pclass, 'Fare'].mean()
    for pclass in passgenger_class
}
print(fare)
{1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}
# Survival rate per passenger class: the mean of 'Survived' within each
# 'Pclass'. The aggregation is given as the string 'mean' instead of np.mean,
# because recent pandas releases warn when a NumPy reduction is passed here.
pclass_survived = titanic_survival.pivot_table(
    index='Pclass', values='Survived', aggfunc='mean'
)
print(pclass_survived)
Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64
# Aggregate both 'Age' and 'Survived' per passenger class; no aggfunc is
# given, so pivot_table uses its default aggregation
pclass_age = titanic_survival.pivot_table(index='Pclass', values=['Age', 'Survived'])
print(pclass_age)
              Age  Survived
Pclass                     
1       38.233441  0.629630
2       29.877630  0.472826
3       25.140620  0.242363
# Fetch the 'Name' value of the row labelled 19 with one .loc lookup
# (row label, column label) instead of chaining two indexing steps
name_row_19 = titanic_survival.loc[19, 'Name']
print(name_row_19)
Masselmani, Mrs. Fatima
# Sort by age, oldest first; the rows keep their original index labels
new_titanic_survival = titanic_survival.sort_values('Age', ascending=False)
# Renumber the rows from 0 and discard the old labels. 'Pclass' is a column,
# not an index level, so it must not be passed as `level`
new_titanic_survival = new_titanic_survival.reset_index(drop=True)
# .loc slices by label and includes both endpoints: rows 0, 1 and 2
print(new_titanic_survival.loc[0:2])
   level_0  index  PassengerId  Survived  Pclass  \
0        0    630          631         1       1   
1        1    851          852         0       3   
2        2    493          494         0       1   

                                   Name   Sex   Age  SibSp  Parch    Ticket  \
0  Barkworth, Mr. Algernon Henry Wilson  male  80.0      0      0     27042   
1                   Svensson, Mr. Johan  male  74.0      0      0    347060   
2               Artagaveytia, Mr. Ramon  male  71.0      0      0  PC 17609   

      Fare Cabin Embarked  
0  30.0000   A23        S  
1   7.7750   NaN        S  
2  49.5042   NaN        C  
def hundredth_row(column):
    """Return the hundredth item of ``column``, selected by position."""
    return column.iloc[99]
# With the default axis, apply() calls the function once per column. The
# result gets its own name so the hundredth_row function is not overwritten
hundredth_values = titanic_survival.apply(hundredth_row)
print(hundredth_values)
PassengerId                  100
Survived                       0
Pclass                         2
Name           Kantor, Mr. Sinai
Sex                         male
Age                           34
SibSp                          1
Parch                          0
Ticket                    244367
Fare                          26
Cabin                        NaN
Embarked                       S
dtype: object
# By passing in the axis = 1 argument, we can use the DataFrame.apply method
# to iterate rows instead of columns
def which_class(row):
    """Translate a row's 'Pclass' value into a readable class label.

    Returns 'Unknown' when the value is missing, the matching label for
    1, 2 or 3, and None for any other value.
    """
    # NOTE(review): 'ThirdClass' has no space, unlike the other two labels;
    # it is kept unchanged so the result matches the printed output.
    labels = {1: 'First Class', 2: 'Second Class', 3: 'ThirdClass'}
    pclass = row['Pclass']
    if pd.isnull(pclass):
        return 'Unknown'
    return labels.get(pclass)
# axis=1 makes apply() pass each row to which_class; print the first 10 labels
classes = titanic_survival.apply(which_class, axis=1)
print(classes[:10])
0      ThirdClass
1     First Class
2      ThirdClass
3     First Class
4      ThirdClass
5      ThirdClass
6     First Class
7      ThirdClass
8      ThirdClass
9    Second Class
dtype: object
def is_minor(row):
    """Return True when the row's 'Age' is below 18, otherwise False.

    A missing age (NaN) is not less than 18, so it yields False.
    """
    # bool() turns a NumPy comparison result into a plain Python bool
    return bool(row['Age'] < 18)
# Apply is_minor to every row (axis=1) and print the first 5 results
minors = titanic_survival.apply(is_minor, axis=1)
print(minors[:5])
0    False
1    False
2    False
3    False
4    False
dtype: bool
def generate_age_label(row):
    """Classify a row by its 'Age' value.

    A missing age gives 'Unknown', an age below 18 gives 'minor', and every
    other age gives 'adult'.
    """
    years = row['Age']
    if pd.isnull(years):
        return 'Unknown'
    return 'minor' if years < 18 else 'adult'
# Apply generate_age_label to every row (axis=1) and print the first 5 labels
age_label = titanic_survival.apply(generate_age_label, axis=1)
print(age_label[:5])
0    adult
1    adult
2    adult
3    adult
4    adult
dtype: object
# Store the labels as a new 'age_label' column, then aggregate 'Survived'
# per label with pivot_table's default aggregation
titanic_survival['age_label'] = age_label
age_group_survival = titanic_survival.pivot_table(index='age_label', values='Survived')
print(age_group_survival)
age_label
Unknown    0.293785
adult      0.381032
minor      0.539823
Name: Survived, dtype: float64
展开阅读全文
打赏
0
0 收藏
分享
加载中
更多评论
打赏
0 评论
0 收藏
0
分享
返回顶部
顶部