# 量化投资学习笔记19——回归分析:实操，泰坦尼克号乘客生还机会预测，线性回归方法。

2019/04/10 10:10

# Summarise both data sets. DataFrame.info() writes its report to stdout
# itself and returns None, so it is called directly; wrapping it in print()
# would only append a stray "None" line to the output.
train_data.info()
test_data.info()


# For the SibSp column and then the Parch column, count the training rows in
# every (column value, Survived) combination and print the resulting table.
for relatives in ("SibSp", "Parch"):
    g = train_data.groupby([relatives, "Survived"])
    # count() tallies non-null cells per column; PassengerId is used as the
    # group size (assumes it has no missing values — TODO confirm).
    df = pd.DataFrame(g.count()["PassengerId"])
    print(df)


# Report how many values are missing in each column of both data sets.
print(train_data.isnull().sum())
print(test_data.isnull().sum())


# Fill missing ages with the median age of the same data set. The result is
# assigned back to the column: calling fillna(inplace=True) on the Series
# returned by frame["Age"] is chained assignment, which newer pandas warns
# about and which does not update the frame under Copy-on-Write.
train_data["Age"] = train_data["Age"].fillna(train_data["Age"].median())
test_data["Age"] = test_data["Age"].fillna(test_data["Age"].median())
# Missing Embarked values in the training set are filled with 'S'.
train_data["Embarked"] = train_data["Embarked"].fillna('S')
# Reduce Cabin to a presence flag: 1 where a value was recorded, 0 where it
# was missing. Building the flag in one step yields an integer column rather
# than an object column that merely holds ints.
train_data["Cabin"] = train_data["Cabin"].notnull().astype(int)
test_data["Cabin"] = test_data["Cabin"].notnull().astype(int)


 # Encode the Sex column numerically in both data sets: male -> 0, female -> 1.
# Series.map converts the whole column in one pass and produces a numeric
# dtype; writing ints into the string column through .loc leaves it as an
# object column. Any value other than the two labels below becomes NaN, so
# this step is meant to run once on the raw labels.
sex_codes = {"male": 0, "female": 1}
train_data["Sex"] = train_data["Sex"].map(sex_codes)
test_data["Sex"] = test_data["Sex"].map(sex_codes)


 # Encode the Embarked column numerically: C -> 0, Q -> 1, S -> 2.
# Series.map converts the whole column in one pass and produces a numeric
# dtype. Missing entries stay missing (NaN), and any value other than the
# three labels below also becomes NaN.
embarked_codes = {"C": 0, "Q": 1, "S": 2}
train_data["Embarked"] = train_data["Embarked"].map(embarked_codes)
test_data["Embarked"] = test_data["Embarked"].map(embarked_codes)



# Keep only the columns used for modelling: the candidate features plus the
# Survived target.
columns = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked', 'Survived', 'Cabin',
]
new_train_data = train_data[columns]
# DataFrame.info() prints its own report and returns None, so it is not
# wrapped in print() (that would emit an extra "None" line).
new_train_data.info()


OK，可以开始建模了。先用刚学的线性回归模型：选定特征变量，建立线性回归模型。

 # Feature columns fed to the linear regression model.
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked', 'Cabin']
LR = LinearRegression()


# 5-fold cross-validation over contiguous, unshuffled folds. random_state is
# deliberately not passed: it only has meaning together with shuffle=True, and
# recent scikit-learn releases raise ValueError when it is set without it.
kf = KFold(n_splits=5)
train_target = new_train_data["Survived"]
# Cast to float so columns that were label-encoded in place (and may still
# carry object dtype) reach the estimator as plain numbers.
train_features = new_train_data[predictors].astype(float)
# Regression outputs at or above this value are labelled 1, the rest 0.
# NOTE(review): 0.6 rather than 0.5 is kept from the original — confirm.
survival_threshold = 0.6
accuracys = []
for train, test in kf.split(train_features):
    # KFold yields positional indices, so rows are selected with iloc; loc
    # would treat them as index labels and only works on a default RangeIndex.
    LR.fit(train_features.iloc[train], train_target.iloc[train])
    raw_pred = LR.predict(train_features.iloc[test])
    # Turn the continuous regression output into hard 0/1 labels.
    pred = np.where(raw_pred >= survival_threshold, 1.0, 0.0)
    # Accuracy = share of held-out rows whose label equals Survived.
    accuracy = np.mean(pred == train_target.iloc[test].to_numpy())
    accuracys.append(accuracy)
# Mean accuracy across the five folds.
print(np.mean(accuracys))


 # Report the fitted model and plot its predictions against the observations.
# NOTE(review): LR is not refit in this section; it still carries whatever fit
# was performed last (the final fold of the cross-validation loop) — confirm
# that is intended before reading the coefficients as a full-data model.
print("回归系数:", LR.coef_)
print("截距:", LR.intercept_)
X = new_train_data[predictors]
y = new_train_data["Survived"]
Y = LR.predict(X)
# LinearRegression.score returns the R^2 of the predictions against y.
print("模型评分:", LR.score(X, y))
# One panel per predictor on a 2x4 grid (room for up to eight predictors):
# "*" marks the model's predictions, "o" marks the observed Survived values.
# The per-feature values get their own name so X keeps the full predictor
# matrix after the loop.
for position, index in enumerate(predictors, start=1):
    plt.subplot(2, 4, position)
    feature_values = new_train_data[index]
    plt.plot(feature_values, Y, "*")
    plt.plot(feature_values, y, "o")
# Save the finished figure once all panels are drawn.
plt.savefig("LRtest.png")


 # Inspect the model's hypothesis tests with a statsmodels OLS fit.
# Cast to float: the label-encoded columns may still have object dtype, which
# statsmodels rejects when converting the design matrix.
X = new_train_data[predictors].astype(float)
# Regress the OBSERVED Survived values on the predictors. Using the model's
# own predictions as the dependent variable would test a fit of the model to
# itself, making the reported statistics meaningless.
# NOTE(review): no constant column is added, so this OLS has no intercept,
# unlike a default LinearRegression; consider sm.add_constant(X) if the two
# models are meant to match — confirm get_index copes with the extra term.
model = sm.OLS(new_train_data["Survived"], X).fit()
# get_index is a helper defined outside this section; its output is printed
# below as the regression result.
res = get_index(model)
print("回归参数", model.params)
print("回归结果", res)
print(model.summary())


0
0 收藏

0 评论
0 收藏
0