2021/09/10 16:37

# 逻辑回归

在逻辑回归中，进来一个 x，经过 f(x) 的运算，会得到一个概率值 p̂ = f(x)。之后我们根据这个概率值来进行分类。

import numpy as np
import matplotlib.pyplot as plt

if __name__ == "__main__":

    def sigmoid(t):
        """Logistic sigmoid: maps any real t into the interval (0, 1)."""
        return 1 / (1 + np.exp(-t))

    # Visualize the S-shaped sigmoid curve over [-10, 10].
    t_values = np.linspace(-10, 10, 500)
    probabilities = sigmoid(t_values)
    plt.plot(t_values, probabilities)
    plt.show()

损失函数 cost = -y·log(p̂) - (1-y)·log(1-p̂)，这个函数和上面的分段损失函数是等价的，原因也很简单：当 y=1 的时候，该式就等于 -log(p̂)；当 y=0 的时候，该式就等于 -log(1-p̂)。

import numpy as np
from math import sqrt

def accuracy_score(y_true, y_predict):
    """Return the fraction of entries where y_predict matches y_true."""
    assert len(y_true) == len(y_predict), \
        "the size of y_true must be equal to the size of y_predict"

    n_correct = np.sum(y_true == y_predict)
    return n_correct / len(y_true)

def mean_squared_error(y_true, y_predict):
    """Return the mean squared error (MSE) between y_true and y_predict."""
    assert len(y_true) == len(y_predict), \
        "the size of y_true must be equal to the size of y_predict"

    squared_errors = (y_true - y_predict) ** 2
    return np.sum(squared_errors) / len(y_true)

def root_mean_squared_error(y_true, y_predict):
    """Return the RMSE, i.e. the square root of the MSE."""
    mse = mean_squared_error(y_true, y_predict)
    return sqrt(mse)

def mean_absolute_error(y_true, y_predict):
    """Return the mean absolute error (MAE) between y_true and y_predict."""
    absolute_errors = np.absolute(y_true - y_predict)
    return np.sum(absolute_errors) / len(y_true)

def r2_score(y_true, y_predict):
    """Return the R^2 score: 1 - MSE(y_true, y_predict) / Var(y_true)."""
    mse = mean_squared_error(y_true, y_predict)
    return 1 - mse / np.var(y_true)
import numpy as np
from .metrics import accuracy_score

class LogisticRegression:
    """Binary logistic regression trained with batch gradient descent."""

    def __init__(self):
        # Untrained model; parameters are filled in by fit().
        self.coef = None  # feature coefficients (theta[1:])
        self.interception = None  # intercept term (theta[0])
        self._theta = None  # full parameter vector, intercept included

    def _sigmoid(self, t):
        # Logistic sigmoid: maps any real t into a probability in (0, 1).
        return 1 / (1 + np.exp(-t))

    def fit(self, X_train, y_train, eta=0.01, n_iters=1e4):
        """Fit the model on X_train / y_train (0-1 labels) by gradient descent.

        :param X_train: feature matrix, shape (n_samples, n_features)
        :param y_train: binary label vector, shape (n_samples,)
        :param eta: learning rate (step size)
        :param n_iters: maximum number of iterations
        :return: self
        """
        assert X_train.shape[0] == y_train.shape[0], \
            "X_train的列数必须等于y_train的长度"

        def J(theta, X_b, y):
            # Cross-entropy (log) loss for the current theta.
            p_hat = self._sigmoid(X_b.dot(theta))
            try:
                return - np.sum(y * np.log(p_hat) + (1 - y) * np.log(1 - p_hat)) / len(y)
            except Exception:
                # p_hat exactly 0 or 1 can make the log blow up; treat as
                # an infinitely bad loss instead of crashing.
                return float('inf')

        def dJ(theta, X_b, y):
            # Gradient of J with respect to theta (vectorized form).
            return X_b.T.dot(self._sigmoid(X_b.dot(theta)) - y) / len(X_b)

        def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
            """
            Batch gradient descent.
            :param X_b: feature matrix with a leading column of ones
            :param y: label vector
            :param initial_theta: starting parameter vector
            :param eta: learning rate / step size
            :param n_iters: maximum number of iterations
            :param epsilon: convergence tolerance on the loss
            :return: the fitted theta
            """
            theta = initial_theta
            # Actual iteration counter.
            i_iter = 0
            while i_iter < n_iters:
                # BUG FIX: the gradient was never computed before the update,
                # so `gradient` below was an undefined name (NameError).
                gradient = dJ(theta, X_b, y)
                last_theta = theta
                # Step against the gradient to decrease the loss.
                theta = theta - eta * gradient
                # Stop once the loss no longer changes appreciably.
                if abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon:
                    break
                i_iter += 1
            return theta

        # Prepend the column of ones (virtual feature for the intercept).
        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        initial_theta = np.zeros(X_b.shape[1])
        self._theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters)
        self.interception = self._theta[0]
        self.coef = self._theta[1:]
        return self

    def predict_proba(self, X_predict):
        """Return the estimated probability P(y=1 | x) for each row of X_predict."""
        assert self.interception is not None and self.coef is not None, \
            "开始预测前必须fit"
        assert X_predict.shape[1] == len(self.coef), \
            "预测的特征数必须与训练的特征数相等"
        X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])
        return self._sigmoid(X_b.dot(self._theta))

    def predict(self, X_predict):
        """Return hard 0/1 predictions using the 0.5 probability threshold."""
        assert self.interception is not None and self.coef is not None, \
            "开始预测前必须fit"
        assert X_predict.shape[1] == len(self.coef), \
            "预测的特征数必须与训练的特征数相等"
        proba = self.predict_proba(X_predict)
        return np.array(proba >= 0.5, dtype='int')

    def score(self, X_test, y_test):
        """Return the classification accuracy of the model on X_test / y_test."""
        y_predict = self.predict(X_test)
        return accuracy_score(y_test, y_predict)

    def __repr__(self):
        return "LogisticRegression()"

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

if __name__ == "__main__":

    # BUG FIX: `iris` was used without ever being loaded.
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    # Iris has 3 classes but this logistic regression is binary, so keep
    # only classes 0/1, and only the first two features for 2-D plotting.
    X = X[y < 2, :2]
    y = y[y < 2]
    print(X.shape)
    print(y.shape)
    plt.scatter(X[y == 0, 0], X[y == 0, 1], color='red')
    plt.scatter(X[y == 1, 0], X[y == 1, 1], color='blue')
    plt.show()

(100, 2)
(100,)

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from playLA.model_selection import train_test_split
from playLA.LogisticRegression import LogisticRegression

if __name__ == "__main__":

    # BUG FIX: `iris` was used without ever being loaded.
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    # Binary problem: keep only classes 0/1 and the first two features.
    X = X[y < 2, :2]
    y = y[y < 2]
    print(X.shape)
    print(y.shape)

    X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)
    # Classification accuracy on the held-out test set.
    print(log_reg.score(X_test, y_test))
    # Probability estimate for each test sample.
    print(log_reg.predict_proba(X_test))
    # True labels vs. hard predictions.
    print(y_test)
    print(log_reg.predict(X_test))

(100, 2)
(100,)
1.0
[0.92972035 0.98664939 0.14852024 0.01685947 0.0369836  0.0186637
0.04936918 0.99669244 0.97993941 0.74524655 0.04473194 0.00339285
0.26131273 0.0369836  0.84192923 0.79892262 0.82890209 0.32358166
0.06535323 0.20735334]
[1 1 0 0 0 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0]
[1 1 0 0 0 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0]

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from playLA.model_selection import train_test_split
from playLA.LogisticRegression import LogisticRegression

if __name__ == "__main__":

    # BUG FIX: `iris` was used without ever being loaded.
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    # Binary problem: keep only classes 0/1 and the first two features.
    X = X[y < 2, :2]
    y = y[y < 2]
    print(X.shape)
    print(y.shape)

    X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)
    # Accuracy, probabilities, labels and predictions.
    print(log_reg.score(X_test, y_test))
    print(log_reg.predict_proba(X_test))
    print(y_test)
    print(log_reg.predict(X_test))
    # Fitted coefficients of the linear decision function.
    print(log_reg.coef)
    # Fitted intercept.
    print(log_reg.interception)

(100, 2)
(100,)
1.0
[0.92972035 0.98664939 0.14852024 0.01685947 0.0369836  0.0186637
0.04936918 0.99669244 0.97993941 0.74524655 0.04473194 0.00339285
0.26131273 0.0369836  0.84192923 0.79892262 0.82890209 0.32358166
0.06535323 0.20735334]
[1 1 0 0 0 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0]
[1 1 0 0 0 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0]
[ 3.01796521 -5.04447145]
-0.6937719272911228

当 θᵀ·x_b ≥ 0 的时候，我们的概率估计值 p̂ ≥ 0.5，此时我们将新来的样本 x 分类为 y=1；当 θᵀ·x_b < 0 的时候，我们的概率估计值 p̂ < 0.5，此时我们将新来的样本 x 分类为 y=0。换句话说，θᵀ·x_b = 0 就是我们为新来的样本分类为 1 还是 0 的边界点，这个位置就被称为决策边界。

def x2(x1):
    """x2 coordinate of the linear decision boundary at position x1.

    Solves theta0 + theta1 * x1 + theta2 * x2 = 0 for x2.
    """
    numerator = - log_reg.coef[0] * x1 - log_reg.interception
    return numerator / log_reg.coef[1]

# Overlay the boundary line on the scatter plot of all samples.
x1_plot = np.linspace(4, 8, 1000)
x2_plot = x2(x1_plot)
plt.scatter(X[y == 0, 0], X[y == 0, 1], color='red')
plt.scatter(X[y == 1, 0], X[y == 1, 1], color='blue')
plt.plot(x1_plot, x2_plot)
plt.show()

def x2(x1):
    """x2 coordinate of the linear decision boundary at position x1.

    Solves theta0 + theta1 * x1 + theta2 * x2 = 0 for x2.
    """
    numerator = - log_reg.coef[0] * x1 - log_reg.interception
    return numerator / log_reg.coef[1]

x1_plot = np.linspace(4, 8, 1000)
x2_plot = x2(x1_plot)

# Plot only the test points against the boundary line.
plt.scatter(X_test[y_test == 0, 0], X_test[y_test == 0, 1], color='red')
plt.scatter(X_test[y_test == 1, 0], X_test[y_test == 1, 1], color='blue')
plt.plot(x1_plot, x2_plot)
plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from playLA.model_selection import train_test_split
from playLA.LogisticRegression import LogisticRegression
from matplotlib.colors import ListedColormap

if __name__ == "__main__":

    # BUG FIX: `iris` was used without ever being loaded.
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    # Binary problem: keep only classes 0/1 and the first two features.
    X = X[y < 2, :2]
    y = y[y < 2]

    X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)

    def plot_decision_boundary(model, axis):
        """Shade the model's predicted class over the rectangle
        axis = [x_min, x_max, y_min, y_max] (~100 grid points per unit)."""
        x0, x1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        X_new = np.c_[x0.ravel(), x1.ravel()]
        y_predict = model.predict(X_new)
        zz = y_predict.reshape(x0.shape)
        custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        # FIX: `linewidth` is not a valid contourf keyword; removed.
        plt.contourf(x0, x1, zz, cmap=custom_cmap)

    plot_decision_boundary(log_reg, axis=[4, 7.5, 1.5, 4.5])
    plt.scatter(X[y == 0, 0], X[y == 0, 1], color='red')
    plt.scatter(X[y == 1, 0], X[y == 1, 1], color='blue')
    plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from playLA.model_selection import train_test_split
from playLA.LogisticRegression import LogisticRegression
from matplotlib.colors import ListedColormap
from sklearn.neighbors import KNeighborsClassifier

if __name__ == "__main__":

    # BUG FIX: `iris` was used without ever being loaded.
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    # Binary problem: keep only classes 0/1 and the first two features.
    X = X[y < 2, :2]
    y = y[y < 2]

    X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)

    def plot_decision_boundary(model, axis):
        """Shade the model's predicted class over the rectangle
        axis = [x_min, x_max, y_min, y_max] (~100 grid points per unit)."""
        x0, x1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        X_new = np.c_[x0.ravel(), x1.ravel()]
        y_predict = model.predict(X_new)
        zz = y_predict.reshape(x0.shape)
        custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        # FIX: `linewidth` is not a valid contourf keyword; removed.
        plt.contourf(x0, x1, zz, cmap=custom_cmap)

    # Decision boundary of a KNN classifier on the same binary data.
    knn_clf = KNeighborsClassifier()
    knn_clf.fit(X_train, y_train)
    print(knn_clf.score(X_test, y_test))
    plot_decision_boundary(knn_clf, axis=[4, 7.5, 1.5, 4.5])
    plt.scatter(X[y == 0, 0], X[y == 0, 1], color='red')
    plt.scatter(X[y == 1, 0], X[y == 1, 1], color='blue')
    plt.show()

1.0

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from playLA.model_selection import train_test_split
from playLA.LogisticRegression import LogisticRegression
from matplotlib.colors import ListedColormap
from sklearn.neighbors import KNeighborsClassifier

if __name__ == "__main__":

    # BUG FIX: `iris` was used without ever being loaded.
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    # Binary problem: keep only classes 0/1 and the first two features.
    X = X[y < 2, :2]
    y = y[y < 2]

    X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)

    def plot_decision_boundary(model, axis):
        """Shade the model's predicted class over the rectangle
        axis = [x_min, x_max, y_min, y_max] (~100 grid points per unit)."""
        x0, x1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        X_new = np.c_[x0.ravel(), x1.ravel()]
        y_predict = model.predict(X_new)
        zz = y_predict.reshape(x0.shape)
        custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        # FIX: `linewidth` is not a valid contourf keyword; removed.
        plt.contourf(x0, x1, zz, cmap=custom_cmap)

    knn_clf = KNeighborsClassifier()
    knn_clf.fit(X_train, y_train)

    # KNN handles multi-class directly: decision regions for all 3 classes.
    knn_clf_all = KNeighborsClassifier()
    knn_clf_all.fit(iris.data[:, :2], iris.target)
    plot_decision_boundary(knn_clf_all, axis=[4, 8, 1.5, 4.5])
    plt.scatter(iris.data[iris.target == 0, 0], iris.data[iris.target == 0, 1], color='red')
    plt.scatter(iris.data[iris.target == 1, 0], iris.data[iris.target == 1, 1], color='blue')
    plt.scatter(iris.data[iris.target == 2, 0], iris.data[iris.target == 2, 1], color='green')
    plt.show()


import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from playLA.model_selection import train_test_split
from playLA.LogisticRegression import LogisticRegression
from matplotlib.colors import ListedColormap
from sklearn.neighbors import KNeighborsClassifier

if __name__ == "__main__":

    # BUG FIX: `iris` was used without ever being loaded.
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    # Binary problem: keep only classes 0/1 and the first two features.
    X = X[y < 2, :2]
    y = y[y < 2]

    X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)

    def plot_decision_boundary(model, axis):
        """Shade the model's predicted class over the rectangle
        axis = [x_min, x_max, y_min, y_max] (~100 grid points per unit)."""
        x0, x1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        X_new = np.c_[x0.ravel(), x1.ravel()]
        y_predict = model.predict(X_new)
        zz = y_predict.reshape(x0.shape)
        custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        # FIX: `linewidth` is not a valid contourf keyword; removed.
        plt.contourf(x0, x1, zz, cmap=custom_cmap)

    knn_clf = KNeighborsClassifier()
    knn_clf.fit(X_train, y_train)

    # Larger k smooths the multi-class decision regions (higher bias).
    knn_clf_all = KNeighborsClassifier(n_neighbors=50)
    knn_clf_all.fit(iris.data[:, :2], iris.target)
    plot_decision_boundary(knn_clf_all, axis=[4, 8, 1.5, 4.5])
    plt.scatter(iris.data[iris.target == 0, 0], iris.data[iris.target == 0, 1], color='red')
    plt.scatter(iris.data[iris.target == 1, 0], iris.data[iris.target == 1, 1], color='blue')
    plt.scatter(iris.data[iris.target == 2, 0], iris.data[iris.target == 2, 1], color='green')
    plt.show()


import numpy as np
import matplotlib.pyplot as plt

if __name__ == "__main__":

    # Two Gaussian features; the label is 1 inside the disc x1^2 + x2^2 < 1.5.
    np.random.seed(666)
    X = np.random.normal(0, 1, size=(200, 2))
    y = np.array(X[:, 0] ** 2 + X[:, 1] ** 2 < 1.5, dtype='int')
    plt.scatter(X[y == 0, 0], X[y == 0, 1])
    plt.scatter(X[y == 1, 0], X[y == 1, 1])
    plt.show()

import numpy as np
import matplotlib.pyplot as plt
from playLA.LogisticRegression import LogisticRegression

if __name__ == "__main__":

    # Circular class boundary: label 1 inside the disc x1^2 + x2^2 < 1.5.
    np.random.seed(666)
    X = np.random.normal(0, 1, size=(200, 2))
    y = np.array(X[:, 0] ** 2 + X[:, 1] ** 2 < 1.5, dtype='int')

    # Plain (linear) logistic regression — no polynomial features,
    # so it cannot capture the circular boundary well.
    log_reg = LogisticRegression()
    log_reg.fit(X, y)
    print(log_reg.score(X, y))

0.605

import numpy as np
import matplotlib.pyplot as plt
from playLA.LogisticRegression import LogisticRegression
from matplotlib.colors import ListedColormap

if __name__ == "__main__":

    # Circular class boundary: label 1 inside the disc x1^2 + x2^2 < 1.5.
    np.random.seed(666)
    X = np.random.normal(0, 1, size=(200, 2))
    y = np.array(X[:, 0] ** 2 + X[:, 1] ** 2 < 1.5, dtype='int')

    # Plain (linear) logistic regression, no polynomial features.
    log_reg = LogisticRegression()
    log_reg.fit(X, y)
    print(log_reg.score(X, y))

    def plot_decision_boundary(model, axis):
        """Shade the model's predicted class over the rectangle
        axis = [x_min, x_max, y_min, y_max] (~100 grid points per unit)."""
        x0, x1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        X_new = np.c_[x0.ravel(), x1.ravel()]
        y_predict = model.predict(X_new)
        zz = y_predict.reshape(x0.shape)
        custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        # FIX: `linewidth` is not a valid contourf keyword; removed.
        plt.contourf(x0, x1, zz, cmap=custom_cmap)

    plot_decision_boundary(log_reg, axis=[-4, 4, -4, 4])
    plt.scatter(X[y == 0, 0], X[y == 0, 1])
    plt.scatter(X[y == 1, 0], X[y == 1, 1])
    plt.show()

import numpy as np
import matplotlib.pyplot as plt
from playLA.LogisticRegression import LogisticRegression
from matplotlib.colors import ListedColormap
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

if __name__ == "__main__":

    # Circular class boundary: label 1 inside the disc x1^2 + x2^2 < 1.5.
    np.random.seed(666)
    X = np.random.normal(0, 1, size=(200, 2))
    y = np.array(X[:, 0] ** 2 + X[:, 1] ** 2 < 1.5, dtype='int')

    # Baseline: linear logistic regression (underfits this data).
    log_reg = LogisticRegression()
    log_reg.fit(X, y)
    print(log_reg.score(X, y))

    def plot_decision_boundary(model, axis):
        """Shade the model's predicted class over the rectangle
        axis = [x_min, x_max, y_min, y_max] (~100 grid points per unit)."""
        x0, x1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        X_new = np.c_[x0.ravel(), x1.ravel()]
        y_predict = model.predict(X_new)
        zz = y_predict.reshape(x0.shape)
        custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        # FIX: `linewidth` is not a valid contourf keyword; removed.
        plt.contourf(x0, x1, zz, cmap=custom_cmap)

    # Logistic regression with polynomial features of the given degree.
    def PolynomialLogisticRegression(degree):
        return Pipeline([
            ('poly', PolynomialFeatures(degree)),
            ('std_scaler', StandardScaler()),
            ('log_reg', LogisticRegression())
        ])

    poly_log_reg = PolynomialLogisticRegression(degree=2)
    poly_log_reg.fit(X, y)
    print(poly_log_reg.score(X, y))

    plot_decision_boundary(poly_log_reg, axis=[-4, 4, -4, 4])
    plt.scatter(X[y == 0, 0], X[y == 0, 1])
    plt.scatter(X[y == 1, 0], X[y == 1, 1])
    plt.show()

0.605
0.95

import numpy as np
import matplotlib.pyplot as plt
from playLA.LogisticRegression import LogisticRegression
from matplotlib.colors import ListedColormap
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

if __name__ == "__main__":

    # Circular class boundary: label 1 inside the disc x1^2 + x2^2 < 1.5.
    np.random.seed(666)
    X = np.random.normal(0, 1, size=(200, 2))
    y = np.array(X[:, 0] ** 2 + X[:, 1] ** 2 < 1.5, dtype='int')

    # Baseline: linear logistic regression (underfits this data).
    log_reg = LogisticRegression()
    log_reg.fit(X, y)
    print(log_reg.score(X, y))

    def plot_decision_boundary(model, axis):
        """Shade the model's predicted class over the rectangle
        axis = [x_min, x_max, y_min, y_max] (~100 grid points per unit)."""
        x0, x1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        X_new = np.c_[x0.ravel(), x1.ravel()]
        y_predict = model.predict(X_new)
        zz = y_predict.reshape(x0.shape)
        custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        # FIX: `linewidth` is not a valid contourf keyword; removed.
        plt.contourf(x0, x1, zz, cmap=custom_cmap)

    # Logistic regression with polynomial features of the given degree.
    def PolynomialLogisticRegression(degree):
        return Pipeline([
            ('poly', PolynomialFeatures(degree)),
            ('std_scaler', StandardScaler()),
            ('log_reg', LogisticRegression())
        ])

    poly_log_reg = PolynomialLogisticRegression(degree=2)
    poly_log_reg.fit(X, y)
    print(poly_log_reg.score(X, y))

    # Degree 20: higher training score but a visibly overfit boundary.
    poly_log_reg2 = PolynomialLogisticRegression(degree=20)
    poly_log_reg2.fit(X, y)
    print(poly_log_reg2.score(X, y))

    plot_decision_boundary(poly_log_reg2, axis=[-4, 4, -4, 4])
    plt.scatter(X[y == 0, 0], X[y == 0, 1])
    plt.scatter(X[y == 1, 0], X[y == 1, 1])
    plt.show()

0.605
0.95
0.955

scikit-learn中的逻辑回归

import numpy as np
import matplotlib.pyplot as plt

if __name__ == "__main__":

    np.random.seed(666)
    X = np.random.normal(0, 1, size=(200, 2))
    # Parabolic boundary: label 1 where x1^2 + x2 < 1.5.
    y = np.array(X[:, 0] ** 2 + X[:, 1] < 1.5, dtype='int')
    # Flip ~20 random labels to 1 to add noise.
    for _ in range(20):
        y[np.random.randint(200)] = 1
    plt.scatter(X[y == 0, 0], X[y == 0, 1])
    plt.scatter(X[y == 1, 0], X[y == 1, 1])
    plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from matplotlib.colors import ListedColormap

if __name__ == "__main__":

    np.random.seed(666)
    X = np.random.normal(0, 1, size=(200, 2))
    # Parabolic boundary: label 1 where x1^2 + x2 < 1.5.
    y = np.array(X[:, 0] ** 2 + X[:, 1] < 1.5, dtype='int')
    # Flip ~20 random labels to 1 to add noise.
    for _ in range(20):
        y[np.random.randint(200)] = 1

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)
    # scikit-learn's logistic regression (L2-regularized by default).
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)
    print(log_reg.score(X_train, y_train))
    print(log_reg.score(X_test, y_test))

    def plot_decision_boundary(model, axis):
        """Shade the model's predicted class over the rectangle
        axis = [x_min, x_max, y_min, y_max] (~100 grid points per unit)."""
        x0, x1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        X_new = np.c_[x0.ravel(), x1.ravel()]
        y_predict = model.predict(X_new)
        zz = y_predict.reshape(x0.shape)
        custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        # FIX: `linewidth` is not a valid contourf keyword; removed.
        plt.contourf(x0, x1, zz, cmap=custom_cmap)

    plot_decision_boundary(log_reg, axis=[-4, 4, -4, 4])
    plt.scatter(X[y == 0, 0], X[y == 0, 1])
    plt.scatter(X[y == 1, 0], X[y == 1, 1])
    plt.show()

0.7933333333333333
0.86

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from matplotlib.colors import ListedColormap
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

if __name__ == "__main__":

    np.random.seed(666)
    X = np.random.normal(0, 1, size=(200, 2))
    # Parabolic boundary: label 1 where x1^2 + x2 < 1.5.
    y = np.array(X[:, 0] ** 2 + X[:, 1] < 1.5, dtype='int')
    # Flip ~20 random labels to 1 to add noise.
    for _ in range(20):
        y[np.random.randint(200)] = 1

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)
    print(log_reg.score(X_train, y_train))
    print(log_reg.score(X_test, y_test))

    def plot_decision_boundary(model, axis):
        """Shade the model's predicted class over the rectangle
        axis = [x_min, x_max, y_min, y_max] (~100 grid points per unit)."""
        x0, x1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        X_new = np.c_[x0.ravel(), x1.ravel()]
        y_predict = model.predict(X_new)
        zz = y_predict.reshape(x0.shape)
        custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        # FIX: `linewidth` is not a valid contourf keyword; removed.
        plt.contourf(x0, x1, zz, cmap=custom_cmap)

    # Logistic regression with polynomial features of the given degree.
    def PolynomialLogisticRegression(degree):
        return Pipeline([
            ('poly', PolynomialFeatures(degree)),
            ('std_scaler', StandardScaler()),
            ('log_reg', LogisticRegression())
        ])

    poly_log_reg = PolynomialLogisticRegression(degree=2)
    poly_log_reg.fit(X_train, y_train)
    print(poly_log_reg.score(X_train, y_train))
    print(poly_log_reg.score(X_test, y_test))
    plot_decision_boundary(poly_log_reg, axis=[-4, 4, -4, 4])
    plt.scatter(X[y == 0, 0], X[y == 0, 1])
    plt.scatter(X[y == 1, 0], X[y == 1, 1])
    plt.show()

0.7933333333333333
0.86
0.9066666666666666
0.94

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from matplotlib.colors import ListedColormap
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

if __name__ == "__main__":

    np.random.seed(666)
    X = np.random.normal(0, 1, size=(200, 2))
    # Parabolic boundary: label 1 where x1^2 + x2 < 1.5.
    y = np.array(X[:, 0] ** 2 + X[:, 1] < 1.5, dtype='int')
    # Flip ~20 random labels to 1 to add noise.
    for _ in range(20):
        y[np.random.randint(200)] = 1

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)
    print(log_reg.score(X_train, y_train))
    print(log_reg.score(X_test, y_test))

    def plot_decision_boundary(model, axis):
        """Shade the model's predicted class over the rectangle
        axis = [x_min, x_max, y_min, y_max] (~100 grid points per unit)."""
        x0, x1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        X_new = np.c_[x0.ravel(), x1.ravel()]
        y_predict = model.predict(X_new)
        zz = y_predict.reshape(x0.shape)
        custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        # FIX: `linewidth` is not a valid contourf keyword; removed.
        plt.contourf(x0, x1, zz, cmap=custom_cmap)

    # Logistic regression with polynomial features of the given degree.
    def PolynomialLogisticRegression(degree):
        return Pipeline([
            ('poly', PolynomialFeatures(degree)),
            ('std_scaler', StandardScaler()),
            ('log_reg', LogisticRegression())
        ])

    poly_log_reg = PolynomialLogisticRegression(degree=2)
    poly_log_reg.fit(X_train, y_train)
    print(poly_log_reg.score(X_train, y_train))
    print(poly_log_reg.score(X_test, y_test))

    # Degree 20: higher training score, lower test score — overfitting.
    poly_log_reg2 = PolynomialLogisticRegression(degree=20)
    poly_log_reg2.fit(X_train, y_train)
    print(poly_log_reg2.score(X_train, y_train))
    print(poly_log_reg2.score(X_test, y_test))
    plot_decision_boundary(poly_log_reg2, axis=[-4, 4, -4, 4])
    plt.scatter(X[y == 0, 0], X[y == 0, 1])
    plt.scatter(X[y == 1, 0], X[y == 1, 1])
    plt.show()

0.7933333333333333
0.86
0.9066666666666666
0.94
0.94
0.92

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from matplotlib.colors import ListedColormap
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

if __name__ == "__main__":

    np.random.seed(666)
    X = np.random.normal(0, 1, size=(200, 2))
    # Parabolic boundary: label 1 where x1^2 + x2 < 1.5.
    y = np.array(X[:, 0] ** 2 + X[:, 1] < 1.5, dtype='int')
    # Flip ~20 random labels to 1 to add noise.
    for _ in range(20):
        y[np.random.randint(200)] = 1

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)
    print(log_reg.score(X_train, y_train))
    print(log_reg.score(X_test, y_test))

    def plot_decision_boundary(model, axis):
        """Shade the model's predicted class over the rectangle
        axis = [x_min, x_max, y_min, y_max] (~100 grid points per unit)."""
        x0, x1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        X_new = np.c_[x0.ravel(), x1.ravel()]
        y_predict = model.predict(X_new)
        zz = y_predict.reshape(x0.shape)
        custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        # FIX: `linewidth` is not a valid contourf keyword; removed.
        plt.contourf(x0, x1, zz, cmap=custom_cmap)

    # Logistic regression with polynomial features of the given degree.
    def PolynomialLogisticRegression(degree):
        return Pipeline([
            ('poly', PolynomialFeatures(degree)),
            ('std_scaler', StandardScaler()),
            ('log_reg', LogisticRegression())
        ])

    poly_log_reg = PolynomialLogisticRegression(degree=2)
    poly_log_reg.fit(X_train, y_train)
    print(poly_log_reg.score(X_train, y_train))
    print(poly_log_reg.score(X_test, y_test))

    poly_log_reg2 = PolynomialLogisticRegression(degree=20)
    poly_log_reg2.fit(X_train, y_train)
    print(poly_log_reg2.score(X_train, y_train))
    print(poly_log_reg2.score(X_test, y_test))

    # Same pipeline with an explicit regularization strength: smaller C
    # means stronger L2 regularization, taming the degree-20 overfit.
    def PolynomialLogisticRegression(degree, C):
        return Pipeline([
            ('poly', PolynomialFeatures(degree)),
            ('std_scaler', StandardScaler()),
            ('log_reg', LogisticRegression(C=C))
        ])

    poly_log_reg3 = PolynomialLogisticRegression(degree=20, C=0.1)
    poly_log_reg3.fit(X_train, y_train)
    print(poly_log_reg3.score(X_train, y_train))
    print(poly_log_reg3.score(X_test, y_test))
    plot_decision_boundary(poly_log_reg3, axis=[-4, 4, -4, 4])
    plt.scatter(X[y == 0, 0], X[y == 0, 1])
    plt.scatter(X[y == 1, 0], X[y == 1, 1])
    plt.show()

0.7933333333333333
0.86
0.9066666666666666
0.94
0.94
0.92
0.84
0.92

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from matplotlib.colors import ListedColormap
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

if __name__ == "__main__":

    np.random.seed(666)
    X = np.random.normal(0, 1, size=(200, 2))
    # Parabolic boundary: label 1 where x0^2 + x1 < 1.5.
    y = np.array(X[:, 0]**2 + X[:, 1] < 1.5, dtype='int')
    # Flip up to 20 randomly-chosen samples to 1 as label noise.
    for _ in range(20):
        y[np.random.randint(200)] = 1

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)
    # A plain (linear) logistic regression underfits this data.
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)
    print(log_reg.score(X_train, y_train))
    print(log_reg.score(X_test, y_test))

    def plot_decision_boundary(model, axis):
        """Draw the model's decision regions over axis=[xmin, xmax, ymin, ymax]."""
        x0, x1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        X_new = np.c_[x0.ravel(), x1.ravel()]
        y_predict = model.predict(X_new)
        zz = y_predict.reshape(x0.shape)
        custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        # FIX: dropped `linewidth=5` — not a valid contourf kwarg
        # (matplotlib warns and ignores it).
        plt.contourf(x0, x1, zz, cmap=custom_cmap)

    # Pipeline: polynomial features -> standardization -> logistic regression.
    # NOTE: this factory is redefined twice below with wider signatures;
    # each redefinition shadows the previous one (kept as in the notes).
    def PolynomialLogisticRegression(degree):
        return Pipeline([
            ('poly', PolynomialFeatures(degree)),
            ('std_scaler', StandardScaler()),
            ('log_reg', LogisticRegression())
        ])

    poly_log_reg = PolynomialLogisticRegression(degree=2)
    poly_log_reg.fit(X_train, y_train)
    print(poly_log_reg.score(X_train, y_train))
    print(poly_log_reg.score(X_test, y_test))

    # degree=20 overfits: train score rises while the test score drops.
    poly_log_reg2 = PolynomialLogisticRegression(degree=20)
    poly_log_reg2.fit(X_train, y_train)
    print(poly_log_reg2.score(X_train, y_train))
    print(poly_log_reg2.score(X_test, y_test))

    # Expose the regularization strength C (smaller C = stronger penalty).
    def PolynomialLogisticRegression(degree, C):
        return Pipeline([
            ('poly', PolynomialFeatures(degree)),
            ('std_scaler', StandardScaler()),
            ('log_reg', LogisticRegression(C=C))
        ])

    poly_log_reg3 = PolynomialLogisticRegression(degree=20, C=0.1)
    poly_log_reg3.fit(X_train, y_train)
    print(poly_log_reg3.score(X_train, y_train))
    print(poly_log_reg3.score(X_test, y_test))

    # Also expose the penalty type; liblinear supports both 'l1' and 'l2'.
    def PolynomialLogisticRegression(degree, C, penalty='l2'):
        return Pipeline([
            ('poly', PolynomialFeatures(degree)),
            ('std_scaler', StandardScaler()),
            ('log_reg', LogisticRegression(C=C, penalty=penalty, solver='liblinear'))
        ])

    poly_log_reg4 = PolynomialLogisticRegression(degree=20, C=0.1, penalty='l1')
    poly_log_reg4.fit(X_train, y_train)
    print(poly_log_reg4.score(X_train, y_train))
    print(poly_log_reg4.score(X_test, y_test))
    plot_decision_boundary(poly_log_reg4, axis=[-4, 4, -4, 4])
    plt.scatter(X[y == 0, 0], X[y == 0, 1])
    plt.scatter(X[y == 1, 0], X[y == 1, 1])
    plt.show()

0.7933333333333333
0.86
0.9066666666666666
0.94
0.94
0.92
0.84
0.92
0.8266666666666667
0.9

OvR与OvO

1. OvR
2. OvO

OvR(One vs Rest)

OvO(One vs One)

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from matplotlib.colors import ListedColormap

if __name__ == "__main__":

    # FIX: `iris` was referenced without ever being loaded (NameError at runtime).
    iris = datasets.load_iris()
    # Keep only the first two features so the boundary can be plotted in 2-D.
    X = iris.data[:, :2]
    y = iris.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)
    print(log_reg.score(X_test, y_test))

    def plot_decision_boundary(model, axis):
        """Draw the model's decision regions over axis=[xmin, xmax, ymin, ymax]."""
        x0, x1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        X_new = np.c_[x0.ravel(), x1.ravel()]
        y_predict = model.predict(X_new)
        zz = y_predict.reshape(x0.shape)
        custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        # FIX: dropped invalid `linewidth=5` contourf kwarg (warned and ignored).
        plt.contourf(x0, x1, zz, cmap=custom_cmap)

    plot_decision_boundary(log_reg, axis=[4, 8.5, 1.5, 4.5])
    plt.scatter(X[y == 0, 0], X[y == 0, 1])
    plt.scatter(X[y == 1, 0], X[y == 1, 1])
    plt.scatter(X[y == 2, 0], X[y == 2, 1])
    plt.show()

0.7894736842105263

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from matplotlib.colors import ListedColormap

if __name__ == "__main__":

    # FIX: `iris` was referenced without ever being loaded (NameError at runtime).
    iris = datasets.load_iris()
    X = iris.data[:, :2]
    y = iris.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)
    # Multiclass via OvR (the default strategy used here).
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)
    print(log_reg.score(X_test, y_test))

    def plot_decision_boundary(model, axis):
        """Draw the model's decision regions over axis=[xmin, xmax, ymin, ymax]."""
        x0, x1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        X_new = np.c_[x0.ravel(), x1.ravel()]
        y_predict = model.predict(X_new)
        zz = y_predict.reshape(x0.shape)
        custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        # FIX: dropped invalid `linewidth=5` contourf kwarg (warned and ignored).
        plt.contourf(x0, x1, zz, cmap=custom_cmap)

    # Multiclass via a true multinomial (softmax) model.
    # NOTE: the original comment called this "OvO" — multinomial is softmax,
    # not one-vs-one.
    log_reg2 = LogisticRegression(multi_class='multinomial', solver='newton-cg')
    log_reg2.fit(X_train, y_train)
    print(log_reg2.score(X_test, y_test))
    plot_decision_boundary(log_reg2, axis=[4, 8.5, 1.5, 4.5])
    plt.scatter(X[y == 0, 0], X[y == 0, 1])
    plt.scatter(X[y == 1, 0], X[y == 1, 1])
    plt.scatter(X[y == 2, 0], X[y == 2, 1])
    plt.show()

0.7894736842105263
0.7894736842105263

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from matplotlib.colors import ListedColormap

if __name__ == "__main__":

    # FIX: `iris` was referenced without ever being loaded (NameError at runtime).
    iris = datasets.load_iris()
    X = iris.data[:, :2]
    y = iris.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)
    # Multiclass via OvR; only two features, so accuracy is mediocre.
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)
    print(log_reg.score(X_test, y_test))

    def plot_decision_boundary(model, axis):
        """Draw the model's decision regions over axis=[xmin, xmax, ymin, ymax]."""
        x0, x1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        X_new = np.c_[x0.ravel(), x1.ravel()]
        y_predict = model.predict(X_new)
        zz = y_predict.reshape(x0.shape)
        custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        # FIX: dropped invalid `linewidth=5` contourf kwarg (warned and ignored).
        plt.contourf(x0, x1, zz, cmap=custom_cmap)

    # True multinomial (softmax) model.
    # NOTE: the original comment called this "OvO"; multinomial is softmax, not OvO.
    log_reg2 = LogisticRegression(multi_class='multinomial', solver='newton-cg')
    log_reg2.fit(X_train, y_train)
    print(log_reg2.score(X_test, y_test))

    # Retrain on all four iris features — both models reach 1.0 on the test set.
    X = iris.data
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)
    print(log_reg.score(X_test, y_test))

    log_reg2 = LogisticRegression(multi_class='multinomial', solver='newton-cg')
    log_reg2.fit(X_train, y_train)
    print(log_reg2.score(X_test, y_test))

0.7894736842105263
0.7894736842105263
1.0
1.0

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from matplotlib.colors import ListedColormap
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier

if __name__ == "__main__":

    # FIX: `iris` was referenced without ever being loaded (NameError at runtime).
    iris = datasets.load_iris()
    X = iris.data[:, :2]
    y = iris.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)
    # Multiclass via OvR.
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)
    print(log_reg.score(X_test, y_test))

    def plot_decision_boundary(model, axis):
        """Draw the model's decision regions over axis=[xmin, xmax, ymin, ymax]."""
        x0, x1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        X_new = np.c_[x0.ravel(), x1.ravel()]
        y_predict = model.predict(X_new)
        zz = y_predict.reshape(x0.shape)
        custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        # FIX: dropped invalid `linewidth=5` contourf kwarg (warned and ignored).
        plt.contourf(x0, x1, zz, cmap=custom_cmap)

    # True multinomial (softmax) model.
    log_reg2 = LogisticRegression(multi_class='multinomial', solver='newton-cg')
    log_reg2.fit(X_train, y_train)
    print(log_reg2.score(X_test, y_test))

    # Retrain on all four iris features.
    X = iris.data
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)
    print(log_reg.score(X_test, y_test))

    log_reg2 = LogisticRegression(multi_class='multinomial', solver='newton-cg')
    log_reg2.fit(X_train, y_train)
    print(log_reg2.score(X_test, y_test))

    # Generic meta-estimators: these wrap ANY binary classifier, not just
    # logistic regression.
    ovr = OneVsRestClassifier(log_reg)
    ovr.fit(X_train, y_train)
    print(ovr.score(X_test, y_test))

    ovo = OneVsOneClassifier(log_reg)
    ovo.fit(X_train, y_train)
    print(ovo.score(X_test, y_test))

0.7894736842105263
0.7894736842105263
1.0
1.0
0.9736842105263158
1.0

# 评价分类结果

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

if __name__ == "__main__":

    # FIX: `digits` was referenced without ever being loaded (NameError at runtime).
    digits = datasets.load_digits()
    X = digits.data
    y = digits.target.copy()
    # Skew the labels heavily: 1 for digit 9, 0 for everything else.
    y[digits.target == 9] = 1
    y[digits.target != 9] = 0
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)
    print(log_reg.score(X_test, y_test))
    y_log_predict = log_reg.predict(X_test)

    def TN(y_true, y_predict):
        # Predicted 0 while the truth is 0.
        assert len(y_true) == len(y_predict)
        return np.sum((y_true == 0) & (y_predict == 0))

    print(TN(y_test, y_log_predict))

    def FP(y_true, y_predict):
        # Predicted 1 while the truth is 0.
        assert len(y_true) == len(y_predict)
        return np.sum((y_true == 0) & (y_predict == 1))

    print(FP(y_test, y_log_predict))

    def FN(y_true, y_predict):
        # Predicted 0 while the truth is 1.
        assert len(y_true) == len(y_predict)
        return np.sum((y_true == 1) & (y_predict == 0))

    print(FN(y_test, y_log_predict))

    def TP(y_true, y_predict):
        # Predicted 1 while the truth is 1.
        assert len(y_true) == len(y_predict)
        return np.sum((y_true == 1) & (y_predict == 1))

    print(TP(y_test, y_log_predict))

    def confusion_matrix(y_true, y_predict):
        """2x2 confusion matrix [[TN, FP], [FN, TP]]."""
        return np.array([
            [TN(y_true, y_predict), FP(y_true, y_predict)],
            [FN(y_true, y_predict), TP(y_true, y_predict)]
        ])

    print(confusion_matrix(y_test, y_log_predict))

    def precision_score(y_true, y_predict):
        """Precision = TP / (TP + FP); 0 when nothing is predicted positive.

        FIX: numpy integer 0/0 yields nan with a RuntimeWarning instead of
        raising, so the original try/except returned nan rather than 0.
        """
        tp = TP(y_true, y_predict)
        fp = FP(y_true, y_predict)
        return tp / (tp + fp) if tp + fp != 0 else 0

    print(precision_score(y_test, y_log_predict))

    def recall_score(y_true, y_predict):
        """Recall = TP / (TP + FN); 0 when there are no actual positives.

        FIX: same nan-instead-of-0 issue as precision_score.
        """
        tp = TP(y_true, y_predict)
        fn = FN(y_true, y_predict)
        return tp / (tp + fn) if tp + fn != 0 else 0

    print(recall_score(y_test, y_log_predict))


0.9755555555555555
403
2
9
36
[[403   2]
[  9  36]]
0.9473684210526315
0.8

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

if __name__ == "__main__":

    # FIX: `digits` was referenced without ever being loaded (NameError at runtime).
    digits = datasets.load_digits()
    X = digits.data
    y = digits.target.copy()
    # Skew the labels: 1 for digit 9, 0 otherwise.
    y[digits.target == 9] = 1
    y[digits.target != 9] = 0
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)
    print(log_reg.score(X_test, y_test))
    y_log_predict = log_reg.predict(X_test)
    # Same metrics as the hand-rolled implementations in the previous section,
    # now via sklearn. (The duplicated commented-out copies were removed.)
    print(confusion_matrix(y_test, y_log_predict))
    print(precision_score(y_test, y_log_predict))
    print(recall_score(y_test, y_log_predict))

0.9755555555555555
[[403   2]
[  9  36]]
0.9473684210526315
0.8

F1 Score

F1 Score就是要兼顾精准率和召回率这两个指标。

import numpy as np


def f1_score(precision, recall):
    """Harmonic mean of precision and recall; 0 when both are 0.

    FIX: moved out of the __main__ guard so it is importable, and the bare
    `except` was narrowed to ZeroDivisionError so unrelated errors propagate.
    """
    try:
        return 2 * precision * recall / (precision + recall)
    except ZeroDivisionError:
        return 0


if __name__ == "__main__":

    precision = 0.5
    recall = 0.5
    print(f1_score(precision, recall))

    # The harmonic mean punishes imbalance: (0.1, 0.9) -> 0.18, far below 0.5.
    precision = 0.1
    recall = 0.9
    print(f1_score(precision, recall))

    precision = 0
    recall = 1
    print(f1_score(precision, recall))

0.5
0.18000000000000002
0.0

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

if __name__ == "__main__":

    # FIX: `digits` was referenced without ever being loaded (NameError at runtime).
    digits = datasets.load_digits()
    X = digits.data
    y = digits.target.copy()
    # Skew the labels: 1 for digit 9, 0 otherwise.
    y[digits.target == 9] = 1
    y[digits.target != 9] = 0
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)
    print(log_reg.score(X_test, y_test))
    y_log_predict = log_reg.predict(X_test)
    print(confusion_matrix(y_test, y_log_predict))
    print(precision_score(y_test, y_log_predict))
    print(recall_score(y_test, y_log_predict))
    # NOTE: sklearn's f1_score takes label arrays, unlike the
    # (precision, recall) toy version in the previous section.
    print(f1_score(y_test, y_log_predict))

0.9755555555555555
[[403   2]
[  9  36]]
0.9473684210526315
0.8
0.8674698795180723

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

if __name__ == "__main__":

    # FIX: `digits` was referenced without ever being loaded (NameError at runtime).
    digits = datasets.load_digits()
    X = digits.data
    y = digits.target.copy()
    # Skew the labels: 1 for digit 9, 0 otherwise.
    y[digits.target == 9] = 1
    y[digits.target != 9] = 0
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)
    y_predict = log_reg.predict(X_test)
    print(f1_score(y_test, y_predict))
    print(confusion_matrix(y_test, y_predict))
    print(precision_score(y_test, y_predict))
    print(recall_score(y_test, y_predict))
    # Raw decision scores; predict() applies the default threshold of 0.
    print(log_reg.decision_function(X_test))

0.8674698795180723
[[403   2]
[  9  36]]
0.9473684210526315
0.8
[-21.39111629 -32.89968323 -16.44016163 -79.83561566 -48.02243137
-24.17617018 -44.63362641 -24.22571118  -1.14596638 -19.01171386
-65.83248217 -50.98042089 -30.89891353 -45.95104769 -37.35822175
-29.53882118 -36.91822879 -82.84001664 -37.66571977  -9.86437625
-9.27829403 -85.28059442 -16.75452611 -45.33628474  -5.02567423
-48.32625942 -11.65431236 -37.33148679 -25.08336919 -13.5901564
-16.58231541 -28.78799912 -34.37955118 -28.52012209  -8.13542264
-4.63313032 -21.87928168 -21.88869519 -31.09892577 -23.3984867
-26.91006829 -62.24346371 -37.67365812 -66.36651609 -20.11619791
-16.65433589 -18.17972943 -21.54203464 -28.96458855 -19.60879607
2.43887956   7.71912841 -34.849044   -42.71549783 -25.64715678
-34.7680629   -7.59448578 -49.53016396 -51.52838815  19.67482305
-10.13108736 -32.02192086 -11.49856852  -1.4699478  -48.70848257
-43.83625893 -24.84311106 -19.61286116 -36.66586896  -3.53065043
-4.46961169 -19.22715107 -20.35477842 -40.90104406 -11.85320764
-32.75099324 -35.76726754 -28.56663926 -55.41823901 -18.8405687
4.57583101 -16.45881461 -76.79761689 -58.23275352 -30.22585503
-29.42590775 -33.40990489  -8.41240435 -47.92542597 -65.5025185
-16.93172073 -22.17307034 -11.27872628 -18.68046457 -69.24012583
-46.38513493 -39.45623248 -35.95182192 -17.7478779  -62.96997249
-16.87368524 -55.16843836 -28.78237436 -68.50484033 -68.89439645
-6.49868293 -25.51499672 -38.34875938 -27.45932835 -15.5478568
-27.48248247 -20.36181876  12.07838868 -23.10599106 -35.98724084
-29.8672301  -68.97216398 -27.27941662 -54.28218303 -24.62341497
-11.83237722 -47.37099878  -2.76558372 -59.69792222 -31.00151261
-9.01540621 -70.85805094 -56.99696801 -20.05707198 -21.5152789
-68.29244741 -18.92014998 -38.58907364 -57.36570694  -0.91281913
-22.54992466 -22.66467292 -29.0235113  -32.72820688 -20.45664617
-11.34951143   4.65601946   6.26977861   1.49141768  -7.6322439
-39.25685273  12.16623393 -74.55460816 -75.09624528 -49.97601234
-11.65031862 -47.60536987 -75.43101252 -29.91347448 -63.95185681
-7.2689773   -6.63843399 -18.20822617 -32.46673697 -17.9546501
-43.29361712 -32.68994198 -34.3180716  -72.74551409 -15.2019442
11.46964243 -56.42500557  -6.03279066 -48.37246419 -16.43765927
-2.13270161 -11.86214884 -33.25295195 -51.38609337 -10.38441353
-17.19114075  -5.24520507 -25.21953049 -15.69850493   3.54659252
-45.00033016 -12.56806987 -25.36295033 -16.5912305  -22.13313929
-82.49022097  -5.87113613 -20.29637087 -20.47640811 -26.83351282
-25.97467095 -40.49162548 -37.9859323  -26.96640644 -23.75413381
-20.13250209  -9.67658136 -19.68893877 -42.53474809 -44.17265609
-15.66069524 -64.04688059 -24.55090181 -56.30937433 -13.01786914
-29.65196663   3.88268612 -44.34077042  -7.89931516   1.1386725
-2.83023993 -11.93127768   7.50903966  -7.17392227 -46.37055337
-48.655327    -4.5917329  -19.0386492  -24.07764547 -48.76584476
-15.0310363  -24.94392526 -16.69726519 -18.67621386 -15.69759961
-16.87847809 -38.54539917 -31.09627629  -9.38955202 -71.46839701
-22.79405742 -14.42232531 -23.07557033 -34.32759766  -0.88099854
-32.76513112 -11.2433954  -18.68720525  -8.20466874 -45.46311901
-22.30356666 -62.43491449 -46.77168427 -65.15925742 -33.23520093
-23.46541576 -28.48344277 -64.79554203   1.45558119  -4.09259583
-25.67552639 -22.3174507  -54.70729372 -16.34598362 -12.09697892
-35.30133487  -5.75958739 -13.47308585 -72.33098862  -6.16740739
-1.17230738 -35.54773249 -24.18715193 -68.33044284  14.76753318
-63.08785126   9.91043525 -24.14305198 -32.46286967 -14.40715008
-85.75262185 -12.79528441   9.00228282 -16.503391   -36.69845123
-16.512096   -19.35133411 -32.60588814  -5.63915201   7.69497867
9.40915283   5.87506061 -35.64726625 -12.99147964 -54.43160989
-41.10194425   5.62194842 -79.52182102 -15.81768172 -19.23006978
-10.86823812 -42.52899396 -19.83047039 -15.69814303 -17.9881368
-18.03782243  -6.75389093 -20.79051566 -16.58471449 -70.44670178
-9.20322073 -31.68326159 -19.70349907 -21.98390417 -24.76579065
-16.374491   -13.37493066 -22.92694274  11.05274035 -15.40526762
-32.93952208 -13.75616482 -50.36690474 -20.48555219 -56.27962062
-28.68557026 -21.86287716 -30.40747782 -69.26566596 -59.34914687
14.35788248   8.58428239 -25.6877792    2.74470278   4.93805248
-19.68361559 -58.84898426 -10.01585482 -28.80478682 -27.20954018
6.30088159 -80.50052526 -34.46318844 -50.31508122 -35.96203418
-48.65003927 -17.95728207 -62.35151791  -3.08443829 -25.258016
-64.10656011  -9.63906144 -21.72007978  19.92910802 -18.75421777
-4.47831729 -13.15140197 -21.63965789 -43.10672748 -52.13053585
-28.51712477 -14.5823744   -2.46609074  -6.12681966   3.71455999
-14.9977847  -40.84905871 -26.67149074  14.11187575 -17.69770862
15.2076739  -33.08394198   5.28155289 -14.27159115 -53.5969583
-50.03282267 -30.66725832 -38.04409354 -23.29943335 -24.69438718
-13.55230615 -22.59913745 -27.21119584 -19.64039532 -28.17022763
-19.93440541 -29.78707338 -11.30148933 -17.24723071 -24.0270052
-24.36164647  10.39497979 -17.24456276 -38.0307681  -16.09768011
-37.60773045 -16.35556132 -69.1377758  -33.70524439 -43.63203695
-26.54795828 -10.30511187 -66.3722633  -31.88204434 -45.55916781
-14.58978562 -36.09888421 -14.97052471 -70.02824527 -11.3627415
-40.87243347 -32.6886302  -19.74769031 -27.56948378 -15.73011542
-31.59281444  -8.52806892 -21.36544185 -34.08460531 -11.67038657
-36.4478906  -34.76021702 -22.23118301   4.78055769 -21.33805183
-4.46928482 -20.85352719 -32.24890674 -41.15145747 -25.07561195
-19.76267733 -47.88375454 -30.95384768 -45.58272734 -71.52627785
-6.26013362 -32.5437332    2.29251349  11.95767159   7.11098246
-31.39894635 -63.96650655 -23.79370668  -5.74111066 -32.41736889
-24.74623259 -67.7157729  -32.81326364 -33.60404409 -31.55108505
-51.97532241 -22.53962738  -7.74759499 -17.29216614 -25.76348152
-32.39491728 -29.51688896 -66.44155839 -45.69256747 -16.05903712]

# Peek at the first 10 decision scores and the matching 0/1 predictions
# (predict() labels a sample 1 exactly when its score >= 0).
print(log_reg.decision_function(X_test)[: 10])
print(log_reg.predict(X_test)[: 10])

[-21.39111629 -32.89968323 -16.44016163 -79.83561566 -48.02243137
-24.17617018 -44.63362641 -24.22571118  -1.14596638 -19.01171386]
[0 0 0 0 0 0 0 0 0 0]

# Raise the decision threshold from 0 to 5: precision rises, recall drops.
decision_scores = log_reg.decision_function(X_test)
print(np.min(decision_scores))
print(np.max(decision_scores))
y_predict_2 = np.array(decision_scores >= 5, dtype='int')
print(confusion_matrix(y_test, y_predict_2))
print(precision_score(y_test, y_predict_2))
print(recall_score(y_test, y_predict_2))

-85.75262185435525
19.929108017559418
[[404   1]
[ 21  24]]
0.96
0.5333333333333333

# Lower the threshold to -5: recall rises, precision drops.
y_predict_3 = np.array(decision_scores >= -5, dtype='int')
print(confusion_matrix(y_test, y_predict_3))
print(precision_score(y_test, y_predict_3))
print(recall_score(y_test, y_predict_3))

[[390  15]
[  5  40]]
0.7272727272727273
0.8888888888888888

# Sweep candidate thresholds across the full score range (step 0.1) and
# record the precision/recall achieved at each decision boundary.
thresholds = np.arange(np.min(decision_scores), np.max(decision_scores), 0.1)
precisions = []
recalls = []
for candidate in thresholds:
    labels = np.array(decision_scores >= candidate, dtype='int')
    precisions.append(precision_score(y_test, labels))
    recalls.append(recall_score(y_test, labels))
# Precision and recall as functions of the threshold.
plt.plot(thresholds, precisions)
plt.plot(thresholds, recalls)
plt.show()

# The precision-recall curve itself.
plt.plot(precisions, recalls)
plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve

if __name__ == "__main__":

    # FIX: `digits` was referenced without ever being loaded (NameError at runtime).
    digits = datasets.load_digits()
    X = digits.data
    y = digits.target.copy()
    # Skew the labels: 1 for digit 9, 0 otherwise.
    y[digits.target == 9] = 1
    y[digits.target != 9] = 0
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)
    y_predict = log_reg.predict(X_test)

    decision_scores = log_reg.decision_function(X_test)
    # precisions/recalls have one more entry than thresholds: sklearn appends
    # the (precision=1, recall=0) endpoint, which has no threshold.
    precisions, recalls, thresholds = precision_recall_curve(y_test, decision_scores)
    print(precisions.shape)
    print(recalls.shape)
    print(thresholds.shape)

(151,)
(151,)
(150,)

# Drop the final endpoint (it has no threshold) so the arrays line up.
plt.plot(thresholds, precisions[: -1])
plt.plot(thresholds, recalls[: -1])
plt.show()

plt.plot(precisions, recalls)
plt.show()

ROC曲线

TPR

TPR其实就是之前说的召回率

TPR就是预测为1，并且预测对了的数量占真实为1的百分比是多少。

FPR

FPR就是用FP去除以真实值为0的所有的数字和。FPR是指预测为1，可惜我们预测错了，这个数量占真实值为0的百分比是多少。

import numpy as np
from math import sqrt

def accuracy_score(y_true, y_predict):
    """Fraction of predictions that exactly match the ground truth."""
    assert len(y_true) == len(y_predict), \
        "the size of y_true must be equal to the size of y_predict"

    correct = np.sum(y_true == y_predict)
    return correct / len(y_true)

def mean_squared_error(y_true, y_predict):
    """Mean squared error between truth and prediction."""
    assert len(y_true) == len(y_predict), \
        "the size of y_true must be equal to the size of y_predict"

    squared_errors = (y_true - y_predict) ** 2
    return np.sum(squared_errors) / len(y_true)

def root_mean_squared_error(y_true, y_predict):
    """Root mean squared error (the MSE step is inlined so the function
    stands alone)."""
    assert len(y_true) == len(y_predict), \
        "the size of y_true must be equal to the size of y_predict"

    mse = np.sum((y_true - y_predict) ** 2) / len(y_true)
    return sqrt(mse)

def mean_absolute_error(y_true, y_predict):
    """Mean absolute error between truth and prediction."""

    abs_errors = np.absolute(y_true - y_predict)
    return np.sum(abs_errors) / len(y_true)

def r2_score(y_true, y_predict):
    """R squared: 1 - MSE(y_true, y_predict) / Var(y_true) (MSE inlined)."""
    assert len(y_true) == len(y_predict), \
        "the size of y_true must be equal to the size of y_predict"
    mse = np.sum((y_true - y_predict) ** 2) / len(y_true)
    return 1 - mse / np.var(y_true)

def TN(y_true, y_predict):
    """Count of true negatives: prediction 0 while the truth is 0."""
    assert len(y_true) == len(y_predict)
    both_negative = (y_true == 0) & (y_predict == 0)
    return np.sum(both_negative)

def FP(y_true, y_predict):
    """Count of false positives: prediction 1 while the truth is 0."""
    assert len(y_true) == len(y_predict)
    wrongly_positive = (y_true == 0) & (y_predict == 1)
    return np.sum(wrongly_positive)

def FN(y_true, y_predict):
    """Count of false negatives: prediction 0 while the truth is 1."""
    assert len(y_true) == len(y_predict)
    missed_positive = (y_true == 1) & (y_predict == 0)
    return np.sum(missed_positive)

def TP(y_true, y_predict):
    """Count of true positives: prediction 1 while the truth is 1."""
    assert len(y_true) == len(y_predict)
    both_positive = (y_true == 1) & (y_predict == 1)
    return np.sum(both_positive)

def confusion_matrix(y_true, y_predict):
    """2x2 confusion matrix [[TN, FP], [FN, TP]] (counts inlined)."""
    assert len(y_true) == len(y_predict)
    tn = np.sum((y_true == 0) & (y_predict == 0))
    fp = np.sum((y_true == 0) & (y_predict == 1))
    fn = np.sum((y_true == 1) & (y_predict == 0))
    tp = np.sum((y_true == 1) & (y_predict == 1))
    return np.array([[tn, fp], [fn, tp]])

def precision_score(y_true, y_predict):
    """Precision = TP / (TP + FP); returns 0 when nothing is predicted positive.

    BUG FIX: the counts are numpy integers, and numpy's 0/0 produces nan plus
    a RuntimeWarning instead of raising ZeroDivisionError, so the original
    bare try/except never fired and the function returned nan instead of the
    intended 0. The denominator is now checked explicitly (the TP/FP counts
    are inlined so the function stands alone).
    """
    assert len(y_true) == len(y_predict)
    tp = np.sum((y_true == 1) & (y_predict == 1))
    fp = np.sum((y_true == 0) & (y_predict == 1))
    predicted_positive = tp + fp
    if predicted_positive == 0:
        return 0
    return tp / predicted_positive

def recall_score(y_true, y_predict):
    """Recall = TP / (TP + FN); returns 0 when there are no actual positives.

    BUG FIX: numpy integer 0/0 yields nan (with a RuntimeWarning) rather than
    raising, so the original bare try/except never fired and the function
    returned nan instead of 0. Explicit denominator check instead; the TP/FN
    counts are inlined so the function stands alone.
    """
    assert len(y_true) == len(y_predict)
    tp = np.sum((y_true == 1) & (y_predict == 1))
    fn = np.sum((y_true == 1) & (y_predict == 0))
    actual_positive = tp + fn
    if actual_positive == 0:
        return 0
    return tp / actual_positive

def f1_score(y_true, y_predict):
    """Harmonic mean of precision and recall; 0 when both are 0.

    BUG FIX: the original delegated to sibling precision/recall functions
    whose numpy 0/0 silently yields nan (no exception), so the f1 could come
    out nan. Precision and recall are computed here with explicit
    zero-denominator handling.
    """
    assert len(y_true) == len(y_predict)
    tp = np.sum((y_true == 1) & (y_predict == 1))
    fp = np.sum((y_true == 0) & (y_predict == 1))
    fn = np.sum((y_true == 1) & (y_predict == 0))
    precision = tp / (tp + fp) if tp + fp != 0 else 0
    recall = tp / (tp + fn) if tp + fn != 0 else 0
    if precision + recall == 0:
        return 0
    return 2 * precision * recall / (precision + recall)

def TPR(y_true, y_predict):
    """True-positive rate (= recall): TP / (TP + FN); 0 with no actual positives.

    BUG FIX: explicit zero-denominator check replaces the bare try/except,
    which never fired because numpy integer 0/0 yields nan rather than
    raising. Counts are inlined so the function stands alone.
    """
    assert len(y_true) == len(y_predict)
    tp = np.sum((y_true == 1) & (y_predict == 1))
    fn = np.sum((y_true == 1) & (y_predict == 0))
    actual_positive = tp + fn
    if actual_positive == 0:
        return 0
    return tp / actual_positive

def FPR(y_true, y_predict):
    """False-positive rate: FP / (FP + TN); 0 when there are no actual negatives.

    BUG FIX: explicit zero-denominator check (numpy integer 0/0 is nan plus a
    RuntimeWarning, not an exception, so the original bare try/except never
    fired). Counts are inlined so the function stands alone.
    """
    assert len(y_true) == len(y_predict)
    fp = np.sum((y_true == 0) & (y_predict == 1))
    tn = np.sum((y_true == 0) & (y_predict == 0))
    actual_negative = fp + tn
    if actual_negative == 0:
        return 0
    return fp / actual_negative
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from playLA.metrics import FPR, TPR

if __name__ == "__main__":

    # FIX: `digits` was referenced without ever being loaded (NameError at runtime).
    digits = datasets.load_digits()
    X = digits.data
    y = digits.target.copy()
    # Skew the labels: 1 for digit 9, 0 otherwise.
    y[digits.target == 9] = 1
    y[digits.target != 9] = 0
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)
    # Decision scores for every test sample.
    decision_scores = log_reg.decision_function(X_test)
    fprs = []
    tprs = []
    # Candidate thresholds from the minimum to the maximum score, step 0.1.
    thresholds = np.arange(np.min(decision_scores), np.max(decision_scores), 0.1)
    for threshold in thresholds:
        # Classify with the current threshold and record the (FPR, TPR) point.
        y_predict = np.array(decision_scores >= threshold, dtype='int')
        fprs.append(FPR(y_test, y_predict))
        tprs.append(TPR(y_test, y_predict))
    # The ROC curve.
    plt.plot(fprs, tprs)
    plt.show()


import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from playLA.metrics import FPR, TPR
from sklearn.metrics import roc_curve

if __name__ == "__main__":

    # FIX: `digits` was referenced without ever being loaded (NameError at runtime).
    digits = datasets.load_digits()
    X = digits.data
    y = digits.target.copy()
    # Skew the labels: 1 for digit 9, 0 otherwise.
    y[digits.target == 9] = 1
    y[digits.target != 9] = 0
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)
    decision_scores = log_reg.decision_function(X_test)
    # Manual ROC sweep (kept for comparison with sklearn's roc_curve).
    fprs = []
    tprs = []
    thresholds = np.arange(np.min(decision_scores), np.max(decision_scores), 0.1)
    for threshold in thresholds:
        y_predict = np.array(decision_scores >= threshold, dtype='int')
        fprs.append(FPR(y_test, y_predict))
        tprs.append(TPR(y_test, y_predict))

    # sklearn builds the ROC curve directly from the raw scores.
    fprs, tprs, thresholds = roc_curve(y_test, decision_scores)
    plt.plot(fprs, tprs)
    plt.show()


import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from playLA.metrics import FPR, TPR
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

if __name__ == "__main__":

    # FIX: `digits` was referenced without ever being loaded (NameError at runtime).
    digits = datasets.load_digits()
    X = digits.data
    y = digits.target.copy()
    # Skew the labels: 1 for digit 9, 0 otherwise.
    y[digits.target == 9] = 1
    y[digits.target != 9] = 0
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)
    decision_scores = log_reg.decision_function(X_test)
    # Manual ROC sweep (kept for comparison with sklearn's roc_curve).
    fprs = []
    tprs = []
    thresholds = np.arange(np.min(decision_scores), np.max(decision_scores), 0.1)
    for threshold in thresholds:
        y_predict = np.array(decision_scores >= threshold, dtype='int')
        fprs.append(FPR(y_test, y_predict))
        tprs.append(TPR(y_test, y_predict))

    fprs, tprs, thresholds = roc_curve(y_test, decision_scores)
    # Area under the ROC curve: the closer to 1, the better the model.
    print(roc_auc_score(y_test, decision_scores))


0.9823868312757201

ROC的应用场合主要就在比较两个模型的优劣。上图中的两根曲线就代表了两个模型或者是同一个算法，它们使用超参数不同使用的模型对应的结果。在这种情况下，我们应该选择ROC曲线下面的面积更大的那个模型被认为是更好的模型。

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

if __name__ == "__main__":

    # FIX: `digits` was referenced without ever being loaded (NameError at runtime).
    digits = datasets.load_digits()
    X = digits.data
    y = digits.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=666)
    log_reg = LogisticRegression()
    # Ten-class problem, handled via OvR.
    log_reg.fit(X_train, y_train)
    # Multiclass accuracy.
    print(log_reg.score(X_test, y_test))
    y_predict = log_reg.predict(X_test)
    # Micro-averaged precision across the ten classes.
    print(precision_score(y_test, y_predict, average='micro'))
    # Micro-averaged recall across the ten classes.
    print(recall_score(y_test, y_predict, average='micro'))


0.9408901251738526
0.9408901251738526
0.9408901251738526

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

if __name__ == "__main__":

    # FIX: `digits` was referenced without ever being loaded (NameError at runtime).
    digits = datasets.load_digits()
    X = digits.data
    y = digits.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=666)
    log_reg = LogisticRegression()
    # Ten-class problem, handled via OvR.
    log_reg.fit(X_train, y_train)
    # Multiclass accuracy.
    print(log_reg.score(X_test, y_test))
    y_predict = log_reg.predict(X_test)
    # Micro-averaged precision across the ten classes.
    print(precision_score(y_test, y_predict, average='micro'))
    # Micro-averaged recall across the ten classes.
    print(recall_score(y_test, y_predict, average='micro'))
    # Full 10x10 confusion matrix for the ten-class problem.
    print(confusion_matrix(y_test, y_predict))

0.9408901251738526
0.9408901251738526
0.9408901251738526
[[148   0   1   0   0   0   0   0   0   0]
[  0 125   2   0   0   0   0   3   2  11]
[  0   1 134   0   0   0   0   0   1   0]
[  0   0   1 138   0   5   0   1   4   0]
[  2   4   0   0 138   0   1   3   0   2]
[  1   2   1   0   0 146   1   0   0   1]
[  0   2   0   0   0   1 132   0   1   0]
[  0   0   0   0   0   0   0 135   0   1]
[  0   8   2   1   3   3   0   1 120   2]
[  0   1   0   6   0   1   0   1   1 137]]

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

if __name__ == "__main__":

    # Fix: `digits` was used without being defined in this snippet.
    digits = datasets.load_digits()
    X = digits.data
    y = digits.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=666)
    log_reg = LogisticRegression()
    # sklearn's LogisticRegression handles the multiclass problem (OvR strategy)
    log_reg.fit(X_train, y_train)
    # Multiclass accuracy
    print(log_reg.score(X_test, y_test))
    y_predict = log_reg.predict(X_test)
    # Micro-averaged multiclass precision
    print(precision_score(y_test, y_predict, average='micro'))
    # Micro-averaged multiclass recall
    print(recall_score(y_test, y_predict, average='micro'))
    # 10x10 confusion matrix for the ten digit classes
    print(confusion_matrix(y_test, y_predict))
    cfm = confusion_matrix(y_test, y_predict)
    # Visualise the matrix: brighter cells hold larger counts
    plt.matshow(cfm, cmap=plt.cm.gray)
    plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

if __name__ == "__main__":

    # Fix: `digits` was used without being defined in this snippet.
    digits = datasets.load_digits()
    X = digits.data
    y = digits.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=666)
    log_reg = LogisticRegression()
    # sklearn's LogisticRegression handles the multiclass problem (OvR strategy)
    log_reg.fit(X_train, y_train)
    # Multiclass accuracy
    print(log_reg.score(X_test, y_test))
    y_predict = log_reg.predict(X_test)
    # Micro-averaged multiclass precision
    print(precision_score(y_test, y_predict, average='micro'))
    # Micro-averaged multiclass recall
    print(recall_score(y_test, y_predict, average='micro'))
    # 10x10 confusion matrix for the ten digit classes
    print(confusion_matrix(y_test, y_predict))
    cfm = confusion_matrix(y_test, y_predict)
    # Number of true samples of each class (sum over each ROW)
    row_sums = np.sum(cfm, axis=1)
    # Fix: normalize each row by its own total. The original `cfm / row_sums`
    # broadcast along the LAST axis, dividing column j by row_sums[j] instead
    # of dividing row i by row_sums[i] as the comment intended.
    err_matrix = cfm / row_sums.reshape(-1, 1)
    # Zero the diagonal so only misclassification rates remain visible
    np.fill_diagonal(err_matrix, 0)
    print(err_matrix)
    plt.matshow(err_matrix, cmap=plt.cm.gray)
    plt.show()

0.9408901251738526
0.9408901251738526
0.9408901251738526
[[148   0   1   0   0   0   0   0   0   0]
[  0 125   2   0   0   0   0   3   2  11]
[  0   1 134   0   0   0   0   0   1   0]
[  0   0   1 138   0   5   0   1   4   0]
[  2   4   0   0 138   0   1   3   0   2]
[  1   2   1   0   0 146   1   0   0   1]
[  0   2   0   0   0   1 132   0   1   0]
[  0   0   0   0   0   0   0 135   0   1]
[  0   8   2   1   3   3   0   1 120   2]
[  0   1   0   6   0   1   0   1   1 137]]
[[0.         0.         0.00735294 0.         0.         0.
0.         0.         0.         0.        ]
[0.         0.         0.01470588 0.         0.         0.
0.         0.02205882 0.01428571 0.07482993]
[0.         0.00699301 0.         0.         0.         0.
0.         0.         0.00714286 0.        ]
[0.         0.         0.00735294 0.         0.         0.03289474
0.         0.00735294 0.02857143 0.        ]
[0.01342282 0.02797203 0.         0.         0.         0.
0.00735294 0.02205882 0.         0.01360544]
[0.00671141 0.01398601 0.00735294 0.         0.         0.
0.00735294 0.         0.         0.00680272]
[0.         0.01398601 0.         0.         0.         0.00657895
0.         0.         0.00714286 0.        ]
[0.         0.         0.         0.         0.         0.
0.         0.         0.         0.00680272]
[0.         0.05594406 0.01470588 0.00671141 0.02       0.01973684
0.         0.00735294 0.         0.01360544]
[0.         0.00699301 0.         0.04026846 0.         0.00657895
0.         0.00735294 0.00714286 0.        ]]


# 支撑向量机SVM

SVM是Support Vector Machine的缩写。支撑向量机既可以解决分类问题，也可以解决回归问题。

SVM背后的最优化问题

SVM要最大化margin，而margin=2d，所以只要我们找到这个d，就可以得到margin。

Soft Margin SVM

scikit-learn中的SVM

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

if __name__ == "__main__":

    # Fix: `iris` was used without being defined in this snippet.
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    # Binary classification only; keep the first two features for 2-D plotting
    X = X[y < 2, :2]
    y = y[y < 2]
    plt.scatter(X[y == 0, 0], X[y == 0, 1], color='red')
    plt.scatter(X[y == 1, 0], X[y == 1, 1], color='blue')
    plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from matplotlib.colors import ListedColormap

if __name__ == "__main__":

    # Fix: `iris` was used without being defined in this snippet.
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    # Binary classification only; keep the first two features for 2-D plotting
    X = X[y < 2, :2]
    y = y[y < 2]
    # SVM is sensitive to feature scale, so standardize first
    standardScaler = StandardScaler()
    standardScaler.fit(X)
    X_standard = standardScaler.transform(X)
    # A very large C approximates a Hard Margin SVM
    svc = LinearSVC(C=1e9)
    svc.fit(X_standard, y)

    def plot_decision_boundary(model, axis):
        """Shade the model's predicted class over the rectangle `axis`."""
        x0, x1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        X_new = np.c_[x0.ravel(), x1.ravel()]
        y_predict = model.predict(X_new)
        zz = y_predict.reshape(x0.shape)
        custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        plt.contourf(x0, x1, zz, linewidth=5, cmap=custom_cmap)

    plot_decision_boundary(svc, axis=[-3, 3, -3, 3])
    plt.scatter(X_standard[y == 0, 0], X_standard[y == 0, 1], color='red')
    plt.scatter(X_standard[y == 1, 0], X_standard[y == 1, 1], color='blue')
    plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from matplotlib.colors import ListedColormap

if __name__ == "__main__":

    # Fix: `iris` was used without being defined in this snippet.
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    # Binary classification only; keep the first two features for 2-D plotting
    X = X[y < 2, :2]
    y = y[y < 2]
    standardScaler = StandardScaler()
    standardScaler.fit(X)
    X_standard = standardScaler.transform(X)
    # A very large C approximates a Hard Margin SVM
    svc = LinearSVC(C=1e9)
    svc.fit(X_standard, y)

    def plot_decision_boundary(model, axis):
        """Shade the model's predicted class over the rectangle `axis`."""
        x0, x1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        X_new = np.c_[x0.ravel(), x1.ravel()]
        y_predict = model.predict(X_new)
        zz = y_predict.reshape(x0.shape)
        custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        plt.contourf(x0, x1, zz, linewidth=5, cmap=custom_cmap)

    # A small C allows many margin violations (Soft Margin SVM)
    svc2 = LinearSVC(C=0.01)
    svc2.fit(X_standard, y)
    plot_decision_boundary(svc2, axis=[-3, 3, -3, 3])
    plt.scatter(X_standard[y == 0, 0], X_standard[y == 0, 1], color='red')
    plt.scatter(X_standard[y == 1, 0], X_standard[y == 1, 1], color='blue')
    plt.show()


import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from matplotlib.colors import ListedColormap

if __name__ == "__main__":

    # Fix: `iris` was used without being defined in this snippet.
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    # Binary classification only; keep the first two features for 2-D plotting
    X = X[y < 2, :2]
    y = y[y < 2]
    standardScaler = StandardScaler()
    standardScaler.fit(X)
    X_standard = standardScaler.transform(X)
    # A very large C approximates a Hard Margin SVM
    svc = LinearSVC(C=1e9)
    svc.fit(X_standard, y)
    # A small C yields a Soft Margin SVM (kept for side-by-side comparison)
    svc2 = LinearSVC(C=0.01)
    svc2.fit(X_standard, y)

    def plot_svc_decision_boundary(model, axis):
        """Shade the decision regions and draw the two margin lines.

        The margin lines are where w.x + b = +1 and w.x + b = -1,
        solved for the second feature: x1 = (-w0*x0 - b +/- 1) / w1.
        """
        x0, x1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        X_new = np.c_[x0.ravel(), x1.ravel()]
        y_predict = model.predict(X_new)
        zz = y_predict.reshape(x0.shape)
        custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        plt.contourf(x0, x1, zz, linewidth=5, cmap=custom_cmap)
        w = model.coef_[0]
        b = model.intercept_[0]
        plot_x = np.linspace(axis[0], axis[1], 200)
        up_y = - w[0] / w[1] * plot_x - b / w[1] + 1 / w[1]
        down_y = - w[0] / w[1] * plot_x - b / w[1] - 1 / w[1]
        # Only draw the margin segments that fall inside the axis window
        up_index = (up_y >= axis[2]) & (up_y <= axis[3])
        down_index = (down_y >= axis[2]) & (down_y <= axis[3])
        plt.plot(plot_x[up_index], up_y[up_index], color='black')
        plt.plot(plot_x[down_index], down_y[down_index], color='black')

    # Coefficients and intercept of the hard-margin model
    print(svc.coef_)
    print(svc.intercept_)
    plot_svc_decision_boundary(svc, axis=[-3, 3, -3, 3])
    plt.scatter(X_standard[y == 0, 0], X_standard[y == 0, 1], color='red')
    plt.scatter(X_standard[y == 1, 0], X_standard[y == 1, 1], color='blue')
    plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from matplotlib.colors import ListedColormap

if __name__ == "__main__":

    # Fix: `iris` was used without being defined in this snippet.
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    # Binary classification only; keep the first two features for 2-D plotting
    X = X[y < 2, :2]
    y = y[y < 2]
    standardScaler = StandardScaler()
    standardScaler.fit(X)
    X_standard = standardScaler.transform(X)
    # A very large C approximates a Hard Margin SVM
    svc = LinearSVC(C=1e9)
    svc.fit(X_standard, y)
    # A small C yields a Soft Margin SVM: a wide margin with violations
    svc2 = LinearSVC(C=0.01)
    svc2.fit(X_standard, y)

    def plot_svc_decision_boundary(model, axis):
        """Shade the decision regions and draw the two margin lines.

        The margin lines are where w.x + b = +1 and w.x + b = -1,
        solved for the second feature: x1 = (-w0*x0 - b +/- 1) / w1.
        """
        x0, x1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        X_new = np.c_[x0.ravel(), x1.ravel()]
        y_predict = model.predict(X_new)
        zz = y_predict.reshape(x0.shape)
        custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        plt.contourf(x0, x1, zz, linewidth=5, cmap=custom_cmap)
        w = model.coef_[0]
        b = model.intercept_[0]
        plot_x = np.linspace(axis[0], axis[1], 200)
        up_y = - w[0] / w[1] * plot_x - b / w[1] + 1 / w[1]
        down_y = - w[0] / w[1] * plot_x - b / w[1] - 1 / w[1]
        # Only draw the margin segments that fall inside the axis window
        up_index = (up_y >= axis[2]) & (up_y <= axis[3])
        down_index = (down_y >= axis[2]) & (down_y <= axis[3])
        plt.plot(plot_x[up_index], up_y[up_index], color='black')
        plt.plot(plot_x[down_index], down_y[down_index], color='black')

    # Coefficients and intercept of the soft-margin model
    print(svc2.coef_)
    print(svc2.intercept_)
    plot_svc_decision_boundary(svc2, axis=[-3, 3, -3, 3])
    plt.scatter(X_standard[y == 0, 0], X_standard[y == 0, 1], color='red')
    plt.scatter(X_standard[y == 1, 0], X_standard[y == 1, 1], color='blue')
    plt.show()

SVM中使用多项式特征和核函数

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

if __name__ == "__main__":

    # Generate a moon-shaped toy dataset (default: 100 noise-free samples)
    X, y = datasets.make_moons()
    print(X.shape)
    print(y.shape)
    for cls, colour in ((0, 'red'), (1, 'blue')):
        plt.scatter(X[y == cls, 0], X[y == cls, 1], color=colour)
    plt.show()

(100, 2)
(100,)

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

if __name__ == "__main__":

    # Moon-shaped dataset with Gaussian noise; fixed seed for reproducibility
    X, y = datasets.make_moons(noise=0.15, random_state=666)
    for cls, colour in ((0, 'red'), (1, 'blue')):
        plt.scatter(X[y == cls, 0], X[y == cls, 1], color=colour)
    plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from matplotlib.colors import ListedColormap

if __name__ == "__main__":

    # Moon-shaped dataset with Gaussian noise; fixed seed for reproducibility
    X, y = datasets.make_moons(noise=0.15, random_state=666)

    def PolynomialSVC(degree, C=1):
        """Linear SVM trained on explicit polynomial features of `degree`."""
        return Pipeline([
            ("poly", PolynomialFeatures(degree=degree)),
            ("std_scaler", StandardScaler()),
            ("linearSVC", LinearSVC(C=C))
        ])

    poly_svc = PolynomialSVC(degree=3)
    poly_svc.fit(X, y)

    def plot_decision_boundary(model, axis):
        """Shade the model's predicted class over the rectangle `axis`."""
        g0, g1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        grid = np.c_[g0.ravel(), g1.ravel()]
        regions = model.predict(grid).reshape(g0.shape)
        palette = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        plt.contourf(g0, g1, regions, linewidth=5, cmap=palette)

    plot_decision_boundary(poly_svc, axis=[-1.5, 2.5, -1, 1.5])
    for cls, colour in ((0, 'red'), (1, 'blue')):
        plt.scatter(X[y == cls, 0], X[y == cls, 1], color=colour)
    plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from matplotlib.colors import ListedColormap
from sklearn.svm import SVC

if __name__ == "__main__":

    # Moon-shaped dataset with Gaussian noise; fixed seed for reproducibility
    X, y = datasets.make_moons(noise=0.15, random_state=666)

    def PolynomialSVC(degree, C=1):
        """Linear SVM trained on explicit polynomial features of `degree`."""
        return Pipeline([
            ("poly", PolynomialFeatures(degree=degree)),
            ("std_scaler", StandardScaler()),
            ("linearSVC", LinearSVC(C=C))
        ])

    poly_svc = PolynomialSVC(degree=3)
    poly_svc.fit(X, y)

    def plot_decision_boundary(model, axis):
        """Shade the model's predicted class over the rectangle `axis`."""
        g0, g1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        grid = np.c_[g0.ravel(), g1.ravel()]
        regions = model.predict(grid).reshape(g0.shape)
        palette = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        plt.contourf(g0, g1, regions, linewidth=5, cmap=palette)

    def PolynomialKernelSVC(degree, C=1):
        """SVM using the polynomial KERNEL (no explicit feature expansion)."""
        return Pipeline([
            ("std_scaler", StandardScaler()),
            ("kernelSVC", SVC(kernel='poly', degree=degree, C=C))
        ])

    poly_kernel_svc = PolynomialKernelSVC(degree=3)
    poly_kernel_svc.fit(X, y)
    plot_decision_boundary(poly_kernel_svc, axis=[-1.5, 2.5, -1, 1.5])
    for cls, colour in ((0, 'red'), (1, 'blue')):
        plt.scatter(X[y == cls, 0], X[y == cls, 1], color=colour)
    plt.show()


import numpy as np
import matplotlib.pyplot as plt

if __name__ == "__main__":

    # One-dimensional integer samples in [-4, 4]
    x = np.arange(-4, 5, 1)
    print(x)
    # Labels: 1 inside [-2, 2], 0 outside -- not linearly separable in 1-D
    y = np.array((x >= -2) & (x <= 2), dtype='int')
    print(y)
    for cls in (0, 1):
        points = x[y == cls]
        plt.scatter(points, [0] * len(points))
    plt.show()

[-4 -3 -2 -1  0  1  2  3  4]
[0 0 1 1 1 1 1 0 0]


import numpy as np
import matplotlib.pyplot as plt

if __name__ == "__main__":

    # One-dimensional samples with labels that are not linearly separable
    x = np.arange(-4, 5, 1)
    y = np.array((x >= -2) & (x <= 2), dtype='int')

    def gaussian(x, l):
        """RBF feature of x relative to landmark l, with gamma fixed at 1."""
        gamma = 1
        return np.exp(- gamma * (x - l)**2)

    # Two landmarks; mapping each sample to (gaussian(x, l1), gaussian(x, l2))
    # lifts the data into 2-D, where it becomes linearly separable.
    l1, l2 = -1, 1
    X_new = np.empty((len(x), 2))
    for row, value in enumerate(x):
        X_new[row, 0] = gaussian(value, l1)
        X_new[row, 1] = gaussian(value, l2)
    for cls in (0, 1):
        plt.scatter(X_new[y == cls, 0], X_new[y == cls, 1])
    plt.show()

scikit-learn中的高斯RBF核

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

if __name__ == "__main__":

    # Moon-shaped dataset with Gaussian noise; fixed seed for reproducibility
    X, y = datasets.make_moons(noise=0.15, random_state=666)
    for cls in (0, 1):
        plt.scatter(X[y == cls, 0], X[y == cls, 1])
    plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from matplotlib.colors import ListedColormap

if __name__ == "__main__":

    # Moon-shaped dataset with Gaussian noise; fixed seed for reproducibility
    X, y = datasets.make_moons(noise=0.15, random_state=666)

    def RBFKernelSVC(gamma=1):
        """Build an SVM classifier with a Gaussian (RBF) kernel."""
        return Pipeline([
            ("std_scaler", StandardScaler()),
            ("svc", SVC(kernel='rbf', gamma=gamma))
        ])

    svc = RBFKernelSVC()
    svc.fit(X, y)

    def plot_decision_boundary(model, axis):
        """Shade the model's predicted class over the rectangle `axis`."""
        g0, g1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        grid = np.c_[g0.ravel(), g1.ravel()]
        regions = model.predict(grid).reshape(g0.shape)
        palette = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        plt.contourf(g0, g1, regions, linewidth=5, cmap=palette)

    plot_decision_boundary(svc, axis=[-1.5, 2.5, -1, 1.5])
    for cls in (0, 1):
        plt.scatter(X[y == cls, 0], X[y == cls, 1])
    plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from matplotlib.colors import ListedColormap

if __name__ == "__main__":

    # Moon-shaped dataset with Gaussian noise; fixed seed for reproducibility
    X, y = datasets.make_moons(noise=0.15, random_state=666)

    def RBFKernelSVC(gamma=1):
        """Build an SVM classifier with a Gaussian (RBF) kernel."""
        return Pipeline([
            ("std_scaler", StandardScaler()),
            ("svc", SVC(kernel='rbf', gamma=gamma))
        ])

    svc = RBFKernelSVC()
    svc.fit(X, y)

    def plot_decision_boundary(model, axis):
        """Shade the model's predicted class over the rectangle `axis`."""
        g0, g1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        grid = np.c_[g0.ravel(), g1.ravel()]
        regions = model.predict(grid).reshape(g0.shape)
        palette = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        plt.contourf(g0, g1, regions, linewidth=5, cmap=palette)

    # Very large gamma: each sample's influence is extremely local (overfits)
    svc_gamma100 = RBFKernelSVC(gamma=100)
    svc_gamma100.fit(X, y)
    plot_decision_boundary(svc_gamma100, axis=[-1.5, 2.5, -1, 1.5])
    for cls in (0, 1):
        plt.scatter(X[y == cls, 0], X[y == cls, 1])
    plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from matplotlib.colors import ListedColormap

if __name__ == "__main__":

    # Moon-shaped dataset with Gaussian noise; fixed seed for reproducibility
    X, y = datasets.make_moons(noise=0.15, random_state=666)

    def RBFKernelSVC(gamma=1):
        """Build an SVM classifier with a Gaussian (RBF) kernel."""
        return Pipeline([
            ("std_scaler", StandardScaler()),
            ("svc", SVC(kernel='rbf', gamma=gamma))
        ])

    svc = RBFKernelSVC()
    svc.fit(X, y)

    def plot_decision_boundary(model, axis):
        """Shade the model's predicted class over the rectangle `axis`."""
        g0, g1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        grid = np.c_[g0.ravel(), g1.ravel()]
        regions = model.predict(grid).reshape(g0.shape)
        palette = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        plt.contourf(g0, g1, regions, linewidth=5, cmap=palette)

    svc_gamma100 = RBFKernelSVC(gamma=100)
    svc_gamma100.fit(X, y)
    # Moderate gamma: tighter fit than the default but less extreme than 100
    svc_gamma10 = RBFKernelSVC(gamma=10)
    svc_gamma10.fit(X, y)
    plot_decision_boundary(svc_gamma10, axis=[-1.5, 2.5, -1, 1.5])
    for cls in (0, 1):
        plt.scatter(X[y == cls, 0], X[y == cls, 1])
    plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from matplotlib.colors import ListedColormap

if __name__ == "__main__":

    # Moon-shaped dataset with Gaussian noise; fixed seed for reproducibility
    X, y = datasets.make_moons(noise=0.15, random_state=666)

    def RBFKernelSVC(gamma=1):
        """Build an SVM classifier with a Gaussian (RBF) kernel."""
        return Pipeline([
            ("std_scaler", StandardScaler()),
            ("svc", SVC(kernel='rbf', gamma=gamma))
        ])

    svc = RBFKernelSVC()
    svc.fit(X, y)

    def plot_decision_boundary(model, axis):
        """Shade the model's predicted class over the rectangle `axis`."""
        g0, g1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        grid = np.c_[g0.ravel(), g1.ravel()]
        regions = model.predict(grid).reshape(g0.shape)
        palette = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        plt.contourf(g0, g1, regions, linewidth=5, cmap=palette)

    svc_gamma100 = RBFKernelSVC(gamma=100)
    svc_gamma100.fit(X, y)
    svc_gamma10 = RBFKernelSVC(gamma=10)
    svc_gamma10.fit(X, y)
    # Small gamma: each sample's influence spreads wide (smoother boundary)
    svc_gamma05 = RBFKernelSVC(gamma=0.5)
    svc_gamma05.fit(X, y)
    plot_decision_boundary(svc_gamma05, axis=[-1.5, 2.5, -1, 1.5])
    for cls in (0, 1):
        plt.scatter(X[y == cls, 0], X[y == cls, 1])
    plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from matplotlib.colors import ListedColormap

if __name__ == "__main__":

    # Moon-shaped dataset with Gaussian noise; fixed seed for reproducibility
    X, y = datasets.make_moons(noise=0.15, random_state=666)

    def RBFKernelSVC(gamma=1):
        """Build an SVM classifier with a Gaussian (RBF) kernel."""
        return Pipeline([
            ("std_scaler", StandardScaler()),
            ("svc", SVC(kernel='rbf', gamma=gamma))
        ])

    svc = RBFKernelSVC()
    svc.fit(X, y)

    def plot_decision_boundary(model, axis):
        """Shade the model's predicted class over the rectangle `axis`."""
        g0, g1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        grid = np.c_[g0.ravel(), g1.ravel()]
        regions = model.predict(grid).reshape(g0.shape)
        palette = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        plt.contourf(g0, g1, regions, linewidth=5, cmap=palette)

    svc_gamma100 = RBFKernelSVC(gamma=100)
    svc_gamma100.fit(X, y)
    svc_gamma10 = RBFKernelSVC(gamma=10)
    svc_gamma10.fit(X, y)
    svc_gamma05 = RBFKernelSVC(gamma=0.5)
    svc_gamma05.fit(X, y)
    # Very small gamma: nearly linear boundary (underfits)
    svc_gamma01 = RBFKernelSVC(gamma=0.1)
    svc_gamma01.fit(X, y)
    plot_decision_boundary(svc_gamma01, axis=[-1.5, 2.5, -1, 1.5])
    for cls in (0, 1):
        plt.scatter(X[y == cls, 0], X[y == cls, 1])
    plt.show()

SVM思想解决回归问题

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVR
# SVR accepts different kernel functions (kept for experimentation)
from sklearn.svm import SVR

if __name__ == "__main__":

    # Fix: `boston` was used without being defined in this snippet.
    # NOTE(review): load_boston was removed in scikit-learn 1.2; this
    # snippet requires an older scikit-learn version.
    boston = datasets.load_boston()
    X = boston.data
    y = boston.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)

    def StandardLinearSVR(epsilon=0.1):
        """Linear SVR with feature standardization; epsilon is the margin width."""
        return Pipeline([
            ("std_scaler", StandardScaler()),
            ("linearSVR", LinearSVR(epsilon=epsilon))
        ])

    svr = StandardLinearSVR()
    svr.fit(X_train, y_train)
    # R^2 score on the held-out test set
    print(svr.score(X_test, y_test))

0.6361902835717916

0
0 收藏

0 评论
0 收藏
0