2021/09/19 00:42

决策树

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

if __name__ == "__main__":
    # BUGFIX: the original referenced `iris` without ever loading it.
    iris = datasets.load_iris()
    # Keep only the last two iris features (petal length / petal width).
    X = iris.data[:, 2:]
    y = iris.target
    # Scatter the three classes in the 2-D feature space.
    plt.scatter(X[y == 0, 0], X[y == 0, 1])
    plt.scatter(X[y == 1, 0], X[y == 1, 1])
    plt.scatter(X[y == 2, 0], X[y == 2, 1])
    plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from matplotlib.colors import ListedColormap

if __name__ == "__main__":
    # BUGFIX: the original referenced `iris` without ever loading it.
    iris = datasets.load_iris()
    # Keep only the last two iris features.
    X = iris.data[:, 2:]
    y = iris.target
    # Depth-2 decision tree using information entropy as the split criterion.
    dt_clf = DecisionTreeClassifier(max_depth=2, criterion='entropy')
    dt_clf.fit(X, y)

    def plot_decision_boundary(model, axis):
        """Plot the decision regions of a fitted 2-feature classifier.

        axis is [x_min, x_max, y_min, y_max]; a ~100-points-per-unit grid is
        predicted and filled per class.
        """
        x0, x1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        X_new = np.c_[x0.ravel(), x1.ravel()]
        y_predict = model.predict(X_new)
        zz = y_predict.reshape(x0.shape)
        custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        # BUGFIX: dropped the invalid 'linewidth=5' kwarg; contourf does not
        # accept it (newer matplotlib rejects unknown kwargs).
        plt.contourf(x0, x1, zz, cmap=custom_cmap)

    plot_decision_boundary(dt_clf, axis=[0.5, 7.5, 0, 3])
    plt.scatter(X[y == 0, 0], X[y == 0, 1])
    plt.scatter(X[y == 1, 0], X[y == 1, 1])
    plt.scatter(X[y == 2, 0], X[y == 2, 1])
    plt.show()

import numpy as np
import matplotlib.pyplot as plt

if __name__ == "__main__":

    def entropy(p):
        """Two-class information entropy (natural log) as a function of p.

        Undefined at p == 0 and p == 1 (log(0)), so callers must keep p
        strictly inside (0, 1).
        """
        return -p * np.log(p) - (1 - p) * np.log(1 - p)

    # Stay strictly inside (0, 1) to avoid log(0).
    x = np.linspace(0.01, 0.99, 200)
    plt.plot(x, entropy(x))
    plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from matplotlib.colors import ListedColormap
from collections import Counter
from math import log

if __name__ == "__main__":
    # BUGFIX: the original referenced `iris` without ever loading it.
    iris = datasets.load_iris()
    X = iris.data[:, 2:]
    y = iris.target
    dt_clf = DecisionTreeClassifier(max_depth=2, criterion='entropy')
    dt_clf.fit(X, y)
    # NOTE(review): removed the unused plot_decision_boundary helper — every
    # call to it in this snippet was commented out.

    def split(X, y, d, value):
        """Split (X, y) on feature dimension d at the given threshold.

        Returns (X_left, X_right, y_left, y_right): the left part satisfies
        X[:, d] <= value, the right part X[:, d] > value.
        """
        index_a = (X[:, d] <= value)
        index_b = (X[:, d] > value)
        return X[index_a], X[index_b], y[index_a], y[index_b]

    def entropy(y):
        """Information entropy (natural log) of the label array y."""
        counter = Counter(y)  # class -> count
        res = 0
        for num in counter.values():
            p = num / len(y)
            res += -p * log(p)
        return res

    def try_split(X, y):
        """Exhaustively search for the (dimension, threshold) split that
        minimizes the summed entropy of the two children.

        Returns (best_entropy, best_d, best_v)."""
        # Best entropy so far; start at +inf so any real split wins.
        best_entropy = float('inf')
        # Best dimension / threshold; -1 means "not found yet".
        best_d, best_v = -1, -1
        for d in range(X.shape[1]):
            # Sort samples along dimension d.
            sorted_index = np.argsort(X[:, d])
            for i in range(1, len(X)):
                # Candidate threshold: midpoint of adjacent DISTINCT values.
                if X[sorted_index[i - 1], d] != X[sorted_index[i], d]:
                    v = (X[sorted_index[i - 1], d] + X[sorted_index[i], d]) / 2
                    X_l, X_r, y_l, y_r = split(X, y, d, v)
                    e = entropy(y_l) + entropy(y_r)
                    if e < best_entropy:
                        best_entropy, best_d, best_v = e, d, v
        return best_entropy, best_d, best_v

    best_entropy, best_d, best_v = try_split(X, y)
    print("best_entropy =", best_entropy)
    print("best_d =", best_d)
    print("best_v =", best_v)

best_entropy = 0.6931471805599453
best_d = 0
best_v = 2.45

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from matplotlib.colors import ListedColormap
from collections import Counter
from math import log

if __name__ == "__main__":
    # BUGFIX: the original referenced `iris` without ever loading it.
    iris = datasets.load_iris()
    X = iris.data[:, 2:]
    y = iris.target
    dt_clf = DecisionTreeClassifier(max_depth=2, criterion='entropy')
    dt_clf.fit(X, y)
    # NOTE(review): removed the unused plot_decision_boundary helper — every
    # call to it in this snippet was commented out.

    def split(X, y, d, value):
        """Split (X, y) on feature dimension d at the given threshold.

        Returns (X_left, X_right, y_left, y_right): the left part satisfies
        X[:, d] <= value, the right part X[:, d] > value.
        """
        index_a = (X[:, d] <= value)
        index_b = (X[:, d] > value)
        return X[index_a], X[index_b], y[index_a], y[index_b]

    def entropy(y):
        """Information entropy (natural log) of the label array y."""
        counter = Counter(y)  # class -> count
        res = 0
        for num in counter.values():
            p = num / len(y)
            res += -p * log(p)
        return res

    def try_split(X, y):
        """Exhaustively search for the (dimension, threshold) split that
        minimizes the summed entropy of the two children.

        Returns (best_entropy, best_d, best_v)."""
        best_entropy = float('inf')
        best_d, best_v = -1, -1
        for d in range(X.shape[1]):
            sorted_index = np.argsort(X[:, d])
            for i in range(1, len(X)):
                # Candidate threshold: midpoint of adjacent DISTINCT values.
                if X[sorted_index[i - 1], d] != X[sorted_index[i], d]:
                    v = (X[sorted_index[i - 1], d] + X[sorted_index[i], d]) / 2
                    X_l, X_r, y_l, y_r = split(X, y, d, v)
                    e = entropy(y_l) + entropy(y_r)
                    if e < best_entropy:
                        best_entropy, best_d, best_v = e, d, v
        return best_entropy, best_d, best_v

    best_entropy, best_d, best_v = try_split(X, y)
    print("best_entropy =", best_entropy)
    print("best_d =", best_d)
    print("best_v =", best_v)
    # Store the children of the first split.
    X1_l, X1_r, y1_l, y1_r = split(X, y, best_d, best_v)
    # Entropy of the left child (expected 0.0: pure node).
    print(entropy(y1_l))
    # Entropy of the right child.
    print(entropy(y1_r))

best_entropy = 0.6931471805599453
best_d = 0
best_v = 2.45
0.0
0.6931471805599453

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from matplotlib.colors import ListedColormap
from collections import Counter
from math import log

if __name__ == "__main__":
    # BUGFIX: the original referenced `iris` without ever loading it.
    iris = datasets.load_iris()
    X = iris.data[:, 2:]
    y = iris.target
    dt_clf = DecisionTreeClassifier(max_depth=2, criterion='entropy')
    dt_clf.fit(X, y)
    # NOTE(review): removed the unused plot_decision_boundary helper — every
    # call to it in this snippet was commented out.

    def split(X, y, d, value):
        """Split (X, y) on feature dimension d at the given threshold.

        Returns (X_left, X_right, y_left, y_right): the left part satisfies
        X[:, d] <= value, the right part X[:, d] > value.
        """
        index_a = (X[:, d] <= value)
        index_b = (X[:, d] > value)
        return X[index_a], X[index_b], y[index_a], y[index_b]

    def entropy(y):
        """Information entropy (natural log) of the label array y."""
        counter = Counter(y)  # class -> count
        res = 0
        for num in counter.values():
            p = num / len(y)
            res += -p * log(p)
        return res

    def try_split(X, y):
        """Exhaustively search for the (dimension, threshold) split that
        minimizes the summed entropy of the two children.

        Returns (best_entropy, best_d, best_v)."""
        best_entropy = float('inf')
        best_d, best_v = -1, -1
        for d in range(X.shape[1]):
            sorted_index = np.argsort(X[:, d])
            for i in range(1, len(X)):
                # Candidate threshold: midpoint of adjacent DISTINCT values.
                if X[sorted_index[i - 1], d] != X[sorted_index[i], d]:
                    v = (X[sorted_index[i - 1], d] + X[sorted_index[i], d]) / 2
                    X_l, X_r, y_l, y_r = split(X, y, d, v)
                    e = entropy(y_l) + entropy(y_r)
                    if e < best_entropy:
                        best_entropy, best_d, best_v = e, d, v
        return best_entropy, best_d, best_v

    # First split on the whole dataset.
    best_entropy, best_d, best_v = try_split(X, y)
    print("best_entropy =", best_entropy)
    print("best_d =", best_d)
    print("best_v =", best_v)
    X1_l, X1_r, y1_l, y1_r = split(X, y, best_d, best_v)
    # Entropies of the first split's children (left is pure: 0.0).
    print(entropy(y1_l))
    print(entropy(y1_r))
    # Second split: recurse on the impure right child.
    best_entropy2, best_d2, best_v2 = try_split(X1_r, y1_r)
    print("best_entropy =", best_entropy2)
    print("best_d =", best_d2)
    print("best_v =", best_v2)
    X2_l, X2_r, y2_l, y2_r = split(X1_r, y1_r, best_d2, best_v2)
    # Entropies of the second split's children.
    print(entropy(y2_l))
    print(entropy(y2_r))

best_entropy = 0.6931471805599453
best_d = 0
best_v = 2.45
0.0
0.6931471805599453
best_entropy = 0.4132278899361904
best_d = 1
best_v = 1.75
0.30849545083110386
0.10473243910508653

import numpy as np
import matplotlib.pyplot as plt

if __name__ == "__main__":

    def gini(p):
        """Two-class Gini impurity: 1 - p^2 - (1-p)^2 = 2p - 2p^2."""
        return -2 * p**2 + 2 * p

    # Unlike entropy, Gini is well defined on the closed interval [0, 1].
    x = np.linspace(0, 1, 200)
    plt.plot(x, gini(x))
    plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from matplotlib.colors import ListedColormap

if __name__ == "__main__":
    # BUGFIX: the original referenced `iris` without ever loading it.
    iris = datasets.load_iris()
    # Keep only the last two iris features.
    X = iris.data[:, 2:]
    y = iris.target
    # Depth-2 decision tree using the Gini criterion (sklearn's default).
    dt_clf = DecisionTreeClassifier(max_depth=2, criterion='gini', splitter='best')
    dt_clf.fit(X, y)

    def plot_decision_boundary(model, axis):
        """Plot the decision regions of a fitted 2-feature classifier."""
        x0, x1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        X_new = np.c_[x0.ravel(), x1.ravel()]
        y_predict = model.predict(X_new)
        zz = y_predict.reshape(x0.shape)
        custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        # BUGFIX: dropped the invalid 'linewidth=5' kwarg (not a contourf arg).
        plt.contourf(x0, x1, zz, cmap=custom_cmap)

    plot_decision_boundary(dt_clf, axis=[0.5, 7.5, 0, 3])
    plt.scatter(X[y == 0, 0], X[y == 0, 1])
    plt.scatter(X[y == 1, 0], X[y == 1, 1])
    plt.scatter(X[y == 2, 0], X[y == 2, 1])
    plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from matplotlib.colors import ListedColormap
from collections import Counter

if __name__ == "__main__":
    # BUGFIX: the original referenced `iris` without ever loading it.
    iris = datasets.load_iris()
    X = iris.data[:, 2:]
    y = iris.target
    dt_clf = DecisionTreeClassifier(max_depth=2, criterion='gini', splitter='best')
    dt_clf.fit(X, y)
    # NOTE(review): removed the unused plot_decision_boundary helper — every
    # call to it in this snippet was commented out.

    def split(X, y, d, value):
        """Split (X, y) on feature dimension d at the given threshold.

        Returns (X_left, X_right, y_left, y_right): the left part satisfies
        X[:, d] <= value, the right part X[:, d] > value.
        """
        index_a = (X[:, d] <= value)
        index_b = (X[:, d] > value)
        return X[index_a], X[index_b], y[index_a], y[index_b]

    def gini(y):
        """Gini impurity of the label array y: 1 - sum(p_k^2)."""
        counter = Counter(y)  # class -> count
        res = 1
        for num in counter.values():
            p = num / len(y)
            res -= p**2
        return res

    def try_split(X, y):
        """Exhaustively search for the (dimension, threshold) split that
        minimizes the summed Gini impurity of the two children.

        Returns (best_g, best_d, best_v)."""
        best_g = float('inf')
        best_d, best_v = -1, -1
        for d in range(X.shape[1]):
            sorted_index = np.argsort(X[:, d])
            for i in range(1, len(X)):
                # Candidate threshold: midpoint of adjacent DISTINCT values.
                if X[sorted_index[i - 1], d] != X[sorted_index[i], d]:
                    v = (X[sorted_index[i - 1], d] + X[sorted_index[i], d]) / 2
                    X_l, X_r, y_l, y_r = split(X, y, d, v)
                    e = gini(y_l) + gini(y_r)
                    if e < best_g:
                        best_g, best_d, best_v = e, d, v
        return best_g, best_d, best_v

    # First split on the whole dataset.
    best_g, best_d, best_v = try_split(X, y)
    print("best_g =", best_g)
    print("best_d =", best_d)
    print("best_v =", best_v)
    X1_l, X1_r, y1_l, y1_r = split(X, y, best_d, best_v)
    # Gini impurity of the first split's children (left is pure: 0.0).
    print(gini(y1_l))
    print(gini(y1_r))
    # Second split: recurse on the impure right child.
    best_g2, best_d2, best_v2 = try_split(X1_r, y1_r)
    print("best_g =", best_g2)
    print("best_d =", best_d2)
    print("best_v =", best_v2)
    X2_l, X2_r, y2_l, y2_r = split(X1_r, y1_r, best_d2, best_v2)
    # Gini impurity of the second split's children.
    print(gini(y2_l))
    print(gini(y2_r))

best_g = 0.5
best_d = 0
best_v = 2.45
0.0
0.5
best_g = 0.2105714900645938
best_d = 1
best_v = 1.75
0.1680384087791495
0.04253308128544431

CART与决策树中的超参数

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

if __name__ == "__main__":
    # Noisy two-moons dataset; fixed seed for reproducibility.
    X, y = datasets.make_moons(noise=0.25, random_state=666)
    plt.scatter(X[y == 0, 0], X[y == 0, 1])
    plt.scatter(X[y == 1, 0], X[y == 1, 1])
    plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from matplotlib.colors import ListedColormap

if __name__ == "__main__":
    X, y = datasets.make_moons(noise=0.25, random_state=666)
    # Unconstrained tree: grows until all leaves are pure (overfits).
    dt_clf = DecisionTreeClassifier()
    dt_clf.fit(X, y)

    def plot_decision_boundary(model, axis):
        """Plot the decision regions of a fitted 2-feature classifier."""
        x0, x1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        X_new = np.c_[x0.ravel(), x1.ravel()]
        y_predict = model.predict(X_new)
        zz = y_predict.reshape(x0.shape)
        custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        # BUGFIX: dropped the invalid 'linewidth=5' kwarg (not a contourf arg).
        plt.contourf(x0, x1, zz, cmap=custom_cmap)

    plot_decision_boundary(dt_clf, axis=[-1.5, 2.5, -1, 1.5])
    plt.scatter(X[y == 0, 0], X[y == 0, 1])
    plt.scatter(X[y == 1, 0], X[y == 1, 1])
    plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from matplotlib.colors import ListedColormap

if __name__ == "__main__":
    X, y = datasets.make_moons(noise=0.25, random_state=666)
    # Unconstrained baseline (kept from the original snippet, unused here).
    dt_clf = DecisionTreeClassifier()
    dt_clf.fit(X, y)

    def plot_decision_boundary(model, axis):
        """Plot the decision regions of a fitted 2-feature classifier."""
        x0, x1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        X_new = np.c_[x0.ravel(), x1.ravel()]
        y_predict = model.predict(X_new)
        zz = y_predict.reshape(x0.shape)
        custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        # BUGFIX: dropped the invalid 'linewidth=5' kwarg (not a contourf arg).
        plt.contourf(x0, x1, zz, cmap=custom_cmap)

    # Limiting tree depth regularizes the model.
    dt_clf2 = DecisionTreeClassifier(max_depth=2)
    dt_clf2.fit(X, y)
    plot_decision_boundary(dt_clf2, axis=[-1.5, 2.5, -1, 1.5])
    plt.scatter(X[y == 0, 0], X[y == 0, 1])
    plt.scatter(X[y == 1, 0], X[y == 1, 1])
    plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from matplotlib.colors import ListedColormap

if __name__ == "__main__":
    X, y = datasets.make_moons(noise=0.25, random_state=666)
    # Baseline and max_depth variants (kept from the original, unused here).
    dt_clf = DecisionTreeClassifier()
    dt_clf.fit(X, y)

    def plot_decision_boundary(model, axis):
        """Plot the decision regions of a fitted 2-feature classifier."""
        x0, x1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        X_new = np.c_[x0.ravel(), x1.ravel()]
        y_predict = model.predict(X_new)
        zz = y_predict.reshape(x0.shape)
        custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        # BUGFIX: dropped the invalid 'linewidth=5' kwarg (not a contourf arg).
        plt.contourf(x0, x1, zz, cmap=custom_cmap)

    dt_clf2 = DecisionTreeClassifier(max_depth=2)
    dt_clf2.fit(X, y)

    # A node needs >= 10 samples before it may be split further.
    dt_clf3 = DecisionTreeClassifier(min_samples_split=10)
    dt_clf3.fit(X, y)
    plot_decision_boundary(dt_clf3, axis=[-1.5, 2.5, -1, 1.5])
    plt.scatter(X[y == 0, 0], X[y == 0, 1])
    plt.scatter(X[y == 1, 0], X[y == 1, 1])
    plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from matplotlib.colors import ListedColormap

if __name__ == "__main__":
    X, y = datasets.make_moons(noise=0.25, random_state=666)
    # Earlier variants (kept from the original snippet, unused here).
    dt_clf = DecisionTreeClassifier()
    dt_clf.fit(X, y)

    def plot_decision_boundary(model, axis):
        """Plot the decision regions of a fitted 2-feature classifier."""
        x0, x1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        X_new = np.c_[x0.ravel(), x1.ravel()]
        y_predict = model.predict(X_new)
        zz = y_predict.reshape(x0.shape)
        custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        # BUGFIX: dropped the invalid 'linewidth=5' kwarg (not a contourf arg).
        plt.contourf(x0, x1, zz, cmap=custom_cmap)

    dt_clf2 = DecisionTreeClassifier(max_depth=2)
    dt_clf2.fit(X, y)
    dt_clf3 = DecisionTreeClassifier(min_samples_split=10)
    dt_clf3.fit(X, y)

    # Every leaf must contain at least 6 samples.
    dt_clf4 = DecisionTreeClassifier(min_samples_leaf=6)
    dt_clf4.fit(X, y)
    plot_decision_boundary(dt_clf4, axis=[-1.5, 2.5, -1, 1.5])
    plt.scatter(X[y == 0, 0], X[y == 0, 1])
    plt.scatter(X[y == 1, 0], X[y == 1, 1])
    plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from matplotlib.colors import ListedColormap

if __name__ == "__main__":
    X, y = datasets.make_moons(noise=0.25, random_state=666)
    # Earlier variants (kept from the original snippet, unused here).
    dt_clf = DecisionTreeClassifier()
    dt_clf.fit(X, y)

    def plot_decision_boundary(model, axis):
        """Plot the decision regions of a fitted 2-feature classifier."""
        x0, x1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        X_new = np.c_[x0.ravel(), x1.ravel()]
        y_predict = model.predict(X_new)
        zz = y_predict.reshape(x0.shape)
        custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        # BUGFIX: dropped the invalid 'linewidth=5' kwarg (not a contourf arg).
        plt.contourf(x0, x1, zz, cmap=custom_cmap)

    dt_clf2 = DecisionTreeClassifier(max_depth=2)
    dt_clf2.fit(X, y)
    dt_clf3 = DecisionTreeClassifier(min_samples_split=10)
    dt_clf3.fit(X, y)
    dt_clf4 = DecisionTreeClassifier(min_samples_leaf=6)
    dt_clf4.fit(X, y)

    # Cap the total number of leaves at 4.
    dt_clf5 = DecisionTreeClassifier(max_leaf_nodes=4)
    dt_clf5.fit(X, y)
    plot_decision_boundary(dt_clf5, axis=[-1.5, 2.5, -1, 1.5])
    plt.scatter(X[y == 0, 0], X[y == 0, 1])
    plt.scatter(X[y == 1, 0], X[y == 1, 1])
    plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

if __name__ == "__main__":
    # BUGFIX: the original referenced `boston` without ever loading it.
    # NOTE(review): load_boston was removed in scikit-learn 1.2; on newer
    # versions use fetch_openml(name="boston", version=1, as_frame=False).
    boston = datasets.load_boston()
    X = boston.data
    y = boston.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)
    # Unconstrained regression tree.
    dt_reg = DecisionTreeRegressor()
    dt_reg.fit(X_train, y_train)
    # R^2 on the test set, then on the training set (1.0 => overfitting).
    print(dt_reg.score(X_test, y_test))
    print(dt_reg.score(X_train, y_train))

0.5934966254524108
1.0

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

if __name__ == "__main__":
    # BUGFIX: the original referenced `boston` without ever loading it.
    # NOTE(review): load_boston was removed in scikit-learn 1.2.
    boston = datasets.load_boston()
    X = boston.data
    y = boston.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)

    # Sweep min_samples_leaf from 1 (most complex) to 506 (a single leaf)
    # and record train/test R^2 to visualize over/underfitting.
    maxSampleLeaf = 506
    train_scores = []
    test_scores = []
    for i in range(1, maxSampleLeaf + 1):
        dt_reg = DecisionTreeRegressor(min_samples_leaf=i)
        dt_reg.fit(X_train, y_train)
        y_train_predict = dt_reg.predict(X_train)
        train_scores.append(r2_score(y_train, y_train_predict))
        test_scores.append(dt_reg.score(X_test, y_test))
    plt.plot(range(1, maxSampleLeaf + 1), train_scores, label='train')
    plt.plot(range(1, maxSampleLeaf + 1), test_scores, label='test')
    # Reverse the x-axis so model complexity increases left to right.
    plt.xlim(maxSampleLeaf, 1)
    plt.legend()
    plt.show()

min_samples_leaf的意思就是对于叶子节点来说，它至少应该有几个样本。我们知道该值越小越容易过拟合，模型复杂度越高，所以我们图形中的横轴是从大到小，最小到1来排序的。意思就是说我们的模型复杂度是从低到高的一个过程，其中训练数据集的模型准确率是不断上升的，而测试数据集也是在不断上升但是到了一个峰值就开始下降。这里这个下降可能看的不是特别明显，现在我们来调低min_samples_leaf的最大值。

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

if __name__ == "__main__":
    # BUGFIX: the original referenced `boston` without ever loading it.
    # NOTE(review): load_boston was removed in scikit-learn 1.2.
    boston = datasets.load_boston()
    X = boston.data
    y = boston.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)

    # Zoom in: sweep min_samples_leaf only up to 100 so the test-score peak
    # and subsequent drop are easier to see.
    maxSampleLeaf = 100
    train_scores = []
    test_scores = []
    for i in range(1, maxSampleLeaf + 1):
        dt_reg = DecisionTreeRegressor(min_samples_leaf=i)
        dt_reg.fit(X_train, y_train)
        y_train_predict = dt_reg.predict(X_train)
        train_scores.append(r2_score(y_train, y_train_predict))
        test_scores.append(dt_reg.score(X_test, y_test))
    plt.plot(range(1, maxSampleLeaf + 1), train_scores, label='train')
    plt.plot(range(1, maxSampleLeaf + 1), test_scores, label='test')
    # Reverse the x-axis so model complexity increases left to right.
    plt.xlim(maxSampleLeaf, 1)
    plt.legend()
    plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

if __name__ == "__main__":
    # BUGFIX: the original referenced `boston` without ever loading it.
    # NOTE(review): load_boston was removed in scikit-learn 1.2.
    boston = datasets.load_boston()
    X = boston.data
    y = boston.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)

    # Sweep min_samples_split (the minimum node size eligible for a further
    # split); larger values mean a simpler model.  Valid values start at 2.
    maxSamplesSplit = 300
    train_scores = []
    test_scores = []
    for i in range(2, maxSamplesSplit + 1):
        dt_reg = DecisionTreeRegressor(min_samples_split=i)
        dt_reg.fit(X_train, y_train)
        y_train_predict = dt_reg.predict(X_train)
        train_scores.append(r2_score(y_train, y_train_predict))
        test_scores.append(dt_reg.score(X_test, y_test))
    plt.plot(range(2, maxSamplesSplit + 1), train_scores, label='train')
    plt.plot(range(2, maxSamplesSplit + 1), test_scores, label='test')
    # Reverse the x-axis so model complexity increases left to right.
    plt.xlim(maxSamplesSplit, 2)
    plt.legend()
    plt.show()

min_samples_split的意思就是对于一个节点来说，它至少要有多少个样本数据，我们才对这个节点继续进行拆分下去。这个值越大，模型复杂度越简单；越小，模型复杂度越复杂，所以上图的模型复杂度也是从小到大。同样对于训练数据集的模型准确率也是不断上升的，但是对于测试数据集来说，也是不断上升，然后到达峰值然后开始下降。

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from matplotlib.colors import ListedColormap

if __name__ == "__main__":
    # BUGFIX: the original referenced `iris` without ever loading it.
    iris = datasets.load_iris()
    X = iris.data[:, 2:]
    y = iris.target
    tree_clf = DecisionTreeClassifier(max_depth=2, criterion='entropy', splitter='best')
    tree_clf.fit(X, y)

    def plot_decision_boundary(model, axis):
        """Plot the decision regions of a fitted 2-feature classifier."""
        x0, x1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        X_new = np.c_[x0.ravel(), x1.ravel()]
        y_predict = model.predict(X_new)
        zz = y_predict.reshape(x0.shape)
        custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        # BUGFIX: dropped the invalid 'linewidth=5' kwarg (not a contourf arg).
        plt.contourf(x0, x1, zz, cmap=custom_cmap)

    plot_decision_boundary(tree_clf, axis=[0.5, 7.5, 0, 3])
    plt.scatter(X[y == 0, 0], X[y == 0, 1])
    plt.scatter(X[y == 1, 0], X[y == 1, 1])
    plt.scatter(X[y == 2, 0], X[y == 2, 1])
    plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from matplotlib.colors import ListedColormap

if __name__ == "__main__":
    # BUGFIX: the original referenced `iris` without ever loading it.
    iris = datasets.load_iris()
    X = iris.data[:, 2:]
    y = iris.target
    tree_clf = DecisionTreeClassifier(max_depth=2, criterion='entropy', splitter='best')
    tree_clf.fit(X, y)

    def plot_decision_boundary(model, axis):
        """Plot the decision regions of a fitted 2-feature classifier."""
        x0, x1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
        )
        X_new = np.c_[x0.ravel(), x1.ravel()]
        y_predict = model.predict(X_new)
        zz = y_predict.reshape(x0.shape)
        custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        # BUGFIX: dropped the invalid 'linewidth=5' kwarg (not a contourf arg).
        plt.contourf(x0, x1, zz, cmap=custom_cmap)

    # Drop a single sample (index 138) and refit: decision trees are highly
    # sensitive to individual data points.
    X_new = np.delete(X, 138, axis=0)
    y_new = np.delete(y, 138)
    print(X_new.shape)
    print(y_new.shape)
    tree_clf2 = DecisionTreeClassifier(max_depth=2, criterion='entropy', splitter='best')
    tree_clf2.fit(X_new, y_new)
    plot_decision_boundary(tree_clf2, axis=[0.5, 7.5, 0, 3])
    plt.scatter(X[y == 0, 0], X[y == 0, 1])
    plt.scatter(X[y == 1, 0], X[y == 1, 1])
    plt.scatter(X[y == 2, 0], X[y == 2, 1])
    plt.show()

集成学习和随机森林

scikit-learn为我们提供了一个方便的接口——Voting Classifier。它就是基于不同的算法进行投票的一种分类器。我们用代码来看一下这个集成学习的使用，先画出月亮的数据集

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

if __name__ == "__main__":
    # 500-sample noisy two-moons dataset; fixed seed for reproducibility.
    X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=42)
    plt.scatter(X[y == 0, 0], X[y == 0, 1])
    plt.scatter(X[y == 1, 0], X[y == 1, 1])
    plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

if __name__ == "__main__":
    X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    # Logistic regression.
    log_clf = LogisticRegression()
    log_clf.fit(X_train, y_train)
    print(log_clf.score(X_test, y_test))
    # Support vector machine.
    svm_clf = SVC()
    svm_clf.fit(X_train, y_train)
    print(svm_clf.score(X_test, y_test))
    # Decision tree.
    dt_clf = DecisionTreeClassifier()
    dt_clf.fit(X_train, y_train)
    print(dt_clf.score(X_test, y_test))
    # Predict with all three models.
    y_predict1 = log_clf.predict(X_test)
    y_predict2 = svm_clf.predict(X_test)
    y_predict3 = dt_clf.predict(X_test)
    # Hand-rolled majority vote: a sample is class 1 when at least two of
    # the three binary (0/1) predictions say so.
    y_predict = np.array((y_predict1 + y_predict2 + y_predict3) >= 2, dtype='int')
    print(accuracy_score(y_test, y_predict))


0.864
0.896
0.856
0.912

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier

if __name__ == "__main__":
    X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    # Logistic regression.
    log_clf = LogisticRegression()
    log_clf.fit(X_train, y_train)
    print(log_clf.score(X_test, y_test))
    # Support vector machine.
    svm_clf = SVC()
    svm_clf.fit(X_train, y_train)
    print(svm_clf.score(X_test, y_test))
    # Decision tree.
    dt_clf = DecisionTreeClassifier()
    dt_clf.fit(X_train, y_train)
    print(dt_clf.score(X_test, y_test))
    # Predict with all three models.
    y_predict1 = log_clf.predict(X_test)
    y_predict2 = svm_clf.predict(X_test)
    y_predict3 = dt_clf.predict(X_test)
    # Hand-rolled majority vote over the three binary predictions.
    y_predict = np.array((y_predict1 + y_predict2 + y_predict3) >= 2, dtype='int')
    print(accuracy_score(y_test, y_predict))
    # Same ensemble via scikit-learn's VotingClassifier (hard voting).
    voting_clf = VotingClassifier(estimators=[
        ('log_clf', LogisticRegression()),
        ('svm_clf', SVC()),
        ('dt_clf', DecisionTreeClassifier())
    ], voting='hard')
    voting_clf.fit(X_train, y_train)
    print(voting_clf.score(X_test, y_test))


0.864
0.896
0.872
0.904
0.904

Soft Voting Classifier

kNN算法，计算概率的方式就是看离新来的样本点最近的多数分类的样本点数量/离新来的样本最近的总样本数量，下面这个图的k=3,概率就是2/3=66.6%

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier

if __name__ == "__main__":
    # Voting ensemble demo: hard (majority) voting vs. soft
    # (probability-weighted) voting on the noisy "moons" dataset.
    X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=42)
    # plt.scatter(X[y == 0, 0], X[y == 0, 1])
    # plt.scatter(X[y == 1, 0], X[y == 1, 1])
    # plt.show()
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # Logistic regression
    log_clf = LogisticRegression()
    log_clf.fit(X_train, y_train)
    print(log_clf.score(X_test, y_test))

    # Support vector machine (SVM)
    svm_clf = SVC()
    svm_clf.fit(X_train, y_train)
    print(svm_clf.score(X_test, y_test))

    # Decision tree
    dt_clf = DecisionTreeClassifier()
    dt_clf.fit(X_train, y_train)
    print(dt_clf.score(X_test, y_test))

    # Predict with each of the three models
    y_predict1 = log_clf.predict(X_test)
    y_predict2 = svm_clf.predict(X_test)
    y_predict3 = dt_clf.predict(X_test)
    # Manual majority vote over the three 0/1 predictions
    y_predict = np.array((y_predict1 + y_predict2 + y_predict3) >= 2, dtype='int')
    print(accuracy_score(y_test, y_predict))

    # scikit-learn ensemble interface
    # Hard voting: each model casts one vote
    voting_clf = VotingClassifier(estimators=[
        ('log_clf', LogisticRegression()),
        ('svm_clf', SVC()),
        ('dt_clf', DecisionTreeClassifier())
    ], voting='hard')
    voting_clf.fit(X_train, y_train)
    print(voting_clf.score(X_test, y_test))

    # Soft voting: average predicted class probabilities.
    # SVC needs probability=True so it can supply predict_proba.
    voting_clf2 = VotingClassifier(estimators=[
        ('log_clf', LogisticRegression()),
        ('svm_clf', SVC(probability=True)),
        ('dt_clf', DecisionTreeClassifier(random_state=666))
    ], voting='soft')
    voting_clf2.fit(X_train, y_train)
    print(voting_clf2.score(X_test, y_test))


0.864
0.896
0.856
0.904
0.904
0.904

Bagging和Pasting

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

if __name__ == "__main__":
    # Visualize the noisy two-class "moons" dataset, one color per class.
    X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=42)
    plt.scatter(X[y == 0, 0], X[y == 0, 1])
    plt.scatter(X[y == 1, 0], X[y == 1, 1])
    plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

if __name__ == "__main__":
    # Bagging demo: a single decision tree vs. a bagged ensemble of trees.
    X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=42)
    # plt.scatter(X[y == 0, 0], X[y == 0, 1])
    # plt.scatter(X[y == 1, 0], X[y == 1, 1])
    # plt.show()
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # Baseline: one unconstrained decision tree
    dec_tree_clf = DecisionTreeClassifier()
    dec_tree_clf.fit(X_train, y_train)
    print(dec_tree_clf.score(X_test, y_test))

    # 500 decision-tree sub-models, each trained on 100 samples
    # drawn WITH replacement (bootstrap=True -> bagging, not pasting).
    bagging_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500,
                                    max_samples=100, bootstrap=True)
    bagging_clf.fit(X_train, y_train)
    print(bagging_clf.score(X_test, y_test))

0.872
0.912

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

if __name__ == "__main__":
    # Bagging demo: compare a single tree, 500 bagged trees, and 5000
    # bagged trees on the noisy "moons" dataset.
    X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=42)
    # plt.scatter(X[y == 0, 0], X[y == 0, 1])
    # plt.scatter(X[y == 1, 0], X[y == 1, 1])
    # plt.show()
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # Baseline: one unconstrained decision tree
    dec_tree_clf = DecisionTreeClassifier()
    dec_tree_clf.fit(X_train, y_train)
    print(dec_tree_clf.score(X_test, y_test))

    # 500 sub-models, each trained on 100 samples drawn with replacement
    bagging_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500,
                                    max_samples=100, bootstrap=True)
    bagging_clf.fit(X_train, y_train)
    print(bagging_clf.score(X_test, y_test))

    # 5000 sub-models: more estimators, typically a slightly better score
    bagging_clf2 = BaggingClassifier(DecisionTreeClassifier(), n_estimators=5000,
                                     max_samples=100, bootstrap=True)
    bagging_clf2.fit(X_train, y_train)
    print(bagging_clf2.score(X_test, y_test))

0.824
0.912
0.928

oob(Out-of-Bag)和关于Bagging的更多讨论

OOB (Out-of-Bag)：放回取样会使一部分样本始终没有被取到。平均大约有37%（约 1/e）的样本没有被取到，这部分样本就叫 OOB，可以直接用它们来评估模型，无需再划分测试集。

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
import timeit

if __name__ == "__main__":
    # Out-of-bag (OOB) demo: evaluate a bagged ensemble on the samples each
    # sub-model never saw, instead of holding out a separate test set.
    X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=42)
    # plt.scatter(X[y == 0, 0], X[y == 0, 1])
    # plt.scatter(X[y == 1, 0], X[y == 1, 1])
    # plt.show()
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # Baseline: one unconstrained decision tree
    dec_tree_clf = DecisionTreeClassifier()
    dec_tree_clf.fit(X_train, y_train)
    print(dec_tree_clf.score(X_test, y_test))

    # 500 sub-models, each trained on 100 samples drawn with replacement
    bagging_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500,
                                    max_samples=100, bootstrap=True)
    bagging_clf.fit(X_train, y_train)
    print(bagging_clf.score(X_test, y_test))

    # 5000 sub-models, timed to show the single-process training cost
    start_time = timeit.default_timer()
    bagging_clf2 = BaggingClassifier(DecisionTreeClassifier(), n_estimators=5000,
                                     max_samples=100, bootstrap=True)
    bagging_clf2.fit(X_train, y_train)
    print(timeit.default_timer() - start_time)
    print(bagging_clf2.score(X_test, y_test))

    # OOB evaluation: fit on the FULL dataset; oob_score_ is the accuracy
    # on each sample's out-of-bag predictions, so no split is needed.
    bagging_clf3 = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500,
                                     max_samples=100, bootstrap=True,
                                     oob_score=True)
    bagging_clf3.fit(X, y)
    print(bagging_clf3.oob_score_)

0.888
0.904
5.212921124
0.912
0.918

Bagging的更多探讨

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
import timeit

if __name__ == "__main__":
    # Bagging demo: OOB evaluation plus parallel training (n_jobs=-1),
    # with wall-clock timing to show the speedup.
    X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=42)
    # plt.scatter(X[y == 0, 0], X[y == 0, 1])
    # plt.scatter(X[y == 1, 0], X[y == 1, 1])
    # plt.show()
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # Baseline: one unconstrained decision tree
    dec_tree_clf = DecisionTreeClassifier()
    dec_tree_clf.fit(X_train, y_train)
    print(dec_tree_clf.score(X_test, y_test))

    # 500 sub-models, each trained on 100 samples drawn with replacement
    bagging_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500,
                                    max_samples=100, bootstrap=True)
    bagging_clf.fit(X_train, y_train)
    print(bagging_clf.score(X_test, y_test))

    # 5000 sub-models, single process, timed
    start_time = timeit.default_timer()
    bagging_clf2 = BaggingClassifier(DecisionTreeClassifier(), n_estimators=5000,
                                     max_samples=100, bootstrap=True)
    bagging_clf2.fit(X_train, y_train)
    print(timeit.default_timer() - start_time)
    print(bagging_clf2.score(X_test, y_test))

    # OOB evaluation: fit on the full dataset, score on out-of-bag samples
    bagging_clf3 = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500,
                                     max_samples=100, bootstrap=True,
                                     oob_score=True)
    bagging_clf3.fit(X, y)
    print(bagging_clf3.oob_score_)

    # Same 5000-tree ensemble trained in parallel on all cores (n_jobs=-1);
    # the printed time should be noticeably lower than the serial run above.
    start_time = timeit.default_timer()
    bagging_clf4 = BaggingClassifier(DecisionTreeClassifier(), n_estimators=5000,
                                     max_samples=100, bootstrap=True, n_jobs=-1)
    bagging_clf4.fit(X_train, y_train)
    print(timeit.default_timer() - start_time)
    print(bagging_clf4.score(X_test, y_test))

0.848
0.912
4.504785647
0.912
0.922
1.7601140050000001
0.912

Patches是补丁的意思，这也是一种形象的说法，我们的数据本身就是一个矩阵，每一行是一个样本，每一列代表一个特征，Random Patches这种方式其实就是既在行的维度上随机，又在列的维度上随机，得到的结果就好像上图一样，看起来就好像是一块布上的补丁，这些补丁是随机的。

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
import timeit

if __name__ == "__main__":
    # Bagging demo extended with Random Subspaces: sample the FEATURE
    # dimension (max_features/bootstrap_features) instead of only samples.
    X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=42)
    # plt.scatter(X[y == 0, 0], X[y == 0, 1])
    # plt.scatter(X[y == 1, 0], X[y == 1, 1])
    # plt.show()
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # Baseline: one unconstrained decision tree
    dec_tree_clf = DecisionTreeClassifier()
    dec_tree_clf.fit(X_train, y_train)
    print(dec_tree_clf.score(X_test, y_test))

    # 500 sub-models, each trained on 100 samples drawn with replacement
    bagging_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500,
                                    max_samples=100, bootstrap=True)
    bagging_clf.fit(X_train, y_train)
    print(bagging_clf.score(X_test, y_test))

    # 5000 sub-models, single process, timed
    start_time = timeit.default_timer()
    bagging_clf2 = BaggingClassifier(DecisionTreeClassifier(), n_estimators=5000,
                                     max_samples=100, bootstrap=True)
    bagging_clf2.fit(X_train, y_train)
    print(timeit.default_timer() - start_time)
    print(bagging_clf2.score(X_test, y_test))

    # OOB evaluation: fit on the full dataset, score on out-of-bag samples
    bagging_clf3 = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500,
                                     max_samples=100, bootstrap=True,
                                     oob_score=True)
    bagging_clf3.fit(X, y)
    print(bagging_clf3.oob_score_)

    # Parallel training on all cores
    start_time = timeit.default_timer()
    bagging_clf4 = BaggingClassifier(DecisionTreeClassifier(), n_estimators=5000,
                                     max_samples=100, bootstrap=True, n_jobs=-1)
    bagging_clf4.fit(X_train, y_train)
    print(timeit.default_timer() - start_time)
    print(bagging_clf4.score(X_test, y_test))

    # Random Subspaces: max_samples=500 (= dataset size, so samples are NOT
    # restricted) while each sub-model sees only 1 randomly drawn feature.
    random_subspaces_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500,
                                             max_samples=500, bootstrap=True,
                                             oob_score=True, n_jobs=-1,
                                             max_features=1, bootstrap_features=True)
    random_subspaces_clf.fit(X, y)
    print(random_subspaces_clf.oob_score_)

0.848
0.912
4.455713805
0.912
0.92
1.8603084690000005
0.912
0.818

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
import timeit

if __name__ == "__main__":
    # Bagging demo culminating in Random Patches: random sampling over BOTH
    # the sample axis (max_samples) and the feature axis (max_features).
    X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=42)
    # plt.scatter(X[y == 0, 0], X[y == 0, 1])
    # plt.scatter(X[y == 1, 0], X[y == 1, 1])
    # plt.show()
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # Baseline: one unconstrained decision tree
    dec_tree_clf = DecisionTreeClassifier()
    dec_tree_clf.fit(X_train, y_train)
    print(dec_tree_clf.score(X_test, y_test))

    # 500 sub-models, each trained on 100 samples drawn with replacement
    bagging_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500,
                                    max_samples=100, bootstrap=True)
    bagging_clf.fit(X_train, y_train)
    print(bagging_clf.score(X_test, y_test))

    # 5000 sub-models, single process, timed
    start_time = timeit.default_timer()
    bagging_clf2 = BaggingClassifier(DecisionTreeClassifier(), n_estimators=5000,
                                     max_samples=100, bootstrap=True)
    bagging_clf2.fit(X_train, y_train)
    print(timeit.default_timer() - start_time)
    print(bagging_clf2.score(X_test, y_test))

    # OOB evaluation: fit on the full dataset, score on out-of-bag samples
    bagging_clf3 = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500,
                                     max_samples=100, bootstrap=True,
                                     oob_score=True)
    bagging_clf3.fit(X, y)
    print(bagging_clf3.oob_score_)

    # Parallel training on all cores
    start_time = timeit.default_timer()
    bagging_clf4 = BaggingClassifier(DecisionTreeClassifier(), n_estimators=5000,
                                     max_samples=100, bootstrap=True, n_jobs=-1)
    bagging_clf4.fit(X_train, y_train)
    print(timeit.default_timer() - start_time)
    print(bagging_clf4.score(X_test, y_test))

    # Random Subspaces: feature sampling only (max_samples = dataset size)
    random_subspaces_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500,
                                             max_samples=500, bootstrap=True,
                                             oob_score=True, n_jobs=-1,
                                             max_features=1, bootstrap_features=True)
    random_subspaces_clf.fit(X, y)
    print(random_subspaces_clf.oob_score_)

    # Random Patches: sample both features AND samples (max_samples=100)
    random_patches_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500,
                                           max_samples=100, bootstrap=True,
                                           oob_score=True, n_jobs=-1,
                                           max_features=1, bootstrap_features=True)
    random_patches_clf.fit(X, y)
    print(random_patches_clf.oob_score_)

0.864
0.928
4.535023132
0.912
0.92
1.7864520659999998
0.912
0.842
0.854

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

if __name__ == "__main__":
    # Visualize the "moons" dataset used by the random-forest sections
    # (note the different seed, 666, from the bagging sections' 42).
    X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=666)
    plt.scatter(X[y == 0, 0], X[y == 0, 1])
    plt.scatter(X[y == 1, 0], X[y == 1, 1])
    plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier

if __name__ == "__main__":
    # Random forest demo: OOB-scored forests, with and without a cap on
    # the number of leaf nodes per tree.
    X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=666)
    # plt.scatter(X[y == 0, 0], X[y == 0, 1])
    # plt.scatter(X[y == 1, 0], X[y == 1, 1])
    # plt.show()

    # 500 trees, evaluated on out-of-bag samples, trained in parallel
    rf_cls = RandomForestClassifier(n_estimators=500, random_state=666,
                                    oob_score=True, n_jobs=-1)
    rf_cls.fit(X, y)
    print(rf_cls.oob_score_)

    # Same forest but each tree limited to 16 leaf nodes, which
    # regularizes the individual trees.
    rf_cls2 = RandomForestClassifier(n_estimators=500, random_state=666, max_leaf_nodes=16,
                                     oob_score=True, n_jobs=-1)
    rf_cls2.fit(X, y)
    print(rf_cls2.oob_score_)

0.892
0.906

Extra-Trees

Extra-Trees依然是使用的基础分类器都是决策树(Base Estimator:Decision Tree)，它跟上面的随机森林最大的不同就是决策树在节点划分上，使用随机的特征和随机的阈值，我们上面的随机森林虽然也是使用随机的特征，但是阈值是最优划分的。这种方式显然提供了额外的随机性，抑制过拟合，但缺点就是增大了偏差(bias)，遏制了方差。对于是否要使用Extra-Trees，要根据我们的数据，要解决的问题是否适合使用Extra-Trees这样的一种方式，要根据实际情况进行判断。由于使用了随机的特征，随机的阈值，无需计算，比起随机森林而言，它有更快的训练速度。

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

if __name__ == "__main__":
    # Compare random forests with Extra-Trees (random split thresholds)
    # using out-of-bag accuracy on the same "moons" dataset.
    X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=666)
    # plt.scatter(X[y == 0, 0], X[y == 0, 1])
    # plt.scatter(X[y == 1, 0], X[y == 1, 1])
    # plt.show()

    # 500-tree random forest, OOB-scored, parallel training
    rf_cls = RandomForestClassifier(n_estimators=500, random_state=666,
                                    oob_score=True, n_jobs=-1)
    rf_cls.fit(X, y)
    print(rf_cls.oob_score_)

    # Same forest with each tree capped at 16 leaf nodes
    rf_cls2 = RandomForestClassifier(n_estimators=500, random_state=666, max_leaf_nodes=16,
                                     oob_score=True, n_jobs=-1)
    rf_cls2.fit(X, y)
    print(rf_cls2.oob_score_)

    # Extra-Trees: random thresholds at each split; bootstrap=True is
    # required here so that oob_score can be computed.
    et_cls = ExtraTreesClassifier(n_estimators=500, bootstrap=True,
                                  oob_score=True, random_state=666)
    et_cls.fit(X, y)
    print(et_cls.oob_score_)

0.892
0.906
0.892

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

if __name__ == "__main__":
    # Visualize the "moons" dataset (seed 666) used by the boosting sections.
    X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=666)
    plt.scatter(X[y == 0, 0], X[y == 0, 1])
    plt.scatter(X[y == 1, 0], X[y == 1, 1])
    plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

if __name__ == "__main__":
    # AdaBoost demo on the "moons" dataset.
    X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=666)
    # plt.scatter(X[y == 0, 0], X[y == 0, 1])
    # plt.scatter(X[y == 1, 0], X[y == 1, 1])
    # plt.show()
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)

    # BUG FIX: the pasted listing used `ada_clf` without ever defining it
    # (NameError). Reconstructed here as an AdaBoost ensemble of shallow
    # decision trees; the exact hyperparameters (presumably max_depth=2,
    # n_estimators=500) are an assumption — confirm against the original source.
    ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
                                 n_estimators=500)
    ada_clf.fit(X_train, y_train)
    print(ada_clf.score(X_test, y_test))

0.872

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

if __name__ == "__main__":
    # Gradient boosting demo on the "moons" dataset.
    X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=666)
    # plt.scatter(X[y == 0, 0], X[y == 0, 1])
    # plt.scatter(X[y == 1, 0], X[y == 1, 1])
    # plt.show()
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)

    # BUG FIX: the pasted listing called gb_clf.fit() without ever defining
    # `gb_clf` (NameError). Reconstructed as a gradient-boosting classifier
    # over shallow trees; the hyperparameters (presumably max_depth=2,
    # n_estimators=30) are an assumption — confirm against the original source.
    gb_clf = GradientBoostingClassifier(max_depth=2, n_estimators=30)
    gb_clf.fit(X_train, y_train)
    print(gb_clf.score(X_test, y_test))

0.848
0.904

Boosting解决回归问题

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

Stacking

0
0 收藏

0 评论
0 收藏
0