# 机器学习笔记(1)决策树

2018/11/28 15:39

sklearn中使用决策树

# Build a depth-limited decision tree, first with the entropy (information
# gain) criterion, then refit with the Gini impurity criterion.
# NOTE(review): X (feature matrix) and y (label vector) are assumed to be
# defined earlier -- they are not shown in this snippet; confirm upstream.
from sklearn.tree import DecisionTreeClassifier

# Entropy-based tree, at most 2 levels deep.
tree_clf = DecisionTreeClassifier(max_depth=2, criterion="entropy")
tree_clf.fit(X, y)
# Same configuration rebuilt with the (sklearn-default) Gini criterion;
# rebinds tree_clf, discarding the entropy-based model above.
tree_clf = DecisionTreeClassifier(max_depth=2, criterion="gini")
tree_clf.fit(X, y)

from collections import Counter
from math import log

def entropy(y):
    """Return the Shannon entropy (natural log) of the label sequence *y*.

    Counts the frequency of each distinct label and accumulates
    ``-p * ln(p)`` over the label probabilities.

    Parameters
    ----------
    y : sequence of hashable labels.

    Returns
    -------
    float -- 0.0 for an empty sequence or a single-class sequence.
    """
    counter = Counter(y)
    total = len(y)
    res = 0.0
    for num in counter.values():
        p = num / total  # probability of this label
        res += -p * log(p)
    return res

def try_split(X, y):
    """Exhaustively search for the feature/threshold split with the lowest
    combined entropy of the two resulting partitions.

    For each feature ``d``, candidate thresholds are the midpoints between
    consecutive *distinct* sorted values of ``X[:, d]``.  Relies on a sibling
    ``split(X, y, d, v)`` helper (defined elsewhere in this file) to
    partition the data at threshold ``v``.

    Parameters
    ----------
    X : 2-D numpy array of shape (n_samples, n_features).
    y : 1-D label array aligned with the rows of ``X``.

    Returns
    -------
    (best_entropy, best_d, best_v) -- ``(inf, -1, -1)`` when every feature
    is constant and no valid split exists.
    """
    best_entropy = float('inf')
    best_d, best_v = -1, -1
    for d in range(X.shape[1]):
        sorted_index = np.argsort(X[:, d])
        for i in range(1, len(X)):
            # Only place a threshold between two distinct values.
            if X[sorted_index[i], d] != X[sorted_index[i - 1], d]:
                # Midpoint of the two neighbouring samples is the candidate.
                v = (X[sorted_index[i], d] + X[sorted_index[i - 1], d]) / 2
                X_l, X_r, y_l, y_r = split(X, y, d, v)
                # NOTE(review): the two entropies are summed unweighted; the
                # usual information-gain criterion weights each term by its
                # partition size -- confirm this simplification is intended.
                e = entropy(y_l) + entropy(y_r)
                if e < best_entropy:
                    best_entropy, best_d, best_v = e, d, v
    return best_entropy, best_d, best_v

class sklearn.tree.DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, class_weight=None, presort=False)

max_depth：树的最大深度.

min_samples_split：某个待决策的节点的最少样本数量.

min_samples_leaf ：叶子节点必须包含的最小样本数量

max_features ：寻找使系统的不确定性最低的划分时，所需要考虑的最大特征数.

min_samples_split越大,越容易欠拟合,越小越容易过拟合.比如我的min_samples_split设置得很小,那肯定树要被划分的很深.

min_samples_leaf 越小越容易过拟合.

http://sklearn.apachecn.org/cn/latest/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier

0
0 收藏

0 评论
0 收藏
0