Machine Learning: DecisionTree (Part 4)


The CART Algorithm
CART splits on the feature with the largest Gini gain. The Gini index, Gini(D) = 1 − Σᵢ pᵢ², replaces entropy's logarithm with a square, which is also much cheaper to compute:

```python
import numpy as np

# Compare the cost of a logarithm vs. a square
%timeit np.log2(1000000)
# 966 ns ± 3.57 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
%timeit 1000000**2
# 8.05 ns ± 0.0485 ns per loop (mean ± std. dev. of 7 runs, 100,000,000 loops each)

# D: the target is whether an account is real
# 10 samples in total
# yes: 7 samples, p = 7/10 = 0.7
# no:  3 samples, p = 0.3
gini_D = 1 - (0.7**2 + 0.3**2)
gini_D
# 0.42000000000000004
```
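The same computation generalizes to any label distribution. Here is a minimal helper (my own addition, not from the original post) that computes the Gini impurity directly from raw class counts; it reproduces the 7-yes / 3-no result above:

```python
import numpy as np

def gini(counts):
    """Gini impurity from a list of class counts: 1 - sum(p_i**2)."""
    p = np.asarray(counts, dtype=float)
    p = p / p.sum()
    return 1.0 - np.sum(p ** 2)

print(gini([7, 3]))  # 0.42, matching gini_D above
```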
Log density (L)
```python
# Log density takes 3 possible values:
# s: 3 samples, p = 0.3, 1 yes / 2 no
# m: 4 samples, p = 0.4, 3 yes / 1 no
# l: 3 samples, p = 0.3, 3 yes / 0 no
gini_L_D = 0.3 * (1 - ((1/3)**2 + (2/3)**2)) \
         + 0.4 * (1 - ((3/4)**2 + (1/4)**2)) \
         + 0.3 * (1 - ((3/3)**2 + (0/3)**2))
gini_L_D
# 0.2833333333333333

# Gini gain (impurity decrease)
gini_D - gini_L_D
# 0.13666666666666671
```
Friend density (F)
```python
# Friend density takes 3 possible values:
# s: 4 samples, p = 0.4, 1 yes / 3 no
# m: 4 samples, p = 0.4, 4 yes / 0 no
# l: 2 samples, p = 0.2, 2 yes / 0 no
gini_F_D = 0.4 * (1 - ((1/4)**2 + (3/4)**2)) \
         + 0.4 * (1 - ((4/4)**2 + (0/4)**2)) \
         + 0.2 * (1 - ((2/2)**2 + (0/2)**2))
gini_F_D
# 0.15000000000000002

gini_D - gini_F_D
# 0.27
```
Uses a real profile photo (H)
```python
# Real profile photo takes 2 possible values:
# yes: 5 samples, p = 0.5, 4 yes / 1 no
# no:  5 samples, p = 0.5, 3 yes / 2 no
gini_H_D = 0.5 * (1 - ((4/5)**2 + (1/5)**2)) \
         + 0.5 * (1 - ((3/5)**2 + (2/5)**2))
gini_H_D
# 0.3999999999999999

gini_D - gini_H_D
# 0.02000000000000013

# Friend density 0.27 > log density 0.137 > real photo 0.02,
# so friend density is split on first.
```
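To double-check the hand arithmetic, here is a small sketch (my own addition) that reuses the `gini` helper defined above and recomputes the weighted Gini and the Gini gain for each feature from the per-value yes/no counts listed in the comments; the dictionary layout is just one convenient encoding:

```python
# Per-value (yes, no) counts copied from the comments above.
splits = {
    'L (log density)':    [(1, 2), (3, 1), (3, 0)],   # s, m, l
    'F (friend density)': [(1, 3), (4, 0), (2, 0)],   # s, m, l
    'H (real photo)':     [(4, 1), (3, 2)],           # yes, no
}

gini_D = gini([7, 3])  # 0.42
for name, counts in splits.items():
    n = sum(a + b for a, b in counts)  # 10 samples in total
    weighted = sum((a + b) / n * gini([a, b]) for a, b in counts)
    print(f'{name}: gain = {gini_D - weighted:.4f}')
# F (0.2700) > L (0.1367) > H (0.0200), confirming friend density splits first
```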
3. Hands-On Practice
The iris dataset
```python
from sklearn.datasets import load_iris

data, target = load_iris(return_X_y=True)
data.shape
# (150, 4)
```
Using the decision tree algorithm
```python
from sklearn.tree import DecisionTreeClassifier
```
Two key regularization parameters:

- `min_samples_split=2`: the minimum number of samples required to split an internal node; with little data it is usually left at the default of 2.
- `min_samples_leaf=1`: the minimum number of samples required at a leaf node.
```python
# dt = DecisionTreeClassifier(max_depth=2)          # lower it to curb overfitting
# dt = DecisionTreeClassifier(min_samples_split=4)  # raise it to curb overfitting
# dt = DecisionTreeClassifier(min_samples_leaf=3)   # raise it to curb overfitting
dt = DecisionTreeClassifier(min_samples_split=4, min_samples_leaf=3)
dt.fit(data, target).score(data, target)
# 0.98

# Feature importances
dt.feature_importances_
# array([0., 0., 0.5736694, 0.4263306])
```
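The two zero importances correspond to the sepal measurements. As a quick illustration (my own addition, not from the original post), mapping the importances onto the dataset's feature names makes that explicit:

```python
feature_names = load_iris().feature_names

for name, imp in zip(feature_names, dt.feature_importances_):
    print(f'{name}: {imp:.3f}')
# Only petal length and petal width carry weight in this tree.
```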
Using the KNN algorithm
```python
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(data, target).score(data, target)
# 0.9666666666666667
```
Using the logistic regression algorithm
```python
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=1000)
lr.fit(data, target).score(data, target)
# 0.9733333333333334
```
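One caveat: every score above is computed on the same data the model was trained on, which overstates accuracy. Below is a hedged sketch (my own addition) that re-scores the three classifiers with 5-fold cross-validation; the exact numbers depend on the fold split, so none are quoted here:

```python
from sklearn.model_selection import cross_val_score

models = {
    'decision tree': DecisionTreeClassifier(min_samples_split=4, min_samples_leaf=3),
    'KNN': KNeighborsClassifier(),
    'logistic regression': LogisticRegression(max_iter=1000),
}
for name, model in models.items():
    scores = cross_val_score(model, data, target, cv=5)
    print(f'{name}: {scores.mean():.3f} +/- {scores.std():.3f}')
```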
Regression with a decision tree
```python
from sklearn.tree import DecisionTreeRegressor
```
A sine curve
```python
import matplotlib.pyplot as plt

x_train = np.random.random(100) * 10
y_train = np.sin(x_train)
plt.scatter(x_train, y_train)
```
Add noise
```python
# Perturb every 5th point: 100 / 5 = 20 points in total
y_train[::5] += np.random.random(20) * 0.3
plt.scatter(x_train, y_train)
```
```python
# Test data
x_test = np.linspace(0, 10, 100).reshape(-1, 1)

# Decision tree regression
tree = DecisionTreeRegressor()
tree.fit(x_train.reshape(-1, 1), y_train)
y_tree = tree.predict(x_test)

# KNN regression
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor()
knn.fit(x_train.reshape(-1, 1), y_train)
y_knn = knn.predict(x_test)

# Linear regression
from sklearn.linear_model import LinearRegression
linear = LinearRegression()
linear.fit(x_train.reshape(-1, 1), y_train)
y_linear = linear.predict(x_test)

# Plot all three fits against the noisy training data
plt.scatter(x_train, y_train)
plt.plot(x_test, y_tree, c='r', label='tree')
plt.plot(x_test, y_knn, c='g', label='KNN')
plt.plot(x_test, y_linear, c='y', label='Linear')
plt.legend()
```
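An unconstrained DecisionTreeRegressor grows until it fits every training point, noise included, so the red curve comes out jagged. As a sketch of one remedy (my own addition; `max_depth=4` is an arbitrary choice), capping the tree depth trades that jaggedness for a smoother staircase approximation of the sine curve:

```python
# Same training data as above; only the tree depth is capped.
tree_small = DecisionTreeRegressor(max_depth=4)
tree_small.fit(x_train.reshape(-1, 1), y_train)
y_small = tree_small.predict(x_test)

plt.scatter(x_train, y_train)
plt.plot(x_test, y_small, c='r', label='tree (max_depth=4)')
plt.legend()
```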