目录

    • 鸢尾花数据集的分类练习
      • 观察数据
      • 划分数据集及训练
      • 超参数选择
      • 使用sklearn库进行k值调优
    • 电影分类案例
    • knn 回归
        • 真实情况一定存在噪声


声明:内容非原创,是学习内容的总结,版权归姜老师所有

鸢尾花数据集的分类练习

import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

观察数据

# Import the loader explicitly: a bare `import sklearn` is not guaranteed to
# expose the `sklearn.datasets` submodule as an attribute.
from sklearn.datasets import load_iris
data_iris = load_iris()
data_iris

截取了主要内容(没有完全显示)

 {'data': array([[5.1, 3.5, 1.4, 0.2],[4.9, 3. , 1.4, 0.2],[4.7, 3.2, 1.3, 0.2],[4.6, 3.1, 1.5, 0.2],[5. , 3.6, 1.4, 0.2],....     'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]),'frame': None,'target_names': array(['setosa', 'versicolor', 'virginica'], dtype='<U10'),'DESCR': '.. 'feature_names': ['sepal length (cm)','sepal width (cm)','petal length (cm)','petal width (cm)'],
data_iris.data

KNN 算法复习总结-编程知识网

data_iris.target_names
array(['setosa', 'versicolor', 'virginica'], dtype='<U10')
data_iris.feature_names
['sepal length (cm)','sepal width (cm)','petal length (cm)','petal width (cm)']
df2 = pd.DataFrame(data = data_iris.data,columns = data_iris.feature_names)
df2
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
145 6.7 3.0 5.2 2.3
146 6.3 2.5 5.0 1.9
147 6.5 3.0 5.2 2.0
148 6.2 3.4 5.4 2.3
149 5.9 3.0 5.1 1.8

150 rows × 4 columns

df2['target'] = data_iris.target
df2.head()
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) target
0 5.1 3.5 1.4 0.2 0
1 4.9 3.0 1.4 0.2 0
2 4.7 3.2 1.3 0.2 0
3 4.6 3.1 1.5 0.2 0
4 5.0 3.6 1.4 0.2 0

划分数据集及训练

from sklearn.model_selection import train_test_split
df2.iloc[:,:4]
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
145 6.7 3.0 5.2 2.3
146 6.3 2.5 5.0 1.9
147 6.5 3.0 5.2 2.0
148 6.2 3.4 5.4 2.3
149 5.9 3.0 5.1 1.8

150 rows × 4 columns

X_train,X_test,y_train,y_test = train_test_split(df2.iloc[:,:4],df2['target'],test_size=0.3,random_state=22)
X_train.shape
(105, 4)
X_test.shape
(45, 4)
y_train.shape
(105,)
y_test.shape
(45,)
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors = 1)
clf.fit(X_train,y_train)
KNeighborsClassifier(n_neighbors=1)
clf.score(X_test,y_test)
0.9777777777777777
new_data2 = np.array([[5.3,3.2,1.1,1.9],[5.0,3.5,0.9,1.3]])
clf.predict(new_data2)
array([0, 0])

超参数选择

# Collect the test accuracy for every odd k from 1 to 19.
scores = []
for i in range(1,20,2):
    # Fit a fresh classifier for this k on the training split.
    knn = KNeighborsClassifier(n_neighbors=i).fit(X_train,y_train)
    # score() returns the mean accuracy on the given test data and labels.
    scores.append(knn.score(X_test,y_test))
scores
[0.9777777777777777,0.9555555555555556,0.9555555555555556,0.9555555555555556,0.9555555555555556,0.9777777777777777,0.9555555555555556,0.9777777777777777,0.9555555555555556,0.9333333333333333]
plt.plot(range(1,20,2),scores)

KNN 算法复习总结-编程知识网

使用sklearn库进行k值调优

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler # 标准化
from sklearn.preprocessing import MinMaxScaler # 归一化
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Import the loader explicitly rather than relying on `sklearn.datasets`
# being reachable through the bare `import sklearn` above.
from sklearn.datasets import load_iris
iris = load_iris()
X_train,X_test,y_train,y_test = train_test_split(iris.data,iris.target,random_state=22)
# 实例化一个转换器
transfer = StandardScaler()

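疑问:为什么 X_train 要 fit_transform,而 X_test 只需要 transform?

解答:fit 会根据数据计算均值和标准差。这些统计量只能从训练集得到,再用同一组统计量去转换测试集;如果对测试集重新 fit,测试集的信息就会泄露到预处理中,并且训练集与测试集的缩放标准也会不一致。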

X_train = transfer.fit_transform(X_train)
X_test = transfer.transform(X_test)

实例化

estimator = KNeighborsClassifier()

网格搜索

param_dict = {"n_neighbors":[1,3,7,9]}
estimator = GridSearchCV(estimator,param_grid = param_dict,cv =3)
estimator.fit(X_train,y_train)
GridSearchCV(cv=3, estimator=KNeighborsClassifier(),param_grid={'n_neighbors': [1, 3, 7, 9]})
y_predict = estimator.predict(X_test)
print("对比预测结果和真实值:\n",y_predict == y_test)
对比预测结果和真实值:
 [ True  True  True  True  True  True  True False  True  True  True  True
  True  True  True  True  True  True False  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True]
# NOTE: best_score_ is the mean cross-validated score of the best parameter
# setting (see mean_test_score in cv_results_), not the accuracy on X_test.
print("直接计算准确率:\n",estimator.best_score_)
# best_estimator_ is the estimator configured with the best parameters found.
print("最好的参数模型:\n",estimator.best_estimator_)
直接计算准确率:0.9732100521574205
最好的参数模型:KNeighborsClassifier(n_neighbors=7)
print("每次交叉验证后的准确率结果:\n",estimator.cv_results_)
每次交叉验证后的准确率结果:{'mean_fit_time': array([0.00167433, 0.00100048, 0.00066678, 0.00100017]), 'std_fit_time': array([9.35758153e-04, 2.97360213e-07, 4.71482906e-04, 7.78671819e-07]), 'mean_score_time': array([0.00399717, 0.00298945, 0.00266369, 0.00266417]), 'std_score_time': array([1.46109075e-06, 8.17085419e-04, 4.70752210e-04, 4.70921149e-04]), 'param_n_neighbors': masked_array(data=[1, 3, 7, 9],mask=[False, False, False, False],fill_value='?',dtype=object), 'params': [{'n_neighbors': 1}, {'n_neighbors': 3}, {'n_neighbors': 7}, {'n_neighbors': 9}], 'split0_test_score': array([0.97368421, 0.97368421, 0.97368421, 0.94736842]), 'split1_test_score': array([0.97297297, 0.97297297, 0.97297297, 0.97297297]), 'split2_test_score': array([0.94594595, 0.89189189, 0.97297297, 0.91891892]), 'mean_test_score': array([0.96420104, 0.94618303, 0.97321005, 0.9464201 ]), 'std_test_score': array([0.01291157, 0.03839073, 0.00033528, 0.02207766]), 'rank_test_score': array([2, 4, 1, 3])}

电影分类案例

每部电影用两个特征来描述:动作镜头数量 + 爱情镜头数量

目标(target):判断电影属于哪一类(动作片或爱情片)

import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
data = pd.read_excel('movie01.xlsx')
data
电影名字 爱情镜头 动作镜头 动作片/爱情片
0 这个杀手不太冷 9 18 动作片
1 黑客帝国 6 19 动作片
2 沙丘 2 20 动作片
3 无敌破坏王 1 21 动作片
4 卧虎藏龙 5 15 动作片
5 金蝉脱壳 1 16 动作片
6 我的野蛮女友 19 2 爱情片
7 分手大师 31 5 爱情片
8 假如爱有天意 25 2 爱情片
9 三生三世十里桃花 17 10 爱情片
10 泰坦尼克号 21 3 爱情片
data.head()
电影名字 爱情镜头 动作镜头 动作片/爱情片
0 这个杀手不太冷 9 18 动作片
1 黑客帝国 6 19 动作片
2 沙丘 2 20 动作片
3 无敌破坏王 1 21 动作片
4 卧虎藏龙 5 15 动作片
# Split the table into the label column (y) and the two feature columns (X).
# Copies are taken so later changes do not touch the original DataFrame.
y = data['动作片/爱情片'].copy()
X = data[['爱情镜头','动作镜头']].copy()
# 生成预测样本
X_test = np.array([[15,2],[8,14]])
X_test
array([[15,  2],[ 8, 14]])
X.values
array([[ 9, 18],[ 6, 19],[ 2, 20],[ 1, 21],[ 5, 15],[ 1, 16],[19,  2],[31,  5],[25,  2],[17, 10],[21,  3]], dtype=int64)
# seaborn 自带调色板
colors = sns.color_palette(palette='husl',n_colors=2)
sns.palplot(colors)

KNN 算法复习总结-编程知识网

# 自己设置调色板
from matplotlib.colors import ListedColormap
husl_map = ListedColormap(colors)
# 绘制样本集在特征空间的分布情况
plt.figure(figsize=(8,5))
plt.scatter(X['动作镜头'],X['爱情镜头'], s=100,c=y.map({'动作片':0,'爱情片':1
}),cmap=husl_map)
<matplotlib.collections.PathCollection at 0x18004ca5cd0>

KNN 算法复习总结-编程知识网

y == '动作片'
0      True
1      True
2      True
3      True
4      True
5      True
6     False
7     False
8     False
9     False
10    False
Name: 动作片/爱情片, dtype: bool
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# Plot each class separately so that every class gets its own legend entry.
action = X.loc[y=='动作片']
love = X.loc[y=='爱情片']
s = 100
plt.scatter(action['动作镜头'],action['爱情镜头'],s=s, c='blue', label='动作电影')
plt.scatter(love['动作镜头'],love['爱情镜头'],s=s, c='red',label='爱情电影')
plt.xlabel('动作镜头')
plt.ylabel('爱情镜头')
plt.legend()
# X_test columns are [爱情镜头, 动作镜头]; the x-axis is 动作镜头, so column 1
# goes on x and column 0 on y to match the training points above.
plt.scatter(X_test[:,1],X_test[:,0],s=s,marker='*',c='green')
<matplotlib.collections.PathCollection at 0x18004d3fb50>

KNN 算法复习总结-编程知识网

from sklearn.neighbors import KNeighborsClassifier
# Import the KNN classifier
# 1. Construct the estimator object
clf = KNeighborsClassifier(n_neighbors=3)
# 2. Train the estimator to obtain a model; a fitted model can make predictions
#    (in essence it is a function whose parameters are known)
# Note:
# X, the set of feature vectors, must be a 2-D array (VSM: vector space model)
# y, the set of labels, is usually a 1-D array but may also be multi-dimensional
clf.fit(X, y)
# After fit, the classifier clf can be used to predict new data
KNeighborsClassifier(n_neighbors=3)
# Types of classical machine learning
# Supervised learning: regression, classification
# Unsupervised learning: clustering
# KNN is a supervised learning method
# sklearn provides two KNN models: KNeighborsClassifier for classification and KNeighborsRegressor for regression
# 3. Predict on new data
clf.predict(X_test)
array(['爱情片', '动作片'], dtype=object)
# Plot each class separately so that every class gets its own legend entry.
action = X.loc[y=='动作片']
love = X.loc[y=='爱情片']
s = 100
plt.scatter(action['动作镜头'],action['爱情镜头'],s=s, c='blue', label='动作电影')
plt.scatter(love['动作镜头'],love['爱情镜头'],s=s, c='red',label='爱情电影')
plt.xlabel('动作镜头')
plt.ylabel('爱情镜头')
plt.legend()
# Draw each query sample on its own; X_test columns are [爱情镜头, 动作镜头],
# so column 1 is the x coordinate (动作镜头) and column 0 the y coordinate.
plt.scatter(X_test[0,1],X_test[0,0],s=s,marker='*',c='green')
plt.scatter(X_test[1,1],X_test[1,0],s=s,marker='*',c='green')
<matplotlib.collections.PathCollection at 0x18005641e80>

KNN 算法复习总结-编程知识网


knn 回归

# Regression knn回归
# 在语法上 要求y值需要可运算的数值类型  (回归需要连续值)
from sklearn.neighbors import KNeighborsRegressor
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Assume the relation between floor area x and house price is linear:
# price = 3 * x + 2.
def f(x):
    """Return the noise-free price ``3 * x + 2`` for floor area ``x``.

    The expression uses only ``*`` and ``+``, so it also applies
    element-wise when ``x`` is a NumPy array.
    """
    return 3*x + 2
# x: floor area of each house in square metres
# y: house price
# Draw 60 areas uniformly at random from [60, 120)
x = np.random.random(60) * 60+ 60
x
array([ 86.71898211,  99.32422819, 101.80354187,  85.05007293,61.23802843,  84.50494507, 118.3232965 ,  94.94073921,60.96892806,  85.09573796, 102.82814208, 105.62206953,67.79118374, 103.2745069 ,  74.25729401, 113.48932148,74.77686787, 104.74169474,  75.68874697,  63.00103219,86.50034993,  83.6088712 ,  88.03601657,  77.24198646,94.45508227,  93.20986106,  97.05789538,  82.87482885,113.16585671,  82.62155351,  96.41215711,  93.45602741,85.34640385, 103.81303707, 115.54593701, 117.92993722,116.22213662,  63.04310621,  99.56676988,  89.20742364,116.12809676, 106.95210231,  71.9042332 ,  63.47049094,106.4255902 ,  79.4315286 ,  79.10346342, 111.67768503,102.44970919,  67.91049796,  73.41975608,  94.91831987,67.21388615,  87.31979659, 115.6553011 ,  62.18940825,80.16234784,  93.16943413,  98.47791582,  98.90453755])
# 计算房价
y = f(x)
y
array([262.15694632, 299.97268456, 307.41062562, 257.1502188 ,185.71408528, 255.51483522, 356.9698895 , 286.82221762,184.90678418, 257.28721388, 310.48442625, 318.86620859,205.37355121, 311.8235207 , 224.77188203, 342.46796445,226.33060361, 316.22508421, 229.06624092, 191.00309656,261.50104978, 252.82661361, 266.1080497 , 233.72595937,285.36524682, 281.62958317, 293.17368614, 250.62448655,341.49757013, 249.86466054, 291.23647133, 282.36808222,258.03921156, 313.43911121, 348.63781104, 355.78981166,350.66640985, 191.12931862, 300.70030965, 269.62227093,350.38429028, 322.85630692, 217.7126996 , 192.41147281,321.27677061, 240.29458579, 239.31039026, 337.0330551 ,309.34912758, 205.73149389, 222.25926824, 286.75495962,203.64165845, 263.95938978, 348.9659033 , 188.56822474,242.48704353, 281.50830239, 297.43374747, 298.71361264])
# 绘图看看房价和平方的关系
sns.set()
plt.scatter(x,y)
plt.xlabel('M²')
plt.ylabel('RMB')
Text(0, 0.5, 'RMB')


KNN 算法复习总结-编程知识网

真实情况一定存在噪声

# Manually generate the deviation (noise) that is added to each price
# Magnitudes are drawn from a normal distribution with mean 10 and standard deviation 2
bias = np.random.normal(loc =10, scale =2, size = 60)
# Give every value a random sign by multiplying it with +1 or -1 picked at random
bias = bias * np.array([1,-1])[np.random.randint(0,2,size=60)]
bias
array([-13.0291064 , -10.02495945,  -9.97782765,  12.64723644,-9.98195524,  -9.02187354, -10.24830628, -11.08886485,13.51838618,  -6.61636637, -11.25623598,  -8.47511058,-8.63887281, -12.01284281,   6.52659794,  -6.66706196,9.23569534,  12.60706034, -11.17515802,  -9.65352491,10.59777178,   8.17109782,  -7.60775619,  10.7237684 ,9.16353249,   7.09541961,   6.65429976,  11.90124201,-8.70976122,   5.03214993,  -8.74934746, -13.7142512 ,-8.56152148, -12.15045728,  -6.68908803,  10.77801205,7.20647652,  -9.25268462,  -7.42099604, -10.04855526,8.71739769,  -9.78909485,   8.49915866,  11.60269079,-6.48109875, -11.67543207,   8.38788536, -10.1409664 ,9.06067727,   9.63352347,  -8.69484635,  14.14670522,7.68168557,  11.17618534,   8.21802924,  -8.74698975,-10.04267587,   7.96355161,  13.97309601,   8.27492325])
y = f(x) + bias
plt.scatter(x,y)
plt.xlabel('M²')
plt.ylabel('RMB')
Text(0, 0.5, 'RMB')


KNN 算法复习总结-编程知识网

x.shape
(60,)
# 1. Instantiate a KNN regression model
knn = KNeighborsRegressor(n_neighbors=5)
# 2. Reshape x into a single-column 2-D array to use as the feature matrix X
X = x.reshape(-1,1)
X
array([[ 86.71898211],[ 99.32422819],[101.80354187],[ 85.05007293],[ 61.23802843],[ 84.50494507],[118.3232965 ],[ 94.94073921],[ 60.96892806],[ 85.09573796],[102.82814208],[105.62206953],[ 67.79118374],[103.2745069 ],[ 74.25729401],[113.48932148],[ 74.77686787],[104.74169474],[ 75.68874697],[ 63.00103219],[ 86.50034993],[ 83.6088712 ],[ 88.03601657],[ 77.24198646],[ 94.45508227],[ 93.20986106],[ 97.05789538],[ 82.87482885],[113.16585671],[ 82.62155351],[ 96.41215711],[ 93.45602741],[ 85.34640385],[103.81303707],[115.54593701],[117.92993722],[116.22213662],[ 63.04310621],[ 99.56676988],[ 89.20742364],[116.12809676],[106.95210231],[ 71.9042332 ],[ 63.47049094],[106.4255902 ],[ 79.4315286 ],[ 79.10346342],[111.67768503],[102.44970919],[ 67.91049796],[ 73.41975608],[ 94.91831987],[ 67.21388615],[ 87.31979659],[115.6553011 ],[ 62.18940825],[ 80.16234784],[ 93.16943413],[ 98.47791582],[ 98.90453755]])
# 3.训练模型fit
knn.fit(X,y)
KNeighborsRegressor()
# 4. Build test data to inspect the fit: 100 evenly spaced points from X.min() to X.max(), as one column
X_test = np.linspace(X.min(),X.max(),100).reshape(-1,1)
X_test
array([[ 60.96892806],[ 61.54826512],[ 62.12760217],[ 62.70693923],[ 63.28627628],[ 63.86561334],[ 64.44495039],[ 65.02428745],[ 65.6036245 ],[ 66.18296156],[ 66.76229861],[ 67.34163567],[ 67.92097272],[ 68.50030977],[ 69.07964683],[ 69.65898388],[ 70.23832094],[ 70.81765799],[ 71.39699505],[ 71.9763321 ],[ 72.55566916],[ 73.13500621],[ 73.71434327],[ 74.29368032],[ 74.87301738],[ 75.45235443],[ 76.03169149],[ 76.61102854],[ 77.1903656 ],[ 77.76970265],[ 78.34903971],[ 78.92837676],[ 79.50771382],[ 80.08705087],[ 80.66638793],[ 81.24572498],[ 81.82506204],[ 82.40439909],[ 82.98373615],[ 83.5630732 ],[ 84.14241026],[ 84.72174731],[ 85.30108437],[ 85.88042142],[ 86.45975848],[ 87.03909553],[ 87.61843259],[ 88.19776964],[ 88.7771067 ],[ 89.35644375],[ 89.93578081],[ 90.51511786],[ 91.09445492],[ 91.67379197],[ 92.25312903],[ 92.83246608],[ 93.41180314],[ 93.99114019],[ 94.57047725],[ 95.1498143 ],[ 95.72915136],[ 96.30848841],[ 96.88782547],[ 97.46716252],[ 98.04649958],[ 98.62583663],[ 99.20517369],[ 99.78451074],[100.3638478 ],[100.94318485],[101.52252191],[102.10185896],[102.68119602],[103.26053307],[103.83987013],[104.41920718],[104.99854424],[105.57788129],[106.15721835],[106.7365554 ],[107.31589246],[107.89522951],[108.47456657],[109.05390362],[109.63324067],[110.21257773],[110.79191478],[111.37125184],[111.95058889],[112.52992595],[113.109263  ],[113.68860006],[114.26793711],[114.84727417],[115.42661122],[116.00594828],[116.58528533],[117.16462239],[117.74395944],[118.3232965 ]])
# 5.预测房价
y_ = knn.predict(X_test)
y_
array([183.44094821, 183.44094821, 183.44094821, 184.55874685,184.55874685, 184.55874685, 191.67698965, 195.05967833,201.86276748, 201.86276748, 201.86276748, 201.86276748,210.72981233, 212.63986399, 212.63986399, 212.63986399,212.63986399, 216.63489118, 224.40121529, 224.90642839,224.90642839, 224.90642839, 224.90642839, 224.90642839,228.5540023 , 228.5540023 , 228.5540023 , 235.38077304,234.84490779, 234.22052153, 234.22052153, 234.22052153,241.62166705, 245.2368672 , 245.2368672 , 245.2368672 ,247.89675437, 251.47151596, 258.94213348, 258.94213348,255.48733319, 255.48733319, 257.70755522, 258.23453086,259.30215484, 260.86804404, 262.88724915, 262.88724915,262.88724915, 262.88724915, 266.36185564, 272.9849394 ,272.9849394 , 280.19063656, 288.45622639, 288.45622639,288.45622639, 288.45622639, 285.70852615, 284.46095036,290.69578134, 290.69578134, 294.07139417, 298.13164285,300.2900808 , 300.2900808 , 300.2900808 , 299.81104322,299.81104322, 299.65956637, 301.63215692, 303.23402499,303.23402499, 303.23402499, 309.5138943 , 307.91015293,311.02364925, 313.67495608, 313.67495608, 313.67495608,313.67495608, 318.79564304, 318.79564304, 319.58677591,324.66873681, 324.66873681, 324.66873681, 338.92269113,338.92269113, 338.92269113, 338.92269113, 338.92269113,345.36461098, 350.38162648, 350.38162648, 356.53501072,356.53501072, 357.48958276, 357.48958276, 357.48958276])
# 6.画图展现预测结果
plt.plot(X_test,y_,label='prediction line',color = 'yellow')
plt.scatter(x,y,label='True data',color = 'pink')
plt.xlabel('M²')
plt.ylabel('RMB')
plt.legend()
<matplotlib.legend.Legend at 0x25a1dcd4df0>


KNN 算法复习总结-编程知识网