# 使用PCA可视化数据

01/07 08:01

PCA（主成分分析）用于降低训练模型所用特征的维度，方法是从原有的多个特征中构造出若干“主成分”（PC）。
主成分的构造方式是：PC1 沿数据方差最大的方向，尽可能多地解释特征的总体变化；PC2 再解释剩余变化中最大的部分。通常 PC1 和 PC2 就能解释总体变化中的绝大部分信息。


import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer

# Load the breast-cancer dataset into a DataFrame.
# Fixed: `cancer` was used without ever being defined -- the
# load_breast_cancer() call was missing from the original.
cancer = load_breast_cancer()
data = pd.DataFrame(cancer['data'], columns=cancer['feature_names'])
data['y'] = cancer['target']  # binary class label (0/1)


PCA-整个数据集


from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardize the FEATURES only.
# Fixed: the original scaled and PCA-transformed the full DataFrame
# including the target column 'y', leaking label information into the
# principal components that are then plotted "by class".
features = data.drop(columns=['y'])
scaler = StandardScaler()
scaled = scaler.fit_transform(features)

# Fit PCA on the standardized features and project them onto the PCs.
pca = PCA().fit(scaled)
pc = pca.transform(scaled)
pc1 = pc[:, 0]  # first principal component (direction of largest variance)
pc2 = pc[:, 1]  # second principal component

#画出主成分
# Scatter plot of the first two principal components, coloured by class
# (red for y == 1, blue otherwise).
plt.figure(figsize=(10, 10))

colour = ['#ff2121' if y == 1 else '#2176ff' for y in data['y']]
plt.scatter(pc1, pc2, c=colour, edgecolors='#000000')
# Fixed: the axis labels were leftovers from another example
# ("Glucose"/"Age") and xlabel was set twice; label the PC axes instead.
plt.xlabel('PC1', size=20)
plt.ylabel('PC2', size=20)
plt.xticks(size=12)
plt.yticks(size=12)



# Bar chart of the proportion of variance explained by the first 10 PCs.
# Fixed: the original plotted `explained_variance_` (absolute variances)
# while both its comment and the y-axis label promise a *proportion* --
# use `explained_variance_ratio_` instead.
var = pca.explained_variance_ratio_[0:10]
labels = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5',
          'PC6', 'PC7', 'PC8', 'PC9', 'PC10']

plt.figure(figsize=(15, 7))
plt.bar(labels, var)
plt.xlabel('Principal Component')  # fixed typo: "Pricipal"
plt.ylabel('Proportion of Variance Explained')


PCA-特征组


# Two feature groups from the breast-cancer dataset: group 1 holds the
# symmetry/smoothness measurements, group 2 the perimeter/concavity ones.
group_1 = [
    'mean symmetry', 'symmetry error', 'worst symmetry',
    'mean smoothness', 'smoothness error', 'worst smoothness',
]

group_2 = [
    'mean perimeter', 'perimeter error', 'worst perimeter',
    'mean concavity', 'concavity error', 'worst concavity',
]



from sklearn.model_selection import train_test_split
import sklearn.metrics as metric
import statsmodels.api as sm

# Fit a logistic regression on each feature group and report test accuracy.
# Fixed: the loop iterated an undefined name `group`, and its body was not
# indented (a SyntaxError in the original).
for i, g in enumerate((group_1, group_2)):
    x = data[g]
    y = data['y']
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.3, random_state=101)

    # NOTE(review): sm.Logit is fit without an intercept (no sm.add_constant);
    # kept as-is to preserve the article's reported results -- confirm intent.
    model = sm.Logit(y_train, x_train).fit()

    # Round predicted probabilities to hard 0/1 labels, then score.
    predictions = np.around(model.predict(x_test))
    accuracy = metric.accuracy_score(y_test, predictions)

    print("Accuracy of Group {}: {}".format(i + 1, accuracy))


0
0 收藏

0 评论
0 收藏
0