12345678910111213141516171819202122232425262728293031323334 |
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Train and evaluate a logistic-regression classifier on the WDBC data file:
# load the CSV, pick ten feature columns, encode the diagnosis as 0/1,
# hold out 20% of the rows for testing, and print the test accuracy.

# Load the data. The column names are supplied here; header=None makes
# explicit that the first row of the file is treated as data (this is what
# pandas already does when `names` is given without `header`).
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
names = [
    'id', 'diagnosis',
    'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
    'smoothness_mean', 'compactness_mean', 'concavity_mean',
    'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
    'radius_se', 'texture_se', 'perimeter_se', 'area_se',
    'smoothness_se', 'compactness_se', 'concavity_se',
    'concave points_se', 'symmetry_se', 'fractal_dimension_se',
    'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
    'smoothness_worst', 'compactness_worst', 'concavity_worst',
    'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst',
]
data = pd.read_csv(url, header=None, names=names)

# Features: columns 2..11, i.e. the ten "*_mean" columns in `names`.
# NOTE(review): the original comment described these as "genes and gene
# variants"; the column names suggest they are measurement means instead --
# confirm the intended interpretation.
X = data.iloc[:, 2:12]

# Label: the diagnosis column encoded as 1 for 'M' and 0 for 'B'.
# Any other value would map to NaN.
y = data['diagnosis'].map({'M': 1, 'B': 0})

# Split into training and test sets (80% / 20%); the fixed random_state
# keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Train the logistic-regression model.
# The features are unscaled and differ widely in magnitude (compare
# area_mean with smoothness_mean), so the default max_iter=100 is commonly
# not enough for the solver to converge and scikit-learn emits a
# ConvergenceWarning. Allow more iterations so the fit can finish.
lr = LogisticRegression(max_iter=10000)
lr.fit(X_train, y_train)

# Predict labels for the held-out test set.
y_pred = lr.predict(X_test)

# Accuracy = fraction of test rows whose predicted label equals the true one.
accuracy = np.mean(y_pred == y_test)
print("Accuracy: ", accuracy)
|