123456789101112131415161718192021222324252627282930313233343536373839404142434445 |
"""Compare two logistic-regression models on SNP data with the NRI.

The script loads SNP features and a binary target from CSV files, fits a
baseline and an L1-penalised logistic regression, and reports the Net
Reclassification Improvement (NRI) of the second model over the first.

scikit-learn does not provide a ``net_reclassification_index`` function
(the original ``sklearn.metrics.regression`` import could never succeed),
so the categorical NRI is implemented here directly.
"""
import numpy as np
import pandas as pd


def net_reclassification_index(y_true, base_proba, improved_proba,
                               thresholds=(0.5,)):
    """Return the categorical Net Reclassification Improvement.

    Each predicted probability is assigned to a risk category delimited by
    ``thresholds``. The NRI is the net share of events moved to a higher
    category plus the net share of non-events moved to a lower category::

        NRI = [P(up | event) - P(down | event)]
            + [P(down | non-event) - P(up | non-event)]

    Args:
        y_true: Binary outcomes coded 0/1 (1 = event), one per sample.
        base_proba: Event probabilities predicted by the baseline model.
        improved_proba: Event probabilities predicted by the new model.
        thresholds: Cut point(s) separating the risk categories. The
            default single cut at 0.5 gives a two-category NRI.

    Returns:
        The NRI as a float in the range [-2, 2]; positive values mean the
        new model reclassifies samples in the right direction on balance.

    Raises:
        ValueError: If the inputs differ in length, or if ``y_true`` does
            not contain at least one event and one non-event.
    """
    is_event = np.asarray(y_true).ravel().astype(bool)
    old = np.asarray(base_proba, dtype=float).ravel()
    new = np.asarray(improved_proba, dtype=float).ravel()
    if not (is_event.shape == old.shape == new.shape):
        raise ValueError(
            "y_true, base_proba and improved_proba must have the same length"
        )
    is_nonevent = ~is_event
    # Both conditional proportions below would be undefined (0 / 0).
    if not is_event.any() or not is_nonevent.any():
        raise ValueError(
            "y_true must contain at least one event and one non-event"
        )

    cuts = np.sort(np.atleast_1d(np.asarray(thresholds, dtype=float)))
    old_category = np.digitize(old, cuts)
    new_category = np.digitize(new, cuts)
    moved_up = new_category > old_category
    moved_down = new_category < old_category

    nri_events = moved_up[is_event].mean() - moved_down[is_event].mean()
    nri_nonevents = (moved_down[is_nonevent].mean()
                     - moved_up[is_nonevent].mean())
    return float(nri_events + nri_nonevents)


def main():
    """Fit both models, print the NRI and show it as a bar chart."""
    # Imported here so that the NRI helper above can be imported and tested
    # without the modelling / plotting stack being installed.
    from sklearn.linear_model import LogisticRegression
    # Kept from the original import list; currently unused.
    from sklearn.metrics import classification_report  # noqa: F401
    import matplotlib.pyplot as plt

    # Load the data.
    snp_data = pd.read_csv('snp_data.csv')  # SNP features, one row per sample
    # NOTE(review): the original script had a commented-out load of
    # 'other_data.csv' (additional features); it is still not used here.
    target = pd.read_csv('target.csv')  # expected to provide a 'target' column

    # concat(axis=1) aligns on the index; a row-count mismatch would silently
    # introduce NaN rows and only fail later inside fit().
    if len(snp_data) != len(target):
        raise ValueError(
            "snp_data.csv and target.csv must contain the same number of rows"
        )

    # Merge features and target into one frame.
    data = pd.concat([snp_data, target], axis=1)

    # Split into training (80%) and test (20%) sets.
    train_data = data.sample(frac=0.8, random_state=1)
    test_data = data.drop(train_data.index)

    X_train = train_data.drop('target', axis=1)
    y_train = train_data['target']
    X_test = test_data.drop('target', axis=1)
    y_test = test_data['target']

    # Train the baseline model and the "improved" (L1-penalised) model.
    # NOTE(review): both models see the same features; only the
    # regularisation differs.
    base_model = LogisticRegression(random_state=1)
    base_model.fit(X_train, y_train)
    improved_model = LogisticRegression(
        random_state=1, solver='liblinear', penalty='l1'
    )
    improved_model.fit(X_train, y_train)

    # Predict event probabilities and compute the NRI.
    base_proba = base_model.predict_proba(X_test)
    improved_proba = improved_model.predict_proba(X_test)
    nri = net_reclassification_index(
        y_test, base_proba[:, 1], improved_proba[:, 1]
    )

    # Report the NRI.
    print('NRI:', nri)

    # Visualise the NRI (the baseline is the reference, hence 0).
    nri_df = pd.DataFrame({'Model': ['Baseline', 'Improved'], 'NRI': [0, nri]})
    plt.bar(nri_df['Model'], nri_df['NRI'], color=['#1f77b4', '#ff7f0e'])
    # NRI lies in [-2, 2]; keep the original [0, 1] window but widen it when
    # the value falls outside so the bar is never clipped.
    plt.ylim([min(0.0, nri), max(1.0, nri)])
    plt.xlabel('Model')
    plt.ylabel('NRI')
    plt.title('Net Reclassification Improvement')
    plt.show()


if __name__ == "__main__":
    main()
|