"""Compare a baseline and an L1-penalised logistic regression using NRI.

The script reads ``snp_data.csv``, ``other_data.csv`` and ``target.csv`` from
the working directory, joins them column-wise, fits two logistic-regression
models on an 80/20 split and reports the Net Reclassification Improvement
(NRI) of the second model over the first.  The combined table must contain a
column named ``target``.
"""

import numpy as np
import pandas as pd


def net_reclassification_index(y_true, base_scores, improved_scores,
                               thresholds=None, pos_label=1):
    """Return the Net Reclassification Improvement of one model over another.

    scikit-learn does not provide this metric, so it is computed here as

        NRI = [P(up | event) - P(down | event)]
              + [P(down | non-event) - P(up | non-event)]

    where "up"/"down" describe how the improved model moved each sample's
    predicted risk relative to the baseline model.  The result lies in
    ``[-2, 2]``; positive values favour the improved model.

    Args:
        y_true: 1-D array-like of observed labels.
        base_scores: 1-D array-like of baseline predicted probabilities.
        improved_scores: 1-D array-like of improved-model probabilities.
        thresholds: Optional iterable of risk cut-points.  When given, a
            sample only counts as moved if it changes risk category (a score
            equal to a cut-point belongs to the higher category).  When
            ``None`` any change in score counts (category-free NRI).
        pos_label: Value in ``y_true`` that marks an event.

    Returns:
        The NRI as a ``float``.

    Raises:
        ValueError: If the inputs differ in length, or if ``y_true`` does not
            contain at least one event and one non-event.
    """
    labels = np.asarray(y_true).ravel()
    old = np.asarray(base_scores, dtype=float).ravel()
    new = np.asarray(improved_scores, dtype=float).ravel()

    if not (labels.shape == old.shape == new.shape):
        raise ValueError(
            "y_true, base_scores and improved_scores must have the same length"
        )

    is_event = labels == pos_label
    n_events = int(is_event.sum())
    n_nonevents = int(labels.size - n_events)
    if n_events == 0 or n_nonevents == 0:
        # Both conditional proportions are needed; avoid a silent 0/0 -> nan.
        raise ValueError(
            "y_true must contain at least one event and one non-event"
        )

    if thresholds is not None:
        # Compare risk categories instead of raw probabilities.
        cuts = np.sort(np.asarray(thresholds, dtype=float).ravel())
        old = np.digitize(old, cuts)
        new = np.digitize(new, cuts)

    moved_up = new > old
    moved_down = new < old

    event_part = (
        int(moved_up[is_event].sum()) - int(moved_down[is_event].sum())
    ) / n_events
    nonevent_part = (
        int(moved_down[~is_event].sum()) - int(moved_up[~is_event].sum())
    ) / n_nonevents
    return float(event_part + nonevent_part)


def main():
    """Fit both models, print the NRI and show it as a bar chart."""
    # scikit-learn and matplotlib are imported here so that the NRI helper
    # above can be imported and tested with numpy alone.
    import matplotlib.pyplot as plt
    from sklearn.linear_model import LogisticRegression
    # Kept from the original import list; currently unused.
    from sklearn.metrics import classification_report  # noqa: F401

    # Load data (each table is expected to already exist as a CSV file).
    snp_data = pd.read_csv('snp_data.csv')
    other_data = pd.read_csv('other_data.csv')
    target = pd.read_csv('target.csv')

    # Join the three tables column-wise; rows are matched on their index.
    data = pd.concat([snp_data, other_data, target], axis=1)

    # 80/20 train/test split with a fixed seed.
    train_data = data.sample(frac=0.8, random_state=1)
    test_data = data.drop(train_data.index)

    X_train = train_data.drop('target', axis=1)
    y_train = train_data['target']
    X_test = test_data.drop('target', axis=1)
    y_test = test_data['target']

    # Baseline model: default logistic regression.
    base_model = LogisticRegression(random_state=1)
    base_model.fit(X_train, y_train)

    # Improved model: L1-penalised logistic regression.
    improved_model = LogisticRegression(
        random_state=1, solver='liblinear', penalty='l1'
    )
    improved_model.fit(X_train, y_train)

    # Score the held-out set; column 1 of predict_proba is the probability of
    # the model's second class (assumed to be the event label 1 -- TODO
    # confirm against the contents of target.csv).
    base_proba = base_model.predict_proba(X_test)
    improved_proba = improved_model.predict_proba(X_test)
    nri = net_reclassification_index(
        y_test, base_proba[:, 1], improved_proba[:, 1]
    )

    print('NRI:', nri)

    # Bar chart: the baseline is the reference point, so its NRI is 0.
    nri_df = pd.DataFrame({'Model': ['Baseline', 'Improved'], 'NRI': [0, nri]})
    plt.bar(nri_df['Model'], nri_df['NRI'], color=['#1f77b4', '#ff7f0e'])
    # NRI ranges over [-2, 2]; keep the default [0, 1] view but widen it when
    # the value would otherwise be clipped.
    plt.ylim([min(0.0, nri), max(1.0, nri)])
    plt.xlabel('Model')
    plt.ylabel('NRI')
    plt.title('Net Reclassification Improvement')
    plt.show()


if __name__ == "__main__":
    main()