"""Train a logistic-regression classifier on genotype columns of a MAF file.

Run as a script, this reads the MAF file and the risk table, merges them on
the individual ID column, fits the model on a 70/30 train/test split, prints
the test accuracy and shows a scatter plot of true versus predicted labels.
"""

import numpy as np
import pandas as pd

MAF_PATH = '69dbfe7c-3efc-4126-b36d-2b59aca7a16d.wxs.aliquot_ensemble_masked.maf'
RISK_PATH = 'risk_gene_data.csv'
ID_COLUMN = 'id'
TARGET_COLUMN = 'risk_score'
# Positional index of the first column treated as genotype data.
GENOTYPE_START_COL = 10
# Genotype string -> numeric code; values not listed here are left unchanged.
GENOTYPE_CODES = {'AA': 0, 'AT': 1, 'TT': 2}
TEST_SIZE = 0.3
# Fixed seed so the split (and the reported accuracy) is reproducible.
RANDOM_STATE = 42


def encode_genotypes(genotypes, codes=None):
    """Return a copy of *genotypes* with genotype strings replaced by codes.

    Args:
        genotypes: DataFrame whose cells may contain genotype strings.
        codes: Optional mapping from genotype string to numeric code;
            defaults to ``GENOTYPE_CODES``.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    mapping = GENOTYPE_CODES if codes is None else codes
    # Non-inplace replace avoids mutating a slice of the caller's frame;
    # infer_objects() turns fully-encoded object columns into numeric dtypes.
    return genotypes.replace(mapping).infer_objects()


def extract_genotypes(maf_data, start_col=GENOTYPE_START_COL,
                      id_column=ID_COLUMN):
    """Slice the genotype columns out of *maf_data* and encode them.

    The ID column is kept even when it lies before *start_col*, so the
    result can always be merged on it.

    Args:
        maf_data: DataFrame read from the MAF file.
        start_col: Positional index of the first genotype column.
        id_column: Name of the individual-ID column.

    Returns:
        DataFrame holding the ID column plus the encoded genotype columns.

    Raises:
        KeyError: If *id_column* is not present in *maf_data*.
    """
    if id_column not in maf_data.columns:
        raise KeyError(
            f"column {id_column!r} not found in MAF data; cannot merge on it"
        )
    genotypes = encode_genotypes(maf_data.iloc[:, start_col:])
    if id_column not in genotypes.columns:
        genotypes.insert(0, id_column, maf_data[id_column])
    return genotypes


def split_features_target(merged, id_column=ID_COLUMN,
                          target_column=TARGET_COLUMN):
    """Split *merged* into a feature frame and the target series.

    Columns are selected by name: the ID is an identifier rather than a
    predictor, and the target must never appear among the features.

    Returns:
        Tuple ``(features, target)``.

    Raises:
        KeyError: If *id_column* or *target_column* is missing.
    """
    features = merged.drop(columns=[id_column, target_column])
    return features, merged[target_column]


def train_and_evaluate(merged, test_size=TEST_SIZE, random_state=RANDOM_STATE):
    """Fit a logistic regression on a train split and score the test split.

    Args:
        merged: DataFrame with the ID column, feature columns and target.
        test_size: Fraction of rows held out for testing.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        Tuple ``(test_data, test_pred, accuracy)`` where *accuracy* is the
        fraction of test rows whose prediction equals the true label.
    """
    # Imported lazily so the data-preparation helpers stay importable
    # without scikit-learn installed.
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    train_data, test_data = train_test_split(
        merged, test_size=test_size, random_state=random_state
    )
    train_x, train_y = split_features_target(train_data)
    test_x, test_y = split_features_target(test_data)

    model = LogisticRegression()
    model.fit(train_x, train_y)
    test_pred = model.predict(test_x)

    accuracy = np.mean(test_pred == test_y.to_numpy())
    return test_data, test_pred, accuracy


def plot_predictions(test_data, test_pred):
    """Show a scatter plot of true and predicted labels per individual."""
    # Imported lazily so the rest of the module works without matplotlib.
    import matplotlib.pyplot as plt

    plt.scatter(test_data[ID_COLUMN], test_data[TARGET_COLUMN],
                color='blue', label='True risk score')
    plt.scatter(test_data[ID_COLUMN], test_pred,
                color='red', label='Predicted risk score')
    plt.xlabel('Individual ID')
    plt.ylabel('Risk score')
    plt.title('Genetic risk score prediction')
    plt.legend()
    plt.show()


def main():
    """Run the full pipeline: load, merge, train, report and plot."""
    # Read the MAF file ('#'-prefixed header lines are skipped).
    maf_data = pd.read_csv(MAF_PATH, sep='\t', comment='#')
    # Read the disease-risk association table.
    risk_gene_data = pd.read_csv(RISK_PATH)

    # Merge the two data sets on the individual ID.
    merged_data = pd.merge(
        extract_genotypes(maf_data), risk_gene_data, on=ID_COLUMN
    )

    test_data, test_pred, accuracy = train_and_evaluate(merged_data)
    print('模型预测准确率:', accuracy)
    plot_predictions(test_data, test_pred)


if __name__ == "__main__":
    main()