# test5.py
  1. import pandas as pd
  2. import numpy as np
  3. import matplotlib.pyplot as plt
  4. from sklearn.model_selection import train_test_split
  5. from sklearn.linear_model import LogisticRegression
  6. # 读入MAF文件
  7. maf_data = pd.read_csv('69dbfe7c-3efc-4126-b36d-2b59aca7a16d.wxs.aliquot_ensemble_masked.maf', sep='\t', comment='#')
  8. # 提取基因型数据
  9. genotype_data = maf_data.iloc[:, 10:]
  10. # 将基因型转换成0,1,2的数字编码
  11. genotype_data.replace({'AA':0, 'AT':1, 'TT':2}, inplace=True)
  12. # 读入疾病风险相关基因关联数据
  13. risk_gene_data = pd.read_csv('risk_gene_data.csv')
  14. # 将两个数据集根据个体ID进行合并
  15. merged_data = pd.merge(genotype_data, risk_gene_data, on='id')
  16. # 将数据集分为训练集和测试集
  17. train_data, test_data = train_test_split(merged_data, test_size=0.3)
  18. # 构建逻辑回归模型并训练
  19. model = LogisticRegression()
  20. model.fit(train_data.iloc[:, :-1], train_data['risk_score'])
  21. # 对测试集进行预测
  22. test_pred = model.predict(test_data.iloc[:, :-1])
  23. # 计算模型预测准确率
  24. accuracy = np.mean(test_pred == test_data['risk_score'])
  25. print('模型预测准确率:', accuracy)
  26. # 绘制预测结果可视化图表
  27. plt.scatter(test_data['id'], test_data['risk_score'], color='blue', label='True risk score')
  28. plt.scatter(test_data['id'], test_pred, color='red', label='Predicted risk score')
  29. plt.xlabel('Individual ID')
  30. plt.ylabel('Risk score')
  31. plt.title('Genetic risk score prediction')
  32. plt.legend()
  33. plt.show()