12345678910111213141516171819202122232425262728293031323334353637383940414243 |
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Script: fit a logistic-regression classifier that predicts `risk_score`
# from genotype columns taken out of a MAF file, then report the hold-out
# accuracy and plot true vs. predicted labels per individual.

# Fixed seed so the train/test split, and therefore the reported accuracy,
# is reproducible from run to run.
RANDOM_STATE = 42

# Columns of the merged table that must never be used as predictors:
# the merge key and the label itself.
NON_FEATURE_COLUMNS = ('id', 'risk_score')

# Load the MAF file (tab-separated; lines starting with '#' are skipped).
maf_data = pd.read_csv('69dbfe7c-3efc-4126-b36d-2b59aca7a16d.wxs.aliquot_ensemble_masked.maf', sep='\t', comment='#')

# Extract the genotype data: every column from position 10 onward.
# NOTE(review): assumes these columns hold the genotype calls and an 'id'
# column used for the merge below -- confirm against the actual MAF layout.
# .copy() detaches the slice from maf_data so the recoding below never
# operates on (or warns about) a view of the parent frame.
genotype_data = maf_data.iloc[:, 10:].copy()

# Recode genotype strings as the numeric values 0, 1, 2.
genotype_data = genotype_data.replace({'AA': 0, 'AT': 1, 'TT': 2})

# Load the disease-risk gene association data.
risk_gene_data = pd.read_csv('risk_gene_data.csv')

# Join the two data sets on the individual ID.
merged_data = pd.merge(genotype_data, risk_gene_data, on='id')

# Split into training (70%) and test (30%) sets.
train_data, test_data = train_test_split(
    merged_data, test_size=0.3, random_state=RANDOM_STATE
)

# Select predictors by name rather than by position: the ID is an identifier,
# not a feature, and the label must not leak into the inputs even if
# 'risk_score' is not the last column of the merged table.
feature_columns = [
    column for column in merged_data.columns
    if column not in NON_FEATURE_COLUMNS
]

# Build and train the logistic-regression model.
model = LogisticRegression()
model.fit(train_data[feature_columns], train_data['risk_score'])

# Predict on the test set.
test_pred = model.predict(test_data[feature_columns])

# Accuracy: fraction of test rows whose predicted label equals the true one.
accuracy = np.mean(test_pred == test_data['risk_score'])
print('模型预测准确率:', accuracy)

# Visualize true vs. predicted labels per individual.
plt.scatter(test_data['id'], test_data['risk_score'], color='blue', label='True risk score')
plt.scatter(test_data['id'], test_pred, color='red', label='Predicted risk score')
plt.xlabel('Individual ID')
plt.ylabel('Risk score')
plt.title('Genetic risk score prediction')
plt.legend()
plt.show()
|