utils.py 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141
  1. import pandas as pd
  2. from pandas.api.types import is_string_dtype
  3. from pandas.api.types import is_numeric_dtype
  4. from sklearn.preprocessing import LabelEncoder
  5. import matplotlib.pyplot as plt
  6. from sklearn.preprocessing import MinMaxScaler
  7. from sklearn.model_selection import train_test_split
  8. import os
  9. import pyreadr
  10. from sklearn.linear_model import LogisticRegression
  11. import joblib
  12. from sklearn.metrics import roc_curve, auc
  13. import numpy as np
  14. def ROC_curve(clf, X_test, y_test, name):
  15. # ROC曲线绘制,AUC值计算
  16. # 计算每一类的得分
  17. yscore_rf = clf.predict_proba(X_test)
  18. fpr_rf, tpr_rf, thersholds = roc_curve(y_test, yscore_rf[:, 1])
  19. plt.figure()
  20. plt.plot(fpr_rf, tpr_rf, color='darkorange',
  21. lw=2, label='ROC curve (area = %0.2f)' % auc(fpr_rf, tpr_rf))
  22. plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
  23. plt.xlim([0.0, 1.0])
  24. plt.ylim([0.0, 1.05])
  25. plt.xlabel('False Positive Rate')
  26. plt.ylabel('True Positive Rate')
  27. # plt.title('Receiver operating characteristic example')
  28. plt.title(name)
  29. plt.legend(loc="lower right")
  30. # plt.savefig(plt.savefig('figs/savefig_example.png'))
  31. plt.savefig('../images/' + name + '.png')
  32. # plt.show()
  33. def save_gene_list(COAD_path, STAD_path):
  34. print(COAD_path, STAD_path)
  35. # 读取非胃癌数据
  36. data_health = pyreadr.read_r(COAD_path)
  37. data_health = data_health['data'] # .head(10000) # 提取前10000行,后续把.head()删掉
  38. # 读取胃癌数据
  39. data_sick = pyreadr.read_r(STAD_path)
  40. data_sick = data_sick['data'] # .head(10000) # 提取前10000行,后续把.head()删掉
  41. # 数据处理
  42. data_health['D'] = 0
  43. data_sick['D'] = 1
  44. data = pd.concat([data_health, data_sick])
  45. print(50, type(data))
  46. data.to_csv('data.csv')
  47. # data_sick[['Variant_Classification','Tumor_Sample_Barcode']]
  48. sample_list = list(data.groupby(['Tumor_Sample_Barcode']).groups.keys()) # 把获取类别转为列表
  49. gene_list = list(data.groupby(['Variant_Classification']).groups.keys()) # 把获取类别转为列表
  50. np.save('sample_list.npy', np.array(sample_list)) # 保存为.npy格式
  51. np.save('gene_list.npy', np.array(gene_list)) # 保存为.npy格式
  52. def get_gene_list():
  53. gene_list = np.load('gene_list.npy')
  54. gene_list = gene_list.tolist()
  55. return gene_list
  56. def get_sample_list():
  57. sample_list = np.load('sample_list.npy')
  58. sample_list = sample_list.tolist()
  59. return sample_list
  60. def get_data():
  61. snp_data = pd.read_csv('data.csv') # 假设SNP数据已保存在CSV文件中
  62. # save_gene_list(COAD_path='../Rdata/COAD.Rdata', STAD_path='../Rdata/STAD.Rdata')
  63. # 计算基因数目并保存
  64. def count_the_number_of_genes():
  65. data = pd.read_csv('data.csv', low_memory=False) # 假设SNP数据已保存在CSV文件中
  66. gene_list = get_gene_list()
  67. sample_list = get_sample_list()
  68. list_final = gene_list + ['Tumor_Sample_Barcode', 'D']
  69. data_final = pd.DataFrame(columns=list_final)
  70. ind = 0
  71. for sample in sample_list:
  72. tempdata = data[data['Tumor_Sample_Barcode'] == sample]
  73. message = []
  74. for gene in gene_list:
  75. num = len(tempdata[tempdata['Variant_Classification'] == gene])
  76. message.append(num)
  77. message = message + [sample, tempdata.iloc[0][-1]]
  78. data_final.loc[ind] = message
  79. ind = ind + 1
  80. data_final.to_csv('data_final.csv')
  81. # count_the_number_of_genes()
  82. # 数据处理
  83. def load_data(COAD_path, STAD_path):
  84. # 读取非胃癌数据
  85. data_health = pyreadr.read_r('COAD.Rdata')
  86. data_health = data_health['data'] # .head(10000) # 提取前10000行,后续把.head()删掉
  87. # 读取胃癌数据
  88. data_sick = pyreadr.read_r('STAD.Rdata')
  89. data_sick = data_sick['data'] # .head(10000) # 提取前10000行,后续把.head()删掉
  90. def training_SC_GRS_model():
  91. data_final = pd.read_csv('data_final.csv', low_memory=False) # 假设SNP数据已保存在CSV文件中
  92. gene_list = get_gene_list()
  93. data_final['Gi_sum'] = data_final[gene_list].sum(axis=1)
  94. # 模型训练
  95. X_train, X_test, y_train, y_test = train_test_split(data_final['Gi_sum'].values.reshape(-1, 1),
  96. data_final['D'].values.astype('int'), test_size=0.2)
  97. clf = LogisticRegression(penalty='l2')
  98. clf.fit(X_train, y_train)
  99. # 模型系数
  100. coef = clf.coef_
  101. intercept = clf.intercept_
  102. # 储存模型
  103. joblib.dump(clf, 'clf_scgrs.model')
  104. # ROC绘图 SC_GRS方法
  105. ROC_curve(clf, X_test, y_test, name='SC_GRS')
  106. print('training_SC_GRS_model')
  107. # 模型预测,假设data_final是输入数据,注意,data_final已经是处理好对应的基因数目的数据
  108. # 模型加载
  109. gene_list = np.load('gene_list.npy')
  110. gene_list = gene_list.tolist()
  111. clf_scgrs = joblib.load('clf_scgrs.model')
  112. # 数据处理
  113. X_test = data_final[gene_list].sum(axis=1).values.reshape(-1, 1)
  114. y_pre = clf.predict(X_test) # 分类结果预测
  115. prob_pre = clf.predict_proba(X_test) # 结果概率预测
  116. # print(95, prob_pre)
  117. # print(95, np.amin(prob_pre))
  118. # print(95, np.amax(prob_pre))
  119. print(95, np.mean(prob_pre))
  120. print(95, np.mean(y_pre))
  121. training_SC_GRS_model()
  122. def training_DL_GRS_model():
  123. print('training_DL_GRS_model')
  124. def training_OR_GRS_model():
  125. print('training_OR_GRS_model')