tasks.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283
  1. import json
  2. import logging
  3. import time
  4. import matplotlib
  5. matplotlib.use('Agg')
  6. import pandas as pd
  7. from pandas.api.types import is_string_dtype
  8. from pandas.api.types import is_numeric_dtype
  9. from sklearn.preprocessing import LabelEncoder
  10. import matplotlib.pyplot as plt
  11. from sklearn.preprocessing import MinMaxScaler
  12. from sklearn.model_selection import train_test_split
  13. import os
  14. import pyreadr
  15. from sklearn.linear_model import LogisticRegression
  16. import joblib
  17. from sklearn.metrics import roc_curve, auc
  18. import numpy as np
  19. from celery import shared_task
  20. from files.models import Files as Files_models
  21. from datetime import datetime, timedelta
  22. from django.utils import timezone
  23. import os
  24. from order.models import Order as Order_models
  25. from files.models import Files as Files_models
  26. from celery.utils.log import get_task_logger
  27. logger = get_task_logger(__name__)
  28. @shared_task
  29. def get_AUC(order_id):
  30. matplotlib.use('Agg')
  31. print(31, order_id)
  32. # logger.info(28282828, order_id)
  33. # 更新order状态
  34. try:
  35. order = Order_models.objects.get(id=order_id)
  36. order.status = 2
  37. order.save()
  38. # return JsonResponse(setSuccess(msg='更新成功'))
  39. except Order_models.DoesNotExist:
  40. logging.warning(json.dumps({
  41. 'msg': '订单不存在',
  42. 'order_id': order_id
  43. }))
  44. # return JsonResponse(setFailure(msg='订单不存在'))
  45. files = Files_models.objects.filter(order_id=order_id)
  46. logger.info(files[0].file_path)
  47. logger.info('-----')
  48. logger.info(files[1].file_path)
  49. logger.info('-----')
  50. logger.info(str(order_id))
  51. # body = json.loads(request.body)
  52. # logger.info(26, body)
  53. # 用户订单数据
  54. base_path = 'Rdata/' + str(order_id) + '/'
  55. logger.info(base_path)
  56. # return
  57. if not os.path.exists(base_path):
  58. os.makedirs(base_path)
  59. def ROC_curve(clf, X_test, y_test, name):
  60. logger.info('130130130130 ' + name + ' gene_list.npy 2')
  61. # ROC曲线绘制,AUC值计算
  62. # 计算每一类的得分
  63. yscore_rf = clf.predict_proba(X_test)
  64. logger.info('6666666666 ')
  65. fpr_rf, tpr_rf, thersholds = roc_curve(y_test, yscore_rf[:, 1])
  66. logger.info('797979797979 ')
  67. plt.figure()
  68. logger.info('8080808080 ')
  69. plt.plot(fpr_rf, tpr_rf, color='darkorange',
  70. lw=2, label='ROC curve (arebody = json.loads(request.body)a = %0.2f)' % auc(fpr_rf, tpr_rf))
  71. plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
  72. plt.xlim([0.0, 1.0])
  73. plt.ylim([0.0, 1.05])
  74. plt.xlabel('False Positive Rate')
  75. plt.ylabel('True Positive Rate')
  76. # plt.title('Receiver operating characteristic example')
  77. plt.title(name)
  78. plt.legend(loc="lower right")
  79. # plt.savefig(plt.savefig('figs/savefig_example.png'))
  80. plt.savefig(base_path + name + '.png')
  81. # plt.show()
  82. logger.info('797979')
  83. # 读取非胃癌数据
  84. data_health = pyreadr.read_r(str(files[0].file_path)) # 'COAD.Rdata'
  85. data_health = data_health['data'] # .head(10000) # 提取前10000行,后续把.head()删掉
  86. logger.info('8383838383')
  87. # 读取胃癌数据
  88. data_sick = pyreadr.read_r(str(files[1].file_path)) # 'STAD.Rdata'
  89. data_sick = data_sick['data'] # .head(10000) # 提取前10000行,后续把.head()删掉
  90. logger.info('1111')
  91. logger.info('8888888')
  92. # 数据处理
  93. data_health['D'] = 0
  94. data_sick['D'] = 1
  95. data = pd.concat([data_health, data_sick])
  96. # data_sick[['Variant_Classification','Tumor_Sample_Barcode']]
  97. sample_list = list(data.groupby(['Tumor_Sample_Barcode']).groups.keys()) # 把获取类别转为列表
  98. gene_list = list(data.groupby(['Variant_Classification']).groups.keys()) # 把获取类别转为列表
  99. np.save(base_path + 'gene_list.npy', np.array(gene_list)) # 保存为.npy格式
  100. np.save(base_path + 'sample_list.npy', np.array(sample_list)) # 保存为.npy格式
  101. logger.info('2222')
  102. # 计算基因数目
  103. list_final = gene_list + ['Tumor_Sample_Barcode', 'D']
  104. data_final = pd.DataFrame(columns=list_final)
  105. def save_Variant_Type():
  106. Variant_Type = data['Variant_Type'].value_counts()
  107. # 创建一个条形图
  108. plt.bar(Variant_Type.index, Variant_Type.values)
  109. # 添加标题和轴标签
  110. plt.title('Variant Type Counts')
  111. plt.xlabel('Variant Type')
  112. plt.ylabel('Count')
  113. # 保存图形为 PNG 文件
  114. plt.savefig(base_path + 'Variant_Type' + '.png')
  115. def save_Variant_Type2(name, max_value=100):
  116. Variant_Type2 = data[name].value_counts()
  117. # 创建一个条形图
  118. plt.plot(Variant_Type2.values, Variant_Type2.index)
  119. plt.ylim([0.0, max_value])
  120. plt.xlim([0.0, len(Variant_Type2.index)])
  121. # 添加标题和轴标签
  122. plt.title(name)
  123. plt.xlabel(name)
  124. plt.ylabel('Count')
  125. # 保存图形为 PNG 文件
  126. plt.savefig(base_path + name + '.png')
  127. time.sleep(1)
  128. save_Variant_Type()
  129. time.sleep(1)
  130. save_Variant_Type2('t_ref_count', max_value=1000)
  131. time.sleep(1)
  132. save_Variant_Type2('t_alt_count', max_value=1000)
  133. time.sleep(1)
  134. save_Variant_Type2('t_depth', max_value=1000)
  135. time.sleep(1)
  136. # 绘制图像
  137. # plt.imshow(data['t_depth'])
  138. # 保存为图片文件
  139. # plt.savefig(base_path + "t_depth.png", format='png')
  140. # t_depth
  141. #
  142. # t_ref_count
  143. #
  144. # t_alt_count
  145. #
  146. # n_depth
  147. ind = 0
  148. for sample in sample_list:
  149. tempdata = data[data['Tumor_Sample_Barcode'] == sample]
  150. message = []
  151. for gene in gene_list:
  152. num = len(tempdata[tempdata['Variant_Classification'] == gene])
  153. message.append(num)
  154. message = message + [sample, tempdata.iloc[0][-1]]
  155. data_final.loc[ind] = message
  156. ind = ind + 1
  157. # data_final
  158. logger.info('113113113113113')
  159. # -------------------------------------SC_GRS方法-----------------------------------------
  160. data_final['Gi_sum'] = data_final[gene_list].sum(axis=1)
  161. # 模型训练
  162. X_train, X_test, y_train, y_test = train_test_split(data_final['Gi_sum'].values.reshape(-1, 1),
  163. data_final['D'].values.astype('int'), test_size=0.2)
  164. clf = LogisticRegression(penalty='l2')
  165. clf.fit(X_train, y_train)
  166. # 模型系数
  167. coef = clf.coef_
  168. intercept = clf.intercept_
  169. # 储存模型
  170. joblib.dump(clf, base_path + 'clf_scgrs.model')
  171. logger.info('3333')
  172. # ROC绘图 SC_GRS方法
  173. ROC_curve(clf, X_test, y_test, name='SC_GRS')
  174. logger.info('130130130130 ' + base_path + 'gene_list.npy')
  175. # 模型预测,假设data_final是输入数据,注意,data_final已经是处理好对应的基因数目的数据
  176. # 模型加载
  177. gene_list = np.load(base_path + 'gene_list.npy')
  178. logger.info('134134134134134134134')
  179. gene_list = gene_list.tolist()
  180. clf_scgrs = joblib.load(base_path + 'clf_scgrs.model')
  181. logger.info('137137137137137')
  182. # 数据处理
  183. X_test = data_final[gene_list].sum(axis=1).values.reshape(-1, 1)
  184. y_pre = clf.predict(X_test) # 分类结果预测
  185. prob_pre = clf.predict_proba(X_test) # 结果概率预测
  186. logger.info('4444')
  187. # 绘制图像
  188. plt.imshow(prob_pre)
  189. # 保存为图片文件
  190. plt.savefig(base_path + "SC_GRS_prob_pre.png", format='png')
  191. # -------------------------------------DL_GRS方法-----------------------------------------
  192. # 模型训练
  193. X_train, X_test, y_train, y_test = train_test_split(data_final[gene_list], data_final['D'].values.astype('int'),
  194. test_size=0.2)
  195. clf = LogisticRegression(penalty='l2', max_iter=1000)
  196. clf.fit(X_train, y_train)
  197. # 模型系数
  198. coef = clf.coef_
  199. intercept = clf.intercept_
  200. # 储存模型
  201. joblib.dump(clf, base_path + 'clf_dlgrs.model')
  202. # ROC曲线绘制,AUC值计算
  203. ROC_curve(clf, X_test, y_test, name='DL_GRS')
  204. # 模型预测,假设data_final是输入数据,注意,data_final已经是处理好对应的基因数目的数据
  205. # 模型加载
  206. gene_list = np.load(base_path + 'gene_list.npy')
  207. gene_list = gene_list.tolist()
  208. clf_scgrs = joblib.load(base_path + 'clf_dlgrs.model')
  209. # 数据处理
  210. X_test = data_final[gene_list]
  211. y_pre = clf.predict(X_test) # 分类结果预测
  212. prob_pre = clf.predict_proba(X_test) # 结果概率预测
  213. # 绘制图像
  214. plt.imshow(prob_pre)
  215. # 保存为图片文件
  216. plt.savefig(base_path + "DL_GRS_prob_pre.png", format='png')
  217. # -------------------------------------OR_GRS方法-----------------------------------------
  218. # or值计算
  219. tempdata = data_final[data_final['D'] == 0]
  220. gene_mean0 = tempdata[gene_list].mean()
  221. tempdata = data_final[data_final['D'] == 1]
  222. gene_mean1 = tempdata[gene_list].mean()
  223. omega_or = np.log(gene_mean1 / gene_mean0)
  224. # 模型训练
  225. data_final['GRS'] = (data_final[gene_list] * omega_or).sum(axis=1)
  226. X_train, X_test, y_train, y_test = train_test_split(data_final['GRS'].values.reshape(-1, 1),
  227. data_final['D'].values.astype('int'), test_size=0.2)
  228. clf = LogisticRegression(penalty='l2')
  229. clf.fit(X_train, y_train)
  230. # 模型系数
  231. coef = clf.coef_
  232. intercept = clf.intercept_
  233. # 储存模型
  234. joblib.dump(clf, base_path + 'clf_orgrs.model')
  235. np.save(base_path + 'omega_or.npy', np.array(omega_or)) # 保存为.npy格式
  236. # ROC曲线绘制,AUC值计算 OR_GRS方法
  237. ROC_curve(clf, X_test, y_test, name='OR_GRS')
  238. # 模型预测,假设data_final是输入数据,注意,data_final已经是处理好对应的基因数目的数据
  239. # 模型加载
  240. gene_list = np.load(base_path + 'gene_list.npy')
  241. gene_list = gene_list.tolist()
  242. clf_scgrs = joblib.load(base_path + 'clf_dlgrs.model')
  243. omega_or = np.load(base_path + 'omega_or.npy')
  244. # 数据处理
  245. X_test = (data_final[gene_list] * omega_or).sum(axis=1).values.reshape(-1, 1)
  246. y_pre = clf.predict(X_test) # 分类结果预测
  247. prob_pre = clf.predict_proba(X_test) # 结果概率预测
  248. # 绘制图像
  249. plt.imshow(prob_pre)
  250. # 保存为图片文件
  251. plt.savefig(base_path + "OR_GRS_prob_pre.png", format='png')
  252. # logger.info(154, prob_pre)
  253. # 更新order状态
  254. try:
  255. order = Order_models.objects.get(id=order_id)
  256. order.status = 3
  257. order.save()
  258. # return JsonResponse(setSuccess(msg='更新成功'))
  259. except Order_models.DoesNotExist:
  260. logging.warning(json.dumps({
  261. 'msg': '订单不存在',
  262. 'order_id': order_id
  263. }))