import json import logging import time import matplotlib matplotlib.use('Agg') import pandas as pd from pandas.api.types import is_string_dtype from pandas.api.types import is_numeric_dtype from sklearn.preprocessing import LabelEncoder import matplotlib.pyplot as plt from sklearn.preprocessing import MinMaxScaler from sklearn.model_selection import train_test_split import os import pyreadr from sklearn.linear_model import LogisticRegression import joblib from sklearn.metrics import roc_curve, auc import numpy as np from celery import shared_task from files.models import Files as Files_models from datetime import datetime, timedelta from django.utils import timezone import os from order.models import Order as Order_models from files.models import Files as Files_models from celery.utils.log import get_task_logger logger = get_task_logger(__name__) @shared_task def get_AUC(order_id): matplotlib.use('Agg') print(31, order_id) # logger.info(28282828, order_id) # 更新order状态 try: order = Order_models.objects.get(id=order_id) order.status = 2 order.save() # return JsonResponse(setSuccess(msg='更新成功')) except Order_models.DoesNotExist: logging.warning(json.dumps({ 'msg': '订单不存在', 'order_id': order_id })) # return JsonResponse(setFailure(msg='订单不存在')) files = Files_models.objects.filter(order_id=order_id) logger.info(files[0].file_path) logger.info('-----') logger.info(files[1].file_path) logger.info('-----') logger.info(str(order_id)) # body = json.loads(request.body) # logger.info(26, body) # 用户订单数据 base_path = 'Rdata/' + str(order_id) + '/' logger.info(base_path) # return if not os.path.exists(base_path): os.makedirs(base_path) def ROC_curve(clf, X_test, y_test, name): logger.info('130130130130 ' + name + ' gene_list.npy 2') # ROC曲线绘制,AUC值计算 # 计算每一类的得分 yscore_rf = clf.predict_proba(X_test) logger.info('6666666666 ') fpr_rf, tpr_rf, thersholds = roc_curve(y_test, yscore_rf[:, 1]) logger.info('797979797979 ') plt.figure() logger.info('8080808080 ') plt.plot(fpr_rf, tpr_rf, color='darkorange', lw=2, label='ROC curve (arebody = json.loads(request.body)a = %0.2f)' % auc(fpr_rf, tpr_rf)) plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--') plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') # plt.title('Receiver operating characteristic example') plt.title(name) plt.legend(loc="lower right") # plt.savefig(plt.savefig('figs/savefig_example.png')) plt.savefig(base_path + name + '.png') # plt.show() logger.info('797979') # 读取非胃癌数据 data_health = pyreadr.read_r(str(files[0].file_path)) # 'COAD.Rdata' data_health = data_health['data'] # .head(10000) # 提取前10000行,后续把.head()删掉 logger.info('8383838383') # 读取胃癌数据 data_sick = pyreadr.read_r(str(files[1].file_path)) # 'STAD.Rdata' data_sick = data_sick['data'] # .head(10000) # 提取前10000行,后续把.head()删掉 logger.info('1111') logger.info('8888888') # 数据处理 data_health['D'] = 0 data_sick['D'] = 1 data = pd.concat([data_health, data_sick]) # data_sick[['Variant_Classification','Tumor_Sample_Barcode']] sample_list = list(data.groupby(['Tumor_Sample_Barcode']).groups.keys()) # 把获取类别转为列表 gene_list = list(data.groupby(['Variant_Classification']).groups.keys()) # 把获取类别转为列表 np.save(base_path + 'gene_list.npy', np.array(gene_list)) # 保存为.npy格式 np.save(base_path + 'sample_list.npy', np.array(sample_list)) # 保存为.npy格式 logger.info('2222') # 计算基因数目 list_final = gene_list + ['Tumor_Sample_Barcode', 'D'] data_final = pd.DataFrame(columns=list_final) def save_Variant_Type(): Variant_Type = data['Variant_Type'].value_counts() # 创建一个条形图 plt.bar(Variant_Type.index, Variant_Type.values) # 添加标题和轴标签 plt.title('Variant Type Counts') plt.xlabel('Variant Type') plt.ylabel('Count') # 保存图形为 PNG 文件 plt.savefig(base_path + 'Variant_Type' + '.png') def save_Variant_Type2(name, max_value=100): Variant_Type2 = data[name].value_counts() # 创建一个条形图 plt.plot(Variant_Type2.values, Variant_Type2.index) plt.ylim([0.0, max_value]) plt.xlim([0.0, len(Variant_Type2.index)]) # 添加标题和轴标签 plt.title(name) plt.xlabel(name) plt.ylabel('Count') # 保存图形为 PNG 文件 plt.savefig(base_path + name + '.png') def save_end_data(self_prob_pre, name, pro_name): # 输出大于0.5的数据 second_column = self_prob_pre[:, 1] # 找到大于0.5的数据的下标 indices = np.where(second_column >= 0.5)[0] # print(144, pro_name, indices) with open(f'{base_path}{name}.txt', 'w') as f: for item in indices: f.write("%s\n" % item) # # # 绘制折线图 # # plt.plot(second_column) # plt.plot(indices, second_column[indices], 'ro') # # 设置 x 轴和 y 轴标签 # plt.title(f"{pro_name} Probability prediction greater than 0.5") # plt.xlabel("Index") # plt.ylabel("Value") # # 保存图像 # plt.savefig(f"{base_path}{name}概率预测.png", format="png") # # 清除图像 # plt.clf() # time.sleep(1) # save_Variant_Type() # time.sleep(1) # save_Variant_Type2('t_ref_count', max_value=1000) # time.sleep(1) # save_Variant_Type2('t_alt_count', max_value=1000) # time.sleep(1) # save_Variant_Type2('t_depth', max_value=1000) # time.sleep(1) # 绘制图像 # plt.imshow(data['t_depth']) # 保存为图片文件 # plt.savefig(base_path + "t_depth.png", format='png') # t_depth # # t_ref_count # # t_alt_count # # n_depth ind = 0 for sample in sample_list: tempdata = data[data['Tumor_Sample_Barcode'] == sample] message = [] for gene in gene_list: num = len(tempdata[tempdata['Variant_Classification'] == gene]) message.append(num) message = message + [sample, tempdata.iloc[0][-1]] data_final.loc[ind] = message ind = ind + 1 # data_final logger.info('113113113113113') # -------------------------------------SC_GRS方法----------------------------------------- data_final['Gi_sum'] = data_final[gene_list].sum(axis=1) # 模型训练 X_train, X_test, y_train, y_test = train_test_split(data_final['Gi_sum'].values.reshape(-1, 1), data_final['D'].values.astype('int'), test_size=0.2) clf = LogisticRegression(penalty='l2') clf.fit(X_train, y_train) # 模型系数 coef = clf.coef_ intercept = clf.intercept_ # 储存模型 joblib.dump(clf, base_path + 'clf_scgrs.model') logger.info('3333') # ROC绘图 SC_GRS方法 ROC_curve(clf, X_test, y_test, name='SC_GRS') logger.info('130130130130 ' + base_path + 'gene_list.npy') # 模型预测,假设data_final是输入数据,注意,data_final已经是处理好对应的基因数目的数据 # 模型加载 gene_list = np.load(base_path + 'gene_list.npy') logger.info('134134134134134134134') gene_list = gene_list.tolist() clf_scgrs = joblib.load(base_path + 'clf_scgrs.model') logger.info('137137137137137') # 数据处理 X_test = data_final[gene_list].sum(axis=1).values.reshape(-1, 1) y_pre = clf.predict(X_test) # 分类结果预测 prob_pre = clf.predict_proba(X_test) # 结果概率预测 logger.info('4444') # 绘制图像 plt.imshow(prob_pre) # 保存为图片文件 plt.savefig(base_path + "SC_GRS_prob_pre.png", format='png') # second_column = prob_pre[1, :] # # 找到大于0.5的数据的下标 # indices = np.where(second_column > 0.5) # # 绘制折线图 # plt.plot(second_column) # plt.plot(indices, second_column[indices], 'ro') # # 设置 x 轴和 y 轴标签 # plt.title('SC GRS Probability prediction greater than 0.5') # plt.xlabel('Index') # plt.ylabel('Value') # # 显示图形 # plt.savefig(base_path + "GRS概率预测.png", format='png') save_end_data(prob_pre, 'SC_GRS', 'SC GRS') # -------------------------------------DL_GRS方法----------------------------------------- # 模型训练 X_train, X_test, y_train, y_test = train_test_split(data_final[gene_list], data_final['D'].values.astype('int'), test_size=0.2) clf = LogisticRegression(penalty='l2', max_iter=1000) clf.fit(X_train, y_train) # 模型系数 coef = clf.coef_ intercept = clf.intercept_ # 储存模型 joblib.dump(clf, base_path + 'clf_dlgrs.model') # ROC曲线绘制,AUC值计算 ROC_curve(clf, X_test, y_test, name='DL_GRS') # 模型预测,假设data_final是输入数据,注意,data_final已经是处理好对应的基因数目的数据 # 模型加载 gene_list = np.load(base_path + 'gene_list.npy') gene_list = gene_list.tolist() clf_scgrs = joblib.load(base_path + 'clf_dlgrs.model') # 数据处理 X_test = data_final[gene_list] y_pre = clf.predict(X_test) # 分类结果预测 prob_pre = clf.predict_proba(X_test) # 结果概率预测 # 绘制图像 plt.imshow(prob_pre) # 保存为图片文件 plt.savefig(base_path + "DL_GRS_prob_pre.png", format='png') save_end_data(prob_pre, 'DL_GRS', 'DL GRS') # -------------------------------------OR_GRS方法----------------------------------------- # or值计算 tempdata = data_final[data_final['D'] == 0] gene_mean0 = tempdata[gene_list].mean() tempdata = data_final[data_final['D'] == 1] gene_mean1 = tempdata[gene_list].mean() omega_or = np.log(gene_mean1 / gene_mean0) # 模型训练 data_final['GRS'] = (data_final[gene_list] * omega_or).sum(axis=1) X_train, X_test, y_train, y_test = train_test_split(data_final['GRS'].values.reshape(-1, 1), data_final['D'].values.astype('int'), test_size=0.2) clf = LogisticRegression(penalty='l2') clf.fit(X_train, y_train) # 模型系数 coef = clf.coef_ intercept = clf.intercept_ # 储存模型 joblib.dump(clf, base_path + 'clf_orgrs.model') np.save(base_path + 'omega_or.npy', np.array(omega_or)) # 保存为.npy格式 # ROC曲线绘制,AUC值计算 OR_GRS方法 ROC_curve(clf, X_test, y_test, name='OR_GRS') # 模型预测,假设data_final是输入数据,注意,data_final已经是处理好对应的基因数目的数据 # 模型加载 gene_list = np.load(base_path + 'gene_list.npy') gene_list = gene_list.tolist() clf_scgrs = joblib.load(base_path + 'clf_dlgrs.model') omega_or = np.load(base_path + 'omega_or.npy') # 数据处理 X_test = (data_final[gene_list] * omega_or).sum(axis=1).values.reshape(-1, 1) y_pre = clf.predict(X_test) # 分类结果预测 prob_pre = clf.predict_proba(X_test) # 结果概率预测 # 绘制图像 plt.imshow(prob_pre) # 保存为图片文件 plt.savefig(base_path + "OR_GRS_prob_pre.png", format='png') save_end_data(prob_pre, 'OR_GRS', 'OR GRS') # logger.info(154, prob_pre) # 更新order状态 try: order = Order_models.objects.get(id=order_id) order.status = 3 order.save() # return JsonResponse(setSuccess(msg='更新成功')) except Order_models.DoesNotExist: logging.warning(json.dumps({ 'msg': '订单不存在', 'order_id': order_id }))