get_pkl.py

import pandas as pd
import numpy as np
import joblib
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Note: maftools lives in R/Bioconductor and cannot be imported from Python;
# the MAF file is parsed directly with pandas below instead.
# 1. Prepare the data
# Read the MAF file (tab-separated; lines starting with '#' are comments)
maf_df = pd.read_csv('example.maf', sep='\t', comment='#')
# Convert it to CSV
maf_df.to_csv('example.csv', index=False)

data = pd.read_csv('example.csv')
# Drop rows whose all_effects value is 'MAPKBP1'
data = data.drop(data[data['all_effects'] == 'MAPKBP1'].index)
# Drop irrelevant feature columns
# data.drop(['id', 'class'], axis=1, inplace=True)
# Fill missing values
# imputer = SimpleImputer(strategy='mean', fill_value=0)
# data = imputer.fit_transform(data)
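
# A minimal optional sketch (assumption: the numeric columns of the MAF-derived
# table may contain NaNs). SimpleImputer fills them with the column mean while
# leaving the text columns untouched; num_cols is a helper introduced here.
num_cols = data.select_dtypes(include=[np.number]).columns
data[num_cols] = SimpleImputer(strategy='mean').fit_transform(data[num_cols])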
# Split features and labels: the numeric columns are the features and the
# last column is the class label (mirroring the commented-out draft below)
features = data.iloc[:, :-1].select_dtypes(include=[np.number])
y = data.iloc[:, -1]

# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(features)

# Variance-based feature selection: drop near-constant features
selector = VarianceThreshold(threshold=0.01)
X = selector.fit_transform(X)
print('X.shape', X.shape)
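
# Optional sanity check (not in the original script): report which feature
# columns survived the variance filter, via VarianceThreshold.get_support().
kept = features.columns[selector.get_support()]
print('features kept by VarianceThreshold:', list(kept))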
# If the target array were one-hot encoded with shape (57, 2), it could be
# collapsed to single class labels first:
# y = np.argmax(y, axis=1)  # convert one-hot binary labels to class indices
# y = y.reshape((-1,))      # reshape to (57,)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Build the random forest model
rfc = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rfc.fit(X_train, y_train)
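
# Optional (not in the original script): random forests expose per-feature
# importances, which hint at which of the kept features drive the classifier.
for name, imp in sorted(zip(kept, rfc.feature_importances_), key=lambda t: -t[1]):
    print(name, round(imp, 4))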
# Predict on the test set
y_pred = rfc.predict(X_test)
# Compute evaluation metrics; weighted averaging also covers label columns
# with more than two classes
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 score:", f1)
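
# Optional (not in the original script): a per-class breakdown complements
# the four scalar metrics above.
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, zero_division=0))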
# Earlier experiments, kept for reference: dropping rows for individual genes,
# one at a time
# data = data.drop(data[data['all_effects'] == 'MAPKBP1'].index)
# data = data.drop(data[data['Hugo_Symbol'] == 'BPNT1'].index)
# data = data.drop(data[data['Hugo_Symbol'] == 'MAPKBP1'].index)
# data = data.drop(data[data['Hugo_Symbol'] == 'TNS1'].index)
# data = data.drop(data[data['Hugo_Symbol'] == 'KCNH8'].index)
# data = data.drop(data[data['Hugo_Symbol'] == 'FOXL2NB'].index)
# data = data.drop(data[data['Hugo_Symbol'] == 'GHSR'].index)
# data = data.drop(data[data['Hugo_Symbol'] == 'WNK1'].index)
# data = data.drop(data[data['Hugo_Symbol'] == 'F12'].index)
# data = data.drop(data[data['Hugo_Symbol'] == 'ASB5'].index)
# data = data.drop(data[data['Hugo_Symbol'] == 'TBCCD1'].index)
# data = data.drop(data[data['Hugo_Symbol'] == 'GPR39'].index)
# data = data.drop(data[data['Hugo_Symbol'] == 'KCNF1'].index)
# data = data.drop(data[data['Hugo_Symbol'] == 'GDF7'].index)
# data = data.drop(data[data['Hugo_Symbol'] == 'TCF23'].index)
# data = data.drop(data[data['SYMBOL'] == 'MAPKBP1'].index)
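
# A compact alternative sketch (genes_to_drop is a hypothetical list, not in
# the original): the one-gene-at-a-time drops above collapse into a single
# isin() filter.
# genes_to_drop = ['BPNT1', 'MAPKBP1', 'TNS1', 'KCNH8']
# data = data[~data['Hugo_Symbol'].isin(genes_to_drop)]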
#
# Earlier draft of the same pipeline, kept for reference:
#
# data.drop('all_effects', axis=1, inplace=True)
#
# Debug helper: print every column value of row 42
# for index, row in data.iterrows():
#     if index == 42:
#         # print(row)
#         for elm_index in data.columns:
#             print(row[elm_index])
#
# print(303030, data[data['Hugo_Symbol'] == 'MAPKBP1'].index)
# print(303030, data[data['Hugo_Symbol'] == 'MAPKBP1'])
#
# df = data.drop(data.select_dtypes(include=['object']), axis=1)
# X = data.iloc[:, :-1]
# y = data.iloc[:, -1]
#
# 2. Split into training and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#
# Drop an unneeded row / rows with missing values
# data.drop(index=45, inplace=True)
# data.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
#
# 3. Choose the algorithm
# rfc = RandomForestClassifier(n_estimators=100, random_state=42)
#
# 4. Train the model
# rfc.fit(X_train, y_train)
#
# 5. Evaluate the model
# y_pred = rfc.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
# print(f"Accuracy: {accuracy:.2f}")
# 6. Save the trained model: this is the .pkl artifact the script is named for
joblib.dump(rfc, 'snp_model.pkl')
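
# Optional round-trip check (not in the original script): reload the saved
# model and re-score the held-out test set.
loaded = joblib.load('snp_model.pkl')
print('reloaded model accuracy:', accuracy_score(y_test, loaded.predict(X_test)))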