"""Train a random-forest classifier on a table derived from a MAF file.

Steps performed at import/run time:

1. Read ``example.maf`` (tab-separated, ``#`` comment lines skipped) and
   re-save it as ``example.csv``.
2. Reload the CSV and drop the rows whose ``all_effects`` value is
   ``'MAPKBP1'``.
3. Split into train/test sets, then impute, standardize and apply a
   variance threshold (all fitted on the training split only).
4. Fit a ``RandomForestClassifier`` and print accuracy, precision, recall
   and F1 on the test split.

NOTE(review): the script never defined a real label. It passed
``data.shape`` (a 2-tuple) as ``y`` to ``train_test_split``, which cannot
work. The last CSV column is used as the label here, mirroring the disabled
earlier draft at the bottom of this file -- confirm which column actually
holds the class label.
"""
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# NOTE(review): the two imports below are not used by any active code in this
# script and may not resolve in a plain Python environment (they look like
# accidental auto-imports) -- confirm and remove if so.
import target as target
from bioconductor import maftools

# 1. Prepare the data
# Read the MAF file.
maf_df = pd.read_csv('example.maf', sep='\t', comment='#')
# Convert it to a CSV file.
maf_df.to_csv('example.csv', index=False)

data = pd.read_csv('example.csv')
data = data.drop(data[data['all_effects'] == 'MAPKBP1'].index)

# Drop irrelevant features (disabled in the original):
# data.drop(['id', 'class'], axis=1, inplace=True)

# Disabled label-handling draft from the original:
# Assume your target array originally has shape (57, 2)
# target = np.argmax(data.shape, axis=1)  # convert binary labels to a single class label
# target = target.reshape((-1,))  # change the shape to (57,)

# NOTE(review): label column is assumed to be the last column -- confirm.
target_column = data.columns[-1]
# Rows without a label cannot be used for supervised training.
data = data.dropna(subset=[target_column])
y = data[target_column]

# StandardScaler / VarianceThreshold only accept numeric input, and the MAF
# table contains string columns (e.g. 'all_effects'), so keep numeric
# features only.
X = data.drop(columns=[target_column]).select_dtypes(include='number')
if X.shape[1] == 0:
    raise ValueError('no numeric feature columns found in example.csv')

# Split into training and test sets *before* fitting any preprocessing so
# that no statistics from the test rows leak into the training pipeline.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fill missing values with the per-column training mean. Columns that are
# entirely missing in the training split are discarded by the imputer.
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Standardize.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Variance-based feature selection. After standardization every
# non-constant column has unit variance, so this removes constant columns.
selector = VarianceThreshold(threshold=0.01)
X_train = selector.fit_transform(X_train)
X_test = selector.transform(X_test)
# Report (total rows, selected features), as the original did for `data`.
print('data.shape', (X_train.shape[0] + X_test.shape[0], X_train.shape[1]))

# Build the random-forest model.
rfc = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rfc.fit(X_train, y_train)

# Predict on the test set.
y_pred = rfc.predict(X_test)

# Compute the evaluation metrics. The default 'binary' averaging raises for
# more than two classes, so fall back to 'weighted' in that case.
average = 'binary' if y.nunique() == 2 else 'weighted'
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average=average)
recall = recall_score(y_test, y_pred, average=average)
f1 = f1_score(y_test, y_pred, average=average)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 score:", f1)

# ---------------------------------------------------------------------------
# Earlier draft, disabled in the original and kept here for reference.
# ---------------------------------------------------------------------------
# data=data.drop(data[data['all_effects'] == 'MAPKBP1'].index)
# data=data.drop(data[data['Hugo_Symbol'] == 'BPNT1'].index)
# data=data.drop(data[data['Hugo_Symbol'] == 'MAPKBP1'].index)
# data=data.drop(data[data['Hugo_Symbol'] == 'TNS1'].index)
# data=data.drop(data[data['Hugo_Symbol'] == 'KCNH8'].index)
# data=data.drop(data[data['Hugo_Symbol'] == 'FOXL2NB'].index)
# data=data.drop(data[data['Hugo_Symbol'] == 'GHSR'].index)
# data=data.drop(data[data['Hugo_Symbol'] == 'WNK1'].index)
# data=data.drop(data[data['Hugo_Symbol'] == 'F12'].index)
# data=data.drop(data[data['Hugo_Symbol'] == 'ASB5'].index)
# data=data.drop(data[data['Hugo_Symbol'] == 'TBCCD1'].index)
# data=data.drop(data[data['Hugo_Symbol'] == 'GPR39'].index)
# data=data.drop(data[data['Hugo_Symbol'] == 'KCNF1'].index)
# data=data.drop(data[data['Hugo_Symbol'] == 'GDF7'].index)
# data=data.drop(data[data['Hugo_Symbol'] == 'TCF23'].index)
# data=data.drop(data[data['SYMBOL'] == 'MAPKBP1'].index)
#
# # data.drop('all_effects', axis=1, inplace=True)
#
# for index, row in data.iterrows():
#     if index == 42:
#         # print(row)
#         for elm_index in data.columns:
#             print(row[elm_index])
#
# # print(303030, data[data['Hugo_Symbol'] == 'MAPKBP1'].index)
# # print(303030, data[data['Hugo_Symbol'] == 'MAPKBP1'])
#
# # df = data.drop(data.select_dtypes(include=['object']), axis=1)
# X = data.iloc[:, :-1]
# y = data.iloc[:, -1]
#
# # 2. Split into training and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#
# # Drop unneeded columns
# # data.drop( index=45, axis=1, inplace=True)
# # data.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
#
# # 3. Choose the algorithm
# rfc = RandomForestClassifier(n_estimators=100, random_state=42)
#
# # 4. Train the model
# rfc.fit(X_train, y_train)
#
# # 5. Evaluate the model
# y_pred = rfc.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
# print(f"Accuracy: {accuracy:.2f}")
#
# # 6. Save the model
# joblib.dump(rfc, 'snp_model.pkl')