123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132 |
- import pandas as pd
- import numpy as np
- import target as target
- from sklearn.impute import SimpleImputer
- from sklearn.metrics import accuracy_score
- import joblib
- from sklearn.ensemble import RandomForestClassifier
- from sklearn.model_selection import train_test_split
- from sklearn.preprocessing import StandardScaler
- from sklearn.feature_selection import VarianceThreshold
- from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
- from bioconductor import maftools
# Random-forest classification of mutation records loaded from a MAF file.
#
# Pipeline: load the MAF table -> mirror it to CSV -> drop excluded rows ->
# split label/features -> impute, scale and filter the features -> train a
# RandomForestClassifier -> print evaluation metrics on a held-out test split.

MAF_PATH = 'example.maf'
CSV_PATH = 'example.csv'
# Rows whose 'all_effects' value equals this string are removed before training.
EXCLUDED_EFFECT = 'MAPKBP1'


def load_mutations(maf_path=MAF_PATH, csv_path=CSV_PATH):
    """Read a MAF file, mirror it to CSV, and return the filtered table.

    Args:
        maf_path: Tab-separated input file; lines/fragments starting with
            '#' are skipped via ``comment='#'``.
        csv_path: Destination of the comma-separated copy of the table.

    Returns:
        The table re-read from ``csv_path`` without the rows whose
        'all_effects' column equals ``EXCLUDED_EFFECT``.

    Raises:
        KeyError: If the table has no 'all_effects' column.
    """
    maf_df = pd.read_csv(maf_path, sep='\t', comment='#')
    # The CSV copy is an output of the script, so it is always (re)written.
    maf_df.to_csv(csv_path, index=False)
    data = pd.read_csv(csv_path)
    return data[data['all_effects'] != EXCLUDED_EFFECT]


def split_features_and_target(data):
    """Separate the label (last column) from the numeric feature columns.

    The label convention (last column is the target) follows the earlier
    draft of this script. Rows without a label are dropped, non-numeric
    feature columns are discarded because the scaler cannot process them,
    and labels are encoded as integer codes (sorted order) so that string
    labels work with the scikit-learn metrics.

    Args:
        data: Table with at least one feature column followed by the label.

    Returns:
        A ``(features, target)`` pair: a DataFrame of numeric features and a
        1-D integer array of class codes with one entry per feature row.

    Raises:
        ValueError: If there are fewer than two columns, or no labelled row
            or numeric feature column remains.
    """
    if data.shape[1] < 2:
        raise ValueError('expected at least one feature column and a label column')
    labelled = data[data.iloc[:, -1].notna()]
    features = labelled.iloc[:, :-1].select_dtypes(include='number')
    if features.empty:
        raise ValueError('no labelled rows or numeric feature columns to train on')
    target, _ = pd.factorize(labelled.iloc[:, -1], sort=True)
    return features, target


def main():
    """Train and evaluate the classifier, printing the evaluation metrics."""
    data = load_mutations()
    features, target = split_features_and_target(data)

    # Fill missing values with the column mean before scaling.
    matrix = SimpleImputer(strategy='mean').fit_transform(features)
    # Standardise each feature.
    matrix = StandardScaler().fit_transform(matrix)
    # Variance filter. After standardisation every non-constant feature has
    # unit variance, so this step only removes constant columns.
    matrix = VarianceThreshold(threshold=0.01).fit_transform(matrix)
    print('data.shape', matrix.shape)

    # NOTE(review): the preprocessing above is fitted on all rows before the
    # split; it does not use the labels, but fitting on the training split
    # only would be the stricter protocol.
    X_train, X_test, y_train, y_test = train_test_split(
        matrix, target, test_size=0.2, random_state=42
    )

    # Build and fit the random-forest model.
    rfc = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
    rfc.fit(X_train, y_train)

    # Predict on the held-out test set.
    y_pred = rfc.predict(X_test)

    # Binary averaging only works for exactly two classes (codes 0/1);
    # otherwise fall back to a support-weighted average over all classes.
    average = 'binary' if len(np.unique(target)) == 2 else 'weighted'
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred, average=average))
    print("Recall:", recall_score(y_test, y_pred, average=average))
    print("F1 score:", f1_score(y_test, y_pred, average=average))


if __name__ == '__main__':
    main()
- # data=data.drop(data[data['all_effects'] == 'MAPKBP1'].index)
- # data=data.drop(data[data['Hugo_Symbol'] == 'BPNT1'].index)
- # data=data.drop(data[data['Hugo_Symbol'] == 'MAPKBP1'].index)
- # data=data.drop(data[data['Hugo_Symbol'] == 'TNS1'].index)
- # data=data.drop(data[data['Hugo_Symbol'] == 'KCNH8'].index)
- # data=data.drop(data[data['Hugo_Symbol'] == 'FOXL2NB'].index)
- # data=data.drop(data[data['Hugo_Symbol'] == 'GHSR'].index)
- # data=data.drop(data[data['Hugo_Symbol'] == 'WNK1'].index)
- # data=data.drop(data[data['Hugo_Symbol'] == 'F12'].index)
- # data=data.drop(data[data['Hugo_Symbol'] == 'ASB5'].index)
- # data=data.drop(data[data['Hugo_Symbol'] == 'TBCCD1'].index)
- # data=data.drop(data[data['Hugo_Symbol'] == 'GPR39'].index)
- # data=data.drop(data[data['Hugo_Symbol'] == 'KCNF1'].index)
- # data=data.drop(data[data['Hugo_Symbol'] == 'GDF7'].index)
- # data=data.drop(data[data['Hugo_Symbol'] == 'TCF23'].index)
- # data=data.drop(data[data['SYMBOL'] == 'MAPKBP1'].index)
- #
- #
- # data.drop('all_effects', axis=1, inplace=True)
- #
- # for index, row in data.iterrows():
- # if index == 42:
- # # print(row)
- # for elm_index in data.columns:
- # print(row[elm_index])
- #
- # # print(303030, data[data['Hugo_Symbol'] == 'MAPKBP1'].index)
- # # print(303030, data[data['Hugo_Symbol'] == 'MAPKBP1'])
- #
- # # df = data.drop(data.select_dtypes(include=['object']), axis=1)
- # X = data.iloc[:, :-1]
- # y = data.iloc[:, -1]
- #
- # # 2. Split into training and test sets
- # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
- #
- #
- # # Drop columns that are not needed
- # # data.drop( index=45, axis=1, inplace=True)
- # # data.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
- #
- #
- #
- #
- #
- #
- #
- #
- # # 3. Choose the algorithm
- # rfc = RandomForestClassifier(n_estimators=100, random_state=42)
- #
- # # 4. Train the model
- # rfc.fit(X_train, y_train)
- #
- #
- #
- # # 5. Evaluate the model
- # y_pred = rfc.predict(X_test)
- # accuracy = accuracy_score(y_test, y_pred)
- # print(f"Accuracy: {accuracy:.2f}")
- #
- # # 6. Save the model
- # joblib.dump(rfc, 'snp_model.pkl')
|