"""Train a random-forest classifier on ``snp_data.csv`` and save it to disk.

The script reads the CSV, uses every column except the last as features and
the last column as the label, holds out 20% of the rows as a test set, fits a
100-tree random forest, prints the test-set accuracy, and writes the fitted
model to ``snp_model.pkl``.
"""

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# 1. Prepare the data: all columns but the last are features, the last
#    column is the label.
data = pd.read_csv('snp_data.csv')
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# 2. Split into training and test sets (80% / 20%). The fixed random_state
#    makes the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 3. Choose the algorithm: a random forest with 100 trees and a fixed seed.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)

# 4. Train the model on the training split only.
rfc.fit(X_train, y_train)

# 5. Evaluate the model on the held-out test split.
y_pred = rfc.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# 6. Save the fitted model so it can be reloaded later with joblib.load.
joblib.dump(rfc, 'snp_model.pkl')