# File: main.py

  1. import pandas as pd
  2. from sklearn.model_selection import train_test_split
  3. from sklearn.ensemble import RandomForestClassifier
  4. from sklearn.metrics import accuracy_score
  5. import joblib
  6. # 1. 准备数据
  7. data = pd.read_csv('snp_data.csv')
  8. X = data.iloc[:, :-1]
  9. y = data.iloc[:, -1]
  10. # 2. 划分训练集和测试集
  11. X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
  12. # 3. 选择算法
  13. rfc = RandomForestClassifier(n_estimators=100, random_state=42)
  14. # 4. 训练模型
  15. rfc.fit(X_train, y_train)
  16. # 5. 评估模型
  17. y_pred = rfc.predict(X_test)
  18. accuracy = accuracy_score(y_test, y_pred)
  19. print(f"Accuracy: {accuracy:.2f}")
  20. # 6. 模型保存
  21. joblib.dump(rfc, 'snp_model.pkl')