Table of Contents
⚙️ Practice
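The snippets below reuse objects defined earlier in the notebook: the train/validation split (X_train, y_train, X_val, y_val) and a cross-validation splitter named kfold. A minimal sketch of the shared imports and an assumed splitter (the actual split settings are not shown in this note):
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score
# assumed splitter settings; adjust to match the original notebook
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)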
📝 Validation with Random Forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_jobs = -1, random_state = 1234)
scores = cross_val_score(rf,
X_train,y_train,
cv = kfold,
scoring = "accuracy",
n_jobs = -1)
print(scores)
for iter_count, acc in enumerate(scores):
    print("RF fold {0} validation ACC: {1:.4f}".format(iter_count, acc))
print("RF model Accuracy Mean: ", scores.mean())
print("RF model Accuracy std: ", scores.std())
- Hyper Parameter Tuning (RGS)
# 1. Specify the model to use
rf = RandomForestClassifier(n_jobs = -1, random_state = 1234)
# 2. Parameter combinations to explore
parameters = {
"n_estimators" : [10, 30, 50, 70, 100, 200, 300, 500, 1000, 2000],
"max_features" : [3, 4, 5, 6, 7],
"max_depth" : [2, 3, 4, 5, 6, 7, 8, 9, 10, 20],
"min_samples_leaf" : [1, 3, 5, 7, 9]
}
# 3. Number of sampling iterations
n_iter = 20
# 4. Build the randomized search (RGS) with the model
rf_kfold_rgs = RandomizedSearchCV(rf,
param_distributions = parameters,
cv = kfold,
scoring = "accuracy",
n_jobs = -1,
random_state = 1234,
n_iter = n_iter)
# 5. Fit on the actual training data
rf_kfold_rgs.fit(X_train, y_train)
# 6. Predict
# (aside, unrelated to the prediction step) draw 10 random integers between 10 and 2000
np.random.randint(low = 10, high = 2000, size = 10)
rf_kfold_rgs_best = rf_kfold_rgs.best_estimator_
rf_kfold_rgs_ypred = rf_kfold_rgs_best.predict(X_val)
rf_kfold_rgs_acc = accuracy_score(y_val, rf_kfold_rgs_ypred)
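Besides best_estimator_, the fitted search object also records the best sampled parameter combination and its cross-validated score; a small sketch for inspecting them alongside the hold-out accuracy:
print(rf_kfold_rgs.best_params_)   # parameter combination with the highest CV accuracy
print(rf_kfold_rgs.best_score_)    # mean cross-validated accuracy of that combination
print(rf_kfold_rgs_acc)            # accuracy on the held-out validation set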
- Hyper Parameter Tuning (GS)
# 1. Specify the model to use
rf = RandomForestClassifier(n_jobs = -1, random_state = 1234)
# 2. Parameter combinations to explore
parameters = {
"n_estimators" : [1000, 1100, 900, 500, 1500],
"max_features" : [5, 6, 7],
"max_depth" : [9, 10, 11, 15],
"min_samples_leaf" : [1, 2, 3, 4]
}
# 3. Build the grid search (GS) with the model
rf_kfold_gs = GridSearchCV(rf,
param_grid = parameters,
cv = kfold,
scoring = "accuracy",
n_jobs = -1)
# 4. Fit on the actual training data
rf_kfold_gs.fit(X_train, y_train)
# 5. Predict
rf_kfold_gs_best = rf_kfold_gs.best_estimator_
rf_kfold_gs_ypred = rf_kfold_gs_best.predict(X_val)
rf_kfold_gs_acc = accuracy_score(y_val, rf_kfold_gs_ypred)
rf_kfold_gs_best.predict(X_val)
rf_kfold_gs_best.predict_proba(X_val)
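predict returns hard class labels, while predict_proba returns one probability column per class. As a quick sanity check (a minimal sketch), taking the argmax over the probability columns should reproduce the hard predictions:
proba = rf_kfold_gs_best.predict_proba(X_val)                 # shape (n_samples, n_classes)
labels = rf_kfold_gs_best.classes_[np.argmax(proba, axis=1)]  # map the argmax index back to class labels
print((labels == rf_kfold_gs_best.predict(X_val)).all())      # expected: True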
📝 Validation with XGBoost
from xgboost import XGBClassifier
xgbc = XGBClassifier(n_jobs = -1, random_state = 1234)
scores = cross_val_score(xgbc,
X_train,
y_train,
cv = kfold,
scoring = "accuracy",
n_jobs = -1)
print(scores)
for iter_count, acc in enumerate(scores):
    print("XGBoost fold {0} validation ACC: {1:.4f}".format(iter_count, acc))
print("XGBoost model Accuracy Mean: ", scores.mean())
print("XGBoost model Accuracy std: ", scores.std())
- Hyper Parameter Tuning (RGS)
# 1. Specify the model to use
xgbc = XGBClassifier(n_jobs = -1, random_state = 1234)
# 2. Parameter combinations to explore
parameters = {
"n_estimators" : [10, 30, 50, 100, 300, 500],
"learning_rate" : [0.01, 0.1, 0.2, 0.3, 0.5],
"max_depth" : [2, 3, 4, 5, 6, 10],
"gamma" : [0, 0.1, 0.2, 0.3],
"subsample" : [0.3, 0.4, 0.5, 0.6, 0.9],
"colsample_bytree" : [0.3, 0.4, 0.5, 0.6, 0.9],
"reg_alpha" : [0, 0.01, 0.1, 0.3],
"reg_lambda" : [0, 0.01, 0.1, 0.3]
}
# 3. Number of sampling iterations
n_iter = 80
# 4. Build the randomized search (RGS) with the model
xgb_kfold_rgs = RandomizedSearchCV(xgbc,
param_distributions = parameters,
cv = kfold,
scoring = "accuracy",
n_jobs = -1,
random_state = 1234,
n_iter = n_iter)
# 5. Fit on the actual training data
xgb_kfold_rgs.fit(X_train, y_train)
# 6. Predict
xgb_kfold_rgs_best = xgb_kfold_rgs.best_estimator_
xgb_kfold_rgs_ypred = xgb_kfold_rgs_best.predict(X_val)
xgb_kfold_rgs_acc = accuracy_score(y_val, xgb_kfold_rgs_ypred)
- Hyper Parameter Tuning (GS)
# 1. Specify the model to use
xgbc = XGBClassifier(n_jobs = -1, random_state = 1234)
# 2. Parameter combinations to explore
parameters = {
"n_estimators" : [40, 45, 50, 55, 100],
"learning_rage" : [0.01, 0.05, 0.1, 0.15],
"max_depth" : [4, 5, 6],
"gamma" : [0, 0.1, 0.05, 0.15],
"subsample" : [0.4, 0.3, 0.5],
"colsample_bytree" : [0.4, 0.5, 0.6],
"reg_alpha" : [0.01, 0, 0.02],
"reg_lambda" : [0.1, 0, 0.05, 0.15]
}
# 3. Build the grid search (GS) with the model
xgb_kfold_gs = GridSearchCV(xgbc,
param_grid = parameters,
cv = kfold,
scoring = "accuracy",
n_jobs = -1)
# 4. Fit on the actual training data
xgb_kfold_gs.fit(X_train, y_train)
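The voting ensembles below reference xgb_kfold_gs_best, so this grid search presumably continued with the same prediction step used for the other models; a sketch mirroring the Random Forest GS block:
# 5. Predict
xgb_kfold_gs_best = xgb_kfold_gs.best_estimator_
xgb_kfold_gs_ypred = xgb_kfold_gs_best.predict(X_val)
xgb_kfold_gs_acc = accuracy_score(y_val, xgb_kfold_gs_ypred)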
📝 Validation with LightGBM
from lightgbm import LGBMClassifier
lgbc = LGBMClassifier(n_jobs = -1, random_state = 1234)
scores = cross_val_score(lgbc,
X_train,
y_train,
cv = kfold,
scoring = "accuracy",
n_jobs = -1)
print(scores)
for iter_count, acc in enumerate(scores):
    print("LGBM fold {0} validation ACC: {1:.4f}".format(iter_count, acc))
print("LGBM model Accuracy Mean: ", scores.mean())
print("LGBM model Accuracy std: ", scores.std())
- Hyper Parameter Tuning (RGS)
# 1. Specify the model to use
lgbc = LGBMClassifier(n_jobs=-1, random_state=1234)
# 2. Parameter combinations to explore
parameters = {
"n_estimators" : [10, 30, 50, 100, 300, 500],
"learning_rate" : [0.01, 0.03, 0.05, 0.07, 0.1, 0.3],
"max_depth" : [-1, 2, 3, 5, 7, 10],
"min_split_gain" : [0, 0.1, 0.2],
"subsample" : [0.3, 0.4, 0.5, 0.6],
"colsample_bytree" : [0.3, 0.4, 0.5, 0.6],
"reg_alpha" : [0, 0.01, 0.1, 0.3],
"reg_lambda" : [0, 0.01, 0.1, 0.3]
}
# 3. Number of sampling iterations
n_iter = 60
# 4. Build the randomized search (RGS) with the model
lgbm_kfold_rgs = RandomizedSearchCV(lgbc,
param_distributions = parameters,
cv = kfold,
n_jobs = -1,
scoring = "accuracy",
random_state = 1234,
n_iter = n_iter)
# 5. Fit on the actual training data
lgbm_kfold_rgs.fit(X_train, y_train)
# 6. Predict
lgbm_kfold_rgs_best = lgbm_kfold_rgs.best_estimator_
lgbm_kfold_rgs_ypred = lgbm_kfold_rgs_best.predict(X_val)
lgbm_kfold_rgs_acc = accuracy_score(y_val, lgbm_kfold_rgs_ypred)
📝 Hard Voting
from sklearn.ensemble import VotingClassifier
hard_clf = VotingClassifier(
estimators= [
("RF", rf_kfold_gs_best),
("XGB", xgb_kfold_gs_best),
("LGBM", lgbm_kfold_rgs_best)
],
voting="hard"
)
hard_clf.fit(X_train, y_train)
hard_clf_ypred = hard_clf.predict(X_val)
hard_clf_acc = accuracy_score(y_val, hard_clf_ypred)
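To see whether the ensemble actually helps, it is worth comparing it against the individual tuned models; a minimal sketch, assuming the accuracy variables computed above (including the xgb_kfold_gs_acc sketched earlier):
print("RF (GS) ACC    :", rf_kfold_gs_acc)
print("XGB (GS) ACC   :", xgb_kfold_gs_acc)
print("LGBM (RGS) ACC :", lgbm_kfold_rgs_acc)
print("Hard Voting ACC:", hard_clf_acc)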
📝 Soft Voting
# weights: LightGBM 2, XGB 1
soft_clf = VotingClassifier(
estimators= [
("XGB", xgb_kfold_gs_best),
("LGBM", lgbm_kfold_rgs_best)
],
voting="soft",
weights=[1, 2]
)
soft_clf.fit(X_train, y_train)
soft_clf_ypred = soft_clf.predict(X_val)
soft_clf_acc = accuracy_score(y_val, soft_clf_ypred)
# weights: LightGBM 2, RF 1 (new variable names so the first soft-voting result is not overwritten)
soft_clf2 = VotingClassifier(
estimators= [
("RF", rf_kfold_gs_best),
("LGBM", lgbm_kfold_rgs_best)
],
voting="soft",
weights=[1, 2]
)
soft_clf2.fit(X_train, y_train)
soft_clf2_ypred = soft_clf2.predict(X_val)
soft_clf2_acc = accuracy_score(y_val, soft_clf2_ypred)
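A quick comparison of the two soft-voting configurations (a minimal sketch, assuming the variables above):
print("Soft Voting (XGB 1, LGBM 2) ACC:", soft_clf_acc)
print("Soft Voting (RF 1, LGBM 2) ACC :", soft_clf2_acc)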
📝 Saving a Model (joblib)
import joblib
# Save the model
joblib.dump(rf_kfold_gs_best, "rf_kfold_gs_best.pkl")
# Load the saved model
model_path = "rf_kfold_gs_best.pkl"
rf_kfold_backup_model = joblib.load(model_path)
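A simple way to confirm the save/load round trip worked is to check that the reloaded model reproduces the original predictions (a minimal sketch):
print(np.array_equal(rf_kfold_backup_model.predict(X_val),
                     rf_kfold_gs_best.predict(X_val)))   # expected: True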
📝 Saving a Model (pickle)
# Using pickle (the standard-library serializer) can help prevent conflicts that may come up when sharing models with collaborators
import pickle
# Save the model
filename = "my_model.pkl"
with open(filename, "wb") as f:
    pickle.dump(rf_kfold_gs_best, f)
# Load the saved model
with open(filename, "rb") as f:
    new_model = pickle.load(f)