ag update

This commit is contained in:
2024-02-27 14:40:27 +01:00
parent 1e882df758
commit 759f4c50d0
5 changed files with 5723 additions and 75 deletions

View File

@@ -7,32 +7,47 @@ import pickle
import argparse
def main(args):
#you can put these parameters in the args but here I keep it simpler
num_boost_round = 600
SKI_AREA_TEST= 'Klausberg' ##you can put it to None
SEASON_TEST_SKIAREA = 'Kronplatz'##you can put it to None
SEASON_TEST_YEAR= 2023 ##you can put it to None
weight_type = 'sqrt'
##these are passed
reload_data = args.reload_data
use_smote = args.use_smote ##I don't like to use it, leave to False
undersampling = args.undersampling ##I don't like to use it, leave to False
retrain = args.retrain
retrain_last_model = args.retrain_last_model
test_size = args.test_size
labeled,labeled_small,to_remove = retrive_data(reload_data=args.reload_data,threshold_under_represented=0.5,path='/home/agobbi/Projects/PID/datanalytics/PID/src')
## get the data
labeled,labeled_small,to_remove = retrive_data(reload_data=reload_data,threshold_under_represented=0.5,path='/home/agobbi/Projects/PID/datanalytics/PID/src')
with open('to_remove.pkl','wb') as f:
pickle.dump(to_remove,f)
#split the data
dataset,dataset_test = split(labeled_small if args.use_small else labeled ,
SKI_AREA_TEST= 'Klausberg',
SEASON_TEST_SKIAREA = 'Kronplatz',
SEASON_TEST_YEAR= 2023,
use_smote = args.use_smote,
weight_type = 'sqrt' )
if args.retrain:
SKI_AREA_TEST= SKI_AREA_TEST,
SEASON_TEST_SKIAREA = SEASON_TEST_SKIAREA,
SEASON_TEST_YEAR= SEASON_TEST_YEAR,
use_smote = use_smote,
undersampling = undersampling,
test_size = test_size,
weight_type = weight_type )
#if you changed something you may want to retrain the model and save the best model
if retrain:
print('OPTUNA hyperparameter tuning, please wait!')
best_model,params_final = train(dataset,n_trials=args.n_trials,timeout=600,num_boost_round=600)
best_model,params_final,study = train(dataset,n_trials=args.n_trials,timeout=600,num_boost_round=num_boost_round)
feat_imp = pd.Series(best_model.get_fscore()).sort_values(ascending=False)
with open('best_params.pkl','wb') as f:
pickle.dump([params_final,feat_imp,best_model],f)
pickle.dump([params_final,feat_imp,best_model,study],f)
else:
with open('best_params.pkl','rb') as f:
params_final,feat_imp,best_model = pickle.load(f)
params_final,feat_imp,best_model,study = pickle.load(f)
@@ -41,7 +56,7 @@ def main(args):
tmp_train = xgb.DMatrix(dataset.X_train[best_model.feature_names],dataset.y_train,enable_categorical=True)
tmp_valid = xgb.DMatrix(dataset.X_valid[best_model.feature_names],dataset.y_valid,enable_categorical=True)
##get the scores
preds_class_valid = best_model.predict(tmp_valid)
preds_class_train= best_model.predict(tmp_train)
print('##################RESULT ON THE TRAIN SET#####################')
@@ -55,30 +70,33 @@ def main(args):
if args.retrain_last_model:
tot,bst_FS,FS = gain_accuracy_train(dataset,feat_imp,num_boost_round=600,params=params_final)
#now you can train the final model, for example using gain_accuracy_train for reducing the number of features used
if retrain_last_model:
tot,bst_FS,FS = gain_accuracy_train(dataset,feat_imp,num_boost_round=num_boost_round,params=params_final)
with open('best_params_and_final_model.pkl','wb') as f:
pickle.dump([tot,bst_FS,FS],f)
else:
with open('best_params_and_final_model.pkl','rb') as f:
tot,bst_FS,FS = pickle.load(f)
dtest_FS = xgb.DMatrix(dataset_test.X_test_area[bst_FS.feature_names],dataset_test.y_test_area,enable_categorical=True,)
dtest_season_FS = xgb.DMatrix(dataset_test.X_test_season[bst_FS.feature_names],dataset_test.y_test_season,enable_categorical=True,)
preds_class_test = bst_FS.predict(dtest_FS)
preds_class_test_season = bst_FS.predict(dtest_season_FS)
if dataset_test.X_test_area is not None:
dtest_FS = xgb.DMatrix(dataset_test.X_test_area[bst_FS.feature_names],dataset_test.y_test_area,enable_categorical=True,)
preds_class_test = bst_FS.predict(dtest_FS)
mcc = matthews_corrcoef(dataset_test.y_test_area,preds_class_test.argmax(1))
acc = accuracy_score(dataset_test.y_test_area,preds_class_test.argmax(1))
cm = confusion_matrix(dataset_test.y_test_area,preds_class_test.argmax(1))
print(f'RESULT ON THE TEST SKI AREA {mcc=}, {acc=}, \n{cm=}')
if dataset_test.X_test_season is not None:
dtest_season_FS = xgb.DMatrix(dataset_test.X_test_season[bst_FS.feature_names],dataset_test.y_test_season,enable_categorical=True,)
preds_class_test_season = bst_FS.predict(dtest_season_FS)
mcc = matthews_corrcoef(dataset_test.y_test_season,preds_class_test_season.argmax(1))
acc = accuracy_score(dataset_test.y_test_season,preds_class_test_season.argmax(1))
cm = confusion_matrix(dataset_test.y_test_season,preds_class_test_season.argmax(1))
mcc = matthews_corrcoef(dataset_test.y_test_area,preds_class_test.argmax(1))
acc = accuracy_score(dataset_test.y_test_area,preds_class_test.argmax(1))
cm = confusion_matrix(dataset_test.y_test_area,preds_class_test.argmax(1))
print(f'RESULT ON THE TEST SKI SEASON {mcc=}, {acc=}, {cm=}')
print(f'RESULT ON THE TEST SKI AREA {mcc=}, {acc=}, \n{cm=}')
mcc = matthews_corrcoef(dataset_test.y_test_season,preds_class_test_season.argmax(1))
acc = accuracy_score(dataset_test.y_test_season,preds_class_test_season.argmax(1))
cm = confusion_matrix(dataset_test.y_test_season,preds_class_test_season.argmax(1))
print(f'RESULT ON THE TEST SKI SEASON {mcc=}, {acc=}, {cm=}')
if __name__ == "__main__":
@@ -90,6 +108,8 @@ if __name__ == "__main__":
parser.add_argument('--reload_data', action='store_true', help='Dowload data from db')
parser.add_argument('--retrain_last_model', action='store_true', help='retrain the last model')
parser.add_argument('--n_trials', type=int,default=1000, help='number of trials per optuna')
parser.add_argument('--undersampling', action='store_true', help='Undersample the training dataset')
parser.add_argument('--test_size', type=float,default=0.33, help='Percentage of dataset to use as validation')
args = parser.parse_args()

View File

@@ -53,7 +53,7 @@ def objective(trial,dataset:Dataset,num_boost_round:int)->float:
return mcc
def train(dataset:Dataset,n_trials:int=1000,timeout:int=600,num_boost_round:int=600)->(xgb.Boost, dict):
def train(dataset:Dataset,n_trials:int=1000,timeout:int=600,num_boost_round:int=600)->(xgb.Booster, dict):
"""optuna search procedure
Args:
@@ -85,7 +85,7 @@ def train(dataset:Dataset,n_trials:int=1000,timeout:int=600,num_boost_round:int=
bst = xgb.train(params_final, dtrain,verbose_eval=False, num_boost_round=num_boost_round,
evals = [(dtrain, "train"), (dvalid, "valid")],
early_stopping_rounds=100,)
return bst,params_final
return bst,params_final, study
def gain_accuracy_train(dataset:Dataset,feat_imp:pd.DataFrame,num_boost_round:int=600,params:dict={})->(pd.DataFrame,xgb.Booster,int):

View File

@@ -7,6 +7,8 @@ import pickle
from dataclasses import dataclass
from typing import Union
import os
from imblearn.under_sampling import RandomUnderSampler,RandomOverSampler
##AUXILIARY CLASSES
@dataclass
class Dataset:
@@ -146,6 +148,8 @@ def split(labeled:pd.DataFrame,
SEASON_TEST_SKIAREA:str = 'Kronplatz',
SEASON_TEST_YEAR:int = 2023,
use_smote:bool = False,
undersampling:bool=False,
test_size:float=0.33,
weight_type:str = 'sqrt' )->(Dataset, Dataset_test):
"""Split the dataset into train, validation and test sets. From the initial dataset we remove a single skiarea (SKI_AREA_TEST),
generating the first test set. Then we select a skiarea and a starting season (SEASON_TEST_SKIAREA, SEASON_TEST_YEAR)
@@ -159,32 +163,44 @@ def split(labeled:pd.DataFrame,
SEASON_TEST_SKIAREA (str, optional): skiarea to remove from the dataset if the season is greater than or equal to SEASON_TEST_YEAR. Defaults to 'Kronplatz'.
SEASON_TEST_YEAR (int, optional): see SEASON_TEST_SKIAREA . Defaults to 2023.
use_smote (bool, optional): use oversampling for class imbalance. Defaults to False.
undersampling (bool, optional): use undersampling for class imbalance. Defaults to False.
test_size (float, optional): fraction of the dataset to use as validation. Defaults to 0.33.
weight_type (str, optional): routine for weighting the error on the samples. Defaults to 'sqrt'.
Returns:
training-validation dataset and test dataset
"""
test_area = labeled[labeled.skiarea_name==SKI_AREA_TEST]
test_area_season = labeled[(labeled.skiarea_name==SEASON_TEST_SKIAREA)&(labeled.season>=SEASON_TEST_YEAR)]
labeled_tmp = labeled.copy()
##remove from dataset the corresponding test rows
labeled_tmp = labeled[labeled.skiarea_name!=SKI_AREA_TEST]
labeled_tmp = labeled_tmp[(labeled_tmp.skiarea_name!=SEASON_TEST_SKIAREA)|(labeled_tmp.season<SEASON_TEST_YEAR) ]
if SKI_AREA_TEST is not None:
test_area = labeled[labeled.skiarea_name==SKI_AREA_TEST]
labeled_tmp = labeled_tmp[labeled_tmp.skiarea_name!=SKI_AREA_TEST]
else:
test_area = None
if SEASON_TEST_SKIAREA is not None and SEASON_TEST_YEAR is not None:
test_area_season = labeled[(labeled.skiarea_name==SEASON_TEST_SKIAREA)&(labeled.season>=SEASON_TEST_YEAR)]
labeled_tmp = labeled_tmp[(labeled_tmp.skiarea_name!=SEASON_TEST_SKIAREA)|(labeled_tmp.season<SEASON_TEST_YEAR) ]
else:
test_area_season = None
X_train, X_valid, y_train, y_valid = train_test_split( labeled_tmp.drop(columns=['india','season','skiarea_name']),
labeled_tmp.india, test_size=0.33, random_state=0,stratify=labeled_tmp.india)
if use_smote:
from imblearn.over_sampling import RandomOverSampler
labeled_tmp.india, test_size=test_size, random_state=0,stratify=labeled_tmp.india)
if use_smote:
sm = RandomOverSampler()
X_train,y_train = sm.fit_resample(X_train,y_train)
if undersampling:
sm = RandomUnderSampler(sampling_strategy='majority')
X_train,y_train = sm.fit_resample(X_train,y_train)
##compute the weights for the unbalanced dataset
@@ -202,7 +218,7 @@ def split(labeled:pd.DataFrame,
print(f'{weight_type=} not implemented please use a valid one: sqrt or sum, I will set all the weights to 0')
w.p = 1
if use_smote is False:
if use_smote is False and undersampling is False:
weight_train = pd.merge(pd.DataFrame({'class':y_train}),w).p.values
else:
w.p = 1
@@ -210,7 +226,7 @@ def split(labeled:pd.DataFrame,
dataset = Dataset(X_train, y_train, X_valid, y_valid,weight_train)
dataset_test = Dataset_test(test_area,test_area.india,test_area_season,test_area_season.india)
dataset_test = Dataset_test(test_area,test_area.india if test_area is not None else None,test_area_season,test_area_season.india if test_area_season is not None else None)
return dataset,dataset_test