Commit by ag: update

src/main.py (80 lines changed)
@@ -7,32 +7,47 @@ import pickle
+import argparse

 def main(args):
+    #you can put these parameters in the args but here I keep it simpler
+    num_boost_round = 600
+    SKI_AREA_TEST= 'Klausberg' ##you can put it to None
+    SEASON_TEST_SKIAREA = 'Kronplatz'##you can put it to None
+    SEASON_TEST_YEAR= 2023 ##you can put it to None
+    weight_type = 'sqrt'
+
+    ##these are passed
+    reload_data = args.reload_data
+    use_smote = args.use_smote ##I don't like to use it, leave to False
+    undersampling = args.undersampling ##I don't like to use it, leave to False
+    retrain = args.retrain
+    retrain_last_model = args.retrain_last_model
+    test_size = args.test_size

-    labeled,labeled_small,to_remove = retrive_data(reload_data=args.reload_data,threshold_under_represented=0.5,path='/home/agobbi/Projects/PID/datanalytics/PID/src')
+    ## get the data
+    labeled,labeled_small,to_remove = retrive_data(reload_data=reload_data,threshold_under_represented=0.5,path='/home/agobbi/Projects/PID/datanalytics/PID/src')
     with open('to_remove.pkl','wb') as f:
         pickle.dump(to_remove,f)

     #split the data
     dataset,dataset_test = split(labeled_small if args.use_small else labeled ,
-                SKI_AREA_TEST= 'Klausberg',
-                SEASON_TEST_SKIAREA = 'Kronplatz',
-                SEASON_TEST_YEAR= 2023,
-                use_smote = args.use_smote,
-                weight_type = 'sqrt' )
-    if args.retrain:
+                SKI_AREA_TEST= SKI_AREA_TEST,
+                SEASON_TEST_SKIAREA = SEASON_TEST_SKIAREA,
+                SEASON_TEST_YEAR= SEASON_TEST_YEAR,
+                use_smote = use_smote,
+                undersampling = undersampling,
+                test_size = test_size,
+                weight_type = weight_type )
+    #if you changed something you may want to retrain the model and save the best model
+    if retrain:
         print('OPTUNA hyperparameter tuning, please wait!')
-        best_model,params_final = train(dataset,n_trials=args.n_trials,timeout=600,num_boost_round=600)
+        best_model,params_final,study = train(dataset,n_trials=args.n_trials,timeout=600,num_boost_round=num_boost_round)
         feat_imp = pd.Series(best_model.get_fscore()).sort_values(ascending=False)

         with open('best_params.pkl','wb') as f:
-            pickle.dump([params_final,feat_imp,best_model],f)
+            pickle.dump([params_final,feat_imp,best_model,study],f)
     else:
         with open('best_params.pkl','rb') as f:
-            params_final,feat_imp,best_model = pickle.load(f)
+            params_final,feat_imp,best_model,study = pickle.load(f)
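Because train() now also returns the Optuna study, the tuple pickled to best_params.pkl grows from three to four elements, so a file written before this commit will no longer unpack in the else branch. A tolerant load, as an illustration rather than code from the repo, could look like:

import pickle

with open('best_params.pkl', 'rb') as f:
    loaded = pickle.load(f)

# old pickles hold 3 objects, new ones hold 4
params_final, feat_imp, best_model = loaded[:3]
study = loaded[3] if len(loaded) > 3 else None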
@@ -41,7 +56,7 @@ def main(args):
     tmp_train = xgb.DMatrix(dataset.X_train[best_model.feature_names],dataset.y_train,enable_categorical=True)
     tmp_valid = xgb.DMatrix(dataset.X_valid[best_model.feature_names],dataset.y_valid,enable_categorical=True)

     ##get the scores
     preds_class_valid = best_model.predict(tmp_valid)
     preds_class_train= best_model.predict(tmp_train)
     print('##################RESULT ON THE TRAIN SET#####################')
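Both matrices are built with enable_categorical=True, which assumes the categorical feature columns carry the pandas category dtype. A minimal self-contained illustration, with hypothetical column names:

import pandas as pd
import xgboost as xgb

X = pd.DataFrame({'slope_length': [120.0, 300.0],
                  'skiarea_name': pd.Categorical(['A', 'B'])})
# without the category dtype on 'skiarea_name', enable_categorical=True raises an error
dtrain = xgb.DMatrix(X, label=[0, 1], enable_categorical=True)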
@@ -55,30 +70,33 @@ def main(args):

-    if args.retrain_last_model:
-        tot,bst_FS,FS = gain_accuracy_train(dataset,feat_imp,num_boost_round=600,params=params_final)
+    #now you can train the final model, for example using gain_accuracy_train for reducing the number of features used
+    if retrain_last_model:
+        tot,bst_FS,FS = gain_accuracy_train(dataset,feat_imp,num_boost_round=num_boost_round,params=params_final)
         with open('best_params_and_final_model.pkl','wb') as f:
             pickle.dump([tot,bst_FS,FS],f)
     else:
         with open('best_params_and_final_model.pkl','rb') as f:
             tot,bst_FS,FS = pickle.load(f)

-    dtest_FS = xgb.DMatrix(dataset_test.X_test_area[bst_FS.feature_names],dataset_test.y_test_area,enable_categorical=True,)
-    dtest_season_FS = xgb.DMatrix(dataset_test.X_test_season[bst_FS.feature_names],dataset_test.y_test_season,enable_categorical=True,)
-    preds_class_test = bst_FS.predict(dtest_FS)
-    preds_class_test_season = bst_FS.predict(dtest_season_FS)
-    mcc = matthews_corrcoef(dataset_test.y_test_area,preds_class_test.argmax(1))
-    acc = accuracy_score(dataset_test.y_test_area,preds_class_test.argmax(1))
-    cm = confusion_matrix(dataset_test.y_test_area,preds_class_test.argmax(1))
-    print(f'RESULT ON THE TEST SKI AREA {mcc=}, {acc=}, \n{cm=}')
-    mcc = matthews_corrcoef(dataset_test.y_test_season,preds_class_test_season.argmax(1))
-    acc = accuracy_score(dataset_test.y_test_season,preds_class_test_season.argmax(1))
-    cm = confusion_matrix(dataset_test.y_test_season,preds_class_test_season.argmax(1))
-    print(f'RESULT ON THE TEST SKI SEASON {mcc=}, {acc=}, {cm=}')
+    if dataset_test.X_test_area is not None:
+        dtest_FS = xgb.DMatrix(dataset_test.X_test_area[bst_FS.feature_names],dataset_test.y_test_area,enable_categorical=True,)
+        preds_class_test = bst_FS.predict(dtest_FS)
+        mcc = matthews_corrcoef(dataset_test.y_test_area,preds_class_test.argmax(1))
+        acc = accuracy_score(dataset_test.y_test_area,preds_class_test.argmax(1))
+        cm = confusion_matrix(dataset_test.y_test_area,preds_class_test.argmax(1))
+        print(f'RESULT ON THE TEST SKI AREA {mcc=}, {acc=}, \n{cm=}')
+
+    if dataset_test.X_test_season is not None:
+        dtest_season_FS = xgb.DMatrix(dataset_test.X_test_season[bst_FS.feature_names],dataset_test.y_test_season,enable_categorical=True,)
+        preds_class_test_season = bst_FS.predict(dtest_season_FS)
+        mcc = matthews_corrcoef(dataset_test.y_test_season,preds_class_test_season.argmax(1))
+        acc = accuracy_score(dataset_test.y_test_season,preds_class_test_season.argmax(1))
+        cm = confusion_matrix(dataset_test.y_test_season,preds_class_test_season.argmax(1))
+        print(f'RESULT ON THE TEST SKI SEASON {mcc=}, {acc=}, {cm=}')
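The .argmax(1) calls imply that bst_FS.predict() returns an (n_samples, n_classes) probability matrix, i.e. a softprob-style multi-class objective is assumed; the hard labels fed to the metrics are then:

# probability matrix of shape (n_samples, n_classes), assumed softprob output
preds_class_test = bst_FS.predict(dtest_FS)

# collapse to hard class labels before computing MCC, accuracy and the confusion matrix
labels = preds_class_test.argmax(1)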
@@ -90,6 +108,8 @@ if __name__ == "__main__":
     parser.add_argument('--reload_data', action='store_true', help='Dowload data from db')
     parser.add_argument('--retrain_last_model', action='store_true', help='retrain the last model')
     parser.add_argument('--n_trials', type=int,default=1000, help='number of trials per optuna')
+    parser.add_argument('--undersampling', action='store_true', help='Undersample the training dataset')
+    parser.add_argument('--test_size', type=float,default=0.33, help='Percentage of dataset to use as validation')

     args = parser.parse_args()
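With the two new flags the pipeline can be driven entirely from the command line. A plausible invocation, noting that --retrain, --use_smote and --use_small are inferred from the args.* accesses in main() and are not shown in this hunk:

python src/main.py --reload_data --retrain --undersampling --n_trials 200 --test_size 0.25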
@@ -53,7 +53,7 @@ def objective(trial,dataset:Dataset,num_boost_round:int)->float:
     return mcc


-def train(dataset:Dataset,n_trials:int=1000,timeout:int=600,num_boost_round:int=600)->(xgb.Boost, dict):
+def train(dataset:Dataset,n_trials:int=1000,timeout:int=600,num_boost_round:int=600)->(xgb.Booster, dict):
     """optuna search procedure

     Args:
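The fix from xgb.Boost to xgb.Booster makes the annotation point at a real class, but since this commit also makes train() return the study (see the next hunk), the annotation is now out of date, and a bare tuple of types is not a valid type hint in any case. A sketch of a matching signature, assuming the three-value return shown below:

from typing import Tuple
import optuna
import xgboost as xgb

def train(dataset: Dataset, n_trials: int = 1000, timeout: int = 600,
          num_boost_round: int = 600) -> Tuple[xgb.Booster, dict, optuna.Study]:
    # Dataset is the repo's own dataclass from src/utils.py
    ...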
@@ -85,7 +85,7 @@ def train(dataset:Dataset,n_trials:int=1000,timeout:int=600,num_boost_round:int=
     bst = xgb.train(params_final, dtrain,verbose_eval=False, num_boost_round=num_boost_round,
                     evals = [(dtrain, "train"), (dvalid, "valid")],
                     early_stopping_rounds=100,)
-    return bst,params_final
+    return bst,params_final, study


 def gain_accuracy_train(dataset:Dataset,feat_imp:pd.DataFrame,num_boost_round:int=600,params:dict={})->(pd.DataFrame,xgb.Booster,int):
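Returning the study hands the whole search history to the caller instead of just the winning parameters. A minimal sketch of how such a study is typically created and what the extra return value buys; the study construction here is an assumption, not the repo's exact code:

import optuna

# assumed shape of the search inside train(): maximise the MCC that
# objective() returns, bounded by n_trials and a timeout in seconds
study = optuna.create_study(direction='maximize')
study.optimize(lambda trial: objective(trial, dataset, num_boost_round),
               n_trials=n_trials, timeout=timeout)

# with the study returned, callers can inspect the search afterwards:
print(study.best_value)            # best MCC reached
print(study.best_trial.params)     # winning hyperparameters
df = study.trials_dataframe()      # full per-trial log for analysis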
src/utils.py (42 lines changed)
@@ -7,6 +7,8 @@ import pickle
 from dataclasses import dataclass
 from typing import Union
 import os
+from imblearn.under_sampling import RandomUnderSampler,RandomOverSampler
+
 ##AUXILIARY CLASSES
 @dataclass
 class Dataset:
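One caveat with the new import: as the line removed further down in this file shows, RandomOverSampler is exported from imblearn.over_sampling, not imblearn.under_sampling, so importing both names from under_sampling would most likely raise an ImportError. The import probably needs to be split:

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler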
@@ -146,6 +148,8 @@ def split(labeled:pd.DataFrame,
           SEASON_TEST_SKIAREA:str = 'Kronplatz',
           SEASON_TEST_YEAR:int = 2023,
           use_smote:bool = False,
+          undersampling:bool=False,
+          test_size:float=0.33,
           weight_type:str = 'sqrt' )->(Dataset, Dataset_test):
     """Split the dataset into train,validation test. From the initial dataset we remove a single skiarea (SKI_AREA_TEST)
     generating the first test set. Then we select a skieare and a starting season (SEASON_TEST_SKIAREA,SEASON_TEST_YEAR)
@@ -159,32 +163,44 @@ def split(labeled:pd.DataFrame,
         SEASON_TEST_SKIAREA (str, optional): skiarea to remove from the dataset if the season is greater than SEASON_TEST_YEAR. Defaults to 'Kronplatz'.
         SEASON_TEST_YEAR (int, optional): see SEASON_TEST_SKIAREA . Defaults to 2023.
         use_smote (bool, optional): use oversampling for class umbalance. Defaults to False.
+        undersampling (bool, optional): use undersampling for class umbalance. Defaults to False.
+        test_size (float, optional): percentage of dataset to use as validation. Defaults to 0.33.
         weight_type (str, optional): routine for weighting the error on the samples. Defaults to 'sqrt'.

     Returns:
         trainin-validation dataset and test dataset
     """

-    test_area = labeled[labeled.skiarea_name==SKI_AREA_TEST]
-    test_area_season = labeled[(labeled.skiarea_name==SEASON_TEST_SKIAREA)&(labeled.season>=SEASON_TEST_YEAR)]
-
+    labeled_tmp = labeled.copy()
     ##remove from dataset the corresponding test rows
-    labeled_tmp = labeled[labeled.skiarea_name!=SKI_AREA_TEST]
-    labeled_tmp = labeled_tmp[(labeled_tmp.skiarea_name!=SEASON_TEST_SKIAREA)|(labeled_tmp.season<SEASON_TEST_YEAR) ]
+    if SKI_AREA_TEST is not None:
+        test_area = labeled[labeled.skiarea_name==SKI_AREA_TEST]
+        labeled_tmp = labeled_tmp[labeled_tmp.skiarea_name!=SKI_AREA_TEST]
+    else:
+        test_area = None
+
+    if SEASON_TEST_SKIAREA is not None and SEASON_TEST_YEAR is not None:
+        test_area_season = labeled[(labeled.skiarea_name==SEASON_TEST_SKIAREA)&(labeled.season>=SEASON_TEST_YEAR)]
+        labeled_tmp = labeled_tmp[(labeled_tmp.skiarea_name!=SEASON_TEST_SKIAREA)|(labeled_tmp.season<SEASON_TEST_YEAR) ]
+    else:
+        test_area_season = None

     X_train, X_valid, y_train, y_valid = train_test_split( labeled_tmp.drop(columns=['india','season','skiarea_name']),
-                    labeled_tmp.india, test_size=0.33, random_state=0,stratify=labeled_tmp.india)
+                    labeled_tmp.india, test_size=test_size, random_state=0,stratify=labeled_tmp.india)

     if use_smote:
-        from imblearn.over_sampling import RandomOverSampler
         sm = RandomOverSampler()
         X_train,y_train = sm.fit_resample(X_train,y_train)
+    if undersampling:
+        sm = RandomUnderSampler(sampling_strategy='majority')
+        X_train,y_train = sm.fit_resample(X_train,y_train)

     ##computed the weights for unbalanced dataset
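Since split() now accepts None for either hold-out, a caller can disable one of the two test sets. A minimal usage sketch based on the signature above, where labeled is assumed to be the frame returned by retrive_data():

# keep only the season hold-out; the ski-area test set is disabled
dataset, dataset_test = split(labeled,
                              SKI_AREA_TEST=None,
                              SEASON_TEST_SKIAREA='Kronplatz',
                              SEASON_TEST_YEAR=2023,
                              undersampling=True,
                              test_size=0.25,
                              weight_type='sqrt')

# dataset_test.X_test_area is None in this case, so downstream code must
# guard on it, exactly as the new checks in main() do
assert dataset_test.X_test_area is None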
@@ -202,7 +218,7 @@ def split(labeled:pd.DataFrame,
         print(f'{weight_type=} not implemented please use a valid one: sqrt or sum, I will set all the weights to 0')
         w.p = 1

-    if use_smote is False:
+    if use_smote is False and undersampling is False:
         weight_train = pd.merge(pd.DataFrame({'class':y_train}),w).p.values
     else:
         w.p = 1
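The weight table w itself is built outside this hunk; under the default weight_type='sqrt' a plausible construction, consistent with the merge above, is a square-root inverse-frequency table. This is an illustration, not the repo's exact formula:

import numpy as np
import pandas as pd

# one row per class: rarer classes get a larger weight p
counts = y_train.value_counts()
w = pd.DataFrame({'class': counts.index,
                  'p': np.sqrt(counts.sum() / counts.values)})

# per-sample weights, aligned to y_train via the same merge used in split()
weight_train = pd.merge(pd.DataFrame({'class': y_train}), w).p.values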
@@ -210,7 +226,7 @@ def split(labeled:pd.DataFrame,
     dataset = Dataset(X_train, y_train, X_valid, y_valid,weight_train)
-    dataset_test = Dataset_test(test_area,test_area.india,test_area_season,test_area_season.india)
+    dataset_test = Dataset_test(test_area,test_area.india if test_area is not None else None,test_area_season,test_area_season.india if test_area_season is not None else None)

     return dataset,dataset_test