ag update

2024-02-27 14:40:27 +01:00
parent 1e882df758
commit 759f4c50d0
5 changed files with 5723 additions and 75 deletions


@@ -22,6 +22,18 @@ Features are ranked by importance using their score (`best_model.get_fscore()`)
This can also be used to say: to reach x% accuracy I need to use at least these variables. One use that comes to mind: 'these are the mandatory fields of the form to fill in'.
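A minimal sketch of that idea (the `evaluate` callback and the accuracy threshold are hypothetical, not part of the repo; `gain_accuracy_train` in the code is the real version of this):

```python
import pandas as pd

# Sketch: rank features by importance, then keep the smallest prefix
# that reaches a target accuracy. evaluate(features) is assumed to
# return the accuracy of a model retrained on just those features.
feat_imp = pd.Series(best_model.get_fscore()).sort_values(ascending=False)

def smallest_feature_set(feat_imp, evaluate, target_acc=0.80):
    features = []
    for name in feat_imp.index:
        features.append(name)
        if evaluate(features) >= target_acc:
            return features   # e.g. the 'mandatory form fields'
    return features           # target never reached: all features needed
```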
## Final model
Once all the tests are done, we can also reintegrate the test sets into the training data:
```
SKI_AREA_TEST = None
SEASON_TEST_SKIAREA = None
SEASON_TEST_YEAR = None
```
and also increase the number of points in the training set:
```
test_size = 0.2  # 80% train, 20% validation
```
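Putting it together, the final-model configuration would flow into `split` roughly like this (a sketch based on the `split` signature in the diff below; with the test parameters set to `None`, the test sets come back as `None` and only the train/validation split remains):

```python
# Sketch: with the test parameters set to None, split() keeps every
# skiarea/season for training and only carves out the validation set.
dataset, dataset_test = split(labeled,
                              SKI_AREA_TEST=None,
                              SEASON_TEST_SKIAREA=None,
                              SEASON_TEST_YEAR=None,
                              use_smote=False,
                              undersampling=False,
                              test_size=0.2,      # 80% train, 20% validation
                              weight_type='sqrt')
assert dataset_test.X_test_area is None and dataset_test.X_test_season is None
```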
## Notebooks
There are a few notebooks. TRAIN contains more or less what `main.py` does, or rather an earlier, uncleaned version of it with some extra checks etc.; I left it in just in case. `Variable_exploration` contains the inference part on a new dataset using `prepare_new_data` (there is also a comparison between the distributions, but without labels I am not sure what else to add). There is also an explainability part. It is quite hard to interpret with categorical variables, but it does tell you, in some way, why a given sample was classified the way it was. In the images below you can see the sample under consideration: its original class is 2 and it is correctly classified (look at the SHAP values, or at the predictions, which put it in class 2 with 86% probability). The two plots underneath show which features push the probability value up or down (not exactly a probability, but call it a confidence if you prefer). All the red arrows pushing to the right read like this (take the second row): the diagnosis, the location and the destination are the features that most suggest the sample belongs to the second class. Indeed, helicopter, hospital_emergency_room and dislocation do suggest it is not a minor incident. It does not always work this well; I picked a clear example to explain it, then it is up to you whether and how to use it.
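For reference, a minimal sketch of how those per-sample contributions can be obtained straight from XGBoost, without the shap library (assuming the trained `bst_FS` booster and the `dtest_FS` DMatrix as in `main.py`; `pred_contribs=True` is XGBoost's native SHAP output, one value per feature plus a bias term):

```python
import numpy as np
import pandas as pd

# Per-sample SHAP contributions from the trained booster.
# For a multiclass model the result has shape
# (n_samples, n_classes, n_features + 1); the last column is the bias.
contribs = bst_FS.predict(dtest_FS, pred_contribs=True)

i = 0                                          # the sample discussed above
pred_class = bst_FS.predict(dtest_FS)[i].argmax()
per_feature = pd.Series(contribs[i, pred_class, :-1],
                        index=bst_FS.feature_names)
# Positive values push towards this class, negative ones push away.
print(per_feature.sort_values(key=np.abs, ascending=False).head(10))
```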

File diff suppressed because one or more lines are too long


```diff
@@ -7,32 +7,47 @@ import pickle
 import argparse
 
 def main(args):
+    #you can put these parameters in the args but here I keep it simpler
+    num_boost_round = 600
+    SKI_AREA_TEST = 'Klausberg'         ##you can set it to None
+    SEASON_TEST_SKIAREA = 'Kronplatz'   ##you can set it to None
+    SEASON_TEST_YEAR = 2023             ##you can set it to None
+    weight_type = 'sqrt'
+    ##these are passed
+    reload_data = args.reload_data
+    use_smote = args.use_smote          ##I don't like to use it, leave it False
+    undersampling = args.undersampling  ##I don't like to use it, leave it False
+    retrain = args.retrain
+    retrain_last_model = args.retrain_last_model
+    test_size = args.test_size
-    labeled,labeled_small,to_remove = retrive_data(reload_data=args.reload_data,threshold_under_represented=0.5,path='/home/agobbi/Projects/PID/datanalytics/PID/src')
+    ## get the data
+    labeled,labeled_small,to_remove = retrive_data(reload_data=reload_data,threshold_under_represented=0.5,path='/home/agobbi/Projects/PID/datanalytics/PID/src')
     with open('to_remove.pkl','wb') as f:
         pickle.dump(to_remove,f)
     #split the data
     dataset,dataset_test = split(labeled_small if args.use_small else labeled,
-                                 SKI_AREA_TEST = 'Klausberg',
-                                 SEASON_TEST_SKIAREA = 'Kronplatz',
-                                 SEASON_TEST_YEAR = 2023,
-                                 use_smote = args.use_smote,
-                                 weight_type = 'sqrt')
-    if args.retrain:
+                                 SKI_AREA_TEST = SKI_AREA_TEST,
+                                 SEASON_TEST_SKIAREA = SEASON_TEST_SKIAREA,
+                                 SEASON_TEST_YEAR = SEASON_TEST_YEAR,
+                                 use_smote = use_smote,
+                                 undersampling = undersampling,
+                                 test_size = test_size,
+                                 weight_type = weight_type)
+    #if you changed something you may want to retrain the model and save the best one
+    if retrain:
         print('OPTUNA hyperparameter tuning, please wait!')
-        best_model,params_final = train(dataset,n_trials=args.n_trials,timeout=600,num_boost_round=600)
+        best_model,params_final,study = train(dataset,n_trials=args.n_trials,timeout=600,num_boost_round=num_boost_round)
         feat_imp = pd.Series(best_model.get_fscore()).sort_values(ascending=False)
         with open('best_params.pkl','wb') as f:
-            pickle.dump([params_final,feat_imp,best_model],f)
+            pickle.dump([params_final,feat_imp,best_model,study],f)
     else:
         with open('best_params.pkl','rb') as f:
-            params_final,feat_imp,best_model = pickle.load(f)
+            params_final,feat_imp,best_model,study = pickle.load(f)
@@ -41,7 +56,7 @@ def main(args):
     tmp_train = xgb.DMatrix(dataset.X_train[best_model.feature_names],dataset.y_train,enable_categorical=True)
     tmp_valid = xgb.DMatrix(dataset.X_valid[best_model.feature_names],dataset.y_valid,enable_categorical=True)
+    ##get the scores
     preds_class_valid = best_model.predict(tmp_valid)
     preds_class_train = best_model.predict(tmp_train)
     print('##################RESULT ON THE TRAIN SET#####################')
@@ -55,30 +70,33 @@ def main(args):
-    if args.retrain_last_model:
-        tot,bst_FS,FS = gain_accuracy_train(dataset,feat_imp,num_boost_round=600,params=params_final)
+    #now you can train the final model, for example using gain_accuracy_train to reduce the number of features used
+    if retrain_last_model:
+        tot,bst_FS,FS = gain_accuracy_train(dataset,feat_imp,num_boost_round=num_boost_round,params=params_final)
         with open('best_params_and_final_model.pkl','wb') as f:
             pickle.dump([tot,bst_FS,FS],f)
     else:
         with open('best_params_and_final_model.pkl','rb') as f:
             tot,bst_FS,FS = pickle.load(f)
-    dtest_FS = xgb.DMatrix(dataset_test.X_test_area[bst_FS.feature_names],dataset_test.y_test_area,enable_categorical=True,)
-    dtest_season_FS = xgb.DMatrix(dataset_test.X_test_season[bst_FS.feature_names],dataset_test.y_test_season,enable_categorical=True,)
-    preds_class_test = bst_FS.predict(dtest_FS)
-    preds_class_test_season = bst_FS.predict(dtest_season_FS)
-    mcc = matthews_corrcoef(dataset_test.y_test_area,preds_class_test.argmax(1))
-    acc = accuracy_score(dataset_test.y_test_area,preds_class_test.argmax(1))
-    cm = confusion_matrix(dataset_test.y_test_area,preds_class_test.argmax(1))
-    print(f'RESULT ON THE TEST SKI AREA {mcc=}, {acc=}, \n{cm=}')
-    mcc = matthews_corrcoef(dataset_test.y_test_season,preds_class_test_season.argmax(1))
-    acc = accuracy_score(dataset_test.y_test_season,preds_class_test_season.argmax(1))
-    cm = confusion_matrix(dataset_test.y_test_season,preds_class_test_season.argmax(1))
-    print(f'RESULT ON THE TEST SKI SEASON {mcc=}, {acc=}, {cm=}')
+    if dataset_test.X_test_area is not None:
+        dtest_FS = xgb.DMatrix(dataset_test.X_test_area[bst_FS.feature_names],dataset_test.y_test_area,enable_categorical=True,)
+        preds_class_test = bst_FS.predict(dtest_FS)
+        mcc = matthews_corrcoef(dataset_test.y_test_area,preds_class_test.argmax(1))
+        acc = accuracy_score(dataset_test.y_test_area,preds_class_test.argmax(1))
+        cm = confusion_matrix(dataset_test.y_test_area,preds_class_test.argmax(1))
+        print(f'RESULT ON THE TEST SKI AREA {mcc=}, {acc=}, \n{cm=}')
+    if dataset_test.X_test_season is not None:
+        dtest_season_FS = xgb.DMatrix(dataset_test.X_test_season[bst_FS.feature_names],dataset_test.y_test_season,enable_categorical=True,)
+        preds_class_test_season = bst_FS.predict(dtest_season_FS)
+        mcc = matthews_corrcoef(dataset_test.y_test_season,preds_class_test_season.argmax(1))
+        acc = accuracy_score(dataset_test.y_test_season,preds_class_test_season.argmax(1))
+        cm = confusion_matrix(dataset_test.y_test_season,preds_class_test_season.argmax(1))
+        print(f'RESULT ON THE TEST SKI SEASON {mcc=}, {acc=}, {cm=}')
 
 if __name__ == "__main__":
@@ -90,6 +108,8 @@ if __name__ == "__main__":
     parser.add_argument('--reload_data', action='store_true', help='Download data from db')
     parser.add_argument('--retrain_last_model', action='store_true', help='retrain the last model')
     parser.add_argument('--n_trials', type=int, default=1000, help='number of trials per optuna')
+    parser.add_argument('--undersampling', action='store_true', help='Undersample the training dataset')
+    parser.add_argument('--test_size', type=float, default=0.33, help='Percentage of dataset to use as validation')
     args = parser.parse_args()
```
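With the new flags in place, a typical retraining run might look like this (flag values are just an example; `--retrain`, and presumably `--use_smote` and `--use_small`, are defined in the part of the argparse block not shown in this hunk):

```
python main.py --retrain --retrain_last_model --undersampling --test_size 0.2 --n_trials 100
```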


@@ -53,7 +53,7 @@ def objective(trial,dataset:Dataset,num_boost_round:int)->float:
return mcc
def train(dataset:Dataset,n_trials:int=1000,timeout:int=600,num_boost_round:int=600)->(xgb.Boost, dict):
def train(dataset:Dataset,n_trials:int=1000,timeout:int=600,num_boost_round:int=600)->(xgb.Booster, dict):
"""optuna search procedure
Args:
@@ -85,7 +85,7 @@ def train(dataset:Dataset,n_trials:int=1000,timeout:int=600,num_boost_round:int=
bst = xgb.train(params_final, dtrain,verbose_eval=False, num_boost_round=num_boost_round,
evals = [(dtrain, "train"), (dvalid, "valid")],
early_stopping_rounds=100,)
return bst,params_final
return bst,params_final, study
def gain_accuracy_train(dataset:Dataset,feat_imp:pd.DataFrame,num_boost_round:int=600,params:dict={})->(pd.DataFrame,xgb.Booster,int):
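Since `train` now also returns the Optuna study, the search itself can be inspected after the fact. A small sketch using Optuna's standard `Study` API (the pickle layout matches what `main.py` saves above; `objective` returns the MCC, so `best_value` is the best MCC found):

```python
import pickle

# Load what main.py saved and poke at the Optuna study.
with open('best_params.pkl', 'rb') as f:
    params_final, feat_imp, best_model, study = pickle.load(f)

print(study.best_value)              # best MCC reached during the search
print(study.best_params)             # hyperparameters of the best trial
print(len(study.trials), 'trials')   # how many trials actually ran
```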


```diff
@@ -7,6 +7,8 @@ import pickle
 from dataclasses import dataclass
 from typing import Union
 import os
+from imblearn.under_sampling import RandomUnderSampler
 
 ##AUXILIARY CLASSES
 @dataclass
 class Dataset:
@@ -146,6 +148,8 @@ def split(labeled:pd.DataFrame,
           SEASON_TEST_SKIAREA:str = 'Kronplatz',
           SEASON_TEST_YEAR:int = 2023,
           use_smote:bool = False,
+          undersampling:bool = False,
+          test_size:float = 0.33,
           weight_type:str = 'sqrt' )->(Dataset, Dataset_test):
     """Split the dataset into train, validation and test. From the initial dataset we remove a single skiarea (SKI_AREA_TEST),
     generating the first test set. Then we select a skiarea and a starting season (SEASON_TEST_SKIAREA, SEASON_TEST_YEAR)
@@ -159,32 +163,44 @@ def split(labeled:pd.DataFrame,
         SEASON_TEST_SKIAREA (str, optional): skiarea to remove from the dataset if the season is greater than SEASON_TEST_YEAR. Defaults to 'Kronplatz'.
         SEASON_TEST_YEAR (int, optional): see SEASON_TEST_SKIAREA. Defaults to 2023.
         use_smote (bool, optional): use oversampling for class imbalance. Defaults to False.
+        undersampling (bool, optional): use undersampling for class imbalance. Defaults to False.
+        test_size (float, optional): percentage of the dataset to use as validation. Defaults to 0.33.
         weight_type (str, optional): routine for weighting the error on the samples. Defaults to 'sqrt'.
 
     Returns:
         train-validation dataset and test dataset
     """
-    test_area = labeled[labeled.skiarea_name==SKI_AREA_TEST]
-    test_area_season = labeled[(labeled.skiarea_name==SEASON_TEST_SKIAREA)&(labeled.season>=SEASON_TEST_YEAR)]
-    labeled_tmp = labeled[labeled.skiarea_name!=SKI_AREA_TEST]
-    labeled_tmp = labeled_tmp[(labeled_tmp.skiarea_name!=SEASON_TEST_SKIAREA)|(labeled_tmp.season<SEASON_TEST_YEAR)]
+    labeled_tmp = labeled.copy()
+    ##remove the corresponding test rows from the dataset
+    if SKI_AREA_TEST is not None:
+        test_area = labeled[labeled.skiarea_name==SKI_AREA_TEST]
+        labeled_tmp = labeled_tmp[labeled_tmp.skiarea_name!=SKI_AREA_TEST]
+    else:
+        test_area = None
+    if SEASON_TEST_SKIAREA is not None and SEASON_TEST_YEAR is not None:
+        test_area_season = labeled[(labeled.skiarea_name==SEASON_TEST_SKIAREA)&(labeled.season>=SEASON_TEST_YEAR)]
+        labeled_tmp = labeled_tmp[(labeled_tmp.skiarea_name!=SEASON_TEST_SKIAREA)|(labeled_tmp.season<SEASON_TEST_YEAR)]
+    else:
+        test_area_season = None
     X_train, X_valid, y_train, y_valid = train_test_split(labeled_tmp.drop(columns=['india','season','skiarea_name']),
-                                                          labeled_tmp.india, test_size=0.33, random_state=0, stratify=labeled_tmp.india)
+                                                          labeled_tmp.india, test_size=test_size, random_state=0, stratify=labeled_tmp.india)
     if use_smote:
         from imblearn.over_sampling import RandomOverSampler
         sm = RandomOverSampler()
         X_train,y_train = sm.fit_resample(X_train,y_train)
+    if undersampling:
+        sm = RandomUnderSampler(sampling_strategy='majority')
+        X_train,y_train = sm.fit_resample(X_train,y_train)
     ##compute the weights for the unbalanced dataset
@@ -202,7 +218,7 @@ def split(labeled:pd.DataFrame,
         print(f'{weight_type=} not implemented, please use a valid one (sqrt or sum); I will set all the weights to 1')
         w.p = 1
-    if use_smote is False:
+    if use_smote is False and undersampling is False:
         weight_train = pd.merge(pd.DataFrame({'class':y_train}),w).p.values
     else:
         w.p = 1
@@ -210,7 +226,7 @@ def split(labeled:pd.DataFrame,
     dataset = Dataset(X_train, y_train, X_valid, y_valid, weight_train)
-    dataset_test = Dataset_test(test_area,test_area.india,test_area_season,test_area_season.india)
+    dataset_test = Dataset_test(test_area,test_area.india if test_area is not None else None,test_area_season,test_area_season.india if test_area_season is not None else None)
     return dataset,dataset_test
```
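The `'sqrt'` weighting routine itself sits outside these hunks. For intuition only, a hypothetical sketch of what a sqrt-type class weighting typically looks like (the actual `w` table computed in `split` may differ; only the `w`/`p`/`class` naming and the final merge are taken from the code above):

```python
import numpy as np
import pandas as pd

# Hypothetical sketch of a 'sqrt' class-weighting scheme: rarer classes get
# larger weights, damped by the square root so they are not over-boosted.
def sqrt_class_weights(y_train: pd.Series) -> pd.DataFrame:
    counts = y_train.value_counts()
    p = np.sqrt(counts.sum() / counts)   # ~ 1 / sqrt(class frequency)
    p = p / p.mean()                     # normalize around 1
    return pd.DataFrame({'class': counts.index, 'p': p.values})

# Usage mirrors split(): merge per-class weights back onto the training rows.
# w = sqrt_class_weights(y_train)
# weight_train = pd.merge(pd.DataFrame({'class': y_train}), w).p.values
```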