ag update

2024-02-27 14:40:27 +01:00
parent 1e882df758
commit 759f4c50d0
5 changed files with 5723 additions and 75 deletions


@@ -22,6 +22,18 @@ Features are ranked by importance using their score (`best_model.get_fscore()`)
This can also be used to say: to reach x% accuracy I need to use at least these variables. One use that comes to mind: 'these are the mandatory fields of the form to fill in'.
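A minimal sketch of that idea (the `evaluate` callback and the accuracy threshold are hypothetical, not part of the repo; `gain_accuracy_train` in the code is the real version of this):

```python
import pandas as pd

# Sketch: rank features by importance, then keep the smallest prefix
# that reaches a target accuracy. evaluate(features) is assumed to
# return the accuracy of a model retrained on just those features.
feat_imp = pd.Series(best_model.get_fscore()).sort_values(ascending=False)

def smallest_feature_set(feat_imp, evaluate, target_acc=0.80):
    features = []
    for name in feat_imp.index:
        features.append(name)
        if evaluate(features) >= target_acc:
            return features   # e.g. the 'mandatory form fields'
    return features           # target never reached: all features needed
```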
## Final model
Once all the tests are done, we can also reintegrate the test sets into the training data:
```
SKI_AREA_TEST = None
SEASON_TEST_SKIAREA = None
SEASON_TEST_YEAR = None
```
and also increase the number of points in the training set:
```
test_size = 0.2  # 80% train, 20% validation
```
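Putting it together, the final-model configuration would flow into `split` roughly like this (a sketch based on the `split` signature in the diff below; with the test parameters set to `None`, the test sets come back as `None` and only the train/validation split remains):

```python
# Sketch: with the test parameters set to None, split() keeps every
# skiarea/season for training and only carves out the validation set.
dataset, dataset_test = split(labeled,
                              SKI_AREA_TEST=None,
                              SEASON_TEST_SKIAREA=None,
                              SEASON_TEST_YEAR=None,
                              use_smote=False,
                              undersampling=False,
                              test_size=0.2,      # 80% train, 20% validation
                              weight_type='sqrt')
assert dataset_test.X_test_area is None and dataset_test.X_test_season is None
```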
## Notebooks
There are a few notebooks. TRAIN contains more or less what `main.py` does, or rather an earlier, uncleaned version of it with some extra checks etc.; I left it in just in case. `Variable_exploration` contains the inference part on a new dataset using `prepare_new_data` (there is also a comparison between the distributions, but without labels I am not sure what else to add). There is also an explainability part. It is quite hard to interpret with categorical variables, but it does tell you, in some way, why a given sample was classified the way it was. In the images below you can see the sample under consideration: its original class is 2 and it is correctly classified (look at the SHAP values, or at the predictions, which put it in class 2 with 86% probability). The two plots underneath show which features push the probability value up or down (not exactly a probability, but call it a confidence if you prefer). All the red arrows pushing to the right read like this (take the second row): the diagnosis, the location and the destination are the features that most suggest the sample belongs to the second class. Indeed, helicopter, hospital_emergency_room and dislocation do suggest it is not a minor incident. It does not always work this well; I picked a clear example to explain it, then it is up to you whether and how to use it.
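For reference, a minimal sketch of how those per-sample contributions can be obtained straight from XGBoost, without the shap library (assuming the trained `bst_FS` booster and the `dtest_FS` DMatrix as in `main.py`; `pred_contribs=True` is XGBoost's native SHAP output, one value per feature plus a bias term):

```python
import numpy as np
import pandas as pd

# Per-sample SHAP contributions from the trained booster.
# For a multiclass model the result has shape
# (n_samples, n_classes, n_features + 1); the last column is the bias.
contribs = bst_FS.predict(dtest_FS, pred_contribs=True)

i = 0                                          # the sample discussed above
pred_class = bst_FS.predict(dtest_FS)[i].argmax()
per_feature = pd.Series(contribs[i, pred_class, :-1],
                        index=bst_FS.feature_names)
# Positive values push towards this class, negative ones push away.
print(per_feature.sort_values(key=np.abs, ascending=False).head(10))
```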

File diff suppressed because one or more lines are too long


```diff
@@ -7,32 +7,47 @@ import pickle
 import argparse
 
 def main(args):
+    #you can put these parameters in the args but here I keep it simpler
+    num_boost_round = 600
+    SKI_AREA_TEST = 'Klausberg'         ##you can set it to None
+    SEASON_TEST_SKIAREA = 'Kronplatz'   ##you can set it to None
+    SEASON_TEST_YEAR = 2023             ##you can set it to None
+    weight_type = 'sqrt'
+    ##these are passed
+    reload_data = args.reload_data
+    use_smote = args.use_smote          ##I don't like to use it, leave it False
+    undersampling = args.undersampling  ##I don't like to use it, leave it False
+    retrain = args.retrain
+    retrain_last_model = args.retrain_last_model
+    test_size = args.test_size
-    labeled,labeled_small,to_remove = retrive_data(reload_data=args.reload_data,threshold_under_represented=0.5,path='/home/agobbi/Projects/PID/datanalytics/PID/src')
+    ## get the data
+    labeled,labeled_small,to_remove = retrive_data(reload_data=reload_data,threshold_under_represented=0.5,path='/home/agobbi/Projects/PID/datanalytics/PID/src')
     with open('to_remove.pkl','wb') as f:
         pickle.dump(to_remove,f)
     #split the data
     dataset,dataset_test = split(labeled_small if args.use_small else labeled,
-                                 SKI_AREA_TEST = 'Klausberg',
-                                 SEASON_TEST_SKIAREA = 'Kronplatz',
-                                 SEASON_TEST_YEAR = 2023,
-                                 use_smote = args.use_smote,
-                                 weight_type = 'sqrt')
-    if args.retrain:
+                                 SKI_AREA_TEST = SKI_AREA_TEST,
+                                 SEASON_TEST_SKIAREA = SEASON_TEST_SKIAREA,
+                                 SEASON_TEST_YEAR = SEASON_TEST_YEAR,
+                                 use_smote = use_smote,
+                                 undersampling = undersampling,
+                                 test_size = test_size,
+                                 weight_type = weight_type)
+    #if you changed something you may want to retrain the model and save the best one
+    if retrain:
         print('OPTUNA hyperparameter tuning, please wait!')
-        best_model,params_final = train(dataset,n_trials=args.n_trials,timeout=600,num_boost_round=600)
+        best_model,params_final,study = train(dataset,n_trials=args.n_trials,timeout=600,num_boost_round=num_boost_round)
         feat_imp = pd.Series(best_model.get_fscore()).sort_values(ascending=False)
         with open('best_params.pkl','wb') as f:
-            pickle.dump([params_final,feat_imp,best_model],f)
+            pickle.dump([params_final,feat_imp,best_model,study],f)
     else:
         with open('best_params.pkl','rb') as f:
-            params_final,feat_imp,best_model = pickle.load(f)
+            params_final,feat_imp,best_model,study = pickle.load(f)
@@ -41,7 +56,7 @@ def main(args):
     tmp_train = xgb.DMatrix(dataset.X_train[best_model.feature_names],dataset.y_train,enable_categorical=True)
     tmp_valid = xgb.DMatrix(dataset.X_valid[best_model.feature_names],dataset.y_valid,enable_categorical=True)
+    ##get the scores
     preds_class_valid = best_model.predict(tmp_valid)
     preds_class_train = best_model.predict(tmp_train)
     print('##################RESULT ON THE TRAIN SET#####################')
@@ -55,30 +70,33 @@ def main(args):
-    if args.retrain_last_model:
-        tot,bst_FS,FS = gain_accuracy_train(dataset,feat_imp,num_boost_round=600,params=params_final)
+    #now you can train the final model, for example using gain_accuracy_train to reduce the number of features used
+    if retrain_last_model:
+        tot,bst_FS,FS = gain_accuracy_train(dataset,feat_imp,num_boost_round=num_boost_round,params=params_final)
         with open('best_params_and_final_model.pkl','wb') as f:
             pickle.dump([tot,bst_FS,FS],f)
     else:
         with open('best_params_and_final_model.pkl','rb') as f:
             tot,bst_FS,FS = pickle.load(f)
-    dtest_FS = xgb.DMatrix(dataset_test.X_test_area[bst_FS.feature_names],dataset_test.y_test_area,enable_categorical=True,)
-    dtest_season_FS = xgb.DMatrix(dataset_test.X_test_season[bst_FS.feature_names],dataset_test.y_test_season,enable_categorical=True,)
-    preds_class_test = bst_FS.predict(dtest_FS)
-    preds_class_test_season = bst_FS.predict(dtest_season_FS)
-    mcc = matthews_corrcoef(dataset_test.y_test_area,preds_class_test.argmax(1))
-    acc = accuracy_score(dataset_test.y_test_area,preds_class_test.argmax(1))
-    cm = confusion_matrix(dataset_test.y_test_area,preds_class_test.argmax(1))
-    print(f'RESULT ON THE TEST SKI AREA {mcc=}, {acc=}, \n{cm=}')
-    mcc = matthews_corrcoef(dataset_test.y_test_season,preds_class_test_season.argmax(1))
-    acc = accuracy_score(dataset_test.y_test_season,preds_class_test_season.argmax(1))
-    cm = confusion_matrix(dataset_test.y_test_season,preds_class_test_season.argmax(1))
-    print(f'RESULT ON THE TEST SKI SEASON {mcc=}, {acc=}, {cm=}')
+    if dataset_test.X_test_area is not None:
+        dtest_FS = xgb.DMatrix(dataset_test.X_test_area[bst_FS.feature_names],dataset_test.y_test_area,enable_categorical=True,)
+        preds_class_test = bst_FS.predict(dtest_FS)
+        mcc = matthews_corrcoef(dataset_test.y_test_area,preds_class_test.argmax(1))
+        acc = accuracy_score(dataset_test.y_test_area,preds_class_test.argmax(1))
+        cm = confusion_matrix(dataset_test.y_test_area,preds_class_test.argmax(1))
+        print(f'RESULT ON THE TEST SKI AREA {mcc=}, {acc=}, \n{cm=}')
+    if dataset_test.X_test_season is not None:
+        dtest_season_FS = xgb.DMatrix(dataset_test.X_test_season[bst_FS.feature_names],dataset_test.y_test_season,enable_categorical=True,)
+        preds_class_test_season = bst_FS.predict(dtest_season_FS)
+        mcc = matthews_corrcoef(dataset_test.y_test_season,preds_class_test_season.argmax(1))
+        acc = accuracy_score(dataset_test.y_test_season,preds_class_test_season.argmax(1))
+        cm = confusion_matrix(dataset_test.y_test_season,preds_class_test_season.argmax(1))
+        print(f'RESULT ON THE TEST SKI SEASON {mcc=}, {acc=}, {cm=}')
 
 if __name__ == "__main__":
@@ -90,6 +108,8 @@ if __name__ == "__main__":
     parser.add_argument('--reload_data', action='store_true', help='Download data from db')
     parser.add_argument('--retrain_last_model', action='store_true', help='retrain the last model')
     parser.add_argument('--n_trials', type=int, default=1000, help='number of trials per optuna')
+    parser.add_argument('--undersampling', action='store_true', help='Undersample the training dataset')
+    parser.add_argument('--test_size', type=float, default=0.33, help='Percentage of dataset to use as validation')
     args = parser.parse_args()
```
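With the new flags in place, a typical retraining run might look like this (flag values are just an example; `--retrain`, and presumably `--use_smote` and `--use_small`, are defined in the part of the argparse block not shown in this hunk):

```
python main.py --retrain --retrain_last_model --undersampling --test_size 0.2 --n_trials 100
```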


@@ -53,7 +53,7 @@ def objective(trial,dataset:Dataset,num_boost_round:int)->float:
return mcc
def train(dataset:Dataset,n_trials:int=1000,timeout:int=600,num_boost_round:int=600)->(xgb.Boost, dict):
def train(dataset:Dataset,n_trials:int=1000,timeout:int=600,num_boost_round:int=600)->(xgb.Booster, dict):
"""optuna search procedure
Args:
@@ -85,7 +85,7 @@ def train(dataset:Dataset,n_trials:int=1000,timeout:int=600,num_boost_round:int=
bst = xgb.train(params_final, dtrain,verbose_eval=False, num_boost_round=num_boost_round,
evals = [(dtrain, "train"), (dvalid, "valid")],
early_stopping_rounds=100,)
return bst,params_final
return bst,params_final, study
def gain_accuracy_train(dataset:Dataset,feat_imp:pd.DataFrame,num_boost_round:int=600,params:dict={})->(pd.DataFrame,xgb.Booster,int):
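Since `train` now also returns the Optuna study, the search itself can be inspected after the fact. A small sketch using Optuna's standard `Study` API (the pickle layout matches what `main.py` saves above; `objective` returns the MCC, so `best_value` is the best MCC found):

```python
import pickle

# Load what main.py saved and poke at the Optuna study.
with open('best_params.pkl', 'rb') as f:
    params_final, feat_imp, best_model, study = pickle.load(f)

print(study.best_value)              # best MCC reached during the search
print(study.best_params)             # hyperparameters of the best trial
print(len(study.trials), 'trials')   # how many trials actually ran
```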


```diff
@@ -7,6 +7,8 @@ import pickle
 from dataclasses import dataclass
 from typing import Union
 import os
+from imblearn.under_sampling import RandomUnderSampler
 
 ##AUXILIARY CLASSES
 @dataclass
 class Dataset:
@@ -146,6 +148,8 @@ def split(labeled:pd.DataFrame,
           SEASON_TEST_SKIAREA:str = 'Kronplatz',
           SEASON_TEST_YEAR:int = 2023,
           use_smote:bool = False,
+          undersampling:bool = False,
+          test_size:float = 0.33,
           weight_type:str = 'sqrt' )->(Dataset, Dataset_test):
     """Split the dataset into train, validation and test. From the initial dataset we remove a single skiarea (SKI_AREA_TEST),
     generating the first test set. Then we select a skiarea and a starting season (SEASON_TEST_SKIAREA, SEASON_TEST_YEAR)
@@ -159,32 +163,44 @@ def split(labeled:pd.DataFrame,
         SEASON_TEST_SKIAREA (str, optional): skiarea to remove from the dataset if the season is greater than SEASON_TEST_YEAR. Defaults to 'Kronplatz'.
         SEASON_TEST_YEAR (int, optional): see SEASON_TEST_SKIAREA. Defaults to 2023.
         use_smote (bool, optional): use oversampling for class imbalance. Defaults to False.
+        undersampling (bool, optional): use undersampling for class imbalance. Defaults to False.
+        test_size (float, optional): percentage of the dataset to use as validation. Defaults to 0.33.
         weight_type (str, optional): routine for weighting the error on the samples. Defaults to 'sqrt'.
 
     Returns:
         train-validation dataset and test dataset
     """
-    test_area = labeled[labeled.skiarea_name==SKI_AREA_TEST]
-    test_area_season = labeled[(labeled.skiarea_name==SEASON_TEST_SKIAREA)&(labeled.season>=SEASON_TEST_YEAR)]
-    labeled_tmp = labeled[labeled.skiarea_name!=SKI_AREA_TEST]
-    labeled_tmp = labeled_tmp[(labeled_tmp.skiarea_name!=SEASON_TEST_SKIAREA)|(labeled_tmp.season<SEASON_TEST_YEAR)]
+    labeled_tmp = labeled.copy()
+    ##remove the corresponding test rows from the dataset
+    if SKI_AREA_TEST is not None:
+        test_area = labeled[labeled.skiarea_name==SKI_AREA_TEST]
+        labeled_tmp = labeled_tmp[labeled_tmp.skiarea_name!=SKI_AREA_TEST]
+    else:
+        test_area = None
+    if SEASON_TEST_SKIAREA is not None and SEASON_TEST_YEAR is not None:
+        test_area_season = labeled[(labeled.skiarea_name==SEASON_TEST_SKIAREA)&(labeled.season>=SEASON_TEST_YEAR)]
+        labeled_tmp = labeled_tmp[(labeled_tmp.skiarea_name!=SEASON_TEST_SKIAREA)|(labeled_tmp.season<SEASON_TEST_YEAR)]
+    else:
+        test_area_season = None
     X_train, X_valid, y_train, y_valid = train_test_split(labeled_tmp.drop(columns=['india','season','skiarea_name']),
-                                                          labeled_tmp.india, test_size=0.33, random_state=0, stratify=labeled_tmp.india)
+                                                          labeled_tmp.india, test_size=test_size, random_state=0, stratify=labeled_tmp.india)
     if use_smote:
         from imblearn.over_sampling import RandomOverSampler
         sm = RandomOverSampler()
         X_train,y_train = sm.fit_resample(X_train,y_train)
+    if undersampling:
+        sm = RandomUnderSampler(sampling_strategy='majority')
+        X_train,y_train = sm.fit_resample(X_train,y_train)
     ##compute the weights for the unbalanced dataset
@@ -202,7 +218,7 @@ def split(labeled:pd.DataFrame,
         print(f'{weight_type=} not implemented, please use a valid one (sqrt or sum); I will set all the weights to 1')
         w.p = 1
-    if use_smote is False:
+    if use_smote is False and undersampling is False:
         weight_train = pd.merge(pd.DataFrame({'class':y_train}),w).p.values
     else:
         w.p = 1
@@ -210,7 +226,7 @@ def split(labeled:pd.DataFrame,
     dataset = Dataset(X_train, y_train, X_valid, y_valid, weight_train)
-    dataset_test = Dataset_test(test_area,test_area.india,test_area_season,test_area_season.india)
+    dataset_test = Dataset_test(test_area,test_area.india if test_area is not None else None,test_area_season,test_area_season.india if test_area_season is not None else None)
     return dataset,dataset_test
```
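The `'sqrt'` weighting routine itself sits outside these hunks. For intuition only, a hypothetical sketch of what a sqrt-type class weighting typically looks like (the actual `w` table computed in `split` may differ; only the `w`/`p`/`class` naming and the final merge are taken from the code above):

```python
import numpy as np
import pandas as pd

# Hypothetical sketch of a 'sqrt' class-weighting scheme: rarer classes get
# larger weights, damped by the square root so they are not over-boosted.
def sqrt_class_weights(y_train: pd.Series) -> pd.DataFrame:
    counts = y_train.value_counts()
    p = np.sqrt(counts.sum() / counts)   # ~ 1 / sqrt(class frequency)
    p = p / p.mean()                     # normalize around 1
    return pd.DataFrame({'class': counts.index, 'p': p.values})

# Usage mirrors split(): merge per-class weights back onto the training rows.
# w = sqrt_class_weights(y_train)
# weight_train = pd.merge(pd.DataFrame({'class': y_train}), w).p.values
```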