ag added service API using Docker and FastAPI

2024-03-20 16:30:30 +01:00
parent 759f4c50d0
commit 08c5f672be
16 changed files with 3450 additions and 264 deletions

src/config.yaml (new file, 23 lines added)

@@ -0,0 +1,23 @@
processing:
  skiarea_test: 'Klausberg' ## can be set to None
  season_test_skiarea: 'Kronplatz' ## can be set to None
  season_test_year: 2023 ## can be set to None
  weight_type: 'sqrt'
  use_small: True ## condense underrepresented classes (the destination column is excluded)
  reload_data: False
  use_smote: False ## not recommended, leave False
  undersampling: False ## not recommended, leave False
  test_size: 0.33
model:
  name: test
  num_boost_round: 2500
  retrain: True
  retrain_last_model: True
  n_trials: 2000
hydra:
  output_subdir: null
  run:
    dir: .
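As a side note, the values above can also be loaded and overridden without Hydra via OmegaConf, which the training script already imports. A minimal sketch, not part of this commit, with the file path assumed relative to the repo root:

# Minimal sketch (not part of the commit): load and override the config with OmegaConf.
from omegaconf import OmegaConf

conf = OmegaConf.load("src/config.yaml")
print(conf.processing.skiarea_test)   # 'Klausberg'
print(conf.model.num_boost_round)     # 2500

# Override single values programmatically, mirroring Hydra command-line overrides.
conf = OmegaConf.merge(conf, OmegaConf.from_dotlist(["model.n_trials=50"]))
print(conf.model.n_trials)            # 50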

@@ -4,49 +4,66 @@ from sklearn.metrics import confusion_matrix,matthews_corrcoef,accuracy_score
import xgboost as xgb
import pandas as pd
import pickle
import argparse
from omegaconf import DictConfig,OmegaConf
import hydra
import logging
def main(args):
#these parameters could also be exposed as CLI arguments, but they are kept hardcoded here for simplicity
num_boost_round = 600
SKI_AREA_TEST= 'Klausberg' ##you can put it to None
SEASON_TEST_SKIAREA = 'Kronplatz'##you can put it to None
SEASON_TEST_YEAR= 2023 ##you can put it to None
weight_type = 'sqrt'
import os
@hydra.main(config_name='config.yaml')
def main(conf: DictConfig) -> None:
skiarea_test= conf.processing.skiarea_test
season_test_skiarea = conf.processing.season_test_skiarea
season_test_year= conf.processing.season_test_year
weight_type = conf.processing.weight_type
reload_data = conf.processing.reload_data
use_smote = conf.processing.use_smote
undersampling = conf.processing.undersampling
test_size = conf.processing.test_size
use_small= conf.processing.use_small
num_boost_round = conf.model.num_boost_round
retrain_last_model = conf.model.retrain_last_model
retrain = conf.model.retrain
n_trials = conf.model.n_trials
name = conf.model.name
os.makedirs(name,exist_ok=True)
with open(os.path.join(name,"conf.yaml"),'w') as f:
OmegaConf.save(conf, f)
logging.basicConfig(level=logging.INFO, handlers=[logging.FileHandler(os.path.join(name,"debug.log")),logging.StreamHandler() ])
##these are passed
reload_data = args.reload_data
use_smote = args.use_smote ##I don't like to use it, leave to False
undersampling = args.undersampling ##I don't like to use it, leave to False
retrain = args.retrain
retrain_last_model = args.retrain_last_model
test_size = args.test_size
## get the data
labeled,labeled_small,to_remove = retrive_data(reload_data=reload_data,threshold_under_represented=0.5,path='/home/agobbi/Projects/PID/datanalytics/PID/src')
with open('to_remove.pkl','wb') as f:
pickle.dump(to_remove,f)
labeled,labeled_small,to_remove,evacuations,encoders = retrive_data(reload_data=reload_data,threshold_under_represented=0.5,path='/home/agobbi/Projects/PID/datanalytics/PID/src')
with open(os.path.join(name,'metadata.pkl'),'wb') as f:
pickle.dump([to_remove,use_small,evacuations,encoders],f)
#split the data
dataset,dataset_test = split(labeled_small if args.use_small else labeled ,
SKI_AREA_TEST= SKI_AREA_TEST,
SEASON_TEST_SKIAREA = SEASON_TEST_SKIAREA,
SEASON_TEST_YEAR= SEASON_TEST_YEAR,
dataset,dataset_test = split(labeled_small if use_small else labeled ,
skiarea_test= skiarea_test,
season_test_skiarea = season_test_skiarea,
season_test_year= season_test_year,
use_smote = use_smote,
undersampling = undersampling,
test_size = test_size,
weight_type = weight_type )
#if anything changed you may want to retrain the model and save the new best one
if retrain:
print('OPTUNA hyperparameter tuning, please wait!')
best_model,params_final,study = train(dataset,n_trials=args.n_trials,timeout=600,num_boost_round=num_boost_round)
logging.info('OPTUNA hyperparameter tuning, please wait!')
best_model,params_final,study = train(dataset,n_trials=n_trials,timeout=6000,num_boost_round=num_boost_round)
feat_imp = pd.Series(best_model.get_fscore()).sort_values(ascending=False)
with open('best_params.pkl','wb') as f:
with open(os.path.join(name,'best_params.pkl'),'wb') as f:
pickle.dump([params_final,feat_imp,best_model,study],f)
else:
with open('best_params.pkl','rb') as f:
with open(os.path.join(name,'best_params.pkl'),'rb') as f:
params_final,feat_imp,best_model,study = pickle.load(f)
@@ -59,25 +76,33 @@ def main(args):
##get the scores
preds_class_valid = best_model.predict(tmp_valid)
preds_class_train= best_model.predict(tmp_train)
print('##################RESULT ON THE TRAIN SET#####################')
print(confusion_matrix(dataset.y_train,preds_class_train.argmax(1)))
print(f'MCC:{matthews_corrcoef(dataset.y_train,preds_class_train.argmax(1))}')
print(f'ACC:{accuracy_score(dataset.y_train,preds_class_train.argmax(1))}')
print('##################RESULT ON THE VALIDATION SET#####################')
print(confusion_matrix(dataset.y_valid,preds_class_valid.argmax(1)))
print(f'MCC:{matthews_corrcoef(dataset.y_valid,preds_class_valid.argmax(1))}')
print(f'ACC:{accuracy_score(dataset.y_valid,preds_class_valid.argmax(1))}')
logging.info('##################RESULT ON THE TRAIN SET#####################')
logging.info(confusion_matrix(dataset.y_train,preds_class_train.argmax(1)))
logging.info(f'MCC:{matthews_corrcoef(dataset.y_train,preds_class_train.argmax(1))}')
logging.info(f'ACC:{accuracy_score(dataset.y_train,preds_class_train.argmax(1))}')
logging.info('##################RESULT ON THE VALIDATION SET#####################')
logging.info(confusion_matrix(dataset.y_valid,preds_class_valid.argmax(1)))
logging.info(f'MCC:{matthews_corrcoef(dataset.y_valid,preds_class_valid.argmax(1))}')
logging.info(f'ACC:{accuracy_score(dataset.y_valid,preds_class_valid.argmax(1))}')
#now train the final model, e.g. using gain_accuracy_train to reduce the number of features used
if retrain_last_model:
tot,bst_FS,FS = gain_accuracy_train(dataset,feat_imp,num_boost_round=num_boost_round,params=params_final)
with open('best_params_and_final_model.pkl','wb') as f:
with open(os.path.join(name,'best_params_and_final_model.pkl'),'wb') as f:
pickle.dump([tot,bst_FS,FS],f)
bst_FS.save_model(os.path.join(name,"model.json"))
else:
with open('best_params_and_final_model.pkl','rb') as f:
with open(os.path.join(name,'best_params_and_final_model.pkl'),'rb') as f:
tot,bst_FS,FS = pickle.load(f)
bst_FS = xgb.Booster()
bst_FS.load_model(os.path.join(name,"model.json"))
## also save the model in JSON format, which is generally more portable than pickle
if dataset_test.X_test_area is not None:
dtest_FS = xgb.DMatrix(dataset_test.X_test_area[bst_FS.feature_names],dataset_test.y_test_area,enable_categorical=True,)
@@ -85,7 +110,7 @@ def main(args):
mcc = matthews_corrcoef(dataset_test.y_test_area,preds_class_test.argmax(1))
acc = accuracy_score(dataset_test.y_test_area,preds_class_test.argmax(1))
cm = confusion_matrix(dataset_test.y_test_area,preds_class_test.argmax(1))
print(f'RESULT ON THE TEST SKI AREA {mcc=}, {acc=}, \n{cm=}')
logging.info(f'RESULT ON THE TEST SKI AREA {mcc=}, {acc=}, \n{cm=}')
if dataset_test.X_test_season is not None:
dtest_season_FS = xgb.DMatrix(dataset_test.X_test_season[bst_FS.feature_names],dataset_test.y_test_season,enable_categorical=True,)
@@ -94,26 +119,10 @@ def main(args):
acc = accuracy_score(dataset_test.y_test_season,preds_class_test_season.argmax(1))
cm = confusion_matrix(dataset_test.y_test_season,preds_class_test_season.argmax(1))
print(f'RESULT ON THE TEST SKI SEASON {mcc=}, {acc=}, {cm=}')
logging.info(f'RESULT ON THE TEST SKI SEASON {mcc=}, {acc=}, {cm=}')
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Train Optuna XGBOOST model')
parser.add_argument('--use_small', action='store_true', help="Aggregate underrepresented input classes (e.g. rare countries)")
parser.add_argument('--use_smote', action='store_true', help='Oversample underrepresented target labels')
parser.add_argument('--retrain', action='store_true', help='Retrain the Optuna searcher')
parser.add_argument('--reload_data', action='store_true', help='Download data from the DB')
parser.add_argument('--retrain_last_model', action='store_true', help='Retrain the last model')
parser.add_argument('--n_trials', type=int,default=1000, help='Number of Optuna trials')
parser.add_argument('--undersampling', action='store_true', help='Undersample the training dataset')
parser.add_argument('--test_size', type=float,default=0.33, help='Percentage of dataset to use as validation')
args = parser.parse_args()
main(args)
#python main.py --use_small --retrain --retrain_last_model --n_trials=10 --reload_data
main()
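The FastAPI service referenced in the commit message is not shown in the hunks above. Purely as an illustrative sketch of how the artifacts saved by this script (model.json and metadata.pkl under the model name directory) could be served; the endpoint name, payload shape and environment variable are assumptions, not the committed code:

# Illustrative sketch only -- NOT the service committed here.
# Assumes the artifacts written by the training script: <name>/model.json and <name>/metadata.pkl.
import os
import pickle

import pandas as pd
import xgboost as xgb
from fastapi import FastAPI

MODEL_DIR = os.environ.get("MODEL_DIR", "test")   # 'model.name' from config.yaml (assumption)

app = FastAPI()

bst = xgb.Booster()
bst.load_model(os.path.join(MODEL_DIR, "model.json"))
with open(os.path.join(MODEL_DIR, "metadata.pkl"), "rb") as f:
    to_remove, use_small, evacuations, encoders = pickle.load(f)

@app.post("/predict")
def predict(record: dict):
    # Build a one-row frame restricted to the features the booster expects;
    # the real preprocessing (label encoding, 'other' condensation) is omitted in this sketch.
    row = pd.DataFrame([record])[bst.feature_names]
    dmat = xgb.DMatrix(row, enable_categorical=True)
    probs = bst.predict(dmat)[0]
    return {"predicted_class": int(probs.argmax()), "probabilities": probs.tolist()}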

@@ -1,11 +1,11 @@
import xgboost as xgb
import optuna
from sklearn.metrics import matthews_corrcoef, accuracy_score
import optuna
from utils import Dataset
import pandas as pd
import logging
def objective(trial,dataset:Dataset,num_boost_round:int)->float:
"""function to maximize during the tuning phase
@@ -22,21 +22,21 @@ def objective(trial,dataset:Dataset,num_boost_round:int)->float:
params = dict(
learning_rate = trial.suggest_float("learning_rate", 0.01, 0.2),
max_depth= trial.suggest_int("max_depth",5, 15),
min_child_weight = trial.suggest_int("min_child_weight", 1, 8),
gamma = trial.suggest_float("gamma", 0, 10),
min_child_weight = trial.suggest_int("min_child_weight", 2, 8),
gamma = trial.suggest_float("gamma",0, 10),
subsample = trial.suggest_float("subsample", 0.01,1),
colsample_bytree = trial.suggest_float("colsample_bytree", 0.01,1),
alpha = trial.suggest_float("alpha", 0, 10),
alpha = trial.suggest_float("alpha", 1, 10),
objective= 'multi:softprob',
nthread=4,
num_class= 5,
seed=27)
params['lambda'] = trial.suggest_float("lambda", 0, 10)
params['lambda'] = trial.suggest_float("lambda", 1, 10)
dtrain = xgb.DMatrix(dataset.X_train,dataset.y_train,
enable_categorical=True,
weight=dataset.weight_train)
weight=dataset.weight_train)#np.power(dataset.weight_train,trial.suggest_float("power", 0.1, 2)))
dvalid = xgb.DMatrix(dataset.X_valid,dataset.y_valid,
enable_categorical=True,
)
@@ -45,7 +45,7 @@ def objective(trial,dataset:Dataset,num_boost_round:int)->float:
bst = xgb.train(params, dtrain,verbose_eval=False, num_boost_round=num_boost_round,
evals = [(dtrain, "train"), (dvalid, "valid")],
early_stopping_rounds=100)
logging.info(bst.best_iteration)
preds = bst.predict(dvalid)
##MCC is more robust than accuracy on imbalanced classes
mcc = matthews_corrcoef(dataset.y_valid,preds.argmax(1))
@@ -119,7 +119,7 @@ def gain_accuracy_train(dataset:Dataset,feat_imp:pd.DataFrame,num_boost_round:in
tot = pd.DataFrame(tot)
FS = int(tot.loc[tot.acc.argmax()].FS) ## get best
print(f'Best model with {FS} features, retraining....')
logging.info(f'Best model with {FS} features, retraining....')
dtrain_FS = xgb.DMatrix(dataset.X_train[list(feat_imp.head(FS).index)],dataset.y_train, enable_categorical=True, weight=dataset.weight_train)
dvalid_FS = xgb.DMatrix(dataset.X_valid[list(feat_imp.head(FS).index)],dataset.y_valid,enable_categorical=True, )
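The train() function that consumes this objective is not part of the hunk; a rough sketch of how the objective is typically wired into an Optuna study (the helper name and defaults below are assumptions):

import optuna

def run_search(dataset, n_trials=2000, timeout=6000, num_boost_round=2500):
    # objective() above returns the validation MCC, so the study maximizes it
    study = optuna.create_study(direction="maximize")
    study.optimize(lambda trial: objective(trial, dataset, num_boost_round),
                   n_trials=n_trials, timeout=timeout)
    return study.best_params, study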

@@ -7,7 +7,10 @@ import pickle
from dataclasses import dataclass
from typing import Union
import os
from imblearn.under_sampling import RandomUnderSampler,RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import LabelEncoder
import logging
##AUXILIARY CLASSES
@dataclass
@@ -69,7 +72,8 @@ def prepare_new_data(dataset:pd.DataFrame,to_remove:dict)->(pd.DataFrame,pd.Data
def retrive_data(reload_data:bool,threshold_under_represented:float,path:str)->(pd.DataFrame,pd.DataFrame):
def retrive_data(reload_data:bool,threshold_under_represented:float,path:str)->(pd.DataFrame,pd.DataFrame,list):
"""Get data
Args:
@@ -79,6 +83,7 @@ def retrive_data(reload_data:bool,threshold_under_represented:float,path:str)->(
Returns:
two pandas dataframes: the original one, a second with condensed classes, and a dictionary of the condensed classes
and a list of all evacuations and the encoders for the categorical features
"""
if reload_data:
engine = pg.connect("dbname='safeidx' user='fbk_mpba' host='172.104.247.67' port='5432' password='fbk2024$'")
@@ -88,8 +93,11 @@ def retrive_data(reload_data:bool,threshold_under_represented:float,path:str)->(
else:
with open(os.path.join(path,'data.pkl'),'rb') as f:
df = pickle.load(f)
#import pdb
#pdb.set_trace()
df = df[df.year>2011]
## these columns can lead to overfitting!
df.drop(columns=['dateandtime','skiarea_id','day_of_year','minute_of_day','year'], inplace=True)
##evacuation_vehicles must be made explicit
@@ -112,7 +120,7 @@ def retrive_data(reload_data:bool,threshold_under_represented:float,path:str)->(
## removing such classes might yield a more stable model
to_remove = {}
for c in labeled.columns:
if c not in ['india','age','season','skiarea_name']:
if c not in ['india','age','season','skiarea_name','destination']:
labeled[c] = labeled[c].astype('str')
tmp = labeled.groupby(c)[c].count()
tmp = 100*tmp/tmp.max()
@@ -125,13 +133,20 @@ def retrive_data(reload_data:bool,threshold_under_represented:float,path:str)->(
## keep the datasets
labeled_small = labeled.copy()
encoders = {'small':{},'normal':{}}
for c in to_remove.keys():
for k in to_remove[c]:
labeled_small.loc[labeled_small[c]==k,c] = 'other'
for c in labeled_small.columns:
if c not in ['age','season','skiarea_name']:
labeled_small[c] = labeled_small[c].fillna('None').astype('category')
labeled[c] = labeled[c].fillna('None').astype('category')
if c not in ['age','season','skiarea_name','india']:
le = LabelEncoder()
labeled_small[c] = le.fit_transform(labeled_small[c].fillna('None'))
labeled_small[c] = labeled_small[c].astype('category')
encoders['small'][c] = le
le = LabelEncoder()
labeled[c] = le.fit_transform(labeled[c].fillna('None'))
labeled[c] = labeled[c].astype('category')
encoders['normal'][c] = le
labeled.dropna(inplace=True)
labeled_small.dropna(inplace=True)
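At inference time the saved encoders have to be re-applied to incoming data; a small sketch of that step, assuming the same fillna('None') convention used above (the helper name is hypothetical and unseen labels would need separate handling):

import pandas as pd

def encode_new_data(df: pd.DataFrame, encoders: dict, which: str = 'small') -> pd.DataFrame:
    # For the 'small' variant, the to_remove condensation into 'other' should be applied first.
    df = df.copy()
    for col, le in encoders[which].items():
        df[col] = le.transform(df[col].fillna('None'))  # raises on labels unseen at fit time
        df[col] = df[col].astype('category')
    return df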
@@ -139,29 +154,29 @@ def retrive_data(reload_data:bool,threshold_under_represented:float,path:str)->(
labeled.india = labeled.india.apply(lambda x: x.replace('i','')).astype(int)
labeled_small.india = labeled_small.india.apply(lambda x: x.replace('i','')).astype(int)
return labeled,labeled_small,to_remove
return labeled,labeled_small,to_remove,list(ev),encoders
def split(labeled:pd.DataFrame,
SKI_AREA_TEST: str = 'Klausberg',
SEASON_TEST_SKIAREA:str = 'Kronplatz',
SEASON_TEST_YEAR:int = 2023,
skiarea_test: str = 'Klausberg',
season_test_skiarea:str = 'Kronplatz',
season_test_year:int = 2023,
use_smote:bool = False,
undersampling:bool=False,
test_size:float=0.33,
weight_type:str = 'sqrt' )->(Dataset, Dataset_test):
"""Split the dataset into train,validation test. From the initial dataset we remove a single skiarea (SKI_AREA_TEST)
generating the first test set. Then we select a skieare and a starting season (SEASON_TEST_SKIAREA,SEASON_TEST_YEAR)
"""Split the dataset into train,validation test. From the initial dataset we remove a single skiarea (skiarea_test)
generating the first test set. Then we select a skieare and a starting season (season_test_skiarea,season_test_year)
and generate the seconda test set. The rest of the data are splitted 66-33 stratified on the target column (india).
It is possible to specify the weight of eact sample. There are two strategies implemented: using the sum or the square root
of the sum. This is used for mitigating the class umbalance. Another alternative is to use an oversampling procedure (use_smote)
Args:
labeled (pd.DataFrame): dataset
SKI_AREA_TEST (str, optional): skiarea to remove from the train and use in test. Defaults to 'Klausberg'.
SEASON_TEST_SKIAREA (str, optional): skiarea to remove from the dataset if the season is greater than SEASON_TEST_YEAR. Defaults to 'Kronplatz'.
SEASON_TEST_YEAR (int, optional): see SEASON_TEST_SKIAREA . Defaults to 2023.
skiarea_test (str, optional): skiarea to remove from the train and use in test. Defaults to 'Klausberg'.
season_test_skiarea (str, optional): skiarea to remove from the dataset if the season is greater than season_test_year. Defaults to 'Kronplatz'.
season_test_year (int, optional): see season_test_skiarea. Defaults to 2023.
use_smote (bool, optional): use oversampling for class imbalance. Defaults to False.
undersampling (bool, optional): use undersampling for class umbalance. Defaults to False.
test_size (float, optional): percentage of dataset to use as validation. Defaults to 0.33.
@@ -174,15 +189,15 @@ def split(labeled:pd.DataFrame,
labeled_tmp = labeled.copy()
##remove from dataset the corresponding test rows
if SKI_AREA_TEST is not None:
test_area = labeled[labeled.skiarea_name==SKI_AREA_TEST]
labeled_tmp = labeled_tmp[labeled_tmp.skiarea_name!=SKI_AREA_TEST]
if skiarea_test is not None:
test_area = labeled[labeled.skiarea_name==skiarea_test]
labeled_tmp = labeled_tmp[labeled_tmp.skiarea_name!=skiarea_test]
else:
test_area = None
if SEASON_TEST_SKIAREA is not None and SEASON_TEST_YEAR is not None:
test_area_season = labeled[(labeled.skiarea_name==SEASON_TEST_SKIAREA)&(labeled.season>=SEASON_TEST_YEAR)]
labeled_tmp = labeled_tmp[(labeled_tmp.skiarea_name!=SEASON_TEST_SKIAREA)|(labeled_tmp.season<SEASON_TEST_YEAR) ]
if season_test_skiarea is not None and season_test_year is not None:
test_area_season = labeled[(labeled.skiarea_name==season_test_skiarea)&(labeled.season>=season_test_year)]
labeled_tmp = labeled_tmp[(labeled_tmp.skiarea_name!=season_test_skiarea)|(labeled_tmp.season<season_test_year) ]
else:
test_area_season = None
@@ -210,12 +225,12 @@ def split(labeled:pd.DataFrame,
##when computing the error these are the per-class weights: errors on the most severe classes can be penalised more heavily
if weight_type == 'sqrt':
w.p = np.sqrt(w.p.sum())/w.p
print(w)
logging.info(w)
elif weight_type == 'sum':
w.p = w.p.sum()/w.p/w.shape[0]
print(w)
logging.info(w)
else:
print(f'{weight_type=} not implemented, please use a valid option (sqrt or sum); setting all weights to 1')
logging.info(f'{weight_type=} not implemented, please use a valid option (sqrt or sum); setting all weights to 1')
w.p = 1
if use_smote is False and undersampling is False:
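For intuition, a tiny self-contained example of the 'sqrt' weighting branch above, assuming w holds one row per class with the class counts in column p (the counts and class names below are made up):

import numpy as np
import pandas as pd

# hypothetical class counts; the rarer (more severe) classes end up with the larger weights
w = pd.DataFrame({'p': [900, 80, 20]}, index=['class_0', 'class_1', 'class_2'])
w['p'] = np.sqrt(w['p'].sum()) / w['p']
print(w)
#              p
# class_0  0.035
# class_1  0.395
# class_2  1.581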