ag update readme and add documentation

2024-02-22 14:39:44 +01:00
parent ea097a7b71
commit 7eb456384e
18 changed files with 14754 additions and 28 deletions

View File

@@ -10,8 +10,10 @@ def main(args):
labeled,labeled_small = retrive_data(reload_data=args.reload_data,threshold_under_represented=0.5,path='/home/agobbi/Projects/PID/datanalytics/PID/src')
labeled,labeled_small,to_remove = retrive_data(reload_data=args.reload_data,threshold_under_represented=0.5,path='/home/agobbi/Projects/PID/datanalytics/PID/src')
with open('to_remove.pkl','wb') as f:
pickle.dump(to_remove,f)
dataset,dataset_test = split(labeled_small if args.use_small else labeled ,
SKI_AREA_TEST= 'Klausberg',
SEASON_TEST_SKIAREA = 'Kronplatz',
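The to_remove mapping pickled above is what later allows unseen data to be condensed consistently with the training set. A minimal sketch of the intended inference-time flow, using the prepare_new_data helper added later in this commit (new_df and the call site are illustrative assumptions, not part of the commit):

import pickle
# load the class-condensation mapping saved during training
with open('to_remove.pkl', 'rb') as f:
    to_remove = pickle.load(f)
# new_df: a fresh dataframe with the same raw columns used at training time
dataset_p, dataset_p_small = prepare_new_data(new_df, to_remove)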

View File

@@ -7,8 +7,17 @@ import pandas as pd
def objective(trial,dataset:Dataset,num_boost_round:int):
def objective(trial,dataset:Dataset,num_boost_round:int)->float:
"""function to maximize during the tuning phase
Args:
trial (optuna.trial.Trial): Optuna trial object used to suggest hyperparameters
dataset (Dataset): dataset to use (containing train and validation)
num_boost_round (int): number of boosting iterations for xgboost
Returns:
float: validation MCC
"""
#These are the parameters usually used
params = dict(
learning_rate = trial.suggest_float("learning_rate", 0.01, 0.2),
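For context, the validation MCC returned at the end of objective is presumably computed along these lines (a sketch: matthews_corrcoef as the metric and the X_validation/y_validation attribute names of Dataset are assumptions):

from sklearn.metrics import matthews_corrcoef  # assumed metric implementation
# train a booster with the suggested params and score it on the validation split
dtrain = xgb.DMatrix(dataset.X_train, label=dataset.y_train, enable_categorical=True)
dval = xgb.DMatrix(dataset.X_validation, enable_categorical=True)
bst = xgb.train(params, dtrain, num_boost_round=num_boost_round)
pred = bst.predict(dval)
# binary case shown; a multiclass objective would take the argmax over class probabilities
mcc = matthews_corrcoef(dataset.y_validation, (pred > 0.5).astype(int))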
@@ -44,8 +53,20 @@ def objective(trial,dataset:Dataset,num_boost_round:int):
return mcc
def train(dataset,n_trials=1000,timeout=600,num_boost_round=600):
def train(dataset:Dataset,n_trials:int=1000,timeout:int=600,num_boost_round:int=600)->(xgb.Booster, dict):
"""optuna search procedure
Args:
dataset (Dataset): dataset to use (containing train and validation)
n_trials (int, optional): number of parameter combinations to try. Defaults to 1000.
timeout (int, optional): maximum time before stopping. Defaults to 600.
num_boost_round (int, optional): number of boosting iterations of a single model. Defaults to 600.
Returns:
the trained xgboost Booster and a dictionary containing the best parameters
"""
study = optuna.create_study(direction="maximize")
study.optimize(lambda trial: objective(trial,dataset,num_boost_round), n_trials=n_trials, timeout=timeout)
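A usage sketch of the resulting entry point (the trial budget below is illustrative; the defaults are n_trials=1000 and timeout=600):

# run the Optuna search and get back the refitted booster plus its best parameters
bst, params_final = train(dataset, n_trials=200, timeout=600, num_boost_round=600)
print(params_final)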
@@ -67,7 +88,18 @@ def train(dataset,n_trials=1000,timeout=600,num_boost_round=600):
return bst,params_final
def gain_accuracy_train(dataset:Dataset,feat_imp:pd.DataFrame,num_boost_round:int,params:dict):
def gain_accuracy_train(dataset:Dataset,feat_imp:pd.DataFrame,num_boost_round:int=600,params:dict={})->(pd.DataFrame,xgb.Booster,int):
"""Starting from the most important feature, add one feature, train the model and get mcc and acc on the validation
Args:
dataset (Dataset): dataset to use (containing train and validation)
feat_imp (pd.DataFrame): feature importance dataset computed using feat_imp = pd.Series(best_model.get_fscore()).sort_values(ascending=False)
num_boost_round (int): number of boosting iterations of a single model. Defaults to 600.
params (dict): dictionary of best parameters returned from the function `train`
Returns:
dataframe with ACC and MCC for each number N of features used, plus the best booster and the corresponding N
"""
tot = []
for i in range(1,dataset.X_train.shape[1]):
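Following the recipe in the docstring, the feature-importance table and the incremental evaluation could be chained as follows (a sketch; results, best_bst and n_features are illustrative names for the three returned values):

# feature importance from the tuned booster, as suggested in the docstring above
feat_imp = pd.Series(bst.get_fscore()).sort_values(ascending=False)
# add one feature at a time (most important first), recording validation ACC/MCC
results, best_bst, n_features = gain_accuracy_train(dataset, feat_imp,
                                                    num_boost_round=600,
                                                    params=params_final)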

View File

@@ -22,10 +22,62 @@ class Dataset_test:
y_test_area:Union[pd.Series,None]
X_test_season:Union[pd.DataFrame,None]
y_test_season:Union[pd.Series,None]
def prepare_new_data(dataset:pd.DataFrame,to_remove:dict)->(pd.DataFrame,pd.DataFrame):
"""prepare new data for prediction. MUST BE SIMILAR TO retrive_data. Maybe it can use directly inside it...
Args:
dataset (pd.DataFrame): dataset on which to run inference
to_remove (dict): columns to aggregate
Returns:
two pandas dataframes: the original one and a copy with the condensed classes.
"""
dataset_p = dataset.copy()
dataset_p.drop(columns=['dateandtime','skiarea_id','day_of_year','minute_of_day','year'], inplace=True)
## evacuation_vehicles must be expanded into explicit boolean columns
ev = set({})
for i,row in dataset_p.iterrows():
ev = ev.union(set(row.evacuation_vehicles))
for c in ev:
dataset_p[c] = False
for i,row in dataset_p.iterrows():
for c in row.evacuation_vehicles:
dataset_p.loc[i,c] = True
dataset_p.drop(columns=['town','province','evacuation_vehicles'],inplace=True)
dataset_p['age'] = dataset_p['age'].astype(np.float32).fillna(np.nan)
dataset_p_small = dataset_p.copy()
for c in to_remove.keys():
for k in to_remove[c]:
dataset_p_small.loc[dataset_p[c]==k,c] = 'other'
for c in dataset_p.columns:
if c not in ['age','season','skiarea_name']:
dataset_p_small[c] = dataset_p_small[c].fillna('None').astype('category')
dataset_p[c] = dataset_p[c].fillna('None').astype('category')
dataset_p.dropna(inplace=True)
dataset_p_small.dropna(inplace=True)
return dataset_p,dataset_p_small
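Design note: the two row-by-row loops above that expand evacuation_vehicles into boolean columns scale poorly on large frames; a vectorized equivalent (a sketch, assuming the column holds list-like values per row) would be:

# explode the list column, one-hot encode it, and collapse back to one row per record
ev_dummies = (dataset_p['evacuation_vehicles']
              .explode()
              .pipe(pd.get_dummies)
              .groupby(level=0)
              .max()
              .reindex(dataset_p.index, fill_value=False))
dataset_p = dataset_p.join(ev_dummies)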
def retrive_data(reload_data:bool,threshold_under_represented:float,path:str):
def retrive_data(reload_data:bool,threshold_under_represented:float,path:str)->(pd.DataFrame,pd.DataFrame,dict):
"""Get data
Args:
reload_data (bool): if true, the procedure will download the data from the db
threshold_under_represented (float): classes with too few samples are condensed into the class `other`
path (str): path where the data are saved
Returns:
two pandas dataframes (the original one and one with condensed classes) and a dictionary of the condensed classes
"""
if reload_data:
engine = pg.connect("dbname='safeidx' user='fbk_mpba' host='172.104.247.67' port='5432' password='fbk2024$'")
df = pd.read_sql('select * from fbk_export_20240212', con=engine)
@@ -85,7 +137,7 @@ def retrive_data(reload_data:bool,threshold_under_represented:float,path:str):
labeled.india = labeled.india.apply(lambda x: x.replace('i','')).astype(int)
labeled_small.india = labeled_small.india.apply(lambda x: x.replace('i','')).astype(int)
return labeled,labeled_small
return labeled,labeled_small,to_remove
@@ -94,8 +146,24 @@ def split(labeled:pd.DataFrame,
SEASON_TEST_SKIAREA:str = 'Kronplatz',
SEASON_TEST_YEAR:int = 2023,
use_smote:bool = False,
weight_type:str = 'sqrt' ):
weight_type:str = 'sqrt' )->(Dataset, Dataset_test):
"""Split the dataset into train,validation test. From the initial dataset we remove a single skiarea (SKI_AREA_TEST)
generating the first test set. Then we select a skieare and a starting season (SEASON_TEST_SKIAREA,SEASON_TEST_YEAR)
and generate the seconda test set. The rest of the data are splitted 66-33 stratified on the target column (india).
It is possible to specify the weight of eact sample. There are two strategies implemented: using the sum or the square root
of the sum. This is used for mitigating the class umbalance. Another alternative is to use an oversampling procedure (use_smote)
Args:
labeled (pd.DataFrame): dataset
SKI_AREA_TEST (str, optional): skiarea removed from the training data and used as the first test set. Defaults to 'Klausberg'.
SEASON_TEST_SKIAREA (str, optional): skiarea to remove from the dataset if the season is greater than SEASON_TEST_YEAR. Defaults to 'Kronplatz'.
SEASON_TEST_YEAR (int, optional): see SEASON_TEST_SKIAREA . Defaults to 2023.
use_smote (bool, optional): use oversampling to handle class imbalance. Defaults to False.
weight_type (str, optional): routine for weighting the error on the samples. Defaults to 'sqrt'.
Returns:
training-validation dataset and test dataset
"""
test_area = labeled[labeled.skiarea_name==SKI_AREA_TEST]
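Per the docstring, the second test set holds out SEASON_TEST_SKIAREA from SEASON_TEST_YEAR onward; it would be carved out roughly like this (a sketch; test_season and the exact comparison are assumptions):

# hold out one ski area starting from a given season as the second test set
test_season = labeled[(labeled.skiarea_name == SEASON_TEST_SKIAREA) &
                      (labeled.season > SEASON_TEST_YEAR)]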
@@ -116,7 +184,6 @@ def split(labeled:pd.DataFrame,
from imblearn.over_sampling import RandomOverSampler
sm = RandomOverSampler()
X_train_smote,y_train_smote = sm.fit_resample(X_train,y_train)
X_train,y_train = sm.fit_resample(X_train,y_train)
## compute the weights for the unbalanced dataset
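The weighting code referenced by this comment lies outside the hunk; a hedged sketch of what the 'sqrt' strategy described in the split docstring might compute (class_weight and sample_weight are illustrative names):

# per-class weights from inverse class frequency, square-rooted to soften the correction
counts = y_train.value_counts()
class_weight = np.sqrt(len(y_train) / counts) if weight_type == 'sqrt' else len(y_train) / counts
# map each training sample to the weight of its class
sample_weight = y_train.map(class_weight)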