ag update readme and add documentation

2024-02-22 14:39:44 +01:00
parent ea097a7b71
commit 7eb456384e
18 changed files with 14754 additions and 28 deletions

View File

@@ -10,8 +10,10 @@ def main(args):
labeled,labeled_small = retrive_data(reload_data=args.reload_data,threshold_under_represented=0.5,path='/home/agobbi/Projects/PID/datanalytics/PID/src')
labeled,labeled_small,to_remove = retrive_data(reload_data=args.reload_data,threshold_under_represented=0.5,path='/home/agobbi/Projects/PID/datanalytics/PID/src')
with open('to_remove.pkl','wb') as f:
pickle.dump(to_remove,f)
dataset,dataset_test = split(labeled_small if args.use_small else labeled ,
SKI_AREA_TEST= 'Klausberg',
SEASON_TEST_SKIAREA = 'Kronplatz',
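The to_remove mapping pickled above is what later allows unseen data to be condensed consistently with the training set. A minimal sketch of the intended inference-time flow, using the prepare_new_data helper added later in this commit (new_df and the call site are illustrative assumptions, not part of the commit):

import pickle
# load the class-condensation mapping saved during training
with open('to_remove.pkl', 'rb') as f:
    to_remove = pickle.load(f)
# new_df: a fresh dataframe with the same raw columns used at training time
dataset_p, dataset_p_small = prepare_new_data(new_df, to_remove)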

View File

@@ -7,8 +7,17 @@ import pandas as pd
def objective(trial,dataset:Dataset,num_boost_round:int):
def objective(trial,dataset:Dataset,num_boost_round:int)->float:
"""function to maximize during the tuning phase
Args:
trial (optuna.trial.Trial): Optuna trial object used to suggest hyperparameters
dataset (Dataset): dataset to use (containing train and validation)
num_boost_round (int): number of boosting iterations for xgboost
Returns:
float: validation MCC
"""
#These are the parameters usually used
params = dict(
learning_rate = trial.suggest_float("learning_rate", 0.01, 0.2),
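For context, the validation MCC returned at the end of objective is presumably computed along these lines (a sketch: matthews_corrcoef as the metric and the X_validation/y_validation attribute names of Dataset are assumptions):

from sklearn.metrics import matthews_corrcoef  # assumed metric implementation
# train a booster with the suggested params and score it on the validation split
dtrain = xgb.DMatrix(dataset.X_train, label=dataset.y_train, enable_categorical=True)
dval = xgb.DMatrix(dataset.X_validation, enable_categorical=True)
bst = xgb.train(params, dtrain, num_boost_round=num_boost_round)
pred = bst.predict(dval)
# binary case shown; a multiclass objective would take the argmax over class probabilities
mcc = matthews_corrcoef(dataset.y_validation, (pred > 0.5).astype(int))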
@@ -44,8 +53,20 @@ def objective(trial,dataset:Dataset,num_boost_round:int):
return mcc
def train(dataset,n_trials=1000,timeout=600,num_boost_round=600):
def train(dataset:Dataset,n_trials:int=1000,timeout:int=600,num_boost_round:int=600)->(xgb.Booster, dict):
"""optuna search procedure
Args:
dataset (Dataset): dataset to use (containing train and validation)
n_trials (int, optional): number of parameter combinations to try. Defaults to 1000.
timeout (int, optional): maximum time before stopping. Defaults to 600.
num_boost_round (int, optional): number of boosting iterations of a single model. Defaults to 600.
Returns:
the trained xgboost Booster and a dictionary containing the best parameters
"""
study = optuna.create_study(direction="maximize")
study.optimize(lambda trial: objective(trial,dataset,num_boost_round), n_trials=n_trials, timeout=timeout)
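A usage sketch of the resulting entry point (the trial budget below is illustrative; the defaults are n_trials=1000 and timeout=600):

# run the Optuna search and get back the refitted booster plus its best parameters
bst, params_final = train(dataset, n_trials=200, timeout=600, num_boost_round=600)
print(params_final)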
@@ -67,7 +88,18 @@ def train(dataset,n_trials=1000,timeout=600,num_boost_round=600):
return bst,params_final
def gain_accuracy_train(dataset:Dataset,feat_imp:pd.DataFrame,num_boost_round:int,params:dict):
def gain_accuracy_train(dataset:Dataset,feat_imp:pd.DataFrame,num_boost_round:int=600,params:dict={})->(pd.DataFrame,xgb.Booster,int):
"""Starting from the most important feature, add one feature, train the model and get mcc and acc on the validation
Args:
dataset (Dataset): dataset to use (containing train and validation)
feat_imp (pd.DataFrame): feature importance dataset computed using feat_imp = pd.Series(best_model.get_fscore()).sort_values(ascending=False)
num_boost_round (int): number of boosting iterations of a single model. Defaults to 600.
params (dict): dictionary of best parameters returned from the function `train`
Returns:
dataframe with ACC and MCC for each number N of features used, plus the best booster and the corresponding N
"""
tot = []
for i in range(1,dataset.X_train.shape[1]):
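Following the recipe in the docstring, the feature-importance table and the incremental evaluation could be chained as follows (a sketch; results, best_bst and n_features are illustrative names for the three returned values):

# feature importance from the tuned booster, as suggested in the docstring above
feat_imp = pd.Series(bst.get_fscore()).sort_values(ascending=False)
# add one feature at a time (most important first), recording validation ACC/MCC
results, best_bst, n_features = gain_accuracy_train(dataset, feat_imp,
                                                    num_boost_round=600,
                                                    params=params_final)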

View File

@@ -22,10 +22,62 @@ class Dataset_test:
y_test_area:Union[pd.Series,None]
X_test_season:Union[pd.DataFrame,None]
y_test_season:Union[pd.Series,None]
def prepare_new_data(dataset:pd.DataFrame,to_remove:dict)->(pd.DataFrame,pd.DataFrame):
"""prepare new data for prediction. MUST BE SIMILAR TO retrive_data. Maybe it can use directly inside it...
Args:
dataset (pd.DataFrame): dataset on which to run inference
to_remove (dict): columns to aggregate
Returns:
two pandas dataframes: the original one and a copy with the condensed classes.
"""
dataset_p = dataset.copy()
dataset_p.drop(columns=['dateandtime','skiarea_id','day_of_year','minute_of_day','year'], inplace=True)
## evacuation_vehicles must be expanded into explicit boolean columns
ev = set({})
for i,row in dataset_p.iterrows():
ev = ev.union(set(row.evacuation_vehicles))
for c in ev:
dataset_p[c] = False
for i,row in dataset_p.iterrows():
for c in row.evacuation_vehicles:
dataset_p.loc[i,c] = True
dataset_p.drop(columns=['town','province','evacuation_vehicles'],inplace=True)
dataset_p['age'] = dataset_p['age'].astype(np.float32).fillna(np.nan)
dataset_p_small = dataset_p.copy()
for c in to_remove.keys():
for k in to_remove[c]:
dataset_p_small.loc[dataset_p[c]==k,c] = 'other'
for c in dataset_p.columns:
if c not in ['age','season','skiarea_name']:
dataset_p_small[c] = dataset_p_small[c].fillna('None').astype('category')
dataset_p[c] = dataset_p[c].fillna('None').astype('category')
dataset_p.dropna(inplace=True)
dataset_p_small.dropna(inplace=True)
return dataset_p,dataset_p_small
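Design note: the two row-by-row loops above that expand evacuation_vehicles into boolean columns scale poorly on large frames; a vectorized equivalent (a sketch, assuming the column holds list-like values per row) would be:

# explode the list column, one-hot encode it, and collapse back to one row per record
ev_dummies = (dataset_p['evacuation_vehicles']
              .explode()
              .pipe(pd.get_dummies)
              .groupby(level=0)
              .max()
              .reindex(dataset_p.index, fill_value=False))
dataset_p = dataset_p.join(ev_dummies)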
def retrive_data(reload_data:bool,threshold_under_represented:float,path:str):
def retrive_data(reload_data:bool,threshold_under_represented:float,path:str)->(pd.DataFrame,pd.DataFrame,dict):
"""Get data
Args:
reload_data (bool): if true, the procedure will download the data from the db
threshold_under_represented (float): classes with too few samples are condensed into the class `other`
path (str): path where the data are saved
Returns:
two pandas dataframes (the original one and one with condensed classes) and a dictionary of the condensed classes
"""
if reload_data:
engine = pg.connect("dbname='safeidx' user='fbk_mpba' host='172.104.247.67' port='5432' password='fbk2024$'")
df = pd.read_sql('select * from fbk_export_20240212', con=engine)
@@ -85,7 +137,7 @@ def retrive_data(reload_data:bool,threshold_under_represented:float,path:str):
labeled.india = labeled.india.apply(lambda x: x.replace('i','')).astype(int)
labeled_small.india = labeled_small.india.apply(lambda x: x.replace('i','')).astype(int)
return labeled,labeled_small
return labeled,labeled_small,to_remove
@@ -94,8 +146,24 @@ def split(labeled:pd.DataFrame,
SEASON_TEST_SKIAREA:str = 'Kronplatz',
SEASON_TEST_YEAR:int = 2023,
use_smote:bool = False,
weight_type:str = 'sqrt' ):
weight_type:str = 'sqrt' )->(Dataset, Dataset_test):
"""Split the dataset into train,validation test. From the initial dataset we remove a single skiarea (SKI_AREA_TEST)
generating the first test set. Then we select a skieare and a starting season (SEASON_TEST_SKIAREA,SEASON_TEST_YEAR)
and generate the seconda test set. The rest of the data are splitted 66-33 stratified on the target column (india).
It is possible to specify the weight of eact sample. There are two strategies implemented: using the sum or the square root
of the sum. This is used for mitigating the class umbalance. Another alternative is to use an oversampling procedure (use_smote)
Args:
labeled (pd.DataFrame): dataset
SKI_AREA_TEST (str, optional): skiarea removed from the training data and used as the first test set. Defaults to 'Klausberg'.
SEASON_TEST_SKIAREA (str, optional): skiarea to remove from the dataset if the season is greater than SEASON_TEST_YEAR. Defaults to 'Kronplatz'.
SEASON_TEST_YEAR (int, optional): see SEASON_TEST_SKIAREA . Defaults to 2023.
use_smote (bool, optional): use oversampling to handle class imbalance. Defaults to False.
weight_type (str, optional): routine for weighting the error on the samples. Defaults to 'sqrt'.
Returns:
training-validation dataset and test dataset
"""
test_area = labeled[labeled.skiarea_name==SKI_AREA_TEST]
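Per the docstring, the second test set holds out SEASON_TEST_SKIAREA from SEASON_TEST_YEAR onward; it would be carved out roughly like this (a sketch; test_season and the exact comparison are assumptions):

# hold out one ski area starting from a given season as the second test set
test_season = labeled[(labeled.skiarea_name == SEASON_TEST_SKIAREA) &
                      (labeled.season > SEASON_TEST_YEAR)]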
@@ -116,7 +184,6 @@ def split(labeled:pd.DataFrame,
from imblearn.over_sampling import RandomOverSampler
sm = RandomOverSampler()
X_train_smote,y_train_smote = sm.fit_resample(X_train,y_train)
X_train,y_train = sm.fit_resample(X_train,y_train)
## compute the weights for the unbalanced dataset
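The weighting code referenced by this comment lies outside the hunk; a hedged sketch of what the 'sqrt' strategy described in the split docstring might compute (class_weight and sample_weight are illustrative names):

# per-class weights from inverse class frequency, square-rooted to soften the correction
counts = y_train.value_counts()
class_weight = np.sqrt(len(y_train) / counts) if weight_type == 'sqrt' else len(y_train) / counts
# map each training sample to the weight of its class
sample_weight = y_train.map(class_weight)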