ag update readme and added documentation
@@ -10,8 +10,10 @@ def main(args):
-    labeled,labeled_small = retrive_data(reload_data=args.reload_data,threshold_under_represented=0.5,path='/home/agobbi/Projects/PID/datanalytics/PID/src')
+    labeled,labeled_small,to_remove = retrive_data(reload_data=args.reload_data,threshold_under_represented=0.5,path='/home/agobbi/Projects/PID/datanalytics/PID/src')
+    with open('to_remove.pkl','wb') as f:
+        pickle.dump(to_remove,f)
 
     dataset,dataset_test = split(labeled_small if args.use_small else labeled ,
                                  SKI_AREA_TEST= 'Klausberg',
                                  SEASON_TEST_SKIAREA = 'Kronplatz',
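For orientation, a minimal sketch of how the pickled to_remove could be reloaded at inference time together with the new prepare_new_data helper added in src/utils.py; the CSV path and the import path are placeholders, not part of this commit:

    import pickle
    import pandas as pd
    from utils import prepare_new_data  # assumes src/ is on sys.path (hypothetical)

    # reload the aggregation map written by main.py
    with open('to_remove.pkl', 'rb') as f:
        to_remove = pickle.load(f)

    # hypothetical new export with the same raw columns used during training
    new_data = pd.read_csv('new_accidents.csv')
    dataset_p, dataset_p_small = prepare_new_data(new_data, to_remove)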
src/model.py
@@ -7,8 +7,17 @@ import pandas as pd
-def objective(trial,dataset:Dataset,num_boost_round:int):
+def objective(trial,dataset:Dataset,num_boost_round:int)->float:
+    """Function to maximize during the tuning phase.
+    Args:
+        trial (optuna.trial.Trial): optuna trial used to sample the hyperparameters
+        dataset (Dataset): dataset to use (containing train and validation)
+        num_boost_round (int): number of boosting iterations for xgboost
+    Returns:
+        float: validation MCC
+    """
     #These are the parameters usually used
     params = dict(
         learning_rate = trial.suggest_float("learning_rate", 0.01, 0.2),
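The value returned by objective is the Matthews correlation coefficient on the validation split. As a reminder of what that metric looks like (generic scikit-learn usage, not code from this repo):

    from sklearn.metrics import matthews_corrcoef

    y_true = [1, 0, 1, 1, 0, 1]
    y_pred = [1, 0, 0, 1, 0, 1]
    print(matthews_corrcoef(y_true, y_pred))  # prints roughly 0.71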
@@ -44,8 +53,20 @@ def objective(trial,dataset:Dataset,num_boost_round:int):
     return mcc
 
-def train(dataset,n_trials=1000,timeout=600,num_boost_round=600):
+def train(dataset:Dataset,n_trials:int=1000,timeout:int=600,num_boost_round:int=600)->(xgb.Booster, dict):
+    """Optuna search procedure.
+
+    Args:
+        dataset (Dataset): dataset to use (containing train and validation)
+        n_trials (int, optional): number of combinations to try. Defaults to 1000.
+        timeout (int, optional): maximum time (seconds) before stopping. Defaults to 600.
+        num_boost_round (int, optional): number of boosting iterations of a single model. Defaults to 600.
+
+    Returns:
+        the trained xgboost model and a dictionary containing the best parameters
+    """
     study = optuna.create_study(direction="maximize")
     study.optimize(lambda trial: objective(trial,dataset,num_boost_round), n_trials=n_trials, timeout=timeout)
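A hedged usage sketch for the new train signature (assumes a Dataset instance produced by split in src/utils.py; the model file name is illustrative):

    bst, params_final = train(dataset, n_trials=200, timeout=600, num_boost_round=600)
    print(params_final)           # best hyperparameters found by optuna
    bst.save_model('model.json')  # bst is an xgb.Booster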
@@ -67,7 +88,18 @@ def train(dataset,n_trials=1000,timeout=600,num_boost_round=600):
     return bst,params_final
 
-def gain_accuracy_train(dataset:Dataset,feat_imp:pd.DataFrame,num_boost_round:int,params:dict):
+def gain_accuracy_train(dataset:Dataset,feat_imp:pd.DataFrame,num_boost_round:int=600,params:dict={})->(pd.DataFrame,xgb.Booster,int):
+    """Starting from the most important feature, add one feature at a time, train the model and compute MCC and ACC on the validation set.
+
+    Args:
+        dataset (Dataset): dataset to use (containing train and validation)
+        feat_imp (pd.DataFrame): feature importance computed as feat_imp = pd.Series(best_model.get_fscore()).sort_values(ascending=False)
+        num_boost_round (int): number of boosting iterations of a single model. Defaults to 600.
+        params (dict): dictionary of best parameters returned by the function `train`
+
+    Returns:
+        dataframe with the number of variables, ACC and MCC for each N
+    """
     tot = []
     for i in range(1,dataset.X_train.shape[1]):
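A hedged sketch chaining train and gain_accuracy_train, using the feat_imp recipe quoted in the docstring; the names results, best_model and best_n are illustrative readings of the annotated return types, not names from this repo:

    import pandas as pd

    feat_imp = pd.Series(bst.get_fscore()).sort_values(ascending=False)
    results, best_model, best_n = gain_accuracy_train(dataset, feat_imp,
                                                      num_boost_round=600, params=params_final)
    print(results)  # one row per number of features, with ACC and MCC on the validation set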
src/utils.py
@@ -22,10 +22,62 @@ class Dataset_test:
     y_test_area:Union[pd.Series,None]
     X_test_season:Union[pd.DataFrame,None]
     y_test_season:Union[pd.Series,None]
 
+def prepare_new_data(dataset:pd.DataFrame,to_remove:dict)->(pd.DataFrame,pd.DataFrame):
+    """Prepare new data for prediction. MUST BE KEPT CONSISTENT WITH retrive_data; maybe it could be used directly inside it.
+
+    Args:
+        dataset (pd.DataFrame): dataset to run inference on
+        to_remove (dict): columns whose under-represented values are aggregated
+
+    Returns:
+        two pandas dataframes: the original one and one with condensed classes.
+    """
+    dataset_p = dataset.copy()
+    dataset_p.drop(columns=['dateandtime','skiarea_id','day_of_year','minute_of_day','year'], inplace=True)
+
+    ## evacuation_vehicles must be expanded into explicit boolean columns
+    ev = set({})
+    for i,row in dataset_p.iterrows():
+        ev = ev.union(set(row.evacuation_vehicles))
+    for c in ev:
+        dataset_p[c] = False
+    for i,row in dataset_p.iterrows():
+        for c in row.evacuation_vehicles:
+            dataset_p.loc[i,c] = True
+    dataset_p.drop(columns=['town','province','evacuation_vehicles'],inplace=True)
+
+    dataset_p['age'] = dataset_p['age'].astype(np.float32).fillna(np.nan)
+
+    dataset_p_small = dataset_p.copy()
+
+    for c in to_remove.keys():
+        for k in to_remove[c]:
+            dataset_p_small.loc[dataset_p[c]==k,c] = 'other'
+    for c in dataset_p.columns:
+        if c not in ['age','season','skiarea_name']:
+            dataset_p_small[c] = dataset_p_small[c].fillna('None').astype('category')
+            dataset_p[c] = dataset_p[c].fillna('None').astype('category')
+    dataset_p.dropna(inplace=True)
+    dataset_p_small.dropna(inplace=True)
+
+    return dataset_p,dataset_p_small
+
+
-def retrive_data(reload_data:bool,threshold_under_represented:float,path:str):
+def retrive_data(reload_data:bool,threshold_under_represented:float,path:str)->(pd.DataFrame,pd.DataFrame,dict):
+    """Get the data.
+
+    Args:
+        reload_data (bool): if True, the procedure will download the data from the db
+        threshold_under_represented (float): classes with few representatives are condensed into the class `other`
+        path (str): path where the data are saved
+
+    Returns:
+        two pandas dataframes (the original one and one with condensed classes) and a dictionary of the condensed classes
+    """
     if reload_data:
         engine = pg.connect("dbname='safeidx' user='fbk_mpba' host='172.104.247.67' port='5432' password='fbk2024$'")
         df = pd.read_sql('select * from fbk_export_20240212', con=engine)
@@ -85,7 +137,7 @@ def retrive_data(reload_data:bool,threshold_under_represented:float,path:str):
     labeled.india = labeled.india.apply(lambda x: x.replace('i','')).astype(int)
     labeled_small.india = labeled_small.india.apply(lambda x: x.replace('i','')).astype(int)
 
-    return labeled,labeled_small
+    return labeled,labeled_small,to_remove
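The to_remove mapping returned here (and consumed by prepare_new_data above) appears to map each categorical column to the list of under-represented values collapsed into 'other'; the column names and values below are purely hypothetical:

    to_remove = {
        'injury_type': ['avalanche', 'collision_with_object'],  # hypothetical values
        'rescue_type': ['helicopter_night'],
    }
    # applied as in prepare_new_data:
    # dataset_p_small.loc[dataset_p[c] == k, c] = 'other'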
@@ -94,8 +146,24 @@ def split(labeled:pd.DataFrame,
           SEASON_TEST_SKIAREA:str = 'Kronplatz',
           SEASON_TEST_YEAR:int = 2023,
           use_smote:bool = False,
-          weight_type:str = 'sqrt' ):
+          weight_type:str = 'sqrt' )->(Dataset, Dataset_test):
+    """Split the dataset into train, validation and test. From the initial dataset we remove a single skiarea (SKI_AREA_TEST),
+    generating the first test set. Then we select a skiarea and a starting season (SEASON_TEST_SKIAREA, SEASON_TEST_YEAR)
+    and generate the second test set. The rest of the data is split 66-33, stratified on the target column (india).
+    It is possible to specify the weight of each sample; two strategies are implemented, using the sum or the square root
+    of the sum. This is used to mitigate the class imbalance. An alternative is to use an oversampling procedure (use_smote).
+
+    Args:
+        labeled (pd.DataFrame): dataset
+        SKI_AREA_TEST (str, optional): skiarea to remove from the train set and use as test. Defaults to 'Klausberg'.
+        SEASON_TEST_SKIAREA (str, optional): skiarea to remove from the dataset if the season is greater than SEASON_TEST_YEAR. Defaults to 'Kronplatz'.
+        SEASON_TEST_YEAR (int, optional): see SEASON_TEST_SKIAREA. Defaults to 2023.
+        use_smote (bool, optional): use oversampling to counter the class imbalance. Defaults to False.
+        weight_type (str, optional): strategy for weighting the error on the samples. Defaults to 'sqrt'.
+
+    Returns:
+        training-validation dataset and test dataset
+    """
 
     test_area = labeled[labeled.skiarea_name==SKI_AREA_TEST]
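A hedged usage sketch of split with the documented defaults, mirroring the call already shown in src/main.py:

    dataset, dataset_test = split(labeled_small,
                                  SKI_AREA_TEST='Klausberg',
                                  SEASON_TEST_SKIAREA='Kronplatz',
                                  SEASON_TEST_YEAR=2023,
                                  use_smote=False,
                                  weight_type='sqrt')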
@@ -116,7 +184,6 @@ def split(labeled:pd.DataFrame,
         from imblearn.over_sampling import RandomOverSampler
 
         sm = RandomOverSampler()
-        X_train_smote,y_train_smote = sm.fit_resample(X_train,y_train)
+        X_train,y_train = sm.fit_resample(X_train,y_train)
 
     ##computed the weights for unbalanced dataset
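One plausible reading of the two weighting strategies named in the docstring ('sum' vs 'sqrt'), shown on a toy target; this is a sketch under that assumption, not the repo's actual implementation:

    import numpy as np
    import pandas as pd

    y_train = pd.Series([0, 0, 0, 0, 0, 0, 1, 1, 2])  # imbalanced toy target
    weight_type = 'sqrt'

    counts = y_train.value_counts()
    if weight_type == 'sqrt':
        weights = y_train.map(lambda c: 1.0 / np.sqrt(counts[c]))  # inverse sqrt of class frequency
    else:
        weights = y_train.map(lambda c: 1.0 / counts[c])           # inverse class frequency
    print(weights.round(3).tolist())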