Module ml_utils.feature_selection_utils
import pandas as pd
import numpy as np
import pickle
from tqdm.notebook import tqdm as tqdm_notebook  # tqdm.tqdm_notebook is deprecated
import matplotlib.pyplot as plt
import os
from ml_utils.plot_utils import plot_scatter, get_subplot_rows_cols
def covariate_shift(train, test, categorical_columns, n_samples, iterations=200, weights_coef=1, AUC_threshold=0.8, importance_threshold=0.9, max_loops=20, test_size=0.1, trys_all_influencer=5, calc_sample_weights=True, task_type="CPU", data_dir='', load_cov=False, save_cov=False, plot=True):
    """ Iteratively select the features that show no covariate shift between training and test set, using a CatBoostClassifier to distinguish whether a sample comes from train or test """
import seaborn as sns
import catboost as cb
from sklearn.model_selection import train_test_split
if not os.path.exists(data_dir + 'cov_shift_features.pkl') or not load_cov:
        train_sample = train.sample(n_samples).copy()  # copy to avoid modifying the caller's DataFrame
        train_sample.loc[:, 'origin'] = 0
        test_sample = test.sample(n_samples).copy()
        test_sample.loc[:, 'origin'] = 1
combined_train, combined_test = train_test_split(
pd.concat([train_sample.reset_index(drop=True), test_sample.reset_index(drop=True)]),
test_size = test_size,
shuffle = True)
try:
influence_columns = []
count_all_influencer = 0
i = 0
AUC_score = 1
while i < max_loops and AUC_score > AUC_threshold:
x_columns = combined_train.columns.drop(['origin',] + influence_columns)
                # Get the indexes of the categorical columns, which CatBoost needs to handle them natively
cat_features_index = [list(x_columns).index(col) for col in categorical_columns if col in list(x_columns)]
# Do the feature selection once and only try again if no feature is selected
cov_shift_feature_selection = []
while len(cov_shift_feature_selection) == 0 and count_all_influencer < trys_all_influencer:
if count_all_influencer > 0:
print("Try again because model has set any feature as influencer")
cov_shift_model = cb.CatBoostClassifier(iterations = iterations,
eval_metric = "AUC",
cat_features = cat_features_index,
task_type = task_type,
verbose = False
)
cov_shift_feature_selection, df_cov_shift_feature_selection = shadow_feature_selection(
cov_shift_model,
combined_train['origin'], combined_train[x_columns],
need_cat_features_index=True, categorical_columns=categorical_columns,
collinear_threshold = 1,
n_iterations_mean = 1, times_no_change_features = 1
)
count_all_influencer += 1
if count_all_influencer == trys_all_influencer:
cov_shift_feature_selection = list(x_columns)
                # Get the indexes of the categorical columns, which CatBoost needs to handle them natively
cat_features_index = [cov_shift_feature_selection.index(col) for col in categorical_columns if col in cov_shift_feature_selection]
params = {'iterations' : 2*iterations, 'learning_rate' : 0.05, 'depth' : 6}
cov_shift_model = cb.CatBoostClassifier(iterations = iterations,
eval_metric = "AUC",
cat_features = cat_features_index,
scale_pos_weight = combined_train['origin'].value_counts()[0] / combined_train['origin'].value_counts()[1],
task_type = task_type,
verbose = False
)
cov_shift_model.set_params(**params)
cov_shift_model.fit(combined_train.drop('origin', axis = 1)[cov_shift_feature_selection],
combined_train['origin'],
eval_set = (combined_test.drop('origin', axis = 1)[cov_shift_feature_selection], combined_test['origin']),
use_best_model = True,
#sample_weight = sample_weight,
#early_stopping_rounds = True,
plot = False,
verbose = False)
AUC_score = cov_shift_model.get_best_score()['validation']['AUC']
print(f"Model score AUC of {AUC_score} on test")
                # Remove the features whose cumulative importance is most relevant for predicting the origin of the data (train or test)
if count_all_influencer != trys_all_influencer:
df_cov_shift_importance = pd.DataFrame(cov_shift_model.feature_importances_, columns = ['importance'], index = cov_shift_feature_selection)
df_cov_shift_importance['cumulative_importance'] = df_cov_shift_importance['importance'].cumsum() / df_cov_shift_importance['importance'].sum()
new_influence_columns = list(df_cov_shift_importance[df_cov_shift_importance['cumulative_importance'] < importance_threshold].index)
influence_columns = influence_columns + new_influence_columns
print(f"New {len(new_influence_columns)} columns will be removed from model: ", new_influence_columns)
print()
count_all_influencer = 0
i = i + 1
finally:
print()
print(f"Due to difference of influence of features to distinguish between data and submission, {len(influence_columns)} columns are removed:")
print(influence_columns)
if calc_sample_weights:
print("Calculating weights for each training sample")
                probs = cov_shift_model.predict_proba(train[cov_shift_model.feature_names_])[:, 1]  # probability of each sample belonging to the test set
                #print("Plot Train AUC")
                #plot_roc_auc(pd.Series(1, index=train.index), probs)
sample_weight = -np.log(probs)
sample_weight /= max(sample_weight) # Normalizing the weights
sample_weight = 1 + weights_coef * sample_weight
if plot:
plt.xlabel('Computed sample weight')
plt.ylabel('# Samples')
                    sns.histplot(sample_weight, kde=False)  # histplot supersedes the deprecated distplot
if save_cov:
with open(data_dir + 'cov_shift_features.pkl', 'wb') as file:
print("Saving data in ", data_dir + 'cov_shift_features.pkl')
pickle.dump(influence_columns, file)
else:
print("Loading influence columns from ",data_dir)
with open(data_dir + 'cov_shift_features.pkl', 'rb') as file:
influence_columns = pickle.load(file)
cov_shift_model = None
sample_weight = [1,] * len(train)
return influence_columns, cov_shift_model, sample_weight
def stadistic_difference_distributions(data, submission, time_column, test_percentage=0.2, p_value_threshold=None,
                                       verbose=False):
    """ Compare, for each column, the distributions of the train split, the test split (initial vs. final part of the dataset), and the submission set, using the two-sample Kolmogorov-Smirnov test """
from scipy import stats
from sklearn.model_selection import train_test_split
train, test = train_test_split(data.sort_values(time_column), test_size=test_percentage, shuffle=False)
time_analysis_df = pd.DataFrame(False, columns=['train_test', 'train_submission', 'test_submission'],
index=submission.columns.values)
for col in tqdm_notebook(submission.columns.values):
try:
KS_stat_test, p_value_test = stats.ks_2samp(train[col], test[col])
KS_stat_submission, p_value_submission = stats.ks_2samp(train[col], submission[col])
KS_stat_test_submission, p_value_test_submission = stats.ks_2samp(test[col], submission[col])
time_analysis_df.loc[col] = [p_value_test, p_value_submission, p_value_test_submission]
            if verbose and p_value_threshold is not None:
                if p_value_test <= p_value_threshold or p_value_submission <= p_value_threshold or p_value_test_submission <= p_value_threshold:
print_s = f'Column {col} has different distribution'
if p_value_test <= p_value_threshold:
print_s = print_s + ' // train <--> test'
if p_value_submission <= p_value_threshold:
print_s = print_s + ' // train <--> submission'
if p_value_test_submission <= p_value_threshold:
print_s = print_s + ' // test <--> submission'
print(print_s)
except TypeError:
time_analysis_df.loc[col] = [np.nan, np.nan, np.nan]
    if p_value_threshold is None:
cond1 = time_analysis_df['train_test'] == 0
cond2 = time_analysis_df['train_submission'] == 0
cond3 = time_analysis_df['test_submission'] == 0
else:
cond1 = time_analysis_df['train_test'] <= p_value_threshold
cond2 = time_analysis_df['train_submission'] <= p_value_threshold
cond3 = time_analysis_df['test_submission'] <= p_value_threshold
cols_to_remove = list(time_analysis_df[cond1 | cond2 | cond3].index)
return time_analysis_df, cols_to_remove
def outliers_analysis(full_data, features_names=None, x_column=None, subplot_rows=None, subplot_cols=None, starting_index=0,
index_offset=0, z_score_threshold=3.5, use_mean=False, plot=True, num_bins=50):
""" Calculate and visualize outliers analysis from Modified Z-score with MAD """
# Compatibility with numpy arrays
    if isinstance(full_data, np.ndarray):
assert len(full_data.shape) <= 2
if len(full_data.shape) == 1:
columns = ['feature']
else:
columns = ['feature_'+str(i) for i in range(full_data.shape[-1])]
full_data = pd.DataFrame(full_data, columns=columns)
# Features not provided, use all the columns
if features_names is None:
features_names = list(full_data.columns)
if plot:
# Set a good relation rows/cols for the plot if not specified
if subplot_rows is None or subplot_cols is None:
subplot_rows, subplot_cols = get_subplot_rows_cols(len(features_names), [3,4,5])
# Resize for better visualization of subplots
plt.rcParams['figure.figsize'] = [subplot_cols * 5, subplot_rows * 4]
fig, axes = plt.subplots(subplot_rows, subplot_cols, sharex=False, sharey=False)
outliers_pd = full_data.copy()
outliers_summary = {}
i = starting_index
while i < len(features_names):
feature_name = features_names[i]
data = outliers_pd.loc[outliers_pd[feature_name].notnull(), feature_name]
# Modified Z-score with MAD (Median Absolute Deviation)
if use_mean:
outliers_pd.loc[outliers_pd[feature_name].notnull(), feature_name + '_zscore'] = 0.6745 * (data - data.mean()).abs() / (
data - data.mean()).abs().mean()
else:
outliers_pd.loc[outliers_pd[feature_name].notnull(), feature_name + '_zscore'] = 0.6745 * (data - data.median()).abs() / (
data - data.median()).abs().median()
outliers_pd[feature_name + '_zscore_outliers'] = outliers_pd[feature_name + '_zscore'] > z_score_threshold
if plot:
# Take into account the case of only one plot
if subplot_rows * subplot_cols == 1:
ax = axes
elif subplot_rows == 1:
ax = axes[(i + index_offset) % subplot_cols]
else:
ax = axes[(i + index_offset) // subplot_cols, (i + index_offset) % subplot_cols]
# If X_column provided plot scatter, otherwise histogram
if x_column is None:
bins = np.linspace(data.min(), data.max(), num_bins)
ax.hist(data[~outliers_pd[feature_name + '_zscore_outliers']], bins=bins, density=False)
ax.hist(data[outliers_pd[feature_name + '_zscore_outliers']], bins=bins, density=False)
ax.set_title(feature_name)
else:
plot_scatter(outliers_pd[outliers_pd[feature_name].notnull()], x_column=x_column, y_column=feature_name,
axes=ax, highlight_column=feature_name + '_zscore_outliers')
outliers_percentage = 100 * outliers_pd[feature_name + '_zscore_outliers'].sum() / outliers_pd[
feature_name + '_zscore_outliers'].count()
outliers_summary[feature_name] = outliers_percentage
print("Feature: ", feature_name, " - Percentage of outliers using modified Z-score approach is: ",
np.round(outliers_percentage, 2), "%")
i = i + 1
if plot:
fig.tight_layout()
# Resize to original settings
plt.rcParams['figure.figsize'] = [10, 6]
outliers_summary = pd.DataFrame.from_dict(outliers_summary, orient='index', columns=['Percentage'])
return outliers_summary, outliers_pd
def feature_selection(classifier_initial, y_train, x_train, n_top_features=50, baseline_features=[],
min_importance=None):
""" Select features which have the top N feature importance and/or above baseline """
    classifier_model = classifier_initial.fit(x_train, y_train)
    feature_importance = sorted(zip(map(lambda x: round(x, 4), classifier_model.feature_importances_), x_train),
                                reverse=True)
    dict_feature_importance = dict(zip(x_train, map(lambda x: round(x, 4), classifier_model.feature_importances_)))
if baseline_features:
min_importance = max([importance for importance, feature in feature_importance if feature in baseline_features])
    if min_importance is None:
        min_importance = 0  # keep any feature with positive importance when no baseline or floor is given
    model_columns = []
    i = 0
    while i < n_top_features and i < len(feature_importance):
        if feature_importance[i][0] > min_importance:
model_columns.append(feature_importance[i][1])
else:
break
i = i + 1
return model_columns
def cumulative_feature_selection(df_feature_importance, cum_importance_threshold):
""" Select features which are below of the cumulative feature importance threshold """
df_feature_importance = pd.DataFrame(df_feature_importance, columns=['importance'])
df_feature_importance['cumulative_importance'] = df_feature_importance['importance'].cumsum() / \
df_feature_importance['importance'].sum()
print("Removed ", sum(df_feature_importance['cumulative_importance'] >= cum_importance_threshold),
" features due to low importance:")
print(
df_feature_importance[df_feature_importance['cumulative_importance'] >= cum_importance_threshold].index.values)
df_feature_importance = df_feature_importance[
df_feature_importance['cumulative_importance'] < cum_importance_threshold]
return df_feature_importance
def collinear_feature_selection(x_train, df_feature_importance, collinear_threshold=0.98, plot=True):
""" Select features which have less collinearity below the threshold """
correlation = x_train[df_feature_importance.index].corr()
if plot:
        from IPython.display import display
        display(correlation.round(3).style.background_gradient(cmap='coolwarm'))  # render the styled correlation matrix in a notebook
cond1 = pd.DataFrame(np.triu(np.ones(correlation.shape[0]) - np.eye(correlation.shape[0])),
columns=correlation.columns, index=correlation.index) == 1
corr_final = (correlation > collinear_threshold) & cond1
corr_final = corr_final.loc[:, corr_final.any()]
features_remove = []
columns = corr_final.columns.values
rows = corr_final.index.values
for i in tqdm_notebook(range(corr_final.shape[1]), desc='1st Loop'):
# If a feature is already on the remove list, then it is not needed to check
if columns[i] in features_remove:
continue
        j_max = np.where(rows == columns[i])[0][0]  # row position of this column's diagonal entry
        for j in tqdm_notebook(range(corr_final.shape[0]), desc='2nd Loop', leave=False):
            if j == j_max:
                break
# Feature columns[i] and feature rows[j] are collinear
if corr_final.iloc[j, i]:
# If a feature is already on the remove list, then it is not needed to check
if rows[j] in features_remove:
continue
# Remove the one which has less importance
importance_i = df_feature_importance.loc[columns[i], 'importance']
importance_j = df_feature_importance.loc[rows[j], 'importance']
                if importance_i < importance_j:
                    features_remove.append(columns[i])
                else:
                    features_remove.append(rows[j])
print("Removed ", len(features_remove), " features due to collinearity: ")
print(features_remove)
df_feature_importance = df_feature_importance.drop(features_remove)
return df_feature_importance
def shadow_feature_selection(classifier_initial, y_train, x_train, eval_set=None, n_top_features=None, collinear_threshold=0.98, cum_importance_threshold=0.99, times_no_change_features=2, max_loops=50, n_iterations_mean=3, need_cat_features_index=False, categorical_columns=[], verbose=True, debug=False, plot_correlation=False):
""" Select features which have greater feature importance than random and than shuffle shadowed feature of itself """
# Create 3 random features which will serve as baseline to reject features
baseline_features = ['random_binary', 'random_uniform', 'random_integers']
x_train = x_train.drop(baseline_features, axis=1, errors='ignore')
x_train['random_binary'] = np.random.choice([0, 1], x_train.shape[0])
x_train['random_uniform'] = np.random.uniform(0, 1, x_train.shape[0])
x_train['random_integers'] = np.random.randint(0, x_train.shape[0] / 2, x_train.shape[0])
# For each feature it creates a shadow_feature which will have same values but shuffled
x_train_shadow, dict_shadow_names = _create_shadow(x_train, baseline_features)
count_no_changes = 0
x_all_columns = baseline_features + list(dict_shadow_names.keys()) + list(dict_shadow_names.values())
# "Infinite" loop till one of the stopping criterias stop removing features
for i in tqdm_notebook(range(max_loops), desc='Main Loop'):
if dict_shadow_names == {}:
print("Stopping because no feature found to be relevant")
return [], pd.DataFrame([])
print("Loop number: ", i, " with still ", len(dict_shadow_names.keys()), " features")
# Take a copy of current columns to check stopping criteria of changing columns
x_all_columns_prev = x_all_columns.copy()
x_all_columns_prev.sort()
if debug:
print("x_all_columns_prev: ", x_all_columns_prev)
# Get the feature importance for each column (real, shadow and baseline)
df_feature_importance = get_feature_importance_mean(classifier_initial, y_train, x_train_shadow[x_all_columns],
n_iterations_mean=n_iterations_mean,
need_cat_features_index=need_cat_features_index,
categorical_columns=categorical_columns,
dict_shadow_names=dict_shadow_names,
eval_set=eval_set)
        # Use the greatest importance among the baseline features as the minimum acceptable importance
        if baseline_features:
min_importance = df_feature_importance[baseline_features].max()
else:
min_importance = 0
# Drop all features that have lower importance than their shadow or lower than the baseline features
dict_shadow_names_copy = dict_shadow_names.copy()
for real_col, shadow_col in dict_shadow_names.items():
if df_feature_importance[shadow_col] >= df_feature_importance[real_col] or df_feature_importance[
real_col] < min_importance:
del dict_shadow_names_copy[real_col]
if debug:
print("Removing feature: ", real_col)
if debug:
print("dict_shadow_names: ", dict_shadow_names)
print("--------------------------------------------------------")
print("dict_shadow_names_copy: ", dict_shadow_names_copy)
print()
print("--------------------------------------------------------")
print()
dict_shadow_names = dict_shadow_names_copy.copy()
if debug:
print("dict_shadow_names: ", dict_shadow_names)
print("--------------------------------------------------------")
print("dict_shadow_names_copy: ", dict_shadow_names_copy)
x_all_columns = baseline_features + list(dict_shadow_names.keys()) + list(dict_shadow_names.values())
x_all_columns.sort()
if debug:
print()
print("--------------------------------------------------------")
print()
print("x_all_columns_prev: ", x_all_columns_prev)
print("--------------------------------------------------------")
print("x_all_columns: ", x_all_columns)
# Check if has been any change, if not then break the loop
if x_all_columns == x_all_columns_prev:
count_no_changes += 1
if times_no_change_features == count_no_changes:
print("Stopping feature selection due to no change")
break
else:
count_no_changes = 0
print("This loop has removed ", int((len(x_all_columns_prev) - len(x_all_columns)) / 2), " features")
        # If we have already reduced to the requested maximum number of features, then break the loop
        if n_top_features is not None and n_top_features >= len(dict_shadow_names.keys()):
print("Stopping feature selection due to reached maximum of features")
break
df_feature_importance = df_feature_importance[list(dict_shadow_names.keys())].sort_values(ascending=False)
df_feature_importance = cumulative_feature_selection(df_feature_importance, cum_importance_threshold)
df_feature_importance = collinear_feature_selection(x_train[df_feature_importance.index.values],
df_feature_importance, collinear_threshold=collinear_threshold,
plot=plot_correlation)
    if n_top_features is not None:
if n_top_features >= len(dict_shadow_names.keys()):
df_feature_importance = df_feature_importance.iloc[0: len(dict_shadow_names.keys())]
else:
df_feature_importance = df_feature_importance.iloc[0: n_top_features]
return list(df_feature_importance.index), df_feature_importance
def _create_shadow(x, baseline_features):
"""
Take all X variables, creating copies and randomly shuffling them
:param x: the dataframe to create shadow features on
:return: dataframe 2x width and the names of the shadows for removing later
"""
print("Creating all shadow features")
x_shadow = x.copy()
x_shadow = x_shadow.drop(baseline_features, axis=1)
for c in tqdm_notebook(x_shadow.columns, desc='Shadow Cols'):
        np.random.shuffle(x_shadow[c].values)  # shuffle each column's values in place, breaking its relation to the target
# rename the shadow
shadow_names = ["shadow_feature_" + str(i + 1) for i in range(x_shadow.shape[1])]
dict_shadow_names = dict(zip(x_shadow.columns.copy(), shadow_names))
x_shadow.columns = shadow_names
# Combine to make one new dataframe
x_new = pd.concat([x, x_shadow], axis=1)
return x_new, dict_shadow_names
def get_feature_importance_mean(classifier_initial, y_train, x_train, n_iterations_mean=3,
need_cat_features_index=False, categorical_columns=[], dict_shadow_names={}, eval_set=None):
""" Calculate feature importance mean between different training of the classifier """
cat_features_index = None
if need_cat_features_index:
        if not dict_shadow_names:
raise ValueError("dict_shadow_names cannot be empty if categorical features index are needed")
x_columns = list(x_train.columns)
cat_features_index = []
for col in categorical_columns:
if col in x_columns:
cat_features_index.extend((x_columns.index(col), x_columns.index(dict_shadow_names[col])))
n_targets = y_train.shape[-1] if len(y_train.shape) > 1 else 1
for t in tqdm_notebook(range(n_targets), desc='Target Loop', leave=False):
for i in tqdm_notebook(range(n_iterations_mean), desc='Mean Loop', leave=False):
classifier = classifier_initial.copy()
classifier.set_params(cat_features=cat_features_index, random_state=np.random.randint(100))
metric_period = int(classifier.get_param('iterations')) // 10
if n_targets > 1:
classifier = classifier.fit(x_train, y_train.iloc[:,t], eval_set=eval_set, metric_period=metric_period)
else:
classifier = classifier.fit(x_train, y_train, eval_set=eval_set, metric_period=metric_period)
feature_importance = sorted(zip(map(lambda x: round(x, 4), classifier.feature_importances_), x_train),
reverse=True)
if t==0 and i == 0:
df_feature_importance = pd.DataFrame(dict(zip(x_train, classifier.feature_importances_)), index=[i])
else:
df_feature_importance_iter = pd.DataFrame(dict(zip(x_train, classifier.feature_importances_)),
index=[i])
df_feature_importance = pd.concat([df_feature_importance, df_feature_importance_iter])
return df_feature_importance.mean()
Functions
def collinear_feature_selection(x_train, df_feature_importance, collinear_threshold=0.98, plot=True)
-
From each pair of features whose correlation exceeds the threshold, drop the one with lower importance
def covariate_shift(train, test, categorical_columns, n_samples, iterations=200, weights_coef=1, AUC_threshold=0.8, importance_threshold=0.9, max_loops=20, test_size=0.1, trys_all_influencer=5, calc_sample_weights=True, task_type='CPU', data_dir='', load_cov=False, save_cov=False, plot=True)
-
Iteratively select the features that show no covariate shift between training and test set, using a CatBoostClassifier to distinguish whether a sample comes from train or test
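A hedged call sketch; train_df, test_df, the file paths, and the categorical column names are placeholders for your own data:

import pandas as pd
from ml_utils.feature_selection_utils import covariate_shift

train_df = pd.read_csv('train.csv')  # placeholder paths
test_df = pd.read_csv('test.csv')
influence_columns, cov_model, sample_weight = covariate_shift(
    train_df, test_df,
    categorical_columns=['cat_a', 'cat_b'],  # placeholder names
    n_samples=5000,       # rows sampled from each set to build the origin classifier
    AUC_threshold=0.8,    # stop once train and test can no longer be separated above this AUC
    save_cov=True, data_dir='./')
# Drop the shifted columns and optionally reuse sample_weight when fitting the final model
train_clean = train_df.drop(columns=influence_columns)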
def cumulative_feature_selection(df_feature_importance, cum_importance_threshold)
-
Keep the features whose cumulative importance lies below the threshold
def feature_selection(classifier_initial, y_train, x_train, n_top_features=50, baseline_features=[], min_importance=None)
-
Select the features with the top-N importance and/or with importance above the baseline features
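A self-contained sketch on synthetic data. Any estimator exposing feature_importances_ should work; RandomForestClassifier is an assumption here, and note the (classifier, y_train, x_train) argument order:

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from ml_utils.feature_selection_utils import feature_selection

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(200, 5)), columns=[f'f{i}' for i in range(5)])
y = (X['f0'] + 0.5 * X['f1'] > 0).astype(int)  # only f0 and f1 carry signal
cols = feature_selection(RandomForestClassifier(n_estimators=100, random_state=0),
                         y, X, n_top_features=3, min_importance=0.1)
print(cols)  # likely ['f0', 'f1'] given the synthetic target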
def get_feature_importance_mean(classifier_initial, y_train, x_train, n_iterations_mean=3, need_cat_features_index=False, categorical_columns=[], dict_shadow_names={}, eval_set=None)
-
Average the feature importances over several trainings of the classifier
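A minimal sketch; the classifier is assumed to be a CatBoost model, since the function relies on copy(), set_params(), get_param('iterations'), and fit(metric_period=...):

import numpy as np
import pandas as pd
import catboost as cb
from ml_utils.feature_selection_utils import get_feature_importance_mean

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(300, 4)), columns=['a', 'b', 'c', 'd'])
y = (X['a'] > 0).astype(int)
model = cb.CatBoostClassifier(iterations=50, verbose=False)
# Averages the importances over 3 fits with different random seeds
mean_importance = get_feature_importance_mean(model, y, X, n_iterations_mean=3)
print(mean_importance.sort_values(ascending=False))  # 'a' should dominate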
def outliers_analysis(full_data, features_names=None, x_column=None, subplot_rows=None, subplot_cols=None, starting_index=0, index_offset=0, z_score_threshold=3.5, use_mean=False, plot=True, num_bins=50)
-
Calculate and visualize an outlier analysis based on the modified Z-score with MAD (median absolute deviation)
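A quick sketch on a synthetic 1-D array; numpy input is wrapped into a DataFrame with a generated 'feature' column name:

import numpy as np
from ml_utils.feature_selection_utils import outliers_analysis

rng = np.random.default_rng(0)
values = rng.normal(size=1000)
values[:10] += 12  # inject a few obvious outliers
summary, detail = outliers_analysis(values, plot=False)
print(summary)  # percentage of outliers per feature
# 'detail' gains '<feature>_zscore' and boolean '<feature>_zscore_outliers' columns
print(detail['feature_zscore_outliers'].sum())  # expected: 10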
def shadow_feature_selection(classifier_initial, y_train, x_train, eval_set=None, n_top_features=None, collinear_threshold=0.98, cum_importance_threshold=0.99, times_no_change_features=2, max_loops=50, n_iterations_mean=3, need_cat_features_index=False, categorical_columns=[], verbose=True, debug=False, plot_correlation=False)
-
Select the features whose importance exceeds both that of random baseline features and that of their own shuffled shadow copies
def stadistic_difference_distributions(data, submission, time_column, test_percentage=0.2, p_value_threshold=None, verbose=False)
-
Compare, for each column, the distributions of the train split, the test split (initial vs. final part of the dataset), and the submission set, using the two-sample Kolmogorov-Smirnov test
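A self-contained sketch; 't' plays the role of the time column and 'drifting' is built to change distribution over time:

import numpy as np
import pandas as pd
from ml_utils.feature_selection_utils import stadistic_difference_distributions

rng = np.random.default_rng(0)
n = 1000
data = pd.DataFrame({'t': np.arange(n),
                     'stable': rng.normal(size=n),
                     'drifting': rng.normal(loc=np.linspace(0, 3, n))})
submission = pd.DataFrame({'stable': rng.normal(size=n),
                           'drifting': rng.normal(loc=3.5, size=n)})
analysis, cols_to_remove = stadistic_difference_distributions(
    data, submission, time_column='t', p_value_threshold=0.05)
print(cols_to_remove)  # 'drifting' should be flagged, 'stable' should not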