In this blog post I'd like to show the difference deep tabular augmentation can make when training a Random Forest on a highly imbalanced dataset. In this case we have a look at credit card fraud, where fraud is far less represented than non-fraud.

import pandas as pd
import numpy as np
import torch
from torch import nn
from torch import optim
from sklearn.preprocessing import StandardScaler
from functools import partial
import mlprepare as mlp
import deep_tabular_augmentation as dta
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

DATA_PATH = 'data/creditcard.csv'

df = pd.read_csv(DATA_PATH)

Let's have a short look at the data:

df.head()
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 ... V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 0.098698 0.363787 ... -0.018307 0.277838 -0.110474 0.066928 0.128539 -0.189115 0.133558 -0.021053 149.62 0
1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 0.085102 -0.255425 ... -0.225775 -0.638672 0.101288 -0.339846 0.167170 0.125895 -0.008983 0.014724 2.69 0
2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 0.247676 -1.514654 ... 0.247998 0.771679 0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752 378.66 0
3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 0.377436 -1.387024 ... -0.108300 0.005274 -0.190321 -1.175575 0.647376 -0.221929 0.062723 0.061458 123.50 0
4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 -0.270533 0.817739 ... -0.009431 0.798278 -0.137458 0.141267 -0.206010 0.502292 0.219422 0.215153 69.99 0

5 rows × 31 columns

Also, let's have a look at how many more non-fraud cases we have compared to fraud cases:

difference_in_class_occurences = df['Class'].value_counts()[0]-df['Class'].value_counts()[1]
difference_in_class_occurences
283823
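
To put this number in perspective, we can also look at the absolute counts and the share of fraud cases (just a quick sketch; run it on your copy of the data to see the actual numbers):

class_counts = df['Class'].value_counts()
print(class_counts)
print(f"Fraud share: {class_counts[1] / class_counts.sum():.4%}")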

In order to make use of deep tabular augmentation we need to scale the data and then keep only the cases of the class we are interested in, in this case the rows where "Class" equals 1.

X_train, X_test, y_train, y_test = mlp.split_df(df, dep_var='Class', test_size=0.3, split_mode='random')

x_scaler = StandardScaler()

X_train_scaled = x_scaler.fit_transform(X_train)

X_test_scaled = x_scaler.transform(X_test)

X_train_fraud = X_train_scaled[np.where(y_train==1)[0]]
X_test_fraud = X_test_scaled[np.where(y_test==1)[0]]
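
Just to get a feeling for how little data the VAE actually sees during training, we can check the shapes of the filtered arrays (a quick sanity check):

X_train_fraud.shape, X_test_fraud.shape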

For our model to work we need to put our data into a DataLoader (here I use the DataBunch class from deep_tabular_augmentation).

datasets = dta.create_datasets(X_train_fraud, y_train.values[np.where(y_train==1)], X_test_fraud, y_test.values[np.where(y_test==1)])
data = dta.DataBunch(*dta.create_loaders(datasets, bs=1024))

Now we're already good to go. We can define our Variational Autoencoder architecture (here: 50 - 12 - 12 - 5 - 12 - 12 - 50) and then use the Learning Rate Finder to suggest a good learning rate:

D_in = X_train_fraud.shape[1]
target_name = 'Class'
target_class = 1
df_cols = list(df.columns)

model = dta.Autoencoder(nn.Sequential(*dta.get_lin_layers(D_in, [50, 12, 12])),
                     nn.Sequential(*dta.get_lin_layers_rev(D_in, [50, 12, 12])),
                     latent_dim=5).to(device)
opt = optim.Adam(model.parameters(), lr=0.01)
loss_func = dta.customLoss()
learn = dta.Learner(model, opt, loss_func, data, target_name, target_class, df_cols)

run = dta.Runner(cb_funcs=[dta.LR_Find, dta.Recorder])

run.fit(100, learn)
run.recorder.plot(skip_last=5)

Based on the learning rate finder, we pick a learning rate and set up a schedule for it:

sched = dta.combine_scheds([0.3, 0.7], [dta.sched_cos(0.01, 0.1), dta.sched_cos(0.1, 0.01)])

Now, let's train the model:

cbfs = [partial(dta.LossTracker, show_every=50), dta.Recorder, partial(dta.ParamScheduler, 'lr', sched)]
model = dta.Autoencoder(nn.Sequential(*dta.get_lin_layers(D_in, [50, 12, 12])),
                     nn.Sequential(*dta.get_lin_layers_rev(D_in, [50, 12, 12])),
                     latent_dim=5).to(device)
opt = optim.Adam(model.parameters(), lr=0.01)
learn = dta.Learner(model, opt, loss_func, data, target_name, target_class, df_cols)
run = dta.Runner(cb_funcs=cbfs)
run.fit(400, learn)
epoch: 50
train loss is: 250777.15625
validation loss is: 89807.21875
epoch: 100
train loss is: 184203.078125
validation loss is: 71237.8828125
epoch: 150
train loss is: 129945.9765625
validation loss is: 81472.5078125
epoch: 200
train loss is: 95360.5859375
validation loss is: 260343.4375
epoch: 250
train loss is: 75559.625
validation loss is: 208241.1875
epoch: 300
train loss is: 63336.73828125
validation loss is: 168084.046875
epoch: 350
train loss is: 55019.421875
validation loss is: 141231.15625
epoch: 400
train loss is: 48957.48046875
validation loss is: 121973.09375

Let's see what the generated data looks like:

df_fake = run.predict_df(learn, no_samples=difference_in_class_occurences, scaler=x_scaler)
df_fake_with_noise = run.predict_with_noise_df(learn, no_samples=difference_in_class_occurences, mu=0, sigma=0.1, scaler=x_scaler)
df_fake_with_noise.head()
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 ... V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
0 107547.656250 -1.175546 3.190463 -5.880084 5.172310 -0.201045 -2.110816 -2.554305 0.789000 -2.931453 ... 0.680436 -0.063858 0.295163 -0.333722 0.318315 0.305315 0.570651 0.418477 59.745529 1
1 105570.804688 -1.411785 2.465464 -4.638255 3.691925 -1.264926 -1.476137 -2.528746 0.799087 -2.038979 ... 0.874525 0.115001 0.530135 -0.186685 0.258670 0.213698 0.545221 0.412415 103.826714 1
2 104763.757812 -1.315135 2.079836 -3.970082 2.974885 -1.478414 -1.190014 -2.352298 0.718098 -1.687526 ... 0.794227 0.149139 0.566353 -0.152397 0.244498 0.140893 0.520099 0.307183 114.956940 1
3 106964.625000 -1.133216 2.987270 -5.564784 4.863543 -0.300885 -1.990756 -2.479523 0.771097 -2.773068 ... 0.650077 -0.045114 0.288558 -0.310352 0.305301 0.276039 0.575313 0.383283 70.325348 1
4 105466.343750 -1.274740 2.423632 -4.584245 3.720120 -1.073872 -1.501573 -2.430743 0.760877 -2.088018 ... 0.771964 0.084755 0.460291 -0.201218 0.260977 0.201086 0.546640 0.366659 101.931236 1

5 rows × 31 columns

df_fake_with_noise.describe().loc[['mean']]
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 ... V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
mean 105288.890625 -1.479125 2.557713 -4.819775 3.855684 -1.171547 -1.542699 -2.576222 0.791697 -2.12214 ... 0.860909 0.103987 0.481285 -0.208098 0.249638 0.217605 0.525674 0.399994 99.816528 1.0

1 rows × 31 columns
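
As a quick sanity check (a minimal sketch, not strictly needed for the pipeline), we can compare these means with the means of the real fraud cases on the original scale:

df[df['Class'] == 1].describe().loc[['mean']]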

Train Random Forest

We want to compare how the built-in class_weight functionality performs vs. the new approach (spoiler: if you do not use any weights, the Random Forest will always predict 0). Hence, we create three dataframes: the original, the original appended with the fake data, and the original appended with the fake data with noise.

train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)
train_df_fake = pd.concat([train_df, df_fake])
train_df_fake_with_noise = pd.concat([train_df, df_fake_with_noise])
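
A quick look at the class distribution after appending the synthetic rows (just a sketch) shows how much the balance shifts:

train_df_fake_with_noise['Class'].value_counts()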

To make things easier to understand, let's define the datasets on which to train and on which to assess the results:

X_train, X_test, X_train_aug = train_df.iloc[:,:30].values, test_df.iloc[:,:30].values, train_df_fake_with_noise.iloc[:,:30].values
y_train, y_test, y_train_aug = train_df.iloc[:,30].values, test_df.iloc[:,30].values, train_df_fake_with_noise.iloc[:,30].values

First, let's train a model on the original data while using the difference in class occurrences as the weight for the fraud class.

def rf(xs, y, n_estimators=40, max_samples=500,
       max_features=0.5, min_samples_leaf=5, **kwargs):
    return RandomForestClassifier(n_jobs=-1, n_estimators=n_estimators,
        max_samples=max_samples, max_features=max_features,
        min_samples_leaf=min_samples_leaf, oob_score=True, class_weight={0:1,1:difference_in_class_occurences}).fit(xs, y)
m = rf(X_train, y_train)
confusion_matrix(y_test, np.round(m.predict(X_test)))
array([[85300,     7],
       [   99,    37]])

Then, we use the augmented dataframe:

def rf_aug(xs, y, n_estimators=40, max_samples=500,
       max_features=0.5, min_samples_leaf=5, **kwargs):
    return RandomForestClassifier(n_jobs=-1, n_estimators=n_estimators,
        max_samples=max_samples, max_features=max_features,
        min_samples_leaf=min_samples_leaf, oob_score=True).fit(xs, y)
m_aug = rf_aug(X_train_aug, y_train_aug)
confusion_matrix(test_df.iloc[:,30].values, np.round(m_aug.predict(test_df.iloc[:,:30].values)))
array([[85288,    19],
       [   46,    90]])
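
As a quick way to compare the two confusion matrices, here is a short sketch that computes recall and precision on the fraud class for both models with scikit-learn:

from sklearn.metrics import precision_score, recall_score

for name, clf in [('class-weighted RF', m), ('RF on augmented data', m_aug)]:
    preds = clf.predict(X_test)
    print(f"{name}: recall={recall_score(y_test, preds):.3f}, precision={precision_score(y_test, preds):.3f}")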

Wow, I think that is quite astonishing. We managed to substantially increase the number of fraud cases we are able to detect. Moreover, we achieved these results without any fine-tuning of the model architecture, simply using the default structure of the VAE.

I hope this blog post shed some light on why this approach is worth a shot on highly imbalanced data.

Lasse