Data Augmentation for Tabular Data on an Imbalanced Dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import mlprepare as mlp
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device
DATA_PATH = 'data/creditcard.csv'
df = pd.read_csv(DATA_PATH, sep=',')
df.head()
df_base = df.copy()
cols = df_base.columns
We need to normalize the Time and Amount columns:
mean_time=df_base['Time'].mean()
mean_amount=df_base['Amount'].mean()
std_time=df_base['Time'].std()
std_amount=df_base['Amount'].std()
df_base['Time']=(df_base['Time']-mean_time)/std_time
df_base['Amount']=(df_base['Amount']-mean_amount)/std_amount
Class=1 means the transaction was indeed a fraud case, Class=0 means no fraud. This dataset is highly imbalanced:
df_base['Class'].value_counts()
I want to create synthetic data based on the 492 fraud cases and then use it to improve the model. Let's first train a simple RandomForest as a baseline.
X_train, X_test, y_train, y_test = mlp.split_df(df_base, dep_var='Class', test_size=0.3, split_mode='random')
y_test.value_counts()
#Ratio of the two classes:
y_test.value_counts()[0]/y_test.value_counts()[1]
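The class_weight of {0: 1, 1: 543} in the RandomForest below presumably mirrors this imbalance ratio. A minimal sketch of how such a weight could be derived from the training labels (fraud_weight is my own name, not from the original):
# Hypothetical helper: inverse class ratio as a weight for the minority class
fraud_weight = int(round((y_train == 0).sum() / (y_train == 1).sum()))
fraud_weight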
def rf(xs, y, n_estimators=40, max_samples=500,
       max_features=0.5, min_samples_leaf=5, **kwargs):
    # class_weight compensates for the imbalance by up-weighting the fraud class
    return RandomForestClassifier(n_jobs=-1, n_estimators=n_estimators,
                                  max_samples=max_samples, max_features=max_features,
                                  min_samples_leaf=min_samples_leaf, oob_score=True,
                                  class_weight={0: 1, 1: 543}).fit(xs, y)
m = rf(X_train, y_train)
confusion_matrix(y_test, np.round(m.predict(X_test)))
With this baseline we catch about 39 of the 157 fraud cases in the test set, although the results vary quite a lot from run to run!
We only want the data points where y_train / y_test = 1, i.e. the fraud cases:
X_train_fraud = X_train.iloc[np.where(y_train==1)[0]]
X_test_fraud = X_test.iloc[np.where(y_test==1)[0]]
Let's build DataLoaders for our data, while keeping the pre-defined training/test split as it is.
from torch.utils.data import Dataset, DataLoader
class DataBuilder(Dataset):
    # Wraps the fraud-only feature matrix as a float tensor dataset
    def __init__(self, dataset):
        self.x = torch.from_numpy(dataset.values).to(torch.float)
        self.len = self.x.shape[0]

    def __getitem__(self, index):
        return self.x[index]

    def __len__(self):
        return self.len
traindata_set=DataBuilder(X_train_fraud)
testdata_set=DataBuilder(X_test_fraud)
trainloader=DataLoader(dataset=traindata_set,batch_size=1024)
testloader=DataLoader(dataset=testdata_set,batch_size=1024)
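A quick sanity check that the loader yields what we expect (a minimal sketch): with batch_size=1024 the few hundred training fraud rows fit into a single batch.
batch = next(iter(trainloader))
batch.shape, batch.dtype  # roughly (number of training fraud cases, 30), torch.float32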
Define the Variational Autoencoder (for more information check out my earlier blogpost).
class Autoencoder(nn.Module):
    def __init__(self, D_in, H=50, H2=12, latent_dim=3):
        super(Autoencoder, self).__init__()
        # Encoder
        self.linear1 = nn.Linear(D_in, H)
        self.lin_bn1 = nn.BatchNorm1d(num_features=H)
        self.linear2 = nn.Linear(H, H2)
        self.lin_bn2 = nn.BatchNorm1d(num_features=H2)
        self.linear3 = nn.Linear(H2, H2)
        self.lin_bn3 = nn.BatchNorm1d(num_features=H2)

        # Latent vectors mu and sigma
        self.fc1 = nn.Linear(H2, latent_dim)
        self.bn1 = nn.BatchNorm1d(num_features=latent_dim)
        self.fc21 = nn.Linear(latent_dim, latent_dim)  # mu
        self.fc22 = nn.Linear(latent_dim, latent_dim)  # logvar

        # Sampling vector
        self.fc3 = nn.Linear(latent_dim, latent_dim)
        self.fc_bn3 = nn.BatchNorm1d(latent_dim)
        self.fc4 = nn.Linear(latent_dim, H2)
        self.fc_bn4 = nn.BatchNorm1d(H2)

        # Decoder
        self.linear4 = nn.Linear(H2, H2)
        self.lin_bn4 = nn.BatchNorm1d(num_features=H2)
        self.linear5 = nn.Linear(H2, H)
        self.lin_bn5 = nn.BatchNorm1d(num_features=H)
        self.linear6 = nn.Linear(H, D_in)
        self.lin_bn6 = nn.BatchNorm1d(num_features=D_in)

        self.relu = nn.ReLU()

    def encode(self, x):
        lin1 = self.relu(self.lin_bn1(self.linear1(x)))
        lin2 = self.relu(self.lin_bn2(self.linear2(lin1)))
        lin3 = self.relu(self.lin_bn3(self.linear3(lin2)))
        fc1 = F.relu(self.bn1(self.fc1(lin3)))
        r1 = self.fc21(fc1)  # mu
        r2 = self.fc22(fc1)  # logvar
        return r1, r2

    def reparameterize(self, mu, logvar):
        if self.training:
            # Reparameterization trick: z = mu + sigma * eps with eps ~ N(0, 1)
            std = torch.exp(0.5 * logvar)
            eps = torch.randn_like(std)
            return mu + eps * std
        else:
            return mu

    def decode(self, z):
        fc3 = self.relu(self.fc_bn3(self.fc3(z)))
        fc4 = self.relu(self.fc_bn4(self.fc4(fc3)))
        lin4 = self.relu(self.lin_bn4(self.linear4(fc4)))
        lin5 = self.relu(self.lin_bn5(self.linear5(lin4)))
        return self.lin_bn6(self.linear6(lin5))

    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar
class customLoss(nn.Module):
    def __init__(self):
        super(customLoss, self).__init__()
        self.mse_loss = nn.MSELoss(reduction="sum")

    # x_recon is the recon_batch produced in the model's forward pass,
    # x is the original batch, mu and logvar are the latent parameters
    def forward(self, x_recon, x, mu, logvar):
        loss_MSE = self.mse_loss(x_recon, x)
        # Closed-form KL divergence between N(mu, sigma^2) and the standard normal prior
        loss_KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
        return loss_MSE + loss_KLD
D_in = traindata_set.x.shape[1]
H = 50
H2 = 12
model = Autoencoder(D_in, H, H2).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
loss_mse = customLoss()
log_interval = 50
val_losses = []
train_losses = []
test_losses = []
def train(epoch):
    model.train()
    train_loss = 0
    for batch_idx, data in enumerate(trainloader):
        data = data.to(device)
        optimizer.zero_grad()
        recon_batch, mu, logvar = model(data)
        loss = loss_mse(recon_batch, data, mu, logvar)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
    if epoch % 200 == 0:
        print('====> Epoch: {} Average training loss: {:.4f}'.format(
            epoch, train_loss / len(trainloader.dataset)))
        train_losses.append(train_loss / len(trainloader.dataset))
def test(epoch):
    model.eval()
    with torch.no_grad():
        test_loss = 0
        for batch_idx, data in enumerate(testloader):
            data = data.to(device)
            recon_batch, mu, logvar = model(data)
            loss = loss_mse(recon_batch, data, mu, logvar)
            test_loss += loss.item()
        if epoch % 200 == 0:
            print('====> Epoch: {} Average test loss: {:.4f}'.format(
                epoch, test_loss / len(testloader.dataset)))
            test_losses.append(test_loss / len(testloader.dataset))
epochs = 1500
for epoch in range(1, epochs + 1):
    train(epoch)
    test(epoch)
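To see whether the loss is still going down, we can plot the collected losses (a minimal sketch, assuming matplotlib is installed; losses are only logged every 200 epochs):
import matplotlib.pyplot as plt

plt.plot(train_losses, label='train')
plt.plot(test_losses, label='test')
plt.xlabel('logging step (every 200 epochs)')
plt.ylabel('average loss per sample')
plt.legend()
plt.show()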
The loss is still decreasing, so let's keep training:
epochs = 2500
optimizer = optim.Adam(model.parameters(), lr=1e-3)
for epoch in range(1, epochs + 1):
    train(epoch)
    test(epoch)
epochs = 500
optimizer = optim.Adam(model.parameters(), lr=1e-3)
for epoch in range(1, epochs + 1):
    train(epoch)
    test(epoch)
Let's look at the results:
model.eval()  # make sure BatchNorm uses its running statistics
with torch.no_grad():
    for batch_idx, data in enumerate(testloader):
        data = data.to(device)
        recon_batch, mu, logvar = model(data)

# Compare the first reconstructed row with the corresponding real row
# (the ~150 fraud rows of the test set fit into a single batch of 1024, so index 0 lines up).
# We append the Class label (1) so the columns match the original dataframe.
recon_row = np.append(recon_batch[0].cpu().numpy(), [1])
real_row = np.append(testloader.dataset.x[0].cpu().numpy(), [1])
df_compare = pd.DataFrame(np.stack((recon_row, real_row)), columns=cols)
df_compare
sigma = torch.exp(logvar/2)
mu.mean(axis=0), sigma.mean(axis=0)
# Sample z from the approximate posterior q, built from the mean mu and sigma of the test batch
no_samples = 20
q = torch.distributions.Normal(mu.mean(axis=0), sigma.mean(axis=0))
z = q.rsample(sample_shape=torch.Size([no_samples]))
with torch.no_grad():
    pred = model.decode(z).cpu().numpy()
df_fake = pd.DataFrame(pred, columns=cols[:-1])  # the 30 reconstructed feature columns
df_fake['Class'] = 1                             # every generated row is a fraud case
df_fake['Time'] = (df_fake['Time']*std_time)+mean_time
df_fake['Amount'] = (df_fake['Amount']*std_amount)+mean_amount
df_fake.head()
# Compare the mean Amount of the synthetic frauds with the real per-class means
df_fake['Amount'].mean()
df.groupby('Class').mean()['Amount']
y_train.value_counts()
So let's generate about 190,000 fake fraud cases, roughly matching the number of non-fraud cases in the training set:
no_samples = 190_000
q = torch.distributions.Normal(mu.mean(axis=0), sigma.mean(axis=0))
z = q.rsample(sample_shape=torch.Size([no_samples]))
with torch.no_grad():
    pred = model.decode(z).cpu().numpy()
Concatenate the synthetic rows to our X_train:
X_train_augmented = np.vstack((X_train.values, pred))
y_train_augmented = np.append(y_train.values, np.repeat(1,no_samples))
X_train_augmented.shape
We now have roughly as many fraud cases as we have non-fraud cases.
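A quick check of the new class distribution (a sketch):
# The augmented labels should now be roughly balanced
pd.Series(y_train_augmented).value_counts()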
def rf_aug(xs, y, n_estimators=40, max_samples=500,
           max_features=0.5, min_samples_leaf=5, **kwargs):
    # Same forest as before, but without class_weight – the augmented data is already balanced
    return RandomForestClassifier(n_jobs=-1, n_estimators=n_estimators,
                                  max_samples=max_samples, max_features=max_features,
                                  min_samples_leaf=min_samples_leaf, oob_score=True).fit(xs, y)
m_aug = rf_aug(X_train_augmented, y_train_augmented)
confusion_matrix(y_test, np.round(m_aug.predict(X_test)))  # augmented model
confusion_matrix(y_test, np.round(m.predict(X_test)))      # baseline model
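To put a single number on the improvement, we can compare the recall on the fraud class for both models (a sketch using sklearn's recall_score):
from sklearn.metrics import recall_score

# Fraction of actual fraud cases each model detects
recall_score(y_test, m.predict(X_test)), recall_score(y_test, m_aug.predict(X_test))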
Look at that! We managed to find 127 out of 157 fraud cases! If our goal was to detect as many fraud cases as possible, then we clearly succeeded. Maybe you should consider this technique the next time you're dealing with a highly imbalanced dataset.