How to augment highly imbalanced data with fake data

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import mlprepare as mlp 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device
device(type='cpu')
DATA_PATH = 'data/creditcard.csv'
df = pd.read_csv(DATA_PATH, sep=',')
df.head()
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 ... V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 0.098698 0.363787 ... -0.018307 0.277838 -0.110474 0.066928 0.128539 -0.189115 0.133558 -0.021053 149.62 0
1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 0.085102 -0.255425 ... -0.225775 -0.638672 0.101288 -0.339846 0.167170 0.125895 -0.008983 0.014724 2.69 0
2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 0.247676 -1.514654 ... 0.247998 0.771679 0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752 378.66 0
3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 0.377436 -1.387024 ... -0.108300 0.005274 -0.190321 -1.175575 0.647376 -0.221929 0.062723 0.061458 123.50 0
4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 -0.270533 0.817739 ... -0.009431 0.798278 -0.137458 0.141267 -0.206010 0.502292 0.219422 0.215153 69.99 0

5 rows × 31 columns

df_base = df.copy()
cols = df_base.columns

We need to normalize Time and Amount; the V1-V28 columns already come from a PCA transformation and are left as they are.

mean_time=df_base['Time'].mean()
mean_amount=df_base['Amount'].mean()
std_time=df_base['Time'].std()
std_amount=df_base['Amount'].std()

df_base['Time']=(df_base['Time']-mean_time)/std_time
df_base['Amount']=(df_base['Amount']-mean_amount)/std_amount
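
As an aside, the preprocessing module imported above could do the same job. A minimal sketch of that alternative (not meant to be run on top of the cell above); I keep the manual means and stds here because they are reused later to de-normalize the generated samples, which a fitted scaler's inverse_transform could also handle:

scaler = preprocessing.StandardScaler()
df_base[['Time', 'Amount']] = scaler.fit_transform(df_base[['Time', 'Amount']])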

Class = 1 means that this was indeed a fraud case, Class = 0 means no fraud. This dataset is highly imbalanced:

df_base['Class'].value_counts()
0    284315
1       492
Name: Class, dtype: int64

I want to create fake data based on the 492 fraud cases, which I will then use to improve the model. Let's first train a simple RandomForest as a baseline.

X_train, X_test, y_train, y_test = mlp.split_df(df_base, dep_var='Class', test_size=0.3, split_mode='random')
y_test.value_counts()
0    85286
1      157
Name: Class, dtype: int64
#Ratio of the two classes:
y_test.value_counts()[0]/y_test.value_counts()[1]
543.2229299363057
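
split_df comes from the mlprepare helper package. If you don't have it installed, a roughly equivalent random split with the train_test_split already imported above could look like this (the random_state is just an illustrative choice):

X = df_base.drop(columns=['Class'])
y = df_base['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)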

RandomForest with Class Weighting

Let's first use the class_weight parameter provided by sklearn's RandomForestClassifier to deal with this highly imbalanced data.

def rf(xs, y, n_estimators=40, max_samples=500,
       max_features=0.5, min_samples_leaf=5, **kwargs):
    # Weight the minority (fraud) class by roughly the class ratio of ~543:1 computed above.
    return RandomForestClassifier(n_jobs=-1, n_estimators=n_estimators,
        max_samples=max_samples, max_features=max_features,
        min_samples_leaf=min_samples_leaf, oob_score=True,
        class_weight={0: 1, 1: 543}).fit(xs, y)
m = rf(X_train, y_train)
confusion_matrix(y_test, np.round(m.predict(X_test)))
array([[85278,     8],
       [  118,    39]], dtype=int64)

With this technique we catch about 39 of the 157 fraud cases in the test set, although the results vary quite a lot between runs!
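
Rather than hard-coding the weight 543, you could derive it from the training labels (or simply pass class_weight='balanced'). A small sketch of that idea, with m_weighted as a purely illustrative variable name:

n_non_fraud, n_fraud = np.bincount(y_train)
weight = n_non_fraud / n_fraud   # roughly 543 on this split
m_weighted = RandomForestClassifier(n_jobs=-1, n_estimators=40, max_samples=500,
    max_features=0.5, min_samples_leaf=5, oob_score=True,
    class_weight={0: 1, 1: weight}).fit(X_train, y_train)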

Fake Data with VAE

We want only the data points where y_train / y_test equal 1, i.e. the fraud cases:

X_train_fraud = X_train.iloc[np.where(y_train==1)[0]]
X_test_fraud = X_test.iloc[np.where(y_test==1)[0]]

Let's build a dataloader for our data, still keeping the pre-defined training/test datasets the way they were.

from torch.utils.data import Dataset, DataLoader
class DataBuilder(Dataset):
    # Wraps a DataFrame as a float tensor dataset.
    def __init__(self, dataset):
        self.x = torch.from_numpy(dataset.values).to(torch.float)
        self.len = self.x.shape[0]
    def __getitem__(self, index):
        return self.x[index]
    def __len__(self):
        return self.len
traindata_set=DataBuilder(X_train_fraud)
testdata_set=DataBuilder(X_test_fraud)

trainloader=DataLoader(dataset=traindata_set,batch_size=1024)
testloader=DataLoader(dataset=testdata_set,batch_size=1024)

Define the Variational Autoencoder (for more information check out my earlier blogpost).

class Autoencoder(nn.Module):
    def __init__(self, D_in, H=50, H2=12, latent_dim=3):
        super(Autoencoder, self).__init__()

        # Encoder
        self.linear1=nn.Linear(D_in,H)
        self.lin_bn1 = nn.BatchNorm1d(num_features=H)
        self.linear2=nn.Linear(H,H2)
        self.lin_bn2 = nn.BatchNorm1d(num_features=H2)
        self.linear3=nn.Linear(H2,H2)
        self.lin_bn3 = nn.BatchNorm1d(num_features=H2)
        
        # Latent vectors mu and sigma
        self.fc1 = nn.Linear(H2, latent_dim)
        self.bn1 = nn.BatchNorm1d(num_features=latent_dim)
        self.fc21 = nn.Linear(latent_dim, latent_dim)
        self.fc22 = nn.Linear(latent_dim, latent_dim)

        # Sampling vector
        self.fc3 = nn.Linear(latent_dim, latent_dim)
        self.fc_bn3 = nn.BatchNorm1d(latent_dim)
        self.fc4 = nn.Linear(latent_dim, H2)
        self.fc_bn4 = nn.BatchNorm1d(H2)
        
        # Decoder
        self.linear4=nn.Linear(H2,H2)
        self.lin_bn4 = nn.BatchNorm1d(num_features=H2)
        self.linear5=nn.Linear(H2,H)
        self.lin_bn5 = nn.BatchNorm1d(num_features=H)
        self.linear6=nn.Linear(H,D_in)
        self.lin_bn6 = nn.BatchNorm1d(num_features=D_in)
        
        self.relu = nn.ReLU()
        
    def encode(self, x):
        lin1 = self.relu(self.lin_bn1(self.linear1(x)))
        lin2 = self.relu(self.lin_bn2(self.linear2(lin1)))
        lin3 = self.relu(self.lin_bn3(self.linear3(lin2)))

        fc1 = F.relu(self.bn1(self.fc1(lin3)))

        r1 = self.fc21(fc1)
        r2 = self.fc22(fc1)
        
        return r1, r2
    
    def reparameterize(self, mu, logvar):
        if self.training:
            std = logvar.mul(0.5).exp_()
            eps = torch.randn_like(std)
            return eps.mul(std).add_(mu)
        else:
            return mu
        
    def decode(self, z):
        fc3 = self.relu(self.fc_bn3(self.fc3(z)))
        fc4 = self.relu(self.fc_bn4(self.fc4(fc3)))

        lin4 = self.relu(self.lin_bn4(self.linear4(fc4)))
        lin5 = self.relu(self.lin_bn5(self.linear5(lin4)))
        return self.lin_bn6(self.linear6(lin5))


        
    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar
class customLoss(nn.Module):
    def __init__(self):
        super(customLoss, self).__init__()
        self.mse_loss = nn.MSELoss(reduction="sum")
    
    # x_recon is the recon_batch produced by the model's forward pass, x is the original batch,
    # mu and logvar are the latent mean and log-variance returned by the encoder
    def forward(self, x_recon, x, mu, logvar):
        loss_MSE = self.mse_loss(x_recon, x)
        loss_KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())

        return loss_MSE + loss_KLD
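
The loss_KLD term is the closed-form KL divergence between the approximate posterior q(z|x) = N(mu, sigma^2), where the encoder outputs logvar = log(sigma^2), and the standard normal prior N(0, 1):

KLD = -0.5 * sum_j (1 + log(sigma_j^2) - mu_j^2 - sigma_j^2)

which is exactly what the line above computes, with logvar standing in for log(sigma^2).
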
D_in = traindata_set.x.shape[1]
H = 50
H2 = 12
model = Autoencoder(D_in, H, H2).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
loss_mse = customLoss()

Train Model

log_interval = 50
val_losses = []
train_losses = []
test_losses = []
def train(epoch):
    model.train()
    train_loss = 0
    for batch_idx, data in enumerate(trainloader):
        data = data.to(device)
        optimizer.zero_grad()
        recon_batch, mu, logvar = model(data)
        loss = loss_mse(recon_batch, data, mu, logvar)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
    if epoch % 200 == 0:        
        print('====> Epoch: {} Average training loss: {:.4f}'.format(
            epoch, train_loss / len(trainloader.dataset)))
        train_losses.append(train_loss / len(trainloader.dataset))
def test(epoch):
    with torch.no_grad():
        test_loss = 0
        for batch_idx, data in enumerate(testloader):
            data = data.to(device)
            recon_batch, mu, logvar = model(data)
            loss = loss_mse(recon_batch, data, mu, logvar)
            test_loss += loss.item()
        if epoch % 200 == 0:
            print('====> Epoch: {} Average test loss: {:.4f}'.format(
                epoch, test_loss / len(testloader.dataset)))
        test_losses.append(test_loss / len(testloader.dataset))
epochs = 1500
for epoch in range(1, epochs + 1):
    train(epoch)
    test(epoch)
====> Epoch: 200 Average training loss: 706.2121
====> Epoch: 200 Average test loss: 590.0016
====> Epoch: 400 Average training loss: 620.5279
====> Epoch: 400 Average test loss: 521.3142
====> Epoch: 600 Average training loss: 566.4392
====> Epoch: 600 Average test loss: 477.5008
====> Epoch: 800 Average training loss: 521.7474
====> Epoch: 800 Average test loss: 440.3243
====> Epoch: 1000 Average training loss: 481.2092
====> Epoch: 1000 Average test loss: 407.7625
====> Epoch: 1200 Average training loss: 434.3898
====> Epoch: 1200 Average test loss: 362.2760
====> Epoch: 1400 Average training loss: 396.9551
====> Epoch: 1400 Average test loss: 343.7408

Both the training and the test loss are still decreasing, so let's keep going.
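
One way to keep an eye on this is to plot the recorded losses. A quick sketch (matplotlib is an extra import; note that train_losses is only appended every 200 epochs while test_losses is appended every epoch, so the two are plotted separately):

import matplotlib.pyplot as plt

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
ax1.plot(train_losses)
ax1.set_title('avg train loss (every 200 epochs)')
ax2.plot(test_losses)
ax2.set_title('avg test loss (per epoch)')
plt.show()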

epochs = 2500
optimizer = optim.Adam(model.parameters(), lr=1e-3)
for epoch in range(1, epochs + 1):
    train(epoch)
    test(epoch)
====> Epoch: 200 Average training loss: 343.3472
====> Epoch: 200 Average test loss: 300.3575
====> Epoch: 400 Average training loss: 310.5800
====> Epoch: 400 Average test loss: 285.6697
====> Epoch: 600 Average training loss: 281.8408
====> Epoch: 600 Average test loss: 263.7150
====> Epoch: 800 Average training loss: 256.1950
====> Epoch: 800 Average test loss: 244.9427
====> Epoch: 1000 Average training loss: 232.6077
====> Epoch: 1000 Average test loss: 236.3014
====> Epoch: 1200 Average training loss: 211.2899
====> Epoch: 1200 Average test loss: 217.6404
====> Epoch: 1400 Average training loss: 191.3525
====> Epoch: 1400 Average test loss: 205.8287
====> Epoch: 1600 Average training loss: 174.0826
====> Epoch: 1600 Average test loss: 189.0589
====> Epoch: 1800 Average training loss: 157.4292
====> Epoch: 1800 Average test loss: 175.6006
====> Epoch: 2000 Average training loss: 143.2475
====> Epoch: 2000 Average test loss: 177.1668
====> Epoch: 2200 Average training loss: 129.9684
====> Epoch: 2200 Average test loss: 160.4641
====> Epoch: 2400 Average training loss: 117.6745
====> Epoch: 2400 Average test loss: 150.9483
epochs = 500
optimizer = optim.Adam(model.parameters(), lr=1e-3)
for epoch in range(1, epochs + 1):
    train(epoch)
    test(epoch)
====> Epoch: 200 Average training loss: 54.6816
====> Epoch: 200 Average test loss: 129.6853
====> Epoch: 400 Average training loss: 48.5159
====> Epoch: 400 Average test loss: 134.4429

Let's look at the results:

with torch.no_grad():
    for batch_idx, data in enumerate(testloader):
        data = data.to(device)
        recon_batch, mu, logvar = model(data)
# Append the Class label (1) so the rows line up with the original 31 columns.
# Use a new name (df_compare) instead of overwriting df, because the original df
# is still needed for the Amount comparison further down.
recon_row = np.append(recon_batch[0].cpu().numpy(), [1])
real_row = np.append(testloader.dataset.x[0].cpu().numpy(), [1])
df_compare = pd.DataFrame(np.stack((recon_row, real_row)), columns=cols)
df_compare
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 ... V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
0 -0.196971 -7.667089 5.699276 -10.15090 10.077229 -7.307253 -2.589641 -9.824335 3.019747 -7.658296 ... 1.073921 0.034662 0.247951 0.00464 -0.037674 0.597619 0.763070 -0.609457 -0.377716 1.0
1 0.910404 -5.839191 7.151532 -12.81676 7.031115 -9.651272 -2.938427 -11.543207 4.843626 -3.494276 ... 2.462056 1.054865 0.530481 0.47267 -0.275998 0.282435 0.104886 0.254417 0.910404 1.0

2 rows × 31 columns

sigma = torch.exp(logvar/2)
mu.mean(axis=0), sigma.mean(axis=0)
(tensor([0.0001, 0.0163, 0.0400]), tensor([0.9976, 0.0370, 0.0381]))
# sample z from q
no_samples = 20
q = torch.distributions.Normal(mu.mean(axis=0), sigma.mean(axis=0))
z = q.rsample(sample_shape=torch.Size([no_samples]))
with torch.no_grad():
    pred = model.decode(z).cpu().numpy()
df_fake = pd.DataFrame(pred)
df_fake['Class'] = 1          # label every generated row as fraud
df_fake.columns = cols        # reuse the original column names
# Undo the standardization so Time and Amount are back on their original scale
df_fake['Time'] = (df_fake['Time'] * std_time) + mean_time
df_fake['Amount'] = (df_fake['Amount'] * std_amount) + mean_amount
df_fake.head()
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 ... V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
0 -1.014143 1.505616 -4.616234 7.718655 -0.977422 8.594662 -3.198405 -6.944025 -5.043085 2.561653 ... 1.094700 0.510489 -1.254657 -0.085117 0.283567 -0.268765 3.025049 0.929408 -79.125496 1
1 -1.810440 -13.005595 1.212420 5.370727 2.069537 -1.141557 -3.816671 -6.958980 4.140651 -1.208175 ... 0.902933 -0.573067 1.209823 0.543091 0.666637 -0.524895 0.204588 -0.074243 -380.632935 1
2 -1.152523 12.006341 -3.014931 4.485871 -1.155190 10.059814 -3.355832 -8.342437 -8.336978 2.741910 ... -0.101801 1.417866 -2.335097 0.034988 -0.466923 -0.012957 2.653872 1.081970 -163.960175 1
3 0.228914 -5.935965 -1.644437 -6.354884 7.788726 -0.055751 -1.726003 0.577209 1.638260 -5.880371 ... -5.350942 2.994604 -0.079382 -1.020990 -0.090167 0.395981 -1.590370 -1.090804 9.417862 1
4 0.180823 -3.444491 4.722339 -4.571048 4.998073 -4.543203 -0.816252 -5.482205 3.643872 -4.685173 ... -1.748235 1.525022 0.258438 -0.465014 0.064509 0.277528 1.127516 0.161839 171.483337 1

5 rows × 31 columns

df_fake['Amount'].mean()
121.77293
df.groupby('Class').mean()['Amount']
Class
0     88.291022
1    122.211321
Name: Amount, dtype: float64
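
The mean Amount of the generated rows is close to that of the real fraud cases. To compare more than just Amount, here is a quick sketch that puts the per-feature means of real and generated fraud cases side by side (restricted to the V columns, since Time and Amount of df_fake were de-normalized above while df_base is still standardized):

v_cols = [c for c in cols if c.startswith('V')]
pd.DataFrame({
    'real_fraud': df_base.loc[df_base['Class'] == 1, v_cols].mean(),
    'generated': df_fake[v_cols].mean(),
})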

Use fake data for oversampling in RandomForest

y_train.value_counts()
0    199029
1       335
Name: Class, dtype: int64

So let's generate about 190,000 fake fraud cases:

no_samples = 190_000
q = torch.distributions.Normal(mu.mean(axis=0), sigma.mean(axis=0))
z = q.rsample(sample_shape=torch.Size([no_samples]))
with torch.no_grad():
    pred = model.decode(z).cpu().numpy()

Concatenate them with our X_train:

X_train_augmented = np.vstack((X_train.values, pred))
y_train_augmented = np.append(y_train.values, np.repeat(1,no_samples))
X_train_augmented.shape
(389364, 30)

We now have roughly as many fraud cases as we have non-fraud cases.
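
A quick sanity check of the new class balance (just a sketch):

np.bincount(y_train_augmented)   # 199,029 non-fraud vs. 190,335 fraud rows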

Train RandomForest

def rf_aug(xs, y, n_estimators=40, max_samples=500,
       max_features=0.5, min_samples_leaf=5, **kwargs):
    # Same forest as the baseline, but without class_weight:
    # the augmented training set is already roughly balanced.
    return RandomForestClassifier(n_jobs=-1, n_estimators=n_estimators,
        max_samples=max_samples, max_features=max_features,
        min_samples_leaf=min_samples_leaf, oob_score=True).fit(xs, y)
m_aug = rf_aug(X_train_augmented, y_train_augmented)
confusion_matrix(y_test, np.round(m_aug.predict(X_test)))
array([[84963,   323],
       [   30,   127]], dtype=int64)
confusion_matrix(y_test, np.round(m.predict(X_test)))
array([[85278,     8],
       [  118,    39]], dtype=int64)

Look at that! We managed to find 127 out of 157 fraud cases! If the goal is to detect as many fraud cases as possible, this is a big improvement over the baseline, although it comes at the cost of more false positives (323 instead of 8). Keep this technique in mind the next time you're dealing with a highly imbalanced dataset.
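
To put numbers on that trade-off, you could also print precision and recall for both models. A small sketch (classification_report is an extra import):

from sklearn.metrics import classification_report
print(classification_report(y_test, m_aug.predict(X_test), digits=3))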