Data Augmentation for Tabular Data on an Imbalanced Dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import mlprepare as mlp
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device
DATA_PATH = 'data/creditcard.csv'
df = pd.read_csv(DATA_PATH, sep=',')
df.head()
df_base = df.copy()
cols = df_base.columns
We need to normalize the Time and Amount columns:
mean_time=df_base['Time'].mean()
mean_amount=df_base['Amount'].mean()
std_time=df_base['Time'].std()
std_amount=df_base['Amount'].std()
df_base['Time']=(df_base['Time']-mean_time)/std_time
df_base['Amount']=(df_base['Amount']-mean_amount)/std_amount
Class=1 means the transaction was indeed a fraud case, Class=0 means no fraud. This dataset is highly imbalanced:
df_base['Class'].value_counts()
I want to create synthetic data based on the 492 fraud cases and then use it to improve the model. Let's first train a simple RandomForest as a baseline.
X_train, X_test, y_train, y_test = mlp.split_df(df_base, dep_var='Class', test_size=0.3, split_mode='random')
y_test.value_counts()
#Ratio of the two classes:
y_test.value_counts()[0]/y_test.value_counts()[1]
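The class_weight of {0: 1, 1: 543} in the RandomForest below presumably mirrors this imbalance ratio. A minimal sketch of how such a weight could be derived from the training labels (fraud_weight is my own name, not from the original):
# Hypothetical helper: inverse class ratio as a weight for the minority class
fraud_weight = int(round((y_train == 0).sum() / (y_train == 1).sum()))
fraud_weight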
def rf(xs, y, n_estimators=40, max_samples=500,
       max_features=0.5, min_samples_leaf=5, **kwargs):
    # class_weight compensates for the imbalance by up-weighting the fraud class
    return RandomForestClassifier(n_jobs=-1, n_estimators=n_estimators,
                                  max_samples=max_samples, max_features=max_features,
                                  min_samples_leaf=min_samples_leaf, oob_score=True,
                                  class_weight={0: 1, 1: 543}).fit(xs, y)
m = rf(X_train, y_train)
confusion_matrix(y_test, np.round(m.predict(X_test)))
With this baseline we catch about 39 of the 157 fraud cases in the test set, although the results vary quite a lot from run to run!
We only want the data points where y_train / y_test = 1, i.e. the fraud cases:
X_train_fraud = X_train.iloc[np.where(y_train==1)[0]]
X_test_fraud = X_test.iloc[np.where(y_test==1)[0]]
Let's build DataLoaders for our data, while keeping the pre-defined training/test split as it is.
from torch.utils.data import Dataset, DataLoader
class DataBuilder(Dataset):
    # Wraps the fraud-only feature matrix as a float tensor dataset
    def __init__(self, dataset):
        self.x = torch.from_numpy(dataset.values).to(torch.float)
        self.len = self.x.shape[0]

    def __getitem__(self, index):
        return self.x[index]

    def __len__(self):
        return self.len
traindata_set=DataBuilder(X_train_fraud)
testdata_set=DataBuilder(X_test_fraud)
trainloader=DataLoader(dataset=traindata_set,batch_size=1024)
testloader=DataLoader(dataset=testdata_set,batch_size=1024)
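A quick sanity check that the loader yields what we expect (a minimal sketch): with batch_size=1024 the few hundred training fraud rows fit into a single batch.
batch = next(iter(trainloader))
batch.shape, batch.dtype  # roughly (number of training fraud cases, 30), torch.float32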
Define the Variational Autoencoder (for more information check out my earlier blogpost).
class Autoencoder(nn.Module):
    def __init__(self, D_in, H=50, H2=12, latent_dim=3):
        super(Autoencoder, self).__init__()
        # Encoder
        self.linear1 = nn.Linear(D_in, H)
        self.lin_bn1 = nn.BatchNorm1d(num_features=H)
        self.linear2 = nn.Linear(H, H2)
        self.lin_bn2 = nn.BatchNorm1d(num_features=H2)
        self.linear3 = nn.Linear(H2, H2)
        self.lin_bn3 = nn.BatchNorm1d(num_features=H2)

        # Latent vectors mu and sigma
        self.fc1 = nn.Linear(H2, latent_dim)
        self.bn1 = nn.BatchNorm1d(num_features=latent_dim)
        self.fc21 = nn.Linear(latent_dim, latent_dim)  # mu
        self.fc22 = nn.Linear(latent_dim, latent_dim)  # logvar

        # Sampling vector
        self.fc3 = nn.Linear(latent_dim, latent_dim)
        self.fc_bn3 = nn.BatchNorm1d(latent_dim)
        self.fc4 = nn.Linear(latent_dim, H2)
        self.fc_bn4 = nn.BatchNorm1d(H2)

        # Decoder
        self.linear4 = nn.Linear(H2, H2)
        self.lin_bn4 = nn.BatchNorm1d(num_features=H2)
        self.linear5 = nn.Linear(H2, H)
        self.lin_bn5 = nn.BatchNorm1d(num_features=H)
        self.linear6 = nn.Linear(H, D_in)
        self.lin_bn6 = nn.BatchNorm1d(num_features=D_in)

        self.relu = nn.ReLU()

    def encode(self, x):
        lin1 = self.relu(self.lin_bn1(self.linear1(x)))
        lin2 = self.relu(self.lin_bn2(self.linear2(lin1)))
        lin3 = self.relu(self.lin_bn3(self.linear3(lin2)))
        fc1 = F.relu(self.bn1(self.fc1(lin3)))
        r1 = self.fc21(fc1)  # mu
        r2 = self.fc22(fc1)  # logvar
        return r1, r2

    def reparameterize(self, mu, logvar):
        if self.training:
            # Reparameterization trick: z = mu + sigma * eps with eps ~ N(0, 1)
            std = torch.exp(0.5 * logvar)
            eps = torch.randn_like(std)
            return mu + eps * std
        else:
            return mu

    def decode(self, z):
        fc3 = self.relu(self.fc_bn3(self.fc3(z)))
        fc4 = self.relu(self.fc_bn4(self.fc4(fc3)))
        lin4 = self.relu(self.lin_bn4(self.linear4(fc4)))
        lin5 = self.relu(self.lin_bn5(self.linear5(lin4)))
        return self.lin_bn6(self.linear6(lin5))

    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar
class customLoss(nn.Module):
    def __init__(self):
        super(customLoss, self).__init__()
        self.mse_loss = nn.MSELoss(reduction="sum")

    # x_recon is the recon_batch produced in the model's forward pass,
    # x is the original batch, mu and logvar are the latent parameters
    def forward(self, x_recon, x, mu, logvar):
        loss_MSE = self.mse_loss(x_recon, x)
        # Closed-form KL divergence between N(mu, sigma^2) and the standard normal prior
        loss_KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
        return loss_MSE + loss_KLD
D_in = traindata_set.x.shape[1]
H = 50
H2 = 12
model = Autoencoder(D_in, H, H2).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
loss_mse = customLoss()
log_interval = 50
val_losses = []
train_losses = []
test_losses = []
def train(epoch):
    model.train()
    train_loss = 0
    for batch_idx, data in enumerate(trainloader):
        data = data.to(device)
        optimizer.zero_grad()
        recon_batch, mu, logvar = model(data)
        loss = loss_mse(recon_batch, data, mu, logvar)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
    if epoch % 200 == 0:
        print('====> Epoch: {} Average training loss: {:.4f}'.format(
            epoch, train_loss / len(trainloader.dataset)))
        train_losses.append(train_loss / len(trainloader.dataset))
def test(epoch):
    model.eval()
    with torch.no_grad():
        test_loss = 0
        for batch_idx, data in enumerate(testloader):
            data = data.to(device)
            recon_batch, mu, logvar = model(data)
            loss = loss_mse(recon_batch, data, mu, logvar)
            test_loss += loss.item()
        if epoch % 200 == 0:
            print('====> Epoch: {} Average test loss: {:.4f}'.format(
                epoch, test_loss / len(testloader.dataset)))
            test_losses.append(test_loss / len(testloader.dataset))
epochs = 1500
for epoch in range(1, epochs + 1):
    train(epoch)
    test(epoch)
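To see whether the loss is still going down, we can plot the collected losses (a minimal sketch, assuming matplotlib is installed; losses are only logged every 200 epochs):
import matplotlib.pyplot as plt

plt.plot(train_losses, label='train')
plt.plot(test_losses, label='test')
plt.xlabel('logging step (every 200 epochs)')
plt.ylabel('average loss per sample')
plt.legend()
plt.show()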
The loss is still decreasing, so let's keep training:
epochs = 2500
optimizer = optim.Adam(model.parameters(), lr=1e-3)
for epoch in range(1, epochs + 1):
    train(epoch)
    test(epoch)
epochs = 500
optimizer = optim.Adam(model.parameters(), lr=1e-3)
for epoch in range(1, epochs + 1):
    train(epoch)
    test(epoch)
Let's look at the results:
model.eval()  # make sure BatchNorm uses its running statistics
with torch.no_grad():
    for batch_idx, data in enumerate(testloader):
        data = data.to(device)
        recon_batch, mu, logvar = model(data)

# Compare the first reconstructed row with the corresponding real row
# (the ~150 fraud rows of the test set fit into a single batch of 1024, so index 0 lines up).
# We append the Class label (1) so the columns match the original dataframe.
recon_row = np.append(recon_batch[0].cpu().numpy(), [1])
real_row = np.append(testloader.dataset.x[0].cpu().numpy(), [1])
df_compare = pd.DataFrame(np.stack((recon_row, real_row)), columns=cols)
df_compare
sigma = torch.exp(logvar/2)
mu.mean(axis=0), sigma.mean(axis=0)
# Sample z from the approximate posterior q, built from the mean mu and sigma of the test batch
no_samples = 20
q = torch.distributions.Normal(mu.mean(axis=0), sigma.mean(axis=0))
z = q.rsample(sample_shape=torch.Size([no_samples]))
with torch.no_grad():
    pred = model.decode(z).cpu().numpy()
df_fake = pd.DataFrame(pred, columns=cols[:-1])  # the 30 reconstructed feature columns
df_fake['Class'] = 1                             # every generated row is a fraud case
df_fake['Time'] = (df_fake['Time']*std_time)+mean_time
df_fake['Amount'] = (df_fake['Amount']*std_amount)+mean_amount
df_fake.head()
# Compare the mean Amount of the synthetic frauds with the real per-class means
df_fake['Amount'].mean()
df.groupby('Class').mean()['Amount']
y_train.value_counts()
So let's generate about 190,000 fake fraud cases, roughly matching the number of non-fraud cases in the training set:
no_samples = 190_000
q = torch.distributions.Normal(mu.mean(axis=0), sigma.mean(axis=0))
z = q.rsample(sample_shape=torch.Size([no_samples]))
with torch.no_grad():
    pred = model.decode(z).cpu().numpy()
Concatenate the synthetic rows to our X_train:
X_train_augmented = np.vstack((X_train.values, pred))
y_train_augmented = np.append(y_train.values, np.repeat(1,no_samples))
X_train_augmented.shape
We now have roughly as many fraud cases as we have non-fraud cases.
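A quick check of the new class distribution (a sketch):
# The augmented labels should now be roughly balanced
pd.Series(y_train_augmented).value_counts()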
def rf_aug(xs, y, n_estimators=40, max_samples=500,
           max_features=0.5, min_samples_leaf=5, **kwargs):
    # Same forest as before, but without class_weight – the augmented data is already balanced
    return RandomForestClassifier(n_jobs=-1, n_estimators=n_estimators,
                                  max_samples=max_samples, max_features=max_features,
                                  min_samples_leaf=min_samples_leaf, oob_score=True).fit(xs, y)
m_aug = rf_aug(X_train_augmented, y_train_augmented)
confusion_matrix(y_test, np.round(m_aug.predict(X_test)))  # augmented model
confusion_matrix(y_test, np.round(m.predict(X_test)))      # baseline model
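To put a single number on the improvement, we can compare the recall on the fraud class for both models (a sketch using sklearn's recall_score):
from sklearn.metrics import recall_score

# Fraction of actual fraud cases each model detects
recall_score(y_test, m.predict(X_test)), recall_score(y_test, m_aug.predict(X_test))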
Look at that! We managed to find 127 out of 157 fraud cases! If our goal was to detect as many fraud cases as possible, then we clearly succeeded. Maybe you should consider this technique the next time you're dealing with a highly imbalanced dataset.