How to create fake tabular data with a variational autoencoder to improve deep learning algorithms

When training deep learning models, the more data the better. For image data, the deep learning community has come up with a lot of tricks to get more out of a given dataset of images: by rotating, flipping, blurring, etc. the images, we can create more input data and thereby improve our model.

However, when it comes to tabular data, only a few such techniques exist. In this blog post I want to show you how to create a variational autoencoder and use it for data augmentation. I will create fake data, which is sampled from the learned distribution of the underlying data.

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import nn, optim
from torch.autograd import Variable

import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device
device(type='cpu')

Define path to dataset

DATA_PATH = 'data/wine.csv'

Dataset Overview

df_base = pd.read_csv(DATA_PATH, sep=',')
df_base.head()
Wine Alcohol Malic.acid Ash Acl Mg Phenols Flavanoids Nonflavanoid.phenols Proanth Color.int Hue OD Proline
0 1 14.23 1.71 2.43 15.6 127 2.80 3.06 0.28 2.29 5.64 1.04 3.92 1065
1 1 13.20 1.78 2.14 11.2 100 2.65 2.76 0.26 1.28 4.38 1.05 3.40 1050
2 1 13.16 2.36 2.67 18.6 101 2.80 3.24 0.30 2.81 5.68 1.03 3.17 1185
3 1 14.37 1.95 2.50 16.8 113 3.85 3.49 0.24 2.18 7.80 0.86 3.45 1480
4 1 13.24 2.59 2.87 21.0 118 2.80 2.69 0.39 1.82 4.32 1.04 2.93 735
cols = df_base.columns

Build Data Loader

def load_and_standardize_data(path):
    """Read a CSV file, split it into train/test parts and standardize both.

    Missing values are replaced by -99 before the split. The scaler is
    fitted on the training part only and then applied to the test part.

    Args:
        path: Location of the comma-separated input file.

    Returns:
        A tuple ``(X_train, X_test, scaler)`` holding the two standardized
        arrays and the fitted ``StandardScaler``.
    """
    # Load the table and fill gaps with the -99 sentinel.
    frame = pd.read_csv(path, sep=',').fillna(-99)
    n_columns = frame.shape[1]
    values = frame.values.reshape(-1, n_columns).astype('float32')

    # Fixed random_state keeps the 70/30 split reproducible.
    train_part, test_part = train_test_split(
        values, test_size=0.3, random_state=42
    )

    # Fit on the training part only, then reuse the same statistics.
    standardizer = preprocessing.StandardScaler()
    train_scaled = standardizer.fit_transform(train_part)
    test_scaled = standardizer.transform(test_part)
    return train_scaled, test_scaled, standardizer
from torch.utils.data import Dataset, DataLoader
class DataBuilder(Dataset):
    """Dataset exposing either the train or the test split as a tensor.

    Args:
        path: CSV file handed to ``load_and_standardize_data``.
        train: If True serve the training split, otherwise the test split.

    Attributes set in ``__init__``:
        x: ``torch.Tensor`` holding the selected, standardized split.
        len: Number of rows in ``x``.
        standardizer: The fitted scaler returned by
            ``load_and_standardize_data``.
    """

    def __init__(self, path, train=True):
        # Honour the caller-supplied path instead of the module-level
        # DATA_PATH, so the dataset can be built from any file.
        X_train, X_test, self.standardizer = load_and_standardize_data(path)
        # Only the requested split is kept on the instance.
        self.x = torch.from_numpy(X_train if train else X_test)
        self.len = self.x.shape[0]

    def __getitem__(self, index):
        """Return the row(s) of ``x`` selected by ``index``."""
        return self.x[index]

    def __len__(self):
        """Return the number of rows in the selected split."""
        return self.len
# Build one dataset per split; both read the same CSV file.
traindata_set = DataBuilder(DATA_PATH, train=True)
testdata_set = DataBuilder(DATA_PATH, train=False)

# Wrap the datasets in loaders that yield batches of up to 1024 rows.
trainloader = DataLoader(traindata_set, batch_size=1024)
testloader = DataLoader(testdata_set, batch_size=1024)
type(trainloader.dataset.x), type(testloader.dataset.x)
(torch.Tensor, torch.Tensor)
trainloader.dataset.x.shape, testloader.dataset.x.shape
(torch.Size([124, 14]), torch.Size([54, 14]))
trainloader.dataset.x
tensor([[ 1.3598,  0.6284,  1.0812,  ..., -0.6414, -1.0709, -0.5182],
        [ 0.0628, -0.5409, -0.6130,  ...,  0.3465,  1.3308, -0.2151],
        [ 0.0628, -0.7557, -1.2870,  ...,  0.4324, -0.3984,  0.0420],
        ...,
        [-1.2343,  1.6904, -0.4855,  ...,  1.0338,  0.5485,  2.6682],
        [ 0.0628, -0.3261, -0.7952,  ...,  0.0029, -0.7415, -0.7983],
        [ 0.0628, -0.7437,  0.0428,  ..., -0.6843,  1.0700, -0.9861]])

Build model

class Autoencoder(nn.Module):
    """Variational autoencoder built from Linear + BatchNorm1d + ReLU layers.

    Args:
        D_in: Number of input (and reconstructed output) features.
        H: Width of the first encoder layer / last hidden decoder layer.
        H2: Width of the second and third encoder layers.
        latent_dim: Size of the latent vector.
    """

    def __init__(self, D_in, H=50, H2=12, latent_dim=3):
        super().__init__()

        # Encoder
        self.linear1 = nn.Linear(D_in, H)
        self.lin_bn1 = nn.BatchNorm1d(num_features=H)
        self.linear2 = nn.Linear(H, H2)
        self.lin_bn2 = nn.BatchNorm1d(num_features=H2)
        self.linear3 = nn.Linear(H2, H2)
        self.lin_bn3 = nn.BatchNorm1d(num_features=H2)

        # Latent vectors mu and sigma
        self.fc1 = nn.Linear(H2, latent_dim)
        self.bn1 = nn.BatchNorm1d(num_features=latent_dim)
        self.fc21 = nn.Linear(latent_dim, latent_dim)
        self.fc22 = nn.Linear(latent_dim, latent_dim)

        # Sampling vector
        self.fc3 = nn.Linear(latent_dim, latent_dim)
        self.fc_bn3 = nn.BatchNorm1d(latent_dim)
        self.fc4 = nn.Linear(latent_dim, H2)
        self.fc_bn4 = nn.BatchNorm1d(H2)

        # Decoder
        self.linear4 = nn.Linear(H2, H2)
        self.lin_bn4 = nn.BatchNorm1d(num_features=H2)
        self.linear5 = nn.Linear(H2, H)
        self.lin_bn5 = nn.BatchNorm1d(num_features=H)
        self.linear6 = nn.Linear(H, D_in)
        self.lin_bn6 = nn.BatchNorm1d(num_features=D_in)

        self.relu = nn.ReLU()

    def encode(self, x):
        """Map ``x`` to the two latent heads, used by ``forward`` as (mu, logvar)."""
        lin1 = self.relu(self.lin_bn1(self.linear1(x)))
        lin2 = self.relu(self.lin_bn2(self.linear2(lin1)))
        lin3 = self.relu(self.lin_bn3(self.linear3(lin2)))

        fc1 = F.relu(self.bn1(self.fc1(lin3)))

        r1 = self.fc21(fc1)
        r2 = self.fc22(fc1)

        return r1, r2

    def reparameterize(self, mu, logvar):
        """Sample a latent vector in training mode; return ``mu`` otherwise.

        In training mode the result is ``mu + eps * std`` with
        ``std = exp(logvar / 2)`` and ``eps`` drawn from a standard normal,
        which keeps the sample differentiable w.r.t. ``mu`` and ``logvar``.
        """
        if not self.training:
            return mu
        std = torch.exp(0.5 * logvar)
        # randn_like replaces the deprecated Variable(std.data.new(...).normal_())
        # idiom and matches std's shape, dtype and device.
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z):
        """Map a latent vector ``z`` back to ``D_in`` output features."""
        fc3 = self.relu(self.fc_bn3(self.fc3(z)))
        fc4 = self.relu(self.fc_bn4(self.fc4(fc3)))

        lin4 = self.relu(self.lin_bn4(self.linear4(fc4)))
        lin5 = self.relu(self.lin_bn5(self.linear5(lin4)))
        # No ReLU on the output: the final layer is only batch-normalized.
        return self.lin_bn6(self.linear6(lin5))

    def forward(self, x):
        """Return ``(reconstruction, mu, logvar)`` for the input batch ``x``."""
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar
class customLoss(nn.Module):
    """Sum of a summed squared reconstruction error and a KL-divergence term."""

    def __init__(self):
        super().__init__()
        # reduction="sum" adds up the squared errors over all elements.
        self.mse_loss = nn.MSELoss(reduction="sum")

    def forward(self, x_recon, x, mu, logvar):
        """Return the reconstruction error plus the KL term as one scalar.

        Args:
            x_recon: Reconstructed batch.
            x: Original batch.
            mu: Latent means.
            logvar: Latent log-variances.
        """
        reconstruction_term = self.mse_loss(x_recon, x)
        kl_elements = 1 + logvar - mu.pow(2) - logvar.exp()
        kl_term = -0.5 * torch.sum(kl_elements)
        return reconstruction_term + kl_term

If you want to better understand the variational autoencoder technique, look here.

To help you understand this Autoencoder class, let me go through it briefly. This is a variational autoencoder (VAE) with two hidden layers, which have (by default, but you can change this) 50 and then 12 activations. The latent factors are set to 3 (you can change that, too). So we first expand our initial 14 variables to 50 activations, then condense them to 12, and then to 3. From these 3 latent factors we then sample to recreate the original 14 values. We do that by inflating the 3 latent factors back to 12, then 50 and finally 14 activations (we decode the latent factors, so to speak). We compare this reconstructed batch (recon_batch) with the original batch, compute our loss and adjust the weights and biases via the gradient (our optimizer here will be Adam).

# Input width is the number of columns in the training tensor.
# (The original referenced an undefined name `data_set`; the dataset objects
# created earlier are `traindata_set` and `testdata_set`.)
D_in = traindata_set.x.shape[1]
H = 50
H2 = 12
model = Autoencoder(D_in, H, H2).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
loss_mse = customLoss()

Train Model

# Number of passes over the training data.
epochs = 1500
# NOTE(review): log_interval is not referenced by train()/test(), which
# hard-code a reporting interval of 200 epochs.
log_interval = 50
# NOTE(review): val_losses is never appended to in the visible code.
val_losses = []
# Loss histories filled by train() and test().
train_losses = []
test_losses = []
def train(epoch):
    """Run one optimization pass over ``trainloader``.

    Every 200th epoch the loss averaged over the training samples is
    printed and appended to ``train_losses``.

    Args:
        epoch: 1-based epoch counter, used only for reporting.
    """
    model.train()
    running_loss = 0
    for batch in trainloader:
        batch = batch.to(device)
        optimizer.zero_grad()
        reconstruction, mu, logvar = model(batch)
        batch_loss = loss_mse(reconstruction, batch, mu, logvar)
        batch_loss.backward()
        running_loss += batch_loss.item()
        optimizer.step()

    # Nothing to report outside the 200-epoch cadence.
    if epoch % 200 != 0:
        return

    mean_loss = running_loss / len(trainloader.dataset)
    print('====> Epoch: {} Average training loss: {:.4f}'.format(
        epoch, mean_loss))
    train_losses.append(mean_loss)
def test(epoch):
    """Evaluate the model on ``testloader`` without updating any weights.

    The loss averaged over the test samples is appended to ``test_losses``
    once per call and printed every 200th epoch.

    Args:
        epoch: 1-based epoch counter, used only for reporting.
    """
    # Switch to evaluation mode so BatchNorm uses its running statistics and
    # reparameterize() returns mu instead of a random sample. train() switches
    # back to training mode at the start of the next epoch.
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for data in testloader:
            data = data.to(device)
            # No optimizer.zero_grad() here: no gradients are computed.
            recon_batch, mu, logvar = model(data)
            test_loss += loss_mse(recon_batch, data, mu, logvar).item()

    # Report and record once per epoch, after all batches are accumulated,
    # rather than once per batch with a partial sum.
    avg_test_loss = test_loss / len(testloader.dataset)
    if epoch % 200 == 0:
        print('====> Epoch: {} Average test loss: {:.4f}'.format(
            epoch, avg_test_loss))
    test_losses.append(avg_test_loss)
for epoch in range(1, epochs + 1):
    train(epoch)
    test(epoch)
====> Epoch: 200 Average training loss: 12.3501
====> Epoch: 200 Average test loss: 11.7777
====> Epoch: 400 Average training loss: 10.1168
====> Epoch: 400 Average test loss: 8.9987
====> Epoch: 600 Average training loss: 9.2956
====> Epoch: 600 Average test loss: 9.3548
====> Epoch: 800 Average training loss: 8.9570
====> Epoch: 800 Average test loss: 8.9647
====> Epoch: 1000 Average training loss: 8.6688
====> Epoch: 1000 Average test loss: 8.5866
====> Epoch: 1200 Average training loss: 8.3341
====> Epoch: 1200 Average test loss: 8.8371
====> Epoch: 1400 Average training loss: 8.4063
====> Epoch: 1400 Average test loss: 8.7891

We were able to reduce the training and test loss quite a bit. Let's have a look at what the fake results actually look like compared with the real results:

# Reconstruct the test data in evaluation mode, so BatchNorm uses its running
# statistics and the latent vector is mu rather than a random sample.
model.eval()
with torch.no_grad():
    for data in testloader:
        data = data.to(device)
        # recon_batch, mu and logvar of the last batch are reused below.
        recon_batch, mu, logvar = model(data)
scaler = trainloader.dataset.standardizer
# inverse_transform is documented for 2-D input (n_samples, n_features), so
# pass a one-row slice and take the single row back out.
recon_row = scaler.inverse_transform(recon_batch[:1].cpu().numpy())[0]
real_row = scaler.inverse_transform(testloader.dataset.x[:1].cpu().numpy())[0]
df = pd.DataFrame(np.stack((recon_row, real_row)), columns=cols)
df
Wine Alcohol Malic.acid Ash Acl Mg Phenols Flavanoids Nonflavanoid.phenols Proanth Color.int Hue OD Proline
0 1.002792 13.535107 2.010303 2.557292 18.198132 112.606842 2.737524 2.807587 0.320866 1.738254 4.899318 1.078039 3.187276 1013.391479
1 1.000000 13.640000 3.100000 2.560000 15.200000 116.000000 2.700000 3.030000 0.170000 1.660000 5.100000 0.960000 3.360000 845.000000

Not too bad, right (the first row is the reconstructed row, the second one the real row from the data)? However, what we want is to build this row without the real input, so to speak: right now we were giving the model the complete rows with their 14 columns and condensing them to 3 latent factors, just to blow them up again to the corresponding 14 columns. What I want to do is to create these 14 columns by giving the model 3 latent factors as input. Let's have a look at these latent variables.

sigma = torch.exp(logvar/2)
mu[1], sigma[1]
(tensor([-0.9960, -0.8502, -0.0043]), tensor([0.2555, 0.4801, 0.9888]))

Mu represents the mean for each of our latent factor values, logvar the log of the variance (which is why sigma is computed above as exp(logvar/2)). Each of these has a distribution of its own. We have 54 cases in our test data, so we have 3x54 different mu and logvar values. We can have a look at the distribution of each of the 3 latent variables:

mu.mean(axis=0), sigma.mean(axis=0)
(tensor([-0.0088,  0.0051,  0.0044]), tensor([0.4514, 0.3897, 0.9986]))

All of the latent variables have a mean around zero, but the last latent factor has a wider standard deviation. So when we sample values from each of these latent variables, the last value will vary much more than the other two. I assume a normal distribution for all the latent factors.

# sample z from q
no_samples = 20
q = torch.distributions.Normal(mu.mean(axis=0), sigma.mean(axis=0))
z = q.rsample(sample_shape=torch.Size([no_samples]))
z.shape
torch.Size([20, 3])
z[:5]
tensor([[ 0.5283,  0.4519,  0.6792],
        [ 0.3664, -0.5569, -0.1531],
        [-0.5802,  0.4394,  1.8406],
        [-1.0136, -0.4239,  0.4524],
        [-0.0605,  0.3913,  0.8030]])

With these three latent factors we can now start to create fake data for our dataset and see what it looks like:

with torch.no_grad():
    pred = model.decode(z).cpu().numpy()
pred[1]
array([-0.24290268, -0.6087041 , -0.44325534, -0.7158908 , -0.15065292,
       -0.47845733,  0.26319185,  0.23732403, -0.22809544,  0.12187037,
       -0.8295655 ,  0.44908378,  0.6173717 , -0.55648965], dtype=float32)

Create fake data from Autoencoder

fake_data = scaler.inverse_transform(pred)
fake_data.shape
(20, 14)
df_fake = pd.DataFrame(fake_data, columns=cols)
# The decoder emits a continuous value for the class label. Round it and clamp
# it to the range of labels present in the real data, so that neither values
# below the smallest class nor values above the largest class can appear.
wine_min, wine_max = df_base['Wine'].min(), df_base['Wine'].max()
df_fake['Wine'] = np.round(df_fake['Wine']).clip(wine_min, wine_max).astype(int)
df_fake.head(10)
Wine Alcohol Malic.acid Ash Acl Mg Phenols Flavanoids Nonflavanoid.phenols Proanth Color.int Hue OD Proline
0 3 13.350755 3.817283 2.425754 21.229387 98.816788 1.682916 0.910786 0.450081 1.245882 8.242197 0.667928 1.705379 636.650818
1 2 12.453159 1.916350 2.172731 18.977226 93.556114 2.444676 2.246270 0.335432 1.663583 3.166457 1.063876 3.050176 568.385925
2 2 12.735057 2.404566 2.447556 20.400013 105.475235 1.937112 1.657119 0.385740 1.452577 4.242754 0.928397 2.467263 680.271545
3 1 14.664644 1.517465 2.269279 12.428186 88.851791 3.354010 3.997237 0.265253 2.586414 7.366968 1.275564 3.170231 1516.662720
4 3 13.160161 3.359397 2.415784 21.050211 99.859154 1.662516 0.929189 0.427978 1.135361 7.101127 0.708510 1.732820 640.412231
5 2 12.453159 1.916350 2.172731 18.977226 93.556114 2.444676 2.246270 0.335432 1.663583 3.166457 1.063876 3.050176 568.385925
6 2 12.520310 2.522696 2.375254 20.435560 92.619812 1.838333 1.361269 0.470815 1.221076 4.518130 0.906680 2.146883 583.079102
7 3 12.877177 2.746192 2.395865 20.154610 97.263092 1.744550 1.187050 0.464942 1.160733 5.619783 0.836708 1.871472 665.485718
8 2 12.679532 2.344776 2.331834 19.901327 97.031586 1.857117 1.495742 0.461352 1.239715 4.668478 0.934352 2.094139 680.778809
9 2 13.062141 2.719065 2.461590 19.947014 103.352890 2.070540 1.566055 0.380154 1.293219 5.675068 0.852832 2.128047 778.582825

For comparison the real data:

df_base.sample(10)
Wine Alcohol Malic.acid Ash Acl Mg Phenols Flavanoids Nonflavanoid.phenols Proanth Color.int Hue OD Proline
1 1 13.20 1.78 2.14 11.2 100 2.65 2.76 0.26 1.28 4.38 1.05 3.40 1050
35 1 13.48 1.81 2.41 20.5 100 2.70 2.98 0.26 1.86 5.10 1.04 3.47 920
114 2 12.08 1.39 2.50 22.5 84 2.56 2.29 0.43 1.04 2.90 0.93 3.19 385
149 3 13.08 3.90 2.36 21.5 113 1.41 1.39 0.34 1.14 9.40 0.57 1.33 550
158 3 14.34 1.68 2.70 25.0 98 2.80 1.31 0.53 2.70 13.00 0.57 1.96 660
9 1 13.86 1.35 2.27 16.0 98 2.98 3.15 0.22 1.85 7.22 1.01 3.55 1045
90 2 12.08 1.83 2.32 18.5 81 1.60 1.50 0.52 1.64 2.40 1.08 2.27 480
47 1 13.90 1.68 2.12 16.0 101 3.10 3.39 0.21 2.14 6.10 0.91 3.33 985
10 1 14.10 2.16 2.30 18.0 105 2.95 3.32 0.22 2.38 5.75 1.25 3.17 1510
31 1 13.58 1.66 2.36 19.1 106 2.86 3.19 0.22 1.95 6.90 1.09 2.88 1515

Compare variables grouped by Wine

df_base.groupby('Wine').mean()
Alcohol Malic.acid Ash Acl Mg Phenols Flavanoids Nonflavanoid.phenols Proanth Color.int Hue OD Proline
Wine
1 13.744746 2.010678 2.455593 17.037288 106.338983 2.840169 2.982373 0.290000 1.899322 5.528305 1.062034 3.157797 1115.711864
2 12.278732 1.932676 2.244789 20.238028 94.549296 2.258873 2.080845 0.363662 1.630282 3.086620 1.056282 2.785352 519.507042
3 13.153750 3.333750 2.437083 21.416667 99.312500 1.678750 0.781458 0.447500 1.153542 7.396250 0.682708 1.683542 629.895833
df_fake.groupby('Wine').mean()
Alcohol Malic.acid Ash Acl Mg Phenols Flavanoids Nonflavanoid.phenols Proanth Color.int Hue OD Proline
Wine
1 13.812141 1.814212 2.482638 17.172688 107.468864 3.062387 3.344664 0.259955 2.162966 5.331643 1.147217 3.280716 1148.031372
2 12.560544 2.157595 2.301805 19.696327 99.324005 2.254415 1.995140 0.366076 1.575015 3.791955 1.000527 2.741598 629.895203
3 13.170316 3.413856 2.416369 20.929930 99.028229 1.683604 0.964315 0.443444 1.176529 7.288512 0.718357 1.745200 644.870056

That looks pretty convincing if you ask me.

To sum up, we've built a variational autoencoder, which we trained on our training set. We checked whether our loss kept on improving based on the test set, which the autoencoder never saw for generating fake data. We then calculated the mean and standard deviation from our latent factors given the test data. We've then sampled from this distribution to feed it back into our decoder to create some fake data. With this approach I am now able to create as much fake data derived from the underlying distribution as I want. And I think the results look promising.

You can use this approach, for example, to create data for under-represented classes in highly skewed datasets instead of just weighting them higher. The re-weighting approach might cause the algorithm to find relations where there are none, only because a few then over-represented data points share this relation by chance. With the shown approach, the learned distribution would take into account the high variance these features have and will therefore hopefully help the algorithm not to draw these false conclusions.

Stay tuned for the next blog post, where I will apply this approach to exactly this use case.