How to create fake tabular data with a variational autoencoder to improve deep learning algorithms

To train deep learning models, the more data the better. For image data, the deep learning community has come up with many tricks for getting more out of a given dataset of images: by rotating, flipping, blurring etc. the images, we can create more input data and thereby improve our model.

However, for tabular data only a few such techniques exist. In this blog post I want to show you how to build a variational autoencoder and use it for data augmentation. I will create fake data, which is sampled from the learned distribution of the underlying data.

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import nn, optim
from torch.autograd import Variable

import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cpu')

Define path to dataset

# Relative path of the CSV file that is read by the cells below.
DATA_PATH = 'data/wine.csv'

Dataset Overview

# Load the raw (unscaled) CSV for inspection and preview its first rows.
df_base = pd.read_csv(DATA_PATH, sep=',')
df_base.head()

# Column labels; reused later to label the reconstructed and generated rows.
cols = df_base.columns

Build Data Loader

def load_and_standardize_data(path):
    """Read a CSV file, split it 70/30 and standardize both parts.

    Missing values are replaced by the sentinel -99 first, then the table is
    converted to a 2-D float32 array. The scaler is fitted on the training
    part only and afterwards applied to both parts.

    Args:
        path: Location of the comma-separated input file.

    Returns:
        A tuple ``(X_train, X_test, scaler)`` with the two standardized
        arrays and the fitted ``StandardScaler``.
    """
    frame = pd.read_csv(path, sep=',').fillna(-99)
    n_columns = frame.shape[1]
    values = frame.values.reshape(-1, n_columns).astype('float32')

    # Fixed seed so that repeated calls produce the same split.
    train_part, test_part = train_test_split(
        values, test_size=0.3, random_state=42
    )

    scaler = preprocessing.StandardScaler()
    scaler.fit(train_part)
    return scaler.transform(train_part), scaler.transform(test_part), scaler

from torch.utils.data import Dataset, DataLoader
class DataBuilder(Dataset):
    """Dataset over the standardized train or test part of a CSV file.

    Attributes set by ``__init__``:
        x: Tensor holding the selected (train or test) rows.
        len: Number of rows in ``x``.
        standardizer: Scaler returned by ``load_and_standardize_data``; kept
            so that rows can be mapped back to their original units.
    """

    def __init__(self, path, train=True):
        """Load ``path`` and keep either the training or the test rows.

        Args:
            path: CSV file handed to ``load_and_standardize_data``.
            train: If true keep the training part, otherwise the test part.
        """
        # Honour the caller's path: the global DATA_PATH used to be read here
        # no matter which path was passed in.
        X_train, X_test, self.standardizer = load_and_standardize_data(path)
        self.x = torch.from_numpy(X_train if train else X_test)
        self.len = self.x.shape[0]

    def __getitem__(self, index):
        """Return the row (or rows) of ``x`` selected by ``index``."""
        return self.x[index]

    def __len__(self):
        """Return the number of rows in this dataset."""
        return self.len

# Two views of the same file: the training rows and the held-out test rows.
traindata_set = DataBuilder(DATA_PATH, train=True)
testdata_set = DataBuilder(DATA_PATH, train=False)

# No shuffling is requested, so rows are served in dataset order.
trainloader = DataLoader(traindata_set, batch_size=1024)
testloader = DataLoader(testdata_set, batch_size=1024)

type(trainloader.dataset.x), type(testloader.dataset.x)

(torch.Tensor, torch.Tensor)

trainloader.dataset.x.shape, testloader.dataset.x.shape

(torch.Size([124, 14]), torch.Size([54, 14]))

trainloader.dataset.x

tensor([[ 1.3598,  0.6284,  1.0812,  ..., -0.6414, -1.0709, -0.5182],
        [ 0.0628, -0.5409, -0.6130,  ...,  0.3465,  1.3308, -0.2151],
        [ 0.0628, -0.7557, -1.2870,  ...,  0.4324, -0.3984,  0.0420],
        ...,
        [-1.2343,  1.6904, -0.4855,  ...,  1.0338,  0.5485,  2.6682],
        [ 0.0628, -0.3261, -0.7952,  ...,  0.0029, -0.7415, -0.7983],
        [ 0.0628, -0.7437,  0.0428,  ..., -0.6843,  1.0700, -0.9861]])

Build model

class Autoencoder(nn.Module):
    """Variational autoencoder made of Linear -> BatchNorm1d -> ReLU stages.

    The encoder maps ``D_in`` features through widths ``H`` and ``H2`` down to
    ``latent_dim`` and emits two vectors, used as the mean and log-variance of
    the latent distribution. The decoder mirrors that path back to ``D_in``
    features; its last stage is Linear -> BatchNorm1d without an activation.

    Args:
        D_in: Number of input (and output) features.
        H: Width of the first encoder / last hidden decoder stage.
        H2: Width of the inner hidden stages.
        latent_dim: Size of the latent vector.
    """

    def __init__(self,D_in,H=50,H2=12,latent_dim=3):
        super().__init__()

        # Encoder
        self.linear1 = nn.Linear(D_in, H)
        self.lin_bn1 = nn.BatchNorm1d(num_features=H)
        self.linear2 = nn.Linear(H, H2)
        self.lin_bn2 = nn.BatchNorm1d(num_features=H2)
        self.linear3 = nn.Linear(H2, H2)
        self.lin_bn3 = nn.BatchNorm1d(num_features=H2)

        # Latent vectors mu and sigma
        self.fc1 = nn.Linear(H2, latent_dim)
        self.bn1 = nn.BatchNorm1d(num_features=latent_dim)
        self.fc21 = nn.Linear(latent_dim, latent_dim)
        self.fc22 = nn.Linear(latent_dim, latent_dim)

        # Sampling vector
        self.fc3 = nn.Linear(latent_dim, latent_dim)
        self.fc_bn3 = nn.BatchNorm1d(latent_dim)
        self.fc4 = nn.Linear(latent_dim, H2)
        self.fc_bn4 = nn.BatchNorm1d(H2)

        # Decoder
        self.linear4 = nn.Linear(H2, H2)
        self.lin_bn4 = nn.BatchNorm1d(num_features=H2)
        self.linear5 = nn.Linear(H2, H)
        self.lin_bn5 = nn.BatchNorm1d(num_features=H)
        self.linear6 = nn.Linear(H, D_in)
        self.lin_bn6 = nn.BatchNorm1d(num_features=D_in)

        self.relu = nn.ReLU()

    def encode(self, x):
        """Return ``(mu, logvar)`` of the latent distribution for batch ``x``."""
        hidden = self.relu(self.lin_bn1(self.linear1(x)))
        hidden = self.relu(self.lin_bn2(self.linear2(hidden)))
        hidden = self.relu(self.lin_bn3(self.linear3(hidden)))

        bottleneck = self.relu(self.bn1(self.fc1(hidden)))

        # Two separate heads on the same bottleneck: mean and log-variance.
        return self.fc21(bottleneck), self.fc22(bottleneck)

    def reparameterize(self, mu, logvar):
        """Draw ``z = mu + eps * std`` while training; return ``mu`` otherwise.

        The noise ``eps`` is sampled independently of the parameters, so
        gradients still reach ``mu`` and ``logvar`` through ``z``.
        """
        if not self.training:
            return mu
        std = torch.exp(0.5 * logvar)
        # randn_like replaces the deprecated Variable(std.data.new(...).normal_()).
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z):
        """Map latent vectors ``z`` back to ``D_in`` features."""
        hidden = self.relu(self.fc_bn3(self.fc3(z)))
        hidden = self.relu(self.fc_bn4(self.fc4(hidden)))

        hidden = self.relu(self.lin_bn4(self.linear4(hidden)))
        hidden = self.relu(self.lin_bn5(self.linear5(hidden)))
        # No activation after the last stage.
        return self.lin_bn6(self.linear6(hidden))

    def forward(self, x):
        """Return ``(reconstruction, mu, logvar)`` for the batch ``x``."""
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

class customLoss(nn.Module):
    """VAE objective: summed squared reconstruction error plus KL divergence.

    The KL term is the closed form for a diagonal Gaussian with mean ``mu``
    and log-variance ``logvar`` against a standard normal, summed over all
    elements.
    """

    def __init__(self):
        super().__init__()
        # Sum (not mean) so the reconstruction term is on the same scale as
        # the summed KL term.
        self.mse_loss = nn.MSELoss(reduction="sum")

    def forward(self, x_recon, x, mu, logvar):
        """Return the scalar loss for one batch."""
        reconstruction = self.mse_loss(x_recon, x)
        kl_terms = 1 + logvar - mu.pow(2) - logvar.exp()
        divergence = -0.5 * torch.sum(kl_terms)
        return reconstruction + divergence

If you want to better understand the variational autoencoder technique, look here.

To better understand this Autoencoder class, let me go through it briefly. This is a variational autoencoder (VAE) with two hidden layers, which by default (but you can change this) have 50 and then 12 activations. The number of latent factors is set to 3 (you can change that, too). So we're first expanding our initial 14 variables to 50 activations, then condensing them to 12, then to 3. From these 3 latent factors we then sample to recreate the original 14 values. We do that by inflating the 3 latent factors back to 12, then 50 and finally 14 activations (we decode the latent factors, so to speak). We compare this reconstructed batch (recon_batch) with the original batch, compute our loss and adjust the weights and biases via the gradient (our optimizer here will be Adam).

# Input width = number of columns of the training tensor.
# (`data_set` was never defined; the training dataset is `traindata_set`.)
D_in = traindata_set.x.shape[1]
H = 50
H2 = 12
model = Autoencoder(D_in, H, H2).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Despite the name this is the full VAE loss (reconstruction + KL term).
loss_mse = customLoss()

Train Model

# Number of train/test passes run by the epoch loop below.
epochs = 1500
# NOTE(review): log_interval and val_losses are not referenced by any code
# visible in this file.
log_interval = 50
val_losses = []
# Loss histories appended to by train() and test().
train_losses = []
test_losses = []

def train(epoch, log_every=200):
    """Run one optimisation pass over ``trainloader``.

    The per-sample average loss of the epoch is appended to ``train_losses``
    on every call, so the history has one entry per epoch.

    Args:
        epoch: Epoch number, used only to decide when to print.
        log_every: Print the average loss whenever ``epoch`` is a multiple of
            this value.

    Returns:
        The summed batch losses divided by the number of training samples.
    """
    model.train()
    train_loss = 0.0
    for data in trainloader:
        data = data.to(device)
        optimizer.zero_grad()
        recon_batch, mu, logvar = model(data)
        loss = loss_mse(recon_batch, data, mu, logvar)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # The loss is a sum over samples, so divide by the sample count rather
    # than by the number of batches.
    avg_loss = train_loss / len(trainloader.dataset)
    train_losses.append(avg_loss)
    if epoch % log_every == 0:
        print('====> Epoch: {} Average training loss: {:.4f}'.format(
            epoch, avg_loss))
    return avg_loss

def test(epoch, log_every=200):
    """Evaluate the model on ``testloader`` without updating any weights.

    The model is switched to evaluation mode, so BatchNorm layers use their
    running statistics and ``reparameterize`` returns the latent mean instead
    of a random sample. The per-sample average loss is appended to
    ``test_losses`` once per call.

    Args:
        epoch: Epoch number, used only to decide when to print.
        log_every: Print the average loss whenever ``epoch`` is a multiple of
            this value.

    Returns:
        The summed batch losses divided by the number of test samples.
    """
    model.eval()
    test_loss = 0.0
    with torch.no_grad():
        for data in testloader:
            data = data.to(device)
            recon_batch, mu, logvar = model(data)
            test_loss += loss_mse(recon_batch, data, mu, logvar).item()

    # Report and record after the loop so that a multi-batch loader yields one
    # figure per epoch instead of one partial figure per batch.
    avg_loss = test_loss / len(testloader.dataset)
    test_losses.append(avg_loss)
    if epoch % log_every == 0:
        print('====> Epoch: {} Average test loss: {:.4f}'.format(
            epoch, avg_loss))
    return avg_loss

# One training pass followed by one evaluation pass per epoch; epochs are
# numbered starting at 1.
for epoch in range(1, epochs + 1):
    train(epoch)
    test(epoch)

====> Epoch: 200 Average training loss: 12.3501
====> Epoch: 200 Average test loss: 11.7777
====> Epoch: 400 Average training loss: 10.1168
====> Epoch: 400 Average test loss: 8.9987
====> Epoch: 600 Average training loss: 9.2956
====> Epoch: 600 Average test loss: 9.3548
====> Epoch: 800 Average training loss: 8.9570
====> Epoch: 800 Average test loss: 8.9647
====> Epoch: 1000 Average training loss: 8.6688
====> Epoch: 1000 Average test loss: 8.5866
====> Epoch: 1200 Average training loss: 8.3341
====> Epoch: 1200 Average test loss: 8.8371
====> Epoch: 1400 Average training loss: 8.4063
====> Epoch: 1400 Average test loss: 8.7891

We were able to reduce the training and test loss quite a bit. Let's have a look at what the fake results actually look like compared with the real ones:

# Evaluation mode: BatchNorm uses its running statistics and the encoder's
# mean is decoded directly instead of a noisy sample.
model.eval()
with torch.no_grad():
    # After the loop, recon_batch / mu / logvar belong to the LAST batch
    # served by testloader.
    for batch_idx, data in enumerate(testloader):
        data = data.to(device)
        recon_batch, mu, logvar = model(data)

scaler = trainloader.dataset.standardizer
# inverse_transform expects a 2-D (n_samples, n_features) array, so slice one
# row with [:1] to keep the leading axis and take [0] of the result.
recon_row = scaler.inverse_transform(recon_batch[:1].cpu().numpy())[0]
real_row = scaler.inverse_transform(testloader.dataset.x[:1].cpu().numpy())[0]

# Row 0: reconstruction, row 1: the real record.
# NOTE(review): both refer to the same record only while the test set fits
# into a single batch.
df = pd.DataFrame(np.stack((recon_row, real_row)), columns = cols)
df

Not too bad, right (the first row is the reconstructed row, the second one the real row from the data)? However, what we want is to build this row without the real input, so to speak: right now we are giving the model the complete rows with their 14 columns, condensing them to 3 latent factors, just to blow them up again to the corresponding 14 columns. What I want to do is to create these 14 columns by giving the model only 3 latent factors as input. Let's have a look at these latent variables.

# The model's reparameterize() uses exp of half of logvar as the standard
# deviation, i.e. logvar is a log-variance; the same conversion is applied here.
sigma = torch.exp(logvar/2)

mu[1], sigma[1]

(tensor([-0.9960, -0.8502, -0.0043]), tensor([0.2555, 0.4801, 0.9888]))

Mu represents the mean of each of our latent factors, logvar the log of the variance (which is why sigma is computed above as exp(logvar/2)). Each of these has a distribution of its own. We have 54 cases in our test data, so we have 54x3 different values of mu and logvar. We can have a look at the distribution of each of the 3 latent variables:

mu.mean(axis=0), sigma.mean(axis=0)

(tensor([-0.0088,  0.0051,  0.0044]), tensor([0.4514, 0.3897, 0.9986]))

All of the latent variables have a mean around zero, but the last latent factor has a larger standard deviation. So when we sample values from each of these latent variables, the last value will vary much more than the other two. I assume a normal distribution for all the latent factors.

# sample z from q
# q is a single Normal per latent dimension, parameterised by mu and sigma
# averaged over the rows of the last evaluated batch.
no_samples = 20
q = torch.distributions.Normal(mu.mean(axis=0), sigma.mean(axis=0))
# One row of latent factors per requested sample.
z = q.rsample(sample_shape=torch.Size([no_samples]))

z.shape

torch.Size([20, 3])

z[:5]

tensor([[ 0.5283,  0.4519,  0.6792],
        [ 0.3664, -0.5569, -0.1531],
        [-0.5802,  0.4394,  1.8406],
        [-1.0136, -0.4239,  0.4524],
        [-0.0605,  0.3913,  0.8030]])

With these three latent factors we can now start and create fake data for our dataset and see how it looks like:

# Decode in evaluation mode: in training mode the BatchNorm layers would
# normalise with the statistics of the sampled batch itself and would also
# overwrite their running statistics.
model.eval()
with torch.no_grad():
    pred = model.decode(z).cpu().numpy()

pred[1]

array([-0.24290268, -0.6087041 , -0.44325534, -0.7158908 , -0.15065292,
       -0.47845733,  0.26319185,  0.23732403, -0.22809544,  0.12187037,
       -0.8295655 ,  0.44908378,  0.6173717 , -0.55648965], dtype=float32)

Create fake data from Autoencoder

# Undo the standardization so the generated rows are in the original units.
fake_data = scaler.inverse_transform(pred)
fake_data.shape

(20, 14)

df_fake = pd.DataFrame(fake_data, columns=cols)
# The decoder emits a continuous value for the class column: round it to an
# integer and raise anything below 1 to 1 (no upper bound is enforced).
df_fake['Wine'] = df_fake['Wine'].round().astype(int).clip(lower=1)
df_fake.head(10)

For comparison the real data:

df_base.sample(10)

Compare variables grouped by Wine

df_base.groupby('Wine').mean()

df_fake.groupby('Wine').mean()

That looks pretty convincing if you ask me.

To sum up, we've built a variational autoencoder, which we trained on our training set. We checked whether our loss kept improving on the test set, which the autoencoder never saw during training. We then calculated the mean and standard deviation of our latent factors given the test data. Finally, we sampled from this distribution and fed the samples into our decoder to create some fake data. With this approach I am now able to create as much fake data derived from the underlying distribution as I want. And I think the results look promising.

You can use this approach, for example, to create data for under-represented classes in highly skewed datasets instead of just weighting them higher. The re-weighting approach might cause the algorithm to find relations where there are none, only because a few now over-represented data points share this relation by chance. With the approach shown here, the learned distribution takes into account the high variance these features have and will therefore hopefully help the algorithm not to draw these false conclusions.

Stay tuned for the next blogpost, where I will show the shown approach in exactly this use case.

	Wine	Alcohol	Malic.acid	Ash	Acl	Mg	Phenols	Flavanoids	Nonflavanoid.phenols	Proanth	Color.int	Hue	OD	Proline
0	1	14.23	1.71	2.43	15.6	127	2.80	3.06	0.28	2.29	5.64	1.04	3.92	1065
1	1	13.20	1.78	2.14	11.2	100	2.65	2.76	0.26	1.28	4.38	1.05	3.40	1050
2	1	13.16	2.36	2.67	18.6	101	2.80	3.24	0.30	2.81	5.68	1.03	3.17	1185
3	1	14.37	1.95	2.50	16.8	113	3.85	3.49	0.24	2.18	7.80	0.86	3.45	1480
4	1	13.24	2.59	2.87	21.0	118	2.80	2.69	0.39	1.82	4.32	1.04	2.93	735

	Wine	Alcohol	Malic.acid	Ash	Acl	Mg	Phenols	Flavanoids	Nonflavanoid.phenols	Proanth	Color.int	Hue	OD	Proline
0	1.002792	13.535107	2.010303	2.557292	18.198132	112.606842	2.737524	2.807587	0.320866	1.738254	4.899318	1.078039	3.187276	1013.391479
1	1.000000	13.640000	3.100000	2.560000	15.200000	116.000000	2.700000	3.030000	0.170000	1.660000	5.100000	0.960000	3.360000	845.000000

	Wine	Alcohol	Malic.acid	Ash	Acl	Mg	Phenols	Flavanoids	Nonflavanoid.phenols	Proanth	Color.int	Hue	OD	Proline
0	3	13.350755	3.817283	2.425754	21.229387	98.816788	1.682916	0.910786	0.450081	1.245882	8.242197	0.667928	1.705379	636.650818
1	2	12.453159	1.916350	2.172731	18.977226	93.556114	2.444676	2.246270	0.335432	1.663583	3.166457	1.063876	3.050176	568.385925
2	2	12.735057	2.404566	2.447556	20.400013	105.475235	1.937112	1.657119	0.385740	1.452577	4.242754	0.928397	2.467263	680.271545
3	1	14.664644	1.517465	2.269279	12.428186	88.851791	3.354010	3.997237	0.265253	2.586414	7.366968	1.275564	3.170231	1516.662720
4	3	13.160161	3.359397	2.415784	21.050211	99.859154	1.662516	0.929189	0.427978	1.135361	7.101127	0.708510	1.732820	640.412231
5	2	12.453159	1.916350	2.172731	18.977226	93.556114	2.444676	2.246270	0.335432	1.663583	3.166457	1.063876	3.050176	568.385925
6	2	12.520310	2.522696	2.375254	20.435560	92.619812	1.838333	1.361269	0.470815	1.221076	4.518130	0.906680	2.146883	583.079102
7	3	12.877177	2.746192	2.395865	20.154610	97.263092	1.744550	1.187050	0.464942	1.160733	5.619783	0.836708	1.871472	665.485718
8	2	12.679532	2.344776	2.331834	19.901327	97.031586	1.857117	1.495742	0.461352	1.239715	4.668478	0.934352	2.094139	680.778809
9	2	13.062141	2.719065	2.461590	19.947014	103.352890	2.070540	1.566055	0.380154	1.293219	5.675068	0.852832	2.128047	778.582825

	Wine	Alcohol	Malic.acid	Ash	Acl	Mg	Phenols	Flavanoids	Nonflavanoid.phenols	Proanth	Color.int	Hue	OD	Proline
1	1	13.20	1.78	2.14	11.2	100	2.65	2.76	0.26	1.28	4.38	1.05	3.40	1050
35	1	13.48	1.81	2.41	20.5	100	2.70	2.98	0.26	1.86	5.10	1.04	3.47	920
114	2	12.08	1.39	2.50	22.5	84	2.56	2.29	0.43	1.04	2.90	0.93	3.19	385
149	3	13.08	3.90	2.36	21.5	113	1.41	1.39	0.34	1.14	9.40	0.57	1.33	550
158	3	14.34	1.68	2.70	25.0	98	2.80	1.31	0.53	2.70	13.00	0.57	1.96	660
9	1	13.86	1.35	2.27	16.0	98	2.98	3.15	0.22	1.85	7.22	1.01	3.55	1045
90	2	12.08	1.83	2.32	18.5	81	1.60	1.50	0.52	1.64	2.40	1.08	2.27	480
47	1	13.90	1.68	2.12	16.0	101	3.10	3.39	0.21	2.14	6.10	0.91	3.33	985
10	1	14.10	2.16	2.30	18.0	105	2.95	3.32	0.22	2.38	5.75	1.25	3.17	1510
31	1	13.58	1.66	2.36	19.1	106	2.86	3.19	0.22	1.95	6.90	1.09	2.88	1515

	Alcohol	Malic.acid	Ash	Acl	Mg	Phenols	Flavanoids	Nonflavanoid.phenols	Proanth	Color.int	Hue	OD	Proline
Wine
1	13.744746	2.010678	2.455593	17.037288	106.338983	2.840169	2.982373	0.290000	1.899322	5.528305	1.062034	3.157797	1115.711864
2	12.278732	1.932676	2.244789	20.238028	94.549296	2.258873	2.080845	0.363662	1.630282	3.086620	1.056282	2.785352	519.507042
3	13.153750	3.333750	2.437083	21.416667	99.312500	1.678750	0.781458	0.447500	1.153542	7.396250	0.682708	1.683542	629.895833

	Alcohol	Malic.acid	Ash	Acl	Mg	Phenols	Flavanoids	Nonflavanoid.phenols	Proanth	Color.int	Hue	OD	Proline
Wine
1	13.812141	1.814212	2.482638	17.172688	107.468864	3.062387	3.344664	0.259955	2.162966	5.331643	1.147217	3.280716	1148.031372
2	12.560544	2.157595	2.301805	19.696327	99.324005	2.254415	1.995140	0.366076	1.575015	3.791955	1.000527	2.741598	629.895203
3	13.170316	3.413856	2.416369	20.929930	99.028229	1.683604	0.964315	0.443444	1.176529	7.288512	0.718357	1.745200	644.870056