from common import *
from config import *
from dataloaders import *
from transform_data import *
from model_part import *
from callbacks import *

from functools import partial
from scipy.io import arff
import matplotlib.pyplot as plt
# Read data: load both ARFF splits, then wrap the record arrays in DataFrames.
# Each loadarff result is indexed as a pair; element 0 holds the records.
data_train, data_test = map(arff.loadarff, (
    'data/LargeKitchenAppliances_TRAIN.arff',
    'data/LargeKitchenAppliances_TEST.arff',
))
df_train, df_test = (
    pd.DataFrame(records) for records, _meta in (data_train, data_test)
)

# let's add two (fake) categorical variables, sampled uniformly at random
countries = ['Germany', 'US']
household_income = ['low', 'high']
# Columns are filled one at a time, train before test, so the sequence of
# np.random.choice calls is: country/train, country/test, income/train,
# income/test.
for column, levels in (("country", countries),
                       ("household_income", household_income)):
    for frame in (df_train, df_test):
        frame[column] = np.random.choice(levels, len(frame))

df_train.head()
att1 att2 att3 att4 att5 att6 att7 att8 att9 att10 ... att714 att715 att716 att717 att718 att719 att720 target country household_income
0 -0.099108 -0.099108 -0.099108 -0.099108 -0.099108 -0.099108 -0.099108 -0.099108 -0.099108 -0.099108 ... -0.099108 -0.099108 -0.099108 -0.099108 -0.099108 -0.099108 -0.099108 b'1' US low
1 -0.155256 -0.155256 -0.155256 -0.155256 -0.155256 -0.155256 -0.155256 -0.155256 -0.155256 -0.155256 ... -0.155256 -0.155256 -0.155256 -0.155256 -0.155256 -0.155256 -0.155256 b'1' Germany high
2 -0.100082 -0.100082 -0.100082 -0.100082 -0.100082 -0.100082 -0.100082 -0.100082 -0.100082 -0.100082 ... -0.100082 -0.100082 -0.100082 -0.100082 -0.100082 -0.100082 -0.100082 b'1' Germany high
3 -0.140671 -0.140671 -0.140671 -0.140671 -0.140671 -0.140671 -0.140671 -0.140671 -0.140671 -0.140671 ... -0.140671 -0.140671 -0.140671 -0.140671 -0.140671 -0.140671 -0.140671 b'1' US high
4 -0.140576 -0.140576 -0.140576 -0.140576 -0.140576 -0.140576 -0.140576 -0.140576 -0.140576 -0.140576 ... -0.140576 -0.140576 -0.140576 -0.140576 -0.140576 -0.140576 -0.140576 b'1' Germany high

5 rows × 723 columns

df_train.shape, df_test.shape
((375, 723), (375, 723))

The first 720 columns are the time-dependent variables, the third from last is our target variable, and the last two are our (fake) categorical variables.

# Time-series part: every column except the last three, reshaped to
# (n_series, 1 channel, 720 time steps).
x_train, x_test = (
    frame.iloc[:, :-3].values.reshape(-1, 1, 720)
    for frame in (df_train, df_test)
)

# Target: the third column from the end.
y_train, y_test = (frame.iloc[:, -3].values for frame in (df_train, df_test))

# Categorical variables for the embeddings: the last two columns.
emb_vars_train, emb_vars_test = (
    frame.iloc[:, -2:].values for frame in (df_train, df_test)
)

Let's plot some of these:

df_train.iloc[0, :-3].plot.line(title=f'time series with class = {df_train.iloc[0, -3]} in {df_train.iloc[0, -2]}');
df_train.iloc[10, :-3].plot.line(title=f'time series with class = {df_train.iloc[10, -3]} in {df_train.iloc[10, -2]}');

Let's check the means and variance of our variables:

plt.plot(x_train.mean(axis=2).reshape(-1))
[<matplotlib.lines.Line2D at 0x7f2e8d70e3d0>]
plt.plot(x_train.var(axis=2).reshape(-1))
[<matplotlib.lines.Line2D at 0x7f2e8d684460>]

Note the 1e-8 at the top of the plot: we are dealing with numbers that are almost 0 for the mean and almost 1 for the variance, so we do not need to normalize the data, because it already is normalized. Still, I want to show you a neat trick of how to use broadcasting to quickly normalize each series to have a mean of zero and a variance of 1:

def normalize(x, m, s):
    """Return ``(x - m) / s``: `x` shifted by `m` and then scaled by `s`.

    The arithmetic is applied as-is, so scalars and arrays (with
    broadcasting) both work as long as they support ``-`` and ``/``.
    """
    centered = x - m
    return centered / s

means_ = x_train.mean(axis=2)
vars_ = x_train.var(axis=2)
means_.shape, vars_.shape, x_train.shape
((375, 1), (375, 1), (375, 1, 720))

Only the last dimensions does not match, therefore we can use broadcasting to normalize over all columns:

normalize(x_train, means_, vars_).shape
(375, 375, 720)

Again, in this case our data already looks as it should. Also, helper functions like normalize live in a .py file named common, which can then be imported by any other Python file or notebook, just as I did right at the top of this notebook. So whenever a function appears in this notebook that has not been defined so far, look into the common.py file.

As a next step, we need to transform the target variable and the categorical variables into something the computer can actually work with, meaning numbers. We can do this using the cat_transform function:

y_train, y_test, dict_y, dict_inv_y = cat_transform(y_train, y_test)
emb_vars_train, emb_vars_test, dict_embs, dict_inv_embs = cat_transform(emb_vars_train, emb_vars_test)
y_train[:10], emb_vars_train[:10], dict_y, dict_embs
(array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 array([[0, 1],
        [1, 0],
        [1, 0],
        [0, 0],
        [1, 0],
        [1, 0],
        [1, 1],
        [0, 0],
        [0, 0],
        [1, 1]]),
 [{0: b'2', 1: b'1', 2: b'3'}],
 [{0: 'US', 1: 'Germany'}, {0: 'high', 1: 'low'}])
dict_y, dict_embs
([{0: b'2', 1: b'1', 2: b'3'}],
 [{0: 'US', 1: 'Germany'}, {0: 'high', 1: 'low'}])

Next, we will use PyTorch to create dataset and dataloader, which we will put into a class called DataBunch (idea stolen from fastai).

device = DEVICE
datasets = create_datasets(x_train, emb_vars_train, y_train,
             x_test, emb_vars_test, y_test,
             valid_pct=VAL_SIZE, seed=1234)
data = DataBunch(*create_loaders(datasets, bs=1024))

Next, we define our model. First, we need to make sure that the final convolution ends up with 1 as the last dimension, so we can put it through a linear layer.

# define model
raw_feat = x_train.shape[1]
emb_dims = [(len(dict_embs[0]), EMB_DIMS), (len(dict_embs[1]), EMB_DIMS)]

num_classes = len(dict_y[0])

Let's grab a batch from our data and see how the convolutions work on our timeseries data:

x_raw, _, _ = next(iter(data.train_dl))
x_raw.shape
torch.Size([300, 1, 720])
raw_ni = x_train.shape[1]  # no of input features (here:1)
drop = 0.3
# Push one batch through the candidate layer stack and print the shape after
# every stage, to check that the final length collapses to 1.
output_ = x_raw
for m in (
    nn.Conv1d(raw_ni, 128, 28, 7, 0),
    nn.Conv1d(128, 32, 14, 7, 0),
    nn.Conv1d(32, 64, 5, 2, 0),
    # nn.Conv1d(64, 32, 3, 8, 0),  # disabled stage, kept for reference
    nn.MaxPool1d(2, stride=4),
):
    output_ = m(output_)
    print(output_.shape)
torch.Size([300, 128, 99])
torch.Size([300, 32, 13])
torch.Size([300, 64, 5])
torch.Size([300, 64, 1])

To easily try different kinds of architectures, I created a helper function which creates the CNN part of our model. How the complete architecture works will be part of the following tutorial, where I will show in more detail how the code works. But for now, let's see what the final architecture of the model looks like:

model architecture

output_shapes = [128, 32, 64]
kernels_shape = [28, 14, 5]
strides = [7, 7, 2]

model = Classifier_CNN(nn.Sequential(
    *get_cnn_layers(raw_feat, output_shapes, kernels_shape, strides)
    ), emb_dims, num_classes).to(device)
opt = optim.Adam(model.parameters(), lr=0.01)
loss_func = nn.CrossEntropyLoss()

This is what the CNN part of our model looks like:

model.raw
Sequential(
  (0): Sequential(
    (0): Conv1d(1, 128, kernel_size=(28,), stride=(7,))
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
  )
  (1): Sequential(
    (0): Conv1d(128, 32, kernel_size=(14,), stride=(7,))
    (1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
  )
  (2): Sequential(
    (0): Conv1d(32, 64, kernel_size=(5,), stride=(2,))
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
  )
  (3): MaxPool1d(kernel_size=2, stride=4, padding=0, dilation=1, ceil_mode=False)
  (4): Flatten()
  (5): Dropout(p=0.5, inplace=False)
  (6): Linear(in_features=64, out_features=64, bias=True)
  (7): ReLU(inplace=True)
  (8): Dropout(p=0.5, inplace=False)
  (9): Linear(in_features=64, out_features=64, bias=True)
  (10): ReLU(inplace=True)
)

And this is what the embedding part looks like:

model.embeddings
ModuleList(
  (0): Embedding(2, 5)
  (1): Embedding(2, 5)
)

A pretty neat technique of how to enhance a class with different behaviour is called a Callback. For example, we can use a function to calculate the best learning rate, and use a Callback to use our model with it. Again, how this exactly works will be covered in the next tutorial:

learn = Learner(model, opt, loss_func, data)
run = Runner(cb_funcs=[LR_Find, Recorder])
run.fit(1000, learn)
run.recorder.plot(skip_last=5)

We should take the learning rate with the steepest drop, so somewhere between 1e-3 and 1e-2.

# Build a new model instance for the real training run (the previous instance
# was the one passed to the LR_Find run above).
model = Classifier_CNN(nn.Sequential(
    *get_cnn_layers(raw_feat, output_shapes, kernels_shape, strides)
    ), emb_dims, num_classes).to(device)
# The optimizer is created from this model's own parameters.
# NOTE(review): the text above recommends a rate between 1e-3 and 1e-2, but
# 2e-2 is used here — confirm this is intended.
opt = optim.Adam(model.parameters(), lr=2e-2)
# Recorder plus an AvgStatsCallback bound to adjusted_accu as its metric.
cbfs = [Recorder, partial(AvgStatsCallback,adjusted_accu)]
learn = Learner(model, opt, loss_func, data)
run = Runner(cb_funcs=cbfs)
# Train for 50 epochs.
run.fit(50, learn)
epoch: 10
train: [1.095929463704427, tensor(0.3967)]
valid: [1.3562200927734376, tensor(0.3600)]
epoch: 20
train: [0.8562827555338541, tensor(0.5500)]
valid: [1.3453341674804689, tensor(0.3333)]
epoch: 30
train: [0.5564550272623698, tensor(0.7400)]
valid: [1.2367724609375, tensor(0.6133)]
epoch: 40
train: [0.4498468017578125, tensor(0.8100)]
valid: [1.7772005208333332, tensor(0.4667)]
epoch: 50
train: [0.428627675374349, tensor(0.8333)]
valid: [1.9154888916015624, tensor(0.5333)]

Even though this specific model didn't turn out to be too good, being correct only approximately 60% of the time (there are 3 classes to choose from), you can see how I again used a Callback to forward a metric, shown here as the accuracy.

run.recorder.plot_loss()

In addition to the validation set, I also use the test set to check how good our predictions actually are.

outs = run.predict(learn, learn.data.test_dl)
outs[:10]
array([1, 0, 1, 1, 1, 1, 1, 1, 1, 1])
(outs == y_test).mean()
0.48
run.predict_metrics(learn, learn.data.test_dl, list(dict_y[0].values()))

We're pretty good at identifying class b'1'; however, the model completely fails when it comes to classifying b'2'.

df_test.iloc[0, :-3].plot.line(title=f'time series with class = {df_test.iloc[0, -3]}, predicted to be class {outs[0]}');
df_test.iloc[10, :-3].plot.line(title=f'time series with class = {df_test.iloc[10, -3]}, predicted to be class {outs[10]}');
df_test.iloc[8, :-3].plot.line(title=f'time series with class = {df_test.iloc[8, -3]}, predicted to be class {outs[8]}');
df_test.iloc[57, :-3].plot.line(title=f'time series with class = {df_test.iloc[57, -3]}, predicted to be class {outs[57]}');
df_test.iloc[326, :-3].plot.line(title=f'time series with class = {df_test.iloc[326, -3]}, predicted to be class {outs[326]}');

Another particularly interesting measure is to look inside the model, meaning to have a look at the means and standard deviations of our activations. Ideally the mean should be around 0 and the standard deviation about 1.

def append_stats(i, mod, inp, outp):
    """Forward hook: record the mean and std of a layer's output.

    Bind `i` with ``functools.partial`` before registering, so that
    ``(mod, inp, outp)`` is what the hook receives from the layer.

    Args:
        i: index of the slot in the module-level ``act_means`` /
            ``act_stds`` lists to append to.
        mod: the hooked module (unused).
        inp: the module's input (unused).
        outp: the module's output tensor.
    """
    # detach() replaces the legacy `.data` access; .item() stores plain Python
    # floats, so the lists never hold on to tensors and can be plotted directly.
    acts = outp.detach()
    act_means[i].append(acts.mean().item())
    act_stds[i].append(acts.std().item())
# Three-stage CNN, instrumented with forward hooks on each Conv1d.
output_shapes = [128, 32, 64]
kernels_shape = [28, 14, 5]
strides = [7, 7, 2]

model = Classifier_CNN(nn.Sequential(
    *get_cnn_layers(raw_feat, output_shapes, kernels_shape, strides)
    ), emb_dims, num_classes).to(device)

# An optimizer only updates the parameters it was constructed with. Reusing
# the existing `opt` would keep optimising the previous model and leave this
# new one untrained, so build a fresh optimizer (same lr as the earlier run).
opt = optim.Adam(model.parameters(), lr=2e-2)

# model.raw[k][0] is the Conv1d of the k-th conv stage; record its output stats.
for layer_idx in range(len(output_shapes)):
    model.raw[layer_idx][0].register_forward_hook(
        partial(append_stats, layer_idx))

cbfs = [Recorder]
learn = Learner(model, opt, loss_func, data)
run = Runner(cb_funcs=cbfs)

# One list per hooked layer; append_stats appends to these on every forward pass.
act_means = [[] for _ in range(len(output_shapes))]
act_stds  = [[] for _ in range(len(output_shapes))]

run.fit(50, learn)
for o in act_means: plt.plot(o)
plt.legend(range(len(output_shapes)));
for o in act_stds: plt.plot(o)
plt.legend(range(len(output_shapes)));

This is really interesting: the zigzag pattern in the later layers means we're actually not learning very well here. Let's try another, much simpler architecture and compare the results:

# Baseline: a single convolution whose kernel (719) spans almost the whole
# 720-step series.
output_shapes = [128]
kernels_shape = [719]
strides = [1]

model = Classifier_CNN(nn.Sequential(
    *get_cnn_layers(raw_feat, output_shapes, kernels_shape, strides)
    ), emb_dims, num_classes).to(device)
# The optimizer is built from this model's own parameters.
opt = optim.Adam(model.parameters(), lr=0.001)
cbfs = [Recorder]
learn = Learner(model, opt, loss_func, data)
run = Runner(cb_funcs=cbfs)

# One list for the single hooked layer; append_stats appends to it.
act_means = [[] for _ in range(1)]
act_stds  = [[] for _ in range(1)]

# Hook the first (and only) conv stage.
model.raw[0][0].register_forward_hook(partial(append_stats, 0))

run.fit(50, learn)
# Plot the recorded activation means, then the standard deviations.
for o in act_means: plt.plot(o)
plt.legend(range(1));
for o in act_stds: plt.plot(o)
plt.legend(range(1));

This looks better. However, we can also try a different initialization.

def init_weights(m):
    """Re-initialise a Conv1d layer; intended for ``model.apply(init_weights)``.

    Conv1d weights get Kaiming-uniform initialisation and the bias, when the
    layer has one, is set to 0.01. Every other module type is left untouched.

    Args:
        m: the module to (possibly) re-initialise.
    """
    if not isinstance(m, nn.Conv1d):
        return
    torch.nn.init.kaiming_uniform_(m.weight)
    # A Conv1d created with bias=False has `m.bias is None`; skip it instead
    # of raising AttributeError.
    if m.bias is not None:
        torch.nn.init.constant_(m.bias, 0.01)
# Same three-stage CNN as before, this time with Kaiming-initialised convs.
output_shapes = [128, 32, 64]
kernels_shape = [28, 14, 5]
strides = [7, 7, 2]

model = Classifier_CNN(nn.Sequential(
    *get_cnn_layers(raw_feat, output_shapes, kernels_shape, strides)
    ), emb_dims, num_classes).to(device)

model.apply(init_weights)

# An optimizer only updates the parameters it was constructed with. The
# existing `opt` belongs to the previous (single-conv) model, so reusing it
# would leave this model untrained. Build a fresh one, using the same lr as
# the other runs of this three-stage architecture.
opt = optim.Adam(model.parameters(), lr=2e-2)

# model.raw[k][0] is the Conv1d of the k-th conv stage; record its output stats.
for layer_idx in range(len(output_shapes)):
    model.raw[layer_idx][0].register_forward_hook(
        partial(append_stats, layer_idx))

cbfs = [Recorder]
learn = Learner(model, opt, loss_func, data)
run = Runner(cb_funcs=cbfs)

# One list per hooked layer; append_stats appends to these on every forward pass.
act_means = [[] for _ in range(len(output_shapes))]
act_stds  = [[] for _ in range(len(output_shapes))]

run.fit(50, learn)
for o in act_means: plt.plot(o)
plt.legend(range(len(output_shapes)));
for o in act_stds: plt.plot(o)
plt.legend(range(len(output_shapes)));

This looks way more promising when it comes to the 3 convolutional layers. Admittedly, this still is not perfect.

# Final run: a new instance of the current architecture (output_shapes,
# kernels_shape and strides as last assigned), without activation hooks.
model = Classifier_CNN(nn.Sequential(
    *get_cnn_layers(raw_feat, output_shapes, kernels_shape, strides)
    ), emb_dims, num_classes).to(device)

# Apply the custom Conv1d initialisation to every submodule.
model.apply(init_weights)

# The optimizer is created from this model's own parameters.
# NOTE(review): the LR-finder text earlier recommends 1e-3..1e-2, but 2e-2 is
# used here — confirm this is intended.
opt = optim.Adam(model.parameters(), lr=2e-2)
# Recorder plus an AvgStatsCallback bound to adjusted_accu as its metric.
cbfs = [Recorder, partial(AvgStatsCallback,adjusted_accu)]
learn = Learner(model, opt, loss_func, data)
run = Runner(cb_funcs=cbfs)
# Train for 80 epochs.
run.fit(80, learn)
epoch: 10
train: [1.0968295288085939, tensor(0.3433)]
valid: [1.9732820638020834, tensor(0.3867)]
epoch: 20
train: [0.7535211690266927, tensor(0.6067)]
valid: [1.5353872680664062, tensor(0.3867)]
epoch: 30
train: [0.5347029622395834, tensor(0.7467)]
valid: [1.365126953125, tensor(0.5333)]
epoch: 40
train: [0.4531549580891927, tensor(0.8067)]
valid: [2.0907596842447917, tensor(0.5600)]
epoch: 50
train: [0.36646458943684895, tensor(0.8233)]
valid: [3.187868448893229, tensor(0.4667)]
epoch: 60
train: [0.3460370127360026, tensor(0.8533)]
valid: [3.073258463541667, tensor(0.5333)]
epoch: 70
train: [0.3205308024088542, tensor(0.8467)]
valid: [3.1966796875, tensor(0.4933)]
epoch: 80
train: [0.3039264933268229, tensor(0.8500)]
valid: [3.1939200846354168, tensor(0.5867)]
outs = run.predict(learn, learn.data.test_dl)
(outs == y_test).mean()
0.448

This is the end of the first part of this tutorial. In the next tutorial I will show you what the code behind this looks like and how it works.

Lasse