The main purpose of autoencoders is denoising, i.e. reconstructing clean data from input that may contain noisy, wrong, or missing values.
Consequently, you may want to complete imperfect data with an autoencoder that takes a probabilistic approach to "guess" what the missing or corrupted values should be.
Therefore, you can use a dropout function that erases some of the data during training, so the model learns to reconstruct it and you can test the quality of the reconstruction.
Here is a code example, built with the Keras Sequential API:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.regularizers import l1_l2
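# The class below calls two helpers that are not shown in this excerpt.
# The definitions here are minimal sketches consistent with how they are used:
# the reconstruction loss only scores the observed (non-missing) entries, and
# masked_mae reports the mean absolute error on a boolean mask.
from keras import backend as K

def make_reconstruction_loss(n_features):
    def reconstruction_loss(input_and_mask, y_pred):
        # The network input is [values, missing_mask]; split it back apart.
        X_values = input_and_mask[:, :n_features]
        missing_mask = input_and_mask[:, n_features:]
        observed_mask = 1.0 - missing_mask
        # Mean squared error restricted to the observed entries.
        return K.mean(K.square((X_values - y_pred) * observed_mask), axis=-1)
    return reconstruction_loss

def masked_mae(X_true, X_pred, mask):
    # Mean absolute error computed only where mask is True.
    return np.mean(np.abs(X_true[mask] - X_pred[mask]))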
class Autoencoder:
    def __init__(self, data,
                 recurrent_weight=0.5,
                 optimizer="adam",
                 dropout_probability=0.5,
                 hidden_activation="relu",
                 output_activation="sigmoid",
                 init="glorot_normal",
                 l1_penalty=0,
                 l2_penalty=0):
        self.data = data.copy()
        self.recurrent_weight = recurrent_weight
        self.optimizer = optimizer
        self.dropout_probability = dropout_probability
        self.hidden_activation = hidden_activation
        self.output_activation = output_activation
        self.init = init
        self.l1_penalty = l1_penalty
        self.l2_penalty = l2_penalty

    def _get_hidden_layer_sizes(self):
        n_dims = self.data.shape[1]
        return [
            min(2000, 8 * n_dims),
            min(500, 2 * n_dims),
            int(np.ceil(0.5 * n_dims)),
        ]

    def _create_model(self):
        hidden_layer_sizes = self._get_hidden_layer_sizes()
        first_layer_size = hidden_layer_sizes[0]
        n_dims = self.data.shape[1]
        model = Sequential()
        # The input is the data concatenated with the missing-value mask,
        # hence 2 * n_dims input features.
        model.add(Dense(
            first_layer_size,
            input_dim=2 * n_dims,
            activation=self.hidden_activation,
            kernel_regularizer=l1_l2(self.l1_penalty, self.l2_penalty),
            kernel_initializer=self.init))
        model.add(Dropout(self.dropout_probability))
        for layer_size in hidden_layer_sizes[1:]:
            model.add(Dense(
                layer_size,
                activation=self.hidden_activation,
                kernel_regularizer=l1_l2(self.l1_penalty, self.l2_penalty),
                kernel_initializer=self.init))
            model.add(Dropout(self.dropout_probability))
        model.add(Dense(
            n_dims,
            activation=self.output_activation,
            kernel_regularizer=l1_l2(self.l1_penalty, self.l2_penalty),
            kernel_initializer=self.init))
        loss_function = make_reconstruction_loss(n_dims)
        model.compile(optimizer=self.optimizer, loss=loss_function)
        return model

    def fill(self, missing_mask):
        # Replace missing values with a placeholder before training.
        self.data[missing_mask] = -1

    def _create_missing_mask(self):
        if self.data.dtype != "f" and self.data.dtype != "d":
            self.data = self.data.astype(float)
        return np.isnan(self.data)

    def _train_epoch(self, model, missing_mask, batch_size):
        # Feed the data and the mask side by side, in shuffled mini-batches.
        input_with_mask = np.hstack([self.data, missing_mask])
        n_samples = len(input_with_mask)
        n_batches = int(np.ceil(n_samples / batch_size))
        indices = np.arange(n_samples)
        np.random.shuffle(indices)
        X_shuffled = input_with_mask[indices]
        for batch_idx in range(n_batches):
            batch_start = batch_idx * batch_size
            batch_end = (batch_idx + 1) * batch_size
            batch_data = X_shuffled[batch_start:batch_end, :]
            model.train_on_batch(batch_data, batch_data)
        return model.predict(input_with_mask)

    def train(self, batch_size=256, train_epochs=100):
        missing_mask = self._create_missing_mask()
        self.fill(missing_mask)
        self.model = self._create_model()
        observed_mask = ~missing_mask
        for epoch in range(train_epochs):
            X_pred = self._train_epoch(self.model, missing_mask, batch_size)
            observed_mae = masked_mae(X_true=self.data,
                                      X_pred=X_pred,
                                      mask=observed_mask)
            if epoch % 50 == 0:
                print("observed mae:", observed_mae)
            # Blend the previous guesses with the new predictions for the
            # missing entries, weighted by recurrent_weight.
            old_weight = (1.0 - self.recurrent_weight)
            self.data[missing_mask] *= old_weight
            pred_missing = X_pred[missing_mask]
            self.data[missing_mask] += self.recurrent_weight * pred_missing
        return self.data.copy()
Source: https://curiousily.com/posts/data-imputation-using-autoencoders/
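As a quick, hypothetical usage sketch (not part of the original post): assume a small NumPy array whose values are scaled to [0, 1] (the output layer is a sigmoid) and whose missing entries are marked with np.nan; the Autoencoder class above then returns a completed copy.

import numpy as np

# Hypothetical example: values in [0, 1], np.nan marks the missing entries.
data = np.array([
    [0.1, 0.9, np.nan, 0.4],
    [0.2, np.nan, 0.5, 0.3],
    [0.1, 0.8, 0.6, np.nan],
    [np.nan, 0.7, 0.5, 0.4],
])

imputer = Autoencoder(data, dropout_probability=0.5)
completed = imputer.train(batch_size=2, train_epochs=100)
print(completed)  # same shape as data, with the NaN cells replaced by predictions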
Of course, you need enough data to obtain correct predictions, but autoencoders can work even with a small dataset, as long as that dataset is representative enough of the scenarios you will test against.
You may also want to implement uncertainty ranges to assess how reliable the results are, for example using maximum likelihood estimation.
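As a rough, hypothetical sketch (not from the source), a simpler alternative to a full maximum-likelihood treatment is multiple imputation: because training is stochastic (weight initialization, shuffling, dropout), you can rerun the imputation several times and use the per-cell spread as an uncertainty range. The example below reuses the hypothetical data array from the earlier sketch.

import numpy as np

# Repeat the stochastic imputation and measure the spread of the results.
n_runs = 5
imputations = np.stack([Autoencoder(data).train() for _ in range(n_runs)])

mean_imputed = imputations.mean(axis=0)  # point estimate for each cell
std_imputed = imputations.std(axis=0)    # larger std = less reliable imputed value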