diff --git a/TODO.md b/TODO.md
index c1b2ceb..f90f5c1 100644
--- a/TODO.md
+++ b/TODO.md
@@ -18,7 +18,6 @@ https://community.st.com/s/topic/0TO0X0000003iUqWAI/stm32-machine-learning-ai
 Experiment
 
 - Test 16kHz with 30 mels
-- Use multi-instance learning. Get bigger batches and improve GPU utilization
 - Do hyperparameter optimization per model
 - Double-check reproduction of SB-CNN results
 - Improve Data Augmentation
diff --git a/experiments/ldcnn20k60.yaml b/experiments/ldcnn20k60.yaml
index 149571b..72a8c44 100644
--- a/experiments/ldcnn20k60.yaml
+++ b/experiments/ldcnn20k60.yaml
@@ -7,11 +7,11 @@ hop_length: 512
 augmentations: 12
 augment: 1
 frames: 31
-batch: 400
+batch: 30
 epochs: 100
 train_samples: 30000
 val_samples: 5000
-learning_rate: 0.005
+learning_rate: 0.001
 voting: 'mean'
 voting_overlap: 0.0
 nesterov_momentum: 0.9
diff --git a/microesc/features.py b/microesc/features.py
index 89d3f36..8642f5c 100644
--- a/microesc/features.py
+++ b/microesc/features.py
@@ -59,18 +59,6 @@ def compute_mels(y, settings):
     return mels
 
 
-def sample_windows(length, frame_samples, window_frames, overlap=0.5, start=0):
-    """Split @samples into a number of windows of samples
-    with length @frame_samples * @window_frames
-    """
-
-    ws = frame_samples * window_frames
-    while start < length:
-        end = min(start + ws, length)
-        yield start, end
-        start += (ws * (1-overlap))
-
-
 def features_url(settings, base=default_base_url):
     id = settings_id(settings)
     ext = '.zip'
@@ -106,36 +94,17 @@ def download_progress(count, blocksize, totalsize):
     return feature_dir
 
 
-def load_sample(sample, settings, feature_dir, window_frames,
-                start_time=None, augment=None, normalize='meanstd'):
+def extract_window(inmels, settings, start_time, normalize):
+
     n_mels = settings['n_mels']
     sample_rate = settings['samplerate']
     hop_length = settings['hop_length']
+    window_frames = settings['frames']
 
-    aug = None
-    if augment and settings['augmentations'] > 0:
-        aug = numpy.random.randint(-1, settings['augmentations'])
-        if aug == -1:
-            aug = None
-
-    # Load precomputed features
-    folder = os.path.join(feature_dir, settings_id(settings))
-    path = feature_path(sample, out_folder=folder, augmentation=aug)
-    mels = numpy.load(path)['arr_0']
-    assert mels.shape[0] == n_mels, mels.shape
-
-    if start_time is None:
-        # Sample a window in time randomly
-        min_start = max(0, mels.shape[1]-window_frames)
-        if min_start == 0:
-            start = 0
-        else:
-            start = numpy.random.randint(0, min_start)
-    else:
-        start = int(start_time * (sample_rate / hop_length))
-
+    start = int(start_time * (sample_rate / hop_length))
     end = start + window_frames
-    mels = mels[:, start:end]
+    #print('s', start, end, inmels.shape[1], end-start/inmels.shape[1])
+    mels = inmels[:, start:end]
 
     # Normalize the window
     if mels.shape[1] > 0:
@@ -149,40 +118,60 @@
         else:
             mels = librosa.core.power_to_db(mels, top_db=80, ref=0.0)
     else:
-        print('Warning: Sample {} with start {} has 0 length'.format(sample, start_time))
+        print('Warning: Sample {} with start {} has 0 length'.format(inmels.shape, start_time))
 
     # Pad to standard size
-    if window_frames is None:
-        padded = mels
-    else:
-        padded = numpy.full((n_mels, window_frames), 0.0, dtype=float)
-        inp = mels[:, 0:min(window_frames, mels.shape[1])]
-        padded[:, 0:inp.shape[1]] = inp
+    padded = numpy.full((n_mels, window_frames), 0.0, dtype=float)
+    inp = mels[:, 0:min(window_frames, mels.shape[1])]
+    padded[:, 0:inp.shape[1]] = inp
 
     # add channel dimension
     data = numpy.expand_dims(padded, -1)
     return data
 
 
-Sample = collections.namedtuple('Sample', 'start end fold slice_file_name')
+def load_sample(sample, exsettings, feature_dir,
+                overlap=0, start=0, augmentation=None, normalize='meanstd'):
 
-def load_windows(sample, settings, loader, overlap, start=0):
-    sample_rate = settings['samplerate']
-    frame_samples = settings['hop_length']
-    window_frames = settings['frames']
+    n_mels = exsettings['n_mels']
+    f_settings = settings(exsettings)
 
-    windows = []
+    # Load precomputed features
+    folder = os.path.join(feature_dir, settings_id(f_settings))
+    path = feature_path(sample, out_folder=folder, augmentation=augmentation)
+    mels = numpy.load(path)['arr_0']
+    assert mels.shape[0] == n_mels, mels.shape
 
-    duration = sample.end - sample.start
-    length = int(sample_rate * duration)
+    sample_rate = exsettings['samplerate']
+    frame_samples = exsettings['hop_length']
+    window_frames = exsettings['frames']
 
-    for win in sample_windows(length, frame_samples, window_frames, overlap=overlap, start=start):
-        chunk = Sample(start=win[0]/sample_rate,
-                       end=win[1]/sample_rate,
-                       fold=sample.fold,
-                       slice_file_name=sample.slice_file_name)
-        d = loader(chunk)
+    # augmentations may change the sample duration
+    duration = mels.shape[1] * frame_samples/sample_rate
+    duration -= start
+
+    # cut into windows, and normalized
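+    # NOTE: the 4.0 below is presumably the maximum clip duration in seconds
+    # (UrbanSound8k excerpts are at most 4 s), which bounds windows per clip.
+    # Assuming the ~20 kHz samplerate implied by the experiment name, with
+    # hop_length=512 and frames=31 a window is ~0.79 s, so at overlap=0
+    # n_windows = ceil(4.0/0.79) = 6, matching the windows=6 used in train.py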
+    window_length = ((frame_samples * window_frames) / sample_rate)
+    hop_length = (1-overlap) * window_length
+    n_windows = int(numpy.ceil(4.0 / hop_length))
+
+    starts = [ start + (i*hop_length) for i in range(0, n_windows) ]
+    if duration < window_length:
+        # make sure short files have at least one window
+        starts = [ starts[0] ]
+    else:
+        starts = [ s for s in starts if s < (duration-(hop_length/2)) ]
+
+    windows = []
+    for s in starts:
+        d = extract_window(mels, exsettings, s, normalize=normalize)
         windows.append(d)
+    d = numpy.stack(windows)
+
+    # single numpy array, zero-padded
+    s = (n_windows, n_mels, window_frames, 1)
+    windows = numpy.zeros(shape=s)
+    windows[:d.shape[0], :, :, :] = d
 
     return windows
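Note: load_sample now returns all windows of a clip as one zero-padded array, instead of a single window. A rough usage sketch; the settings values and feature_dir path here are illustrative assumptions, not from this patch, and `sample` is a row of the UrbanSound8k index as elsewhere in the codebase:

    exsettings = dict(samplerate=20000, n_mels=60, hop_length=512,
                      frames=31, augmentations=12)   # assumed values
    windows = load_sample(sample, exsettings, feature_dir='./features',
                          overlap=0.0, start=0.0, augmentation=None)
    # windows.shape == (n_windows, n_mels, frames, 1)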
diff --git a/microesc/train.py b/microesc/train.py
index 6c74801..03d903a 100644
--- a/microesc/train.py
+++ b/microesc/train.py
@@ -19,25 +19,75 @@ from . import settings as Settings
 
-def dataframe_generator(X, Y, loader, batchsize=10, n_classes=10):
-    """
-    Keras generator for lazy-loading data based on a pandas.DataFrame
+
+class Generator(keras.utils.Sequence):
+
+    def __init__(self, x_set, y_set, feature_dir, settings, n_classes=10, augment=False):
+        self.x, self.y = x_set, y_set
+        self.batch_size = settings['batch']
+        self.n_classes = n_classes
+        self.augment = augment
+        self.n_augmentations = settings['augmentations'] if self.augment else 1
+        self.feature_dir = feature_dir
+        self.feature_settings = features.settings(settings)
+        self.settings = settings
+
+    def _load(self, sample, augmentation=None):
+
+        # Time-shift augmentation, randomize starts
+        sample_rate = self.settings['samplerate']
+        frame_samples = self.settings['hop_length']
+        window_frames = self.settings['frames']
+        dur = sample.end - sample.start
+        window_length = ((frame_samples * window_frames) / sample_rate)
+
+        if self.augment:
+            start = numpy.random.random() * (min(window_length, dur)/2)
+        else:
+            start = 0
+
+        windows = features.load_sample(sample,
+                        self.settings,
+                        feature_dir=self.feature_dir,
+                        augmentation=augmentation,
+                        overlap=self.settings['voting_overlap'],
+                        start=start)
+
+        #no = numpy.random.randint(0, 100)
+        #name = sample.slice_file_name.replace('.wav', '.npy')
+        #numpy.save(f'features/{name}', windows)
+
+        return windows
+
+    def __len__(self):
+        # FIXME: make sure to include all data, not using floor
+        sample_batches = int(numpy.floor(len(self.x) / float(self.batch_size)))
+        augmented = sample_batches * self.n_augmentations
+        return augmented
 
-    X: data column(s)
-    Y: target column
-    loader: function will be passed batches of X to load actual training data
-    """
-
-    assert len(X) == len(Y), 'X and Y must be equal length'
+    def __getitem__(self, idx):
+        # take augmentation into account
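+        # (e.g. with augmentations=12, idx 0..11 all select the first
+        # batch of samples, with aug_idx 0..11; idx 12..23 the second, etc.)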
+        aug_idx = idx % self.n_augmentations
+        sample_idx = idx // self.n_augmentations
+
+        # select data
+        from_idx = sample_idx * self.batch_size
+        to_idx = (sample_idx + 1) * self.batch_size
+        X = self.x.iloc[from_idx:to_idx]
+        y = self.y.iloc[from_idx:to_idx]
+
+        assert X.shape[0] == self.batch_size, (X.shape, self.batch_size, from_idx)
+        assert y.shape[0] == self.batch_size, (y.shape)
 
-    while True:
-        idx = numpy.random.choice(len(X), size=batchsize, replace=False)
-        rows = X.iloc[idx, :].iterrows()
-        data = [ loader(d) for _, d in rows ]
-        y = Y.iloc[idx]
-        y = keras.utils.to_categorical(y, num_classes=n_classes)
-        batch = (numpy.array(data), numpy.array(y))
-        yield batch
+        #print('xx', X.shape, y.shape)
+        if not self.augment:
+            aug_idx = None
+
+        data = [ self._load(d, augmentation=aug_idx) for _, d in X.iterrows() ]
+        y = keras.utils.to_categorical(y, num_classes=self.n_classes)
+        batch = (numpy.stack(data), numpy.array(y))
+        #print('x', batch[0].shape)
+        return batch
 
 
 class LogCallback(keras.callbacks.Callback):
@@ -72,33 +122,46 @@ def write_entry(self, epoch, data):
 
     def on_epoch_end(self, epoch, logs):
         logs = logs.copy()
-        more = self.score() # uses current model
+        more = self.score(epoch, logs) # uses current model
         for k, v in more.items():
             logs[k] = v
 
         self.write_entry(epoch, logs)
 
 
+def build_multi_instance(base, windows=6, bands=32, frames=72, channels=1):
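+    # Wraps the single-window model @base for multi-instance learning:
+    # TimeDistributed applies @base to each of the @windows instances,
+    # then GlobalAveragePooling1D averages the per-window predictions,
+    # so mean voting over windows happens inside the model itself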
+    from keras import Model
+    from keras.layers import Input, TimeDistributed, GlobalAveragePooling1D
+
+    input_shape = (windows, bands, frames, channels)
+
+    input = Input(shape=input_shape)
+    x = input # BatchNormalization()(input)
+    x = TimeDistributed(base)(x)
+    x = GlobalAveragePooling1D()(x)
+    model = Model(input, x)
+    return model
+
 
-def train_model(out_dir, train, val, model,
-                loader, val_loader, settings, seed=1):
+def train_model(out_dir, fold, builder,
+                feature_dir, settings, name):
     """Train a single model"""
 
     frame_samples = settings['hop_length']
-    train_samples = settings['train_samples']
     window_frames = settings['frames']
-    val_samples = settings['val_samples']
     epochs = settings['epochs']
     batch_size = settings['batch']
     learning_rate = settings.get('learning_rate', 0.01)
 
-    assert len(train) > len(val) * 5, 'training data should be much larger than validation'
+    def generator(data, augment):
+        return Generator(data, data.classID, feature_dir=feature_dir,
+                         settings=settings, augment=augment)
 
-    def top3(y_true, y_pred):
-        return keras.metrics.top_k_categorical_accuracy(y_true, y_pred, k=3)
+    model = builder()
+    model = build_multi_instance(model, bands=settings['n_mels'],
+                                 frames=window_frames, windows=6)
+    model.summary()
 
     optimizer = keras.optimizers.SGD(lr=learning_rate, momentum=settings['nesterov_momentum'], nesterov=True)
+
     model.compile(loss='categorical_crossentropy',
                   optimizer=optimizer,
                   metrics=['accuracy'])
@@ -107,30 +170,30 @@ def top3(y_true, y_pred):
 
     checkpoint = keras.callbacks.ModelCheckpoint(model_path, monitor='val_acc', mode='max',
                                                  period=1, verbose=1, save_best_only=False)
 
-    def voted_score():
-        y_pred = features.predict_voted(settings, model, val,
-                    loader=val_loader, method=settings['voting'], overlap=settings['voting_overlap'])
-        class_pred = numpy.argmax(y_pred, axis=1)
-        acc = sklearn.metrics.accuracy_score(val.classID, class_pred)
+    tensorboard = keras.callbacks.TensorBoard(log_dir=f'./logs/{name}',
+                    histogram_freq=0, update_freq=1000,
+                    write_graph=True, write_images=False)
+
+    def voted_score(epoch, logs):
         d = {
-            'voted_val_acc': acc,
+            'voted_val_acc': logs['val_acc'], # XXX: legacy compat
         }
-        for k, v in d.items():
-            print("{}: {:.4f}".format(k, v))
         return d
+
     log_path = os.path.join(out_dir, 'train.csv')
     log = LogCallback(log_path, voted_score)
+    train_gen = generator(fold[0], augment=True)
+    val_gen = generator(fold[1], augment=False)
 
-    train_gen = dataframe_generator(train, train.classID, loader=loader, batchsize=batch_size)
-    val_gen = dataframe_generator(val, val.classID, loader=val_loader, batchsize=batch_size)
-
-    callbacks_list = [checkpoint, log]
-    hist = model.fit_generator(train_gen, validation_data=val_gen,
-                               steps_per_epoch=math.ceil(train_samples/batch_size),
-                               validation_steps=math.ceil(val_samples/batch_size),
+    callbacks_list = [checkpoint, log, tensorboard]
+    hist = model.fit_generator(train_gen,
+                               validation_data=val_gen,
                                callbacks=callbacks_list,
-                               epochs=epochs, verbose=1)
+                               epochs=epochs,
+                               shuffle=True,
+                               verbose=1,
+                               workers=1)
 
     df = history_dataframe(hist)
     history_path = os.path.join(out_dir, 'history.csv')
@@ -230,14 +293,7 @@ def main():
     features.maybe_download(feature_settings, feature_dir)
 
     data = urbansound8k.load_dataset()
-    train_data, val_data = load_training_data(data, fold)
-
-    def load(sample, validation):
-        augment = not validation and train_settings['augment'] != 0
-        d = features.load_sample(sample, feature_settings, feature_dir=feature_dir,
-                                 window_frames=model_settings['frames'],
-                                 augment=augment, normalize=exsettings['normalize'])
-        return d
+    fold_data = load_training_data(data, fold)
 
     def build_model():
         m = models.build(exsettings)
@@ -261,11 +317,11 @@ def build_model():
 
     print('Training model', name)
    print('Settings', json.dumps(exsettings))
-    h = train_model(output_dir, train_data, val_data,
-                    model=m,
-                    loader=functools.partial(load, validation=False),
-                    val_loader=functools.partial(load, validation=True),
-                    settings=exsettings)
+
+    h = train_model(output_dir, fold_data,
+                    builder=build_model,
+                    feature_dir=feature_dir,
+                    settings=exsettings, name=name)
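Taken together, batches now carry an extra leading window axis. A minimal shape-check sketch for build_multi_instance, assuming n_mels=60 (from the experiment name), the batch size of 30 from the yaml, and a trivial stand-in for the base model normally produced by models.build:

    import numpy
    from keras.models import Sequential
    from keras.layers import Flatten, Dense

    base = Sequential([
        Flatten(input_shape=(60, 31, 1)),       # stand-in base model
        Dense(10, activation='softmax'),
    ])
    model = build_multi_instance(base, windows=6, bands=60, frames=31)
    X = numpy.zeros((30, 6, 60, 31, 1))         # (batch, windows, mels, frames, channels)
    assert model.predict(X).shape == (30, 10)   # mean-voted class probabilities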