2017/09/09

Comparison of Keras Optimizers

This post compares the optimizers provided by Keras. The data set is MNIST, and the model is the same two convolutional layers plus two fully connected layers used earlier for font training. Training runs for only 10 epochs, so longer training might produce different results.

The comparison results are summarized in the table below.

Optimizer  Decay  LR     Momentum  Epsilon  Nesterov  Rho   Beta_1  Beta_2  Sched. decay  Train loss  Train err.  Val. loss  Val. err.
SGD        0      0.01   0         n/a      False     n/a   n/a     n/a     n/a           0.03682     1.12%       0.04107    1.31%
SGD        0      0.01   0.9       n/a      False     n/a   n/a     n/a     n/a           0.01186     0.40%       0.03228    1.04%
SGD        1e-4   0.01   0         n/a      False     n/a   n/a     n/a     n/a           0.04970     1.50%       0.05264    1.69%
SGD        1e-4   0.01   0.9       n/a      False     n/a   n/a     n/a     n/a           0.00687     0.16%       0.02503    0.91%
SGD        0      0.01   0         n/a      True      n/a   n/a     n/a     n/a           0.03496     1.06%       0.04069    1.30%
SGD        0      0.01   0.9       n/a      True      n/a   n/a     n/a     n/a           0.01203     0.40%       0.03281    0.97%
SGD        1e-4   0.01   0         n/a      True      n/a   n/a     n/a     n/a           0.05123     1.53%       0.05515    1.74%
SGD        1e-4   0.01   0.9       n/a      True      n/a   n/a     n/a     n/a           0.00730     0.20%       0.02470    0.84%
RMSprop    0      0.001  n/a       1e-8     n/a       0.9   n/a     n/a     n/a           0.02124     0.62%       0.06075    1.51%
RMSprop    1e-4   0.001  n/a       1e-8     n/a       0.9   n/a     n/a     n/a           0.00596     0.18%       0.04909    1.11%
Adagrad    0      0.01   n/a       1e-8     n/a       n/a   n/a     n/a     n/a           0.00734     0.17%       0.02752    0.93%
Adagrad    1e-4   0.01   n/a       1e-8     n/a       n/a   n/a     n/a     n/a           0.01216     0.31%       0.02688    0.98%
Adadelta   0      1.0    n/a       1e-8     n/a       0.95  n/a     n/a     n/a           0.01282     0.41%       0.03215    1.10%
Adadelta   1e-4   1.0    n/a       1e-8     n/a       0.95  n/a     n/a     n/a           0.01258     0.33%       0.02489    0.88%
Adam       0      0.001  n/a       1e-8     n/a       n/a   0.9     0.999   n/a           0.02548     0.65%       0.06531    1.35%
Adam       1e-4   0.001  n/a       1e-8     n/a       n/a   0.9     0.999   n/a           0.00425     0.14%       0.04176    0.96%
Adamax     0      0.002  n/a       1e-8     n/a       n/a   0.9     0.999   n/a           0.00696     0.21%       0.02920    0.91%
Adamax     1e-4   0.002  n/a       1e-8     n/a       n/a   0.9     0.999   n/a           0.00435     0.09%       0.02528    0.88%
Nadam      n/a    0.002  n/a       1e-8     n/a       n/a   0.9     0.999   0.004         4.64530     28.83%      4.52680    28.09%

(LR = initial learning rate; Sched. decay = Nadam's schedule_decay.)

Tuning the parameters of each optimizer might yield better results. In this run, the best model at epoch 10 was SGD with decay=1e-4, momentum=0.9, and nesterov=True, with a validation error of 0.84%. Adadelta and Adamax with decay set to 1e-4 came next, both with an error of 0.88%.

The change in validation error over the epochs is shown in the figure below.

Looking at how the error falls, Adamax with decay=1e-4 appears to be the best in terms of convergence speed. For reasons that are unclear, Nadam shows the strange behavior of its error increasing as training proceeds.
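The per-epoch validation error can be recovered from the history.log file written by the evaluation script below. As a minimal sketch (assuming matplotlib is available; the parsing simply mirrors the format written by train_by), the curves could be re-plotted along these lines:

# Sketch: plot per-epoch validation error (%) for each run in history.log.
# Assumes the format written by train_by(): blocks separated by blank lines,
# comment lines starting with '#', then "epoch train-loss train-acc val-loss val-acc".
import matplotlib.pyplot as plt

runs, current = [], []
with open("history.log") as f:
    for line in f:
        line = line.strip()
        if line.startswith("#"):
            continue          # skip timestamp/optimizer/header comments
        if not line:
            if current:       # blank line ends a run's block
                runs.append(current)
                current = []
            continue
        cols = line.split()
        current.append(float(cols[4]))  # val-acc column
if current:
    runs.append(current)

for i, val_acc in enumerate(runs):
    # Drop the final extra line (re-evaluation of the last model, not a fit epoch).
    errors = [100.0 * (1.0 - a) for a in val_acc[:-1]]
    plt.plot(range(1, len(errors) + 1), errors, label="run {}".format(i))
plt.xlabel("Epoch")
plt.ylabel("Validation error (%)")
plt.legend()
plt.show()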

The code used for the evaluation is as follows.

# This script trains a neural network model using MNIST.
import gc, keras, os, re, datetime
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation, Reshape
from keras.layers.convolutional import Conv2D
from keras.layers.advanced_activations import LeakyReLU
from keras.layers import Flatten, Dropout
from keras.datasets import mnist

TRAIN_EPOCH = 10

# Image size of MNIST
IMAGE_WIDTH = 28
IMAGE_HEIGHT = 28

# Define a discriminator model for numbers
def num_discriminator_model():
    model = Sequential()
    model.add(Conv2D(64, (5, 5), strides=(2, 2), padding='same',
                     input_shape=(IMAGE_WIDTH, IMAGE_HEIGHT, 1), 
                     data_format='channels_last'))
    model.add(LeakyReLU(0.2))
    model.add(Conv2D(128, (5, 5), strides=(2, 2), data_format='channels_last'))
    model.add(LeakyReLU(0.2))
    model.add(Flatten())
    model.add(Dense(256))
    model.add(LeakyReLU(0.2))
    model.add(Dropout(0.5))
    model.add(Dense(10))
    model.add(Activation('softmax'))
    model.summary()
    return model

def format_x(x):
    x = (x.astype(np.float32) - 127.5)/127.5 # [0,255] --> [-1,1]
    x = x.reshape((x.shape[0], x.shape[1], x.shape[2], 1))
    return x

def train_by(model, opt):
    model.compile(optimizer=opt,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    hist = model.fit(x_train, y_train, validation_data=(x_val, y_val),
                     epochs=TRAIN_EPOCH, batch_size=32).history

    # Evaluate the last model by train and validation data
    score_train = model.evaluate(x_train, y_train, batch_size=32)
    score_val = model.evaluate(x_val, y_val, batch_size=32)

    # Write training history into a file
    with open("history.log", mode="a") as f:
        d = datetime.datetime.today()
        f.write("#"+d.strftime("%Y-%m-%d %H:%M:%S")+"\n")
        f.write("#Optimizer :"+str(type(opt))+str(opt.get_config())+"\n")
        f.write("#Data in the last line are calculated by the last model"
                " and not calculated in model.fit()\n")
        f.write("#epoch train-loss train-acc val-loss val-acc\n")
        for v in range(0, TRAIN_EPOCH):
            f.write("{0} {1:10.6f} {2:10.6f} {3:10.6f} {4:10.6f}\n"
                    .format(v, hist["loss"][v], hist["acc"][v],
                            hist["val_loss"][v], hist["val_acc"][v]))
        f.write("{0} {1:10.6f} {2:10.6f} {3:10.6f} {4:10.6f}\n"
                .format(v+1, score_train[0], score_train[1],
                        score_val[0], score_val[1]))
        f.write("\n\n")

if __name__ == '__main__':
    # Load MNIST data
    (x_train, y_train), (x_val, y_val) = mnist.load_data()
    x_train = format_x(x_train)
    x_val = format_x(x_val)

    # Encode labels into 1-hot vectors
    y_train = keras.utils.to_categorical(y_train, num_classes=10)
    y_val = keras.utils.to_categorical(y_val, num_classes=10)

    # Make a model
    model = num_discriminator_model()

    # Save initial weights which will be used to reset weights
    model.save_weights("initial_weights.hdf5") # 

    # Make a list of optimizers
    opt = []
    opt.append(keras.optimizers.SGD(lr=0.01, momentum=0.0, decay=0.0, nesterov=False))
    opt.append(keras.optimizers.SGD(lr=0.01, momentum=0.9, decay=0.0, nesterov=False))
    opt.append(keras.optimizers.SGD(lr=0.01, momentum=0,   decay=1e-4, nesterov=False))
    opt.append(keras.optimizers.SGD(lr=0.01, momentum=0.9, decay=1e-4, nesterov=False))
    opt.append(keras.optimizers.SGD(lr=0.01, momentum=0.0, decay=0.0, nesterov=True))
    opt.append(keras.optimizers.SGD(lr=0.01, momentum=0.9, decay=0.0, nesterov=True))
    opt.append(keras.optimizers.SGD(lr=0.01, momentum=0,   decay=1e-4, nesterov=True))
    opt.append(keras.optimizers.SGD(lr=0.01, momentum=0.9, decay=1e-4, nesterov=True))

    opt.append(keras.optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0))
    opt.append(keras.optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=1e-4))

    opt.append(keras.optimizers.Adagrad(lr=0.01, epsilon=1e-08, decay=0.0))
    opt.append(keras.optimizers.Adagrad(lr=0.01, epsilon=1e-08, decay=1e-4))

    opt.append(keras.optimizers.Adadelta(lr=1.0, rho=0.95, epsilon=1e-08, decay=0.0))
    opt.append(keras.optimizers.Adadelta(lr=1.0, rho=0.95, epsilon=1e-08, decay=1e-4))

    opt.append(keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999,
                                     epsilon=1e-08, decay=0.0))
    opt.append(keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999,
                                     epsilon=1e-08, decay=1e-4))

    opt.append(keras.optimizers.Adamax(lr=0.002, beta_1=0.9, beta_2=0.999,
                                       epsilon=1e-08, decay=0.0))
    opt.append(keras.optimizers.Adamax(lr=0.002, beta_1=0.9, beta_2=0.999,
                                       epsilon=1e-08, decay=1e-4))

    opt.append(keras.optimizers.Nadam(lr=0.002, beta_1=0.9, beta_2=0.999,
                                      epsilon=1e-08, schedule_decay=0.004))
    # Train for each optimizer
    for o in opt:
        model.load_weights("initial_weights.hdf5") # Initialize weights
        train_by(model, o)
    gc.collect() # To suppress error messages of TensorFlow
