This post compares the optimizers available in Keras. The data set is MNIST, and the model is the same 2-convolutional-layer CNN + 2 fully connected layers that was used earlier for font training. Training runs for only 10 epochs, so a longer run might give different results.
The comparison results are summarized in the table below.

| Optimizer | Decay | Learning rate | Momentum | Epsilon | Nesterov | Rho | Beta_1 | Beta_2 | Schedule decay | Train loss | Train error | Validation loss | Validation error |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
SGD | 0 | 0.01 | 0 | n/a | False | n/a | n/a | n/a | n/a | 0.03682 | 1.12% | 0.04107 | 1.31% | |
SGD | 0 | 0.01 | 0.9 | n/a | False | n/a | n/a | n/a | n/a | 0.01186 | 0.40% | 0.03228 | 1.04% | |
SGD | 1e-4 | 0.01 | 0 | n/a | False | n/a | n/a | n/a | n/a | 0.04970 | 1.50% | 0.05264 | 1.69% | |
SGD | 1e-4 | 0.01 | 0.9 | n/a | False | n/a | n/a | n/a | n/a | 0.00687 | 0.16% | 0.02503 | 0.91% | |
SGD | 0 | 0.01 | 0 | n/a | True | n/a | n/a | n/a | n/a | 0.03496 | 1.06% | 0.04069 | 1.30% | |
SGD | 0 | 0.01 | 0.9 | n/a | True | n/a | n/a | n/a | n/a | 0.01203 | 0.40% | 0.03281 | 0.97% | |
SGD | 1e-4 | 0.01 | 0 | n/a | True | n/a | n/a | n/a | n/a | 0.05123 | 1.53% | 0.05515 | 1.74% | |
SGD | 1e-4 | 0.01 | 0.9 | n/a | True | n/a | n/a | n/a | n/a | 0.00730 | 0.20% | 0.02470 | 0.84% | |
RMSprop | 0 | 0.001 | n/a | 1e-8 | n/a | 0.9 | n/a | n/a | n/a | 0.02124 | 0.62% | 0.06075 | 1.51% | |
RMSprop | 1e-4 | 0.001 | n/a | 1e-8 | n/a | 0.9 | n/a | n/a | n/a | 0.00596 | 0.18% | 0.04909 | 1.11% | |
Adagrad | 0 | 0.01 | n/a | 1e-8 | n/a | n/a | n/a | n/a | n/a | 0.00734 | 0.17% | 0.02752 | 0.93% | |
Adagrad | 1e-4 | 0.01 | n/a | 1e-8 | n/a | n/a | n/a | n/a | n/a | 0.01216 | 0.31% | 0.02688 | 0.98% | |
Adadelta | 0 | 1 | n/a | 1e-8 | n/a | 0.95 | n/a | n/a | n/a | 0.01282 | 0.41% | 0.03215 | 1.10% | |
Adadelta | 1e-4 | 1 | n/a | 1e-8 | n/a | 0.95 | n/a | n/a | n/a | 0.01258 | 0.33% | 0.02489 | 0.88% | |
Adam | 0 | 0.001 | n/a | 1e-8 | n/a | n/a | 0.9 | 0.999 | n/a | 0.02548 | 0.65% | 0.06531 | 1.35% | |
Adam | 1e-4 | 0.001 | n/a | 1e-8 | n/a | n/a | 0.9 | 0.999 | n/a | 0.00425 | 0.14% | 0.04176 | 0.96% | |
Adamax | 0 | 0.002 | n/a | 1e-8 | n/a | n/a | 0.9 | 0.999 | n/a | 0.00696 | 0.21% | 0.02920 | 0.91% | |
Adamax | 1e-4 | 0.002 | n/a | 1e-8 | n/a | n/a | 0.9 | 0.999 | n/a | 0.00435 | 0.09% | 0.02528 | 0.88% | |
Nadam | n/a | 0.002 | n/a | 1e-8 | n/a | n/a | 0.9 | 0.999 | 0.004 | 4.64530 | 28.83% | 4.52680 | 28.09% |
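
One note on the Decay column: in the Keras version assumed here, the decay argument implements a time-based schedule, roughly lr_t = lr / (1 + decay * iterations), where iterations counts mini-batch updates rather than epochs. The short sketch below (an illustration under that assumption, not part of the training script) shows how much decay=1e-4 shrinks the effective learning rate of SGD over 10 epochs of MNIST with batch size 32.

# Illustration only (assumed rule): time-based decay lr_t = lr / (1 + decay * iterations)
lr = 0.01                        # initial SGD learning rate from the table
decay = 1e-4                     # decay value compared in the table
steps_per_epoch = 60000 // 32    # MNIST training images / batch size
for epoch in (1, 5, 10):
    iterations = epoch * steps_per_epoch
    print("after epoch %2d: effective lr = %.5f" % (epoch, lr / (1.0 + decay * iterations)))

After 10 epochs (18,750 updates) the effective learning rate has dropped to roughly a third of its initial value, which may explain why the decay=1e-4 rows can end up at noticeably different loss values than the decay=0 rows even in such a short run.

The full script used for the comparison is shown below.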
# This script trains a neural network model using MNIST.
import gc, keras, os, re, datetime
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation, Reshape
from keras.layers.convolutional import Conv2D
from keras.layers.advanced_activations import LeakyReLU
from keras.layers import Flatten, Dropout
from keras.datasets import mnist
TRAIN_EPOCH = 10
# Image size of MNIST
IMAGE_WIDTH = 28
IMAGE_HEIGHT = 28
# Define a discriminator model for numbers
def num_discriminator_model():
    model = Sequential()
    model.add(Conv2D(64, (5, 5), strides=(2, 2), padding='same',
                     input_shape=(IMAGE_WIDTH, IMAGE_HEIGHT, 1),
                     data_format='channels_last'))
    model.add(LeakyReLU(0.2))
    model.add(Conv2D(128, (5, 5), strides=(2, 2), data_format='channels_last'))
    model.add(LeakyReLU(0.2))
    model.add(Flatten())
    model.add(Dense(256))
    model.add(LeakyReLU(0.2))
    model.add(Dropout(0.5))
    model.add(Dense(10))
    model.add(Activation('softmax'))
    print(model.summary())
    return model
def format_x(x):
    x = (x.astype(np.float32) - 127.5) / 127.5  # [0,255] --> [-1,1]
    x = x.reshape((x.shape[0], x.shape[1], x.shape[2], 1))
    return x
def train_by(model, opt):
    model.compile(optimizer=opt,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    hist = model.fit(x_train, y_train, validation_data=(x_val, y_val),
                     epochs=TRAIN_EPOCH, batch_size=32).history
    # Evaluate the last model by train and validation data
    score_train = model.evaluate(x_train, y_train, batch_size=32)
    score_val = model.evaluate(x_val, y_val, batch_size=32)
    # Write training history into a file
    with open("history.log", mode="a") as f:
        d = datetime.datetime.today()
        f.write("#" + d.strftime("%Y-%m-%d %H:%M:%S") + "\n")
        f.write("#Optimizer :" + str(type(opt)) + str(opt.get_config()) + "\n")
        f.write("#Data in the last line are calculated by the last model"
                " and not calculated in model.fit()\n")
        f.write("#epoch train-loss train-acc val-loss val-acc\n")
        for v in range(0, TRAIN_EPOCH):
            f.write("{0} {1:10.6f} {2:10.6f} {3:10.6f} {4:10.6f}\n"
                    .format(v, hist["loss"][v], hist["acc"][v],
                            hist["val_loss"][v], hist["val_acc"][v]))
        f.write("{0} {1:10.6f} {2:10.6f} {3:10.6f} {4:10.6f}\n"
                .format(v + 1, score_train[0], score_train[1],
                        score_val[0], score_val[1]))
        f.write("\n\n")
if __name__ == '__main__':
    # Load MNIST data
    (x_train, y_train), (x_val, y_val) = mnist.load_data()
    x_train = format_x(x_train)
    x_val = format_x(x_val)
    # Encode labels into 1-hot vectors
    y_train = keras.utils.to_categorical(y_train, num_classes=10)
    y_val = keras.utils.to_categorical(y_val, num_classes=10)
    # Make a model
    model = num_discriminator_model()
    # Save initial weights which will be used to reset weights
    model.save_weights("initial_weights.hdf5")
    # Make a list of optimizers
    opt = []
    opt.append(keras.optimizers.SGD(lr=0.01, momentum=0.0, decay=0.0, nesterov=False))
    opt.append(keras.optimizers.SGD(lr=0.01, momentum=0.9, decay=0.0, nesterov=False))
    opt.append(keras.optimizers.SGD(lr=0.01, momentum=0.0, decay=1e-4, nesterov=False))
    opt.append(keras.optimizers.SGD(lr=0.01, momentum=0.9, decay=1e-4, nesterov=False))
    opt.append(keras.optimizers.SGD(lr=0.01, momentum=0.0, decay=0.0, nesterov=True))
    opt.append(keras.optimizers.SGD(lr=0.01, momentum=0.9, decay=0.0, nesterov=True))
    opt.append(keras.optimizers.SGD(lr=0.01, momentum=0.0, decay=1e-4, nesterov=True))
    opt.append(keras.optimizers.SGD(lr=0.01, momentum=0.9, decay=1e-4, nesterov=True))
    opt.append(keras.optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0))
    opt.append(keras.optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=1e-4))
    opt.append(keras.optimizers.Adagrad(lr=0.01, epsilon=1e-08, decay=0.0))
    opt.append(keras.optimizers.Adagrad(lr=0.01, epsilon=1e-08, decay=1e-4))
    opt.append(keras.optimizers.Adadelta(lr=1.0, rho=0.95, epsilon=1e-08, decay=0.0))
    opt.append(keras.optimizers.Adadelta(lr=1.0, rho=0.95, epsilon=1e-08, decay=1e-4))
    opt.append(keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999,
                                     epsilon=1e-08, decay=0.0))
    opt.append(keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999,
                                     epsilon=1e-08, decay=1e-4))
    opt.append(keras.optimizers.Adamax(lr=0.002, beta_1=0.9, beta_2=0.999,
                                       epsilon=1e-08, decay=0.0))
    opt.append(keras.optimizers.Adamax(lr=0.002, beta_1=0.9, beta_2=0.999,
                                       epsilon=1e-08, decay=1e-4))
    opt.append(keras.optimizers.Nadam(lr=0.002, beta_1=0.9, beta_2=0.999,
                                      epsilon=1e-08, schedule_decay=0.004))
    # Train for each optimizer
    for o in opt:
        model.load_weights("initial_weights.hdf5")  # Initialize weights
        train_by(model, o)
        gc.collect()  # To suppress error messages of TensorFlow
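
Since train_by() appends each run to history.log, the file ends up containing one block per optimizer: a few '#' comment lines, one row per epoch of "epoch train-loss train-acc val-loss val-acc", a final row re-evaluated with the last model, and blank-line separators. The following is a minimal sketch of how those blocks could be read back for comparison; parse_history_log is a hypothetical helper, not part of the script above.

# Sketch (assumption): parse the blocks written by train_by() into history.log.
def parse_history_log(path="history.log"):   # hypothetical helper
    runs, current = [], []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:                      # blank line ends a block
                if current:
                    runs.append(current)
                    current = []
            elif not line.startswith("#"):    # skip the comment header lines
                current.append([float(x) for x in line.split()])
    if current:
        runs.append(current)
    return runs                               # one list of numeric rows per run

for i, rows in enumerate(parse_history_log()):
    # rows[-1] is the final evaluation line; column 4 is validation accuracy
    print("run %d: final val-acc = %.4f" % (i, rows[-1][4]))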