AlexNet achieved a striking jump in classification accuracy in the ImageNet Large Scale Visual Recognition Challenge (ILSVRC) 2012 and sparked the current deep learning boom.
I read the paper and implemented it for study.
Paper
ImageNet Classification with Deep Convolutional Neural Networks
Overview of AlexNet
Structure of layer
Roughly, it has an input layer, 5 convolution layers, and 3 fully-connected layers.
The input size is 224×224 and the output is a 1×1000 matrix.
A 224x224 input image passes through convolution layer 1, which applies 11x11 filters with a stride of 4, producing 56x56 feature maps. Between convolution layers 1 and 2, 2 and 3, and between convolution layer 5 and fully-connected layer 1, max pooling is applied, and the final spatial size is reduced to 7x7.
By the fifth convolutional layer the number of feature-map channels has grown to 256. After three fully-connected layers, the final output is converted to pseudo-probabilities with a softmax function.
Other AlexNet innovations include local response normalization applied to the outputs of convolution layers 1 and 2 to reduce luminance bias, and dropout in fully-connected layers 1 and 2 to prevent overfitting.
Implementation
# -*- coding: utf-8 -*-
import os
import cv2
import math
import numpy as np
import tensorflow as tf
NUM_CLASSES = 1000  # ImageNet has 1000 output classes
IMAGE_SIZE = 224  # input images are 224x224
IMAGE_PIXELS = IMAGE_SIZE * IMAGE_SIZE * 3  # flattened RGB pixel count per image
# TF1-style command-line flags for training configuration.
flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_string('log_dir', 'log', 'Directory to save tensorboard logs')
flags.DEFINE_integer('max_steps', 1000, 'Number of steps to run trainer.')
flags.DEFINE_integer('batch_size', 10, 'Must divide evenly into the dataset sizes.')
flags.DEFINE_float('learning_rate', 1e-2, 'Initial learning rate.')
def inference(images_placeholder, keep_prob):
    """Build the AlexNet forward pass.

    Args:
        images_placeholder: flattened image batch, shape (None, IMAGE_PIXELS).
        keep_prob: scalar placeholder, dropout keep probability for fc1/fc2.

    Returns:
        Softmax pseudo-probabilities over NUM_CLASSES, shape (None, NUM_CLASSES).

    Fix over the original: the pre-softmax `y_conv = matmul(...) + b_fc3`
    was computed in the fc3 scope and then immediately overwritten by the
    softmax version — a dead graph node. It is now computed exactly once.
    """
    def weight_variable(shape, num):
        # Truncated-normal init scaled by 1/sqrt(num); `num` is used as the
        # fan-in estimate (NOTE(review): conv1 passes IMAGE_SIZE*IMAGE_SIZE,
        # not the true fan-in 11*11*3 — kept as-is to preserve behavior).
        initial = tf.truncated_normal(shape, stddev=1.0 / math.sqrt(float(num)))
        return tf.Variable(initial).initialized_value()

    def bias_variable(shape):
        initial = tf.zeros(shape)
        return tf.Variable(initial).initialized_value()

    def conv2d(x, W):
        # Stride-1, shape-preserving convolution.
        return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')

    def max_pool_3x3(x):
        # 3x3 overlapping max pooling with stride 2, as in the paper.
        return tf.nn.max_pool(x, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='SAME')

    # Restore the flat input to NHWC image form.
    x_image = tf.reshape(images_placeholder, [-1, IMAGE_SIZE, IMAGE_SIZE, 3])
    with tf.name_scope('conv1') as scope:
        # 11x11 filters, stride 4: 224x224 -> 56x56, 96 channels.
        W_conv1 = weight_variable([11, 11, 3, 96], IMAGE_SIZE * IMAGE_SIZE)
        b_conv1 = bias_variable([96])
        h_conv1 = tf.nn.relu(tf.nn.conv2d(x_image, W_conv1, strides=[1, 4, 4, 1], padding='SAME') + b_conv1)
    with tf.name_scope('pool1') as scope:
        # Local response normalization before pooling, per the paper.
        h_pool1 = max_pool_3x3(tf.nn.local_response_normalization(h_conv1))
    with tf.name_scope('conv2') as scope:
        W_conv2 = weight_variable([5, 5, 96, 256], 96)
        b_conv2 = bias_variable([256])
        h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
    with tf.name_scope('pool2') as scope:
        h_pool2 = max_pool_3x3(tf.nn.local_response_normalization(h_conv2))
    with tf.name_scope('conv3') as scope:
        W_conv3 = weight_variable([3, 3, 256, 384], 256)
        b_conv3 = bias_variable([384])
        h_conv3 = tf.nn.relu(conv2d(h_pool2, W_conv3) + b_conv3)
    with tf.name_scope('conv4') as scope:
        W_conv4 = weight_variable([3, 3, 384, 384], 384)
        b_conv4 = bias_variable([384])
        h_conv4 = tf.nn.relu(conv2d(h_conv3, W_conv4) + b_conv4)
    with tf.name_scope('conv5') as scope:
        W_conv5 = weight_variable([3, 3, 384, 256], 384)
        b_conv5 = bias_variable([256])
        h_conv5 = tf.nn.relu(conv2d(h_conv4, W_conv5) + b_conv5)
    with tf.name_scope('pool3') as scope:
        h_pool3 = max_pool_3x3(h_conv5)
    with tf.name_scope('fc1') as scope:
        # Final feature maps are 7x7x256; flatten before the dense layers.
        W_fc1 = weight_variable([7 * 7 * 256, 4096], (7 * 7 * 256))
        b_fc1 = bias_variable([4096])
        h_pool3_flat = tf.reshape(h_pool3, [-1, 7 * 7 * 256])
        h_fc1 = tf.nn.relu(tf.matmul(h_pool3_flat, W_fc1) + b_fc1)
        h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
    with tf.name_scope('fc2') as scope:
        W_fc2 = weight_variable([4096, 4096], 4096)
        b_fc2 = bias_variable([4096])
        h_fc2 = tf.nn.relu(tf.matmul(h_fc1_drop, W_fc2) + b_fc2)
        h_fc2_drop = tf.nn.dropout(h_fc2, keep_prob)
    with tf.name_scope('fc3') as scope:
        W_fc3 = weight_variable([4096, NUM_CLASSES], 4096)
        b_fc3 = bias_variable([NUM_CLASSES])
    with tf.name_scope('softmax') as scope:
        y_conv = tf.nn.softmax(tf.matmul(h_fc2_drop, W_fc3) + b_fc3)
    return y_conv
def loss(logits, labels):
    """Mean cross-entropy between predicted probabilities and one-hot labels.

    Note: despite the name, `logits` receives the *softmax output* of
    inference(). The original applied tf.nn.softmax here a second time,
    squashing already-normalized probabilities (double softmax) and
    flattening gradients; it also risked log(0) = -inf, and its outer
    reduce_mean wrapped an all-axes reduce_sum (a scalar), so the result
    was a batch *sum*, not a mean.

    Args:
        logits: predicted class probabilities, shape (None, NUM_CLASSES).
        labels: one-hot ground-truth labels, shape (None, NUM_CLASSES).

    Returns:
        Scalar tensor: mean per-example cross-entropy over the batch.
    """
    # Clip to avoid log(0); sum over classes per example, then average.
    cross_entropy = tf.reduce_mean(
        -tf.reduce_sum(labels * tf.log(tf.clip_by_value(logits, 1e-10, 1.0)), axis=1))
    tf.summary.scalar('cross_entropy', cross_entropy)
    return cross_entropy
def training(loss, learning_rate):
    """Create the training op: one Adam step minimizing `loss`.

    Args:
        loss: scalar loss tensor to minimize.
        learning_rate: initial Adam learning rate.

    Returns:
        An op that applies one optimization step when run.
    """
    optimizer = tf.train.AdamOptimizer(learning_rate)
    return optimizer.minimize(loss)
def accuracy(logits, labels):
    """Fraction of examples whose argmax prediction matches the label.

    Args:
        logits: predicted class scores/probabilities, shape (None, NUM_CLASSES).
        labels: one-hot ground-truth labels, shape (None, NUM_CLASSES).

    Returns:
        Scalar tensor: mean prediction accuracy over the batch.
    """
    predicted = tf.argmax(logits, 1)
    actual = tf.argmax(labels, 1)
    is_correct = tf.equal(predicted, actual)
    acc = tf.reduce_mean(tf.cast(is_correct, "float"))
    tf.summary.scalar('accuracy', acc)
    return acc
if __name__ == '__main__':
    # NOTE(review): training data is never loaded — these lists stay empty,
    # so batch_step is 0 and no optimization steps actually run. Populate
    # train_image (flattened pixel arrays) and train_label (one-hot vectors)
    # before running; verify the expected preprocessing against cv2 usage.
    train_image = []  # input data
    train_label = []
    with tf.Graph().as_default():
        # Placeholders: flattened images, one-hot labels, dropout keep prob.
        images_placeholder = tf.placeholder('float32', shape=(None, IMAGE_PIXELS))
        labels_placeholder = tf.placeholder('float32', shape=(None, NUM_CLASSES))
        keep_prob = tf.placeholder('float')
        # Build the graph: forward pass, loss, train op, accuracy metric.
        logits = inference(images_placeholder, keep_prob)
        loss_value = loss(logits, labels_placeholder)
        train_op = training(loss_value, FLAGS.learning_rate)
        acc = accuracy(logits, labels_placeholder)
        save_path = 'models/'
        model_name = 'model1.ckpt'
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        saver = tf.train.Saver()
        save_path_full = os.path.join(save_path, model_name)
        with tf.Session() as sess:
            summary_op = tf.summary.merge_all()
            summary_writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)
            sess.run(tf.global_variables_initializer())
            # Evaluation feed: the full dataset with dropout disabled.
            # NOTE(review): evaluating on the entire training set every step
            # is expensive for real data — consider a held-out subset.
            feed_dict_not_dropout = {
                images_placeholder: train_image[:],
                labels_placeholder: train_label[:],
                keep_prob: 1.0}
            # Number of whole mini-batches per epoch (remainder is dropped).
            batch_step = int(len(train_image)/FLAGS.batch_size)
            for step in range(FLAGS.max_steps):
                for i in range(batch_step):
                    batch = FLAGS.batch_size * i
                    # One optimizer step on this mini-batch, dropout active.
                    sess.run(train_op, feed_dict={
                        images_placeholder: train_image[batch:batch+FLAGS.batch_size],
                        labels_placeholder: train_label[batch:batch+FLAGS.batch_size],
                        keep_prob: 0.5})
                # Per-step accuracy and summaries on the no-dropout feed.
                train_accuracy, summary_str = sess.run([acc, summary_op], feed_dict_not_dropout)
                summary_writer.add_summary(summary_str, step)
                print("step %d, training accuracy: %g"%(step, train_accuracy))
            # Save the final model once training completes.
            # NOTE(review): original indentation was lost in this paste; the
            # save is placed after the step loop — confirm against the source.
            saver.save(sess, save_path_full)