Here we are going to implement convolution layers and pooling layers from scratch using just numpy.
zero_pad
This function pads all images of a batch of examples X with zeros.
def zero_pad(X, pad):
    """
    Pad the height and width of every image in a batch with zeros.

    Arguments:
    X -- numpy array of shape (m, n_H, n_W, n_C)
    pad -- integer, number of zero rows/columns added on each side of the
           height and width axes

    Returns:
    X_pad -- padded images of shape (m, n_H + 2*pad, n_W + 2*pad, n_C)
    """
    # Only the two spatial axes are padded; the batch axis (first) and the
    # channel axis (last) get a (0, 0) pad width and are left untouched.
    X_pad = np.pad(
        X,
        ((0, 0), (pad, pad), (pad, pad), (0, 0)),
        mode="constant",
        constant_values=0,
    )
    return X_pad


# conv_single_step
This implements a single step of convolution, in which we apply the filter to a single position of the input.
def conv_single_step(a_slice_prev, W, b):
    """
    Apply one filter defined by parameters W on a single slice of the output
    activation of the previous layer.

    Arguments:
    a_slice_prev -- slice of input data of shape (f, f, n_C_prev)
    W -- weights of one filter, shape (f, f, n_C_prev)
    b -- bias of that filter, shape (1, 1, 1) (any single-element value works)

    Returns:
    Z -- scalar, the sum of the element-wise product of the slice and the
         filter, plus the bias
    """
    # Element-wise product between the input window and the filter.
    s = a_slice_prev * W
    # Collapse the whole (f, f, n_C_prev) volume into one number.
    Z = np.sum(s)
    # Extract the bias as a true scalar. Calling np.float64() on a (1, 1, 1)
    # array returns an array, which would make Z an array instead of a scalar.
    Z = Z + np.asarray(b, dtype=np.float64).item()
    return Z


# conv_forward
Here we convolve the filters W on an input activation A_prev.
def conv_forward(A_prev, W, b, hparameters):
    """
    Forward pass of a convolution layer: convolve the filters W over A_prev.

    Arguments:
    A_prev -- output of the previous layer, shape (m, n_H_prev, n_W_prev, n_C_prev)
    W -- filters, shape (f, f, n_C_prev, n_C)
    b -- biases, shape (1, 1, 1, n_C)
    hparameters -- dictionary containing "stride" and "pad"

    Returns:
    Z -- conv output, shape (m, n_H, n_W, n_C)
    cache -- tuple (A_prev, W, b, hparameters), needed by conv_backward()
    """
    (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
    (f, f, n_C_prev, n_C) = W.shape
    stride = hparameters["stride"]
    pad = hparameters["pad"]

    # Output size: floor((n_prev - f + 2*pad) / stride) + 1
    n_H = (n_H_prev - f + 2 * pad) // stride + 1
    n_W = (n_W_prev - f + 2 * pad) // stride + 1

    Z = np.zeros((m, n_H, n_W, n_C))
    A_prev_pad = zero_pad(A_prev, pad)

    for i in range(m):
        a_prev_pad = A_prev_pad[i]
        for h in range(n_H):
            vert_start = h * stride
            vert_end = vert_start + f
            for w in range(n_W):
                horiz_start = w * stride
                horiz_end = horiz_start + f
                # The input window is the same for every filter, so it is
                # extracted once per (h, w) position rather than once per channel.
                a_slice_prev = a_prev_pad[vert_start:vert_end,
                                          horiz_start:horiz_end, :]
                for c in range(n_C):
                    weights = W[:, :, :, c]
                    biases = b[:, :, :, c]
                    Z[i, h, w, c] = conv_single_step(a_slice_prev,
                                                     weights, biases)

    cache = (A_prev, W, b, hparameters)
    return Z, cache


# pool_forward
def pool_forward(A_prev, hparameters, mode="max"):
    """
    Forward pass of a pooling layer.

    Arguments:
    A_prev -- input data, shape (m, n_H_prev, n_W_prev, n_C_prev)
    hparameters -- dictionary containing "f" (window size) and "stride"
    mode -- the pooling mode, either "max" or "average"

    Returns:
    A -- output of the pooling layer, shape (m, n_H, n_W, n_C_prev)
    cache -- tuple (A_prev, hparameters), needed by the backward pass

    Raises:
    ValueError -- if mode is neither "max" nor "average"
    """
    # Fail loudly instead of silently returning an all-zero output.
    if mode not in ("max", "average"):
        raise ValueError('mode must be "max" or "average", got %r' % (mode,))

    (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
    f = hparameters["f"]
    stride = hparameters["stride"]

    n_H = 1 + (n_H_prev - f) // stride
    n_W = 1 + (n_W_prev - f) // stride
    # Pooling acts on each channel independently, so the channel count is unchanged.
    n_C = n_C_prev

    A = np.zeros((m, n_H, n_W, n_C))
    pool = np.max if mode == "max" else np.mean

    for i in range(m):
        for h in range(n_H):
            vert_start = h * stride
            vert_end = vert_start + f
            for w in range(n_W):
                horiz_start = w * stride
                horiz_end = horiz_start + f
                for c in range(n_C):
                    a_prev_slice = A_prev[i, vert_start:vert_end,
                                          horiz_start:horiz_end, c]
                    A[i, h, w, c] = pool(a_prev_slice)

    cache = (A_prev, hparameters)
    return A, cache


# Convolutional Layer Backward Pass
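The formula for computing $dA$ with respect to the cost for a certain filter $W_c$ and a given training example is:
$$dA \mathrel{+}= \sum_{h=0}^{n_H-1} \sum_{w=0}^{n_W-1} W_c \times dZ_{hw}$$
where $W_c$ is a filter and $dZ_{hw}$ is a scalar corresponding to the gradient of the cost with respect to the output of the conv layer $Z$ at the $h$-th row and the $w$-th column.
For $dW_c$ we use the following formula:
$$dW_c \mathrel{+}= \sum_{h=0}^{n_H-1} \sum_{w=0}^{n_W-1} a_{slice} \times dZ_{hw}$$
where $a_{slice}$ corresponds to the slice which was used to generate the activation $Z_{hw}$.
Similarly, in order to compute $db$ with respect to the cost for a certain filter $W_c$ we use:
$$db = \sum_{h=0}^{n_H-1} \sum_{w=0}^{n_W-1} dZ_{hw}$$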
def conv_backward(dZ, cache):
    """
    Backward pass of a convolution layer.

    Args:
    dZ -- gradient of the cost w.r.t. the output of the conv layer.
          Has dimensions (m, n_H, n_W, n_C)
    cache -- output of conv_forward(): (A_prev, W, b, hparameters)

    Returns:
    dA_prev -- gradient of cost w.r.t. input of conv layer (A_prev)
               Has dimensions (m, n_H_prev, n_W_prev, n_C_prev)
    dW -- gradient of the cost w.r.t. the weights of the conv layer
          Has dimensions (f, f, n_C_prev, n_C)
    db -- gradient of cost w.r.t. the biases of the conv layer
          Has dimensions (1, 1, 1, n_C)
    """
    (A_prev, W, b, hparameters) = cache
    (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
    (f, f, n_C_prev, n_C) = W.shape
    stride = hparameters["stride"]
    pad = hparameters["pad"]
    (m, n_H, n_W, n_C) = dZ.shape

    dA_prev = np.zeros((m, n_H_prev, n_W_prev, n_C_prev))
    dW = np.zeros((f, f, n_C_prev, n_C))
    db = np.zeros((1, 1, 1, n_C))

    # The forward convolution ran on the padded input, so gradients are
    # accumulated on padded arrays and cropped back afterwards.
    A_prev_pad = zero_pad(A_prev, pad)
    dA_prev_pad = zero_pad(dA_prev, pad)

    for i in range(m):
        a_prev_pad = A_prev_pad[i]
        da_prev_pad = dA_prev_pad[i]
        for h in range(n_H):
            vert_start = stride * h
            vert_end = vert_start + f
            for w in range(n_W):
                horiz_start = stride * w
                horiz_end = horiz_start + f
                # The window that produced Z[i, h, w, :] is shared by all filters.
                a_slice = a_prev_pad[vert_start:vert_end,
                                     horiz_start:horiz_end, :]
                for c in range(n_C):
                    da_prev_pad[vert_start:vert_end,
                                horiz_start:horiz_end, :] += W[:, :, :, c] * dZ[i, h, w, c]
                    dW[:, :, :, c] += a_slice * dZ[i, h, w, c]
                    db[:, :, :, c] += dZ[i, h, w, c]
        # Crop the padding with explicit end indices: the slice pad:-pad
        # is empty when pad == 0 and would break the unpadded case.
        dA_prev[i, :, :, :] = da_prev_pad[pad:pad + n_H_prev,
                                          pad:pad + n_W_prev, :]

    return dA_prev, dW, db


# Towards the end of the backward pass we remove the padding we previously
# added to the image. The padded zeros are neither parameters nor inputs:
# they are fake values that always remain zero. We pad during backprop only
# because the convolution operation was performed on a padded image. After
# computing the gradients, we extract the part corresponding to the real input.
Pooling Layer Backward Pass
Even though a pooling layer has no parameters for backprop to update, you still need to backpropagate the gradient through the pooling layer in order to compute gradients for layers that came before the pooling layer.
We are going to implement a helper function called create_mask_from_window() which keeps track of where the maximum of the matrix is.
def create_mask_from_window(x):
    """
    Create a boolean mask marking where the maximum of x is located.

    Arguments:
    x -- numpy array (one pooling window)

    Returns:
    mask -- boolean array of the same shape as x, True where an entry equals
            the maximum of x (every tied maximum is marked)
    """
    mask = x == np.max(x)
    return mask


# In max pooling, for each input window, all the influence on the output came
# from a single input value: the max. In average pooling, every element of
# the input window has equal influence on the output.
So for avg pooling we need another helper function called distribute_value().
def distribute_value(dz, shape):
    """
    Spread a value evenly over a matrix of the given shape.

    Arguments:
    dz -- input scalar
    shape -- tuple (n_H, n_W), the shape of the output matrix

    Returns:
    a -- array of shape (n_H, n_W) in which every entry is dz / (n_H * n_W),
         so the entries sum to dz
    """
    n_H, n_W = shape
    # Each of the n_H * n_W cells receives an equal share of dz.
    average = dz / (n_H * n_W)
    a = np.full((n_H, n_W), average, dtype=np.float64)
    return a


# Now we can finally implement the pool_backward function for both max and
# average pooling.
def pool_backward(dA, cache, mode="max"):
    """
    Backward pass of a pooling layer.

    Args:
    dA -- gradient of cost with respect to the output of the pooling layer, same shape as A
    cache -- cache output from the forward pass of the pooling layer, contains the layer's input and hparameters
    mode -- the pooling mode you would like to use, defined as a string ("max" or "average")

    Returns:
    dA_prev -- gradient of cost with respect to the input of the pooling layer, same shape as A_prev

    Raises:
    ValueError -- if mode is neither "max" nor "average"
    """
    # Fail loudly instead of silently returning an all-zero gradient.
    if mode not in ("max", "average"):
        raise ValueError('mode must be "max" or "average", got %r' % (mode,))

    (A_prev, hparameters) = cache
    stride = hparameters["stride"]
    f = hparameters["f"]
    m, n_H, n_W, n_C = dA.shape

    dA_prev = np.zeros(A_prev.shape)

    for i in range(m):
        a_prev = A_prev[i]
        for h in range(n_H):
            vert_start = h * stride
            vert_end = vert_start + f
            for w in range(n_W):
                horiz_start = w * stride
                horiz_end = horiz_start + f
                for c in range(n_C):
                    if mode == "max":
                        # Route the gradient only to the position(s) that
                        # held the maximum of the window.
                        a_prev_slice = a_prev[vert_start:vert_end,
                                              horiz_start:horiz_end, c]
                        mask = create_mask_from_window(a_prev_slice)
                        dA_prev[i, vert_start:vert_end,
                                horiz_start:horiz_end, c] += mask * dA[i, h, w, c]
                    else:
                        # Average pooling: every input in the window gets an
                        # equal share of the gradient.
                        da = dA[i, h, w, c]
                        dA_prev[i, vert_start:vert_end,
                                horiz_start:horiz_end, c] += distribute_value(da, (f, f))

    return dA_prev


# In the next few notes we will look at examples of some classic networks, such as:
LeNet-5, AlexNet, VGG, ResNet, and the Inception neural network.