"""Propagation.
==============
This module contains the critical functionality to propagate information forward and backward through the neural net.
"""
# Copyright (C) 2018 Steven H. Berguin
# This work is licensed under the MIT License.
from __future__ import annotations  # needed for X | None annotations on Python < 3.10
from typing import TYPE_CHECKING
if TYPE_CHECKING:
    from .cache import Cache
    from .data import Dataset
    from .parameters import Parameters
import numpy as np
from .activation import ACTIVATIONS


def eye(n: int, m: int) -> np.ndarray:
    """Copy the (n, n) identity matrix m times along a third axis.

    :param n: number of rows (and columns) of the identity matrix
    :param m: number of copies to stack along the last axis
    :return: array of shape (n, n, m)
    """
    identity = np.eye(n, dtype=float)
    return np.repeat(identity.reshape((n, n, 1)), m, axis=2)


def first_layer_forward(X: np.ndarray, cache: Cache | None = None) -> None:
    """Compute input layer activations (in place).

    :param X: training data inputs, array of shape (n_x, m)
    :param cache: neural net cache that stores neural net quantities
        computed during forward prop for each layer, so they can be
        accessed during backprop to avoid re-computing them
    """
    X = X.astype(float, copy=False)
    if cache is not None:
        cache.A[0][:] = X


def first_layer_partials(X: np.ndarray, cache: Cache | None) -> None:
    """Compute input layer partials (in place).

    :param X: training data inputs, array of shape (n_x, m)
    :param cache: neural net cache that stores neural net quantities
        computed during forward prop for each layer, so they can be
        accessed during backprop to avoid re-computing them
    """
    X = X.astype(float, copy=False)
    if cache is not None:
        n_x, m = X.shape
        cache.A_prime[0][:] = eye(n_x, m)


def next_layer_partials(layer: int, parameters: Parameters, cache: Cache) -> np.ndarray:
    """Compute the partials of one layer w.r.t. each input x_j (in place).

    :param layer: index of current layer.
    :param parameters: object that stores neural net parameters for each
        layer
    :param cache: neural net cache that stores neural net quantities
        computed during forward prop for each layer, so they can be
        accessed during backprop to avoid re-computing them
    :return: partials of the current layer, array of shape (n, n_x, m)
        where n is the number of nodes in the layer
    """
    s = layer
    r = layer - 1
    W = parameters.W[s]
    g = ACTIVATIONS[parameters.a[s]]
    cache.G_prime[s][:] = g.first_derivative(cache.Z[s], cache.A[s])
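    # Forward-mode chain rule, applied one input x_j at a time:
    #   Z'[s] = W[s] A'[s-1]   and   A'[s] = g'(Z[s]) * Z'[s]   (elementwise product)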
    for j in range(parameters.n_x):
        cache.Z_prime[s][:, j, :] = np.dot(W, cache.A_prime[r][:, j, :])
        cache.A_prime[s][:, j, :] = cache.G_prime[s] * cache.Z_prime[s][:, j, :]
    return cache.A_prime[s]


def next_layer_forward(layer: int, parameters: Parameters, cache: Cache) -> None:
    """Propagate forward through one layer (in place).

    :param layer: index of current layer.
    :param parameters: object that stores neural net parameters for each
        layer
    :param cache: neural net cache that stores neural net quantities
        computed during forward prop for each layer, so they can be
        accessed during backprop to avoid re-computing them
    """
    s = layer
    r = layer - 1
    W = parameters.W[s]
    b = parameters.b[s]
    g = ACTIVATIONS[parameters.a[s]]
    Z = cache.Z[s]
    A = cache.A[s]
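    # Affine transformation followed by the activation function:
    #   Z[s] = W[s] A[s-1] + b[s],   A[s] = g(Z[s])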
    np.dot(W, cache.A[r], out=Z)
    Z += b
    g.evaluate(Z, A)


def model_partials_forward(
    X: np.ndarray,
    parameters: Parameters,
    cache: Cache,
) -> tuple[np.ndarray, np.ndarray]:
    """Propagate forward in order to predict the response(s) and partial(s).

    :param X: training data inputs, array of shape (n_x, m)
    :param parameters: object that stores neural net parameters for each
        layer
    :param cache: neural net cache that stores neural net quantities
        computed during forward prop for each layer, so they can be
        accessed during backprop to avoid re-computing them
    :return: tuple of the output-layer activations and partials
    """
    first_layer_forward(X, cache)
    first_layer_partials(X, cache)
    for layer in parameters.layers[1:]:  # type: ignore[index]
        next_layer_forward(layer, parameters, cache)
        next_layer_partials(layer, parameters, cache)
    return cache.A[-1], cache.A_prime[-1]


def model_forward(X: np.ndarray, parameters: Parameters, cache: Cache) -> np.ndarray:
    """Propagate forward in order to predict the response(s).

    :param X: training data inputs, array of shape (n_x, m)
    :param parameters: object that stores neural net parameters for each
        layer
    :param cache: neural net cache that stores neural net quantities
        computed during forward prop for each layer, so they can be
        accessed during backprop to avoid re-computing them
    :return: output-layer activations
    """
    first_layer_forward(X, cache)
    for layer in parameters.layers[1:]:  # type: ignore[index]
        next_layer_forward(layer, parameters, cache)
    return cache.A[-1]


def partials_forward(X: np.ndarray, parameters: Parameters, cache: Cache) -> np.ndarray:
    """Propagate forward in order to predict the partial(s).

    :param X: training data inputs, array of shape (n_x, m)
    :param parameters: object that stores neural net parameters for each
        layer
    :param cache: neural net cache that stores neural net quantities
        computed during forward prop for each layer, so they can be
        accessed during backprop to avoid re-computing them
    :return: output-layer partials
    """
    return model_partials_forward(X, parameters, cache)[-1]


def last_layer_backward(cache: Cache, data: Dataset) -> None:
    """Propagate backward through last layer (in place).

    :param cache: neural net cache that stores neural net quantities
        computed during forward prop for each layer, so they can be
        accessed during backprop to avoid re-computing them
    :param data: object containing training data and associated metadata
    """
    cache.dA[-1][:] = data.Y_weights * (cache.A[-1] - data.Y)
    if data.J is not None:
        cache.dA_prime[-1][:] = data.J_weights * (cache.A_prime[-1] - data.J)


def next_layer_backward(
    layer: int,
    parameters: Parameters,
    cache: Cache,
    data: Dataset,
    lambd: float,
) -> None:
    """Propagate backward through one layer (in place).

    :param layer: index of current layer.
    :param parameters: object that stores neural net parameters for each
        layer
    :param cache: neural net cache that stores neural net quantities
        computed during forward prop for each layer, so they can be
        accessed during backprop to avoid re-computing them
    :param data: object containing training data and associated metadata
    :param lambd: coefficient that multiplies the regularization term in
        the cost function
    """
    s = layer
    r = layer - 1
    g = ACTIVATIONS[parameters.a[s]]
    g.first_derivative(cache.Z[s], cache.A[s], cache.G_prime[s])
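    # Standard backprop through the affine + activation step, with dZ = g'(Z) * dA:
    #   dW = dZ A[s-1]^T / m + (lambd / m) W,   db = sum(dZ) / m,   dA[s-1] = W^T dZ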
    np.dot(cache.G_prime[s] * cache.dA[s], cache.A[r].T, out=parameters.dW[s])
    parameters.dW[s] /= data.m
    parameters.dW[s] += lambd / data.m * parameters.W[s]
    np.sum(cache.G_prime[s] * cache.dA[s], axis=1, keepdims=True, out=parameters.db[s])
    parameters.db[s] /= data.m
    np.dot(parameters.W[s].T, cache.G_prime[s] * cache.dA[s], out=cache.dA[r])


def gradient_enhancement(
    layer: int,
    parameters: Parameters,
    cache: Cache,
    data: Dataset,
) -> None:
    """Add gradient enhancement to backprop (in place).

    :param layer: index of current layer.
    :param parameters: object that stores neural net parameters for each
        layer
    :param cache: neural net cache that stores neural net quantities
        computed during forward prop for each layer, so they can be
        accessed during backprop to avoid re-computing them
    :param data: object containing training data and associated metadata
    """
    if data.J is None:
        return
    if np.all(data.J_weights == 0.0):
        return
    s = layer
    r = layer - 1
    g = ACTIVATIONS[parameters.a[s]]
    cache.G_prime_prime[s][:] = g.second_derivative(
        cache.Z[s],
        cache.A[s],
        cache.G_prime[s],
    )
    coefficient = 1 / data.m
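    # Extra gradient contributions from the error on the predicted partials,
    # obtained by differentiating A'[s] = g'(Z[s]) * Z'[s] (with Z[s] = W A[s-1] + b
    # and Z'[s] = W A'[s-1]): the g''(Z) * Z' terms arise from the dependence of
    # g'(Z) on W, b, and A[s-1]; the g'(Z) terms arise from the dependence of Z'
    # on W and A'[s-1].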
    for j in range(parameters.n_x):
        parameters.dW[s] += coefficient * (
            np.dot(
                cache.dA_prime[s][:, j, :]
                * cache.G_prime_prime[s]
                * cache.Z_prime[s][:, j, :],
                cache.A[r].T,
            )
            + np.dot(
                cache.dA_prime[s][:, j, :] * cache.G_prime[s],
                cache.A_prime[r][:, j, :].T,
            )
        )
        parameters.db[s] += coefficient * np.sum(
            cache.dA_prime[s][:, j, :]
            * cache.G_prime_prime[s]
            * cache.Z_prime[s][:, j, :],
            axis=1,
            keepdims=True,
        )
        cache.dA[r] += np.dot(
            parameters.W[s].T,
            cache.dA_prime[s][:, j, :]
            * cache.G_prime_prime[s]
            * cache.Z_prime[s][:, j, :],
        )
        cache.dA_prime[r][:, j, :] = np.dot(
            parameters.W[s].T,
            cache.dA_prime[s][:, j, :] * cache.G_prime[s],
        )


def model_backward(
    data: Dataset,
    parameters: Parameters,
    cache: Cache,
    lambd: float = 0.0,
) -> None:
    """Propagate backward through all layers (in place).

    :param data: object containing training data and associated metadata
    :param parameters: object that stores neural net parameters for each
        layer
    :param cache: neural net cache that stores neural net quantities
        computed during forward prop for each layer, so they can be
        accessed during backprop to avoid re-computing them
    :param lambd: regularization coefficient to avoid overfitting
        (defaults to zero)
    """
    last_layer_backward(cache, data)
    for layer in reversed(parameters.layers):  # type: ignore[call-overload]
        if layer > 0:
            next_layer_backward(layer, parameters, cache, data, lambd)
            gradient_enhancement(layer, parameters, cache, data)