Source code for jenn.core.training

"""Training.
============

This class implements the core algorithm responsible for training the neural networks."""

import functools
from collections import defaultdict
from typing import Union

import numpy as np

from .cache import Cache
from .cost import Cost
from .data import Dataset
from .optimization import ADAMOptimizer
from .parameters import Parameters
from .propagation import model_backward, model_partials_forward


def objective_function(
    X: np.ndarray,
    cost: Cost,
    parameters: Parameters,
    cache: Cache,
    stacked_params: np.ndarray,
) -> np.float64:
    """Evaluate cost function for training.

    :param X: training data inputs, array of shape (n_x, m)
    :param cost: cost function to be evaluated
    :param parameters: object that stores neural net parameters for each layer
    :param cache: neural net cache that stores neural net quantities computed
        during forward prop for each layer, so they can be accessed during
        backprop to avoid re-computing them
    :param stacked_params: neural network parameters returned by the optimizer,
        represented as a single array of stacked parameters for all layers.
    """
    parameters.unstack(stacked_params)
    Y_pred, J_pred = model_partials_forward(X, parameters, cache)
    return cost.evaluate(Y_pred, J_pred)
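
# --- Illustrative usage sketch (not part of the original module) ------------
# Minimal wiring of the objective, mirroring what train_model does below:
# bind data, cost, parameters and cache with functools.partial, then evaluate
# on the stacked parameter vector. The `data` and `parameters` arguments are
# assumed to be prepared elsewhere; `data` is assumed to expose `.X` (inputs)
# and `.m` (number of examples), like the mini-batches used in train_model.
def _example_objective_call(
    data: Dataset, parameters: Parameters, lambd: float = 0.0
) -> np.float64:
    cache = Cache(parameters.layer_sizes, data.m)
    cost = Cost(data, parameters, lambd)
    func = functools.partial(objective_function, data.X, cost, parameters, cache)
    return func(parameters.stack())  # cost at the current parameter values
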
def objective_gradient(
    data: Dataset,
    parameters: Parameters,
    cache: Cache,
    lambd: float,
    stacked_params: np.ndarray,
) -> np.ndarray:  # noqa: PLR0913
    """Evaluate cost function gradient for backprop.

    :param data: object containing training data and associated metadata
    :param parameters: object that stores neural net parameters for each layer
    :param cache: neural net cache that stores neural net quantities computed
        during forward prop for each layer, so they can be accessed during
        backprop to avoid re-computing them
    :param lambd: coefficient that multiplies regularization term in cost function
    :param stacked_params: neural network parameters returned by the optimizer,
        represented as a single array of stacked parameters for all layers.
    """
    parameters.unstack(stacked_params)
    model_backward(data, parameters, cache, lambd)
    return parameters.stack_partials()
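
# --- Illustrative gradient check (not part of the original module) ----------
# A sketch of how objective_gradient could be sanity-checked against a central
# finite-difference approximation of objective_function. Assumptions: `data`
# and `parameters` are prepared elsewhere; `data` exposes `.X` and `.m` like
# the mini-batches used in train_model; parameters.stack() and
# parameters.stack_partials() return arrays of matching size. Note that
# objective_gradient reads forward-prop quantities from the cache, so the
# objective is evaluated first to populate it.
def _example_gradient_check(
    data: Dataset, parameters: Parameters, lambd: float = 0.0, step: float = 1e-6
) -> np.ndarray:
    cache = Cache(parameters.layer_sizes, data.m)
    cost = Cost(data, parameters, lambd)
    x0 = parameters.stack().copy()
    objective_function(data.X, cost, parameters, cache, x0)  # populate cache
    analytic = objective_gradient(data, parameters, cache, lambd, x0).copy()
    numeric = np.zeros_like(x0)
    for i in range(x0.size):
        x_plus, x_minus = x0.copy(), x0.copy()
        x_plus.ravel()[i] += step
        x_minus.ravel()[i] -= step
        f_plus = objective_function(data.X, cost, parameters, cache, x_plus)
        f_minus = objective_function(data.X, cost, parameters, cache, x_minus)
        numeric.ravel()[i] = (f_plus - f_minus) / (2.0 * step)
    parameters.unstack(x0)  # restore original parameters
    return np.abs(analytic.ravel() - numeric.ravel())  # elementwise error
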
def train_model(
    data: Dataset,
    parameters: Parameters,
    alpha: float = 0.05,
    beta: Union[np.ndarray, float] = 1.0,
    gamma: Union[np.ndarray, float] = 1.0,
    lambd: float = 0.0,
    beta1: float = 0.9,
    beta2: float = 0.99,
    tau: float = 0.5,
    tol: float = 1e-12,
    max_count: int = 1000,
    epsilon_absolute: float = 1e-12,
    epsilon_relative: float = 1e-12,
    epochs: int = 1,
    max_iter: int = 200,
    batch_size: Union[int, None] = None,
    shuffle: bool = True,
    random_state: Union[int, None] = None,
    is_backtracking: bool = False,
    is_verbose: bool = False,
) -> dict:  # noqa: PLR0913
    r"""Train neural net.

    :param data: object containing training data and associated metadata
    :param parameters: object that stores neural net parameters for each layer
    :param alpha: learning rate :math:`\alpha`
    :param beta: LSE coefficients [defaulted to one] (optional)
    :param gamma: jacobian-enhancement regularization coefficient
        [defaulted to one] (optional)
    :param lambd: regularization coefficient to avoid overfitting
        [defaulted to zero] (optional)
    :param beta1: exponential decay rate of 1st moment vector
        :math:`\beta_1\in[0, 1)`
    :param beta2: exponential decay rate of 2nd moment vector
        :math:`\beta_2\in[0, 1)`
    :param tau: amount by which to reduce :math:`\alpha := \tau \times \alpha`
        on each iteration
    :param tol: stop when cost function doesn't improve more than specified
        tolerance
    :param max_count: stop when line search iterations exceed maximum count
        specified
    :param epsilon_absolute: absolute error stopping criterion
    :param epsilon_relative: relative error stopping criterion
    :param epochs: number of passes through data
    :param max_iter: maximum number of optimizer iterations allowed
    :param batch_size: mini batch size (if None, single batch with all data)
    :param shuffle: whether to shuffle data points or not
    :param random_state: random seed (useful to make runs repeatable)
    :param is_backtracking: whether or not to use backtracking during line search
    :param is_verbose: print out progress for each iteration, each batch,
        each epoch
    :return: cost function training history, accessed as
        ``cost = history[f"epoch_{e}"][f"batch_{b}"][iter]``
    """
    history: dict[str, dict[str, Union[list[np.ndarray], None]]] = defaultdict(dict)
    optimizer = ADAMOptimizer(
        beta_1=beta1,
        beta_2=beta2,
        tau=tau,
        tol=tol,
        max_count=is_backtracking * max_count,
    )
    data.set_weights(beta, gamma)
    for e in range(epochs):
        batches = data.mini_batches(batch_size, shuffle, random_state)
        for b, batch in enumerate(batches):
            cache = Cache(parameters.layer_sizes, batch.m)
            cost = Cost(batch, parameters, lambd)
            func = functools.partial(
                objective_function,
                batch.X,
                cost,
                parameters,
                cache,
            )
            grad = functools.partial(
                objective_gradient,
                batch,
                parameters,
                cache,
                lambd,
            )
            optimizer.minimize(
                x=parameters.stack(),
                f=func,
                dfdx=grad,
                alpha=alpha,
                max_iter=max_iter,
                epsilon_absolute=epsilon_absolute,
                epsilon_relative=epsilon_relative,
                verbose=is_verbose,
                epoch=e,
                batch=b,
            )
            history[f"epoch_{e}"][f"batch_{b}"] = optimizer.cost_history
    return history
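
# --- Illustrative training call (not part of the original module) -----------
# A minimal sketch of how train_model might be driven, assuming `data` and
# `parameters` have already been constructed and initialized elsewhere (their
# classes live in jenn.core.data and jenn.core.parameters, as imported above,
# and are not shown here). The returned history is keyed by epoch and batch,
# e.g. history["epoch_0"]["batch_0"] is the optimizer cost history recorded
# for the first mini-batch of the first epoch.
def _example_training_run(data: Dataset, parameters: Parameters) -> dict:
    history = train_model(
        data,
        parameters,
        alpha=0.05,        # learning rate
        gamma=1.0,         # jacobian-enhancement weight
        lambd=0.1,         # regularization coefficient (illustrative value)
        epochs=5,
        batch_size=64,
        max_iter=100,
        shuffle=True,
        random_state=42,
        is_verbose=False,
    )
    first_batch_costs = history["epoch_0"]["batch_0"]  # list of cost values (or None)
    if first_batch_costs:
        print(f"final cost on first batch of first epoch: {first_batch_costs[-1]}")
    return history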