General structures

Let $\mathcal{L}, \mathcal{N}$ denote the general structs for layer and network objects.

mutable struct 𝓛
    neurons::Vector
    W::Matrix
    biases::Vector
    f::Function
    # The activation defaults to σ (defined in the appendix below), so that
    # layers may also be constructed with three arguments, as is done later on.
    function 𝓛(neurons::Vector, W::Matrix, biases::Vector, f::Function = σ)
        new(neurons, W, biases, f)
    end
end

mutable struct 𝓝
    net::Dict{Int, 𝓛}
    dims::Vector
    nlayers::Int
    nparams::Int

    function 𝓝(dims::Vector)
        structure = Dict{Int, 𝓛}()
        # The input layer holds activations only; it has no weights or biases of its own.
        structure[1] = 𝓛(zeros(Float32, dims[1]), Array{Float32}(undef, 0, 0), zeros(Float32, 0))
        for i in 2:length(dims)
            neurons = zeros(Float32, dims[i])
            weights = rand(Float32, dims[i], dims[i-1])
            biases = rand(Float32, dims[i])
            structure[i] = 𝓛(neurons, weights, biases)
        end
        n::Int = 0
        for i in 2:length(dims)
            n += dims[i - 1] * dims[i] + dims[i]
        end

        new(structure, dims, length(dims), n)
    end 
end
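For instance (a quick sketch; the dimensions below are arbitrary), constructing a network stores one 𝓛 per layer and counts its trainable parameters:

N = 𝓝([2, 3, 1])   # two inputs, a hidden layer of three neurons, one output
N.nlayers           # 3
N.nparams           # 2*3 + 3 + 3*1 + 1 = 13 weights and biases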

It is useful to conceive of $𝑳, 𝑡$, the sets of possible layers and networks, in group-theoretic terms. The usefulness of this conception will become clear later; for the moment, let us put the formalization into practice via the following definitions.

"""Addition operator over the set 𝑳 of layer objects.
Observe that (𝑳, ⊕) is a non-abelian group. In particular,
the operation

                ℒ₁ ⊕ ℒ₂ = ℒ₃,          ℒᵢ ∈ 𝑳

is neuron-preserving with respect to ℒ₁. That is,
ℒ₃ has the same activations (and activation function) as ℒ₁."""
function ⊕(L₁::𝓛, L₂::𝓛)
    L₃ = 𝓛(L₁.neurons, L₁.W + L₂.W, L₁.biases + L₂.biases, L₁.f)
    return L₃
end

"""Scalar-layer multiplication.
⊗ is the group action

            ⊗ : ℝ × 𝑳 → 𝑳
"""
function ⊗(λ::Number, L::𝓛)
    W = λ * L.W
    b = λ * L.biases
    return 𝓛(L.neurons, W, b, L.f)
end

"""Scalar-network multiplication ⊗.
Might be thought of as the group action

            ⊗ : ℝ × 𝑡 → 𝑡

Rather than mutating N in place, it builds a new network whose
layers are the ⊗-scaled layers of N."""
function ⊗(λ::Number, N::𝓝)
    M = 𝓝(N.dims)
    M.net = Dict(i => ⊗(λ, N.net[i]) for i in 1:N.nlayers)
    return M
end

"""Addition operator over the set 𝑡 of network objects.
Observe that (𝑡, ⊕) is a non-abelian group. In particular,

                𝐧₁ ⊕ 𝐧₂ = 𝐧₃,          𝐧ᵢ ∈ 𝑡

is neuron-preserving with respect to 𝐧₁."""
function ⊕(N₁::𝓝, N₂::𝓝)
    if N₁.dims != N₂.dims
        throw(DimensionMismatch("networks must share the same dimensions"))
    end
    N₃ = 𝓝(N₁.dims)
    N₃.net = mergewith(⊕, N₁.net, N₂.net)
    return N₃
end
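To see the algebra in action, here is a small check (the dimensions and the call pattern are mine, for illustration); note how the activations of the left operand are preserved:

A = 𝓝([2, 3, 1])
B = 𝓝([2, 3, 1])
A.net[2].neurons = rand(Float32, 3)     # give A some non-trivial activations
C = A ⊕ (2 ⊗ B)                         # weights: A.W + 2B.W, biases likewise
C.net[2].neurons == A.net[2].neurons    # true: ⊕ is neuron-preserving w.r.t. A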

One can now materialize the training of a network very naturally via the following conception. For a given network $\mathcal{N}$, one defines the gradient of each weight $w_{ij}^{(l)}$ in the network as the corresponding weight $\nabla w_{ij}^{(l)}$ of a similar network $\nabla \mathcal{N}$ (and likewise for the biases).

Note. I say two networks are similar iff their dimensions are the same.

Then

$$ \begin{align} \mathcal{N}(e + 1) = \mathcal{N}(e) \oplus \bigl( -\eta \otimes \nabla \mathcal{N}(e) \bigr) \end{align} $$

defines the updated network in epoch $e + 1$. The parameters of $\mathcal{N}(e + 1)$ are shifted in the direction of steepest descent, provided that $\oplus$ and $\otimes$ are well-defined as network-network and scalar-network operators (which is precisely what we have done above).
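As a one-line sketch (the name `update` and the learning rate `η` are mine, not part of the definitions above), the rule translates directly into the operators we defined:

# 𝒩(e+1) = 𝒩(e) ⊕ (-η ⊗ ∇𝒩(e)); ∇𝓝 is the similar network returned by backprop below.
update(N::𝓝, ∇𝓝::𝓝, η::Number) = N ⊕ ((-η) ⊗ ∇𝓝)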

This formalization of non-abelian groups for the network and layer objects, by virtue of which we conceptualized training, is not at all necessary. One can indeed update a network's parameters without formalizing, for example, a network-network operator, and in fact this is what is generally done. However, I find this formal approach to have sharper outlines.

Forward and backward propagation

First, let us define a simple helper function:

using LinearAlgebra   # for mul!, transpose and kron used below

function compute_layer(W::Matrix, a::Vector, b::Vector)
    Y = zeros(Float32, length(b))
    mul!(Y, W, a)     # Y = W * a, computed in place
    Y += b
    return Y
end

This function computes $\mathbf{W}\mathbf{a} + \mathbf{b}$, the linear combination (plus bias) that is then evaluated by some activation function.
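For instance (a small worked check with arbitrary values):

W = Float32[1 2; 3 4]
a = Float32[1, 1]
b = Float32[0.5, 0.5]
compute_layer(W, a, b)    # Float32[3.5, 7.5]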

function fprop(x::Vector, N::𝓝)
    N.net[1].neurons = x          # the input layer simply holds x
    L = N.nlayers
    for i in 1:(L-1)
        z = compute_layer(N.net[i+1].W, N.net[i].neurons, N.net[i+1].biases)
        N.net[i+1].neurons = N.net[i+1].f(z)
    end
    return N
end

function backprop(N::𝓝, y⃗::Vector)
    ∇𝓝 = 𝓝(N.dims)
    # Derivative of the squared-error cost with respect to the output activations
    ∂C∂aᴸ = 2 * (N.net[N.nlayers].neurons - y⃗)
    for l in reverse(2:N.nlayers)
        # Since a = σ(z), we recover σ′(z) from the stored activations via logit
        ∂σ∂z = dσdx(logit.(N.net[l].neurons))
        P = hadamard(∂C∂aᴸ, ∂σ∂z)
        ∇W = kron(P, transpose(N.net[l-1].neurons))   # outer product: the weight gradients
        ∇𝓝.net[l].W = ∇W
        ∇𝓝.net[l].biases = P
        # Change the dimension of the vector appropriately
        ∂C∂aᴸ = zeros(Float32, size(N.net[l].W, 2))
        mul!(∂C∂aᴸ, transpose(N.net[l].W), P)
    end
    return ∇𝓝
end

The logic justifying these algorithms should be clear to anyone familiar with the basics of neural network theory. The only difference from a traditional backpropagation routine is that we update the parameters via network-network operations.
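To tie the pieces together, here is a hedged sketch of one training epoch; `data` is a hypothetical collection of `(x, y⃗)` pairs and `η` a learning rate, neither of which appears above.

function train_epoch(N::𝓝, data, η::Number)
    for (x, y⃗) in data
        N = fprop(x, N)
        ∇𝓝 = backprop(N, y⃗)
        N = N ⊕ ((-η) ⊗ ∇𝓝)   # 𝒩(e+1) = 𝒩(e) ⊕ (-η ⊗ ∇𝒩(e)), one sample at a time
    end
    return N
end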

Appendix of math functions

relu(x::Number) = max(0, x)
function relu(X::Vector)
    relu.(X)
end

function softmax(X::Vector)
    X = X .- maximum(X)
    exp.(X) ./ sum(exp.(X))
end

σ(x::Number) = 1 / (1 + exp(-x))
function σ(X::Vector)
    σ.(X)
end

function dσdx(x::Number)
    σ(x)*(1 - σ(x))
end

function logit(x)
    if x < 0 || x > 1
        error("Logit input out of bounds")
    end
    log(x / (1 - x))
end


function dσdx(X::Vector)
    broadcast(dσdx, X)
end

function dreludx(x::Number)
    ifelse(x > 0, 1, 0)
end

square(x) = x^2
# `target` is a 0-based class label; it is one-hot encoded before the squared errors are summed.
function cost(final_layer, target)::Float32
    target_vector = zeros(Float32, length(final_layer))
    target_vector[target+1] = 1
    sum(broadcast(square, final_layer .- target_vector))
end

# Hadamard product
function hadamard(A, B)
    C = broadcast(*, A, B)
    return C
end