General structures
Let $\mathcal{L}, \mathcal{N}$ denote the general structs for layer and network objects; in the code below they are written 𝓛 and 𝓝.
mutable struct 𝓛
    neurons::Vector      # activations of this layer
    W::Matrix            # incoming weight matrix
    biases::Vector
    f::Function          # activation function
    function 𝓛(neurons::Vector, W::Matrix, biases::Vector, f::Function)
        new(neurons, W, biases, f)
    end
end
mutable struct 𝓝
    net::Dict{Int, 𝓛}
    dims::Vector
    nlayers::Int
    nparams::Int
    function 𝓝(dims::Vector)
        structure = Dict{Int, 𝓛}()
        # Input layer: no incoming weights or biases, identity activation
        structure[1] = 𝓛(zeros(Float32, dims[1]), Array{Float32}(undef, 0, 0), Float32[], identity)
        for i in 2:length(dims)
            neurons = zeros(Float32, dims[i])
            weights = rand(Float32, dims[i], dims[i-1])
            biases = rand(Float32, dims[i])
            # σ (defined in the appendix) is the default activation,
            # matching the σ-based backpropagation below
            structure[i] = 𝓛(neurons, weights, biases, σ)
        end
        # Count the trainable parameters
        n = 0
        for i in 2:length(dims)
            n += dims[i - 1] * dims[i] + dims[i]
        end
        new(structure, dims, length(dims), n)
    end
end
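As a quick sanity check, one can instantiate a small network and inspect its bookkeeping fields. (The layer sizes below are arbitrary, and the math functions from the appendix must already be loaded since the constructor refers to σ.)
N = 𝓝([4, 3, 2])       # 4 inputs, a hidden layer of 3, 2 outputs
N.nlayers               # 3
N.nparams               # 4*3 + 3 + 3*2 + 2 = 23
size(N.net[2].W)        # (3, 4)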
It is useful to conceive of $\mathbb{L}, \mathbb{N}$, the sets of possible layers and networks, in group-theoretic terms. The usefulness of this conception will become clear later. For the moment, let us put the formalization into practice via the following definitions.
function ⊕(L₁::𝓛, L₂::𝓛)
    """Addition operator over the set 𝕃 of layer objects.
    Observe that (𝕃, ⊕) is a non-abelian group. In particular,
    the operation
        ℓ₁ ⊕ ℓ₂ = ℓ₃,   ℓᵢ ∈ 𝕃
    is neuron-preserving with respect to ℓ₁. That is,
    ℓ₃ has the same activations as ℓ₁."""
    L₃ = 𝓛(L₁.neurons, L₁.W + L₂.W, L₁.biases + L₂.biases, L₁.f)
    return L₃
end
function ⊙(λ::Number, L::𝓛)
    """Scalar-layer multiplication.
    ⊙ is the group action
        ⊙ : ℝ × 𝕃 → 𝕃
    """
    W = λ * L.W
    b = λ * L.biases
    return 𝓛(L.neurons, W, b, L.f)
end
function ⊙(λ::Number, N::𝓝)
    """Scalar-network multiplication ⊙.
    Might be thought of as a group action
        ⊙ : ℝ × ℕ → ℕ
    This operation is replacing: the freshly constructed network's
    layer dictionary is replaced wholesale by the scaled layers of N."""
    M = 𝓝(N.dims)
    ks = collect(1:N.nlayers)
    layers = [⊙(λ, N.net[i]) for i in 1:N.nlayers]
    M.net = Dict(ks .=> layers)
    return M
end
function ⊕(N₁::𝓝, N₂::𝓝)
    """Addition operator over the set ℕ of network objects.
    Observe that (ℕ, ⊕) is a non-abelian group. In particular,
        𝐧₁ ⊕ 𝐧₂ = 𝐧₃,   𝐧ᵢ ∈ ℕ
    is neuron-preserving with respect to 𝐧₁."""
    if N₁.dims != N₂.dims
        throw(DimensionMismatch("networks must share the same dimensions"))
    end
    N₃ = 𝓝(N₁.dims)
    N₃.net = mergewith(⊕, N₁.net, N₂.net)
    return N₃
end
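A brief illustration of this algebra (a sketch with arbitrary dimensions, assuming the definitions above are loaded): scaling a network scales its parameters while leaving its activations untouched, and adding two similar networks adds their parameters entrywise while keeping the first operand's activations.
N₁ = 𝓝([4, 3, 2])
N₂ = 𝓝([4, 3, 2])
M = ⊙(0.5, N₁)
M.net[2].W ≈ 0.5 * N₁.net[2].W           # true: parameters are scaled
M.net[2].neurons == N₁.net[2].neurons    # true: neuron-preserving
S = ⊕(N₁, N₂)
S.net[2].W ≈ N₁.net[2].W + N₂.net[2].W   # true: parameters add entrywise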
One can now materialize the training of a network via the following conception. For a given network $\mathcal{N}$, one defines the gradient of each weight $w_{ij}^{(l)}$ in the network as the weight $\nabla w_{ij}^{(l)}$ of a similar network $\nabla \mathcal{N}$.
Note: I say two networks are similar iff their dimensions are the same.
Then
$$ \mathcal{N}(e + 1) = \mathcal{N}(e) - \eta \, \nabla \mathcal{N}(e) $$
defines the updated network at epoch $e + 1$. The parameters of $\mathcal{N}(e + 1)$ are shifted in the direction of steepest descent, provided that $(+)$ and $(\cdot)$ are well-defined as network-network and scalar-network operators, which is precisely what we have done above.
This formalization of non-abelian groups for the network and layer objects, by virtue of which we conceptualized training, is not at all necessary. One can indeed update a network's parameters without formalizing, say, a network-network operator, and in fact this is what is generally done. However, I find this formal approach to have sharper outlines.
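To make this concrete, here is a minimal sketch of the update step written with the operators defined above; the helper name descend and the way the learning rate is passed are my own choices, not fixed by anything earlier.
# One gradient-descent step expressed through the network algebra:
# 𝒩(e+1) = 𝒩(e) ⊕ ((-η) ⊙ ∇𝒩(e))
function descend(N::𝓝, G::𝓝, η::Number)
    return ⊕(N, ⊙(-η, G))   # G is the gradient network ∇𝒩
end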
Forward and backward propagation
First, let us define a simple helper function:
using LinearAlgebra   # for mul! here and kron in backprop below

function compute_layer(W::Matrix, a::Vector, b::Vector)
    Y = zeros(Float32, length(b))
    mul!(Y, W, a)   # Y = W * a
    Y += b
    return Y
end
This function computes $\mathbf{W}\mathbf{a} + \mathbf{b}$, the linear combination (plus bias) that is then passed through an activation function.
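For instance, a toy check with made-up numbers:
W = Float32[1 2; 3 4]
a = Float32[1, 1]
b = Float32[0.5, -0.5]
compute_layer(W, a, b)   # Float32[3.5, 6.5]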
function fprop(x::Vector, N::𝓝)
    N.net[1].neurons = x
    L = N.nlayers
    for i in 1:(L-1)
        # z = W a + b for layer i+1, then apply that layer's activation
        z = compute_layer(N.net[i+1].W, N.net[i].neurons, N.net[i+1].biases)
        N.net[i+1].neurons = N.net[i+1].f(z)
    end
    return N
end
function backprop(N::𝓝, yₜ::Vector)
    # N is assumed to hold a forward pass; yₜ is the target output vector
    ∇𝓝 = 𝓝(N.dims)
    ∂C∂aᴸ = 2 * (N.net[N.nlayers].neurons - yₜ)
    for l in reverse(2:N.nlayers)
        ∂σ∂z = dσdx(logit.(N.net[l].neurons))   # recover z from a = σ(z)
        P = hadamard(∂C∂aᴸ, ∂σ∂z)
        ∇W = kron(P, transpose(N.net[l-1].neurons))
        ∇𝓝.net[l].W = ∇W
        ∇𝓝.net[l].biases = P
        # Change the dimension of the vector appropriately
        ∂C∂aᴸ = zeros(Float32, size(N.net[l].W, 2))
        mul!(∂C∂aᴸ, transpose(N.net[l].W), P)
    end
    return ∇𝓝
end
The logic justifying these algorithms should be clear to anyone familiar with the basics of neural network theory. The only difference from a traditional backpropagation algorithm is that we are updating parameters via network-network operations.
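Putting the pieces together, a single training iteration might look like the sketch below. This is my own wiring of the functions above: the onehot helper, the use of descend from the earlier sketch, and the 0-based label convention (borrowed from cost in the appendix) are illustrative assumptions.
# Hypothetical single-sample training step built from the definitions above.
function onehot(target::Int, n::Int)
    v = zeros(Float32, n)
    v[target + 1] = 1   # same 0-based label convention as `cost`
    return v
end

function train_step(N::𝓝, x::Vector, target::Int, η::Number)
    N = fprop(x, N)                    # forward pass
    yₜ = onehot(target, N.dims[end])   # target vector
    ∇𝓝 = backprop(N, yₜ)               # gradient network
    return descend(N, ∇𝓝, η)           # 𝒩(e+1) = 𝒩(e) - η ∇𝒩(e)
end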
Appendix of math functions
relu(x::Number) = max(0, x)
function relu(X::Vector)
    relu.(X)
end
function softmax(X::Vector)
    X = X .- maximum(X)   # subtract the max for numerical stability
    exp.(X) ./ sum(exp.(X))
end
σ(x::Number) = 1 / (1 + exp(-x))
function σ(X::Vector)
    σ.(X)
end
function dσdx(x::Number)
    σ(x) * (1 - σ(x))
end
function logit(x)
    if x < 0 || x > 1
        error("Logit input out of bounds")
    end
    log(x / (1 - x))
end
function dσdx(X::Vector)
    broadcast(dσdx, X)
end
function dreludx(x::Number)
    ifelse(x > 0, 1, 0)
end
square(x) = x^2
function cost(final_layer, target)::Float32
    # `target` is a 0-based class label; build its one-hot vector
    target_vector = zeros(Float32, length(final_layer))
    target_vector[target + 1] = 1
    sum(broadcast(square, final_layer .- target_vector))
end
# Hadamard product
function hadamard(A, B)
    C = broadcast(*, A, B)
    return C
end