A library for parallelized reverse mode automatic differentiation in C# for custom neural network development.
$ dotnet add package ParallelReverseAutoDiff
Parallel Reverse Mode Automatic Differentiation in C#
ParallelAutoDiff is a thread-safe C# library for reverse mode automatic differentiation, optimized for parallel computation. It leverages semaphores and locks to coordinate between threads, ensuring accuracy during gradient accumulation. Each operation in the library is implemented as a node with a forward and a backward function, facilitating efficient calculation of derivatives. A unique aspect of this library is its use of the visitor pattern: it includes a specialized 'Neural Network Visitor' which traverses neural network nodes across different threads. This visitor is responsible for gradient accumulation on nodes shared across multiple threads. This design allows for parallelized computations while maintaining consistency and avoiding race conditions. The result is an efficient, scalable automatic differentiation solution, ideal for machine learning applications and neural network training.
Download and install the CUDA Toolkit 12.0 if you want to use the CudaMatrixMultiplyOperation.
AmplifiedSigmoidOperation - Used for gradient amplification.
ApplyDropoutOperation
CudaMatrixMultiplyOperation - Leverages the GPU for fast computation.
HadamardProductOperation
LayerNormalizationOperation
LeakyReLUOperation
MatrixAddOperation
MatrixAddThreeOperation
MatrixMultiplyOperation
MatrixMultiplyScalarOperation
MatrixTransposeOperation
ReLUOperation
ScaleAndShiftOperation
SigmoidOperation
SoftmaxOperation
StretchedSigmoidOperation
TanhOperation
Here is an example of a neural network architecture defined in a JSON file:
{
"timeSteps": [
{
"startOperations": [
{
"id": "projectedInput",
"description": "Multiply the input with the weight matrix",
"type": "MatrixMultiplyOperation",
"inputs": [ "We", "inputSequence[t]" ],
"gradientResultTo": [ "dWe", null ]
},
{
"id": "embeddedInput",
"description": "Add the bias",
"type": "MatrixAddOperation",
"inputs": [ "projectedInput", "be" ],
"gradientResultTo": [ null, "dbe" ]
}
],
"layers": [
{
"operations": [
{
"id": "wf_currentInput",
"type": "MatrixMultiplyOperation",
"inputs": [ "Wf[layerIndex]", "currentInput" ],
"gradientResultTo": [ "dWf[layerIndex]", null ]
},
{
"id": "uf_previousHiddenState",
"type": "MatrixMultiplyOperation",
"inputs": [ "Uf[layerIndex]", "previousHiddenState" ],
"gradientResultTo": [ "dUf[layerIndex]", null ]
},
{
"id": "f_add",
"type": "MatrixAddThreeOperation",
"inputs": [ "wf_currentInput", "uf_previousHiddenState", "bf[layerIndex]" ],
"gradientResultTo": [ null, null, "dbf[layerIndex]" ]
},
{
"id": "intermediate_f_1",
"description": "Compute the forget gate",
"type": "MatrixTransposeOperation",
"inputs": [ "f_add" ]
},
{
"id": "intermediate_f_2",
"description": "Compute the forget gate",
"type": "LayerNormalizationOperation",
"inputs": [ "intermediate_f_1" ]
},
{
"id": "intermediate_f_3",
"description": "Compute the forget gate",
"type": "MatrixTransposeOperation",
"inputs": [ "intermediate_f_2" ]
},
{
"id": "f",
"description": "Compute the forget gate",
"type": "AmplifiedSigmoidOperation",
"inputs": [ "intermediate_f_3" ],
"setResultTo": "f[t][layerIndex]"
},
{
"id": "wi_currentInput",
"type": "MatrixMultiplyOperation",
"inputs": [ "Wi[layerIndex]", "currentInput" ],
"gradientResultTo": [ "dWi[layerIndex]", null ]
},
{
"id": "ui_previousHiddenState",
"type": "MatrixMultiplyOperation",
"inputs": [ "Ui[layerIndex]", "previousHiddenState" ],
"gradientResultTo": [ "dUi[layerIndex]", null ]
},
{
"id": "i_add",
"type": "MatrixAddThreeOperation",
"inputs": [ "wi_currentInput", "ui_previousHiddenState", "bi[layerIndex]" ],
"gradientResultTo": [ null, null, "dbi[layerIndex]" ]
},
{
"id": "intermediate_i_1",
"description": "Compute the input gate",
"type": "MatrixTransposeOperation",
"inputs": [ "i_add" ]
},
{
"id": "intermediate_i_2",
"description": "Compute the input gate",
"type": "LayerNormalizationOperation",
"inputs": [ "intermediate_i_1" ]
},
{
"id": "intermediate_i_3",
"description": "Compute the input gate",
"type": "MatrixTransposeOperation",
"inputs": [ "intermediate_i_2" ]
},
{
"id": "i",
"description": "Compute the input gate",
"type": "AmplifiedSigmoidOperation",
"inputs": [ "intermediate_i_3" ],
"setResultTo": "i[t][layerIndex]"
},
{
"id": "wc_currentInput",
"type": "MatrixMultiplyOperation",
"inputs": [ "Wc[layerIndex]", "currentInput" ],
"gradientResultTo": [ "dWc[layerIndex]", null ]
},
{
"id": "uc_previousHiddenState",
"type": "MatrixMultiplyOperation",
"inputs": [ "Uc[layerIndex]", "previousHiddenState" ],
"gradientResultTo": [ "dUc[layerIndex]", null ]
},
{
"id": "cHat_add",
"type": "MatrixAddThreeOperation",
"inputs": [ "wc_currentInput", "uc_previousHiddenState", "bc[layerIndex]" ],
"gradientResultTo": [ null, null, "dbc[layerIndex]" ]
},
{
"id": "intermediate_cHat_1",
"description": "Compute the candidate memory cell state",
"type": "MatrixTransposeOperation",
"inputs": [ "cHat_add" ]
},
{
"id": "intermediate_cHat_2",
"description": "Compute the candidate memory cell state",
"type": "LayerNormalizationOperation",
"inputs": [ "intermediate_cHat_1" ]
},
{
"id": "intermediate_cHat_3",
"description": "Compute the candidate memory cell state",
"type": "MatrixTransposeOperation",
"inputs": [ "intermediate_cHat_2" ]
},
{
"id": "cHat",
"description": "Compute the candidate memory cell state",
"type": "TanhOperation",
"inputs": [ "intermediate_cHat_3" ],
"setResultTo": "cHat[t][layerIndex]"
},
{
"id": "f_previousMemoryCellState",
"type": "HadamardProductOperation",
"inputs": [ "f[t][layerIndex]", "previousMemoryCellState" ]
},
{
"id": "i_cHat",
"type": "HadamardProductOperation",
"inputs": [ "i[t][layerIndex]", "cHat[t][layerIndex]" ]
},
{
"id": "newC",
"description": "Compute the memory cell state",
"type": "MatrixAddOperation",
"inputs": [ "f_previousMemoryCellState", "i_cHat" ]
},
{
"id": "newCTransposed",
"type": "MatrixTransposeOperation",
"inputs": [ "newC" ]
},
{
"id": "newCNormalized",
"type": "LayerNormalizationOperation",
"inputs": [ "newCTransposed" ]
},
{
"id": "c",
"type": "MatrixTransposeOperation",
"inputs": [ "newCNormalized" ],
"setResultTo": "c[t][layerIndex]"
},
{
"id": "wo_currentInput",
"type": "MatrixMultiplyOperation",
"inputs": [ "Wo[layerIndex]", "currentInput" ],
"gradientResultTo": [ "dWo[layerIndex]", null ]
},
{
"id": "uo_previousHiddenState",
"type": "MatrixMultiplyOperation",
"inputs": [ "Uo[layerIndex]", "previousHiddenState" ],
"gradientResultTo": [ "dUo[layerIndex]", null ]
},
{
"id": "o_add",
"type": "MatrixAddThreeOperation",
"inputs": [ "wo_currentInput", "uo_previousHiddenState", "bo[layerIndex]" ],
"gradientResultTo": [ null, null, "dbo[layerIndex]" ]
},
{
"id": "o",
"description": "Compute the output gate",
"type": "LeakyReLUOperation",
"inputs": [ "o_add" ],
"setResultTo": "o[t][layerIndex]"
},
{
"id": "c_tanh",
"type": "TanhOperation",
"inputs": [ "c" ]
},
{
"id": "newH",
"type": "HadamardProductOperation",
"inputs": [ "o[t][layerIndex]", "c_tanh" ]
},
{
"id": "keys",
"type": "MatrixMultiplyOperation",
"inputs": [ "Wk[layerIndex]", "embeddedInput" ],
"gradientResultTo": [ "dWk[layerIndex]", null ]
},
{
"id": "queries",
"type": "MatrixMultiplyOperation",
"inputs": [ "Wq[layerIndex]", "previousHiddenState" ],
"gradientResultTo": [ "dWq[layerIndex]", null ]
},
{
"id": "values",
"type": "MatrixMultiplyOperation",
"inputs": [ "Wv[layerIndex]", "embeddedInput" ],
"gradientResultTo": [ "dWv[layerIndex]", null ]
},
{
"id": "queriesTranspose",
"type": "MatrixTransposeOperation",
"inputs": [ "queries" ]
},
{
"id": "dotProduct",
"description": "Compute the dot product of the queries and keys",
"type": "MatrixMultiplyOperation",
"inputs": [ "keys", "queriesTranspose" ]
},
{
"id": "scaledDotProduct",
"description": "Scale the dot product",
"type": "MatrixMultiplyScalarOperation",
"inputs": [ "dotProduct", "scaledDotProductScalar" ]
},
{
"id": "scaledDotProductTranspose",
"type": "MatrixTransposeOperation",
"inputs": [ "scaledDotProduct" ]
},
{
"id": "attentionWeights",
"type": "SoftmaxOperation",
"inputs": [ "scaledDotProductTranspose" ]
},
{
"id": "attentionOutput",
"type": "MatrixMultiplyOperation",
"inputs": [ "attentionWeights", "values" ]
},
{
"id": "newHWithAttentionOutput",
"type": "MatrixAddOperation",
"inputs": [ "newH", "attentionOutput" ]
},
{
"id": "newHWithAttentionOutputTranspose",
"type": "MatrixTransposeOperation",
"inputs": [ "newHWithAttentionOutput" ]
},
{
"id": "normalizedNewH",
"type": "LayerNormalizationOperation",
"inputs": [ "newHWithAttentionOutputTranspose" ]
},
{
"id": "h",
"type": "MatrixTransposeOperation",
"inputs": [ "normalizedNewH" ],
"setResultTo": "h[t][layerIndex]"
}
]
}
],
"endOperations": [
{
"id": "v_h",
"type": "MatrixMultiplyOperation",
"inputs": [ "V", "hFromCurrentTimeStepAndLastLayer" ],
"gradientResultTo": [ "dV", null ]
},
{
"id": "v_h_b",
"type": "MatrixAddOperation",
"inputs": [ "v_h", "b" ],
"gradientResultTo": [ null, "db" ]
},
{
"id": "output_t",
"type": "AmplifiedSigmoidOperation",
"inputs": [ "v_h_b" ],
"setResultTo": "output[t]"
}
]
}
]
}
Use a JSON serialization library like Newtonsoft.Json to deserialize the JSON file to a JsonArchitecture object.
this.computationGraph = new SelfAttentionMultiLayerLSTMComputationGraph(this);
var zeroMatrixHiddenSize = new Matrix(this.hiddenSize, 1);
this.computationGraph
.AddIntermediate("inputSequence", x => this.Parameters.InputSequence[x.TimeStep])
.AddIntermediate("output", x => this.output[x.TimeStep])
.AddIntermediate("c", x => this.c[x.TimeStep][x.Layer])
.AddIntermediate("h", x => this.h[x.TimeStep][x.Layer])
.AddScalar("scaledDotProductScalar", x => 1.0d / Math.Sqrt(this.hiddenSize))
.AddWeight("Wf", x => this.Wf[x.Layer]).AddGradient("dWf", x => this.dWf[x.Layer])
.AddWeight("Wi", x => this.Wi[x.Layer]).AddGradient("dWi", x => this.dWi[x.Layer])
.AddWeight("Wc", x => this.Wc[x.Layer]).AddGradient("dWc", x => this.dWc[x.Layer])
.AddWeight("Wo", x => this.Wo[x.Layer]).AddGradient("dWo", x => this.dWo[x.Layer])
.AddWeight("Uf", x => this.Uf[x.Layer]).AddGradient("dUf", x => this.dUf[x.Layer])
.AddWeight("Ui", x => this.Ui[x.Layer]).AddGradient("dUi", x => this.dUi[x.Layer])
.AddWeight("Uc", x => this.Uc[x.Layer]).AddGradient("dUc", x => this.dUc[x.Layer])
.AddWeight("Uo", x => this.Uo[x.Layer]).AddGradient("dUo", x => this.dUo[x.Layer])
.AddWeight("bf", x => this.bf[x.Layer]).AddGradient("dbf", x => this.dbf[x.Layer])
.AddWeight("bi", x => this.bi[x.Layer]).AddGradient("dbi", x => this.dbi[x.Layer])
.AddWeight("bc", x => this.bc[x.Layer]).AddGradient("dbc", x => this.dbc[x.Layer])
.AddWeight("bo", x => this.bo[x.Layer]).AddGradient("dbo", x => this.dbo[x.Layer])
.AddWeight("Wq", x => this.Wq[x.Layer]).AddGradient("dWq", x => this.dWq[x.Layer])
.AddWeight("Wk", x => this.Wk[x.Layer]).AddGradient("dWk", x => this.dWk[x.Layer])
.AddWeight("Wv", x => this.Wv[x.Layer]).AddGradient("dWv", x => this.dWv[x.Layer])
.AddWeight("We", x => this.We).AddGradient("dWe", x => this.dWe)
.AddWeight("be", x => this.be).AddGradient("dbe", x => this.dbe)
.AddWeight("V", x => this.V).AddGradient("dV", x => this.dV)
.AddWeight("b", x => this.b).AddGradient("db", x => this.db)
.AddOperationFinder("i", x => this.computationGraph[$"i_{x.TimeStep}_{x.Layer}"])
.AddOperationFinder("f", x => this.computationGraph[$"f_{x.TimeStep}_{x.Layer}"])
.AddOperationFinder("cHat", x => this.computationGraph[$"cHat_{x.TimeStep}_{x.Layer}"])
.AddOperationFinder("o", x => this.computationGraph[$"o_{x.TimeStep}_{x.Layer}"])
.AddOperationFinder("embeddedInput", x => this.computationGraph[$"embeddedInput_{x.TimeStep}_0"])
.AddOperationFinder("hFromCurrentTimeStepAndLastLayer", x => this.computationGraph[$"h_{x.TimeStep}_{this.numLayers - 1}"])
.AddOperationFinder("currentInput", x => x.Layer == 0 ? this.computationGraph[$"embeddedInput_{x.TimeStep}_0"] : this.computationGraph[$"h_{x.TimeStep}_{x.Layer - 1}"])
.AddOperationFinder("previousHiddenState", x => x.TimeStep == 0 ? zeroMatrixHiddenSize : this.computationGraph[$"h_{x.TimeStep - 1}_{x.Layer}"])
.AddOperationFinder("previousMemoryCellState", x => x.TimeStep == 0 ? zeroMatrixHiddenSize : this.computationGraph[$"c_{x.TimeStep - 1}_{x.Layer}"])
.ConstructFromArchitecture(jsonArchitecture, this.numTimeSteps, this.numLayers);
Then populate the backward dependency counts by running the following code. It only has to be run once to set up the backward dependency counts.
for (int t = numTimeSteps - 1; t >= 0; t--) // if there are multiple timesteps
{
backwardStartOperation = operationsMap[$"output_t_{t}"]; // the backward start operation
OperationGraphVisitor opVisitor = new OperationGraphVisitor(Guid.NewGuid().ToString(), backwardStartOperation, t);
await opVisitor.TraverseAsync(); // sets the backward dependency counts
await opVisitor.ResetVisitedCountsAsync(backwardStartOperation);
}
var op = this.computationGraph.StartOperation ?? throw new Exception("Start operation should not be null.");
IOperation? currOp = null;
do
{
var parameters = this.LookupParameters(op);
var forwardMethod = op.OperationType.GetMethod("Forward") ?? throw new Exception($"Forward method should exist on operation of type {op.OperationType.Name}.");
forwardMethod.Invoke(op, parameters);
if (op.ResultToName != null)
{
var split = op.ResultToName.Split(new[] { '[', ']' }, StringSplitOptions.RemoveEmptyEntries);
var oo = this.computationGraph[MatrixType.Intermediate, split[0], op.LayerInfo];
op.CopyResult(oo);
}
currOp = op;
if (op.HasNext)
{
op = op.Next;
}
}
while (currOp.Next != null);
Create a loss function, such as mean squared error, or use policy gradient methods.
Then calculate the gradient of the loss with respect to the output.
Plug the result in as the backward input for the backward start operation.
IOperation? backwardStartOperation = null;
for (int t = this.Parameters.NumTimeSteps - 1; t >= 0; t--)
{
backwardStartOperation = this.computationGraph[$"output_t_{t}_0"];
if (gradientOfLossWrtOutput[t][0] != 0.0d)
{
var backwardInput = new Matrix(1, 1);
backwardInput[0] = gradientOfLossWrtOutput[t];
backwardStartOperation.BackwardInput = backwardInput;
OperationNeuralNetworkVisitor opVisitor = new OperationNeuralNetworkVisitor(Guid.NewGuid().ToString(), backwardStartOperation, t);
await opVisitor.TraverseAsync();
opVisitor.Reset();
traverseCount++;
}
}
Cudablas.Instance.Initialize(); // initialize the CUDA library
Cudablas.Instance.SetDevice(0); // set the device to use, defaults to 0
// ... <Run CUDA operations> ...
Cudablas.Instance.Dispose(); // dispose the CUDA library