Production-ready NVIDIA CUDA GPU backend for DotCompute. Provides GPU acceleration (21-92x speedup) through CUDA with NVRTC compilation, P2P transfers, Ring Kernels with NCCL support, and unified memory. Requires CUDA 12.0+ and Compute Capability 5.0+ NVIDIA GPU. Benchmarked on RTX 2000 Ada (CC 8.9).
$ dotnet add package DotCompute.Backends.CUDA
Production-ready NVIDIA GPU compute backend for .NET 9+ with full CUDA support.
The CUDA backend provides GPU acceleration through:
dotnet add package DotCompute.Backends.CUDA --version 0.5.3
using DotCompute.Backends.CUDA;
using Microsoft.Extensions.Logging;
var logger = LoggerFactory.Create(builder => builder.AddConsole())
.CreateLogger<CudaAccelerator>();
var accelerator = new CudaAccelerator(logger);
// Check availability before initialization
if (await accelerator.IsAvailableAsync())
{
await accelerator.InitializeAsync();
// GPU is ready for compute operations
}
services.AddSingleton<IAccelerator, CudaAccelerator>();
// OR
services.AddCudaBackend(); // Includes automatic GPU detection
var kernelDef = new KernelDefinition
{
Name = "VectorAdd",
Source = @"
extern ""C"" __global__
void vector_add(float* a, float* b, float* result, int n)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
result[idx] = a[idx] + b[idx];
}
}
",
EntryPoint = "vector_add"
};
var compiledKernel = await accelerator.CompileKernelAsync(kernelDef);
// Execute with launch parameters
var launchParams = new KernelLaunchParameters
{
GridDim = new Dim3((uint)((length + 255) / 256), 1, 1),
BlockDim = new Dim3(256, 1, 1),
SharedMemorySize = 0
};
await compiledKernel.ExecuteAsync(parameters, launchParams);
// Allocate device memory
var deviceBuffer = await accelerator.AllocateAsync<float>(1_000_000);
// Copy data to GPU
await deviceBuffer.CopyFromAsync(hostData);
// Use in kernel execution
var parameters = new {
input = deviceBuffer,
output = outputBuffer,
length = 1_000_000
};
await compiledKernel.ExecuteAsync(parameters);
// Copy results back
await deviceBuffer.CopyToAsync(hostResults);
The CUDA backend automatically handles:
Tested on RTX 2000 Ada Generation (8GB VRAM, CC 8.9):
| Operation | Data Size | CPU Time | GPU Time | Speedup |
|---|---|---|---|---|
| Vector Add | 10M floats | 45ms | 2.1ms | 21x |
| Matrix Mult | 2048x2048 | 8.2s | 89ms | 92x |
| FFT | 1M complex | 156ms | 8.4ms | 18x |
| Reduction | 10M floats | 38ms | 1.8ms | 21x |
# CUDA installation path (auto-detected)
export CUDA_PATH="/usr/local/cuda"
# Enable additional logging
export DOTCOMPUTE_CUDA_VERBOSE=1
# Force specific GPU device
export CUDA_VISIBLE_DEVICES=0
var options = new CompilationOptions
{
OptimizationLevel = OptimizationLevel.O3, // Maximum optimization
EnableDebugInfo = false, // Disable for production
TargetArchitecture = "compute_75", // Target specific CC
CustomOptions = new[] { "--use_fast_math" }
};
var kernel = await accelerator.CompileKernelAsync(definition, options);
// Enable detailed CUDA logging
var logger = LoggerFactory.Create(builder =>
builder.AddConsole().SetMinimumLevel(LogLevel.Trace));
// Get detailed GPU information
var info = await accelerator.GetDeviceInfoAsync();
Console.WriteLine($"GPU: {info.Name}, Memory: {info.TotalMemory / (1024*1024*1024)}GB");
// Profile kernel execution
var stopwatch = Stopwatch.StartNew();
await kernel.ExecuteAsync(parameters);
await accelerator.SynchronizeAsync();
stopwatch.Stop();
Console.WriteLine($"Kernel time: {stopwatch.ElapsedMilliseconds}ms");
// Enumerate all available GPUs
var devices = await CudaAccelerator.GetAvailableDevicesAsync();
// Create accelerators for each GPU
var accelerators = new List<CudaAccelerator>();
foreach (var device in devices)
{
var acc = new CudaAccelerator(device, logger);
await acc.InitializeAsync();
accelerators.Add(acc);
}
// Enable P2P access between GPUs
await CudaAccelerator.EnablePeerAccessAsync(accelerators[0], accelerators[1]);
// Allocate unified memory (accessible from CPU and GPU)
var unifiedBuffer = await accelerator.AllocateUnifiedAsync<float>(size);
// CPU can access directly
for (int i = 0; i < size; i++)
unifiedBuffer[i] = i * 2.0f;
// GPU kernels can access the same memory
await kernel.ExecuteAsync(new { data = unifiedBuffer, size });
// Results automatically available on CPU
Console.WriteLine($"Result: {unifiedBuffer[0]}");
Comprehensive documentation is available for DotCompute:
The CUDA backend welcomes contributions in:
See CONTRIBUTING.md for development guidelines.