Production-ready NVIDIA CUDA GPU backend for DotCompute. Provides GPU acceleration (21-92x speedup) through CUDA with NVRTC compilation, P2P transfers, Ring Kernels with NCCL support, and unified memory. Requires CUDA 12.0+ and Compute Capability 5.0+ NVIDIA GPU. Benchmarked on RTX 2000 Ada (CC 8.9).
$ dotnet add package DotCompute.Backends.CUDA
Production-ready NVIDIA GPU compute backend for .NET 9+ with full CUDA support.
The CUDA backend provides GPU acceleration through:
dotnet add package DotCompute.Backends.CUDA --version 0.5.3
using DotCompute.Backends.CUDA;
using Microsoft.Extensions.Logging;
var logger = LoggerFactory.Create(builder => builder.AddConsole())
.CreateLogger<CudaAccelerator>();
var accelerator = new CudaAccelerator(logger);
// Check availability before initialization
if (await accelerator.IsAvailableAsync())
{
await accelerator.InitializeAsync();
// GPU is ready for compute operations
}
services.AddSingleton<IAccelerator, CudaAccelerator>();
// OR
services.AddCudaBackend(); // Includes automatic GPU detection
var kernelDef = new KernelDefinition
{
Name = "VectorAdd",
Source = @"
extern ""C"" __global__
void vector_add(float* a, float* b, float* result, int n)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
result[idx] = a[idx] + b[idx];
}
}
",
EntryPoint = "vector_add"
};
var compiledKernel = await accelerator.CompileKernelAsync(kernelDef);
// Execute with launch parameters
var launchParams = new KernelLaunchParameters
{
GridDim = new Dim3((uint)((length + 255) / 256), 1, 1),
BlockDim = new Dim3(256, 1, 1),
SharedMemorySize = 0
};
await compiledKernel.ExecuteAsync(parameters, launchParams);
// Allocate device memory
var deviceBuffer = await accelerator.AllocateAsync<float>(1_000_000);
// Copy data to GPU
await deviceBuffer.CopyFromAsync(hostData);
// Use in kernel execution
var parameters = new {
input = deviceBuffer,
output = outputBuffer,
length = 1_000_000
};
await compiledKernel.ExecuteAsync(parameters);
// Copy results back
await deviceBuffer.CopyToAsync(hostResults);
The CUDA backend automatically handles:
Tested on RTX 2000 Ada Generation (8GB VRAM, CC 8.9):
| Operation | Data Size | CPU Time | GPU Time | Speedup |
|---|---|---|---|---|
| Vector Add | 10M floats | 45ms | 2.1ms | 21x |
| Matrix Mult | 2048x2048 | 8.2s | 89ms | 92x |
| FFT | 1M complex | 156ms | 8.4ms | 18x |
| Reduction | 10M floats | 38ms | 1.8ms | 21x |
# CUDA installation path (auto-detected)
export CUDA_PATH="/usr/local/cuda"
# Enable additional logging
export DOTCOMPUTE_CUDA_VERBOSE=1
# Force specific GPU device
export CUDA_VISIBLE_DEVICES=0
var options = new CompilationOptions
{
OptimizationLevel = OptimizationLevel.O3, // Maximum optimization
EnableDebugInfo = false, // Disable for production
TargetArchitecture = "compute_75", // Target specific CC
CustomOptions = new[] { "--use_fast_math" }
};
var kernel = await accelerator.CompileKernelAsync(definition, options);
// Enable detailed CUDA logging
var logger = LoggerFactory.Create(builder =>
builder.AddConsole().SetMinimumLevel(LogLevel.Trace));
// Get detailed GPU information
var info = await accelerator.GetDeviceInfoAsync();
Console.WriteLine($"GPU: {info.Name}, Memory: {info.TotalMemory / (1024*1024*1024)}GB");
// Profile kernel execution
var stopwatch = Stopwatch.StartNew();
await kernel.ExecuteAsync(parameters);
await accelerator.SynchronizeAsync();
stopwatch.Stop();
Console.WriteLine($"Kernel time: {stopwatch.ElapsedMilliseconds}ms");
// Enumerate all available GPUs
var devices = await CudaAccelerator.GetAvailableDevicesAsync();
// Create accelerators for each GPU
var accelerators = new List<CudaAccelerator>();
foreach (var device in devices)
{
var acc = new CudaAccelerator(device, logger);
await acc.InitializeAsync();
accelerators.Add(acc);
}
// Enable P2P access between GPUs
await CudaAccelerator.EnablePeerAccessAsync(accelerators[0], accelerators[1]);
// Allocate unified memory (accessible from CPU and GPU)
var unifiedBuffer = await accelerator.AllocateUnifiedAsync<float>(size);
// CPU can access directly
for (int i = 0; i < size; i++)
unifiedBuffer[i] = i * 2.0f;
// GPU kernels can access the same memory
await kernel.ExecuteAsync(new { data = unifiedBuffer, size });
// Results automatically available on CPU
Console.WriteLine($"Result: {unifiedBuffer[0]}");
Comprehensive documentation is available for DotCompute:
The CUDA backend welcomes contributions in:
See CONTRIBUTING.md for development guidelines.