Production-ready OpenCL backend for DotCompute. Cross-platform GPU acceleration for NVIDIA, AMD, Intel, ARM Mali, and Qualcomm Adreno GPUs. Supports OpenCL 1.2+, Ring Kernels with atomic message queues, runtime kernel compilation, and multi-device workload distribution. Works with nvidia-opencl-icd, ROCm, intel-opencl-icd, and vendor drivers.
$ dotnet add package DotCompute.Backends.OpenCL

Cross-platform OpenCL compute backend for .NET 9+ with GPU and accelerator support.
EXPERIMENTAL: This backend is functional for cross-platform GPU acceleration but has not been extensively production-tested across all vendor implementations. It works well for development and testing across NVIDIA, AMD, and Intel GPUs. Production use recommended only after validation on your target hardware.
The OpenCL backend provides cross-platform GPU acceleration:
Main accelerator implementation providing:
Manages OpenCL devices:
Device information structure:
Platform information:
OpenCL context wrapper:
Unified memory manager for OpenCL:
Buffer implementation:
Compiled kernel representation:
Persistent kernel runtime for OpenCL:
Ring kernel compilation for OpenCL:
Factory for creating OpenCL accelerators:
P/Invoke bindings to OpenCL C API:
Native type definitions:
Exception type for OpenCL errors:
dotnet add package DotCompute.Backends.OpenCL --version 0.5.3
using DotCompute.Backends.OpenCL;
using Microsoft.Extensions.Logging;
using var loggerFactory = LoggerFactory.Create(builder => builder.AddConsole());
var logger = loggerFactory.CreateLogger<OpenCLAccelerator>();
// Create accelerator
var accelerator = new OpenCLAccelerator(logger, loggerFactory);
// Initialize with default device (first GPU or CPU)
await accelerator.InitializeAsync();
Console.WriteLine($"Using: {accelerator.Name}");
Console.WriteLine($"Global Memory: {accelerator.Info.TotalMemory / (1024*1024)} MB");
using Microsoft.Extensions.DependencyInjection;
var services = new ServiceCollection();
// Register OpenCL backend
services.AddSingleton<IAccelerator, OpenCLAccelerator>();
// OR use plugin registration
services.AddDotComputeBackend("DotCompute.Backends.OpenCL");
using DotCompute.Backends.OpenCL.DeviceManagement;
var deviceManager = new OpenCLDeviceManager(logger);
// Enumerate all devices
var devices = await deviceManager.EnumerateDevicesAsync();
foreach (var device in devices)
{
Console.WriteLine($"Device: {device.Name}");
Console.WriteLine($" Type: {device.DeviceType}");
Console.WriteLine($" Compute Units: {device.MaxComputeUnits}");
Console.WriteLine($" Global Memory: {device.GlobalMemorySize / (1024*1024)} MB");
Console.WriteLine($" Local Memory: {device.LocalMemorySize / 1024} KB");
}
// Select specific device
var selectedDevice = devices.FirstOrDefault(d => d.DeviceType == DeviceType.GPU);
if (selectedDevice != null)
{
await accelerator.InitializeAsync(selectedDevice);
}
using DotCompute.Abstractions.Kernels;
// Define OpenCL kernel
var kernelDef = new KernelDefinition
{
Name = "VectorAdd",
Source = @"
__kernel void vector_add(
__global const float* a,
__global const float* b,
__global float* result,
const int length)
{
int gid = get_global_id(0);
if (gid < length) {
result[gid] = a[gid] + b[gid];
}
}
",
EntryPoint = "vector_add"
};
// Compile kernel
var compiledKernel = await accelerator.CompileKernelAsync(kernelDef);
// Allocate device memory
var length = 1_000_000;
var bufferA = await accelerator.Memory.AllocateAsync<float>(length);
var bufferB = await accelerator.Memory.AllocateAsync<float>(length);
var bufferResult = await accelerator.Memory.AllocateAsync<float>(length);
// Copy data to device
var dataA = Enumerable.Range(0, length).Select(i => (float)i).ToArray();
var dataB = Enumerable.Range(0, length).Select(i => (float)(i * 2)).ToArray();
await bufferA.CopyFromAsync(dataA);
await bufferB.CopyFromAsync(dataB);
// Set kernel arguments and execute
var launchParams = new KernelLaunchParameters
{
    // OpenCL 1.x requires the global work size to be a multiple of the local
    // work size; round up and rely on the kernel's `if (gid < length)` guard.
    GlobalWorkSize = new[] { (uint)((length + 255) / 256 * 256) },
    LocalWorkSize = new[] { 256u }
};
await compiledKernel.ExecuteAsync(new object[]
{
bufferA,
bufferB,
bufferResult,
length
}, launchParams);
// Read results back
var results = new float[length];
await bufferResult.CopyToAsync(results);
// Cleanup
await bufferA.DisposeAsync();
await bufferB.DisposeAsync();
await bufferResult.DisposeAsync();
// Allocate buffer
var buffer = await accelerator.Memory.AllocateAsync<float>(10_000);
// Write to device
var hostData = new float[10_000];
await buffer.CopyFromAsync(hostData);
// Read from device
var resultData = new float[10_000];
await buffer.CopyToAsync(resultData);
// Map memory for zero-copy access (if supported)
if (accelerator.DeviceInfo?.SupportsHostMemoryMapping == true)
{
var mappedPtr = await buffer.MapAsync(MapMode.ReadWrite);
// Access memory directly...
await buffer.UnmapAsync(mappedPtr);
}
using DotCompute.Backends.OpenCL.Factory;
var factory = new OpenCLAcceleratorFactory(configuration, logger);
// Create accelerator with performance profile
var accelerator = await factory.CreateAsync(new WorkloadProfile
{
WorkloadType = WorkloadType.Compute,
DataSize = DataSize.Large,
MemoryIntensive = true
});
OpenCLAccelerator (IAccelerator)
├── OpenCLContext (Context management)
├── OpenCLDeviceManager (Device discovery)
├── OpenCLMemoryManager (Memory operations)
└── OpenCLCompiledKernel (Kernel execution)
Native Layer:
├── OpenCLRuntime (P/Invoke bindings)
├── OpenCLTypes (Native type definitions)
└── OpenCLException (Error handling)
# Ubuntu/Debian
sudo apt-get install ocl-icd-opencl-dev nvidia-opencl-icd
# Fedora/RHEL
sudo dnf install ocl-icd-devel pocl
# Verify installation
clinfo
OpenCL is deprecated on macOS. Use Metal backend for macOS devices.
# Enable OpenCL debugging
export DOTCOMPUTE_OPENCL_DEBUG=1
# Select specific platform
export DOTCOMPUTE_OPENCL_PLATFORM=0
# Select specific device
export DOTCOMPUTE_OPENCL_DEVICE=0
# Force CPU device (for debugging)
export DOTCOMPUTE_OPENCL_FORCE_CPU=1
var options = new OpenCLOptions
{
PreferredDeviceType = DeviceType.GPU,
EnableProfiling = true,
EnableOutOfOrderExecution = false,
BuildOptions = "-cl-fast-relaxed-math -cl-mad-enable",
CacheKernels = true
};
Run clinfo to list OpenCL platforms and devices, and make sure your user belongs to the video group.

// Enable detailed logging
var logger = LoggerFactory.Create(builder =>
builder.AddConsole().SetMinimumLevel(LogLevel.Trace));
// Get device capabilities
var info = accelerator.DeviceInfo;
Console.WriteLine($"Max Work Group Size: {info.MaxWorkGroupSize}");
Console.WriteLine($"Max Compute Units: {info.MaxComputeUnits}");
Console.WriteLine($"Extensions: {string.Join(", ", info.Extensions)}");
// Profile kernel execution
var sw = Stopwatch.StartNew();
await kernel.ExecuteAsync(args, launchParams);
await accelerator.SynchronizeAsync();
sw.Stop();
Console.WriteLine($"Kernel time: {sw.ElapsedMilliseconds}ms");
// Create accelerator for each device
var accelerators = new List<OpenCLAccelerator>();
foreach (var device in devices)
{
var acc = new OpenCLAccelerator(logger, loggerFactory);
await acc.InitializeAsync(device);
accelerators.Add(acc);
}
// Distribute work across devices (async lambda avoids blocking on .Result
// inside ContinueWith, which risks deadlocks and swallowed exceptions)
var tasks = accelerators.Select(async acc =>
{
    var kernel = await acc.CompileKernelAsync(kernelDef);
    await kernel.ExecuteAsync(args);
});
await Task.WhenAll(tasks);
var options = new CompilationOptions
{
OptimizationLevel = OptimizationLevel.O3,
CustomOptions = new[]
{
"-cl-mad-enable", // Mad operations
"-cl-fast-relaxed-math", // Fast math
"-cl-finite-math-only", // No INF/NaN
"-cl-unsafe-math-optimizations"
}
};
var kernel = await accelerator.CompileKernelAsync(definition, options);
using DotCompute.Abstractions.RingKernels;
// Define persistent ring kernel for graph processing
[RingKernel(
KernelId = "graph-process",
Domain = RingKernelDomain.GraphAnalytics,
Mode = RingKernelMode.Persistent,
Capacity = 8192,
Backends = KernelBackends.OpenCL)]
public static void ProcessGraphVertex(
IMessageQueue<GraphMessage> incoming,
IMessageQueue<GraphMessage> outgoing,
Span<float> values)
{
int vertexId = Kernel.ThreadId.X;
// Process messages with OpenCL atomic operations
while (incoming.TryDequeue(out var msg))
{
if (msg.TargetVertex == vertexId)
values[vertexId] += msg.Value;
}
// Send updates to neighbors
outgoing.Enqueue(new GraphMessage { TargetVertex = ..., Value = ... });
}
// Launch ring kernel on OpenCL device
var runtime = orchestrator.GetRingKernelRuntime();
await runtime.LaunchAsync("graph-process", gridSize: 1024, blockSize: 256);
await runtime.ActivateAsync("graph-process");
// Send messages
await runtime.SendMessageAsync("graph-process", new GraphMessage { ... });
// Monitor performance
var metrics = await runtime.GetMetricsAsync("graph-process");
Console.WriteLine($"Throughput: {metrics.ThroughputMsgsPerSec:F2} msgs/sec");
Console.WriteLine($"GPU Utilization: {metrics.GpuUtilizationPercent:F1}%");
Comprehensive documentation is available for DotCompute:
Contributions are welcome, particularly in:
See CONTRIBUTING.md for guidelines.
MIT License - Copyright (c) 2025 Michael Ivertowski