Production-ready OpenCL backend for DotCompute. Cross-platform GPU acceleration for NVIDIA, AMD, Intel, ARM Mali, and Qualcomm Adreno GPUs. Supports OpenCL 1.2+, Ring Kernels with atomic message queues, runtime kernel compilation, and multi-device workload distribution. Works with nvidia-opencl-icd, ROCm, intel-opencl-icd, and vendor drivers.
$ dotnet add package DotCompute.Backends.OpenCL

Cross-platform OpenCL compute backend for .NET 9+ with GPU and accelerator support.
EXPERIMENTAL: This backend is functional for cross-platform GPU acceleration but has not been extensively production-tested across all vendor implementations. It works well for development and testing across NVIDIA, AMD, and Intel GPUs. Production use recommended only after validation on your target hardware.
The OpenCL backend provides cross-platform GPU acceleration:
Main accelerator implementation providing:
Manages OpenCL devices:
Device information structure:
Platform information:
OpenCL context wrapper:
Unified memory manager for OpenCL:
Buffer implementation:
Compiled kernel representation:
Persistent kernel runtime for OpenCL:
Ring kernel compilation for OpenCL:
Factory for creating OpenCL accelerators:
P/Invoke bindings to OpenCL C API:
Native type definitions:
Exception type for OpenCL errors:
dotnet add package DotCompute.Backends.OpenCL --version 0.5.3
using DotCompute.Backends.OpenCL;
using Microsoft.Extensions.Logging;
using var loggerFactory = LoggerFactory.Create(builder => builder.AddConsole());
var logger = loggerFactory.CreateLogger<OpenCLAccelerator>();
// Create accelerator
var accelerator = new OpenCLAccelerator(logger, loggerFactory);
// Initialize with default device (first GPU or CPU)
await accelerator.InitializeAsync();
Console.WriteLine($"Using: {accelerator.Name}");
Console.WriteLine($"Global Memory: {accelerator.Info.TotalMemory / (1024*1024)} MB");
using Microsoft.Extensions.DependencyInjection;
var services = new ServiceCollection();
// Register OpenCL backend
services.AddSingleton<IAccelerator, OpenCLAccelerator>();
// OR use plugin registration
services.AddDotComputeBackend("DotCompute.Backends.OpenCL");
using DotCompute.Backends.OpenCL.DeviceManagement;
var deviceManager = new OpenCLDeviceManager(logger);
// Enumerate all devices
var devices = await deviceManager.EnumerateDevicesAsync();
foreach (var device in devices)
{
Console.WriteLine($"Device: {device.Name}");
Console.WriteLine($" Type: {device.DeviceType}");
Console.WriteLine($" Compute Units: {device.MaxComputeUnits}");
Console.WriteLine($" Global Memory: {device.GlobalMemorySize / (1024*1024)} MB");
Console.WriteLine($" Local Memory: {device.LocalMemorySize / 1024} KB");
}
// Select specific device
var selectedDevice = devices.FirstOrDefault(d => d.DeviceType == DeviceType.GPU);
if (selectedDevice != null)
{
await accelerator.InitializeAsync(selectedDevice);
}
using DotCompute.Abstractions.Kernels;
// Define OpenCL kernel
var kernelDef = new KernelDefinition
{
Name = "VectorAdd",
Source = @"
__kernel void vector_add(
__global const float* a,
__global const float* b,
__global float* result,
const int length)
{
int gid = get_global_id(0);
if (gid < length) {
result[gid] = a[gid] + b[gid];
}
}
",
EntryPoint = "vector_add"
};
// Compile kernel
var compiledKernel = await accelerator.CompileKernelAsync(kernelDef);
// Allocate device memory
var length = 1_000_000;
var bufferA = await accelerator.Memory.AllocateAsync<float>(length);
var bufferB = await accelerator.Memory.AllocateAsync<float>(length);
var bufferResult = await accelerator.Memory.AllocateAsync<float>(length);
// Copy data to device
var dataA = Enumerable.Range(0, length).Select(i => (float)i).ToArray();
var dataB = Enumerable.Range(0, length).Select(i => (float)(i * 2)).ToArray();
await bufferA.CopyFromAsync(dataA);
await bufferB.CopyFromAsync(dataB);
// Set kernel arguments and execute
var launchParams = new KernelLaunchParameters
{
    // OpenCL 1.x requires the global work size to be a multiple of the local
    // work size; round up and rely on the kernel's `if (gid < length)` guard.
    GlobalWorkSize = new[] { (uint)((length + 255) / 256 * 256) },
    LocalWorkSize = new[] { 256u }
};
await compiledKernel.ExecuteAsync(new object[]
{
bufferA,
bufferB,
bufferResult,
length
}, launchParams);
// Read results back
var results = new float[length];
await bufferResult.CopyToAsync(results);
// Cleanup
await bufferA.DisposeAsync();
await bufferB.DisposeAsync();
await bufferResult.DisposeAsync();
// Allocate buffer
var buffer = await accelerator.Memory.AllocateAsync<float>(10_000);
// Write to device
var hostData = new float[10_000];
await buffer.CopyFromAsync(hostData);
// Read from device
var resultData = new float[10_000];
await buffer.CopyToAsync(resultData);
// Map memory for zero-copy access (if supported)
if (accelerator.DeviceInfo?.SupportsHostMemoryMapping == true)
{
var mappedPtr = await buffer.MapAsync(MapMode.ReadWrite);
// Access memory directly...
await buffer.UnmapAsync(mappedPtr);
}
using DotCompute.Backends.OpenCL.Factory;
var factory = new OpenCLAcceleratorFactory(configuration, logger);
// Create accelerator with performance profile
var accelerator = await factory.CreateAsync(new WorkloadProfile
{
WorkloadType = WorkloadType.Compute,
DataSize = DataSize.Large,
MemoryIntensive = true
});
OpenCLAccelerator (IAccelerator)
├── OpenCLContext (Context management)
├── OpenCLDeviceManager (Device discovery)
├── OpenCLMemoryManager (Memory operations)
└── OpenCLCompiledKernel (Kernel execution)
Native Layer:
├── OpenCLRuntime (P/Invoke bindings)
├── OpenCLTypes (Native type definitions)
└── OpenCLException (Error handling)
# Ubuntu/Debian
sudo apt-get install ocl-icd-opencl-dev nvidia-opencl-icd
# Fedora/RHEL
sudo dnf install ocl-icd-devel pocl
# Verify installation
clinfo
OpenCL is deprecated on macOS. Use Metal backend for macOS devices.
# Enable OpenCL debugging
export DOTCOMPUTE_OPENCL_DEBUG=1
# Select specific platform
export DOTCOMPUTE_OPENCL_PLATFORM=0
# Select specific device
export DOTCOMPUTE_OPENCL_DEVICE=0
# Force CPU device (for debugging)
export DOTCOMPUTE_OPENCL_FORCE_CPU=1
var options = new OpenCLOptions
{
PreferredDeviceType = DeviceType.GPU,
EnableProfiling = true,
EnableOutOfOrderExecution = false,
BuildOptions = "-cl-fast-relaxed-math -cl-mad-enable",
CacheKernels = true
};
Run clinfo to list OpenCL platforms and devices, and make sure your user belongs to the video group.

// Enable detailed logging
var logger = LoggerFactory.Create(builder =>
builder.AddConsole().SetMinimumLevel(LogLevel.Trace));
// Get device capabilities
var info = accelerator.DeviceInfo;
Console.WriteLine($"Max Work Group Size: {info.MaxWorkGroupSize}");
Console.WriteLine($"Max Compute Units: {info.MaxComputeUnits}");
Console.WriteLine($"Extensions: {string.Join(", ", info.Extensions)}");
// Profile kernel execution
var sw = Stopwatch.StartNew();
await kernel.ExecuteAsync(args, launchParams);
await accelerator.SynchronizeAsync();
sw.Stop();
Console.WriteLine($"Kernel time: {sw.ElapsedMilliseconds}ms");
// Create accelerator for each device
var accelerators = new List<OpenCLAccelerator>();
foreach (var device in devices)
{
var acc = new OpenCLAccelerator(logger, loggerFactory);
await acc.InitializeAsync(device);
accelerators.Add(acc);
}
// Distribute work across devices (async lambda avoids blocking on .Result
// inside ContinueWith, which risks deadlocks and swallowed exceptions)
var tasks = accelerators.Select(async acc =>
{
    var kernel = await acc.CompileKernelAsync(kernelDef);
    await kernel.ExecuteAsync(args);
});
await Task.WhenAll(tasks);
var options = new CompilationOptions
{
OptimizationLevel = OptimizationLevel.O3,
CustomOptions = new[]
{
"-cl-mad-enable", // Mad operations
"-cl-fast-relaxed-math", // Fast math
"-cl-finite-math-only", // No INF/NaN
"-cl-unsafe-math-optimizations"
}
};
var kernel = await accelerator.CompileKernelAsync(definition, options);
using DotCompute.Abstractions.RingKernels;
// Define persistent ring kernel for graph processing
[RingKernel(
KernelId = "graph-process",
Domain = RingKernelDomain.GraphAnalytics,
Mode = RingKernelMode.Persistent,
Capacity = 8192,
Backends = KernelBackends.OpenCL)]
public static void ProcessGraphVertex(
IMessageQueue<GraphMessage> incoming,
IMessageQueue<GraphMessage> outgoing,
Span<float> values)
{
int vertexId = Kernel.ThreadId.X;
// Process messages with OpenCL atomic operations
while (incoming.TryDequeue(out var msg))
{
if (msg.TargetVertex == vertexId)
values[vertexId] += msg.Value;
}
// Send updates to neighbors
outgoing.Enqueue(new GraphMessage { TargetVertex = ..., Value = ... });
}
// Launch ring kernel on OpenCL device
var runtime = orchestrator.GetRingKernelRuntime();
await runtime.LaunchAsync("graph-process", gridSize: 1024, blockSize: 256);
await runtime.ActivateAsync("graph-process");
// Send messages
await runtime.SendMessageAsync("graph-process", new GraphMessage { ... });
// Monitor performance
var metrics = await runtime.GetMetricsAsync("graph-process");
Console.WriteLine($"Throughput: {metrics.ThroughputMsgsPerSec:F2} msgs/sec");
Console.WriteLine($"GPU Utilization: {metrics.GpuUtilizationPercent:F1}%");
Comprehensive documentation is available for DotCompute:
Contributions are welcome, particularly in:
See CONTRIBUTING.md for guidelines.
MIT License - Copyright (c) 2025 Michael Ivertowski