ggmlR supports GPU acceleration through the Vulkan backend. Vulkan is a cross-platform graphics API that works on Windows, Linux, and macOS (via MoltenVK). This vignette shows how to use the Vulkan backend for accelerated tensor operations.
First, check if Vulkan is available on your system:
You can list all available Vulkan devices:
# Enumerate the Vulkan devices and print a short report for each one.
# Nothing runs when Vulkan is unavailable.
if (ggml_vulkan_available()) {
  device_list <- ggml_vulkan_list_devices()
  print(device_list)

  # The per-device query functions are called with zero-based ids, so shift
  # the one-based sequence down by one before looping.
  n_devices <- ggml_vulkan_device_count()
  for (device_id in seq_len(n_devices) - 1) {
    cat("\nDevice", device_id, ":\n")
    cat(" Name:", ggml_vulkan_device_description(device_id), "\n")

    # The reported memory figure is scaled by 1024^3 and labelled as GB.
    mem_raw <- ggml_vulkan_device_memory(device_id)
    cat(" Memory:", round(mem_raw / 1024^3, 2), "GB\n")
  }
}
To use Vulkan for computations:
Here’s how to perform tensor operations on the GPU:
# Element-wise example on the GPU: computes sum((a + b)^2) for two random
# vectors through a scheduler that is backed by a single Vulkan device.
if (ggml_vulkan_available()) {
  # Backend on device 0, plus a scheduler that owns only that backend
  backend <- ggml_vulkan_init(0)
  scheduler <- ggml_backend_sched_new(list(backend))

  # Tensor context (64 * 1024^2 passed to ggml_init)
  tensor_ctx <- ggml_init(64 * 1024^2)

  # Two F32 input vectors of equal length
  len <- 10000
  lhs <- ggml_new_tensor_1d(tensor_ctx, GGML_TYPE_F32, len)
  rhs <- ggml_new_tensor_1d(tensor_ctx, GGML_TYPE_F32, len)

  # Graph: total = sum((lhs + rhs) * (lhs + rhs))
  added <- ggml_add(tensor_ctx, lhs, rhs)
  squared <- ggml_mul(tensor_ctx, added, added)
  total <- ggml_sum(tensor_ctx, squared)
  fwd_graph <- ggml_build_forward_expand(tensor_ctx, total)

  # Reserve scheduler memory, then allocate the graph's tensors
  ggml_backend_sched_reserve(scheduler, fwd_graph)
  ggml_backend_sched_alloc_graph(scheduler, fwd_graph)

  # Fill the inputs with standard-normal draws (lhs first, then rhs)
  lhs_values <- rnorm(len)
  ggml_set_f32(lhs, lhs_values)
  rhs_values <- rnorm(len)
  ggml_set_f32(rhs, rhs_values)

  # Run the graph through the Vulkan-backed scheduler
  ggml_backend_sched_graph_compute(scheduler, fwd_graph)

  # Read back and print the scalar result
  total_value <- ggml_get_f32(total)
  cat("Result:", total_value, "\n")

  # Release resources: scheduler, then backend, then context
  ggml_backend_sched_free(scheduler)
  ggml_vulkan_free(backend)
  ggml_free(tensor_ctx)
}
Matrix multiplication benefits significantly from GPU acceleration:
# Matrix-multiplication example: builds a single ggml_mul_mat node, runs it
# through a Vulkan-backed scheduler, and reports the wall-clock time of the
# compute call in milliseconds along with the result shape.
if (ggml_vulkan_available()) {
  vk <- ggml_vulkan_init(0)
  sched <- ggml_backend_sched_new(list(vk))
  ctx <- ggml_init(128 * 1024 * 1024)

  # Create matrices (both are created with k as their first dimension)
  m <- 1024
  n <- 1024
  k <- 1024
  A <- ggml_new_tensor_2d(ctx, GGML_TYPE_F32, k, m)
  B <- ggml_new_tensor_2d(ctx, GGML_TYPE_F32, k, n)

  # Matrix multiplication: C = A * B^T
  C <- ggml_mul_mat(ctx, A, B)
  graph <- ggml_build_forward_expand(ctx, C)

  ggml_backend_sched_reserve(sched, graph)
  ggml_backend_sched_alloc_graph(sched, graph)

  # Initialize with random data
  ggml_set_f32(A, rnorm(m * k))
  ggml_set_f32(B, rnorm(n * k))

  # Time GPU computation.
  # `Sys.time() - start` lets difftime choose its unit automatically (secs,
  # mins, ...), so converting that number to ms is only right while the run
  # stays under a minute. Ask for seconds explicitly to keep the label correct.
  start <- Sys.time()
  ggml_backend_sched_graph_compute(sched, graph)
  gpu_time <- difftime(Sys.time(), start, units = "secs")

  cat("GPU matmul time:", round(as.numeric(gpu_time) * 1000, 2), "ms\n")
  cat("Result shape:", ggml_tensor_shape(C), "\n")

  # Cleanup: scheduler first, then the backend, then the tensor context
  ggml_backend_sched_free(sched)
  ggml_vulkan_free(vk)
  ggml_free(ctx)
}
If ggml_vulkan_available() returns
FALSE:
- Run `vulkaninfo` in a terminal
- Use `ggml_vulkan_device_memory()` to find the most capable GPU
- See `vignette("multi-gpu")` for multi-GPU inference
- See `vignette("quantization")` for using quantized models