Vulkan Memory Transfer
In a classic graphics pipeline, the first step is turning a set of points into triangles or other primitives. The input data usually consists of an array of vertices and the corresponding indices. In Vulkan, however, the actual first step is getting those vertices and indices into GPU memory. CUDA, as a general-purpose compute API, offers a higher-level API for memory management. Vulkan, by contrast, provides fine-grained control over memory allocation and usage, which adds complexity but allows for highly optimized GPU memory management.
Overall Look
// This code is from Sascha Willems' Vulkan samples
std::vector<Vertex> vertexBuffer{
{ { 1.0f, 1.0f, 0.0f }, { 1.0f, 0.0f, 0.0f } },
{ { -1.0f, 1.0f, 0.0f }, { 0.0f, 1.0f, 0.0f } },
{ { 0.0f, -1.0f, 0.0f }, { 0.0f, 0.0f, 1.0f } }
};
uint32_t vertexBufferSize = static_cast<uint32_t>(vertexBuffer.size()) * sizeof(Vertex);
// Setup indices
std::vector<uint32_t> indexBuffer{ 0, 1, 2 };
indices.count = static_cast<uint32_t>(indexBuffer.size());
uint32_t indexBufferSize = indices.count * sizeof(uint32_t);
VkMemoryAllocateInfo memAlloc{};
memAlloc.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
VkMemoryRequirements memReqs;
struct StagingBuffer {
VkDeviceMemory memory;
VkBuffer buffer;
};
struct {
StagingBuffer vertices;
StagingBuffer indices;
} stagingBuffers;
void* data;
// Vertex buffer
VkBufferCreateInfo vertexBufferInfoCI{};
vertexBufferInfoCI.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
vertexBufferInfoCI.size = vertexBufferSize;
// Buffer is used as the copy source
vertexBufferInfoCI.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
// Create a host-visible buffer to copy the vertex data to (staging buffer)
VK_CHECK_RESULT(vkCreateBuffer(device, &vertexBufferInfoCI, nullptr, &stagingBuffers.vertices.buffer));
vkGetBufferMemoryRequirements(device, stagingBuffers.vertices.buffer, &memReqs);
memAlloc.allocationSize = memReqs.size;
// Request a host visible memory type that can be used to copy our data to
// Also request it to be coherent, so that writes are visible to the GPU right after unmapping the buffer
memAlloc.memoryTypeIndex = getMemoryTypeIndex(memReqs.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
VK_CHECK_RESULT(vkAllocateMemory(device, &memAlloc, nullptr, &stagingBuffers.vertices.memory));
// Map and copy
VK_CHECK_RESULT(vkMapMemory(device, stagingBuffers.vertices.memory, 0, memAlloc.allocationSize, 0, &data));
memcpy(data, vertexBuffer.data(), vertexBufferSize);
vkUnmapMemory(device, stagingBuffers.vertices.memory);
VK_CHECK_RESULT(vkBindBufferMemory(device, stagingBuffers.vertices.buffer, stagingBuffers.vertices.memory, 0));
// Create a device local buffer to which the (host local) vertex data will be copied and which will be used for rendering
vertexBufferInfoCI.usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT;
VK_CHECK_RESULT(vkCreateBuffer(device, &vertexBufferInfoCI, nullptr, &vertices.buffer));
vkGetBufferMemoryRequirements(device, vertices.buffer, &memReqs);
memAlloc.allocationSize = memReqs.size;
memAlloc.memoryTypeIndex = getMemoryTypeIndex(memReqs.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
VK_CHECK_RESULT(vkAllocateMemory(device, &memAlloc, nullptr, &vertices.memory));
VK_CHECK_RESULT(vkBindBufferMemory(device, vertices.buffer, vertices.memory, 0));
// Index buffer
VkBufferCreateInfo indexbufferCI{};
indexbufferCI.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
indexbufferCI.size = indexBufferSize;
indexbufferCI.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
// Copy index data to a buffer visible to the host (staging buffer)
VK_CHECK_RESULT(vkCreateBuffer(device, &indexbufferCI, nullptr, &stagingBuffers.indices.buffer));
vkGetBufferMemoryRequirements(device, stagingBuffers.indices.buffer, &memReqs);
memAlloc.allocationSize = memReqs.size;
memAlloc.memoryTypeIndex = getMemoryTypeIndex(memReqs.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
VK_CHECK_RESULT(vkAllocateMemory(device, &memAlloc, nullptr, &stagingBuffers.indices.memory));
VK_CHECK_RESULT(vkMapMemory(device, stagingBuffers.indices.memory, 0, indexBufferSize, 0, &data));
memcpy(data, indexBuffer.data(), indexBufferSize);
vkUnmapMemory(device, stagingBuffers.indices.memory);
VK_CHECK_RESULT(vkBindBufferMemory(device, stagingBuffers.indices.buffer, stagingBuffers.indices.memory, 0));
// Create destination buffer with device only visibility
indexbufferCI.usage = VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT;
VK_CHECK_RESULT(vkCreateBuffer(device, &indexbufferCI, nullptr, &indices.buffer));
vkGetBufferMemoryRequirements(device, indices.buffer, &memReqs);
memAlloc.allocationSize = memReqs.size;
memAlloc.memoryTypeIndex = getMemoryTypeIndex(memReqs.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
VK_CHECK_RESULT(vkAllocateMemory(device, &memAlloc, nullptr, &indices.memory));
VK_CHECK_RESULT(vkBindBufferMemory(device, indices.buffer, indices.memory, 0));
// Buffer copies have to be submitted to a queue, so we need a command buffer for them
// Note: Some devices offer a dedicated transfer queue (with only the transfer bit set) that may be faster when doing lots of copies
VkCommandBuffer copyCmd;
VkCommandBufferAllocateInfo cmdBufAllocateInfo{};
cmdBufAllocateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
cmdBufAllocateInfo.commandPool = commandPool;
cmdBufAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
cmdBufAllocateInfo.commandBufferCount = 1;
VK_CHECK_RESULT(vkAllocateCommandBuffers(device, &cmdBufAllocateInfo, &copyCmd));
VkCommandBufferBeginInfo cmdBufInfo = vks::initializers::commandBufferBeginInfo();
VK_CHECK_RESULT(vkBeginCommandBuffer(copyCmd, &cmdBufInfo));
// Put buffer region copies into command buffer
VkBufferCopy copyRegion{};
// Vertex buffer
copyRegion.size = vertexBufferSize;
vkCmdCopyBuffer(copyCmd, stagingBuffers.vertices.buffer, vertices.buffer, 1, &copyRegion);
// Index buffer
copyRegion.size = indexBufferSize;
vkCmdCopyBuffer(copyCmd, stagingBuffers.indices.buffer, indices.buffer, 1, &copyRegion);
VK_CHECK_RESULT(vkEndCommandBuffer(copyCmd));
// Submit the command buffer to the queue to finish the copy
VkSubmitInfo submitInfo{};
submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
submitInfo.commandBufferCount = 1;
submitInfo.pCommandBuffers = &copyCmd;
// Create fence to ensure that the command buffer has finished executing
VkFenceCreateInfo fenceCI{};
fenceCI.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
fenceCI.flags = 0;
VkFence fence;
VK_CHECK_RESULT(vkCreateFence(device, &fenceCI, nullptr, &fence));
// Submit to the queue
VK_CHECK_RESULT(vkQueueSubmit(queue, 1, &submitInfo, fence));
// Wait for the fence to signal that command buffer has finished executing
VK_CHECK_RESULT(vkWaitForFences(device, 1, &fence, VK_TRUE, DEFAULT_FENCE_TIMEOUT));
vkDestroyFence(device, fence, nullptr);
vkFreeCommandBuffers(device, commandPool, 1, &copyCmd);
// Destroy staging buffers
// Note: Staging buffer must not be deleted before the copies have been submitted and executed
vkDestroyBuffer(device, stagingBuffers.vertices.buffer, nullptr);
vkFreeMemory(device, stagingBuffers.vertices.memory, nullptr);
vkDestroyBuffer(device, stagingBuffers.indices.buffer, nullptr);
vkFreeMemory(device, stagingBuffers.indices.memory, nullptr);
Breakdown
Sascha gave very detailed comments on the code, but I still found some parts unclear. This blog works as my personal notebook, in which I try to write down everything I learn along the journey. The standard transfer procedure has the following steps.
- Create a host-visible buffer on the CPU side, called the staging buffer.
- Allocate a device-local buffer on the GPU to store the data.
- Copy the data into the host-visible staging buffer.
- Transfer the data from the staging buffer to the device-local GPU memory.
- Clean up the staging buffer.
The Vulkan code basically follows the structure above while adding a lot of details. Let us go through the code snippets.
Direct writes to GPU memory from the CPU are often unsupported on discrete GPUs, or they incur significant overheads due to the physical separation of CPU and GPU memory.
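You can verify this on your own hardware by enumerating the memory types the physical device exposes. Below is a minimal sketch (not part of Sascha's sample) that assumes a physicalDevice handle is available, as it is elsewhere in the sample framework:
// Sketch: check whether any memory type is both DEVICE_LOCAL and HOST_VISIBLE.
// On many discrete GPUs no such type exists (outside of a small BAR heap),
// which is why the staging copy below is needed.
VkPhysicalDeviceMemoryProperties memProps;
vkGetPhysicalDeviceMemoryProperties(physicalDevice, &memProps);
bool hasHostVisibleDeviceLocal = false;
for (uint32_t i = 0; i < memProps.memoryTypeCount; i++) {
    VkMemoryPropertyFlags flags = memProps.memoryTypes[i].propertyFlags;
    if ((flags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) && (flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)) {
        hasHostVisibleDeviceLocal = true;
    }
}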
Input Data
std::vector<Vertex> vertexBuffer{
{ { 1.0f, 1.0f, 0.0f }, { 1.0f, 0.0f, 0.0f } },
{ { -1.0f, 1.0f, 0.0f }, { 0.0f, 1.0f, 0.0f } },
{ { 0.0f, -1.0f, 0.0f }, { 0.0f, 0.0f, 1.0f } }
};
uint32_t vertexBufferSize = static_cast<uint32_t>(vertexBuffer.size()) * sizeof(Vertex);
// Setup indices
std::vector<uint32_t> indexBuffer{ 0, 1, 2 };
The CPU-side data is hard-coded for convenience; each entry in the vertex buffer holds both a position and an RGB color.
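The Vertex struct itself is not shown in the snippet. A plausible definition, consistent with the initializer list above (position followed by color), would be:
// Hypothetical Vertex layout matching the initializer above:
// three floats for the position, three floats for the RGB color.
struct Vertex {
    float position[3];
    float color[3];
};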
Vertex Staging Buffer
struct StagingBuffer {
VkDeviceMemory memory;
VkBuffer buffer;
};
struct {
StagingBuffer vertices;
StagingBuffer indices;
} stagingBuffers;
void* data;
// Vertex buffer
VkBufferCreateInfo vertexBufferInfoCI{};
vertexBufferInfoCI.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
vertexBufferInfoCI.size = vertexBufferSize;
// Buffer is used as the copy source
vertexBufferInfoCI.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
// Create a host-visible buffer to copy the vertex data to (staging buffer)
VK_CHECK_RESULT(vkCreateBuffer(device, &vertexBufferInfoCI, nullptr, &stagingBuffers.vertices.buffer));
vkGetBufferMemoryRequirements(device, stagingBuffers.vertices.buffer, &memReqs);
memAlloc.allocationSize = memReqs.size;
// Request a host visible memory type that can be used to copy our data to
// Also request it to be coherent, so that writes are visible to the GPU right after unmapping the buffer
memAlloc.memoryTypeIndex = getMemoryTypeIndex(memReqs.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
VK_CHECK_RESULT(vkAllocateMemory(device, &memAlloc, nullptr, &stagingBuffers.vertices.memory));
// Map and copy
VK_CHECK_RESULT(vkMapMemory(device, stagingBuffers.vertices.memory, 0, memAlloc.allocationSize, 0, &data));
memcpy(data, vertexBuffer.data(), vertexBufferSize);
vkUnmapMemory(device, stagingBuffers.vertices.memory);
VK_CHECK_RESULT(vkBindBufferMemory(device, stagingBuffers.vertices.buffer, stagingBuffers.vertices.memory, 0));
// Create a device local buffer to which the (host local) vertex data will be copied and which will be used for rendering
vertexBufferInfoCI.usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT;
VK_CHECK_RESULT(vkCreateBuffer(device, &vertexBufferInfoCI, nullptr, &vertices.buffer));
vkGetBufferMemoryRequirements(device, vertices.buffer, &memReqs);
memAlloc.allocationSize = memReqs.size;
memAlloc.memoryTypeIndex = getMemoryTypeIndex(memReqs.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
VK_CHECK_RESULT(vkAllocateMemory(device, &memAlloc, nullptr, &vertices.memory));
VK_CHECK_RESULT(vkBindBufferMemory(device, vertices.buffer, vertices.memory, 0));
First, we define a StagingBuffer struct; the vertex staging buffer has two fields, memory and buffer. Then we fill in vertexBufferInfoCI to create the buffer of the staging buffer. This create-info pattern is the common procedure in Vulkan for initializing a new object. For memAlloc, we request the memory property flags (host-visible and host-coherent) for the staging buffer's memory. After vkMapMemory, memcpy, vkUnmapMemory, and vkBindBufferMemory, we create the device-local buffer as the destination of the vertex data.
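The getMemoryTypeIndex helper is defined elsewhere in the sample framework; conceptually it scans the physical device's memory types and returns the first one that is allowed by memoryTypeBits and carries all of the requested property flags. A minimal sketch, assuming deviceMemoryProperties was filled earlier with vkGetPhysicalDeviceMemoryProperties:
// Sketch of a typical getMemoryTypeIndex implementation.
uint32_t getMemoryTypeIndex(uint32_t typeBits, VkMemoryPropertyFlags properties)
{
    for (uint32_t i = 0; i < deviceMemoryProperties.memoryTypeCount; i++) {
        // The i-th bit of typeBits says whether memory type i is allowed for
        // this resource; also check that it has all requested property flags.
        if ((typeBits & (1 << i)) && (deviceMemoryProperties.memoryTypes[i].propertyFlags & properties) == properties) {
            return i;
        }
    }
    throw std::runtime_error("Could not find a suitable memory type!");
}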
You might have many questions at this point. Why does the staging buffer have both a memory and a buffer field? Don't we only need the device memory as the destination of the vertex data? Why do we map the memory to data? Why do we need to bind the buffer and the memory?
Memory Mapping
As mentioned before, we cannot directly write to device-local GPU memory from the CPU. It follows that if we want to move data into a chunk of GPU memory, the source of the data must live in memory the GPU can also access. That is why the staging buffer has the two fields buffer and memory instead of one. To access that memory from the CPU, we use the memory mapping mechanism.
VK_CHECK_RESULT(vkMapMemory(device, stagingBuffers.vertices.memory, 0, memAlloc.allocationSize, 0, &data));
memcpy(data, vertexBuffer.data(), vertexBufferSize);
vkUnmapMemory(device, stagingBuffers.vertices.memory);
data is a pointer to the host-visible memory region returned by vkMapMemory. We copy the vertex data into this region with memcpy and then call vkUnmapMemory. Because the memory was requested with HOST_COHERENT, the host writes become visible to the GPU automatically after unmapping; otherwise, an explicit flush would be required.
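For completeness: if the staging memory had been allocated without HOST_COHERENT, we would have to flush the mapped range ourselves before the GPU reads it (while the memory is still mapped), along these lines:
// Sketch: only needed for non-HOST_COHERENT memory, before vkUnmapMemory.
VkMappedMemoryRange flushRange{};
flushRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
flushRange.memory = stagingBuffers.vertices.memory;
flushRange.offset = 0;
flushRange.size = VK_WHOLE_SIZE;
VK_CHECK_RESULT(vkFlushMappedMemoryRanges(device, 1, &flushRange));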
Memory Binding
A buffer is an abstraction that describes the intended usage of a piece of memory. When we bind a buffer to memory, it acts as a resource the Vulkan pipeline can use for specific operations (e.g., storing vertex data). In this case, the buffer is the source of the memory transfer:
vertexBufferInfoCI.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
From my perspective, it is like a container for chemical solutions: we put a label on the tube stating what is inside, without touching the chemicals themselves. After binding the memory and the buffer, we get fine-grained control over the buffer from the CPU side. In the last part of the code, we record vkCmdCopyBuffer into the command buffer, which makes the data transfer happen on the GPU. Once the fence signals, we know the data has been copied into the actual device-local vertex memory.
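After that, the device-local vertices.buffer and indices.buffer are the ones bound for drawing. A sketch of how they are typically used when recording the render command buffer (drawCmdBuffer is an assumed, already-begun command buffer inside a render pass):
// Sketch: bind the device-local buffers and issue the indexed draw.
VkDeviceSize offsets[1] = { 0 };
vkCmdBindVertexBuffers(drawCmdBuffer, 0, 1, &vertices.buffer, offsets);
vkCmdBindIndexBuffer(drawCmdBuffer, indices.buffer, 0, VK_INDEX_TYPE_UINT32);
vkCmdDrawIndexed(drawCmdBuffer, indices.count, 1, 0, 0, 0);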
Fences and semaphores are the basic synchronization primitives in Vulkan. A fence synchronizes the GPU with the CPU, letting the CPU wait until a submitted operation has completed, while a semaphore orders operations between queue submissions on the GPU.
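For contrast with the fence used above, a typical render submission uses semaphores for GPU-to-GPU ordering. A sketch (waitSemaphore, signalSemaphore and drawCmdBuffer are assumed to have been created elsewhere, e.g. during swapchain setup; the names are illustrative):
// Sketch: semaphores order this submission against other GPU work,
// e.g. wait for the acquired swapchain image, signal when rendering is done.
VkPipelineStageFlags waitStage = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
VkSubmitInfo renderSubmit{};
renderSubmit.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
renderSubmit.waitSemaphoreCount = 1;
renderSubmit.pWaitSemaphores = &waitSemaphore;
renderSubmit.pWaitDstStageMask = &waitStage;
renderSubmit.commandBufferCount = 1;
renderSubmit.pCommandBuffers = &drawCmdBuffer;
renderSubmit.signalSemaphoreCount = 1;
renderSubmit.pSignalSemaphores = &signalSemaphore;
// Pass a fence here only if the CPU also needs to wait for this submission,
// as we did for the buffer copy above.
VK_CHECK_RESULT(vkQueueSubmit(queue, 1, &renderSubmit, VK_NULL_HANDLE));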