Vulkan Memory Transfer
In a classic graphics pipeline, the first step is turning a set of points into triangles or other primitives. The input data usually consists of an array of vertices and the corresponding indices. In Vulkan, however, the actual first step is getting those vertices and indices into GPU memory. CUDA, as a general-purpose compute API, offers a higher-level API for memory management. Vulkan, by contrast, provides fine-grained control over memory allocation and usage, which adds complexity but allows for highly optimized GPU memory management.
Overall Look
// This code is from Sascha Willems' Vulkan samples
std::vector<Vertex> vertexBuffer{
{ { 1.0f, 1.0f, 0.0f }, { 1.0f, 0.0f, 0.0f } },
{ { -1.0f, 1.0f, 0.0f }, { 0.0f, 1.0f, 0.0f } },
{ { 0.0f, -1.0f, 0.0f }, { 0.0f, 0.0f, 1.0f } }
};
uint32_t vertexBufferSize = static_cast<uint32_t>(vertexBuffer.size()) * sizeof(Vertex);
// Setup indices
std::vector<uint32_t> indexBuffer{ 0, 1, 2 };
indices.count = static_cast<uint32_t>(indexBuffer.size());
uint32_t indexBufferSize = indices.count * sizeof(uint32_t);
VkMemoryAllocateInfo memAlloc{};
memAlloc.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
VkMemoryRequirements memReqs;
struct StagingBuffer {
VkDeviceMemory memory;
VkBuffer buffer;
};
struct {
StagingBuffer vertices;
StagingBuffer indices;
} stagingBuffers;
void* data;
// Vertex buffer
VkBufferCreateInfo vertexBufferInfoCI{};
vertexBufferInfoCI.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
vertexBufferInfoCI.size = vertexBufferSize;
// Buffer is used as the copy source
vertexBufferInfoCI.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
// Create a host-visible buffer to copy the vertex data to (staging buffer)
VK_CHECK_RESULT(vkCreateBuffer(device, &vertexBufferInfoCI, nullptr, &stagingBuffers.vertices.buffer));
vkGetBufferMemoryRequirements(device, stagingBuffers.vertices.buffer, &memReqs);
memAlloc.allocationSize = memReqs.size;
// Request a host visible memory type that can be used to copy our data to
// Also request it to be coherent, so that writes are visible to the GPU right after unmapping the buffer
memAlloc.memoryTypeIndex = getMemoryTypeIndex(memReqs.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
VK_CHECK_RESULT(vkAllocateMemory(device, &memAlloc, nullptr, &stagingBuffers.vertices.memory));
// Map and copy
VK_CHECK_RESULT(vkMapMemory(device, stagingBuffers.vertices.memory, 0, memAlloc.allocationSize, 0, &data));
memcpy(data, vertexBuffer.data(), vertexBufferSize);
vkUnmapMemory(device, stagingBuffers.vertices.memory);
VK_CHECK_RESULT(vkBindBufferMemory(device, stagingBuffers.vertices.buffer, stagingBuffers.vertices.memory, 0));
// Create a device local buffer to which the (host local) vertex data will be copied and which will be used for rendering
vertexBufferInfoCI.usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT;
VK_CHECK_RESULT(vkCreateBuffer(device, &vertexBufferInfoCI, nullptr, &vertices.buffer));
vkGetBufferMemoryRequirements(device, vertices.buffer, &memReqs);
memAlloc.allocationSize = memReqs.size;
memAlloc.memoryTypeIndex = getMemoryTypeIndex(memReqs.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
VK_CHECK_RESULT(vkAllocateMemory(device, &memAlloc, nullptr, &vertices.memory));
VK_CHECK_RESULT(vkBindBufferMemory(device, vertices.buffer, vertices.memory, 0));
// Index buffer
VkBufferCreateInfo indexbufferCI{};
indexbufferCI.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
indexbufferCI.size = indexBufferSize;
indexbufferCI.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
// Copy index data to a buffer visible to the host (staging buffer)
VK_CHECK_RESULT(vkCreateBuffer(device, &indexbufferCI, nullptr, &stagingBuffers.indices.buffer));
vkGetBufferMemoryRequirements(device, stagingBuffers.indices.buffer, &memReqs);
memAlloc.allocationSize = memReqs.size;
memAlloc.memoryTypeIndex = getMemoryTypeIndex(memReqs.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
VK_CHECK_RESULT(vkAllocateMemory(device, &memAlloc, nullptr, &stagingBuffers.indices.memory));
VK_CHECK_RESULT(vkMapMemory(device, stagingBuffers.indices.memory, 0, indexBufferSize, 0, &data));
memcpy(data, indexBuffer.data(), indexBufferSize);
vkUnmapMemory(device, stagingBuffers.indices.memory);
VK_CHECK_RESULT(vkBindBufferMemory(device, stagingBuffers.indices.buffer, stagingBuffers.indices.memory, 0));
// Create destination buffer with device only visibility
indexbufferCI.usage = VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT;
VK_CHECK_RESULT(vkCreateBuffer(device, &indexbufferCI, nullptr, &indices.buffer));
vkGetBufferMemoryRequirements(device, indices.buffer, &memReqs);
memAlloc.allocationSize = memReqs.size;
memAlloc.memoryTypeIndex = getMemoryTypeIndex(memReqs.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
VK_CHECK_RESULT(vkAllocateMemory(device, &memAlloc, nullptr, &indices.memory));
VK_CHECK_RESULT(vkBindBufferMemory(device, indices.buffer, indices.memory, 0));
// Buffer copies have to be submitted to a queue, so we need a command buffer for them
// Note: Some devices offer a dedicated transfer queue (with only the transfer bit set) that may be faster when doing lots of copies
VkCommandBuffer copyCmd;
VkCommandBufferAllocateInfo cmdBufAllocateInfo{};
cmdBufAllocateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
cmdBufAllocateInfo.commandPool = commandPool;
cmdBufAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
cmdBufAllocateInfo.commandBufferCount = 1;
VK_CHECK_RESULT(vkAllocateCommandBuffers(device, &cmdBufAllocateInfo, &copyCmd));
VkCommandBufferBeginInfo cmdBufInfo = vks::initializers::commandBufferBeginInfo();
VK_CHECK_RESULT(vkBeginCommandBuffer(copyCmd, &cmdBufInfo));
// Put buffer region copies into command buffer
VkBufferCopy copyRegion{};
// Vertex buffer
copyRegion.size = vertexBufferSize;
vkCmdCopyBuffer(copyCmd, stagingBuffers.vertices.buffer, vertices.buffer, 1, &copyRegion);
// Index buffer
copyRegion.size = indexBufferSize;
vkCmdCopyBuffer(copyCmd, stagingBuffers.indices.buffer, indices.buffer, 1, &copyRegion);
VK_CHECK_RESULT(vkEndCommandBuffer(copyCmd));
// Submit the command buffer to the queue to finish the copy
VkSubmitInfo submitInfo{};
submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
submitInfo.commandBufferCount = 1;
submitInfo.pCommandBuffers = &copyCmd;
// Create fence to ensure that the command buffer has finished executing
VkFenceCreateInfo fenceCI{};
fenceCI.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
fenceCI.flags = 0;
VkFence fence;
VK_CHECK_RESULT(vkCreateFence(device, &fenceCI, nullptr, &fence));
// Submit to the queue
VK_CHECK_RESULT(vkQueueSubmit(queue, 1, &submitInfo, fence));
// Wait for the fence to signal that command buffer has finished executing
VK_CHECK_RESULT(vkWaitForFences(device, 1, &fence, VK_TRUE, DEFAULT_FENCE_TIMEOUT));
vkDestroyFence(device, fence, nullptr);
vkFreeCommandBuffers(device, commandPool, 1, &copyCmd);
// Destroy staging buffers
// Note: Staging buffer must not be deleted before the copies have been submitted and executed
vkDestroyBuffer(device, stagingBuffers.vertices.buffer, nullptr);
vkFreeMemory(device, stagingBuffers.vertices.memory, nullptr);
vkDestroyBuffer(device, stagingBuffers.indices.buffer, nullptr);
vkFreeMemory(device, stagingBuffers.indices.memory, nullptr);
Breakdown
Sascha gave very detailed comments on the code, but I still found some parts unclear. This blog works as my personal notebook, in which I try to write down everything I learn along the journey. The standard transfer procedure has the following steps.
- Create a host-visible buffer on the CPU side, called the staging buffer.
- Allocate a device-local buffer on the GPU to store the data.
- Copy the data into the host-visible staging buffer.
- Transfer the data from the staging buffer to the device-local GPU memory.
- Clean up the staging buffer.
The Vulkan code basically follows the structure above while adding a lot of details. Let us go through the code snippets.
Direct writes to GPU memory from the CPU are often unsupported on discrete GPUs, or they incur significant overheads due to the physical separation of CPU and GPU memory.
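You can verify this on your own hardware by enumerating the memory types the physical device exposes. Below is a minimal sketch (not part of Sascha's sample) that assumes a physicalDevice handle is available, as it is elsewhere in the sample framework:
// Sketch: check whether any memory type is both DEVICE_LOCAL and HOST_VISIBLE.
// On many discrete GPUs no such type exists (outside of a small BAR heap),
// which is why the staging copy below is needed.
VkPhysicalDeviceMemoryProperties memProps;
vkGetPhysicalDeviceMemoryProperties(physicalDevice, &memProps);
bool hasHostVisibleDeviceLocal = false;
for (uint32_t i = 0; i < memProps.memoryTypeCount; i++) {
    VkMemoryPropertyFlags flags = memProps.memoryTypes[i].propertyFlags;
    if ((flags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) && (flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)) {
        hasHostVisibleDeviceLocal = true;
    }
}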
Input Data
std::vector<Vertex> vertexBuffer{
{ { 1.0f, 1.0f, 0.0f }, { 1.0f, 0.0f, 0.0f } },
{ { -1.0f, 1.0f, 0.0f }, { 0.0f, 1.0f, 0.0f } },
{ { 0.0f, -1.0f, 0.0f }, { 0.0f, 0.0f, 1.0f } }
};
uint32_t vertexBufferSize = static_cast<uint32_t>(vertexBuffer.size()) * sizeof(Vertex);
// Setup indices
std::vector<uint32_t> indexBuffer{ 0, 1, 2 };
The CPU-side data is hard-coded for convenience; each entry in the vertex buffer holds both a position and an RGB color.
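The Vertex struct itself is not shown in the snippet. A plausible definition, consistent with the initializer list above (position followed by color), would be:
// Hypothetical Vertex layout matching the initializer above:
// three floats for the position, three floats for the RGB color.
struct Vertex {
    float position[3];
    float color[3];
};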
Vertex Staging Buffer
struct StagingBuffer {
VkDeviceMemory memory;
VkBuffer buffer;
};
struct {
StagingBuffer vertices;
StagingBuffer indices;
} stagingBuffers;
void* data;
// Vertex buffer
VkBufferCreateInfo vertexBufferInfoCI{};
vertexBufferInfoCI.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
vertexBufferInfoCI.size = vertexBufferSize;
// Buffer is used as the copy source
vertexBufferInfoCI.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
// Create a host-visible buffer to copy the vertex data to (staging buffer)
VK_CHECK_RESULT(vkCreateBuffer(device, &vertexBufferInfoCI, nullptr, &stagingBuffers.vertices.buffer));
vkGetBufferMemoryRequirements(device, stagingBuffers.vertices.buffer, &memReqs);
memAlloc.allocationSize = memReqs.size;
// Request a host visible memory type that can be used to copy our data to
// Also request it to be coherent, so that writes are visible to the GPU right after unmapping the buffer
memAlloc.memoryTypeIndex = getMemoryTypeIndex(memReqs.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
VK_CHECK_RESULT(vkAllocateMemory(device, &memAlloc, nullptr, &stagingBuffers.vertices.memory));
// Map and copy
VK_CHECK_RESULT(vkMapMemory(device, stagingBuffers.vertices.memory, 0, memAlloc.allocationSize, 0, &data));
memcpy(data, vertexBuffer.data(), vertexBufferSize);
vkUnmapMemory(device, stagingBuffers.vertices.memory);
VK_CHECK_RESULT(vkBindBufferMemory(device, stagingBuffers.vertices.buffer, stagingBuffers.vertices.memory, 0));
// Create a device local buffer to which the (host local) vertex data will be copied and which will be used for rendering
vertexBufferInfoCI.usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT;
VK_CHECK_RESULT(vkCreateBuffer(device, &vertexBufferInfoCI, nullptr, &vertices.buffer));
vkGetBufferMemoryRequirements(device, vertices.buffer, &memReqs);
memAlloc.allocationSize = memReqs.size;
memAlloc.memoryTypeIndex = getMemoryTypeIndex(memReqs.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
VK_CHECK_RESULT(vkAllocateMemory(device, &memAlloc, nullptr, &vertices.memory));
VK_CHECK_RESULT(vkBindBufferMemory(device, vertices.buffer, vertices.memory, 0));
First, we define a StagingBuffer struct; the vertex staging buffer has two fields, memory and buffer. Then we fill in vertexBufferInfoCI to create the buffer of the staging buffer. This create-info pattern is the common procedure in Vulkan for initializing a new object. For memAlloc, we request the memory property flags (host-visible and host-coherent) for the staging buffer's memory. After vkMapMemory, memcpy, vkUnmapMemory, and vkBindBufferMemory, we create the device-local buffer as the destination of the vertex data.
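The getMemoryTypeIndex helper is defined elsewhere in the sample framework; conceptually it scans the physical device's memory types and returns the first one that is allowed by memoryTypeBits and carries all of the requested property flags. A minimal sketch, assuming deviceMemoryProperties was filled earlier with vkGetPhysicalDeviceMemoryProperties:
// Sketch of a typical getMemoryTypeIndex implementation.
uint32_t getMemoryTypeIndex(uint32_t typeBits, VkMemoryPropertyFlags properties)
{
    for (uint32_t i = 0; i < deviceMemoryProperties.memoryTypeCount; i++) {
        // The i-th bit of typeBits says whether memory type i is allowed for
        // this resource; also check that it has all requested property flags.
        if ((typeBits & (1 << i)) && (deviceMemoryProperties.memoryTypes[i].propertyFlags & properties) == properties) {
            return i;
        }
    }
    throw std::runtime_error("Could not find a suitable memory type!");
}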
You might have many questions at this point. Why does the staging buffer have both a memory and a buffer field? Don't we only need the device memory as the destination of the vertex data? Why do we map the memory to data? Why do we need to bind the buffer and the memory?
Memory Mapping
As mentioned before, we cannot directly write to device-local GPU memory from the CPU. It follows that if we want to move data into a chunk of GPU memory, the source of the data must live in memory the GPU can also access. That is why the staging buffer has the two fields buffer and memory instead of one. To access that memory from the CPU, we use the memory mapping mechanism.
VK_CHECK_RESULT(vkMapMemory(device, stagingBuffers.vertices.memory, 0, memAlloc.allocationSize, 0, &data));
memcpy(data, vertexBuffer.data(), vertexBufferSize);
vkUnmapMemory(device, stagingBuffers.vertices.memory);
data is a pointer to the host-visible memory region returned by vkMapMemory. We copy the vertex data into this region with memcpy and then call vkUnmapMemory. Because the memory was requested with HOST_COHERENT, the host writes become visible to the GPU automatically after unmapping; otherwise, an explicit flush would be required.
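For completeness: if the staging memory had been allocated without HOST_COHERENT, we would have to flush the mapped range ourselves before the GPU reads it (while the memory is still mapped), along these lines:
// Sketch: only needed for non-HOST_COHERENT memory, before vkUnmapMemory.
VkMappedMemoryRange flushRange{};
flushRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
flushRange.memory = stagingBuffers.vertices.memory;
flushRange.offset = 0;
flushRange.size = VK_WHOLE_SIZE;
VK_CHECK_RESULT(vkFlushMappedMemoryRanges(device, 1, &flushRange));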
Memory Binding
A buffer is an abstraction that describes the intended usage of a piece of memory. When we bind a buffer to memory, it acts as a resource the Vulkan pipeline can use for specific operations (e.g., storing vertex data). In this case, the buffer is the source of the memory transfer:
vertexBufferInfoCI.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
From my perspective, it is like a container for chemical solutions: we put a label on the tube stating what is inside, without touching the chemicals themselves. After binding the memory and the buffer, we get fine-grained control over the buffer from the CPU side. In the last part of the code, we record vkCmdCopyBuffer into the command buffer, which makes the data transfer happen on the GPU. Once the fence signals, we know the data has been copied into the actual device-local vertex memory.
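After that, the device-local vertices.buffer and indices.buffer are the ones bound for drawing. A sketch of how they are typically used when recording the render command buffer (drawCmdBuffer is an assumed, already-begun command buffer inside a render pass):
// Sketch: bind the device-local buffers and issue the indexed draw.
VkDeviceSize offsets[1] = { 0 };
vkCmdBindVertexBuffers(drawCmdBuffer, 0, 1, &vertices.buffer, offsets);
vkCmdBindIndexBuffer(drawCmdBuffer, indices.buffer, 0, VK_INDEX_TYPE_UINT32);
vkCmdDrawIndexed(drawCmdBuffer, indices.count, 1, 0, 0, 0);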
Fences and semaphores are the basic synchronization primitives in Vulkan. A fence synchronizes the GPU with the CPU, letting the CPU wait until a submitted operation has completed, while a semaphore orders operations between queue submissions on the GPU.
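For contrast with the fence used above, a typical render submission uses semaphores for GPU-to-GPU ordering. A sketch (waitSemaphore, signalSemaphore and drawCmdBuffer are assumed to have been created elsewhere, e.g. during swapchain setup; the names are illustrative):
// Sketch: semaphores order this submission against other GPU work,
// e.g. wait for the acquired swapchain image, signal when rendering is done.
VkPipelineStageFlags waitStage = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
VkSubmitInfo renderSubmit{};
renderSubmit.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
renderSubmit.waitSemaphoreCount = 1;
renderSubmit.pWaitSemaphores = &waitSemaphore;
renderSubmit.pWaitDstStageMask = &waitStage;
renderSubmit.commandBufferCount = 1;
renderSubmit.pCommandBuffers = &drawCmdBuffer;
renderSubmit.signalSemaphoreCount = 1;
renderSubmit.pSignalSemaphores = &signalSemaphore;
// Pass a fence here only if the CPU also needs to wait for this submission,
// as we did for the buffer copy above.
VK_CHECK_RESULT(vkQueueSubmit(queue, 1, &renderSubmit, VK_NULL_HANDLE));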