提问人:ModernEraCaveman 提问时间:7/11/2023 最后编辑:ModernEraCaveman 更新时间:7/11/2023 访问量:53
为什么我的计算着色器没有调整所有输入粒子的位置?Vulkan/GLSL/C++
Why is my compute shader not adjusting the positions of all of the input particles? Vulkan/GLSL/C++
问:
我刚刚弄清楚了如何使用 Vulkan 实现计算着色器。但是,我很难理解为什么我输入到着色器的粒子中只有一小部分在更新。在视频中,int PARTICLE_COUNT = 32000。
YouTube上传的问题。请原谅我没有使用 Imgur,它在过去的几个小时里一直不适合我,并且不允许我创建任何可访问的上传。
计算着色器代码如下:
#version 450
// Camera data embedded in the uniform block below. None of these members are
// read by the compute shader code shown here.
struct camera {
// Named as view / projection matrices — presumably shared with the graphics
// pipeline; confirm against the host-side definition.
mat4 view;
mat4 proj;
// NOTE(review): a vec3 member; its offset and padding must match whatever the
// host writes into the uniform buffer — verify the host-side struct.
vec3 position;
};
// Uniform block at binding 0 (no explicit set qualifier), accessed as `ubo`.
// Only `dt` is read by main(); `model` and `cam` are unused in this shader.
layout(binding = 0) uniform UniformBufferObject {
// Time step that scales both the velocity and the position update in main().
float dt;
mat4 model;
camera cam;
} ubo;
// One particle record made of three vec4 members. The member order
// (position, color, velocity) matches the host-side Particle struct.
struct Particle {
// main() reads and writes the xyz components of position.
vec4 position;
// Not touched by this compute shader.
vec4 color;
// main() integrates the xyz components of velocity.
vec4 velocity;
};
// Read-only particle array (descriptor set 2, binding 0): the state main()
// reads positions from.
layout(std140, set = 2, binding = 0) readonly buffer inSSBO {
Particle particlesIn[ ];
};
// Writable particle array (descriptor set 2, binding 1): the state main()
// writes the updated position and velocity into.
layout(std140, set = 2, binding = 1) buffer outSSBO {
Particle particlesOut[ ];
};
// Workgroup size: 10 * 10 * 10 = 1000 invocations per workgroup.
layout (local_size_x = 10, local_size_y = 10, local_size_z = 10) in;
// Organization and Indexing
uvec3 nWG = gl_NumWorkGroups;
uvec3 sWG = gl_WorkGroupSize;
// i: flattened index of the current workgroup within the dispatch (x varies
// fastest, then y, then z). It is built only from gl_WorkGroupID and
// gl_NumWorkGroups, so every invocation of one workgroup sees the same i.
// Its range is [0, nWG.x * nWG.y * nWG.z).
uint i = gl_WorkGroupID.x + (nWG.x * gl_WorkGroupID.y) + (nWG.x * nWG.y * gl_WorkGroupID.z);
// j: flattened index of the current invocation inside its workgroup; with the
// local size above it lies in [0, 999].
uint j = gl_LocalInvocationID.x + (sWG.x * gl_LocalInvocationID.y) + (sWG.x * sWG.y *gl_LocalInvocationID.z);
// Globals
// Speed constant; main() compares particle speeds against c/2.
const float c = 1.0f;
// Calculates the pull on a body at p1 towards a body at p2.
//
// p1, p2: positions of the two bodies.
// m1, m2: their masses.
// Returns the unit direction from p1 to p2 scaled by (m1 * m2) / distance^2.
// Coincident positions return a zero vector: the direction is undefined there
// and the inverse-square term would divide by zero, which would otherwise feed
// NaN/Inf into the caller's velocity and position.
vec3 Gravity(vec3 p1, vec3 p2, float m1, float m2) {
    vec3 offset = p2 - p1;
    // Squared distance; dot(offset, offset) avoids taking a square root only
    // to square it again.
    float dist2 = dot(offset, offset);
    if (dist2 <= 0.0) {
        return vec3(0.0);
    }
    return normalize(offset) * ((m1 * m2) / dist2);
}
// Advances particle i by one time step using the pull of particle j.
//
// i and j are the shader-global indices declared above main(): i selects the
// particle that is written, j the particle it interacts with.
void main()
{
    // Start from the current position of this particle.
    particlesOut[i].position.xyz = particlesIn[i].position.xyz;

    // Kinematic Motion of the Elements of the System
    // A particle does not interact with itself.
    if (i != j)
    {
        // Interacting Particle Properties (unit masses)
        float m0 = 1.f;
        float m1 = 1.f;
        vec3 p0 = particlesIn[i].position.xyz;
        vec3 p1 = particlesIn[j].position.xyz;

        // Velocity Calculation
        // NOTE(review): this accumulates onto the velocity already stored in
        // particlesOut[i]; particlesIn[i].velocity is never read — confirm
        // that this is intended.
        particlesOut[i].velocity.xyz += Gravity(p0, p1, m0, m1) * ubo.dt;

        // Limit the speed to half the speed of light.
        // normalize() returns the unit vector and does not modify its
        // argument, so the result has to be assigned back. The limit is
        // applied to xyz only, matching the rest of the velocity maths; the w
        // component is left untouched.
        float maxSpeed = c / 2.0;
        if (length(particlesOut[i].velocity.xyz) > maxSpeed)
        {
            particlesOut[i].velocity.xyz = normalize(particlesOut[i].velocity.xyz) * maxSpeed;
        }

        // Integrate the position with the updated velocity.
        particlesOut[i].position.xyz += particlesOut[i].velocity.xyz * ubo.dt;

        // Flip movement at volume border: the simulation volume is [-1, 1]
        // on every axis.
        if ((particlesOut[i].position.x <= -1.0) || (particlesOut[i].position.x >= 1.0)) {
            particlesOut[i].velocity.x = -particlesOut[i].velocity.x;
        }
        if ((particlesOut[i].position.y <= -1.0) || (particlesOut[i].position.y >= 1.0)) {
            particlesOut[i].velocity.y = -particlesOut[i].velocity.y;
        }
        if ((particlesOut[i].position.z <= -1.0) || (particlesOut[i].position.z >= 1.0)) {
            particlesOut[i].velocity.z = -particlesOut[i].velocity.z;
        }
    }
}
计算着色器调度代码如下:
/// Records the compute pass into `commandBuffer`.
///
/// Begins the command buffer, binds the compute pipeline `mPipeline`, binds
/// `setCount` descriptor sets from `sets` starting at set index 0 on `mLayout`,
/// issues one dispatch and ends the command buffer.
///
/// @param commandBuffer command buffer to record into
/// @param setCount      number of descriptor sets in `sets`
/// @param sets          descriptor sets to bind for the compute pipeline
/// @throws std::runtime_error if beginning or ending the command buffer fails
void computeCommand(VkCommandBuffer& commandBuffer, uint32_t setCount, VkDescriptorSet* sets) {
    VkCommandBufferBeginInfo beginInfo{};
    beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;

    const VkResult beginStatus = vkBeginCommandBuffer(commandBuffer, &beginInfo);
    if (beginStatus != VK_SUCCESS) {
        throw std::runtime_error("failed to begin recording command buffer!");
    }

    vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, mPipeline);
    vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, mLayout, 0, setCount, sets, 0, nullptr);

    // The same workgroup count is used on all three axes, so the total number
    // of workgroups is the cube of this value.
    const auto groupsPerAxis = PARTICLE_COUNT / (1000);
    vkCmdDispatch(commandBuffer, groupsPerAxis, groupsPerAxis, groupsPerAxis);

    const VkResult endStatus = vkEndCommandBuffer(commandBuffer);
    if (endStatus != VK_SUCCESS) {
        throw std::runtime_error("failed to record compute command buffer!");
    }
}
以及最后的潜在罪魁祸首,粒子 struct 和数据缓冲区代码:
struct Particle {
glm::vec4 position;
glm::vec4 color;
glm::vec4 velocity;
const static VkPrimitiveTopology topology = VK_PRIMITIVE_TOPOLOGY_POINT_LIST;
static VkVertexInputBindingDescription vkCreateBindings() {
VkVertexInputBindingDescription bindingDescription{};
bindingDescription.binding = 0;
bindingDescription.stride = sizeof(Particle);
bindingDescription.inputRate = VK_VERTEX_INPUT_RATE_VERTEX;
return bindingDescription;
}
static std::array<VkVertexInputAttributeDescription, 2> vkCreateAttributes() {
std::array<VkVertexInputAttributeDescription, 2> attributeDescriptions{};
attributeDescriptions[0].binding = 0;
attributeDescriptions[0].location = 0;
attributeDescriptions[0].format = VK_FORMAT_R32G32B32A32_SFLOAT;
attributeDescriptions[0].offset = offsetof(Particle, position);
attributeDescriptions[1].binding = 0;
attributeDescriptions[1].location = 1;
attributeDescriptions[1].format = VK_FORMAT_R32G32B32A32_SFLOAT;
attributeDescriptions[1].offset = offsetof(Particle, color);
return attributeDescriptions;
}
static VkPipelineVertexInputStateCreateInfo vkCreateVertexInput() {
static auto bindingDescription = vkCreateBindings();
static auto attributeDescriptions = vkCreateAttributes();
VkPipelineVertexInputStateCreateInfo vertexInputInfo
{ VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO };
vertexInputInfo.vertexBindingDescriptionCount = 1;
vertexInputInfo.vertexAttributeDescriptionCount = static_cast<uint32_t>(attributeDescriptions.size());
vertexInputInfo.pVertexBindingDescriptions = &bindingDescription;
vertexInputInfo.pVertexAttributeDescriptions = attributeDescriptions.data();
return vertexInputInfo;
}
};
// SSBO struct initializes and stores an std::vector<Particle> particles;
/// Uploads the particles held by `ssbo` into one device-local buffer per frame
/// in flight.
///
/// The particle data is first written into a host-visible staging buffer of
/// `sizeof(Particle) * PARTICLE_COUNT` bytes, which is then copied into each
/// entry of `Buffer` and released.
///
/// @param ssbo source of the initial particle data (`ssbo.particles`)
/// @throws std::runtime_error if the staging memory cannot be mapped
void createDataBuffer(SSBO& ssbo) {
    void* data = nullptr;
    VkBuffer stagingBuffer;
    VkDeviceMemory stagingBufferMemory;

    Buffer.resize(MAX_FRAMES_IN_FLIGHT);
    Memory.resize(MAX_FRAMES_IN_FLIGHT);
    bufferSize = sizeof(Particle)*PARTICLE_COUNT;

    createBuffer(stagingBuffer, stagingBufferMemory,
        VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
        VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);

    // A failed mapping leaves `data` unusable; release the staging resources
    // and report the failure instead of copying through an invalid pointer.
    if (vkMapMemory(VkGPU::device, stagingBufferMemory, 0, bufferSize, 0, &data) != VK_SUCCESS) {
        vkDestroyBuffer(VkGPU::device, stagingBuffer, nullptr);
        vkFreeMemory(VkGPU::device, stagingBufferMemory, nullptr);
        throw std::runtime_error("failed to map particle staging buffer memory!");
    }

    // Never read past the end of ssbo.particles: copy what is available and
    // zero-fill the rest of the staging buffer if the vector is shorter than
    // PARTICLE_COUNT.
    const size_t stagingBytes = (size_t)bufferSize;
    const size_t availableBytes = ssbo.particles.size() * sizeof(Particle);
    const size_t copyBytes = availableBytes < stagingBytes ? availableBytes : stagingBytes;
    if (copyBytes > 0) {
        memcpy(data, ssbo.particles.data(), copyBytes);
    }
    if (copyBytes < stagingBytes) {
        memset(static_cast<char*>(data) + copyBytes, 0, stagingBytes - copyBytes);
    }
    vkUnmapMemory(VkGPU::device, stagingBufferMemory);

    // Each per-frame buffer is usable as a storage buffer (compute), a vertex
    // buffer (drawing) and a transfer destination (this upload).
    for (size_t i = 0; i < MAX_FRAMES_IN_FLIGHT; i++) {
        createBuffer(Buffer[i], Memory[i],
            VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
            VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
        copyBuffer(stagingBuffer, Buffer[i]);
    }

    // NOTE(review): assumes copyBuffer() has finished its transfer by the time
    // it returns — confirm before relying on this immediate release.
    vkDestroyBuffer(VkGPU::device, stagingBuffer, nullptr);
    vkFreeMemory(VkGPU::device, stagingBufferMemory, nullptr);
}
当 PARTICLE_COUNT = 2000 时,移动的粒子更少。2000 个粒子中最多有 10 个。再次,请原谅我使用 YouTube 进行上传。
我有一种感觉,问题出在计算着色器中的索引上,但我不完全确定。我的另一个想法是,分发给工作组的粒子数量可能是问题的根源,但减少粒子数量只会使问题更加明显。
编辑:修复了计算着色器中未准确更新 ParticlesOut[] 对象位置的行,但那是测试 readonly SSBO 属性时的残余。修复该行对解决问题没有任何影响。
答:
有时,努力工作后休息一下是件好事。经过一顿丰盛的饭菜和休息,我能够用新鲜的眼光看待我的代码,并找出我做错了什么。
在调度计算着色器时,我给每个工作组轴的粒子太少而无法处理。我原来的调度命令是:
// Original dispatch: PARTICLE_COUNT / 1000 workgroups along each of the three axes.
vkCmdDispatch(commandBuffer, PARTICLE_COUNT / (1000), PARTICLE_COUNT / (1000), PARTICLE_COUNT / (1000));
而我的计算着色器的布局是:
// Workgroup size: 10 * 10 * 10 = 1000 invocations per workgroup.
layout (local_size_x = 10, local_size_y = 10, local_size_z = 10) in;
将 dispatch 命令更新为以下代码行可提供要处理的适当数量的粒子:
// Updated dispatch: PARTICLE_COUNT / 100 workgroups along each of the three axes.
vkCmdDispatch(commandBuffer, PARTICLE_COUNT / (100), PARTICLE_COUNT / (100), PARTICLE_COUNT / (100));
我认为问题源于一开始给每个工作组的粒子太少,所以不是每个粒子都被处理。我通过把调度命令的 groupCount 改为 PARTICLE_COUNT / (10)(即进一步减小除数)来测试这一点,这导致了可怕的帧率下降,因为每次调用都必须处理 10 倍以上的粒子。
我不完全清楚 3D 工作组背后的数学原理,但这似乎与其他两个工作组轴的大小有关,其中 groupCount 的除数等于其他两个局部工作组大小的乘积。即 local_size_y = 10 和 local_size_z = 10,所以除数等于 10*10 或 100。如果有人能更好地解释计算背后的数学,我将不胜感激,因为我不完全理解它,超出了我在这里所能解释的内容。
评论