Query Cache: Implement host side sample counting.

This commit is contained in:
Fernando Sahmkow 2023-08-20 17:53:08 +02:00
parent 2fea1b8407
commit c8237d5c31
5 changed files with 348 additions and 48 deletions

View File

@ -41,6 +41,7 @@ set(SHADER_FILES
pitch_unswizzle.comp pitch_unswizzle.comp
present_bicubic.frag present_bicubic.frag
present_gaussian.frag present_gaussian.frag
queries_prefix_scan_sum.comp
resolve_conditional_render.comp resolve_conditional_render.comp
smaa_edge_detection.vert smaa_edge_detection.vert
smaa_edge_detection.frag smaa_edge_detection.frag

View File

@ -0,0 +1,124 @@
// SPDX-FileCopyrightText: Copyright 2015 Graham Sellers, Richard Wright Jr. and Nicholas Haemel
// SPDX-License-Identifier: MIT
// Code obtained from OpenGL SuperBible, Seventh Edition by Graham Sellers, Richard Wright Jr. and
// Nicholas Haemel. Modified to suit needs and optimize for subgroup
#version 460 core
// Backend selection: Vulkan builds receive the two scan parameters as push constants, while
// OpenGL builds receive them as plain uniforms bound at explicit locations.
#ifdef VULKAN
#extension GL_KHR_shader_subgroup_arithmetic : enable
#define HAS_EXTENDED_TYPES 1
#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
#define END_PUSH_CONSTANTS \
} \
;
// Inside a push constant block the members need no per-member qualifier
#define UNIFORM(n)
#define BINDING_INPUT_BUFFER 0
#define BINDING_OUTPUT_IMAGE 1
#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
#extension GL_KHR_shader_subgroup_arithmetic : enable
#extension GL_NV_gpu_shader5 : enable
#ifdef GL_NV_gpu_shader5
#define HAS_EXTENDED_TYPES 1
#else
#define HAS_EXTENDED_TYPES 0
#endif
// There is no push constant block on OpenGL, so the wrapper macros expand to nothing
#define BEGIN_PUSH_CONSTANTS
#define END_PUSH_CONSTANTS
#define UNIFORM(n) layout(location = n) uniform
#define BINDING_INPUT_BUFFER 0
#define BINDING_OUTPUT_IMAGE 0
#endif
// NOTE(review): HAS_EXTENDED_TYPES, BINDING_INPUT_BUFFER and BINDING_OUTPUT_IMAGE are defined
// above but are not referenced by the rest of this shader; the buffer bindings below use
// literal indices instead.
BEGIN_PUSH_CONSTANTS
// Elements with an index below this value get the carried-in accumulated total added to them
UNIFORM(0) uint max_accumulation_base;
// Index of the scanned element that is used to update the accumulated total at the end
UNIFORM(1) uint accumulation_limit;
END_PUSH_CONSTANTS
// One work group is 32 invocations; every invocation handles two elements
layout(local_size_x = 32) in;
// Values to scan. Each element is a 64-bit counter stored as two 32-bit words
// (x = low word, y = high word).
// The array is runtime-sized: every invocation indexes elements `id * 2` and `id * 2 + 1`,
// so a work group touches twice as many elements as it has invocations. Declaring the array
// with only gl_WorkGroupSize.x elements made the upper half of those accesses out of bounds.
layout(std430, binding = 0) readonly buffer block1 {
    uvec2 input_data[];
};

// Receives the scanned values, using the same element layout and indexing as input_data.
layout(std430, binding = 1) writeonly coherent buffer block2 {
    uvec2 output_data[];
};

// Single 64-bit total that main() reads on entry and rewrites before it finishes.
layout(std430, binding = 2) coherent buffer block3 {
    uvec2 accumulated_data;
};

// Scratch space for the scan: two elements per invocation of the work group.
shared uvec2 shared_data[gl_WorkGroupSize.x * 2];
// Adds two 64-bit unsigned values that are each packed into a uvec2
// (x = low 32 bits, y = high 32 bits). The carry out of the low words is
// folded into the high word; overflow of the high word wraps around.
uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
    uint low_carry = 0;
    const uint low_word = uaddCarry(value_1.x, value_2.x, low_carry);
    const uint high_word = value_1.y + value_2.y + low_carry;
    return uvec2(low_word, high_word);
}
// Computes value_1 - value_2 for 64-bit unsigned values packed into uvec2
// (x = low 32 bits, y = high 32 bits), wrapping modulo 2^64.
// Negating a uvec2 with unary minus negates each 32-bit word on its own, which is not a
// 64-bit negation, so the borrow has to be propagated explicitly.
uvec2 SubUint64(uvec2 value_1, uvec2 value_2) {
    uint borrow = 0;
    uvec2 result;
    result.x = usubBorrow(value_1.x, value_2.x, borrow);
    result.y = value_1.y - value_2.y - borrow;
    return result;
}

// Scans 2 * gl_WorkGroupSize.x 64-bit values from input_data into output_data so that each
// output element holds the sum of all input elements up to and including its own index, adds
// the carried-in accumulated_data to every element below max_accumulation_base, and finally
// stores a new accumulated_data for the next run.
// NOTE(review): only gl_LocalInvocationID is used, so every dispatched work group processes the
// same elements; confirm that the host never relies on more than one group doing distinct work.
void main(void) {
    const uint id = gl_LocalInvocationID.x;
    // Each invocation is responsible for two consecutive elements
    const uint slot_1 = id * 2u;
    const uint slot_2 = slot_1 + 1u;

    // Read the carried-in total before anything overwrites it. It only applies to elements
    // located before max_accumulation_base.
    const uvec2 base_value_1 = slot_1 < max_accumulation_base ? accumulated_data : uvec2(0);
    const uvec2 base_value_2 = slot_2 < max_accumulation_base ? accumulated_data : uvec2(0);

    shared_data[slot_1] = input_data[slot_1];
    shared_data[slot_2] = input_data[slot_2];
    // Make sure every invocation has published its elements before the scan starts
    barrier();
    memoryBarrierShared();

    // The number of steps is log2 of the element count (twice the work group size, which
    // should be a power of 2)
    const uint steps = uint(log2(float(gl_WorkGroupSize.x))) + 1u;
    for (uint pass = 0u; pass < steps; pass++) {
        // In every pass, blocks of (1 << pass) elements are merged pairwise: the last element
        // of the left block is added to one element of the right block.
        const uint mask = (1u << pass) - 1u;
        const uint rd_id = ((id >> pass) << (pass + 1u)) + mask;
        const uint wr_id = rd_id + 1u + (id & mask);
        shared_data[wr_id] = AddUint64(shared_data[rd_id], shared_data[wr_id]);
        // Wait until the whole work group has finished this pass
        barrier();
        memoryBarrierShared();
    }

    // Apply the carried-in total
    shared_data[slot_1] = AddUint64(shared_data[slot_1], base_value_1);
    shared_data[slot_2] = AddUint64(shared_data[slot_2], base_value_2);
    // Invocation 0 reads elements owned by other invocations below, so synchronize once more
    barrier();
    memoryBarrierShared();

    // Write the results back to the output buffer
    output_data[slot_1] = shared_data[slot_1];
    output_data[slot_2] = shared_data[slot_2];

    // A single invocation updates the running total for the next run
    if (id == 0u) {
        if (max_accumulation_base >= accumulation_limit + 1u) {
            accumulated_data = shared_data[accumulation_limit];
            return;
        }
        const uvec2 value_1 = shared_data[max_accumulation_base];
        const uvec2 value_2 = shared_data[accumulation_limit];
        // NOTE(review): operand order is kept as written (value_1 - value_2); confirm that this
        // is the intended direction of the difference.
        accumulated_data = SubUint64(value_1, value_2);
    }
}

View File

@ -12,6 +12,7 @@
#include "common/common_types.h" #include "common/common_types.h"
#include "common/div_ceil.h" #include "common/div_ceil.h"
#include "video_core/host_shaders/astc_decoder_comp_spv.h" #include "video_core/host_shaders/astc_decoder_comp_spv.h"
#include "video_core/host_shaders/queries_prefix_scan_sum_comp_spv.h"
#include "video_core/host_shaders/resolve_conditional_render_comp_spv.h" #include "video_core/host_shaders/resolve_conditional_render_comp_spv.h"
#include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h" #include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h"
#include "video_core/host_shaders/vulkan_uint8_comp_spv.h" #include "video_core/host_shaders/vulkan_uint8_comp_spv.h"
@ -58,6 +59,30 @@ constexpr std::array<VkDescriptorSetLayoutBinding, 2> INPUT_OUTPUT_DESCRIPTOR_SE
}, },
}}; }};
// Descriptor set layout of the queries prefix scan pass: three single storage buffers at
// bindings 0, 1 and 2, all of them visible to the compute stage only.
// QueriesPrefixScanPass::Run fills them in this order: source buffer, destination buffer,
// accumulation buffer.
constexpr std::array<VkDescriptorSetLayoutBinding, 3> QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS{{
{
.binding = 0,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.pImmutableSamplers = nullptr,
},
{
.binding = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.pImmutableSamplers = nullptr,
},
{
.binding = 2,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.pImmutableSamplers = nullptr,
},
}};
constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{ constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{
.uniform_buffers = 0, .uniform_buffers = 0,
.storage_buffers = 2, .storage_buffers = 2,
@ -68,6 +93,16 @@ constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{
.score = 2, .score = 2,
}; };
// Descriptor usage of the queries prefix scan pass, handed to the ComputePass base class:
// three storage buffers and no other descriptor kind.
// NOTE(review): the meaning of `score` is defined by DescriptorBankInfo, which is not visible
// here; it equals the number of storage buffers, like in INPUT_OUTPUT_BANK_INFO.
constexpr DescriptorBankInfo QUERIES_SCAN_BANK_INFO{
.uniform_buffers = 0,
.storage_buffers = 3,
.texture_buffers = 0,
.image_buffers = 0,
.textures = 0,
.images = 0,
.score = 3,
};
constexpr std::array<VkDescriptorSetLayoutBinding, ASTC_NUM_BINDINGS> ASTC_DESCRIPTOR_SET_BINDINGS{{ constexpr std::array<VkDescriptorSetLayoutBinding, ASTC_NUM_BINDINGS> ASTC_DESCRIPTOR_SET_BINDINGS{{
{ {
.binding = ASTC_BINDING_INPUT_BUFFER, .binding = ASTC_BINDING_INPUT_BUFFER,
@ -104,6 +139,15 @@ constexpr VkDescriptorUpdateTemplateEntry INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLAT
.stride = sizeof(DescriptorUpdateEntry), .stride = sizeof(DescriptorUpdateEntry),
}; };
// Update template entry for the queries prefix scan pass. One entry updates all three storage
// buffer descriptors, starting at binding 0, from consecutive DescriptorUpdateEntry records
// located at the beginning of the descriptor data.
constexpr VkDescriptorUpdateTemplateEntry QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE{
.dstBinding = 0,
.dstArrayElement = 0,
.descriptorCount = 3,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.offset = 0,
.stride = sizeof(DescriptorUpdateEntry),
};
constexpr std::array<VkDescriptorUpdateTemplateEntry, ASTC_NUM_BINDINGS> constexpr std::array<VkDescriptorUpdateTemplateEntry, ASTC_NUM_BINDINGS>
ASTC_PASS_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY{{ ASTC_PASS_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY{{
{ {
@ -132,6 +176,11 @@ struct AstcPushConstants {
u32 block_height; u32 block_height;
u32 block_height_mask; u32 block_height_mask;
}; };
// Push constants consumed by the queries prefix scan shader. The member order has to match the
// order in which the shader declares its two parameters.
struct QueriesPrefixScanPushConstants {
// Elements with an index below this value get the previously accumulated total added
u32 max_accumulation_base;
// Index of the last valid element; QueriesPrefixScanPass::Run passes number_of_sums - 1
u32 accumulation_limit;
};
} // Anonymous namespace } // Anonymous namespace
ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool, ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool,
@ -313,8 +362,6 @@ ConditionalRenderingResolvePass::ConditionalRenderingResolvePass(
void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_buffer, void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_buffer,
u32 src_offset, bool compare_to_zero) { u32 src_offset, bool compare_to_zero) {
scheduler.RequestOutsideRenderPassOperationContext();
const size_t compare_size = compare_to_zero ? 8 : 24; const size_t compare_size = compare_to_zero ? 8 : 24;
compute_pass_descriptor_queue.Acquire(); compute_pass_descriptor_queue.Acquire();
@ -327,7 +374,7 @@ void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_
static constexpr VkMemoryBarrier read_barrier{ static constexpr VkMemoryBarrier read_barrier{
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = nullptr, .pNext = nullptr,
.srcAccessMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
}; };
static constexpr VkMemoryBarrier write_barrier{ static constexpr VkMemoryBarrier write_barrier{
@ -349,6 +396,63 @@ void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_
}); });
} }
// Builds the compute pass for the queries prefix scan shader: forwards the three-storage-buffer
// layout, its update template and bank info, a push constant range sized for
// QueriesPrefixScanPushConstants and the shader's SPIR-V to the ComputePass base class, and
// keeps references to the scheduler and the descriptor queue for later use.
QueriesPrefixScanPass::QueriesPrefixScanPass(
const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_,
ComputePassDescriptorQueue& compute_pass_descriptor_queue_)
: ComputePass(device_, descriptor_pool_, QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS,
QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE, QUERIES_SCAN_BANK_INFO,
COMPUTE_PUSH_CONSTANT_RANGE<sizeof(QueriesPrefixScanPushConstants)>,
QUERIES_PREFIX_SCAN_SUM_COMP_SPV),
scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {}
/// Records a dispatch of the queries prefix scan shader.
///
/// @param accumulation_buffer    Buffer holding the single 64-bit running total that the shader
///                               reads and rewrites (binding 2).
/// @param dst_buffer             Buffer that receives the scanned values (binding 1).
/// @param src_buffer             Buffer with the 64-bit values to scan (binding 0).
/// @param number_of_sums         Number of valid 64-bit values in src_buffer. Nothing is
///                               recorded when this is zero.
/// @param max_accumulation_limit Forwarded to the shader as max_accumulation_base: elements
///                               below this index get the running total added.
void QueriesPrefixScanPass::Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer,
                                VkBuffer src_buffer, size_t number_of_sums,
                                size_t max_accumulation_limit) {
    if (number_of_sums == 0) {
        // `number_of_sums - 1` below would wrap around and the descriptors would get a
        // zero-sized range; there is nothing to scan anyway.
        return;
    }
    // The shader works on whole groups of 32 values
    const size_t aligned_runs = Common::AlignUp(number_of_sums, 32);

    // Descriptor order has to match the shader bindings: input, output, accumulation
    compute_pass_descriptor_queue.Acquire();
    compute_pass_descriptor_queue.AddBuffer(src_buffer, 0, aligned_runs * sizeof(u64));
    compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, aligned_runs * sizeof(u64));
    compute_pass_descriptor_queue.AddBuffer(accumulation_buffer, 0, sizeof(u64));
    const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()};

    scheduler.RequestOutsideRenderPassOperationContext();
    scheduler.Record([this, descriptor_data, max_accumulation_limit, number_of_sums,
                      aligned_runs](vk::CommandBuffer cmdbuf) {
        // The source buffer is filled by transfer commands, while the accumulation buffer is
        // written by the previous dispatch of this very shader, so both kinds of writes have
        // to be made visible before the shader runs.
        static constexpr VkMemoryBarrier read_barrier{
            .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
            .pNext = nullptr,
            .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT,
            .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
        };
        static constexpr VkMemoryBarrier write_barrier{
            .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
            .pNext = nullptr,
            .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
            .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT |
                             VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT |
                             VK_ACCESS_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_INDEX_READ_BIT |
                             VK_ACCESS_UNIFORM_READ_BIT |
                             VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT,
        };
        const QueriesPrefixScanPushConstants uniforms{
            .max_accumulation_base = static_cast<u32>(max_accumulation_limit),
            .accumulation_limit = static_cast<u32>(number_of_sums - 1),
        };
        const VkDescriptorSet set = descriptor_allocator.Commit();
        device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data);

        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
                               VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, read_barrier);
        cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
        cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {});
        cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, uniforms);
        // NOTE(review): the shader indexes by local invocation only; confirm that dispatching
        // more than one work group is intended.
        cmdbuf.Dispatch(static_cast<u32>(aligned_runs / 32U), 1, 1);
        // Every access type in write_barrier.dstAccessMask has to be supported by a stage in
        // the destination stage mask. The conditional rendering stage alone supports none of
        // the transfer, vertex, index, indirect or uniform reads listed there, so all commands
        // are used as the destination scope.
        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                               VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, write_barrier);
    });
}
ASTCDecoderPass::ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, ASTCDecoderPass::ASTCDecoderPass(const Device& device_, Scheduler& scheduler_,
DescriptorPool& descriptor_pool_, DescriptorPool& descriptor_pool_,
StagingBufferPool& staging_buffer_pool_, StagingBufferPool& staging_buffer_pool_,

View File

@ -95,6 +95,20 @@ private:
ComputePassDescriptorQueue& compute_pass_descriptor_queue; ComputePassDescriptorQueue& compute_pass_descriptor_queue;
}; };
// Compute pass that runs the queries prefix scan shader over a buffer of 64-bit values.
class QueriesPrefixScanPass final : public ComputePass {
public:
explicit QueriesPrefixScanPass(const Device& device_, Scheduler& scheduler_,
DescriptorPool& descriptor_pool_,
ComputePassDescriptorQueue& compute_pass_descriptor_queue_);
// Records one scan: reads `number_of_sums` values from src_buffer, writes the results to
// dst_buffer and uses accumulation_buffer as the running total shared between runs.
void Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, VkBuffer src_buffer,
size_t number_of_sums, size_t max_accumulation_limit);
private:
// Used to record the barriers and the dispatch
Scheduler& scheduler;
// Used to stage the three buffer descriptors of a run
ComputePassDescriptorQueue& compute_pass_descriptor_queue;
};
class ASTCDecoderPass final : public ComputePass { class ASTCDecoderPass final : public ComputePass {
public: public:
explicit ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, explicit ASTCDecoderPass(const Device& device_, Scheduler& scheduler_,

View File

@ -11,6 +11,7 @@
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "common/bit_util.h"
#include "common/common_types.h" #include "common/common_types.h"
#include "core/memory.h" #include "core/memory.h"
#include "video_core/engines/draw_manager.h" #include "video_core/engines/draw_manager.h"
@ -112,14 +113,34 @@ class SamplesStreamer : public BaseStreamer {
public: public:
explicit SamplesStreamer(size_t id_, QueryCacheRuntime& runtime_, explicit SamplesStreamer(size_t id_, QueryCacheRuntime& runtime_,
VideoCore::RasterizerInterface* rasterizer_, const Device& device_, VideoCore::RasterizerInterface* rasterizer_, const Device& device_,
Scheduler& scheduler_, const MemoryAllocator& memory_allocator_) Scheduler& scheduler_, const MemoryAllocator& memory_allocator_,
ComputePassDescriptorQueue& compute_pass_descriptor_queue,
DescriptorPool& descriptor_pool)
: BaseStreamer(id_), runtime{runtime_}, rasterizer{rasterizer_}, device{device_}, : BaseStreamer(id_), runtime{runtime_}, rasterizer{rasterizer_}, device{device_},
scheduler{scheduler_}, memory_allocator{memory_allocator_} { scheduler{scheduler_}, memory_allocator{memory_allocator_} {
BuildResolveBuffer();
current_bank = nullptr; current_bank = nullptr;
current_query = nullptr; current_query = nullptr;
ammend_value = 0; ammend_value = 0;
acumulation_value = 0; acumulation_value = 0;
queries_prefix_scan_pass = std::make_unique<QueriesPrefixScanPass>(
device, scheduler, descriptor_pool, compute_pass_descriptor_queue);
const VkBufferCreateInfo buffer_ci = {
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
.pNext = nullptr,
.flags = 0,
.size = 8,
.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
.sharingMode = VK_SHARING_MODE_EXCLUSIVE,
.queueFamilyIndexCount = 0,
.pQueueFamilyIndices = nullptr,
};
accumulation_buffer = memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal);
scheduler.RequestOutsideRenderPassOperationContext();
scheduler.Record([buffer = *accumulation_buffer](vk::CommandBuffer cmdbuf) {
cmdbuf.FillBuffer(buffer, 0, 8, 0);
});
} }
~SamplesStreamer() = default; ~SamplesStreamer() = default;
@ -159,6 +180,8 @@ public:
acumulation_value = 0; acumulation_value = 0;
}); });
rasterizer->SyncOperation(std::move(func)); rasterizer->SyncOperation(std::move(func));
accumulation_since_last_sync = false;
last_accumulation_checkpoint = std::min(last_accumulation_checkpoint, num_slots_used);
} }
void CloseCounter() override { void CloseCounter() override {
@ -175,7 +198,8 @@ public:
} }
for (size_t i = 0; i < sync_values_stash.size(); i++) { for (size_t i = 0; i < sync_values_stash.size(); i++) {
runtime.template SyncValues<HostSyncValues>(sync_values_stash[i], *resolve_buffers[i]); runtime.template SyncValues<HostSyncValues>(sync_values_stash[i],
*buffers[resolve_buffers[i]]);
} }
sync_values_stash.clear(); sync_values_stash.clear();
@ -189,36 +213,21 @@ public:
sync_values_stash.clear(); sync_values_stash.clear();
sync_values_stash.emplace_back(); sync_values_stash.emplace_back();
std::vector<HostSyncValues>* sync_values = &sync_values_stash.back(); std::vector<HostSyncValues>* sync_values = &sync_values_stash.back();
sync_values->reserve(resolve_slots * SamplesQueryBank::BANK_SIZE); sync_values->reserve(num_slots_used);
std::unordered_map<size_t, std::pair<size_t, size_t>> offsets; std::unordered_map<size_t, std::pair<size_t, size_t>> offsets;
size_t this_bank_slot = std::numeric_limits<size_t>::max(); resolve_buffers.clear();
size_t resolve_slots_remaining = resolve_slots; size_t resolve_buffer_index = ObtainBuffer<true>(num_slots_used);
size_t resolve_buffer_index = 0; resolve_buffers.push_back(resolve_buffer_index);
size_t base_offset = 0;
ApplyBanksWideOp<true>(pending_sync, [&](SamplesQueryBank* bank, size_t start, ApplyBanksWideOp<true>(pending_sync, [&](SamplesQueryBank* bank, size_t start,
size_t amount) { size_t amount) {
size_t bank_id = bank->GetIndex(); size_t bank_id = bank->GetIndex();
if (this_bank_slot != bank_id) { auto& resolve_buffer = buffers[resolve_buffer_index];
this_bank_slot = bank_id;
if (resolve_slots_remaining == 0) {
resolve_buffer_index++;
if (resolve_buffer_index >= resolve_buffers.size()) {
BuildResolveBuffer();
}
resolve_slots_remaining = resolve_slots;
sync_values_stash.emplace_back();
sync_values = &sync_values_stash.back();
sync_values->reserve(resolve_slots * SamplesQueryBank::BANK_SIZE);
}
resolve_slots_remaining--;
}
auto& resolve_buffer = resolve_buffers[resolve_buffer_index];
const size_t base_offset = SamplesQueryBank::QUERY_SIZE * SamplesQueryBank::BANK_SIZE *
(resolve_slots - resolve_slots_remaining - 1);
VkQueryPool query_pool = bank->GetInnerPool(); VkQueryPool query_pool = bank->GetInnerPool();
scheduler.RequestOutsideRenderPassOperationContext(); scheduler.RequestOutsideRenderPassOperationContext();
scheduler.Record([start, amount, base_offset, query_pool, scheduler.Record([start, amount, base_offset, query_pool,
buffer = *resolve_buffer](vk::CommandBuffer cmdbuf) { buffer = *resolve_buffer](vk::CommandBuffer cmdbuf) {
size_t final_offset = base_offset + start * SamplesQueryBank::QUERY_SIZE;
const VkBufferMemoryBarrier copy_query_pool_barrier{ const VkBufferMemoryBarrier copy_query_pool_barrier{
.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
.pNext = nullptr, .pNext = nullptr,
@ -227,39 +236,60 @@ public:
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.buffer = buffer, .buffer = buffer,
.offset = final_offset, .offset = base_offset,
.size = amount * SamplesQueryBank::QUERY_SIZE, .size = amount * SamplesQueryBank::QUERY_SIZE,
}; };
cmdbuf.CopyQueryPoolResults( cmdbuf.CopyQueryPoolResults(
query_pool, static_cast<u32>(start), static_cast<u32>(amount), buffer, query_pool, static_cast<u32>(start), static_cast<u32>(amount), buffer,
static_cast<u32>(final_offset), SamplesQueryBank::QUERY_SIZE, static_cast<u32>(base_offset), SamplesQueryBank::QUERY_SIZE,
VK_QUERY_RESULT_WAIT_BIT | VK_QUERY_RESULT_64_BIT); VK_QUERY_RESULT_WAIT_BIT | VK_QUERY_RESULT_64_BIT);
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_PIPELINE_STAGE_TRANSFER_BIT, 0, copy_query_pool_barrier); VK_PIPELINE_STAGE_TRANSFER_BIT, 0, copy_query_pool_barrier);
}); });
offsets[bank_id] = {sync_values_stash.size() - 1, base_offset}; offsets[bank_id] = {start, base_offset};
base_offset += amount * SamplesQueryBank::QUERY_SIZE;
}); });
// Convert queries // Convert queries
bool has_multi_queries = false;
for (auto q : pending_sync) { for (auto q : pending_sync) {
auto* query = GetQuery(q); auto* query = GetQuery(q);
size_t sync_value_slot = 0;
if (True(query->flags & VideoCommon::QueryFlagBits::IsRewritten)) { if (True(query->flags & VideoCommon::QueryFlagBits::IsRewritten)) {
continue; continue;
} }
if (True(query->flags & VideoCommon::QueryFlagBits::IsInvalidated)) { if (True(query->flags & VideoCommon::QueryFlagBits::IsInvalidated)) {
continue; continue;
} }
if (query->size_slots > 1) { if (accumulation_since_last_sync || query->size_slots > 1) {
// This is problematic. if (!has_multi_queries) {
// UNIMPLEMENTED(); has_multi_queries = true;
sync_values_stash.emplace_back();
}
sync_value_slot = 1;
} }
query->flags |= VideoCommon::QueryFlagBits::IsHostSynced; query->flags |= VideoCommon::QueryFlagBits::IsHostSynced;
auto loc_data = offsets[query->start_bank_id]; auto loc_data = offsets[query->start_bank_id];
sync_values_stash[loc_data.first].emplace_back(HostSyncValues{ sync_values_stash[sync_value_slot].emplace_back(HostSyncValues{
.address = query->guest_address, .address = query->guest_address,
.size = SamplesQueryBank::QUERY_SIZE, .size = SamplesQueryBank::QUERY_SIZE,
.offset = loc_data.second + query->start_slot * SamplesQueryBank::QUERY_SIZE, .offset =
loc_data.second + (query->start_slot - loc_data.first + query->size_slots - 1) *
SamplesQueryBank::QUERY_SIZE,
});
}
if (has_multi_queries) {
size_t intermediary_buffer_index = ObtainBuffer<false>(num_slots_used);
resolve_buffers.push_back(intermediary_buffer_index);
queries_prefix_scan_pass->Run(*accumulation_buffer, *buffers[intermediary_buffer_index],
*buffers[resolve_buffer_index], num_slots_used,
std::min(last_accumulation_checkpoint, num_slots_used));
} else {
scheduler.RequestOutsideRenderPassOperationContext();
scheduler.Record([buffer = *accumulation_buffer](vk::CommandBuffer cmdbuf) {
cmdbuf.FillBuffer(buffer, 0, 8, 0);
}); });
} }
@ -267,6 +297,9 @@ public:
std::function<void()> func([this] { ammend_value = acumulation_value; }); std::function<void()> func([this] { ammend_value = acumulation_value; });
rasterizer->SyncOperation(std::move(func)); rasterizer->SyncOperation(std::move(func));
AbandonCurrentQuery(); AbandonCurrentQuery();
num_slots_used = 0;
last_accumulation_checkpoint = std::numeric_limits<size_t>::max();
accumulation_since_last_sync = has_multi_queries;
pending_sync.clear(); pending_sync.clear();
} }
@ -400,6 +433,7 @@ private:
void ReserveHostQuery() { void ReserveHostQuery() {
size_t new_slot = ReserveBankSlot(); size_t new_slot = ReserveBankSlot();
current_bank->AddReference(1); current_bank->AddReference(1);
num_slots_used++;
if (current_query) { if (current_query) {
size_t bank_id = current_query->start_bank_id; size_t bank_id = current_query->start_bank_id;
size_t banks_set = current_query->size_banks - 1; size_t banks_set = current_query->size_banks - 1;
@ -470,23 +504,37 @@ private:
}); });
} }
void BuildResolveBuffer() { template <bool is_resolve>
size_t ObtainBuffer(size_t num_needed) {
const size_t log_2 = std::max<size_t>(6U, Common::Log2Ceil64(num_needed));
if constexpr (is_resolve) {
if (resolve_table[log_2] != 0) {
return resolve_table[log_2] - 1;
}
} else {
if (intermediary_table[log_2] != 0) {
return intermediary_table[log_2] - 1;
}
}
const VkBufferCreateInfo buffer_ci = { const VkBufferCreateInfo buffer_ci = {
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
.pNext = nullptr, .pNext = nullptr,
.flags = 0, .flags = 0,
.size = SamplesQueryBank::QUERY_SIZE * SamplesQueryBank::BANK_SIZE * resolve_slots, .size = SamplesQueryBank::QUERY_SIZE * (1ULL << log_2),
.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
.sharingMode = VK_SHARING_MODE_EXCLUSIVE, .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
.queueFamilyIndexCount = 0, .queueFamilyIndexCount = 0,
.pQueueFamilyIndices = nullptr, .pQueueFamilyIndices = nullptr,
}; };
resolve_buffers.emplace_back( buffers.emplace_back(memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal));
memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal)); if constexpr (is_resolve) {
resolve_table[log_2] = buffers.size();
} else {
intermediary_table[log_2] = buffers.size();
}
return buffers.size() - 1;
} }
static constexpr size_t resolve_slots = 8;
QueryCacheRuntime& runtime; QueryCacheRuntime& runtime;
VideoCore::RasterizerInterface* rasterizer; VideoCore::RasterizerInterface* rasterizer;
@ -494,8 +542,12 @@ private:
Scheduler& scheduler; Scheduler& scheduler;
const MemoryAllocator& memory_allocator; const MemoryAllocator& memory_allocator;
VideoCommon::BankPool<SamplesQueryBank> bank_pool; VideoCommon::BankPool<SamplesQueryBank> bank_pool;
std::deque<vk::Buffer> resolve_buffers; std::deque<vk::Buffer> buffers;
std::array<size_t, 32> resolve_table{};
std::array<size_t, 32> intermediary_table{};
vk::Buffer accumulation_buffer;
std::deque<std::vector<HostSyncValues>> sync_values_stash; std::deque<std::vector<HostSyncValues>> sync_values_stash;
std::vector<size_t> resolve_buffers;
// syncing queue // syncing queue
std::vector<size_t> pending_sync; std::vector<size_t> pending_sync;
@ -510,10 +562,14 @@ private:
SamplesQueryBank* current_bank; SamplesQueryBank* current_bank;
VkQueryPool current_query_pool; VkQueryPool current_query_pool;
size_t current_query_id; size_t current_query_id;
size_t num_slots_used{};
size_t last_accumulation_checkpoint{};
bool accumulation_since_last_sync{};
VideoCommon::HostQueryBase* current_query; VideoCommon::HostQueryBase* current_query;
bool has_started{}; bool has_started{};
bool current_unset{};
std::mutex flush_guard; std::mutex flush_guard;
std::unique_ptr<QueriesPrefixScanPass> queries_prefix_scan_pass;
}; };
// Transform feedback queries // Transform feedback queries
@ -1090,7 +1146,8 @@ struct QueryCacheRuntimeImpl {
memory_allocator{memory_allocator_}, scheduler{scheduler_}, staging_pool{staging_pool_}, memory_allocator{memory_allocator_}, scheduler{scheduler_}, staging_pool{staging_pool_},
guest_streamer(0, runtime), guest_streamer(0, runtime),
sample_streamer(static_cast<size_t>(QueryType::ZPassPixelCount64), runtime, rasterizer, sample_streamer(static_cast<size_t>(QueryType::ZPassPixelCount64), runtime, rasterizer,
device, scheduler, memory_allocator), device, scheduler, memory_allocator, compute_pass_descriptor_queue,
descriptor_pool),
tfb_streamer(static_cast<size_t>(QueryType::StreamingByteCount), runtime, device, tfb_streamer(static_cast<size_t>(QueryType::StreamingByteCount), runtime, device,
scheduler, memory_allocator, staging_pool), scheduler, memory_allocator, staging_pool),
primitives_succeeded_streamer( primitives_succeeded_streamer(
@ -1319,10 +1376,10 @@ bool QueryCacheRuntime::HostConditionalRenderingCompareValues(VideoCommon::Looku
return true; return true;
} }
} }
if (!is_in_bc[0] && !is_in_bc[1]) { /*if (!is_in_bc[0] && !is_in_bc[1]) {
// Both queries are in query cache, it's best to just flush. // Both queries are in query cache, it's best to just flush.
return false; return true;
} }*/
HostConditionalRenderingCompareBCImpl(object_1.address, equal_check); HostConditionalRenderingCompareBCImpl(object_1.address, equal_check);
return true; return true;
} }