From bbfad79c89f9b7886005d39b51129bcfd94830b8 Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Tue, 2 Aug 2022 17:41:41 +0100 Subject: [PATCH 01/10] Vulkan: Add a workaround for input_position on Adreno drivers Adreno drivers will crash compiling geometry shaders if the input position is not wrapped in a gl_in struct. --- .../spirv/emit_spirv_context_get_set.cpp | 7 +++- .../backend/spirv/spirv_emit_context.cpp | 42 +++++++++++++++---- .../backend/spirv/spirv_emit_context.h | 1 + src/shader_recompiler/profile.h | 2 + .../renderer_vulkan/vk_pipeline_cache.cpp | 1 + 5 files changed, 42 insertions(+), 11 deletions(-) diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index db9c94ce8..1590debc4 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -321,8 +321,11 @@ Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, Id vertex) { case IR::Attribute::PositionY: case IR::Attribute::PositionZ: case IR::Attribute::PositionW: - return ctx.OpLoad(ctx.F32[1], AttrPointer(ctx, ctx.input_f32, vertex, ctx.input_position, - ctx.Const(element))); + return ctx.OpLoad(ctx.F32[1], ctx.need_input_position_indirect ? + AttrPointer(ctx, ctx.input_f32, vertex, ctx.input_position, + ctx.u32_zero_value, ctx.Const(element)) + : AttrPointer(ctx, ctx.input_f32, vertex, ctx.input_position, + ctx.Const(element))); case IR::Attribute::InstanceId: if (ctx.profile.support_vertex_instance_id) { return ctx.OpBitcast(ctx.F32[1], ctx.OpLoad(ctx.U32[1], ctx.instance_id)); diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index ecb2db494..eb1e3e0b6 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -721,9 +721,21 @@ void EmitContext::DefineAttributeMemAccess(const Info& info) { size_t label_index{0}; if (info.loads.AnyComponent(IR::Attribute::PositionX)) { AddLabel(labels[label_index]); - const Id pointer{is_array - ? OpAccessChain(input_f32, input_position, vertex, masked_index) - : OpAccessChain(input_f32, input_position, masked_index)}; + const Id pointer{[&]() { + if (need_input_position_indirect) { + if (is_array) + return OpAccessChain(input_f32, input_position, vertex, u32_zero_value, + masked_index); + else + return OpAccessChain(input_f32, input_position, u32_zero_value, + masked_index); + } else { + if (is_array) + return OpAccessChain(input_f32, input_position, vertex, masked_index); + else + return OpAccessChain(input_f32, input_position, masked_index); + } + }()}; const Id result{OpLoad(F32[1], pointer)}; OpReturnValue(result); ++label_index; @@ -1367,12 +1379,24 @@ void EmitContext::DefineInputs(const IR::Program& program) { Decorate(layer, spv::Decoration::Flat); } if (loads.AnyComponent(IR::Attribute::PositionX)) { - const bool is_fragment{stage != Stage::Fragment}; - const spv::BuiltIn built_in{is_fragment ? spv::BuiltIn::Position : spv::BuiltIn::FragCoord}; - input_position = DefineInput(*this, F32[4], true, built_in); - if (profile.support_geometry_shader_passthrough) { - if (info.passthrough.AnyComponent(IR::Attribute::PositionX)) { - Decorate(input_position, spv::Decoration::PassthroughNV); + const bool is_fragment{stage == Stage::Fragment}; + if (!is_fragment && profile.has_broken_spirv_position_input) { + need_input_position_indirect = true; + + const Id input_position_struct = TypeStruct(F32[4]); + input_position = DefineInput(*this, input_position_struct, true); + + MemberDecorate(input_position_struct, 0, spv::Decoration::BuiltIn, + static_cast(spv::BuiltIn::Position)); + Decorate(input_position_struct, spv::Decoration::Block); + } else { + const spv::BuiltIn built_in{is_fragment ? spv::BuiltIn::FragCoord : spv::BuiltIn::Position}; + input_position = DefineInput(*this, F32[4], true, built_in); + + if (profile.support_geometry_shader_passthrough) { + if (info.passthrough.AnyComponent(IR::Attribute::PositionX)) { + Decorate(input_position, spv::Decoration::PassthroughNV); + } } } } diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h index 4414a5169..dbc5c55b9 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h @@ -280,6 +280,7 @@ public: Id write_global_func_u32x2{}; Id write_global_func_u32x4{}; + bool need_input_position_indirect{}; Id input_position{}; std::array input_generics{}; diff --git a/src/shader_recompiler/profile.h b/src/shader_recompiler/profile.h index b8841a536..253e0d0bd 100644 --- a/src/shader_recompiler/profile.h +++ b/src/shader_recompiler/profile.h @@ -55,6 +55,8 @@ struct Profile { /// OpFClamp is broken and OpFMax + OpFMin should be used instead bool has_broken_spirv_clamp{}; + /// The Position builtin needs to be wrapped in a struct when used as an input + bool has_broken_spirv_position_input{}; /// Offset image operands with an unsigned type do not work bool has_broken_unsigned_image_offsets{}; /// Signed instructions with unsigned data types are misinterpreted diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 67e5bc648..4d7770bf8 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -331,6 +331,7 @@ PipelineCache::PipelineCache(RasterizerVulkan& rasterizer_, const Device& device .need_declared_frag_colors = false, .has_broken_spirv_clamp = driver_id == VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS, + .has_broken_spirv_position_input = driver_id == VK_DRIVER_ID_QUALCOMM_PROPRIETARY, .has_broken_unsigned_image_offsets = false, .has_broken_signed_operations = false, .has_broken_fp16_float_controls = driver_id == VK_DRIVER_ID_NVIDIA_PROPRIETARY, From c1cc99584cac7b3207b01502591e093712ffde55 Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Fri, 21 Oct 2022 22:58:37 +0100 Subject: [PATCH 02/10] shader_recompiler: Add comparison operators to descriptor types --- src/shader_recompiler/shader_info.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/shader_recompiler/shader_info.h b/src/shader_recompiler/shader_info.h index 44236b6b1..f93181e1e 100644 --- a/src/shader_recompiler/shader_info.h +++ b/src/shader_recompiler/shader_info.h @@ -65,6 +65,8 @@ enum class Interpolation { struct ConstantBufferDescriptor { u32 index; u32 count; + + auto operator<=>(const ConstantBufferDescriptor&) const = default; }; struct StorageBufferDescriptor { @@ -72,6 +74,8 @@ struct StorageBufferDescriptor { u32 cbuf_offset; u32 count; bool is_written; + + auto operator<=>(const StorageBufferDescriptor&) const = default; }; struct TextureBufferDescriptor { @@ -84,6 +88,8 @@ struct TextureBufferDescriptor { u32 secondary_shift_left; u32 count; u32 size_shift; + + auto operator<=>(const TextureBufferDescriptor&) const = default; }; using TextureBufferDescriptors = boost::container::small_vector; @@ -95,6 +101,8 @@ struct ImageBufferDescriptor { u32 cbuf_offset; u32 count; u32 size_shift; + + auto operator<=>(const ImageBufferDescriptor&) const = default; }; using ImageBufferDescriptors = boost::container::small_vector; @@ -110,6 +118,8 @@ struct TextureDescriptor { u32 secondary_shift_left; u32 count; u32 size_shift; + + auto operator<=>(const TextureDescriptor&) const = default; }; using TextureDescriptors = boost::container::small_vector; @@ -122,6 +132,8 @@ struct ImageDescriptor { u32 cbuf_offset; u32 count; u32 size_shift; + + auto operator<=>(const ImageDescriptor&) const = default; }; using ImageDescriptors = boost::container::small_vector; From 3f0985c7b01bbc5e7172bffb137f23385088aca6 Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Sat, 19 Nov 2022 15:53:19 +0000 Subject: [PATCH 03/10] shader_recompiler: SPIRV: Only enable int64 feature when supported --- src/shader_recompiler/backend/spirv/spirv_emit_context.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index eb1e3e0b6..f874622b8 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -544,7 +544,7 @@ void EmitContext::DefineCommonTypes(const Info& info) { U16 = Name(TypeInt(16, false), "u16"); S16 = Name(TypeInt(16, true), "s16"); } - if (info.uses_int64) { + if (info.uses_int64 && profile.support_int64) { AddCapability(spv::Capability::Int64); U64 = Name(TypeInt(64, false), "u64"); } From 8804a4eb23e0c4f3e4bab03dee7c204bd38bf21e Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Wed, 23 Nov 2022 21:30:23 +0000 Subject: [PATCH 04/10] shader_recompiler: Align SSBO offsets to meet host requirements We can take advantage of SSBO addresses being passed in a constant bufer to account for the extra alignment requirements in the shader itself. --- .../frontend/maxwell/translate_program.cpp | 2 +- src/shader_recompiler/host_translate_info.h | 1 + .../ir_opt/global_memory_to_storage_buffer_pass.cpp | 12 ++++++++---- src/shader_recompiler/ir_opt/passes.h | 2 +- 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/shader_recompiler/frontend/maxwell/translate_program.cpp b/src/shader_recompiler/frontend/maxwell/translate_program.cpp index ac159d24b..89fced84c 100644 --- a/src/shader_recompiler/frontend/maxwell/translate_program.cpp +++ b/src/shader_recompiler/frontend/maxwell/translate_program.cpp @@ -223,7 +223,7 @@ IR::Program TranslateProgram(ObjectPool& inst_pool, ObjectPool low_addr{TrackLowAddress(&inst)}) { @@ -415,7 +416,10 @@ IR::U32 StorageOffset(IR::Block& block, IR::Inst& inst, StorageBufferAddr buffer } // Subtract the least significant 32 bits from the guest offset. The result is the storage // buffer offset in bytes. - const IR::U32 low_cbuf{ir.GetCbuf(ir.Imm32(buffer.index), ir.Imm32(buffer.offset))}; + IR::U32 low_cbuf{ir.GetCbuf(ir.Imm32(buffer.index), ir.Imm32(buffer.offset))}; + + // Align the offset base to match the host alignment requirements + low_cbuf = ir.BitwiseAnd(low_cbuf, ir.Imm32(~(alignment - 1U))); return ir.ISub(offset, low_cbuf); } @@ -510,7 +514,7 @@ void Replace(IR::Block& block, IR::Inst& inst, const IR::U32& storage_index, } } // Anonymous namespace -void GlobalMemoryToStorageBufferPass(IR::Program& program) { +void GlobalMemoryToStorageBufferPass(IR::Program& program, const HostTranslateInfo& host_info) { StorageInfo info; for (IR::Block* const block : program.post_order_blocks) { for (IR::Inst& inst : block->Instructions()) { @@ -534,7 +538,7 @@ void GlobalMemoryToStorageBufferPass(IR::Program& program) { const IR::U32 index{IR::Value{static_cast(info.set.index_of(it))}}; IR::Block* const block{storage_inst.block}; IR::Inst* const inst{storage_inst.inst}; - const IR::U32 offset{StorageOffset(*block, *inst, storage_buffer)}; + const IR::U32 offset{StorageOffset(*block, *inst, storage_buffer, host_info.min_ssbo_alignment)}; Replace(*block, *inst, index, offset); } } diff --git a/src/shader_recompiler/ir_opt/passes.h b/src/shader_recompiler/ir_opt/passes.h index 1f8f2ba95..4ffad1172 100644 --- a/src/shader_recompiler/ir_opt/passes.h +++ b/src/shader_recompiler/ir_opt/passes.h @@ -15,7 +15,7 @@ namespace Shader::Optimization { void CollectShaderInfoPass(Environment& env, IR::Program& program); void ConstantPropagationPass(Environment& env, IR::Program& program); void DeadCodeEliminationPass(IR::Program& program); -void GlobalMemoryToStorageBufferPass(IR::Program& program); +void GlobalMemoryToStorageBufferPass(IR::Program& program, const HostTranslateInfo& host_info); void IdentityRemovalPass(IR::Program& program); void LowerFp16ToFp32(IR::Program& program); void LowerInt64ToInt32(IR::Program& program); From 9e2997c4b6456031622602002924617690e32a13 Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Sun, 1 Jan 2023 13:23:24 +0000 Subject: [PATCH 05/10] Vulkan, OpenGL: Hook up storage buffer alignment code --- src/video_core/buffer_cache/buffer_cache.h | 12 +++++++++--- src/video_core/renderer_opengl/gl_buffer_cache.h | 4 ++++ src/video_core/renderer_opengl/gl_shader_cache.cpp | 1 + src/video_core/renderer_vulkan/vk_buffer_cache.cpp | 4 ++++ src/video_core/renderer_vulkan/vk_buffer_cache.h | 2 ++ src/video_core/renderer_vulkan/vk_pipeline_cache.cpp | 1 + 6 files changed, 21 insertions(+), 3 deletions(-) diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 06fd40851..e6414df78 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -1938,14 +1938,20 @@ typename BufferCache

::Binding BufferCache

::StorageBufferBinding(GPUVAddr s bool is_written) const { const GPUVAddr gpu_addr = gpu_memory->Read(ssbo_addr); const u32 size = gpu_memory->Read(ssbo_addr + 8); - const std::optional cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr); + const u32 alignment = runtime.GetStorageBufferAlignment(); + + const GPUVAddr aligned_gpu_addr = Common::AlignDown(gpu_addr, alignment); + const u32 aligned_size = Common::AlignUp(static_cast(gpu_addr - aligned_gpu_addr) + size, alignment); + + const std::optional cpu_addr = gpu_memory->GpuToCpuAddress(aligned_gpu_addr); if (!cpu_addr || size == 0) { return NULL_BINDING; } - const VAddr cpu_end = Common::AlignUp(*cpu_addr + size, Core::Memory::YUZU_PAGESIZE); + + const VAddr cpu_end = Common::AlignUp(*cpu_addr + aligned_size, Core::Memory::YUZU_PAGESIZE); const Binding binding{ .cpu_addr = *cpu_addr, - .size = is_written ? size : static_cast(cpu_end - *cpu_addr), + .size = is_written ? aligned_size : static_cast(cpu_end - *cpu_addr), .buffer_id = BufferId{}, }; return binding; diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index a8c3f8b67..bb1962073 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -160,6 +160,10 @@ public: return device.CanReportMemoryUsage(); } + u32 GetStorageBufferAlignment() const { + return static_cast(device.GetShaderStorageBufferAlignment()); + } + private: static constexpr std::array PABO_LUT{ GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV, diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 03b6314ff..01e0cddb6 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -236,6 +236,7 @@ ShaderCache::ShaderCache(RasterizerOpenGL& rasterizer_, Core::Frontend::EmuWindo .needs_demote_reorder = device.IsAmd(), .support_snorm_render_buffer = false, .support_viewport_index_layer = device.HasVertexViewportLayer(), + .min_ssbo_alignment = static_cast(device.GetShaderStorageBufferAlignment()), } { if (use_asynchronous_shaders) { workers = CreateWorkers(); diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index 487d8b416..bc78287ce 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -330,6 +330,10 @@ bool BufferCacheRuntime::CanReportMemoryUsage() const { return device.CanReportMemoryUsage(); } +u32 BufferCacheRuntime::GetStorageBufferAlignment() const { + return static_cast(device.GetStorageBufferAlignment()); +} + void BufferCacheRuntime::Finish() { scheduler.Finish(); } diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h index 183b33632..06539c733 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.h +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h @@ -73,6 +73,8 @@ public: bool CanReportMemoryUsage() const; + u32 GetStorageBufferAlignment() const; + [[nodiscard]] StagingBufferRef UploadStagingBuffer(size_t size); [[nodiscard]] StagingBufferRef DownloadStagingBuffer(size_t size); diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 4d7770bf8..4aaea0e12 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -344,6 +344,7 @@ PipelineCache::PipelineCache(RasterizerVulkan& rasterizer_, const Device& device driver_id == VK_DRIVER_ID_AMD_PROPRIETARY || driver_id == VK_DRIVER_ID_AMD_OPEN_SOURCE, .support_snorm_render_buffer = true, .support_viewport_index_layer = device.IsExtShaderViewportIndexLayerSupported(), + .min_ssbo_alignment = static_cast(device.GetStorageBufferAlignment()), }; if (device.GetMaxVertexInputAttributes() < Maxwell::NumVertexAttributes) { From 625a4af73afec1a45f5a8004b0933f5a3d414103 Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Sat, 31 Dec 2022 23:19:10 +0000 Subject: [PATCH 06/10] shader_recompiler: Add support for lowering geometry passthrough Reuses most of the existing code for generating the gl_Layer passthrough. Fixes geometry in Nier: Automata on GPUs without HW passthrough support. --- .../frontend/maxwell/translate_program.cpp | 106 +++++++++++------- src/shader_recompiler/host_translate_info.h | 1 + 2 files changed, 67 insertions(+), 40 deletions(-) diff --git a/src/shader_recompiler/frontend/maxwell/translate_program.cpp b/src/shader_recompiler/frontend/maxwell/translate_program.cpp index 89fced84c..4a0ccceb7 100644 --- a/src/shader_recompiler/frontend/maxwell/translate_program.cpp +++ b/src/shader_recompiler/frontend/maxwell/translate_program.cpp @@ -171,6 +171,64 @@ std::map GenerateLegacyToGenericMappings( } return mapping; } + +void EmitGeometryPassthrough(IR::IREmitter& ir, const IR::Program& program, const Shader::VaryingState &passthrough_mask, bool passthrough_position, std::optional passthrough_layer_attr) { + for (u32 i = 0; i < program.output_vertices; i++) { + // Assign generics from input + for (u32 j = 0; j < 32; j++) { + if (!passthrough_mask.Generic(j)) { + continue; + } + + const IR::Attribute attr = IR::Attribute::Generic0X + (j * 4); + ir.SetAttribute(attr + 0, ir.GetAttribute(attr + 0, ir.Imm32(i)), ir.Imm32(0)); + ir.SetAttribute(attr + 1, ir.GetAttribute(attr + 1, ir.Imm32(i)), ir.Imm32(0)); + ir.SetAttribute(attr + 2, ir.GetAttribute(attr + 2, ir.Imm32(i)), ir.Imm32(0)); + ir.SetAttribute(attr + 3, ir.GetAttribute(attr + 3, ir.Imm32(i)), ir.Imm32(0)); + } + + if (passthrough_position) { + // Assign position from input + const IR::Attribute attr = IR::Attribute::PositionX; + ir.SetAttribute(attr + 0, ir.GetAttribute(attr + 0, ir.Imm32(i)), ir.Imm32(0)); + ir.SetAttribute(attr + 1, ir.GetAttribute(attr + 1, ir.Imm32(i)), ir.Imm32(0)); + ir.SetAttribute(attr + 2, ir.GetAttribute(attr + 2, ir.Imm32(i)), ir.Imm32(0)); + ir.SetAttribute(attr + 3, ir.GetAttribute(attr + 3, ir.Imm32(i)), ir.Imm32(0)); + } + + if (passthrough_layer_attr) { + // Assign layer + ir.SetAttribute(IR::Attribute::Layer, ir.GetAttribute(*passthrough_layer_attr), ir.Imm32(0)); + } + + // Emit vertex + ir.EmitVertex(ir.Imm32(0)); + } + ir.EndPrimitive(ir.Imm32(0)); +} + +u32 GetOutputTopologyVertices(OutputTopology output_topology) { + switch (output_topology) { + case OutputTopology::PointList: + return 1; + case OutputTopology::LineStrip: + return 2; + default: + return 3; + } +} + +void LowerGeometryPassthrough(const IR::Program& program, const HostTranslateInfo& host_info) { + for (IR::Block *const block : program.blocks) { + for (IR::Inst &inst : block->Instructions()) { + if (inst.GetOpcode() == IR::Opcode::Epilogue) { + IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)}; + EmitGeometryPassthrough(ir, program, program.info.passthrough, program.info.passthrough.AnyComponent(IR::Attribute::PositionX), {}); + } + } + } +} + } // Anonymous namespace IR::Program TranslateProgram(ObjectPool& inst_pool, ObjectPool& block_pool, @@ -198,6 +256,11 @@ IR::Program TranslateProgram(ObjectPool& inst_pool, ObjectPool> (i % 32)) & 1) == 0; } + + if (!host_info.support_geometry_shader_passthrough) { + program.output_vertices = GetOutputTopologyVertices(program.output_topology); + LowerGeometryPassthrough(program, host_info); + } } break; } @@ -342,17 +405,8 @@ IR::Program GenerateGeometryPassthrough(ObjectPool& inst_pool, IR::Program program; program.stage = Stage::Geometry; program.output_topology = output_topology; - switch (output_topology) { - case OutputTopology::PointList: - program.output_vertices = 1; - break; - case OutputTopology::LineStrip: - program.output_vertices = 2; - break; - default: - program.output_vertices = 3; - break; - } + program.output_vertices = GetOutputTopologyVertices(output_topology); + program.is_geometry_passthrough = false; program.info.loads.mask = source_program.info.stores.mask; @@ -366,35 +420,7 @@ IR::Program GenerateGeometryPassthrough(ObjectPool& inst_pool, node.data.block = current_block; IR::IREmitter ir{*current_block}; - for (u32 i = 0; i < program.output_vertices; i++) { - // Assign generics from input - for (u32 j = 0; j < 32; j++) { - if (!program.info.stores.Generic(j)) { - continue; - } - - const IR::Attribute attr = IR::Attribute::Generic0X + (j * 4); - ir.SetAttribute(attr + 0, ir.GetAttribute(attr + 0, ir.Imm32(i)), ir.Imm32(0)); - ir.SetAttribute(attr + 1, ir.GetAttribute(attr + 1, ir.Imm32(i)), ir.Imm32(0)); - ir.SetAttribute(attr + 2, ir.GetAttribute(attr + 2, ir.Imm32(i)), ir.Imm32(0)); - ir.SetAttribute(attr + 3, ir.GetAttribute(attr + 3, ir.Imm32(i)), ir.Imm32(0)); - } - - // Assign position from input - const IR::Attribute attr = IR::Attribute::PositionX; - ir.SetAttribute(attr + 0, ir.GetAttribute(attr + 0, ir.Imm32(i)), ir.Imm32(0)); - ir.SetAttribute(attr + 1, ir.GetAttribute(attr + 1, ir.Imm32(i)), ir.Imm32(0)); - ir.SetAttribute(attr + 2, ir.GetAttribute(attr + 2, ir.Imm32(i)), ir.Imm32(0)); - ir.SetAttribute(attr + 3, ir.GetAttribute(attr + 3, ir.Imm32(i)), ir.Imm32(0)); - - // Assign layer - ir.SetAttribute(IR::Attribute::Layer, ir.GetAttribute(source_program.info.emulated_layer), - ir.Imm32(0)); - - // Emit vertex - ir.EmitVertex(ir.Imm32(0)); - } - ir.EndPrimitive(ir.Imm32(0)); + EmitGeometryPassthrough(ir, program, program.info.stores, true, source_program.info.emulated_layer); IR::Block* return_block{block_pool.Create(inst_pool)}; IR::IREmitter{*return_block}.Epilogue(); diff --git a/src/shader_recompiler/host_translate_info.h b/src/shader_recompiler/host_translate_info.h index 7df090124..dc402ee47 100644 --- a/src/shader_recompiler/host_translate_info.h +++ b/src/shader_recompiler/host_translate_info.h @@ -16,6 +16,7 @@ struct HostTranslateInfo { bool support_snorm_render_buffer{}; ///< True when the device supports SNORM render buffers bool support_viewport_index_layer{}; ///< True when the device supports gl_Layer in VS u32 min_ssbo_alignment{}; ///< Minimum alignment supported by the device for SSBOs + bool support_geometry_shader_passthrough{}; ///< True when the device supports geometry passthrough shaders }; } // namespace Shader From 6c812a0c84d6fb2f7ffe1d64044be8aeaa905300 Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Sun, 1 Jan 2023 13:35:23 +0000 Subject: [PATCH 07/10] Vulkan, OpenGL: Hook up geometry shader passthrough emulation --- src/video_core/renderer_opengl/gl_shader_cache.cpp | 1 + src/video_core/renderer_vulkan/vk_pipeline_cache.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 01e0cddb6..7dd854e0f 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -237,6 +237,7 @@ ShaderCache::ShaderCache(RasterizerOpenGL& rasterizer_, Core::Frontend::EmuWindo .support_snorm_render_buffer = false, .support_viewport_index_layer = device.HasVertexViewportLayer(), .min_ssbo_alignment = static_cast(device.GetShaderStorageBufferAlignment()), + .support_geometry_shader_passthrough = device.HasGeometryShaderPassthrough(), } { if (use_asynchronous_shaders) { workers = CreateWorkers(); diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 4aaea0e12..013b42cf8 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -345,6 +345,7 @@ PipelineCache::PipelineCache(RasterizerVulkan& rasterizer_, const Device& device .support_snorm_render_buffer = true, .support_viewport_index_layer = device.IsExtShaderViewportIndexLayerSupported(), .min_ssbo_alignment = static_cast(device.GetStorageBufferAlignment()), + .support_geometry_shader_passthrough = device.IsNvGeometryShaderPassthroughSupported(), }; if (device.GetMaxVertexInputAttributes() < Maxwell::NumVertexAttributes) { From 68ed60cee41ce93a637fd8463657b137d95fdae4 Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Fri, 30 Dec 2022 13:45:00 +0000 Subject: [PATCH 08/10] shader_recompiler: Fix shuffle partitioning for >64 invoc-per-subgroup GPUs The existing implementation only supports 64 invoc-per-subgroup GPUs, and misbehaves on adreno when invocations need to be split into 4 emulated subgroups. --- .../backend/spirv/emit_spirv_warp.cpp | 58 +++++++++---------- 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp index 2c90f2368..c5db19d09 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp @@ -58,11 +58,10 @@ Id SelectValue(EmitContext& ctx, Id in_range, Id value, Id src_thread_id) { ctx.OpGroupNonUniformShuffle(ctx.U32[1], SubgroupScope(ctx), value, src_thread_id), value); } -Id GetUpperClamp(EmitContext& ctx, Id invocation_id, Id clamp) { - const Id thirty_two{ctx.Const(32u)}; - const Id is_upper_partition{ctx.OpSGreaterThanEqual(ctx.U1, invocation_id, thirty_two)}; - const Id upper_clamp{ctx.OpIAdd(ctx.U32[1], thirty_two, clamp)}; - return ctx.OpSelect(ctx.U32[1], is_upper_partition, upper_clamp, clamp); +Id AddPartitionBase(EmitContext& ctx, Id thread_id) { + const Id partition_idx{ctx.OpShiftRightLogical(ctx.U32[1], GetThreadId(ctx), ctx.Const(5u))}; + const Id partition_base{ctx.OpShiftLeftLogical(ctx.U32[1], partition_idx, ctx.Const(5u))}; + return ctx.OpIAdd(ctx.U32[1], thread_id, partition_base); } } // Anonymous namespace @@ -145,64 +144,63 @@ Id EmitSubgroupGeMask(EmitContext& ctx) { Id EmitShuffleIndex(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp, Id segmentation_mask) { const Id not_seg_mask{ctx.OpNot(ctx.U32[1], segmentation_mask)}; - const Id thread_id{GetThreadId(ctx)}; - if (ctx.profile.warp_size_potentially_larger_than_guest) { - const Id thirty_two{ctx.Const(32u)}; - const Id is_upper_partition{ctx.OpSGreaterThanEqual(ctx.U1, thread_id, thirty_two)}; - const Id upper_index{ctx.OpIAdd(ctx.U32[1], thirty_two, index)}; - const Id upper_clamp{ctx.OpIAdd(ctx.U32[1], thirty_two, clamp)}; - index = ctx.OpSelect(ctx.U32[1], is_upper_partition, upper_index, index); - clamp = ctx.OpSelect(ctx.U32[1], is_upper_partition, upper_clamp, clamp); - } + const Id thread_id{EmitLaneId(ctx)}; const Id min_thread_id{ComputeMinThreadId(ctx, thread_id, segmentation_mask)}; const Id max_thread_id{ComputeMaxThreadId(ctx, min_thread_id, clamp, not_seg_mask)}; const Id lhs{ctx.OpBitwiseAnd(ctx.U32[1], index, not_seg_mask)}; - const Id src_thread_id{ctx.OpBitwiseOr(ctx.U32[1], lhs, min_thread_id)}; + Id src_thread_id{ctx.OpBitwiseOr(ctx.U32[1], lhs, min_thread_id)}; const Id in_range{ctx.OpSLessThanEqual(ctx.U1, src_thread_id, max_thread_id)}; + if (ctx.profile.warp_size_potentially_larger_than_guest) { + src_thread_id = AddPartitionBase(ctx, src_thread_id); + } + SetInBoundsFlag(inst, in_range); return SelectValue(ctx, in_range, value, src_thread_id); } Id EmitShuffleUp(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp, Id segmentation_mask) { - const Id thread_id{GetThreadId(ctx)}; - if (ctx.profile.warp_size_potentially_larger_than_guest) { - clamp = GetUpperClamp(ctx, thread_id, clamp); - } + const Id thread_id{EmitLaneId(ctx)}; const Id max_thread_id{GetMaxThreadId(ctx, thread_id, clamp, segmentation_mask)}; - const Id src_thread_id{ctx.OpISub(ctx.U32[1], thread_id, index)}; + Id src_thread_id{ctx.OpISub(ctx.U32[1], thread_id, index)}; const Id in_range{ctx.OpSGreaterThanEqual(ctx.U1, src_thread_id, max_thread_id)}; + if (ctx.profile.warp_size_potentially_larger_than_guest) { + src_thread_id = AddPartitionBase(ctx, src_thread_id); + } + SetInBoundsFlag(inst, in_range); return SelectValue(ctx, in_range, value, src_thread_id); } Id EmitShuffleDown(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp, Id segmentation_mask) { - const Id thread_id{GetThreadId(ctx)}; - if (ctx.profile.warp_size_potentially_larger_than_guest) { - clamp = GetUpperClamp(ctx, thread_id, clamp); - } + const Id thread_id{EmitLaneId(ctx)}; const Id max_thread_id{GetMaxThreadId(ctx, thread_id, clamp, segmentation_mask)}; - const Id src_thread_id{ctx.OpIAdd(ctx.U32[1], thread_id, index)}; + Id src_thread_id{ctx.OpIAdd(ctx.U32[1], thread_id, index)}; const Id in_range{ctx.OpSLessThanEqual(ctx.U1, src_thread_id, max_thread_id)}; + if (ctx.profile.warp_size_potentially_larger_than_guest) { + src_thread_id = AddPartitionBase(ctx, src_thread_id); + } + SetInBoundsFlag(inst, in_range); return SelectValue(ctx, in_range, value, src_thread_id); } Id EmitShuffleButterfly(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp, Id segmentation_mask) { - const Id thread_id{GetThreadId(ctx)}; - if (ctx.profile.warp_size_potentially_larger_than_guest) { - clamp = GetUpperClamp(ctx, thread_id, clamp); - } + const Id thread_id{EmitLaneId(ctx)}; const Id max_thread_id{GetMaxThreadId(ctx, thread_id, clamp, segmentation_mask)}; - const Id src_thread_id{ctx.OpBitwiseXor(ctx.U32[1], thread_id, index)}; + Id src_thread_id{ctx.OpBitwiseXor(ctx.U32[1], thread_id, index)}; const Id in_range{ctx.OpSLessThanEqual(ctx.U1, src_thread_id, max_thread_id)}; + if (ctx.profile.warp_size_potentially_larger_than_guest) { + src_thread_id = AddPartitionBase(ctx, src_thread_id); + } + SetInBoundsFlag(inst, in_range); return SelectValue(ctx, in_range, value, src_thread_id); } From 12b4c9c04ca525d7f80b6c116e393ac29b01e581 Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Thu, 5 Jan 2023 20:17:22 +0000 Subject: [PATCH 09/10] externals: Update sirit --- externals/sirit | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/externals/sirit b/externals/sirit index d7ad93a88..ab7546399 160000 --- a/externals/sirit +++ b/externals/sirit @@ -1 +1 @@ -Subproject commit d7ad93a88864bda94e282e95028f90b5784e4d20 +Subproject commit ab75463999f4f3291976b079d42d52ee91eebf3f From 58fec43768c837c63453e87df8f337a2d139324a Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Thu, 5 Jan 2023 22:10:21 +0000 Subject: [PATCH 10/10] Run clang-format --- .../spirv/emit_spirv_context_get_set.cpp | 11 ++++--- .../backend/spirv/spirv_emit_context.cpp | 5 +-- .../frontend/maxwell/translate_program.cpp | 32 +++++++++++-------- src/shader_recompiler/host_translate_info.h | 5 +-- .../global_memory_to_storage_buffer_pass.cpp | 3 +- src/video_core/buffer_cache/buffer_cache.h | 3 +- 6 files changed, 35 insertions(+), 24 deletions(-) diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index 1590debc4..0cd87a48f 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -321,11 +321,12 @@ Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, Id vertex) { case IR::Attribute::PositionY: case IR::Attribute::PositionZ: case IR::Attribute::PositionW: - return ctx.OpLoad(ctx.F32[1], ctx.need_input_position_indirect ? - AttrPointer(ctx, ctx.input_f32, vertex, ctx.input_position, - ctx.u32_zero_value, ctx.Const(element)) - : AttrPointer(ctx, ctx.input_f32, vertex, ctx.input_position, - ctx.Const(element))); + return ctx.OpLoad( + ctx.F32[1], + ctx.need_input_position_indirect + ? AttrPointer(ctx, ctx.input_f32, vertex, ctx.input_position, ctx.u32_zero_value, + ctx.Const(element)) + : AttrPointer(ctx, ctx.input_f32, vertex, ctx.input_position, ctx.Const(element))); case IR::Attribute::InstanceId: if (ctx.profile.support_vertex_instance_id) { return ctx.OpBitcast(ctx.F32[1], ctx.OpLoad(ctx.U32[1], ctx.instance_id)); diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index f874622b8..a0c155fdb 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -729,7 +729,7 @@ void EmitContext::DefineAttributeMemAccess(const Info& info) { else return OpAccessChain(input_f32, input_position, u32_zero_value, masked_index); - } else { + } else { if (is_array) return OpAccessChain(input_f32, input_position, vertex, masked_index); else @@ -1390,7 +1390,8 @@ void EmitContext::DefineInputs(const IR::Program& program) { static_cast(spv::BuiltIn::Position)); Decorate(input_position_struct, spv::Decoration::Block); } else { - const spv::BuiltIn built_in{is_fragment ? spv::BuiltIn::FragCoord : spv::BuiltIn::Position}; + const spv::BuiltIn built_in{is_fragment ? spv::BuiltIn::FragCoord + : spv::BuiltIn::Position}; input_position = DefineInput(*this, F32[4], true, built_in); if (profile.support_geometry_shader_passthrough) { diff --git a/src/shader_recompiler/frontend/maxwell/translate_program.cpp b/src/shader_recompiler/frontend/maxwell/translate_program.cpp index 4a0ccceb7..a3b99e24d 100644 --- a/src/shader_recompiler/frontend/maxwell/translate_program.cpp +++ b/src/shader_recompiler/frontend/maxwell/translate_program.cpp @@ -172,7 +172,10 @@ std::map GenerateLegacyToGenericMappings( return mapping; } -void EmitGeometryPassthrough(IR::IREmitter& ir, const IR::Program& program, const Shader::VaryingState &passthrough_mask, bool passthrough_position, std::optional passthrough_layer_attr) { +void EmitGeometryPassthrough(IR::IREmitter& ir, const IR::Program& program, + const Shader::VaryingState& passthrough_mask, + bool passthrough_position, + std::optional passthrough_layer_attr) { for (u32 i = 0; i < program.output_vertices; i++) { // Assign generics from input for (u32 j = 0; j < 32; j++) { @@ -198,7 +201,8 @@ void EmitGeometryPassthrough(IR::IREmitter& ir, const IR::Program& program, cons if (passthrough_layer_attr) { // Assign layer - ir.SetAttribute(IR::Attribute::Layer, ir.GetAttribute(*passthrough_layer_attr), ir.Imm32(0)); + ir.SetAttribute(IR::Attribute::Layer, ir.GetAttribute(*passthrough_layer_attr), + ir.Imm32(0)); } // Emit vertex @@ -209,21 +213,23 @@ void EmitGeometryPassthrough(IR::IREmitter& ir, const IR::Program& program, cons u32 GetOutputTopologyVertices(OutputTopology output_topology) { switch (output_topology) { - case OutputTopology::PointList: - return 1; - case OutputTopology::LineStrip: - return 2; - default: - return 3; + case OutputTopology::PointList: + return 1; + case OutputTopology::LineStrip: + return 2; + default: + return 3; } } void LowerGeometryPassthrough(const IR::Program& program, const HostTranslateInfo& host_info) { - for (IR::Block *const block : program.blocks) { - for (IR::Inst &inst : block->Instructions()) { + for (IR::Block* const block : program.blocks) { + for (IR::Inst& inst : block->Instructions()) { if (inst.GetOpcode() == IR::Opcode::Epilogue) { IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)}; - EmitGeometryPassthrough(ir, program, program.info.passthrough, program.info.passthrough.AnyComponent(IR::Attribute::PositionX), {}); + EmitGeometryPassthrough( + ir, program, program.info.passthrough, + program.info.passthrough.AnyComponent(IR::Attribute::PositionX), {}); } } } @@ -407,7 +413,6 @@ IR::Program GenerateGeometryPassthrough(ObjectPool& inst_pool, program.output_topology = output_topology; program.output_vertices = GetOutputTopologyVertices(output_topology); - program.is_geometry_passthrough = false; program.info.loads.mask = source_program.info.stores.mask; program.info.stores.mask = source_program.info.stores.mask; @@ -420,7 +425,8 @@ IR::Program GenerateGeometryPassthrough(ObjectPool& inst_pool, node.data.block = current_block; IR::IREmitter ir{*current_block}; - EmitGeometryPassthrough(ir, program, program.info.stores, true, source_program.info.emulated_layer); + EmitGeometryPassthrough(ir, program, program.info.stores, true, + source_program.info.emulated_layer); IR::Block* return_block{block_pool.Create(inst_pool)}; IR::IREmitter{*return_block}.Epilogue(); diff --git a/src/shader_recompiler/host_translate_info.h b/src/shader_recompiler/host_translate_info.h index dc402ee47..55fc48768 100644 --- a/src/shader_recompiler/host_translate_info.h +++ b/src/shader_recompiler/host_translate_info.h @@ -15,8 +15,9 @@ struct HostTranslateInfo { bool needs_demote_reorder{}; ///< True when the device needs DemoteToHelperInvocation reordered bool support_snorm_render_buffer{}; ///< True when the device supports SNORM render buffers bool support_viewport_index_layer{}; ///< True when the device supports gl_Layer in VS - u32 min_ssbo_alignment{}; ///< Minimum alignment supported by the device for SSBOs - bool support_geometry_shader_passthrough{}; ///< True when the device supports geometry passthrough shaders + u32 min_ssbo_alignment{}; ///< Minimum alignment supported by the device for SSBOs + bool support_geometry_shader_passthrough{}; ///< True when the device supports geometry + ///< passthrough shaders }; } // namespace Shader diff --git a/src/shader_recompiler/ir_opt/global_memory_to_storage_buffer_pass.cpp b/src/shader_recompiler/ir_opt/global_memory_to_storage_buffer_pass.cpp index f8d20af3c..9101722ba 100644 --- a/src/shader_recompiler/ir_opt/global_memory_to_storage_buffer_pass.cpp +++ b/src/shader_recompiler/ir_opt/global_memory_to_storage_buffer_pass.cpp @@ -538,7 +538,8 @@ void GlobalMemoryToStorageBufferPass(IR::Program& program, const HostTranslateIn const IR::U32 index{IR::Value{static_cast(info.set.index_of(it))}}; IR::Block* const block{storage_inst.block}; IR::Inst* const inst{storage_inst.inst}; - const IR::U32 offset{StorageOffset(*block, *inst, storage_buffer, host_info.min_ssbo_alignment)}; + const IR::U32 offset{ + StorageOffset(*block, *inst, storage_buffer, host_info.min_ssbo_alignment)}; Replace(*block, *inst, index, offset); } } diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index e6414df78..627917ab6 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -1941,7 +1941,8 @@ typename BufferCache

::Binding BufferCache

::StorageBufferBinding(GPUVAddr s const u32 alignment = runtime.GetStorageBufferAlignment(); const GPUVAddr aligned_gpu_addr = Common::AlignDown(gpu_addr, alignment); - const u32 aligned_size = Common::AlignUp(static_cast(gpu_addr - aligned_gpu_addr) + size, alignment); + const u32 aligned_size = + Common::AlignUp(static_cast(gpu_addr - aligned_gpu_addr) + size, alignment); const std::optional cpu_addr = gpu_memory->GpuToCpuAddress(aligned_gpu_addr); if (!cpu_addr || size == 0) {