VideoCore: Implement DispatchIndirect
This commit is contained in:
parent
710ca3ca49
commit
115792158d
@ -14,6 +14,7 @@
|
|||||||
namespace Tegra {
|
namespace Tegra {
|
||||||
|
|
||||||
constexpr u32 MacroRegistersStart = 0xE00;
|
constexpr u32 MacroRegistersStart = 0xE00;
|
||||||
|
constexpr u32 ComputeInline = 0x6D;
|
||||||
|
|
||||||
DmaPusher::DmaPusher(Core::System& system_, GPU& gpu_, MemoryManager& memory_manager_,
|
DmaPusher::DmaPusher(Core::System& system_, GPU& gpu_, MemoryManager& memory_manager_,
|
||||||
Control::ChannelState& channel_state_)
|
Control::ChannelState& channel_state_)
|
||||||
@ -83,20 +84,35 @@ bool DmaPusher::Step() {
|
|||||||
dma_state.dma_get, command_list_header.size * sizeof(u32));
|
dma_state.dma_get, command_list_header.size * sizeof(u32));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (Settings::IsGPULevelHigh() && dma_state.method < MacroRegistersStart) {
|
const auto safe_process = [&] {
|
||||||
Core::Memory::GpuGuestMemory<Tegra::CommandHeader,
|
Core::Memory::GpuGuestMemory<Tegra::CommandHeader,
|
||||||
Core::Memory::GuestMemoryFlags::SafeRead>
|
Core::Memory::GuestMemoryFlags::SafeRead>
|
||||||
headers(memory_manager, dma_state.dma_get, command_list_header.size,
|
headers(memory_manager, dma_state.dma_get, command_list_header.size,
|
||||||
&command_headers);
|
&command_headers);
|
||||||
ProcessCommands(headers);
|
ProcessCommands(headers);
|
||||||
return true;
|
};
|
||||||
}
|
const auto unsafe_process = [&] {
|
||||||
Core::Memory::GpuGuestMemory<Tegra::CommandHeader,
|
Core::Memory::GpuGuestMemory<Tegra::CommandHeader,
|
||||||
Core::Memory::GuestMemoryFlags::UnsafeRead>
|
Core::Memory::GuestMemoryFlags::UnsafeRead>
|
||||||
headers(memory_manager, dma_state.dma_get, command_list_header.size, &command_headers);
|
headers(memory_manager, dma_state.dma_get, command_list_header.size,
|
||||||
|
&command_headers);
|
||||||
ProcessCommands(headers);
|
ProcessCommands(headers);
|
||||||
|
};
|
||||||
|
if (Settings::IsGPULevelHigh()) {
|
||||||
|
if (dma_state.method >= MacroRegistersStart) {
|
||||||
|
unsafe_process();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (subchannel_type[dma_state.subchannel] == Engines::EngineTypes::KeplerCompute &&
|
||||||
|
dma_state.method == ComputeInline) {
|
||||||
|
unsafe_process();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
safe_process();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
unsafe_process();
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -130,8 +130,10 @@ public:
|
|||||||
|
|
||||||
void DispatchCalls();
|
void DispatchCalls();
|
||||||
|
|
||||||
void BindSubchannel(Engines::EngineInterface* engine, u32 subchannel_id) {
|
void BindSubchannel(Engines::EngineInterface* engine, u32 subchannel_id,
|
||||||
|
Engines::EngineTypes engine_type) {
|
||||||
subchannels[subchannel_id] = engine;
|
subchannels[subchannel_id] = engine;
|
||||||
|
subchannel_type[subchannel_id] = engine_type;
|
||||||
}
|
}
|
||||||
|
|
||||||
void BindRasterizer(VideoCore::RasterizerInterface* rasterizer);
|
void BindRasterizer(VideoCore::RasterizerInterface* rasterizer);
|
||||||
@ -170,6 +172,7 @@ private:
|
|||||||
const bool ib_enable{true}; ///< IB mode enabled
|
const bool ib_enable{true}; ///< IB mode enabled
|
||||||
|
|
||||||
std::array<Engines::EngineInterface*, max_subchannels> subchannels{};
|
std::array<Engines::EngineInterface*, max_subchannels> subchannels{};
|
||||||
|
std::array<Engines::EngineTypes, max_subchannels> subchannel_type;
|
||||||
|
|
||||||
GPU& gpu;
|
GPU& gpu;
|
||||||
Core::System& system;
|
Core::System& system;
|
||||||
|
@ -11,6 +11,14 @@
|
|||||||
|
|
||||||
namespace Tegra::Engines {
|
namespace Tegra::Engines {
|
||||||
|
|
||||||
|
enum class EngineTypes : u32 {
|
||||||
|
KeplerCompute,
|
||||||
|
Maxwell3D,
|
||||||
|
Fermi2D,
|
||||||
|
MaxwellDMA,
|
||||||
|
KeplerMemory,
|
||||||
|
};
|
||||||
|
|
||||||
class EngineInterface {
|
class EngineInterface {
|
||||||
public:
|
public:
|
||||||
virtual ~EngineInterface() = default;
|
virtual ~EngineInterface() = default;
|
||||||
|
@ -69,6 +69,14 @@ public:
|
|||||||
/// Binds a rasterizer to this engine.
|
/// Binds a rasterizer to this engine.
|
||||||
void BindRasterizer(VideoCore::RasterizerInterface* rasterizer);
|
void BindRasterizer(VideoCore::RasterizerInterface* rasterizer);
|
||||||
|
|
||||||
|
GPUVAddr ExecTargetAddress() const {
|
||||||
|
return regs.dest.Address();
|
||||||
|
}
|
||||||
|
|
||||||
|
u32 GetUploadSize() const {
|
||||||
|
return copy_size;
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void ProcessData(std::span<const u8> read_buffer);
|
void ProcessData(std::span<const u8> read_buffer);
|
||||||
|
|
||||||
|
@ -43,16 +43,33 @@ void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_cal
|
|||||||
|
|
||||||
switch (method) {
|
switch (method) {
|
||||||
case KEPLER_COMPUTE_REG_INDEX(exec_upload): {
|
case KEPLER_COMPUTE_REG_INDEX(exec_upload): {
|
||||||
|
UploadInfo info{.upload_address = upload_address,
|
||||||
|
.exec_address = upload_state.ExecTargetAddress(),
|
||||||
|
.copy_size = upload_state.GetUploadSize()};
|
||||||
|
uploads.push_back(info);
|
||||||
upload_state.ProcessExec(regs.exec_upload.linear != 0);
|
upload_state.ProcessExec(regs.exec_upload.linear != 0);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case KEPLER_COMPUTE_REG_INDEX(data_upload): {
|
case KEPLER_COMPUTE_REG_INDEX(data_upload): {
|
||||||
|
upload_address = current_dma_segment;
|
||||||
upload_state.ProcessData(method_argument, is_last_call);
|
upload_state.ProcessData(method_argument, is_last_call);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case KEPLER_COMPUTE_REG_INDEX(launch):
|
case KEPLER_COMPUTE_REG_INDEX(launch): {
|
||||||
|
const GPUVAddr launch_desc_loc = regs.launch_desc_loc.Address();
|
||||||
|
|
||||||
|
for (auto& data : uploads) {
|
||||||
|
const GPUVAddr offset = data.exec_address - launch_desc_loc;
|
||||||
|
if (offset / sizeof(u32) == LAUNCH_REG_INDEX(grid_dim_x) &&
|
||||||
|
memory_manager.IsMemoryDirty(data.upload_address, data.copy_size)) {
|
||||||
|
indirect_compute = {data.upload_address};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
uploads.clear();
|
||||||
ProcessLaunch();
|
ProcessLaunch();
|
||||||
|
indirect_compute = std::nullopt;
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -62,6 +79,7 @@ void KeplerCompute::CallMultiMethod(u32 method, const u32* base_start, u32 amoun
|
|||||||
u32 methods_pending) {
|
u32 methods_pending) {
|
||||||
switch (method) {
|
switch (method) {
|
||||||
case KEPLER_COMPUTE_REG_INDEX(data_upload):
|
case KEPLER_COMPUTE_REG_INDEX(data_upload):
|
||||||
|
upload_address = current_dma_segment;
|
||||||
upload_state.ProcessData(base_start, amount);
|
upload_state.ProcessData(base_start, amount);
|
||||||
return;
|
return;
|
||||||
default:
|
default:
|
||||||
|
@ -5,6 +5,7 @@
|
|||||||
|
|
||||||
#include <array>
|
#include <array>
|
||||||
#include <cstddef>
|
#include <cstddef>
|
||||||
|
#include <optional>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include "common/bit_field.h"
|
#include "common/bit_field.h"
|
||||||
#include "common/common_funcs.h"
|
#include "common/common_funcs.h"
|
||||||
@ -36,6 +37,9 @@ namespace Tegra::Engines {
|
|||||||
#define KEPLER_COMPUTE_REG_INDEX(field_name) \
|
#define KEPLER_COMPUTE_REG_INDEX(field_name) \
|
||||||
(offsetof(Tegra::Engines::KeplerCompute::Regs, field_name) / sizeof(u32))
|
(offsetof(Tegra::Engines::KeplerCompute::Regs, field_name) / sizeof(u32))
|
||||||
|
|
||||||
|
#define LAUNCH_REG_INDEX(field_name) \
|
||||||
|
(offsetof(Tegra::Engines::KeplerCompute::LaunchParams, field_name) / sizeof(u32))
|
||||||
|
|
||||||
class KeplerCompute final : public EngineInterface {
|
class KeplerCompute final : public EngineInterface {
|
||||||
public:
|
public:
|
||||||
explicit KeplerCompute(Core::System& system, MemoryManager& memory_manager);
|
explicit KeplerCompute(Core::System& system, MemoryManager& memory_manager);
|
||||||
@ -201,6 +205,10 @@ public:
|
|||||||
void CallMultiMethod(u32 method, const u32* base_start, u32 amount,
|
void CallMultiMethod(u32 method, const u32* base_start, u32 amount,
|
||||||
u32 methods_pending) override;
|
u32 methods_pending) override;
|
||||||
|
|
||||||
|
std::optional<GPUVAddr> GetIndirectComputeAddress() const {
|
||||||
|
return indirect_compute;
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void ProcessLaunch();
|
void ProcessLaunch();
|
||||||
|
|
||||||
@ -216,6 +224,15 @@ private:
|
|||||||
MemoryManager& memory_manager;
|
MemoryManager& memory_manager;
|
||||||
VideoCore::RasterizerInterface* rasterizer = nullptr;
|
VideoCore::RasterizerInterface* rasterizer = nullptr;
|
||||||
Upload::State upload_state;
|
Upload::State upload_state;
|
||||||
|
GPUVAddr upload_address;
|
||||||
|
|
||||||
|
struct UploadInfo {
|
||||||
|
GPUVAddr upload_address;
|
||||||
|
GPUVAddr exec_address;
|
||||||
|
u32 copy_size;
|
||||||
|
};
|
||||||
|
std::vector<UploadInfo> uploads;
|
||||||
|
std::optional<GPUVAddr> indirect_compute{};
|
||||||
};
|
};
|
||||||
|
|
||||||
#define ASSERT_REG_POSITION(field_name, position) \
|
#define ASSERT_REG_POSITION(field_name, position) \
|
||||||
|
@ -34,19 +34,24 @@ void Puller::ProcessBindMethod(const MethodCall& method_call) {
|
|||||||
bound_engines[method_call.subchannel] = engine_id;
|
bound_engines[method_call.subchannel] = engine_id;
|
||||||
switch (engine_id) {
|
switch (engine_id) {
|
||||||
case EngineID::FERMI_TWOD_A:
|
case EngineID::FERMI_TWOD_A:
|
||||||
dma_pusher.BindSubchannel(channel_state.fermi_2d.get(), method_call.subchannel);
|
dma_pusher.BindSubchannel(channel_state.fermi_2d.get(), method_call.subchannel,
|
||||||
|
EngineTypes::Fermi2D);
|
||||||
break;
|
break;
|
||||||
case EngineID::MAXWELL_B:
|
case EngineID::MAXWELL_B:
|
||||||
dma_pusher.BindSubchannel(channel_state.maxwell_3d.get(), method_call.subchannel);
|
dma_pusher.BindSubchannel(channel_state.maxwell_3d.get(), method_call.subchannel,
|
||||||
|
EngineTypes::Maxwell3D);
|
||||||
break;
|
break;
|
||||||
case EngineID::KEPLER_COMPUTE_B:
|
case EngineID::KEPLER_COMPUTE_B:
|
||||||
dma_pusher.BindSubchannel(channel_state.kepler_compute.get(), method_call.subchannel);
|
dma_pusher.BindSubchannel(channel_state.kepler_compute.get(), method_call.subchannel,
|
||||||
|
EngineTypes::KeplerCompute);
|
||||||
break;
|
break;
|
||||||
case EngineID::MAXWELL_DMA_COPY_A:
|
case EngineID::MAXWELL_DMA_COPY_A:
|
||||||
dma_pusher.BindSubchannel(channel_state.maxwell_dma.get(), method_call.subchannel);
|
dma_pusher.BindSubchannel(channel_state.maxwell_dma.get(), method_call.subchannel,
|
||||||
|
EngineTypes::MaxwellDMA);
|
||||||
break;
|
break;
|
||||||
case EngineID::KEPLER_INLINE_TO_MEMORY_B:
|
case EngineID::KEPLER_INLINE_TO_MEMORY_B:
|
||||||
dma_pusher.BindSubchannel(channel_state.kepler_memory.get(), method_call.subchannel);
|
dma_pusher.BindSubchannel(channel_state.kepler_memory.get(), method_call.subchannel,
|
||||||
|
EngineTypes::KeplerMemory);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
UNIMPLEMENTED_MSG("Unimplemented engine {:04X}", engine_id);
|
UNIMPLEMENTED_MSG("Unimplemented engine {:04X}", engine_id);
|
||||||
|
@ -380,6 +380,17 @@ void RasterizerOpenGL::DispatchCompute() {
|
|||||||
pipeline->SetEngine(kepler_compute, gpu_memory);
|
pipeline->SetEngine(kepler_compute, gpu_memory);
|
||||||
pipeline->Configure();
|
pipeline->Configure();
|
||||||
const auto& qmd{kepler_compute->launch_description};
|
const auto& qmd{kepler_compute->launch_description};
|
||||||
|
auto indirect_address = kepler_compute->GetIndirectComputeAddress();
|
||||||
|
if (indirect_address) {
|
||||||
|
// DispatchIndirect
|
||||||
|
static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize;
|
||||||
|
const auto post_op = VideoCommon::ObtainBufferOperation::DiscardWrite;
|
||||||
|
const auto [buffer, offset] =
|
||||||
|
buffer_cache.ObtainBuffer(*indirect_address, 12, sync_info, post_op);
|
||||||
|
glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, buffer->Handle());
|
||||||
|
glDispatchComputeIndirect(static_cast<GLintptr>(offset));
|
||||||
|
return;
|
||||||
|
}
|
||||||
glDispatchCompute(qmd.grid_dim_x, qmd.grid_dim_y, qmd.grid_dim_z);
|
glDispatchCompute(qmd.grid_dim_x, qmd.grid_dim_y, qmd.grid_dim_z);
|
||||||
++num_queued_commands;
|
++num_queued_commands;
|
||||||
has_written_global_memory |= pipeline->WritesGlobalMemory();
|
has_written_global_memory |= pipeline->WritesGlobalMemory();
|
||||||
|
@ -463,6 +463,20 @@ void RasterizerVulkan::DispatchCompute() {
|
|||||||
pipeline->Configure(*kepler_compute, *gpu_memory, scheduler, buffer_cache, texture_cache);
|
pipeline->Configure(*kepler_compute, *gpu_memory, scheduler, buffer_cache, texture_cache);
|
||||||
|
|
||||||
const auto& qmd{kepler_compute->launch_description};
|
const auto& qmd{kepler_compute->launch_description};
|
||||||
|
auto indirect_address = kepler_compute->GetIndirectComputeAddress();
|
||||||
|
if (indirect_address) {
|
||||||
|
// DispatchIndirect
|
||||||
|
static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize;
|
||||||
|
const auto post_op = VideoCommon::ObtainBufferOperation::DiscardWrite;
|
||||||
|
const auto [buffer, offset] =
|
||||||
|
buffer_cache.ObtainBuffer(*indirect_address, 12, sync_info, post_op);
|
||||||
|
scheduler.RequestOutsideRenderPassOperationContext();
|
||||||
|
scheduler.Record([indirect_buffer = buffer->Handle(),
|
||||||
|
indirect_offset = offset](vk::CommandBuffer cmdbuf) {
|
||||||
|
cmdbuf.DispatchIndirect(indirect_buffer, indirect_offset);
|
||||||
|
});
|
||||||
|
return;
|
||||||
|
}
|
||||||
const std::array<u32, 3> dim{qmd.grid_dim_x, qmd.grid_dim_y, qmd.grid_dim_z};
|
const std::array<u32, 3> dim{qmd.grid_dim_x, qmd.grid_dim_y, qmd.grid_dim_z};
|
||||||
scheduler.RequestOutsideRenderPassOperationContext();
|
scheduler.RequestOutsideRenderPassOperationContext();
|
||||||
scheduler.Record([dim](vk::CommandBuffer cmdbuf) { cmdbuf.Dispatch(dim[0], dim[1], dim[2]); });
|
scheduler.Record([dim](vk::CommandBuffer cmdbuf) { cmdbuf.Dispatch(dim[0], dim[1], dim[2]); });
|
||||||
|
@ -92,6 +92,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept {
|
|||||||
X(vkCmdCopyImage);
|
X(vkCmdCopyImage);
|
||||||
X(vkCmdCopyImageToBuffer);
|
X(vkCmdCopyImageToBuffer);
|
||||||
X(vkCmdDispatch);
|
X(vkCmdDispatch);
|
||||||
|
X(vkCmdDispatchIndirect);
|
||||||
X(vkCmdDraw);
|
X(vkCmdDraw);
|
||||||
X(vkCmdDrawIndexed);
|
X(vkCmdDrawIndexed);
|
||||||
X(vkCmdDrawIndirect);
|
X(vkCmdDrawIndirect);
|
||||||
|
@ -203,6 +203,7 @@ struct DeviceDispatch : InstanceDispatch {
|
|||||||
PFN_vkCmdCopyImage vkCmdCopyImage{};
|
PFN_vkCmdCopyImage vkCmdCopyImage{};
|
||||||
PFN_vkCmdCopyImageToBuffer vkCmdCopyImageToBuffer{};
|
PFN_vkCmdCopyImageToBuffer vkCmdCopyImageToBuffer{};
|
||||||
PFN_vkCmdDispatch vkCmdDispatch{};
|
PFN_vkCmdDispatch vkCmdDispatch{};
|
||||||
|
PFN_vkCmdDispatchIndirect vkCmdDispatchIndirect{};
|
||||||
PFN_vkCmdDraw vkCmdDraw{};
|
PFN_vkCmdDraw vkCmdDraw{};
|
||||||
PFN_vkCmdDrawIndexed vkCmdDrawIndexed{};
|
PFN_vkCmdDrawIndexed vkCmdDrawIndexed{};
|
||||||
PFN_vkCmdDrawIndirect vkCmdDrawIndirect{};
|
PFN_vkCmdDrawIndirect vkCmdDrawIndirect{};
|
||||||
@ -1209,6 +1210,10 @@ public:
|
|||||||
dld->vkCmdDispatch(handle, x, y, z);
|
dld->vkCmdDispatch(handle, x, y, z);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void DispatchIndirect(VkBuffer indirect_buffer, VkDeviceSize offset) const noexcept {
|
||||||
|
dld->vkCmdDispatchIndirect(handle, indirect_buffer, offset);
|
||||||
|
}
|
||||||
|
|
||||||
void PipelineBarrier(VkPipelineStageFlags src_stage_mask, VkPipelineStageFlags dst_stage_mask,
|
void PipelineBarrier(VkPipelineStageFlags src_stage_mask, VkPipelineStageFlags dst_stage_mask,
|
||||||
VkDependencyFlags dependency_flags, Span<VkMemoryBarrier> memory_barriers,
|
VkDependencyFlags dependency_flags, Span<VkMemoryBarrier> memory_barriers,
|
||||||
Span<VkBufferMemoryBarrier> buffer_barriers,
|
Span<VkBufferMemoryBarrier> buffer_barriers,
|
||||||
|
Loading…
Reference in New Issue
Block a user