gl_rasterizer: Upload constant buffers with glNamedBufferSubData
Nvidia's OpenGL driver maps gl(Named)BufferSubData with some requirements to a fast. This path has an extra memcpy but updates the buffer without orphaning or waiting for previous calls. It can be seen as a better model for "push constants" that can upload a whole UBO instead of 256 bytes. This path has some requirements established here: http://on-demand.gputechconf.com/gtc/2014/presentations/S4379-opengl-44-scene-rendering-techniques.pdf#page=24 Instead of using the stream buffer, this commits moves constant buffers uploads to calls of glNamedBufferSubData and from my testing it brings a performance improvement. This is disabled when the vendor is not Nvidia since it brings performance regressions.
This commit is contained in:
parent
11e39da02b
commit
76ca2a5f82
|
@ -30,7 +30,7 @@ public:
|
||||||
using BufferInfo = std::pair<const TBufferType*, u64>;
|
using BufferInfo = std::pair<const TBufferType*, u64>;
|
||||||
|
|
||||||
BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
|
BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
|
||||||
bool is_written = false) {
|
bool is_written = false, bool use_fast_cbuf = false) {
|
||||||
std::lock_guard lock{mutex};
|
std::lock_guard lock{mutex};
|
||||||
|
|
||||||
auto& memory_manager = system.GPU().MemoryManager();
|
auto& memory_manager = system.GPU().MemoryManager();
|
||||||
|
@ -43,11 +43,15 @@ public:
|
||||||
// Cache management is a big overhead, so only cache entries with a given size.
|
// Cache management is a big overhead, so only cache entries with a given size.
|
||||||
// TODO: Figure out which size is the best for given games.
|
// TODO: Figure out which size is the best for given games.
|
||||||
constexpr std::size_t max_stream_size = 0x800;
|
constexpr std::size_t max_stream_size = 0x800;
|
||||||
if (size < max_stream_size) {
|
if (use_fast_cbuf || size < max_stream_size) {
|
||||||
if (!is_written && !IsRegionWritten(cache_addr, cache_addr + size - 1)) {
|
if (!is_written && !IsRegionWritten(cache_addr, cache_addr + size - 1)) {
|
||||||
|
if (use_fast_cbuf) {
|
||||||
|
return ConstBufferUpload(host_ptr, size);
|
||||||
|
} else {
|
||||||
return StreamBufferUpload(host_ptr, size, alignment);
|
return StreamBufferUpload(host_ptr, size, alignment);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
auto block = GetBlock(cache_addr, size);
|
auto block = GetBlock(cache_addr, size);
|
||||||
auto map = MapAddress(block, gpu_addr, cache_addr, size);
|
auto map = MapAddress(block, gpu_addr, cache_addr, size);
|
||||||
|
@ -152,6 +156,10 @@ protected:
|
||||||
virtual void CopyBlock(const TBuffer& src, const TBuffer& dst, std::size_t src_offset,
|
virtual void CopyBlock(const TBuffer& src, const TBuffer& dst, std::size_t src_offset,
|
||||||
std::size_t dst_offset, std::size_t size) = 0;
|
std::size_t dst_offset, std::size_t size) = 0;
|
||||||
|
|
||||||
|
virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) {
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
|
||||||
/// Register an object into the cache
|
/// Register an object into the cache
|
||||||
void Register(const MapInterval& new_map, bool inherit_written = false) {
|
void Register(const MapInterval& new_map, bool inherit_written = false) {
|
||||||
const CacheAddr cache_ptr = new_map->GetStart();
|
const CacheAddr cache_ptr = new_map->GetStart();
|
||||||
|
|
|
@ -8,13 +8,17 @@
|
||||||
|
|
||||||
#include "common/assert.h"
|
#include "common/assert.h"
|
||||||
#include "common/microprofile.h"
|
#include "common/microprofile.h"
|
||||||
|
#include "video_core/engines/maxwell_3d.h"
|
||||||
#include "video_core/rasterizer_interface.h"
|
#include "video_core/rasterizer_interface.h"
|
||||||
#include "video_core/renderer_opengl/gl_buffer_cache.h"
|
#include "video_core/renderer_opengl/gl_buffer_cache.h"
|
||||||
|
#include "video_core/renderer_opengl/gl_device.h"
|
||||||
#include "video_core/renderer_opengl/gl_rasterizer.h"
|
#include "video_core/renderer_opengl/gl_rasterizer.h"
|
||||||
#include "video_core/renderer_opengl/gl_resource_manager.h"
|
#include "video_core/renderer_opengl/gl_resource_manager.h"
|
||||||
|
|
||||||
namespace OpenGL {
|
namespace OpenGL {
|
||||||
|
|
||||||
|
using Maxwell = Tegra::Engines::Maxwell3D::Regs;
|
||||||
|
|
||||||
MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128));
|
MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128));
|
||||||
|
|
||||||
CachedBufferBlock::CachedBufferBlock(CacheAddr cache_addr, const std::size_t size)
|
CachedBufferBlock::CachedBufferBlock(CacheAddr cache_addr, const std::size_t size)
|
||||||
|
@ -26,11 +30,22 @@ CachedBufferBlock::CachedBufferBlock(CacheAddr cache_addr, const std::size_t siz
|
||||||
CachedBufferBlock::~CachedBufferBlock() = default;
|
CachedBufferBlock::~CachedBufferBlock() = default;
|
||||||
|
|
||||||
OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
|
OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
|
||||||
std::size_t stream_size)
|
const Device& device, std::size_t stream_size)
|
||||||
: VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>{
|
: GenericBufferCache{rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} {
|
||||||
rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} {}
|
if (!device.HasFastBufferSubData()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
OGLBufferCache::~OGLBufferCache() = default;
|
static constexpr auto size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize);
|
||||||
|
glCreateBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs));
|
||||||
|
for (const GLuint cbuf : cbufs) {
|
||||||
|
glNamedBufferData(cbuf, size, nullptr, GL_STREAM_DRAW);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
OGLBufferCache::~OGLBufferCache() {
|
||||||
|
glDeleteBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs));
|
||||||
|
}
|
||||||
|
|
||||||
Buffer OGLBufferCache::CreateBlock(CacheAddr cache_addr, std::size_t size) {
|
Buffer OGLBufferCache::CreateBlock(CacheAddr cache_addr, std::size_t size) {
|
||||||
return std::make_shared<CachedBufferBlock>(cache_addr, size);
|
return std::make_shared<CachedBufferBlock>(cache_addr, size);
|
||||||
|
@ -69,4 +84,12 @@ void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t
|
||||||
static_cast<GLsizeiptr>(size));
|
static_cast<GLsizeiptr>(size));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer,
|
||||||
|
std::size_t size) {
|
||||||
|
DEBUG_ASSERT(cbuf_cursor < std::size(cbufs));
|
||||||
|
const GLuint& cbuf = cbufs[cbuf_cursor++];
|
||||||
|
glNamedBufferSubData(cbuf, 0, static_cast<GLsizeiptr>(size), raw_pointer);
|
||||||
|
return {&cbuf, 0};
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace OpenGL
|
} // namespace OpenGL
|
||||||
|
|
|
@ -4,10 +4,12 @@
|
||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
|
#include <array>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
|
||||||
#include "common/common_types.h"
|
#include "common/common_types.h"
|
||||||
#include "video_core/buffer_cache/buffer_cache.h"
|
#include "video_core/buffer_cache/buffer_cache.h"
|
||||||
|
#include "video_core/engines/maxwell_3d.h"
|
||||||
#include "video_core/rasterizer_cache.h"
|
#include "video_core/rasterizer_cache.h"
|
||||||
#include "video_core/renderer_opengl/gl_resource_manager.h"
|
#include "video_core/renderer_opengl/gl_resource_manager.h"
|
||||||
#include "video_core/renderer_opengl/gl_stream_buffer.h"
|
#include "video_core/renderer_opengl/gl_stream_buffer.h"
|
||||||
|
@ -18,12 +20,14 @@ class System;
|
||||||
|
|
||||||
namespace OpenGL {
|
namespace OpenGL {
|
||||||
|
|
||||||
|
class Device;
|
||||||
class OGLStreamBuffer;
|
class OGLStreamBuffer;
|
||||||
class RasterizerOpenGL;
|
class RasterizerOpenGL;
|
||||||
|
|
||||||
class CachedBufferBlock;
|
class CachedBufferBlock;
|
||||||
|
|
||||||
using Buffer = std::shared_ptr<CachedBufferBlock>;
|
using Buffer = std::shared_ptr<CachedBufferBlock>;
|
||||||
|
using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>;
|
||||||
|
|
||||||
class CachedBufferBlock : public VideoCommon::BufferBlock {
|
class CachedBufferBlock : public VideoCommon::BufferBlock {
|
||||||
public:
|
public:
|
||||||
|
@ -38,14 +42,18 @@ private:
|
||||||
OGLBuffer gl_buffer{};
|
OGLBuffer gl_buffer{};
|
||||||
};
|
};
|
||||||
|
|
||||||
class OGLBufferCache final : public VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer> {
|
class OGLBufferCache final : public GenericBufferCache {
|
||||||
public:
|
public:
|
||||||
explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
|
explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
|
||||||
std::size_t stream_size);
|
const Device& device, std::size_t stream_size);
|
||||||
~OGLBufferCache();
|
~OGLBufferCache();
|
||||||
|
|
||||||
const GLuint* GetEmptyBuffer(std::size_t) override;
|
const GLuint* GetEmptyBuffer(std::size_t) override;
|
||||||
|
|
||||||
|
void Acquire() noexcept {
|
||||||
|
cbuf_cursor = 0;
|
||||||
|
}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
Buffer CreateBlock(CacheAddr cache_addr, std::size_t size) override;
|
Buffer CreateBlock(CacheAddr cache_addr, std::size_t size) override;
|
||||||
|
|
||||||
|
@ -61,6 +69,14 @@ protected:
|
||||||
|
|
||||||
void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
|
void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
|
||||||
std::size_t dst_offset, std::size_t size) override;
|
std::size_t dst_offset, std::size_t size) override;
|
||||||
|
|
||||||
|
BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
std::size_t cbuf_cursor = 0;
|
||||||
|
std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
|
||||||
|
Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram>
|
||||||
|
cbufs;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace OpenGL
|
} // namespace OpenGL
|
||||||
|
|
|
@ -51,8 +51,11 @@ bool HasExtension(const std::vector<std::string_view>& images, std::string_view
|
||||||
} // Anonymous namespace
|
} // Anonymous namespace
|
||||||
|
|
||||||
Device::Device() {
|
Device::Device() {
|
||||||
|
const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
|
||||||
const std::vector extensions = GetExtensions();
|
const std::vector extensions = GetExtensions();
|
||||||
|
|
||||||
|
const bool is_nvidia = vendor == "NVIDIA Corporation";
|
||||||
|
|
||||||
uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
|
uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
|
||||||
shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
|
shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
|
||||||
max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS);
|
max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS);
|
||||||
|
@ -64,6 +67,7 @@ Device::Device() {
|
||||||
has_variable_aoffi = TestVariableAoffi();
|
has_variable_aoffi = TestVariableAoffi();
|
||||||
has_component_indexing_bug = TestComponentIndexingBug();
|
has_component_indexing_bug = TestComponentIndexingBug();
|
||||||
has_precise_bug = TestPreciseBug();
|
has_precise_bug = TestPreciseBug();
|
||||||
|
has_fast_buffer_sub_data = is_nvidia;
|
||||||
|
|
||||||
LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi);
|
LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi);
|
||||||
LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug);
|
LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug);
|
||||||
|
|
|
@ -54,6 +54,10 @@ public:
|
||||||
return has_precise_bug;
|
return has_precise_bug;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool HasFastBufferSubData() const {
|
||||||
|
return has_fast_buffer_sub_data;
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
static bool TestVariableAoffi();
|
static bool TestVariableAoffi();
|
||||||
static bool TestComponentIndexingBug();
|
static bool TestComponentIndexingBug();
|
||||||
|
@ -69,6 +73,7 @@ private:
|
||||||
bool has_variable_aoffi{};
|
bool has_variable_aoffi{};
|
||||||
bool has_component_indexing_bug{};
|
bool has_component_indexing_bug{};
|
||||||
bool has_precise_bug{};
|
bool has_precise_bug{};
|
||||||
|
bool has_fast_buffer_sub_data{};
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace OpenGL
|
} // namespace OpenGL
|
||||||
|
|
|
@ -67,7 +67,7 @@ static std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buf
|
||||||
RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
|
RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
|
||||||
ScreenInfo& info)
|
ScreenInfo& info)
|
||||||
: texture_cache{system, *this, device}, shader_cache{*this, system, emu_window, device},
|
: texture_cache{system, *this, device}, shader_cache{*this, system, emu_window, device},
|
||||||
system{system}, screen_info{info}, buffer_cache{*this, system, STREAM_BUFFER_SIZE} {
|
system{system}, screen_info{info}, buffer_cache{*this, system, device, STREAM_BUFFER_SIZE} {
|
||||||
shader_program_manager = std::make_unique<GLShader::ProgramManager>();
|
shader_program_manager = std::make_unique<GLShader::ProgramManager>();
|
||||||
state.draw.shader_program = 0;
|
state.draw.shader_program = 0;
|
||||||
state.Apply();
|
state.Apply();
|
||||||
|
@ -558,6 +558,8 @@ void RasterizerOpenGL::DrawPrelude() {
|
||||||
SyncPolygonOffset();
|
SyncPolygonOffset();
|
||||||
SyncAlphaTest();
|
SyncAlphaTest();
|
||||||
|
|
||||||
|
buffer_cache.Acquire();
|
||||||
|
|
||||||
// Draw the vertex batch
|
// Draw the vertex batch
|
||||||
const bool is_indexed = accelerate_draw == AccelDraw::Indexed;
|
const bool is_indexed = accelerate_draw == AccelDraw::Indexed;
|
||||||
|
|
||||||
|
@ -573,9 +575,11 @@ void RasterizerOpenGL::DrawPrelude() {
|
||||||
(sizeof(GLShader::MaxwellUniformData) + device.GetUniformBufferAlignment()) *
|
(sizeof(GLShader::MaxwellUniformData) + device.GetUniformBufferAlignment()) *
|
||||||
Maxwell::MaxShaderStage;
|
Maxwell::MaxShaderStage;
|
||||||
|
|
||||||
|
if (!device.HasFastBufferSubData()) {
|
||||||
// Add space for at least 18 constant buffers
|
// Add space for at least 18 constant buffers
|
||||||
buffer_size += Maxwell::MaxConstBuffers *
|
buffer_size += Maxwell::MaxConstBuffers *
|
||||||
(Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
|
(Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
|
||||||
|
}
|
||||||
|
|
||||||
// Prepare the vertex array.
|
// Prepare the vertex array.
|
||||||
buffer_cache.Map(buffer_size);
|
buffer_cache.Map(buffer_size);
|
||||||
|
@ -739,10 +743,12 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
|
||||||
state.draw.shader_program = program;
|
state.draw.shader_program = program;
|
||||||
state.draw.program_pipeline = 0;
|
state.draw.program_pipeline = 0;
|
||||||
|
|
||||||
|
if (!device.HasFastBufferSubData()) {
|
||||||
const std::size_t buffer_size =
|
const std::size_t buffer_size =
|
||||||
Tegra::Engines::KeplerCompute::NumConstBuffers *
|
Tegra::Engines::KeplerCompute::NumConstBuffers *
|
||||||
(Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
|
(Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
|
||||||
buffer_cache.Map(buffer_size);
|
buffer_cache.Map(buffer_size);
|
||||||
|
}
|
||||||
|
|
||||||
bind_ubo_pushbuffer.Setup(0);
|
bind_ubo_pushbuffer.Setup(0);
|
||||||
bind_ssbo_pushbuffer.Setup(0);
|
bind_ssbo_pushbuffer.Setup(0);
|
||||||
|
@ -750,7 +756,9 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
|
||||||
SetupComputeConstBuffers(kernel);
|
SetupComputeConstBuffers(kernel);
|
||||||
SetupComputeGlobalMemory(kernel);
|
SetupComputeGlobalMemory(kernel);
|
||||||
|
|
||||||
|
if (!device.HasFastBufferSubData()) {
|
||||||
buffer_cache.Unmap();
|
buffer_cache.Unmap();
|
||||||
|
}
|
||||||
|
|
||||||
bind_ubo_pushbuffer.Bind();
|
bind_ubo_pushbuffer.Bind();
|
||||||
bind_ssbo_pushbuffer.Bind();
|
bind_ssbo_pushbuffer.Bind();
|
||||||
|
@ -879,7 +887,8 @@ void RasterizerOpenGL::SetupConstBuffer(const Tegra::Engines::ConstBufferInfo& b
|
||||||
const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4));
|
const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4));
|
||||||
|
|
||||||
const auto alignment = device.GetUniformBufferAlignment();
|
const auto alignment = device.GetUniformBufferAlignment();
|
||||||
const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment);
|
const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false,
|
||||||
|
device.HasFastBufferSubData());
|
||||||
bind_ubo_pushbuffer.Push(cbuf, offset, size);
|
bind_ubo_pushbuffer.Push(cbuf, offset, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue