From d4f871cb6a616f6b1a8a76049e042118571f3dd3 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Fri, 1 Jan 2021 23:28:55 +0100 Subject: [PATCH 1/2] X86/NativeClock: Improve performance of clock calculations on hot path. --- src/common/x64/native_clock.cpp | 69 ++++++++++++++++++++++++++++++--- src/common/x64/native_clock.h | 7 ++++ 2 files changed, 71 insertions(+), 5 deletions(-) diff --git a/src/common/x64/native_clock.cpp b/src/common/x64/native_clock.cpp index eb8a7782f..e246432d0 100644 --- a/src/common/x64/native_clock.cpp +++ b/src/common/x64/native_clock.cpp @@ -2,12 +2,17 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include #include +#include #include #include #ifdef _MSC_VER #include + +#pragma intrinsic(__umulh) +#pragma intrinsic(_udiv128) #else #include #endif @@ -15,6 +20,55 @@ #include "common/uint128.h" #include "common/x64/native_clock.h" +namespace { + +[[nodiscard]] u64 GetFixedPoint64Factor(u64 numerator, u64 divisor) { +#ifdef __SIZEOF_INT128__ + const auto base = static_cast(numerator) << 64ULL; + return static_cast(base / divisor); +#elif defined(_M_X64) || defined(_M_ARM64) + std::array r = {0, numerator}; + u64 remainder; +#if _MSC_VER < 1923 + return udiv128(r[1], r[0], divisor, &remainder); +#else + return _udiv128(r[1], r[0], divisor, &remainder); +#endif +#else + // This one is bit more inaccurate. 
+ return MultiplyAndDivide64(std::numeric_limits::max(), numerator, divisor); +#endif +} + +[[nodiscard]] u64 MultiplyHigh(u64 a, u64 b) { +#ifdef __SIZEOF_INT128__ + return (static_cast(a) * static_cast(b)) >> 64; +#elif defined(_M_X64) || defined(_M_ARM64) + return __umulh(a, b); // MSVC +#else + // Generic fallback + const u64 a_lo = u32(a); + const u64 a_hi = a >> 32; + const u64 b_lo = u32(b); + const u64 b_hi = b >> 32; + + const u64 a_x_b_hi = a_hi * b_hi; + const u64 a_x_b_mid = a_hi * b_lo; + const u64 b_x_a_mid = b_hi * a_lo; + const u64 a_x_b_lo = a_lo * b_lo; + + const u64 carry_bit = (static_cast(static_cast(a_x_b_mid)) + + static_cast(static_cast(b_x_a_mid)) + (a_x_b_lo >> 32)) >> + 32; + + const u64 multhi = a_x_b_hi + (a_x_b_mid >> 32) + (b_x_a_mid >> 32) + carry_bit; + + return multhi; +#endif +} + +} // namespace + namespace Common { u64 EstimateRDTSCFrequency() { @@ -50,6 +104,11 @@ NativeClock::NativeClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequen _mm_mfence(); last_measure = __rdtsc(); accumulated_ticks = 0U; + ns_rtsc_factor = GetFixedPoint64Factor(1000000000, rtsc_frequency); + us_rtsc_factor = GetFixedPoint64Factor(1000000, rtsc_frequency); + ms_rtsc_factor = GetFixedPoint64Factor(1000, rtsc_frequency); + clock_rtsc_factor = GetFixedPoint64Factor(emulated_clock_frequency, rtsc_frequency); + cpu_rtsc_factor = GetFixedPoint64Factor(emulated_cpu_frequency, rtsc_frequency); } u64 NativeClock::GetRTSC() { @@ -75,27 +134,27 @@ void NativeClock::Pause(bool is_paused) { std::chrono::nanoseconds NativeClock::GetTimeNS() { const u64 rtsc_value = GetRTSC(); - return std::chrono::nanoseconds{MultiplyAndDivide64(rtsc_value, 1000000000, rtsc_frequency)}; + return std::chrono::nanoseconds{MultiplyHigh(rtsc_value, ns_rtsc_factor)}; } std::chrono::microseconds NativeClock::GetTimeUS() { const u64 rtsc_value = GetRTSC(); - return std::chrono::microseconds{MultiplyAndDivide64(rtsc_value, 1000000, rtsc_frequency)}; + return 
std::chrono::microseconds{MultiplyHigh(rtsc_value, us_rtsc_factor)}; } std::chrono::milliseconds NativeClock::GetTimeMS() { const u64 rtsc_value = GetRTSC(); - return std::chrono::milliseconds{MultiplyAndDivide64(rtsc_value, 1000, rtsc_frequency)}; + return std::chrono::milliseconds{MultiplyHigh(rtsc_value, ms_rtsc_factor)}; } u64 NativeClock::GetClockCycles() { const u64 rtsc_value = GetRTSC(); - return MultiplyAndDivide64(rtsc_value, emulated_clock_frequency, rtsc_frequency); + return MultiplyHigh(rtsc_value, clock_rtsc_factor); } u64 NativeClock::GetCPUCycles() { const u64 rtsc_value = GetRTSC(); - return MultiplyAndDivide64(rtsc_value, emulated_cpu_frequency, rtsc_frequency); + return MultiplyHigh(rtsc_value, cpu_rtsc_factor); } } // namespace X64 diff --git a/src/common/x64/native_clock.h b/src/common/x64/native_clock.h index 6d1e32ac8..a7b1ee9e0 100644 --- a/src/common/x64/native_clock.h +++ b/src/common/x64/native_clock.h @@ -41,6 +41,13 @@ private: u64 last_measure{}; u64 accumulated_ticks{}; u64 rtsc_frequency; + + // factors + u64 ns_rtsc_factor{}; + u64 us_rtsc_factor{}; + u64 ms_rtsc_factor{}; + u64 clock_rtsc_factor{}; + u64 cpu_rtsc_factor{}; }; } // namespace X64 From 53d92318b82cd4a9e08f814fcb8aab624d795c6c Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sat, 2 Jan 2021 02:24:49 +0100 Subject: [PATCH 2/2] X86/NativeClock: Reimplement RDTSC access to be lock free. 
--- src/common/CMakeLists.txt | 1 - src/common/atomic_ops.cpp | 75 --------------------------------- src/common/atomic_ops.h | 71 ++++++++++++++++++++++++++++--- src/common/x64/native_clock.cpp | 41 ++++++++++++------ src/common/x64/native_clock.h | 22 ++++++---- 5 files changed, 107 insertions(+), 103 deletions(-) delete mode 100644 src/common/atomic_ops.cpp diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index 2c2bd2ee8..abe62543e 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -98,7 +98,6 @@ add_library(common STATIC algorithm.h alignment.h assert.h - atomic_ops.cpp atomic_ops.h detached_tasks.cpp detached_tasks.h diff --git a/src/common/atomic_ops.cpp b/src/common/atomic_ops.cpp deleted file mode 100644 index 1612d0e67..000000000 --- a/src/common/atomic_ops.cpp +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright 2020 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include - -#include "common/atomic_ops.h" - -#if _MSC_VER -#include -#endif - -namespace Common { - -#if _MSC_VER - -bool AtomicCompareAndSwap(volatile u8* pointer, u8 value, u8 expected) { - const u8 result = - _InterlockedCompareExchange8(reinterpret_cast(pointer), value, expected); - return result == expected; -} - -bool AtomicCompareAndSwap(volatile u16* pointer, u16 value, u16 expected) { - const u16 result = - _InterlockedCompareExchange16(reinterpret_cast(pointer), value, expected); - return result == expected; -} - -bool AtomicCompareAndSwap(volatile u32* pointer, u32 value, u32 expected) { - const u32 result = - _InterlockedCompareExchange(reinterpret_cast(pointer), value, expected); - return result == expected; -} - -bool AtomicCompareAndSwap(volatile u64* pointer, u64 value, u64 expected) { - const u64 result = _InterlockedCompareExchange64(reinterpret_cast(pointer), - value, expected); - return result == expected; -} - -bool AtomicCompareAndSwap(volatile u64* pointer, u128 value, u128 
expected) { - return _InterlockedCompareExchange128(reinterpret_cast(pointer), value[1], - value[0], - reinterpret_cast<__int64*>(expected.data())) != 0; -} - -#else - -bool AtomicCompareAndSwap(volatile u8* pointer, u8 value, u8 expected) { - return __sync_bool_compare_and_swap(pointer, expected, value); -} - -bool AtomicCompareAndSwap(volatile u16* pointer, u16 value, u16 expected) { - return __sync_bool_compare_and_swap(pointer, expected, value); -} - -bool AtomicCompareAndSwap(volatile u32* pointer, u32 value, u32 expected) { - return __sync_bool_compare_and_swap(pointer, expected, value); -} - -bool AtomicCompareAndSwap(volatile u64* pointer, u64 value, u64 expected) { - return __sync_bool_compare_and_swap(pointer, expected, value); -} - -bool AtomicCompareAndSwap(volatile u64* pointer, u128 value, u128 expected) { - unsigned __int128 value_a; - unsigned __int128 expected_a; - std::memcpy(&value_a, value.data(), sizeof(u128)); - std::memcpy(&expected_a, expected.data(), sizeof(u128)); - return __sync_bool_compare_and_swap((unsigned __int128*)pointer, expected_a, value_a); -} - -#endif - -} // namespace Common diff --git a/src/common/atomic_ops.h b/src/common/atomic_ops.h index b46888589..2b1f515e8 100644 --- a/src/common/atomic_ops.h +++ b/src/common/atomic_ops.h @@ -4,14 +4,75 @@ #pragma once +#include +#include + #include "common/common_types.h" +#if _MSC_VER +#include +#endif + namespace Common { -[[nodiscard]] bool AtomicCompareAndSwap(volatile u8* pointer, u8 value, u8 expected); -[[nodiscard]] bool AtomicCompareAndSwap(volatile u16* pointer, u16 value, u16 expected); -[[nodiscard]] bool AtomicCompareAndSwap(volatile u32* pointer, u32 value, u32 expected); -[[nodiscard]] bool AtomicCompareAndSwap(volatile u64* pointer, u64 value, u64 expected); -[[nodiscard]] bool AtomicCompareAndSwap(volatile u64* pointer, u128 value, u128 expected); +#if _MSC_VER + +[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u8* pointer, u8 value, u8 expected) { + const u8 
result = + _InterlockedCompareExchange8(reinterpret_cast(pointer), value, expected); + return result == expected; +} + +[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u16* pointer, u16 value, u16 expected) { + const u16 result = + _InterlockedCompareExchange16(reinterpret_cast(pointer), value, expected); + return result == expected; +} + +[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u32* pointer, u32 value, u32 expected) { + const u32 result = + _InterlockedCompareExchange(reinterpret_cast(pointer), value, expected); + return result == expected; +} + +[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u64* pointer, u64 value, u64 expected) { + const u64 result = _InterlockedCompareExchange64(reinterpret_cast(pointer), + value, expected); + return result == expected; +} + +[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u64* pointer, u128 value, u128 expected) { + return _InterlockedCompareExchange128(reinterpret_cast(pointer), value[1], + value[0], + reinterpret_cast<__int64*>(expected.data())) != 0; +} + +#else + +[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u8* pointer, u8 value, u8 expected) { + return __sync_bool_compare_and_swap(pointer, expected, value); +} + +[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u16* pointer, u16 value, u16 expected) { + return __sync_bool_compare_and_swap(pointer, expected, value); +} + +[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u32* pointer, u32 value, u32 expected) { + return __sync_bool_compare_and_swap(pointer, expected, value); +} + +[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u64* pointer, u64 value, u64 expected) { + return __sync_bool_compare_and_swap(pointer, expected, value); +} + +[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u64* pointer, u128 value, u128 expected) { + unsigned __int128 value_a; + unsigned __int128 expected_a; + std::memcpy(&value_a, value.data(), sizeof(u128)); + std::memcpy(&expected_a, expected.data(), 
sizeof(u128)); + return __sync_bool_compare_and_swap((unsigned __int128*)pointer, expected_a, value_a); +} + +#endif } // namespace Common diff --git a/src/common/x64/native_clock.cpp b/src/common/x64/native_clock.cpp index e246432d0..a65f6b832 100644 --- a/src/common/x64/native_clock.cpp +++ b/src/common/x64/native_clock.cpp @@ -17,6 +17,7 @@ #include #endif +#include "common/atomic_ops.h" #include "common/uint128.h" #include "common/x64/native_clock.h" @@ -102,8 +103,8 @@ NativeClock::NativeClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequen : WallClock(emulated_cpu_frequency_, emulated_clock_frequency_, true), rtsc_frequency{ rtsc_frequency_} { _mm_mfence(); - last_measure = __rdtsc(); - accumulated_ticks = 0U; + time_point.inner.last_measure = __rdtsc(); + time_point.inner.accumulated_ticks = 0U; ns_rtsc_factor = GetFixedPoint64Factor(1000000000, rtsc_frequency); us_rtsc_factor = GetFixedPoint64Factor(1000000, rtsc_frequency); ms_rtsc_factor = GetFixedPoint64Factor(1000, rtsc_frequency); @@ -112,23 +113,35 @@ NativeClock::NativeClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequen } u64 NativeClock::GetRTSC() { - std::scoped_lock scope{rtsc_serialize}; - _mm_mfence(); - const u64 current_measure = __rdtsc(); - u64 diff = current_measure - last_measure; - diff = diff & ~static_cast(static_cast(diff) >> 63); // max(diff, 0) - if (current_measure > last_measure) { - last_measure = current_measure; - } - accumulated_ticks += diff; + TimePoint new_time_point{}; + TimePoint current_time_point{}; + do { + current_time_point.pack = time_point.pack; + _mm_mfence(); + const u64 current_measure = __rdtsc(); + u64 diff = current_measure - current_time_point.inner.last_measure; + diff = diff & ~static_cast(static_cast(diff) >> 63); // max(diff, 0) + new_time_point.inner.last_measure = current_measure > current_time_point.inner.last_measure + ? 
current_measure + : current_time_point.inner.last_measure; + new_time_point.inner.accumulated_ticks = current_time_point.inner.accumulated_ticks + diff; + } while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack, + current_time_point.pack)); /// The clock cannot be more precise than the guest timer, remove the lower bits - return accumulated_ticks & inaccuracy_mask; + return new_time_point.inner.accumulated_ticks & inaccuracy_mask; } void NativeClock::Pause(bool is_paused) { if (!is_paused) { - _mm_mfence(); - last_measure = __rdtsc(); + TimePoint current_time_point{}; + TimePoint new_time_point{}; + do { + current_time_point.pack = time_point.pack; + new_time_point.pack = current_time_point.pack; + _mm_mfence(); + new_time_point.inner.last_measure = __rdtsc(); + } while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack, + current_time_point.pack)); } } diff --git a/src/common/x64/native_clock.h b/src/common/x64/native_clock.h index a7b1ee9e0..7cbd400d2 100644 --- a/src/common/x64/native_clock.h +++ b/src/common/x64/native_clock.h @@ -6,7 +6,6 @@ #include -#include "common/spin_lock.h" #include "common/wall_clock.h" namespace Common { @@ -32,22 +31,29 @@ public: private: u64 GetRTSC(); + union alignas(16) TimePoint { + TimePoint() : pack{} {} + u128 pack{}; + struct Inner { + u64 last_measure{}; + u64 accumulated_ticks{}; + } inner; + }; + /// value used to reduce the native clocks accuracy as some apss rely on /// undefined behavior where the level of accuracy in the clock shouldn't /// be higher. static constexpr u64 inaccuracy_mask = ~(UINT64_C(0x400) - 1); - SpinLock rtsc_serialize{}; - u64 last_measure{}; - u64 accumulated_ticks{}; - u64 rtsc_frequency; - + TimePoint time_point; // factors + u64 clock_rtsc_factor{}; + u64 cpu_rtsc_factor{}; u64 ns_rtsc_factor{}; u64 us_rtsc_factor{}; u64 ms_rtsc_factor{}; - u64 clock_rtsc_factor{}; - u64 cpu_rtsc_factor{}; + + u64 rtsc_frequency; }; } // namespace X64