mirror of
https://github.com/bitcoin/bitcoin.git
synced 2025-08-27 20:11:47 +02:00
optimization: peel align-head and unroll body to 64 bytes
Benchmarks indicated that obfuscating multiple bytes at a time already gives an order-of-magnitude speed-up, but:
* GCC still emitted scalar code;
* Clang’s auto-vectorized loop ran on the slow unaligned-load path.

The fix consists of:
* peeling the misaligned head, so that the hot loop starts at an 8-byte-aligned address;
* `std::assume_aligned<8>`, which tells the optimizer that the alignment promise holds - required to keep Apple Clang happy;
* manually unrolling the body to 64 bytes, which enabled GCC to auto-vectorize.

Note that the `target.size() > KEY_SIZE` condition is just an optimization; the aligned and unaligned loops work without it as well - which is why the alignment calculation still contains `std::min`.

> C++ compiler .......................... GNU 14.2.0

|             ns/byte |              byte/s |    err% |        ins/byte |        cyc/byte |    IPC |       bra/byte |   miss% |     total | benchmark
|--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|---------------:|--------:|----------:|:----------
|                0.03 |   32,464,658,919.11 |    0.0% |            0.50 |            0.11 |  4.474 |           0.08 |    0.0% |      5.29 | `ObfuscationBench`

> C++ compiler .......................... Clang 20.1.7

|             ns/byte |              byte/s |    err% |        ins/byte |        cyc/byte |    IPC |       bra/byte |   miss% |     total | benchmark
|--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|---------------:|--------:|----------:|:----------
|                0.02 |   41,231,547,045.17 |    0.0% |            0.30 |            0.09 |  3.463 |           0.02 |    0.0% |      5.47 | `ObfuscationBench`

Co-authored-by: Hodlinator <172445034+hodlinator@users.noreply.github.com>
This commit is contained in:
@@ -14,6 +14,7 @@
|
|||||||
#include <bit>
|
#include <bit>
|
||||||
#include <climits>
|
#include <climits>
|
||||||
#include <ios>
|
#include <ios>
|
||||||
|
#include <memory>
|
||||||
|
|
||||||
class Obfuscation
|
class Obfuscation
|
||||||
{
|
{
|
||||||
@@ -33,10 +34,27 @@ public:
|
|||||||
{
|
{
|
||||||
if (!*this) return;
|
if (!*this) return;
|
||||||
|
|
||||||
const KeyType rot_key{m_rotations[key_offset % KEY_SIZE]}; // Continue obfuscation from where we left off
|
KeyType rot_key{m_rotations[key_offset % KEY_SIZE]}; // Continue obfuscation from where we left off
|
||||||
|
if (target.size() > KEY_SIZE) {
|
||||||
|
// Obfuscate until 64-bit alignment boundary
|
||||||
|
if (const auto misalign{std::bit_cast<uintptr_t>(target.data()) % KEY_SIZE}) {
|
||||||
|
const size_t alignment{std::min(KEY_SIZE - misalign, target.size())};
|
||||||
|
XorWord(target.first(alignment), rot_key);
|
||||||
|
|
||||||
|
target = {std::assume_aligned<KEY_SIZE>(target.data() + alignment), target.size() - alignment};
|
||||||
|
rot_key = m_rotations[(key_offset + alignment) % KEY_SIZE];
|
||||||
|
}
|
||||||
|
// Aligned obfuscation in 64-byte chunks
|
||||||
|
for (constexpr auto unroll{8}; target.size() >= KEY_SIZE * unroll; target = target.subspan(KEY_SIZE * unroll)) {
|
||||||
|
for (size_t i{0}; i < unroll; ++i) {
|
||||||
|
XorWord(target.subspan(i * KEY_SIZE, KEY_SIZE), rot_key);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Aligned obfuscation in 64-bit chunks
|
||||||
for (; target.size() >= KEY_SIZE; target = target.subspan(KEY_SIZE)) {
|
for (; target.size() >= KEY_SIZE; target = target.subspan(KEY_SIZE)) {
|
||||||
XorWord(target.first<KEY_SIZE>(), rot_key);
|
XorWord(target.first<KEY_SIZE>(), rot_key);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
XorWord(target, rot_key);
|
XorWord(target, rot_key);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user