mirror of
https://github.com/bitcoin/bitcoin.git
synced 2025-11-12 23:18:14 +01:00
Merge #13386: SHA256 implementations based on Intel SHA Extensions
66b2cf1ccfUse immintrin.h everywhere for intrinsics (Pieter Wuille)4c935e2eeeAdd SHA256 implementation using using Intel SHA intrinsics (Pieter Wuille)268400d318[Refactor] CPU feature detection logic for SHA256 (Pieter Wuille) Pull request description: Based on #13191. This adds SHA256 implementations that use Intel's SHA Extension instructions (using intrinsics). This needs GCC 4.9 or Clang 3.4. In addition to #13191, two extra implementations are provided: * (a) A variable-length SHA256 implementation using SHA extensions. * (b) A 2-way 64-byte input double-SHA256 implementation using SHA extensions. Benchmarks for 9001-element Merkle tree root computation on an AMD Ryzen 1800X system: * Using generic C++ code (pre-#10821): 6.1ms * Using SSE4 (master, #10821): 4.6ms * Using 4-way SSE4 specialized for 64-byte inputs (#13191): 2.8ms * Using 8-way AVX2 specialized for 64-byte inputs (#13191): 2.1ms * Using 2-way SHA-NI specialized for 64-byte inputs (this PR): 0.56ms Benchmarks for 32-byte SHA256 on the same system: * Using SSE4 (master, #10821): 190ns * Using SHA-NI (this PR): 53ns Benchmarks for 1000000-byte SHA256 on the same system: * Using SSE4 (master, #10821): 2.5ms * Using SHA-NI (this PR): 0.51ms Tree-SHA512: 2b319e33b22579f815d91f9daf7994a5e1e799c4f73c13e15070dd54ba71f3f6438ccf77ae9cbd1ce76f972d9cbeb5f0edfea3d86f101bbc1055db70e42743b7
This commit is contained in:
@@ -29,6 +29,16 @@ namespace sha256d64_avx2
|
||||
void Transform_8way(unsigned char* out, const unsigned char* in);
|
||||
}
|
||||
|
||||
namespace sha256d64_shani
|
||||
{
|
||||
void Transform_2way(unsigned char* out, const unsigned char* in);
|
||||
}
|
||||
|
||||
namespace sha256_shani
|
||||
{
|
||||
void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks);
|
||||
}
|
||||
|
||||
// Internal implementation code.
|
||||
namespace
|
||||
{
|
||||
@@ -448,6 +458,7 @@ void TransformD64Wrapper(unsigned char* out, const unsigned char* in)
|
||||
|
||||
TransformType Transform = sha256::Transform;
|
||||
TransformD64Type TransformD64 = sha256::TransformD64;
|
||||
TransformD64Type TransformD64_2way = nullptr;
|
||||
TransformD64Type TransformD64_4way = nullptr;
|
||||
TransformD64Type TransformD64_8way = nullptr;
|
||||
|
||||
@@ -512,6 +523,13 @@ bool SelfTest() {
|
||||
TransformD64(out, data + 1);
|
||||
if (!std::equal(out, out + 32, result_d64)) return false;
|
||||
|
||||
// Test TransformD64_2way, if available.
|
||||
if (TransformD64_2way) {
|
||||
unsigned char out[64];
|
||||
TransformD64_2way(out, data + 1);
|
||||
if (!std::equal(out, out + 64, result_d64)) return false;
|
||||
}
|
||||
|
||||
// Test TransformD64_4way, if available.
|
||||
if (TransformD64_4way) {
|
||||
unsigned char out[128];
|
||||
@@ -556,32 +574,64 @@ std::string SHA256AutoDetect()
|
||||
{
|
||||
std::string ret = "standard";
|
||||
#if defined(USE_ASM) && (defined(__x86_64__) || defined(__amd64__) || defined(__i386__))
|
||||
(void)AVXEnabled; // Silence unused warning (in case ENABLE_AVX2 is not defined)
|
||||
bool have_sse4 = false;
|
||||
bool have_xsave = false;
|
||||
bool have_avx = false;
|
||||
bool have_avx2 = false;
|
||||
bool have_shani = false;
|
||||
bool enabled_avx = false;
|
||||
|
||||
(void)AVXEnabled;
|
||||
(void)have_sse4;
|
||||
(void)have_avx;
|
||||
(void)have_xsave;
|
||||
(void)have_avx2;
|
||||
(void)have_shani;
|
||||
(void)enabled_avx;
|
||||
|
||||
uint32_t eax, ebx, ecx, edx;
|
||||
cpuid(1, 0, eax, ebx, ecx, edx);
|
||||
if ((ecx >> 19) & 1) {
|
||||
have_sse4 = (ecx >> 19) & 1;
|
||||
have_xsave = (ecx >> 27) & 1;
|
||||
have_avx = (ecx >> 28) & 1;
|
||||
if (have_xsave && have_avx) {
|
||||
enabled_avx = AVXEnabled();
|
||||
}
|
||||
if (have_sse4) {
|
||||
cpuid(7, 0, eax, ebx, ecx, edx);
|
||||
have_avx2 = (ebx >> 5) & 1;
|
||||
have_shani = (ebx >> 29) & 1;
|
||||
}
|
||||
|
||||
#if defined(ENABLE_SHANI) && !defined(BUILD_BITCOIN_INTERNAL)
|
||||
if (have_shani) {
|
||||
Transform = sha256_shani::Transform;
|
||||
TransformD64 = TransformD64Wrapper<sha256_shani::Transform>;
|
||||
TransformD64_2way = sha256d64_shani::Transform_2way;
|
||||
ret = "shani(1way,2way)";
|
||||
have_sse4 = false; // Disable SSE4/AVX2;
|
||||
have_avx2 = false;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (have_sse4) {
|
||||
#if defined(__x86_64__) || defined(__amd64__)
|
||||
Transform = sha256_sse4::Transform;
|
||||
TransformD64 = TransformD64Wrapper<sha256_sse4::Transform>;
|
||||
ret = "sse4(1way)";
|
||||
#endif
|
||||
#if defined(ENABLE_SSE41) && !defined(BUILD_BITCOIN_INTERNAL)
|
||||
TransformD64_4way = sha256d64_sse41::Transform_4way;
|
||||
ret = "sse4(1way+4way)";
|
||||
#if defined(ENABLE_AVX2) && !defined(BUILD_BITCOIN_INTERNAL)
|
||||
if (((ecx >> 27) & 1) && ((ecx >> 28) & 1)) { // XSAVE and AVX
|
||||
cpuid(7, 0, eax, ebx, ecx, edx);
|
||||
if ((ebx >> 5) & 1) { // AVX2 flag
|
||||
if (AVXEnabled()) { // OS has enabled AVX registers
|
||||
TransformD64_8way = sha256d64_avx2::Transform_8way;
|
||||
ret += ",avx2(8way)";
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
ret = "sse4";
|
||||
ret += ",sse41(4way)";
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(ENABLE_AVX2) && !defined(BUILD_BITCOIN_INTERNAL)
|
||||
if (have_avx2 && have_avx && enabled_avx) {
|
||||
TransformD64_8way = sha256d64_avx2::Transform_8way;
|
||||
ret += ",avx2(8way)";
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
assert(SelfTest());
|
||||
@@ -663,6 +713,14 @@ void SHA256D64(unsigned char* out, const unsigned char* in, size_t blocks)
|
||||
blocks -= 4;
|
||||
}
|
||||
}
|
||||
if (TransformD64_2way) {
|
||||
while (blocks >= 2) {
|
||||
TransformD64_2way(out, in);
|
||||
out += 64;
|
||||
in += 128;
|
||||
blocks -= 2;
|
||||
}
|
||||
}
|
||||
while (blocks) {
|
||||
TransformD64(out, in);
|
||||
out += 32;
|
||||
|
||||
Reference in New Issue
Block a user