Merge bitcoin/bitcoin#24946: Unroll the ChaCha20 inner loop for performance

81c09ee45c Unroll the ChaCha20 inner loop for performance (Pieter Wuille)

Pull request description:

  Unrolling the inner ChaCha20 loop gives a ~15% speedup for me in the CHACHA20_* benchmarks. It's a simple change; the performance gain helps with RNG generation and will matter more for BIP324.

ACKs for top commit:
  martinus:
    tested ACK 81c09ee with clang++ 13.0.1, using the `CHACHA20_1MB` benchmark.
  MarcoFalke:
    ACK 81c09ee45c 🍟

Tree-SHA512: 108bd0ba573bb08de92d611e7be7c09a2c2700f9655f44129b87f9b71f7e101dfc6bd345783e7b4b9b40f0b003913cf59187f422da8cdb5b20887f7855b2611a
This commit is contained in:
MacroFake
2022-05-09 13:56:32 +02:00

View File

@ -18,6 +18,8 @@ constexpr static inline uint32_t rotl32(uint32_t v, int c) { return (v << c) | (
a += b; d = rotl32(d ^ a, 8); \
c += d; b = rotl32(b ^ c, 7);
// Expands the statement sequence `a` ten times back to back, each copy inside
// its own brace scope. The do { ... } while(0) wrapper makes the whole
// expansion act as a single statement, so call sites end it with a semicolon.
#define REPEAT10(a) do { {a}; {a}; {a}; {a}; {a}; {a}; {a}; {a}; {a}; {a}; } while(0)
// 16-character ASCII constants (each array also holds the implicit trailing NUL).
// NOTE(review): presumably the ChaCha20 setup constants for 32-byte and 16-byte
// keys respectively; their use is outside this view, so confirm against the key setup code.
static const unsigned char sigma[] = "expand 32-byte k";
static const unsigned char tau[] = "expand 16-byte k";
@ -119,16 +121,19 @@ void ChaCha20::Keystream(unsigned char* c, size_t bytes)
x13 = j13;
x14 = j14;
x15 = j15;
for (i = 20;i > 0;i -= 2) {
QUARTERROUND( x0, x4, x8,x12)
QUARTERROUND( x1, x5, x9,x13)
QUARTERROUND( x2, x6,x10,x14)
QUARTERROUND( x3, x7,x11,x15)
QUARTERROUND( x0, x5,x10,x15)
QUARTERROUND( x1, x6,x11,x12)
QUARTERROUND( x2, x7, x8,x13)
QUARTERROUND( x3, x4, x9,x14)
}
// The 20 inner ChaCha20 rounds are unrolled here for performance.
REPEAT10(
QUARTERROUND( x0, x4, x8,x12);
QUARTERROUND( x1, x5, x9,x13);
QUARTERROUND( x2, x6,x10,x14);
QUARTERROUND( x3, x7,x11,x15);
QUARTERROUND( x0, x5,x10,x15);
QUARTERROUND( x1, x6,x11,x12);
QUARTERROUND( x2, x7, x8,x13);
QUARTERROUND( x3, x4, x9,x14);
);
x0 += j0;
x1 += j1;
x2 += j2;
@ -231,16 +236,19 @@ void ChaCha20::Crypt(const unsigned char* m, unsigned char* c, size_t bytes)
x13 = j13;
x14 = j14;
x15 = j15;
for (i = 20;i > 0;i -= 2) {
QUARTERROUND( x0, x4, x8,x12)
QUARTERROUND( x1, x5, x9,x13)
QUARTERROUND( x2, x6,x10,x14)
QUARTERROUND( x3, x7,x11,x15)
QUARTERROUND( x0, x5,x10,x15)
QUARTERROUND( x1, x6,x11,x12)
QUARTERROUND( x2, x7, x8,x13)
QUARTERROUND( x3, x4, x9,x14)
}
// The 20 inner ChaCha20 rounds are unrolled here for performance.
REPEAT10(
QUARTERROUND( x0, x4, x8,x12);
QUARTERROUND( x1, x5, x9,x13);
QUARTERROUND( x2, x6,x10,x14);
QUARTERROUND( x3, x7,x11,x15);
QUARTERROUND( x0, x5,x10,x15);
QUARTERROUND( x1, x6,x11,x12);
QUARTERROUND( x2, x7, x8,x13);
QUARTERROUND( x3, x4, x9,x14);
);
x0 += j0;
x1 += j1;
x2 += j2;