From 42ec5585424ceb91bed07826dde15697c020661a Mon Sep 17 00:00:00 2001
From: Gleb Naumenko <naumenko.gs@gmail.com>
Date: Tue, 11 Aug 2020 10:42:26 +0300
Subject: [PATCH 1/4] Justify the choice of ADDR cache lifetime

---
 src/net.cpp | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)
diff --git a/src/net.cpp b/src/net.cpp
index 883e57bdf00..7841965f601 100644
--- a/src/net.cpp
+++ b/src/net.cpp
@@ -2545,6 +2545,31 @@ std::vector<CAddress> CConnman::GetAddresses(Network requestor_network, size_t m
     if (m_addr_response_caches.find(requestor_network) == m_addr_response_caches.end() ||
         m_addr_response_caches[requestor_network].m_update_addr_response < current_time) {
         m_addr_response_caches[requestor_network].m_addrs_response_cache = GetAddresses(max_addresses, max_pct);
+
+        // Choosing a proper cache lifetime is a trade-off between the privacy leak minimization
+        // and the usefulness of ADDR responses to honest users.
+        //
+        // Longer cache lifetime makes it more difficult for an attacker to scrape
+        // enough AddrMan data to maliciously infer something useful.
+        // By the time an attacker scraped enough AddrMan records, most of
+        // the records should be old enough to not leak topology info by
+        // e.g. analyzing real-time changes in timestamps.
+        //
+        // It takes only several hundred requests to scrape everything from an AddrMan containing 100,000 nodes,
+        // so ~24 hours of cache lifetime indeed makes the data less inferable by the time
+        // most of it could be scraped (considering that timestamps are updated via
+        // ADDR self-announcements and when nodes communicate).
+        // We also should be robust to those attacks which may not require scraping *full* victim's AddrMan
+        // (because even several timestamps of the same handful of nodes may leak privacy).
+        //
+        // On the other hand, longer cache lifetime makes ADDR responses
+        // outdated and less useful for an honest requestor, e.g. if most nodes
+        // in the ADDR response are no longer active.
+        //
+        // However, the churn in the network is known to be rather low. Since we consider
+        // nodes to be "terrible" (see IsTerrible()) if the timestamps are older than 30 days,
+        // max. 24 hours of "penalty" due to cache shouldn't make any meaningful difference
+        // in terms of the freshness of the response.
         m_addr_response_caches[requestor_network].m_update_addr_response = current_time + std::chrono::hours(21) + GetRandMillis(std::chrono::hours(6));
     }
     return m_addr_response_caches[requestor_network].m_addrs_response_cache;

From 81b00f87800f40cb14f2131ff27668bd2bb9e551 Mon Sep 17 00:00:00 2001
From: Gleb Naumenko <naumenko.gs@gmail.com>
Date: Tue, 11 Aug 2020 12:41:26 +0300
Subject: [PATCH 2/4] Add indexing ADDR cache by local socket addr

---
 src/net.cpp            | 20 ++++++++++++++------
 src/net.h              | 12 ++++++++----
 src/net_processing.cpp |  2 +-
 3 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/src/net.cpp b/src/net.cpp
index 7841965f601..e43b4806123 100644
--- a/src/net.cpp
+++ b/src/net.cpp
@@ -94,6 +94,7 @@ const std::string NET_MESSAGE_COMMAND_OTHER = "*other*";
 
 static const uint64_t RANDOMIZER_ID_NETGROUP = 0x6c0edd8036ef4036ULL; // SHA256("netgroup")[0:8]
 static const uint64_t RANDOMIZER_ID_LOCALHOSTNONCE = 0xd93e69e2bbfa5735ULL; // SHA256("localhostnonce")[0:8]
+static const uint64_t RANDOMIZER_ID_ADDRCACHE = 0x1cf2e4ddd306dda9ULL; // SHA256("addrcache")[0:8]
 //
 // Global state variables
 //
@@ -2539,12 +2540,19 @@ std::vector<CAddress> CConnman::GetAddresses(size_t max_addresses, size_t max_pc
     return addresses;
 }
 
-std::vector<CAddress> CConnman::GetAddresses(Network requestor_network, size_t max_addresses, size_t max_pct)
+std::vector<CAddress> CConnman::GetAddresses(CNode& requestor, size_t max_addresses, size_t max_pct)
 {
+    SOCKET socket;
+    WITH_LOCK(requestor.cs_hSocket, socket = requestor.hSocket);
+    auto local_socket_bytes = GetBindAddress(socket).GetAddrBytes();
+    uint64_t cache_id = GetDeterministicRandomizer(RANDOMIZER_ID_ADDRCACHE)
+        .Write(requestor.addr.GetNetwork())
+        .Write(local_socket_bytes.data(), local_socket_bytes.size())
+        .Finalize();
     const auto current_time = GetTime<std::chrono::microseconds>();
-    if (m_addr_response_caches.find(requestor_network) == m_addr_response_caches.end() ||
-        m_addr_response_caches[requestor_network].m_update_addr_response < current_time) {
-        m_addr_response_caches[requestor_network].m_addrs_response_cache = GetAddresses(max_addresses, max_pct);
+    if (m_addr_response_caches.find(cache_id) == m_addr_response_caches.end() ||
+        m_addr_response_caches[cache_id].m_update_addr_response < current_time) {
+        m_addr_response_caches[cache_id].m_addrs_response_cache = GetAddresses(max_addresses, max_pct);
 
         // Choosing a proper cache lifetime is a trade-off between the privacy leak minimization
         // and the usefulness of ADDR responses to honest users.
@@ -2570,9 +2578,9 @@ std::vector<CAddress> CConnman::GetAddresses(Network requestor_network, size_t m
         // nodes to be "terrible" (see IsTerrible()) if the timestamps are older than 30 days,
         // max. 24 hours of "penalty" due to cache shouldn't make any meaningful difference
         // in terms of the freshness of the response.
-        m_addr_response_caches[requestor_network].m_update_addr_response = current_time + std::chrono::hours(21) + GetRandMillis(std::chrono::hours(6));
+        m_addr_response_caches[cache_id].m_update_addr_response = current_time + std::chrono::hours(21) + GetRandMillis(std::chrono::hours(6));
     }
-    return m_addr_response_caches[requestor_network].m_addrs_response_cache;
+    return m_addr_response_caches[cache_id].m_addrs_response_cache;
 }
 
 bool CConnman::AddNode(const std::string& strNode)
diff --git a/src/net.h b/src/net.h
index c72eada3ff4..c9ab579eb24 100644
--- a/src/net.h
+++ b/src/net.h
@@ -269,7 +269,7 @@ public:
      * A non-malicious call (from RPC or a peer with addr permission) should
      * call the function without a parameter to avoid using the cache.
      */
-    std::vector<CAddress> GetAddresses(Network requestor_network, size_t max_addresses, size_t max_pct);
+    std::vector<CAddress> GetAddresses(CNode& requestor, size_t max_addresses, size_t max_pct);
 
     // This allows temporarily exceeding m_max_outbound_full_relay, with the goal of finding
     // a peer that is better than all our current peers.
@@ -447,15 +447,19 @@ private:
 
     /**
      * Addr responses stored in different caches
-     * per network prevent cross-network node identification.
+     * per (network, local socket) prevent cross-network node identification.
      * If a node for example is multi-homed under Tor and IPv6,
      * a single cache (or no cache at all) would let an attacker
      * to easily detect that it is the same node by comparing responses.
+     * Indexing by local socket prevents leakage when a node has multiple
+     * listening addresses on the same network.
+     *
      * The used memory equals to 1000 CAddress records (or around 32 bytes) per
      * distinct Network (up to 5) we have/had an inbound peer from,
-     * resulting in at most ~160 KB.
+     * resulting in at most ~160 KB. Every separate local socket may
+     * add up to ~160 KB extra.
      */
-    std::map<Network, CachedAddrResponse> m_addr_response_caches;
+    std::map<uint64_t, CachedAddrResponse> m_addr_response_caches;
 
     /**
      * Services this instance offers.
diff --git a/src/net_processing.cpp b/src/net_processing.cpp
index 60bdfbe9f5d..bf359a0d686 100644
--- a/src/net_processing.cpp
+++ b/src/net_processing.cpp
@@ -3516,7 +3516,7 @@ void PeerLogicValidation::ProcessMessage(CNode& pfrom, const std::string& msg_ty
         if (pfrom.HasPermission(PF_ADDR)) {
             vAddr = m_connman.GetAddresses(MAX_ADDR_TO_SEND, MAX_PCT_ADDR_TO_SEND);
         } else {
-            vAddr = m_connman.GetAddresses(pfrom.addr.GetNetwork(), MAX_ADDR_TO_SEND, MAX_PCT_ADDR_TO_SEND);
+            vAddr = m_connman.GetAddresses(pfrom, MAX_ADDR_TO_SEND, MAX_PCT_ADDR_TO_SEND);
         }
         FastRandomContext insecure_rand;
         for (const CAddress &addr : vAddr) {

From 83ad65f31b5c9441ae1618614082e584854a14e1 Mon Sep 17 00:00:00 2001
From: Gleb Naumenko <naumenko.gs@gmail.com>
Date: Tue, 11 Aug 2020 13:39:56 +0300
Subject: [PATCH 3/4] Address nits in ADDR caching

---
 src/net.cpp | 12 ++++++------
 src/net.h   |  8 ++++----
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/net.cpp b/src/net.cpp
index e43b4806123..8ac45dbcb59 100644
--- a/src/net.cpp
+++ b/src/net.cpp
@@ -2550,10 +2550,10 @@ std::vector<CAddress> CConnman::GetAddresses(CNode& requestor, size_t max_addres
         .Write(local_socket_bytes.data(), local_socket_bytes.size())
         .Finalize();
     const auto current_time = GetTime<std::chrono::microseconds>();
-    if (m_addr_response_caches.find(cache_id) == m_addr_response_caches.end() ||
-        m_addr_response_caches[cache_id].m_update_addr_response < current_time) {
-        m_addr_response_caches[cache_id].m_addrs_response_cache = GetAddresses(max_addresses, max_pct);
-
+    auto r = m_addr_response_caches.emplace(cache_id, CachedAddrResponse{});
+    CachedAddrResponse& cache_entry = r.first->second;
+    if (cache_entry.m_cache_entry_expiration < current_time) { // If emplace() added new one it has expiration 0.
+        cache_entry.m_addrs_response_cache = GetAddresses(max_addresses, max_pct);
         // Choosing a proper cache lifetime is a trade-off between the privacy leak minimization
         // and the usefulness of ADDR responses to honest users.
         //
@@ -2578,9 +2578,9 @@ std::vector<CAddress> CConnman::GetAddresses(CNode& requestor, size_t max_addres
         // nodes to be "terrible" (see IsTerrible()) if the timestamps are older than 30 days,
         // max. 24 hours of "penalty" due to cache shouldn't make any meaningful difference
         // in terms of the freshness of the response.
-        m_addr_response_caches[cache_id].m_update_addr_response = current_time + std::chrono::hours(21) + GetRandMillis(std::chrono::hours(6));
+        cache_entry.m_cache_entry_expiration = current_time + std::chrono::hours(21) + GetRandMillis(std::chrono::hours(6));
     }
-    return m_addr_response_caches[cache_id].m_addrs_response_cache;
+    return cache_entry.m_addrs_response_cache;
 }
 
 bool CConnman::AddNode(const std::string& strNode)
diff --git a/src/net.h b/src/net.h
index c9ab579eb24..21faea591ac 100644
--- a/src/net.h
+++ b/src/net.h
@@ -442,7 +442,7 @@ private:
      */
     struct CachedAddrResponse {
         std::vector<CAddress> m_addrs_response_cache;
-        std::chrono::microseconds m_update_addr_response{0};
+        std::chrono::microseconds m_cache_entry_expiration{0};
     };
 
     /**
@@ -454,10 +454,10 @@ private:
      * Indexing by local socket prevents leakage when a node has multiple
      * listening addresses on the same network.
      *
-     * The used memory equals to 1000 CAddress records (or around 32 bytes) per
+     * The used memory equals to 1000 CAddress records (or around 40 bytes) per
      * distinct Network (up to 5) we have/had an inbound peer from,
-     * resulting in at most ~160 KB. Every separate local socket may
-     * add up to ~160 KB extra.
+     * resulting in at most ~196 KB. Every separate local socket may
+     * add up to ~196 KB extra.
      */
     std::map<uint64_t, CachedAddrResponse> m_addr_response_caches;
 

From 0d04784af151de249bbbcbad51e6e8ad9af8f5a3 Mon Sep 17 00:00:00 2001
From: Gleb Naumenko <naumenko.gs@gmail.com>
Date: Sun, 23 Aug 2020 17:31:45 +0300
Subject: [PATCH 4/4] Refactor the functional test

---
 test/functional/p2p_getaddr_caching.py | 47 ++++++++------------------
 1 file changed, 14 insertions(+), 33 deletions(-)

diff --git a/test/functional/p2p_getaddr_caching.py b/test/functional/p2p_getaddr_caching.py
index 6622ea9ec20..2b75ad51754 100755
--- a/test/functional/p2p_getaddr_caching.py
+++ b/test/functional/p2p_getaddr_caching.py
@@ -5,13 +5,8 @@
 """Test addr response caching"""
 
 import time
-from test_framework.messages import (
-    CAddress,
-    NODE_NETWORK,
-    NODE_WITNESS,
-    msg_addr,
-    msg_getaddr,
-)
+
+from test_framework.messages import msg_getaddr
 from test_framework.p2p import (
     P2PInterface,
     p2p_lock
@@ -21,21 +16,9 @@ from test_framework.util import (
     assert_equal,
 )
 
+# As defined in net_processing.
 MAX_ADDR_TO_SEND = 1000
-
-def gen_addrs(n):
-    addrs = []
-    for i in range(n):
-        addr = CAddress()
-        addr.time = int(time.time())
-        addr.nServices = NODE_NETWORK | NODE_WITNESS
-        # Use first octets to occupy different AddrMan buckets
-        first_octet = i >> 8
-        second_octet = i % 256
-        addr.ip = "{}.{}.1.1".format(first_octet, second_octet)
-        addr.port = 8333
-        addrs.append(addr)
-    return addrs
+MAX_PCT_ADDR_TO_SEND = 23
 
 class AddrReceiver(P2PInterface):
 
@@ -62,18 +45,16 @@ class AddrTest(BitcoinTestFramework):
         self.num_nodes = 1
 
     def run_test(self):
-        self.log.info('Create connection that sends and requests addr messages')
-        addr_source = self.nodes[0].add_p2p_connection(P2PInterface())
-
-        msg_send_addrs = msg_addr()
         self.log.info('Fill peer AddrMan with a lot of records')
-        # Since these addrs are sent from the same source, not all of them will be stored,
-        # because we allocate a limited number of AddrMan buckets per addr source.
-        total_addrs = 10000
-        addrs = gen_addrs(total_addrs)
-        for i in range(int(total_addrs/MAX_ADDR_TO_SEND)):
-            msg_send_addrs.addrs = addrs[i * MAX_ADDR_TO_SEND:(i + 1) * MAX_ADDR_TO_SEND]
-            addr_source.send_and_ping(msg_send_addrs)
+        for i in range(10000):
+            first_octet = i >> 8
+            second_octet = i % 256
+            a = "{}.{}.1.1".format(first_octet, second_octet)
+            self.nodes[0].addpeeraddress(a, 8333)
+
+        # Need to make sure we hit MAX_ADDR_TO_SEND records in the addr response later because
+        # only a fraction of all known addresses can be cached and returned.
+        assert(len(self.nodes[0].getnodeaddresses(0)) > int(MAX_ADDR_TO_SEND / (MAX_PCT_ADDR_TO_SEND / 100)))
 
         responses = []
         self.log.info('Send many addr requests within short time to receive same response')
@@ -89,7 +70,7 @@ class AddrTest(BitcoinTestFramework):
             responses.append(addr_receiver.get_received_addrs())
         for response in responses[1:]:
             assert_equal(response, responses[0])
-        assert(len(response) < MAX_ADDR_TO_SEND)
+        assert(len(response) == MAX_ADDR_TO_SEND)
 
         cur_mock_time += 3 * 24 * 60 * 60
         self.nodes[0].setmocktime(cur_mock_time)