feat: Add trimmed mean filtering for exchange rate outlier detection (#3206)

Co-authored-by: Vlad Stan <stan.v.vlad@gmail.com>
2025-12-04 17:51:07 +01:00 · 2025-06-25 11:16:00 +01:00
parent efcc5e2148
commit edb7d66efc
2 changed files with 177 additions and 1 deletions
--- a/lnbits/utils/exchange_rates.py
+++ b/lnbits/utils/exchange_rates.py
@@ -1,4 +1,5 @@
 import asyncio
+import statistics
 from typing import Optional

 import httpx
@@ -187,6 +188,54 @@ def allowed_currencies() -> list[str]:
    return list(currencies.keys())


+def apply_trimmed_mean_filter(
+    rates: list[tuple[str, float]], threshold_percentage: float = 0.01
+) -> list[tuple[str, float]]:
+    """
+    Remove outliers from exchange rates by comparing each rate to the median.
+
+    Despite the name, no mean is computed here: a rate is kept when its
+    relative distance from the median of all rates is within the threshold.
+
+    Args:
+        rates: List of (provider_name, rate_value) tuples
+        threshold_percentage: Maximum allowed relative deviation from the
+            median, expressed as a fraction (default 0.01, i.e. 1%)
+
+    Returns:
+        - `rates` unchanged when there are fewer than 3 rates, or when the
+          median is 0 (a relative deviation cannot be computed)
+        - the rates within the threshold, in their original order, when at
+          least 2 of them remain
+        - otherwise a single-item list with the rate closest to the median
+    """
+    if len(rates) < 3:
+        # Need at least 3 rates to apply filtering
+        return rates
+
+    median_value = statistics.median(value for _, value in rates)
+    if median_value == 0:
+        # Dividing by a zero median would raise ZeroDivisionError
+        logger.debug("Median rate is 0, skipping outlier filtering")
+        return rates
+
+    # abs() keeps the deviation non-negative even for a negative median, which
+    # would otherwise make every rate pass the threshold check
+    scale = abs(median_value)
+
+    # Filter out values that are more than threshold_percentage away from median
+    filtered_rates = []
+    for rate in rates:
+        provider_name, value = rate
+        deviation = abs(value - median_value) / scale
+        if deviation <= threshold_percentage:
+            logger.debug(
+                f"Keeping {provider_name}: {value} (deviation: {deviation:.4f})"
+            )
+            filtered_rates.append(rate)
+        else:
+            logger.debug(
+                f"Removing outlier {provider_name}: {value} "
+                f"(deviation: {deviation:.4f})"
+            )
+
+    # If we still have at least 2 rates after filtering, use them
+    if len(filtered_rates) >= 2:
+        logger.debug(f"Filtered rates: {filtered_rates}")
+        return filtered_rates
+
+    # Filtering removed too many values: fall back to the single rate closest
+    # to the median (on a tie, min() keeps the first one in `rates`)
+    logger.debug("Filtering removed too many values, using median instead")
+    closest_rate = min(rates, key=lambda x: abs(x[1] - median_value))
+    return [closest_rate]
+
+
 async def btc_rates(currency: str) -> list[tuple[str, float]]:
    if currency.upper() not in allowed_currencies():
        raise ValueError(f"Currency '{currency}' not allowed.")
@@ -236,7 +285,9 @@ async def btc_rates(currency: str) -> list[tuple[str, float]]:
    ]
    results = await asyncio.gather(*calls)

-    return [r for r in results if r is not None]
+    all_rates = [r for r in results if r is not None]
+
+    return apply_trimmed_mean_filter(all_rates)


 async def btc_price(currency: str) -> float:
--- a/tests/unit/test_exchange_rates.py
+++ b/tests/unit/test_exchange_rates.py
@@ -0,0 +1,125 @@
+from lnbits.utils.exchange_rates import (
+    apply_trimmed_mean_filter,
+)
+
+
+class TestApplyTrimmedMeanFilter:
+    """Unit tests for apply_trimmed_mean_filter (median-based outlier filter)."""
+
+    def test_trimmed_mean_filter_with_outliers(self):
+        """Test filtering removes outliers that deviate more than threshold"""
+        # The median of the four values is 51500 (midpoint of 51000 and 52000)
+        rates = [
+            ("Binance", 50000.0),  # ~2.9% below the median
+            ("Coinbase", 51000.0),  # ~0.97% below the median
+            ("Kraken", 52000.0),  # ~0.97% above the median
+            ("Outlier", 60000.0),  # ~16.5% above the median
+        ]
+
+        result = apply_trimmed_mean_filter(rates, threshold_percentage=0.01)
+
+        # Should remove the outliers (Binance and Outlier)
+        assert len(result) == 2
+        assert ("Outlier", 60000.0) not in result
+        assert ("Binance", 50000.0) not in result
+        assert ("Coinbase", 51000.0) in result
+        assert ("Kraken", 52000.0) in result
+
+    def test_trimmed_mean_filter_no_outliers(self):
+        """Test filtering keeps all rates when none are outliers"""
+        rates = [
+            ("Binance", 50000.0),
+            ("Coinbase", 50100.0),
+            ("Kraken", 50200.0),
+        ]
+
+        result = apply_trimmed_mean_filter(rates, threshold_percentage=0.01)
+
+        # Should keep all rates
+        assert len(result) == 3
+        assert result == rates
+
+    def test_trimmed_mean_filter_insufficient_data(self):
+        """Test filtering returns original data when less than 3 rates"""
+        rates = [
+            ("Binance", 50000.0),
+            ("Coinbase", 51000.0),
+        ]
+
+        result = apply_trimmed_mean_filter(rates, threshold_percentage=0.01)
+
+        # Should return original rates unchanged, even though they differ by 2%
+        assert result == rates
+
+    def test_trimmed_mean_filter_single_rate(self):
+        """Test filtering with single rate"""
+        rates = [("Binance", 50000.0)]
+
+        result = apply_trimmed_mean_filter(rates, threshold_percentage=0.01)
+
+        # Should return original rate unchanged
+        assert result == rates
+
+    def test_trimmed_mean_filter_empty_list(self):
+        """Test filtering with empty list"""
+        rates: list[tuple[str, float]] = []
+
+        result = apply_trimmed_mean_filter(rates, threshold_percentage=0.01)
+
+        # Should return empty list
+        assert result == []
+
+    def test_trimmed_mean_filter_too_many_outliers(self):
+        """Test fallback to median when filtering removes too many values"""
+        rates = [
+            ("Provider1", 50000.0),
+            ("Provider2", 60000.0),  # 20% higher
+            ("Provider3", 40000.0),  # 20% lower
+        ]
+
+        result = apply_trimmed_mean_filter(rates, threshold_percentage=0.01)
+
+        # Only Provider1 survives the filter, which is fewer than 2 rates, so
+        # the function falls back to the rate closest to the median (Provider1)
+        assert len(result) == 1
+        assert result[0] == ("Provider1", 50000.0)
+
+    def test_trimmed_mean_filter_different_thresholds(self):
+        """Test filtering with different threshold percentages"""
+        rates = [
+            ("Binance", 50000.0),
+            ("Coinbase", 51000.0),
+            ("Kraken", 53000.0),
+            ("Outlier", 55000.0),
+        ]
+
+        # The median of the values is 52000 (midpoint of 51000 and 53000).
+        # 1% either side of the median is 51480 and 52520, so all four rates
+        # are filtered out. With fewer than 2 survivors the function falls
+        # back to the single rate closest to the median; Coinbase and Kraken
+        # are both 1000 away and the first one in the list (Coinbase) wins.
+        result_1pct = apply_trimmed_mean_filter(rates, threshold_percentage=0.01)
+
+        assert len(result_1pct) == 1
+        assert result_1pct == [("Coinbase", 51000.0)]
+        assert ("Binance", 50000.0) not in result_1pct
+        assert ("Coinbase", 51000.0) in result_1pct
+        assert ("Kraken", 53000.0) not in result_1pct
+        assert ("Outlier", 55000.0) not in result_1pct
+
+        # With 5% threshold the band is 49400 to 54600: only Outlier (~5.8%
+        # above the median) is removed and the other three are kept
+        result_5pct = apply_trimmed_mean_filter(rates, threshold_percentage=0.05)
+        assert len(result_5pct) == 3
+        assert ("Binance", 50000.0) in result_5pct
+        assert ("Coinbase", 51000.0) in result_5pct
+        assert ("Kraken", 53000.0) in result_5pct
+        assert ("Outlier", 55000.0) not in result_5pct
+
+    def test_trimmed_mean_filter_edge_case_exact_threshold(self):
+        """Test filtering with rates exactly at the threshold"""
+        # Three rates are required, otherwise the function returns early
+        # without comparing anything to the threshold. The median is 50000.
+        rates = [
+            ("Binance", 50000.0),
+            ("Kraken", 50000.0),
+            ("Coinbase", 50500.0),  # Exactly 1% higher than the median
+        ]
+
+        result = apply_trimmed_mean_filter(rates, threshold_percentage=0.01)
+
+        # Should keep the rate at exactly 1% deviation (comparison is inclusive)
+        assert len(result) == 3
+        assert result == rates
+
+        # A rate just beyond the threshold is removed
+        rates_above = [
+            ("Binance", 50000.0),
+            ("Kraken", 50000.0),
+            ("Coinbase", 50501.0),  # Slightly more than 1% higher
+        ]
+
+        result_above = apply_trimmed_mean_filter(
+            rates_above, threshold_percentage=0.01
+        )
+
+        assert result_above == [("Binance", 50000.0), ("Kraken", 50000.0)]