fix(featureflag): cross-tier hash parity + variant only when enabled (MUL-3615)

Two must-fix issues from the PR review on #4496: 1. TS hash had a trailing zero separator that Go did not emit, so the same (key, identifier) bucketed differently on the two tiers. The "user lands in the same bucket on server and client" promise was broken. For example billing_new_invoice/user-42 was bucket 97 in Go and bucket 11 in TS. Fix: TS fnv1a now emits the zero separator BETWEEN parts only, never after the last one, matching Go's hash.Write byte stream exactly. Verified by parallel golden tests on both sides that pin five (key, identifier) -> bucket triples; if either side drifts both tests fail and one must be brought back in sync. 2. StaticProvider returned `Rule.Variant` regardless of whether the rule evaluated to enabled=true. A 0%-rollout user, a deny-listed user, or a default-off user would see variant="experiment-v2", so callers branching on Variant() would route control users into the experiment arm. Fix: Rule.Variant is now the ON-variant only. When the rule evaluates to enabled=false the Decision's variant is the canonical "off", regardless of what Rule.Variant says. Documented as a behavior contract in the Rule godoc / JSDoc and covered by regression tests on both sides. Tests: - go test -race ./pkg/featureflag/... : all green (1.58s). - pnpm --filter @multica/core test : 661/661 (3 new). - pnpm --filter @multica/core typecheck: clean. Co-authored-by: multica-agent <github@multica.ai>
2026-07-05 13:29:44 +02:00 · 2026-06-24 13:23:46 +08:00
parent 1fee14bd2c
commit b4518dc95e
9 changed files with 172 additions and 19 deletions
--- a/docs/feature-flags.md
+++ b/docs/feature-flags.md
@@ -102,6 +102,8 @@ default:
 }
 ```

+`Rule.Variant` is the **on-variant**: it is only returned when the rule evaluates to enabled=true (allow hit, percent hit, default-on). When the rule evaluates to disabled (deny hit, percent miss, default-off) the Service returns `"off"` so callers branching on `Variant()` cannot route control users into the experiment arm. This is exercised by `TestStaticProviderVariantOnlyWhenEnabled` and is the same on the TS side.
+
 The Service is nil-safe and missing-key-safe: `(*Service)(nil).IsEnabled(ctx, "any", true)` returns `true`. Business code never needs to guard against a missing flag.

 ---
--- a/packages/core/feature-flags/hash.test.ts
+++ b/packages/core/feature-flags/hash.test.ts
@@ -22,6 +22,24 @@ describe("feature-flags hash", () => {
    expect(bucketFor("ab", "c")).not.toBe(bucketFor("a", "bc"));
  });

+  // Pinned (key, identifier) -> bucket values that MUST agree with the
+  // Go-side server/pkg/featureflag/hash_test.go::TestPercentBucketCrossLanguageGolden.
+  // The shared golden table is the single source of truth for "same user,
+  // same bucket" across backend and frontend; if either side drifts, both
+  // tests fail and one must be brought back in sync.
+  it("cross-language golden: bucket values match the Go side exactly", () => {
+    const cases: ReadonlyArray<[string, string, number]> = [
+      ["billing_new_invoice", "user-42", 97],
+      ["feature_a", "user-1", 50],
+      ["checkout_algo", "u-7f8a", 11],
+      ["ws_rollout", "workspace-1", 62],
+      ["empty_id_flag", "", 83],
+    ];
+    for (const [key, id, want] of cases) {
+      expect(bucketFor(key, id)).toBe(want);
+    }
+  });
+
  it("inPercent clamps boundary values", () => {
    expect(inPercent("any", "any", 0)).toBe(false);
    expect(inPercent("any", "any", -10)).toBe(false);
--- a/packages/core/feature-flags/hash.ts
+++ b/packages/core/feature-flags/hash.ts
@@ -3,9 +3,10 @@
 *
 * The same (key, identifier) pair MUST always produce the same bucket;
 * otherwise users would flip in and out of experiments across requests. The
- * algorithm matches the Go-side server/pkg/featureflag/hash.go so a flag
- * evaluated on the frontend and on the backend lands in the same bucket for
- * the same user.
+ * algorithm matches the Go-side server/pkg/featureflag/hash.go byte-for-byte
+ * so a flag evaluated on the frontend and on the backend lands in the same
+ * bucket for the same user. Cross-language equality is exercised by golden
+ * tests on both sides; see hash.test.ts and hash_test.go.
 *
 * FNV-1a is used because it is cheap, dependency-free, and well-distributed
 * enough for sub-100 bucketing. It is NOT cryptographic; do not use it for
@@ -15,17 +16,22 @@ function fnv1a(parts: ReadonlyArray<string>): number {
  // 32-bit FNV-1a: offset basis 0x811c9dc5, prime 0x01000193.
  let hash = 0x811c9dc5;
  for (let p = 0; p < parts.length; p++) {
+    if (p > 0) {
+      // Zero-byte separator BETWEEN parts (not after the last one). This
+      // matches what the Go side writes via h.Write([]byte{0}) between
+      // key and identifier and is what prevents ("ab", "c") and
+      // ("a", "bc") from colliding. A trailing separator would diverge
+      // from Go and silently break cross-tier bucket parity.
+      hash ^= 0;
+      hash = Math.imul(hash, 0x01000193);
+    }
    const s = parts[p]!;
    for (let i = 0; i < s.length; i++) {
      hash ^= s.charCodeAt(i);
-      // Multiply by FNV prime mod 2^32. Using Math.imul keeps the result
-      // in a 32-bit integer without slipping into float territory.
+      // Multiply by FNV prime mod 2^32. Math.imul keeps the result in a
+      // 32-bit integer without slipping into float territory.
      hash = Math.imul(hash, 0x01000193);
    }
-    // Zero separator between parts so ("ab", "c") and ("a", "bc") cannot
-    // hash to the same value.
-    hash ^= 0;
-    hash = Math.imul(hash, 0x01000193);
  }
  // Force unsigned 32-bit before the modulo to match Go's uint32 arithmetic.
  return hash >>> 0;
@@ -33,7 +39,8 @@ function fnv1a(parts: ReadonlyArray<string>): number {

 /**
 * bucketFor returns a deterministic bucket in [0, 100) for the supplied
- * (key, identifier) pair.
+ * (key, identifier) pair. Identical to the Go bucketFor in
+ * server/pkg/featureflag/hash.go.
 */
 export function bucketFor(key: string, identifier: string): number {
  return fnv1a([key, identifier]) % 100;
--- a/packages/core/feature-flags/static-provider.test.ts
+++ b/packages/core/feature-flags/static-provider.test.ts
@@ -54,6 +54,36 @@ describe("StaticProvider", () => {
    expect(d?.enabled).toBe(true);
  });

+  // Regression test for the MUL-3615 review: when a rule sets `variant`
+  // but the rule itself evaluates to enabled=false (deny match, percent
+  // miss, default-off), the decision MUST report variant="off", never
+  // the on-variant. Otherwise a switch on `useVariant()` would route
+  // non-rolled-in users into the experiment arm.
+  it("variant: returns 'off' when the rule evaluates to disabled", () => {
+    const sp = new StaticProvider({
+      exp: {
+        default: false,
+        variant: "experiment-v2",
+        deny: ["banned-user"],
+        percent: { percent: 0 },
+      },
+    });
+    for (const userId of ["banned-user", "random-user", ""]) {
+      const d = sp.lookup("exp", { userId });
+      expect(d?.enabled).toBe(false);
+      expect(d?.variant).toBe("off");
+    }
+  });
+
+  it("variant: returns the on-variant when the rule evaluates to enabled", () => {
+    const sp = new StaticProvider({
+      exp: { default: false, variant: "experiment-v2", allow: ["rolled-in"] },
+    });
+    const d = sp.lookup("exp", { userId: "rolled-in" });
+    expect(d?.enabled).toBe(true);
+    expect(d?.variant).toBe("experiment-v2");
+  });
+
  it("loadRules replaces, not merges, the rule map", () => {
    const sp = new StaticProvider({ old: { default: true } });
    sp.loadRules({ fresh: { default: true } });
--- a/packages/core/feature-flags/static-provider.ts
+++ b/packages/core/feature-flags/static-provider.ts
@@ -80,10 +80,18 @@ function decisionFromRule(
  enabled: boolean,
  reason: Decision["reason"],
 ): Decision {
+  // Variant policy: rule.variant is the ON-variant. When the rule
+  // evaluates to false we return the canonical "off" so a caller
+  // branching on the variant cannot accidentally enter the experiment
+  // arm for a user that did not roll in.
+  let variant = boolToVariant(enabled);
+  if (enabled && rule.variant && rule.variant.length > 0) {
+    variant = rule.variant;
+  }
  return {
    key,
    enabled,
-    variant: rule.variant && rule.variant.length > 0 ? rule.variant : boolToVariant(enabled),
+    variant,
    reason,
    source: "static",
  };
--- a/packages/core/feature-flags/types.ts
+++ b/packages/core/feature-flags/types.ts
@@ -78,8 +78,11 @@ export interface Rule {
  /** Value returned when no targeting rule matches. Defaults to false. */
  default?: boolean;
  /**
-   * Optional variant identifier returned alongside the boolean. Use for
-   * multi-arm flags ("control" / "experiment-v2" / "experiment-v3").
+   * Variant identifier returned WHEN the rule evaluates to enabled=true.
+   * Use for multi-arm experiments (e.g. "experiment-v2"). When the rule
+   * evaluates to enabled=false the Decision's variant is always "off",
+   * so callers branching on `Variant()` cannot accidentally enter the
+   * experiment arm for users that did not roll in.
   */
  variant?: string;
  /** Identifier values that force the flag ON. */
--- a/server/pkg/featureflag/eval_context_test.go
+++ b/server/pkg/featureflag/eval_context_test.go
@@ -101,3 +101,33 @@ func TestPercentBucketSeparator(t *testing.T) {
 		t.Fatalf("hash separator failed: bucketFor('ab','c') == bucketFor('a','bc') == %d", left)
 	}
 }
+
+// TestPercentBucketCrossLanguageGolden pins concrete (key, identifier) ->
+// bucket values that the Go side MUST agree on with the TS side. The same
+// values are duplicated in packages/core/feature-flags/hash.test.ts; if
+// either side drifts, both tests fail and one must be brought back in
+// sync. This is the single source of truth for "same user, same bucket"
+// across the backend and the frontend.
+func TestPercentBucketCrossLanguageGolden(t *testing.T) {
+	t.Parallel()
+	cases := []struct {
+		key, id string
+		want    int
+	}{
+		{"billing_new_invoice", "user-42", 97},
+		{"feature_a", "user-1", 50},
+		{"checkout_algo", "u-7f8a", 11},
+		{"ws_rollout", "workspace-1", 62},
+		{"empty_id_flag", "", 83},
+	}
+	for _, tc := range cases {
+		got := bucketFor(tc.key, tc.id)
+		if got != tc.want {
+			t.Fatalf(
+				"cross-language golden mismatch: bucketFor(%q, %q) = %d, want %d. "+
+					"If you changed the hash you MUST also update hash.test.ts.",
+				tc.key, tc.id, got, tc.want,
+			)
+		}
+	}
+}
--- a/server/pkg/featureflag/static_provider.go
+++ b/server/pkg/featureflag/static_provider.go
@@ -29,9 +29,17 @@ type Rule struct {
 	// Default is the value returned when no targeting rule matches.
 	Default bool

-	// Variant overrides the boolean "on" / "off" projection with an
-	// arbitrary variant identifier. Use this for multi-arm flags.
-	// When Variant != "", the boolean Default still controls Enabled.
+	// Variant is the variant identifier returned WHEN the rule evaluates
+	// to enabled=true. For multi-arm experiments, set Variant to the
+	// experiment-arm identifier (e.g. "experiment-v2"); for plain on/off
+	// flags leave it empty.
+	//
+	// When the rule evaluates to enabled=false (default-off, deny hit,
+	// percent miss, ...) the resulting Decision's Variant is always the
+	// canonical "off". This is deliberate: a caller that branches on
+	// Variant("checkout_algo", "control") would otherwise be routed into
+	// the experiment arm even though the user did not roll into the
+	// experiment cohort.
 	Variant string

 	// Allow is the set of identifier values that force the flag ON.
@@ -178,9 +186,13 @@ func evaluateRule(key string, rule Rule, ec EvalContext) Decision {
 }

 func decisionFromRule(key string, rule Rule, enabled bool, reason Reason) Decision {
-	variant := rule.Variant
-	if variant == "" {
-		variant = boolToVariant(enabled)
+	// Variant policy: rule.Variant is the ON-variant. When the rule
+	// evaluates to false we return the canonical "off" so a caller
+	// branching on Variant() cannot accidentally enter the experiment
+	// arm for a user that did not roll in.
+	variant := boolToVariant(enabled)
+	if enabled && rule.Variant != "" {
+		variant = rule.Variant
 	}
 	return Decision{
 		Key:     key,
--- a/server/pkg/featureflag/static_provider_test.go
+++ b/server/pkg/featureflag/static_provider_test.go
@@ -189,6 +189,49 @@ func TestStaticProviderCustomAttribute(t *testing.T) {
 	}
 }

+// TestStaticProviderVariantOnlyWhenEnabled is the regression test for the
+// review feedback from MUL-3615: a Rule with Variant="experiment-v2" but
+// enabled=false (deny match, percent miss, default-off) MUST surface
+// Variant="off", not the on-variant. Otherwise a caller branching on
+// Variant() would route control users into the experiment arm.
+func TestStaticProviderVariantOnlyWhenEnabled(t *testing.T) {
+	t.Parallel()
+	sp := NewStaticProvider()
+	sp.Set("exp", Rule{
+		Default: false,
+		Variant: "experiment-v2",
+		Deny:    []string{"banned-user"},
+		Percent: &PercentRollout{Percent: 0}, // 0% rollout: nobody is in.
+	})
+
+	for _, userID := range []string{"banned-user", "random-user", ""} {
+		ctx := WithEvalContext(context.Background(), EvalContext{UserID: userID})
+		d, _ := sp.Lookup(ctx, "exp")
+		if d.Enabled {
+			t.Fatalf("user=%q must be disabled at 0%% rollout, got %+v", userID, d)
+		}
+		if d.Variant != "off" {
+			t.Fatalf("user=%q got Variant=%q, want %q — on-variant must not leak when disabled",
+				userID, d.Variant, "off")
+		}
+	}
+}
+
+func TestStaticProviderVariantWhenEnabled(t *testing.T) {
+	t.Parallel()
+	sp := NewStaticProvider()
+	sp.Set("exp", Rule{
+		Default: false,
+		Variant: "experiment-v2",
+		Allow:   []string{"rolled-in-user"},
+	})
+	ctx := WithEvalContext(context.Background(), EvalContext{UserID: "rolled-in-user"})
+	d, _ := sp.Lookup(ctx, "exp")
+	if !d.Enabled || d.Variant != "experiment-v2" {
+		t.Fatalf("enabled user should see the on-variant, got %+v", d)
+	}
+}
+
 // randomUserID returns a stable user identifier derived from i. It exists
 // so the rollout distribution test is deterministic across runs (no rand).
 func randomUserID(i int) string {