From eaeed25b2397a2ef4415d05b0d0cd041d2fe97ee Mon Sep 17 00:00:00 2001
From: Mansour Behabadi <mbehabadi@mathspace.com.au>
Date: Thu, 31 Oct 2024 10:31:11 +1100
Subject: [PATCH 1/4] Add schema support for split_attribute

---
 src/feafea/config_schema.json | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/feafea/config_schema.json b/src/feafea/config_schema.json
index 4383930..f8fb2ec 100644
--- a/src/feafea/config_schema.json
+++ b/src/feafea/config_schema.json
@@ -111,7 +111,12 @@
             {
               "required": [
                 "variants"
-              ]
+              ],
+              "not": {
+                "required": [
+                  "split_attribute"
+                ]
+              }
             },
             {
               "required": [
@@ -173,6 +178,10 @@
                 ]
               }
             },
+            "split_attribute": {
+              "type": "string",
+              "pattern": "^[a-zA-Z_][a-zA-Z0-9_]*$"
+            },
             "split_group": {
               "type": "string",
               "pattern": "^[a-zA-Z_][a-zA-Z0-9_]*$"

From 52847c532196b11b43640f4873dde7e55ea2188a Mon Sep 17 00:00:00 2001
From: Mansour Behabadi <mbehabadi@mathspace.com.au>
Date: Fri, 1 Nov 2024 17:07:45 +1100
Subject: [PATCH 2/4] Disallow insplit function and splits rule at the same
 time

---
 src/feafea/__init__.py | 23 +++++++++++++++++------
 tests/test_configs.py  |  1 +
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/feafea/__init__.py b/src/feafea/__init__.py
index daf8fc9..0267e96 100644
--- a/src/feafea/__init__.py
+++ b/src/feafea/__init__.py
@@ -365,7 +365,7 @@ class _CompiledFilter:
     The compiled filter function.
     """
 
-    __slots__ = ("eval", "flag_refs", "rule_refs")
+    __slots__ = ("eval", "flag_refs", "rule_refs", "uses_insplit")
 
     # The compiled filter function.
     eval: Callable[[TargetID, Attributes], bool]
@@ -377,6 +377,9 @@ class _CompiledFilter:
     # The set of rule names/splits referenced in the filter. This is used to ensure
     # no circular references to rules occur.
     rule_refs: set[tuple[str, str | None]]
+    # Whether the filter uses the insplit function. This is used to disallow the
+    # simultaneous use of insplit and split rules in the same rule.
+    uses_insplit: bool
 
     # Filter references are not needed since filter references in a filter are
     # inlined during compilation and any circular references checked.
@@ -528,7 +531,7 @@ def _inline(
 
         return (f0, f1, f2)
 
-    def _pythonize(self, f: _ParsedFilter) -> str:
+    def _pythonize(self, f: _ParsedFilter, visited_insplit: Callable[[], None]) -> str:
         """
         Convert the parse tree into a python expression.
 
@@ -540,17 +543,18 @@ def _pythonize(self, f: _ParsedFilter) -> str:
             case "AND" | "OR":
                 # We know for certain that lhs and rhs are booleans. AND/ORing them
                 # together will always yield a boolean and will never raise an exception.
-                return f" {self._py_op_map[op]} ".join(f"({self._pythonize(a)})" for a in arg1)
+                return f" {self._py_op_map[op]} ".join(f"({self._pythonize(a, visited_insplit)})" for a in arg1)
             case "NOT":
                 # We know for certain that the argument is a boolean. Negating it
                 # will always yield a boolean and will never raise an exception.
-                return f"(not ({self._pythonize(arg1)}))"
+                return f"(not ({self._pythonize(arg1, visited_insplit)}))"
             case "EQ" | "NE" | "LE" | "GE" | "GT" | "LT" | "IN" | "NOTIN" | "INTERSECTS":
                 sym_type, sym_name, *sym_args = arg1
                 match sym_type:
                     case "FUNC":
                         func_name, func_args = sym_name, sym_args[0]
                         if func_name == "insplit":
+                            visited_insplit()
                             # We have two types of attributes we need to handle,
                             # sets and single values. All values and set values
                             # must be handled. For single values, we convert
@@ -626,13 +630,19 @@ def compile(self, name: str, flags: dict[str, CompiledFlag], rules: dict[str, _C
         """
         Compile the filter into a callable form.
         """
+        comp_filter = _CompiledFilter()
+
         seen = set()
         sets: list[set] = []
         flag_refs: dict[str, type] = {}
         rule_refs: set[tuple[str, str | None]] = set()
         inlined = self._inline(self._filters[name], seen, sets, flag_refs, rule_refs)
         optimized = self._optimize(inlined)
-        py_expr = self._pythonize(optimized)
+
+        def visited_insplit():
+            comp_filter.uses_insplit = True
+
+        py_expr = self._pythonize(optimized, visited_insplit)
         code_str = f"""
 def a(target_id, attributes):
     return {py_expr}
@@ -646,7 +656,6 @@ def a(target_id, attributes):
             "_hash_percent": _hash_percent,
         }
         exec(py_code, _globals, _locals)
-        comp_filter = _CompiledFilter()
         comp_filter.eval = _locals["a"]
         comp_filter.flag_refs = flag_refs
         comp_filter.rule_refs = rule_refs
@@ -912,6 +921,8 @@ def from_dict(c: DictConfig) -> CompiledConfig:
 
             compiled_rule.split_names = set()
             if "splits" in r:
+                if hasattr(compiled_rule, "_compiled_filter") and compiled_rule._compiled_filter.uses_insplit:
+                    raise ValueError("insplit filter and split rules cannot be used in the same rule")
                 compiled_rule.split_names.update(s["name"] for s in r["splits"] if "name" in s)
                 split_seed = r.get("split_group", rule_name)
                 py_common += [f"split_target_percent = _hash_percent(target_id, seed={split_seed!r})"]
diff --git a/tests/test_configs.py b/tests/test_configs.py
index b1bb88d..6727ac8 100644
--- a/tests/test_configs.py
+++ b/tests/test_configs.py
@@ -477,6 +477,7 @@ def test_invalid_rules_def(self):
             ({"variants": {"a": 2}, "schedule": {"start": "2024-10-10 10:10:10", "end": "2024-10-10 20:20:20"}}, ValueError, "Timezone missing"),
             ({"variants": {"a": 2}, "schedule": {"start": "2024-10-10 20:10:10Z", "end": "2024-10-10 10:20:20Z"}}, ValueError, "start must be before end"),
             ({"variants": {"a": 2}, "schedule": {"start": "2024-10-10 10:10:10Z", "end": "2024-10-10 10:10:10Z"}}, ValueError, "start must be before end"),
+            ({"filter": "insplit(attr:abc, 0, 10)", "splits": [{"percentage": 10}]}, ValueError, "insplit filter and split rules cannot be used in the same rule"),
             (
                 {"variants": {"a": 2}, "schedule": {"start": "2024-10-10 10:10:00Z", "end": "2024-10-10 10:20:00Z", "ramp_up": "8m", "ramp_down": "8m"}},
                 ValueError,

From e25b34d03f162e10cd7469ad49246ad4f4de6e48 Mon Sep 17 00:00:00 2001
From: Mansour Behabadi <mbehabadi@mathspace.com.au>
Date: Fri, 1 Nov 2024 22:16:01 +1100
Subject: [PATCH 3/4] Implement split_attribute

---
 src/feafea/__init__.py | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/src/feafea/__init__.py b/src/feafea/__init__.py
index 0267e96..870a60a 100644
--- a/src/feafea/__init__.py
+++ b/src/feafea/__init__.py
@@ -32,7 +32,7 @@
     _feafea_checksum = md5(f.read()).hexdigest()
 
 
-def _hash_percent(s: str, seed: str = "") -> float:
+def _hash_percent(s: Any, seed: str = "") -> float:
     """
     Hashes the given string and seed to a float in the range [0, 100).
 
@@ -631,6 +631,7 @@ def compile(self, name: str, flags: dict[str, CompiledFlag], rules: dict[str, _C
         Compile the filter into a callable form.
         """
         comp_filter = _CompiledFilter()
+        comp_filter.uses_insplit = False
 
         seen = set()
         sets: list[set] = []
@@ -692,7 +693,7 @@ class _CompiledFlagRule:
 
     __slots__ = ("rule_name", "eval")
     rule_name: str
-    eval: Callable[[TargetID, Attributes], Variant | tuple[Variant, str] | None]
+    eval: Callable[[TargetID, Attributes], Variant | tuple[Variant, str, list[AttributeValue]] | None]
 
 
 class CompiledFlag:
@@ -721,12 +722,13 @@ def eval(self, target_id: TargetID, attributes: Attributes) -> FlagEvaluation:
         e.flag = self.name
         e.target_id = target_id
         e.split = ""
+        e.split_attribute_values = []
         for rule in self._rules:
             v = rule.eval(target_id, attributes)
             if v is not None:
                 e.rule = rule.rule_name
                 if isinstance(v, tuple):
-                    e.variant, e.split = v
+                    e.variant, e.split, e.split_attribute_values = v
                     e.reason = "split_rule"
                 else:
                     e.variant = v
@@ -925,18 +927,23 @@ def from_dict(c: DictConfig) -> CompiledConfig:
                     raise ValueError("insplit filter and split rules cannot be used in the same rule")
                 compiled_rule.split_names.update(s["name"] for s in r["splits"] if "name" in s)
                 split_seed = r.get("split_group", rule_name)
-                py_common += [f"split_target_percent = _hash_percent(target_id, seed={split_seed!r})"]
+                split_attribute = r.get("split_attribute")
+                # Build a map of (attribute value -> percent) for each attribute value.
+                if split_attribute is None:
+                    py_common += [f"split_percents = {'{'}target_id:_hash_percent(target_id, seed={split_seed!r}){'}'}"]
+                else:
+                    py_common += [f"split_attribute = attributes[{split_attribute}] if isinstance(attributes.get({split_attribute}), set) else [attributes.get({split_attribute})]"]
+                    py_common += [f"split_percents = {'{'}(v:_hash_percent(v, seed={split_seed!r}) for v in split_attribute)"]
                 comulative_percentage_start = 0
                 comulative_percentage_end = 0
                 for split in r["splits"]:
                     comulative_percentage_end += split["percentage"]
-                    py_common += [
-                        f"#FIRULE if split_target_percent >= {comulative_percentage_start!r} and split_target_percent < {comulative_percentage_end!r}: return (True, {split.get("name", "")!r})"
-                    ]
+                    matched_attr_values = f"matched_attr_values = [v for v, p in split_percents.items() if {comulative_percentage_start!r} <= p < {comulative_percentage_end!r}]"
+                    py_common += ["#FIRULE " + matched_attr_values]
+                    py_common += [f"#FIRULE if matched_attr_values: return (True, {split.get("name", "")!r})"]
                     for flag, variant in split.get("variants", {}).items():
-                        py_flag[flag].append(
-                            f"if split_target_percent >= {comulative_percentage_start!r} and split_target_percent < {comulative_percentage_end!r}: return ({variant!r}, {split.get("name", "")!r})"
-                        )
+                        py_flag[flag].append(matched_attr_values)
+                        py_flag[flag].append(f"if matched_attr_values: return ({variant!r}, {split.get("name", "")!r}, matched_attr_values)")
                     comulative_percentage_start += split["percentage"]
 
             # Compile the flag independent rule.
@@ -964,9 +971,9 @@ def from_dict(c: DictConfig) -> CompiledConfig:
             for flag, py_lines in py_flag.items():
                 if not py_lines:
                     assert False, "unreachable"  # pragma: no cover
-                # Returns variant | (variant, split_name) | None
+                # Returns variant | (variant, split_name, split_attr_values) | None
                 # - variant: The variant evaluated for non-split rule
-                # - (variant, split_name): The variant evaluated for split rule
+                # - (variant, split_name, split_attr_values): The variant evaluated for split rule
                 # - None: The rule does not apply
                 lines = ["def a(target_id, attributes):"]
                 lines += list(f"  {line}" for line in (py_common + py_lines))
@@ -1057,6 +1064,7 @@ class FlagEvaluation:
         "rule",
         "reason",
         "split",
+        "split_attribute_values",
         "timestamp",
     )
     flag: str
@@ -1066,6 +1074,7 @@ class FlagEvaluation:
     rule: str | Literal[""]
     reason: Literal["split_rule", "const_rule", "default"]
     split: str | Literal[""]
+    split_attribute_values: list[AttributeValue]
     timestamp: float
 
 

From 745adade13db13ca68f857c9c2d631e722ceed14 Mon Sep 17 00:00:00 2001
From: Mansour Behabadi <mbehabadi@mathspace.com.au>
Date: Sat, 2 Nov 2024 00:23:05 +1100
Subject: [PATCH 4/4] wip

---
 src/feafea/__init__.py |  4 ++--
 tests/test_configs.py  | 49 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/src/feafea/__init__.py b/src/feafea/__init__.py
index 870a60a..8360867 100644
--- a/src/feafea/__init__.py
+++ b/src/feafea/__init__.py
@@ -932,8 +932,8 @@ def from_dict(c: DictConfig) -> CompiledConfig:
                 if split_attribute is None:
                     py_common += [f"split_percents = {'{'}target_id:_hash_percent(target_id, seed={split_seed!r}){'}'}"]
                 else:
-                    py_common += [f"split_attribute = attributes[{split_attribute}] if isinstance(attributes.get({split_attribute}), set) else [attributes.get({split_attribute})]"]
-                    py_common += [f"split_percents = {'{'}(v:_hash_percent(v, seed={split_seed!r}) for v in split_attribute)"]
+                    py_common += [f"split_attribute = attributes[{split_attribute!r}] if isinstance(attributes.get({split_attribute!r}), set) else [attributes.get({split_attribute!r})]"]
+                    py_common += [f"split_percents = {'{'}v:_hash_percent(v, seed={split_seed!r}) for v in split_attribute{'}'}"]
                 comulative_percentage_start = 0
                 comulative_percentage_end = 0
                 for split in r["splits"]:
diff --git a/tests/test_configs.py b/tests/test_configs.py
index 6727ac8..80a4665 100644
--- a/tests/test_configs.py
+++ b/tests/test_configs.py
@@ -337,6 +337,55 @@ def test_valid_split_probability(self):
         self.assertAlmostEqual(variant_count[3] / 100000, 0.6, delta=0.002)
         self.assertAlmostEqual(variant_count[1] / 100000, 0.3, delta=0.002)
 
+    def test_valid_split_probability_on_single_value_attribute(self):
+        config = {
+            "flags": {
+                "a": {"variants": [1, 2, 3], "default": 1},
+            },
+            "rules": {
+                "r1": {
+                    "split_attribute": "school_id",
+                    "splits": [
+                        {"percentage": 10, "variants": {"a": 2}},
+                        {"percentage": 60, "variants": {"a": 3}},
+                    ],
+                },
+            },
+        }
+        variant_count = {1: 0, 2: 0, 3: 0}
+        cc = CompiledConfig.from_dict(config)
+        for id in range(100000):
+            v = cc.flags["a"].eval("", {"school_id": str(id)}).variant
+            assert isinstance(v, int)
+            variant_count[v] += 1
+        self.assertAlmostEqual(variant_count[2] / 100000, 0.1, delta=0.002)
+        self.assertAlmostEqual(variant_count[3] / 100000, 0.6, delta=0.002)
+        self.assertAlmostEqual(variant_count[1] / 100000, 0.3, delta=0.002)
+
+    def test_valid_split_probability_on_multi_value_attribute(self):
+        config = {
+            "flags": {
+                "a": {"variants": [1, 2, 3], "default": 1},
+            },
+            "rules": {
+                "r1": {
+                    "split_attribute": "school_ids",
+                    "splits": [
+                        {"percentage": 10, "variants": {"a": 2}},
+                        {"percentage": 60, "variants": {"a": 3}},
+                    ],
+                },
+            },
+        }
+        cc = CompiledConfig.from_dict(config)
+        for id in range(100000):
+            v = cc.flags["a"].eval("", {"school_ids": str(id)}).variant
+            assert isinstance(v, int)
+            variant_count[v] += 1
+        self.assertAlmostEqual(variant_count[2] / 100000, 0.1, delta=0.002)
+        self.assertAlmostEqual(variant_count[3] / 100000, 0.6, delta=0.002)
+        self.assertAlmostEqual(variant_count[1] / 100000, 0.3, delta=0.002)
+
     def test_valid_schedule(self):
         cases = [
             # schedule, ts, expected