
Commit 4b3257e

committed
big rework of evaluation
1 parent a9c9d8b commit 4b3257e

File tree

6 files changed: +247 -255 lines changed


examples/rl/bandits/clever_evaluator.py

Lines changed: 0 additions & 136 deletions
This file was deleted.
Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
from examples.rl.program_evaluator import ProgramEvaluator
import numpy as np

from typing import List, Optional, Tuple, TypeVar, Generic


T = TypeVar("T", covariant=True)


class TopkManager(Generic[T]):
    # Manages a pool of candidate programs: a new challenger is added, evaluated
    # under a budget, and the statistically worst candidate is ejected using
    # UCB-style confidence bounds (exploration constant c). An ejection is forced
    # once the pool holds at least k candidates and the budget is spent.
    def __init__(
        self,
        evaluator: ProgramEvaluator,
        c: float = 0.7,
        k: int = 2,
    ) -> None:
        self.evaluator = evaluator
        self.candidates: List[T] = []
        self.k = k
        self.c = c

    def num_candidates(self) -> int:
        return len(self.candidates)

    def challenge_with(
        self,
        new_candidate: T,
        max_budget: int = 100,
        prior_experience: Optional[List[float]] = None,
    ) -> Tuple[Optional[T], int]:
        """
        Returns the ejected candidate (if any) and the number of calls made to the evaluator.
        """
        # Add the new program along with any returns already collected for it.
        self.evaluator.add_returns(new_candidate, prior_experience or [])
        self.candidates.append(new_candidate)
        ejected_candidate, budget_used = self.__run_until_ejection__(max_budget)
        if ejected_candidate is not None:
            self.__eject__(ejected_candidate)
        return ejected_candidate, budget_used

    def get_best_stats(self) -> Tuple[T, float, float, float, float]:
        best_arm = np.argmax([self.evaluator.mean_return(p) for p in self.candidates])
        candidate = self.candidates[best_arm]
        n = self.evaluator.samples(candidate)
        if n == 0:
            return candidate, float("nan"), float("inf"), -float("inf"), float("inf")
        rew = self.evaluator.returns(candidate)
        mean_return = np.mean(rew)
        return (
            candidate,
            mean_return,
            n,
            min(rew),
            max(rew),
        )

    def run_at_least(self, min_budget: int, min_score: float = -float("inf")) -> int:
        # Keep evaluating the current best candidate until it has at least
        # min_budget samples or its mean return drops below min_score.
        best_arm = np.argmax([self.evaluator.mean_return(p) for p in self.candidates])
        candidate = self.candidates[best_arm]
        initial: int = self.evaluator.samples(candidate)
        budget_used: int = 0
        while (
            initial + budget_used < min_budget
            and self.evaluator.mean_return(candidate) >= min_score
        ):
            budget_used += 1
            has_no_error = self.evaluator.eval(candidate)
            if not has_no_error:
                break
        return budget_used

    def __run_until_ejection__(self, max_budget: int) -> Tuple[Optional[T], int]:
        """
        Returns the candidate to eject (if any) and the number of evaluations spent.
        """
        budget_used: int = 0
        while self.__get_candidate_to_eject__() is None and budget_used < max_budget:
            # Evaluate the candidate with the fewest samples next.
            index: int = np.argmin([self.evaluator.samples(p) for p in self.candidates])
            candidate: T = self.candidates[index]
            has_no_error = self.evaluator.eval(candidate)
            if not has_no_error:
                # A candidate that fails evaluation is ejected immediately.
                return candidate, budget_used
            budget_used += 1
        # Once the budget is spent, force an ejection only if the pool is full.
        return self.__get_candidate_to_eject__(
            len(self.candidates) >= self.k
        ), budget_used

    def __get_candidate_to_eject__(self, force: bool = False) -> Optional[T]:
        if len(self.candidates) == 1:
            return None
        mean_returns = [self.evaluator.mean_return(p) for p in self.candidates]
        worst_arm = np.argmin(mean_returns)
        worst = self.candidates[worst_arm]
        if force:
            return worst
        best_arm = np.argmax(mean_returns)
        best = self.candidates[best_arm]

        # Eject the worst candidate only once the best candidate's lower confidence
        # bound is above the worst candidate's upper confidence bound.
        if mean_returns[best_arm] - self.uncertainty(best) >= mean_returns[
            worst_arm
        ] + self.uncertainty(worst):
            return worst
        return None

    def uncertainty(self, candidate: T) -> float:
        n = self.evaluator.samples(candidate)
        if n == 0:
            return float("inf")
        return self.c * np.sqrt(
            np.log(sum(self.evaluator.samples(p) for p in self.candidates)) / n
        )

    def __eject__(self, candidate: T):
        self.evaluator.delete_data(candidate)
        self.candidates.remove(candidate)
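
For intuition, the ejection test in __get_candidate_to_eject__ compares UCB-style confidence intervals around each candidate's mean return: the worst candidate is dropped only once the best candidate's lower bound clears the worst candidate's upper bound. A small standalone sketch of that comparison (the sample counts and mean returns below are made-up numbers for illustration, not values from this commit):

import numpy as np

c = 0.7                                       # same default exploration constant as TopkManager
samples = {"best": 20, "worst": 12}           # illustrative per-candidate evaluation counts
means = {"best": 195.0, "worst": 140.0}       # illustrative per-candidate mean returns
total = sum(samples.values())

def uncertainty(n: int) -> float:
    # c * sqrt(log(total samples) / n), mirroring TopkManager.uncertainty
    return c * np.sqrt(np.log(total) / n)

lower_best = means["best"] - uncertainty(samples["best"])     # ~194.7
upper_worst = means["worst"] + uncertainty(samples["worst"])  # ~140.4
eject_worst = lower_best >= upper_worst                       # True: intervals no longer overlap
print(lower_best, upper_worst, eject_worst)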

examples/rl/optim/constant_optimizer.py

Lines changed: 5 additions & 7 deletions
@@ -14,27 +14,25 @@ def __init__(self, seed: Optional[int] = None) -> None:
         self.c = 0.7
         self.min_budget_per_arm = 15
         self.best_return = 0
-        self._rng = np.random.default_rng(seed)
+        # self._rng = np.random.default_rng(seed)
 
     def optimize(
-        self,
-        eval: Callable[[], float],
-        constants: List[Constant],
+        self, eval: Callable[[], float], constants: List[Constant], **kwargs
     ) -> Tuple[List[Tile], List[List[float]]]:
         self.budget_used = 0
         self._constants = constants
         tiles = [tile_split(-np.inf, np.inf, splits=4) for _ in constants]
         self._eval = eval
         self._can_hope_to_beat_best = True
-        return self._optimize_tiles_(constants, tiles)
+        return self._optimize_tiles_(constants, tiles, **kwargs)
 
     def _pick_values(self) -> List[int]:
         arms = []
         for index, bandit in enumerate(self._bandits):
             arm = bandit.choose_arm_ucb()
             arms.append(arm)
             self._constants[index].assign(
-                self._tiles_list[index][arm].map(self._rng.uniform(0, 1))
+                self._tiles_list[index][arm].map(np.random.uniform(0, 1))
             )
         return arms
 
@@ -62,7 +60,7 @@ def _optimize_tiles_(
         constants: List[Constant],
         tiles_list: List[List[Tile]],
         prev_experiences=None,
-        max_total_budget=1500,
+        max_total_budget=1000,
     ) -> Tuple[List[Tile], List[List[float]]]:
         self._constants = constants
         self._tiles_list = tiles_list
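
In _pick_values, each constant has its own bandit whose choose_arm_ucb call selects the tile from which a value is then sampled uniformly. The bandit class itself is not part of this diff; as a rough idea of what a UCB1-style arm choice typically looks like, here is a hypothetical standalone sketch (the choose_arm_ucb function below is an assumption for illustration, not the repository's implementation):

import numpy as np

def choose_arm_ucb(counts, mean_rewards, c=0.7):
    # counts[i] = number of pulls of arm i, mean_rewards[i] = its average reward so far
    total = sum(counts)
    scores = []
    for n, mean in zip(counts, mean_rewards):
        if n == 0:
            return len(scores)  # play any untested arm first
        scores.append(mean + c * np.sqrt(np.log(total) / n))
    return int(np.argmax(scores))

print(choose_arm_ucb([10, 5, 3], [1.2, 1.5, 0.9]))  # -> 1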

examples/rl/program_evaluator.py

Lines changed: 83 additions & 0 deletions
@@ -0,0 +1,83 @@
from typing import Callable, List, Tuple
import gymnasium as gym
from synth.semantic.evaluator import Evaluator
from synth.syntax.program import Program
import numpy as np


def __state2env__(state: np.ndarray) -> Tuple:
    # Convert a gym observation into the tuple format expected by the DSL evaluator.
    return tuple(state.tolist())


def __adapt_action2env__(env: gym.Env, action) -> List:
    # For 1-dimensional Box action spaces, clip the scalar action into bounds and wrap it in a list.
    if isinstance(env.action_space, gym.spaces.Box):
        if len(env.action_space.shape) == 1 and env.action_space.shape[0] == 1:
            return [min(max(action, env.action_space.low[0]), env.action_space.high[0])]
    return action


class ProgramEvaluator:
    # Runs DSL programs as policies in a gym environment and caches, per program,
    # a dedicated environment instance together with the list of episode returns.
    def __init__(self, env_factory: Callable[[], gym.Env], evaluator: Evaluator):
        self.cache = {}
        self.env_factory = env_factory
        self.dsl_eval = evaluator
        self.recording = True
        self.tmp_keys = []

    def record(self, record: bool):
        # When recording is switched back on, drop the cache entries created while it was off.
        if not self.recording and record:
            for key in self.tmp_keys:
                del self.cache[key]
            self.tmp_keys.clear()
        self.recording = record

    def delete_data(self, program: Program):
        del self.cache[program.hash]

    def returns(self, program: Program) -> List[float]:
        return self.cache.get(program.hash, (0, []))[1]

    def mean_return(self, program: Program) -> float:
        r = self.returns(program)
        if len(r) == 0:
            return 0
        return sum(r) / len(r)

    def samples(self, program: Program) -> int:
        return len(self.cache.get(program.hash, (0, []))[1])

    def add_returns(self, program: Program, returns: List[float]):
        if program.hash not in self.cache:
            self.cache[program.hash] = (self.env_factory(), [])
            if not self.recording:
                self.tmp_keys.append(program.hash)
        li = self.returns(program)
        for el in returns:
            li.append(el)

    def eval(self, program: Program, n_episodes: int = 1) -> bool:
        # Roll out the program for n_episodes, appending each episode return to the cache.
        # Returns False if the program produced an invalid action or an overflow.
        if program.hash not in self.cache:
            self.cache[program.hash] = (self.env_factory(), [])
            if not self.recording:
                self.tmp_keys.append(program.hash)
        env, returns = self.cache[program.hash]
        try:
            state = None
            for _ in range(n_episodes):
                episode = []
                state = env.reset()[0]
                done = False
                while not done:
                    input = __state2env__(state)
                    action = self.dsl_eval.eval(program, input)
                    adapted_action = __adapt_action2env__(env, action)
                    if adapted_action not in env.action_space:
                        return False
                    next_state, reward, done, truncated, _ = env.step(adapted_action)
                    done |= truncated
                    episode.append(reward)
                    state = next_state
                returns.append(sum(episode))
        except OverflowError:
            return False
        return True
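
Taken together, the two new classes are presumably meant to be wired up along these lines. This is a rough usage sketch only: the CartPole environment, the dsl_evaluator instance and the candidate_programs iterable are placeholders for whatever the synthesis loop actually provides, and the import path of TopkManager is not shown in this diff.

import gymnasium as gym
# dsl_evaluator: a concrete synth.semantic.evaluator.Evaluator for the DSL (placeholder)
# candidate_programs: programs proposed by the synthesizer (placeholder)

evaluator = ProgramEvaluator(lambda: gym.make("CartPole-v1"), dsl_evaluator)
manager = TopkManager(evaluator, c=0.7, k=2)

for program in candidate_programs:
    # Challenge the current pool; the return value is the candidate that was ejected
    # (possibly the challenger itself) and the number of evaluation episodes spent.
    ejected, cost = manager.challenge_with(program, max_budget=100)

best, mean_return, n, min_return, max_return = manager.get_best_stats()
print(f"best candidate: mean return {mean_return} over {n} episodes")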
