Skip to content

Commit 13890e0

Browse files
xuangu-fangyou-n-gjingyuanlmHoder-zyf
authored
feat: mcts policy based on trace scheduler (#1203)
* init mcts class * full ver of MCTS * auto-lint * make MCTS feedback in exp-gen() * refactor: move reset logic from Trace to ExpGen and update usage accordingly * fix: reinitialize trace on consecutive errors in DataScienceRDLoop * feat: add reset method to BaseScheduler and call in MCTSScheduler reset * style: reorder imports for consistency and PEP8 compliance * lint * fix observe_feedback * fix bug * remove uncommited_rec_status * more simple * refactor: move commit observation logic to process_uncommitted_nodes method * docs: add TODO comment about rule-based virtual root node expansion * add score reward * fix bug * fix small bug * lint * change reward * lint --------- Co-authored-by: Young <[email protected]> Co-authored-by: jingyuanlm <[email protected]> Co-authored-by: amstrongzyf <[email protected]>
1 parent 0f722e1 commit 13890e0

File tree

5 files changed

+183
-1
lines changed

5 files changed

+183
-1
lines changed

rdagent/app/data_science/conf.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,13 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
101101
scheduler_temperature: float = 1.0
102102
"""The temperature for the trace scheduler for softmax calculation, used in ProbabilisticScheduler"""
103103

104+
# PUCT exploration constant for MCTSScheduler (ignored by other schedulers)
105+
scheduler_c_puct: float = 1.0
106+
"""Exploration constant used by MCTSScheduler (PUCT)."""
107+
108+
enable_score_reward: bool = False
109+
"""Enable using score-based reward for trace selection in multi-trace scheduling."""
110+
104111
#### multi-trace:checkpoint selector
105112
selector_name: str = "rdagent.scenarios.data_science.proposal.exp_gen.select.expand.LatestCKPSelector"
106113
"""The name of the selector to use"""

rdagent/core/proposal.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,14 @@ async def async_gen(self, trace: Trace, loop: LoopBase) -> Experiment:
325325
return self.gen(trace)
326326
await asyncio.sleep(1)
327327

328+
def reset(self) -> None:
    """
    Restore the generator to its initial state.

    The main loop may occasionally want to restart the whole proposal
    process from scratch; subclasses that hold internal state should
    override this hook. The base implementation is a no-op.
    """
335+
328336

329337
class HypothesisGen(ABC):
330338

rdagent/scenarios/data_science/loop.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@
3434
from rdagent.scenarios.data_science.proposal.exp_gen.base import DataScienceScen
3535
from rdagent.scenarios.data_science.proposal.exp_gen.idea_pool import DSKnowledgeBase
3636
from rdagent.scenarios.data_science.proposal.exp_gen.proposal import DSProposalV2ExpGen
37+
from rdagent.scenarios.data_science.proposal.exp_gen.trace_scheduler import (
38+
MCTSScheduler,
39+
)
3740
from rdagent.utils.workflow.misc import wait_retry
3841

3942

@@ -246,6 +249,7 @@ def record(self, prev_out: dict[str, Any]):
246249
),
247250
cur_loop_id,
248251
)
252+
# Value backpropagation is handled in async_gen before next() via observe_commits
249253

250254
if self.trace.sota_experiment() is None:
251255
if DS_RD_SETTING.coder_on_whole_pipeline:
@@ -271,6 +275,8 @@ def record(self, prev_out: dict[str, Any]):
271275
logger.error("Consecutive errors reached the limit. Dumping trace.")
272276
logger.log_object(self.trace, tag="trace before restart")
273277
self.trace = DSTrace(scen=self.trace.scen, knowledge_base=self.trace.knowledge_base)
278+
# Reset the trace; MCTS stats will be cleared via registered callback
279+
self.exp_gen.reset()
274280

275281
# set the SOTA experiment to submit
276282
sota_exp_to_submit = self.sota_exp_selector.get_sota_exp_to_submit(self.trace)

rdagent/scenarios/data_science/proposal/exp_gen/router/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
)
2222
from rdagent.scenarios.data_science.proposal.exp_gen.proposal import DSProposalV2ExpGen
2323
from rdagent.scenarios.data_science.proposal.exp_gen.trace_scheduler import (
24+
MCTSScheduler,
2425
RoundRobinScheduler,
2526
SOTABasedScheduler,
2627
TraceScheduler,
@@ -63,6 +64,9 @@ def gen(
6364
"ParallelMultiTraceExpGen is designed for async usage, please call async_gen instead."
6465
)
6566

67+
def reset(self) -> None:
    """Delegate the reset request to the underlying trace scheduler."""
    scheduler = self.trace_scheduler
    scheduler.reset()
69+
6670
async def async_gen(self, trace: DSTrace, loop: LoopBase) -> DSExperiment:
6771
"""
6872
Waits for a free execution slot, selects a parent trace using the

rdagent/scenarios/data_science/proposal/exp_gen/trace_scheduler.py

Lines changed: 158 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@
77
from collections import defaultdict
88
from typing import TYPE_CHECKING
99

10+
from rdagent.app.data_science.conf import DS_RD_SETTING
1011
from rdagent.log import rdagent_logger as logger
12+
from rdagent.scenarios.kaggle.kaggle_crawler import get_metric_direction
1113

1214
if TYPE_CHECKING:
1315
from rdagent.scenarios.data_science.proposal.exp_gen.base import DSTrace
@@ -38,6 +40,12 @@ async def next(self, trace: DSTrace) -> tuple[int, ...]:
3840
"""
3941
raise NotImplementedError
4042

43+
def reset(self) -> None:
    """Restore the scheduler to its initial state (no-op by default)."""
48+
4149

4250
class BaseScheduler(TraceScheduler):
4351
def __init__(self):
@@ -49,7 +57,10 @@ async def next(self, trace: DSTrace) -> tuple[int, ...]:
4957
Atomically selects the next leaf node from the trace in order.
5058
"""
5159
while True:
52-
# step 0: Commit the pending selections
60+
# step 1: Commit the pending selections
61+
self.process_uncommitted_nodes(trace)
62+
63+
# step 2: update uncommited_rec_status & rec_commit_idx
5364
for i in range(self.rec_commit_idx, len(trace.dag_parent)):
5465
parent_of_i = trace.dag_parent[i]
5566
if parent_of_i == trace.NEW_ROOT:
@@ -71,11 +82,22 @@ async def next(self, trace: DSTrace) -> tuple[int, ...]:
7182

7283
await asyncio.sleep(1)
7384

85+
def process_uncommitted_nodes(self, trace: DSTrace) -> None:
    """
    Hook for custom handling of nodes that have not been committed yet.

    Subclasses may override this to react to newly appended trace entries;
    the surrounding loop keeps `uncommited_rec_status` and `rec_commit_idx`
    up to date on its own, so the base implementation does nothing.
    """
7492
@abstractmethod
7593
def select(self, trace: DSTrace) -> tuple[int, ...] | None:
7694
"""Selects the parent nodes for the new experiment, or None if no selection can be made."""
7795
raise NotImplementedError
7896

97+
def reset(self) -> None:
    """Drop all pending-selection bookkeeping and restart commit tracking."""
    self.rec_commit_idx = 0
    self.uncommited_rec_status = defaultdict(int)
100+
79101

80102
class RoundRobinScheduler(BaseScheduler):
81103
"""
@@ -289,3 +311,138 @@ def calculate_potential(self, trace: DSTrace, leaf_id: int) -> float:
289311
Return random potential for uniform random selection.
290312
"""
291313
return random.random()
314+
315+
316+
class MCTSScheduler(ProbabilisticScheduler):
    """
    A simplified MCTS-based scheduler using a PUCT-like scoring rule.

    Formula:
        U(s, a) = Q(s, a) + c_puct * P(s, a) * sqrt(N(s)) / (1 + N(s, a))
    where Q is the average reward, N is the visit count, P is the prior
    probability, and c_puct balances exploration against exploitation.

    Design notes for this initial version:
    - Reuses ProbabilisticScheduler's potential calculation as prior P (via softmax).
    - Maintains per-node visit/value statistics to compute Q and U.
    - Only the global counter is bumped at selection time; per-node value and
      visit updates are deferred to ``observe_feedback`` once committed.
    - NEW_ROOT policy and uncommitted-status handling follow the base classes.
    """

    def __init__(self, max_trace_num: int, temperature: float = 1.0, *args, **kwargs):
        super().__init__(max_trace_num, temperature)
        # Read c_puct from settings if available, otherwise fall back to 1.0.
        # NOTE(review): `or 1.0` also replaces an explicit 0.0 with 1.0 —
        # presumably intentional (never allow zero exploration); confirm.
        self.c_puct = getattr(DS_RD_SETTING, "scheduler_c_puct", 1.0) or 1.0
        # Statistics keyed by node index into trace.hist.
        self.node_visit_count: dict[int, int] = {}
        self.node_value_sum: dict[int, float] = {}
        self.node_prior: dict[int, float] = {}
        # Global selection counter used in the sqrt(N(s)) exploration term.
        self.global_visit_count: int = 0
        # Index of the first trace entry not yet seen by observe_feedback.
        self.last_observed_commit_idx: int = 0

    def _get_q(self, node_id: int) -> float:
        """Average observed reward for ``node_id`` (0.0 for unvisited nodes)."""
        visits = self.node_visit_count.get(node_id, 0)
        if visits <= 0:
            # Unseen nodes default to neutral Q.
            return 0.0
        return self.node_value_sum.get(node_id, 0.0) / visits

    def _get_u(self, node_id: int) -> float:
        """Exploration bonus: c_puct * P * sqrt(N_global) / (1 + N_node)."""
        prior = self.node_prior.get(node_id, 0.0)
        visits = self.node_visit_count.get(node_id, 0)
        # max(1, ...) keeps the bonus non-zero before any selection happened
        # and avoids div-by-zero concerns in the denominator via (1 + visits).
        return self.c_puct * prior * math.sqrt(max(1, self.global_visit_count)) / (1 + visits)

    def select(self, trace: DSTrace) -> tuple[int, ...] | None:
        """Select the parent node for the next experiment via argmax(Q + U)."""
        # Step 1: same policy as the base class — keep spawning new root
        # traces until the target number of parallel traces is reached.
        # TODO: expanding from the virtual root node is implemented in a rule-based way.
        if trace.sub_trace_count + self.uncommited_rec_status[trace.NEW_ROOT] < self.max_trace_num:
            return trace.NEW_ROOT

        # Step 2: candidate set.
        # NOTE(review): this enumerates *every* node in trace.hist; nodes
        # currently being expanded are not filtered out — confirm intended.
        candidates = list(range(len(trace.hist)))
        if not candidates:
            return None

        # Step 3: compute priors (P) from potentials via softmax.
        potentials = [self.calculate_potential(trace, leaf) for leaf in candidates]
        if any(p < 0 for p in potentials):
            raise ValueError("Potential function returned a negative value.")
        priors = self._softmax_probabilities(potentials)
        for leaf, prior in zip(candidates, priors):
            self.node_prior[leaf] = prior

        # Step 4: score each candidate with the PUCT-like rule: Q + U.
        best_leaf = None
        best_score = -float("inf")
        for leaf in candidates:
            score = self._get_q(leaf) + self._get_u(leaf)
            if score > best_score:
                best_score = score
                best_leaf = leaf

        if best_leaf is None:
            return None

        # Step 5: bump the global counter on selection; per-node value/visit
        # updates are deferred to observe_feedback after commit.
        self.global_visit_count += 1

        return (best_leaf,)

    def observe_feedback(self, trace: DSTrace, new_idx: int, reward: float | None = None) -> None:
        """
        Back-propagate a reward along the ancestry of a committed experiment.

        Args:
            trace: The DSTrace object.
            new_idx: Index of the newly appended experiment in trace.hist.
            reward: Optional explicit reward. If None, derive it from the
                feedback: either a score-based reward (when
                ``enable_score_reward`` is set) or 1.0/0.0 from
                ``feedback.decision``.
        """
        if reward is None:
            if 0 <= new_idx < len(trace.hist):
                exp, fb = trace.hist[new_idx]
                if DS_RD_SETTING.enable_score_reward:
                    bigger_is_better = get_metric_direction(trace.scen.competition)
                    if getattr(fb, "decision", False):
                        # Squash the validation score into (-1, 1); flip the
                        # sign for smaller-is-better metrics so that better
                        # scores always map to larger rewards.
                        reward = math.tanh(exp.result.loc["ensemble"].iloc[0].round(3)) * (
                            1 if bigger_is_better else -1
                        )
                    else:
                        # Failed experiments get the minimum reward regardless
                        # of metric direction. (Previously this was +1 for
                        # smaller-is-better metrics, which ranked failures
                        # above every successful experiment.)
                        reward = -1.0
                else:
                    reward = 1.0 if getattr(fb, "decision", False) else 0.0
            else:
                # Out-of-range safety.
                reward = 0.0

        # Back-propagate along the path from the root to the new node.
        for parent_id in trace.get_parents(new_idx):
            self.node_value_sum[parent_id] = self.node_value_sum.get(parent_id, 0.0) + float(reward)
            self.node_visit_count[parent_id] = self.node_visit_count.get(parent_id, 0) + 1

    def reset(self) -> None:
        """
        Clear all maintained statistics. Should be called when the underlying trace is reset.
        """
        super().reset()
        self.node_visit_count.clear()
        self.node_value_sum.clear()
        self.node_prior.clear()
        self.global_visit_count = 0
        self.last_observed_commit_idx = 0

    def process_uncommitted_nodes(self, trace: DSTrace) -> None:
        """
        Batch-observe every experiment committed since the last observation.

        Called by BaseScheduler.next() before each new selection so the
        Q/N statistics stay up to date.
        """
        start_idx = max(0, self.last_observed_commit_idx)
        # Only observe fully committed items (present in both dag_parent and hist).
        end_idx = min(len(trace.dag_parent), len(trace.hist))
        if start_idx >= end_idx:
            return
        for idx in range(start_idx, end_idx):
            self.observe_feedback(trace, idx)
        self.last_observed_commit_idx = end_idx

0 commit comments

Comments
 (0)