PrimeIntellect-ai · willccbb · Nov 19, 2025 · Nov 9, 2025 · Nov 9, 2025 · Nov 9, 2025
diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml
@@ -29,12 +29,12 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v6
         with:
-          python-version: '3.11'
+          python-version: "3.11"
       - name: Install uv
         uses: astral-sh/setup-uv@v4
         with:
           version: "latest"
       - name: Install dependencies
-        run: uv sync
+        run: uv sync --extra rl
       - name: Run ty
-        run: uv run ty check .
+        run: uv run ty check verifiers
diff --git a/.gitignore b/.gitignore
@@ -29,7 +29,7 @@ docs/build/
 *.pyc
 
 # libraries
-prime-rl/
+prime-rl
 
 # outputs
 wandb/

diff --git a/configs/prime-rl/wiki-search.toml b/configs/prime-rl/wiki-search.toml
@@ -1,13 +1,14 @@
-inference_gpu_ids = [0]
-trainer_gpu_ids = [1]
+inference_gpu_ids = [0, 1, 2, 3, 4, 5]
+trainer_gpu_ids = [6, 7]
 
 max_steps = 500
+max_async_level = 4
 
 [model]
-name = "Qwen/Qwen3-4B-Instruct-2507"
+name = "Qwen/Qwen3-4B-Thinking-2507"
 
 [wandb]
-project = "wiki-search"
+project = "wiki-search-debug"
 name = "wiki-search-4b"
 
 [trainer.optim]
@@ -31,16 +32,16 @@ target_modules = [
 [orchestrator]
 batch_size = 512
 rollouts_per_example = 16
-seq_len = 4096
+seq_len = 16384
 mask_truncated_completions = false
 zero_truncated_completions = true
+oversampling_factor = 2.0
 
 [orchestrator.sampling]
-max_tokens = 512
+max_tokens = 4096
 
 [orchestrator.buffer]
-type = "online-difficulty"
-oversampling_factor = 2.0
+online_difficulty_filtering = true
 
 [[orchestrator.env]]
 id = "primeintellect/wiki-search"

diff --git a/configs/vf-rl/reasoning-gym.toml b/configs/vf-rl/reasoning-gym.toml
@@ -10,13 +10,11 @@ num_eval_examples = 2000
 seed = 1
 
 [inference]
-gpus = 4
-tensor_parallel_size = 2
-data_parallel_size = 2
+gpus = 6
 enforce_eager = true
 
 [trainer]
-gpus = 4
+gpus = 2
 batch_size = 512
 micro_batch_size = 2
 max_seq_len = 4096

diff --git a/configs/vf-rl/wiki-search.toml b/configs/vf-rl/wiki-search.toml
@@ -1,7 +1,7 @@
 model = "Qwen/Qwen3-4B-Instruct-2507"
 
 [env]
-id = "primeintellect/wiki-search"
+id = "wiki-search"
 
 [env.args]
 max_turns = 10
@@ -20,7 +20,7 @@ gpus = 1
 run_name = "wiki-search"
 micro_batch_size = 4
 rollouts_per_example = 16
-batch_size = 1024
+batch_size = 512
 max_steps = 500
 max_tokens = 512
 max_seq_len = 4096
diff --git a/configs/vf-rl/wordle.toml b/configs/vf-rl/wordle.toml
@@ -1,21 +1,18 @@
 model = "Qwen/Qwen3-4B-Instruct-2507"
 
 [env]
-id = "will/wordle"
+id = "wordle"
 
 [inference]
 gpus = 1
 
-[inference.args]
-enforce_eager = true
-
 [trainer]
 gpus = 1
 
 [trainer.args]
 lora_target_modules = "all-linear"
 run_name = "wordle"
-micro_batch_size = 8
+micro_batch_size = 4
 rollouts_per_example = 16
 batch_size = 512
 max_steps = 500

diff --git a/environments/math_group/pyproject.toml b/environments/math_group/pyproject.toml
@@ -1,8 +1,8 @@
 [project]
 name = "math-group"
-version = "0.1.0"
+version = "0.1.1"
 dependencies = [
-    "verifiers>=0.1.4",
+    "verifiers>=0.1.8",
     "math-verify>=0.8.0",
 ]
 

diff --git a/environments/math_python/pyproject.toml b/environments/math_python/pyproject.toml
@@ -2,10 +2,10 @@
 name = "math-python"
 description = "Solve math problems using Python in a sandbox environment"
 tags = ["tool-use", "math", "sandbox", "train", "prime-sandboxes", "python", "coding"]
-version = "0.1.7"
+version = "0.1.8"
 requires-python = ">=3.11"
 dependencies = [
-    "verifiers>=0.1.5.post0",
+    "verifiers>=0.1.8",
     "math-verify>=0.8.0",
 ]
 

diff --git a/environments/sentence_repeater/sentence_repeater.py b/environments/sentence_repeater/sentence_repeater.py
@@ -1,7 +1,7 @@
 import random
 from copy import deepcopy
 from difflib import SequenceMatcher
-from typing import List, Tuple
+from typing import List
 
 from datasets import Dataset, load_dataset
 
@@ -75,19 +75,19 @@ class SentenceRepeaterEnv(vf.MultiTurnEnv):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-    async def is_completed(self, messages: Messages, state: State, **kwargs) -> bool:
-        max_turns_reached = await super().is_completed(messages, state, **kwargs)
-        return state["turn"] >= len(state["info"]["questions"]) or max_turns_reached
+    @vf.stop
+    async def all_questions_answered(self, state: State) -> bool:
+        return len(state["trajectory"]) >= len(state["info"]["questions"])
 
     async def env_response(
         self, messages: Messages, state: State, **kwargs
-    ) -> Tuple[Messages, State]:
+    ) -> Messages:
         return [
             {
                 "role": "user",
-                "content": state["info"]["questions"][state["turn"]],
+                "content": state["info"]["questions"][len(state["trajectory"])],
             }
-        ], state
+        ]
 
 
 def load_environment(**kwargs) -> vf.Environment:

diff --git a/environments/wiki_search/pyproject.toml b/environments/wiki_search/pyproject.toml
@@ -3,9 +3,9 @@ name = "wiki-search"
 description = "Agentic RAG over Wikipedia pages for trivia Q&A"
 tags = ["wikipedia", "multi-turn", "agentic-search", "rag", "train", "eval", "llm-judge"]
 requires-python = ">=3.11"
-version = "0.1.20"
+version = "0.1.21"
 dependencies = [
-    "verifiers>=0.1.7",
+    "verifiers>=0.1.8",
     "chromadb",
     "datasets",
     "openai",

diff --git a/environments/wordle/outputs/evals/wordle--gpt-4.1-mini/816ec3b0/metadata.json b/environments/wordle/outputs/evals/wordle--gpt-4.1-mini/816ec3b0/metadata.json
diff --git a/environments/wordle/outputs/evals/wordle--gpt-4.1-mini/816ec3b0/results.jsonl b/environments/wordle/outputs/evals/wordle--gpt-4.1-mini/816ec3b0/results.jsonl
diff --git a/environments/wordle/pyproject.toml b/environments/wordle/pyproject.toml
@@ -2,9 +2,9 @@
 name = "wordle"
 description = "Game environment for Wordle, built on top of TextArena"
 tags = ["textarena", "multi-turn", "reasoning", "game", "train", "eval"]
-version = "0.1.5"
+version = "0.1.6"
 dependencies = [
-    "verifiers>=0.1.5.post0",
+    "verifiers>=0.1.8",
     "nltk",
     "textarena",
 ]