Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
2545383
big chungus refactor for branching rollouts + cleaner state handling
willccbb Nov 9, 2025
c602c16
tests passing
willccbb Nov 9, 2025
527a1f4
3.11 fix; ruff
willccbb Nov 9, 2025
6223463
vllm logprob args
willccbb Nov 9, 2025
1fda16f
dict indexing for messages
willccbb Nov 9, 2025
8c587ee
remove generateinputs
willccbb Nov 9, 2025
c2faf0b
optional truncation in trajectorystep for tokens
willccbb Nov 9, 2025
580d8da
small tweaks
willccbb Nov 9, 2025
f7a7394
optional decorator rank for sorting order
willccbb Nov 10, 2025
ccc7a40
minor tweak
willccbb Nov 10, 2025
a91e975
change rank -> priority
willccbb Nov 10, 2025
f91f9e0
add cleanup to is_completed
willccbb Nov 15, 2025
32d8f9f
tool_env error handling, sandbox command timeout
willccbb Nov 16, 2025
b61334f
handle updated context length msg
willccbb Nov 16, 2025
a0e9319
duplicate is_truncated field
willccbb Nov 17, 2025
c1b7e28
add model/sampling to state
willccbb Nov 17, 2025
e5ea48b
client/model/sampling in init_state
willccbb Nov 17, 2025
4e951bb
updated config
willccbb Nov 17, 2025
4aba40a
add kimi overlong prompt message
willccbb Nov 17, 2025
27015ab
add kimi overlong prompt message
willccbb Nov 17, 2025
a4ec20e
set_max_seq_len
willccbb Nov 17, 2025
8860d81
Add numpy, sympy, and scipy to PythonEnv
snimu Nov 17, 2025
1ba0ebf
Merge branch 'main' into trajectories
willccbb Nov 19, 2025
752658d
pin prime-rl to will/trajectories branch
willccbb Nov 19, 2025
716e658
update prime-rl wiki-search config
willccbb Nov 19, 2025
0b04bed
ruff, ty
willccbb Nov 19, 2025
d871e72
fix init_state tests
willccbb Nov 19, 2025
8344655
version, release notes
willccbb Nov 19, 2025
3fd6e86
env version bumps
willccbb Nov 19, 2025
3bec3fa
ty fixes
willccbb Nov 19, 2025
a1224ce
opt deps for ty CI
willccbb Nov 19, 2025
65a403a
pin trajectories for configs
willccbb Nov 19, 2025
e593301
use verifiers commit for configs
willccbb Nov 19, 2025
a273dbf
ty for verifiers only
willccbb Nov 19, 2025
0421bd1
bump vllm version
willccbb Nov 19, 2025
5a09fe1
process overlong prompt into trajectory
willccbb Nov 19, 2025
fb313c8
skip steps with None tokens
willccbb Nov 19, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/style.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,12 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v6
with:
python-version: '3.11'
python-version: "3.11"
- name: Install uv
uses: astral-sh/setup-uv@v4
with:
version: "latest"
- name: Install dependencies
run: uv sync
run: uv sync --extra rl
- name: Run ty
run: uv run ty check .
run: uv run ty check verifiers
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ docs/build/
*.pyc

# libraries
prime-rl/
prime-rl

# outputs
wandb/
Expand Down
18 changes: 10 additions & 8 deletions configs/prime-rl/wiki-search.toml
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
inference_gpu_ids = [0]
trainer_gpu_ids = [1]
inference_gpu_ids = [0,1,2,3,4,5]
trainer_gpu_ids = [6,7]

max_steps = 500
max_async_level = 4

[model]
name = "Qwen/Qwen3-4B-Instruct-2507"
name = "Qwen/Qwen3-4B-Thinking-2507"

[wandb]
project = "wiki-search"
project = "wiki-search-debug"
name = "wiki-search-4b"

[trainer.optim]
Expand All @@ -31,16 +32,17 @@ target_modules = [
[orchestrator]
batch_size = 512
rollouts_per_example = 16
seq_len = 4096
seq_len = 16384
mask_truncated_completions = false
zero_truncated_completions = true
oversampling_factor = 2.0


[orchestrator.sampling]
max_tokens = 512
max_tokens = 4096

[orchestrator.buffer]
type = "online-difficulty"
oversampling_factor = 2.0
online_difficulty_filtering = true

[[orchestrator.env]]
id = "primeintellect/wiki-search"
Expand Down
6 changes: 2 additions & 4 deletions configs/vf-rl/reasoning-gym.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,11 @@ num_eval_examples = 2000
seed = 1

[inference]
gpus = 4
tensor_parallel_size = 2
data_parallel_size = 2
gpus = 6
enforce_eager = true

[trainer]
gpus = 4
gpus = 2
batch_size = 512
micro_batch_size = 2
max_seq_len = 4096
Expand Down
4 changes: 2 additions & 2 deletions configs/vf-rl/wiki-search.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
model = "Qwen/Qwen3-4B-Instruct-2507"

[env]
id = "primeintellect/wiki-search"
id = "wiki-search"

[env.args]
max_turns = 10
Expand All @@ -20,7 +20,7 @@ gpus = 1
run_name = "wiki-search"
micro_batch_size = 4
rollouts_per_example = 16
batch_size = 1024
batch_size = 512
max_steps = 500
max_tokens = 512
max_seq_len = 4096
7 changes: 2 additions & 5 deletions configs/vf-rl/wordle.toml
Original file line number Diff line number Diff line change
@@ -1,21 +1,18 @@
model = "Qwen/Qwen3-4B-Instruct-2507"

[env]
id = "will/wordle"
id = "wordle"

[inference]
gpus = 1

[inference.args]
enforce_eager = true

[trainer]
gpus = 1

[trainer.args]
lora_target_modules = "all-linear"
run_name = "wordle"
micro_batch_size = 8
micro_batch_size = 4
rollouts_per_example = 16
batch_size = 512
max_steps = 500
Expand Down
4 changes: 2 additions & 2 deletions environments/math_group/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
[project]
name = "math-group"
version = "0.1.0"
version = "0.1.1"
dependencies = [
"verifiers>=0.1.4",
"verifiers>=0.1.8",
"math-verify>=0.8.0",
]

Expand Down
4 changes: 2 additions & 2 deletions environments/math_python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@
name = "math-python"
description = "Solve math problems using Python in a sandbox environment"
tags = ["tool-use", "math", "sandbox", "train", "prime-sandboxes", "python", "coding"]
version = "0.1.7"
version = "0.1.8"
requires-python = ">=3.11"
dependencies = [
"verifiers>=0.1.5.post0",
"verifiers>=0.1.8",
"math-verify>=0.8.0",
]

Expand Down
12 changes: 6 additions & 6 deletions environments/sentence_repeater/sentence_repeater.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import random
from copy import deepcopy
from difflib import SequenceMatcher
from typing import List, Tuple
from typing import List

from datasets import Dataset, load_dataset

Expand Down Expand Up @@ -75,19 +75,19 @@ class SentenceRepeaterEnv(vf.MultiTurnEnv):
def __init__(self, **kwargs):
super().__init__(**kwargs)

async def is_completed(self, messages: Messages, state: State, **kwargs) -> bool:
max_turns_reached = await super().is_completed(messages, state, **kwargs)
return state["turn"] >= len(state["info"]["questions"]) or max_turns_reached
@vf.stop
async def all_questions_answered(self, state: State) -> bool:
return len(state["trajectory"]) >= len(state["info"]["questions"])

async def env_response(
self, messages: Messages, state: State, **kwargs
) -> Tuple[Messages, State]:
) -> Messages:
return [
{
"role": "user",
"content": state["info"]["questions"][state["turn"]],
}
], state
]


def load_environment(**kwargs) -> vf.Environment:
Expand Down
4 changes: 2 additions & 2 deletions environments/wiki_search/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@ name = "wiki-search"
description = "Agentic RAG over Wikipedia pages for trivia Q&A"
tags = ["wikipedia", "multi-turn", "agentic-search", "rag", "train", "eval", "llm-judge"]
requires-python = ">=3.11"
version = "0.1.20"
version = "0.1.21"
dependencies = [
"verifiers>=0.1.7",
"verifiers>=0.1.8",
"chromadb",
"datasets",
"openai",
Expand Down

This file was deleted.

This file was deleted.

4 changes: 2 additions & 2 deletions environments/wordle/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
name = "wordle"
description = "Game environment for Wordle, built on top of TextArena"
tags = ["textarena", "multi-turn", "reasoning", "game", "train", "eval"]
version = "0.1.5"
version = "0.1.6"
dependencies = [
"verifiers>=0.1.5.post0",
"verifiers>=0.1.8",
"nltk",
"textarena",
]
Expand Down
Loading
Loading