-
Notifications
You must be signed in to change notification settings - Fork 14
Expand file tree
/
Copy path09_flexible_rewards.py
More file actions
121 lines (98 loc) · 2.68 KB
/
09_flexible_rewards.py
File metadata and controls
121 lines (98 loc) · 2.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
"""
ToolBrain Training Example - Flexible Rewards
This script demonstrates different reward functions for training agents:
1. Exact match (standard accuracy)
2. Tool execution success
3. Combined rewards with weights
4. LLM-as-a-Judge with Gemini
"""
from dotenv import load_dotenv
load_dotenv()
from smolagents import tool, TransformersModel, CodeAgent
from toolbrain import Brain
from toolbrain.rewards import (
reward_exact_match,
reward_tool_execution_success,
reward_combined,
)
from toolbrain.core_types import Trace
from typing import Any
# --- 1. Define Tools ---
@tool
def add(a: int, b: int) -> int:
    """
    Return the sum of two integers.

    Args:
        a (int): First addend.
        b (int): Second addend.

    Returns:
        int: Sum of a and b.
    """
    # Trivial arithmetic kept deliberately simple: the point of the
    # example is the reward functions, not the tools.
    total = a + b
    return total
@tool
def multiply(a: int, b: int) -> int:
    """
    Return the product of two integers.

    Args:
        a (int): First factor.
        b (int): Second factor.

    Returns:
        int: Product of a and b.
    """
    product = a * b
    return product
# --- 2. Training Data ---
# Each training example pairs a natural-language query with the expected
# final answer (as a string, so exact-match rewards can compare directly).
_examples = [
    ("Use the add tool to calculate 5 + 7", "12"),
    ("What is 8 multiplied by 6?", "48"),
]
training_dataset = [
    {"query": query, "gold_answer": gold_answer}
    for query, gold_answer in _examples
]
# Custom combined reward function
def custom_combined_reward(trace: Trace, **kwargs: Any) -> float:
    """
    Score a trace by blending two reward signals with fixed weights.

    Delegates to ``reward_combined`` after forcing the weight split to
    70% exact-match correctness and 30% successful tool execution.

    Args:
        trace (Trace): The agent execution trace to score.
        **kwargs (Any): Extra arguments forwarded to ``reward_combined``
            (any caller-supplied ``weights`` entry is overridden).

    Returns:
        float: The weighted combined reward.
    """
    kwargs["weights"] = {
        "exact_match": 0.7,   # 70% weight on answer correctness
        "tool_success": 0.3,  # 30% weight on successful tool execution
    }
    return reward_combined(trace, **kwargs)
print("🧠 ToolBrain Flexible Rewards Example")
print("=" * 60)
# 1. Create agent
# Small instruction-tuned model so the example runs on modest hardware;
# max_new_tokens caps each generation step.
model = TransformersModel(
    model_id="Qwen/Qwen2.5-0.5B-Instruct",
    max_new_tokens=128
)
# max_steps=1 keeps each rollout to a single tool-use step for speed.
agent = CodeAgent(
    model=model,
    tools=[add, multiply],
    max_steps=1
)
print("✅ Agent created.")
# 2. Train with exact match reward (standard)
# Kept commented out as an alternative example: rewards only an exact
# string match against gold_answer.
# print("\n🎯 Training with exact match reward...")
# brain_exact = Brain(
#     agent,
#     algorithm="GRPO",
#     reward_func=reward_exact_match
# )
# brain_exact.train(training_dataset[:1], num_iterations=1)
# # 3. Train with tool execution success reward
# Kept commented out as an alternative example: rewards traces whose
# tool calls executed without error, regardless of the final answer.
# print("\n🛠️ Training with tool execution success reward...")
# brain_tool = Brain(
#     agent,
#     algorithm="GRPO",
#     reward_func=reward_tool_execution_success
# )
# brain_tool.train(training_dataset[:1], num_iterations=1)
# 4. Train with combined reward (exact match + tool success)
# Active example: 70/30 blend defined in custom_combined_reward above.
print("\n🎭 Training with combined reward (70% accuracy + 30% tool success)...")
brain_combined = Brain(
    agent,
    algorithm="GRPO",
    reward_func=custom_combined_reward
)
# Single example, single iteration: a smoke-test-sized training run.
brain_combined.train(training_dataset[:1], num_iterations=1)