Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhancement/page summary #518

Merged
merged 2 commits into from
Feb 2, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions browser_use/agent/message_manager/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,11 +82,12 @@ def __init__(
'name': 'AgentOutput',
'args': {
'current_state': {
'evaluation_previous_goal': 'Success - No previous actions to evaluate.',
'memory': 'Starting with the new task 0/10 done',
'next_goal': 'Start browser',
'page_summary': 'On the page are company a,b,c with their revenue 1,2,3.',
'evaluation_previous_goal': 'Success - I opened the first page',
'memory': 'Starting with the new task. I have completed 1/10 steps',
'next_goal': 'Click on company a',
},
'action': [],
'action': [{'click_element': {'index': 0}}],
},
'id': str(self.tool_id),
'type': 'tool_call',
Expand All @@ -106,7 +107,7 @@ def __init__(

self.tool_id += 1

placeholder_message = HumanMessage(content='Task history starts here:')
placeholder_message = HumanMessage(content='[Your task history memory starts here]')
self._add_message_with_tokens(placeholder_message)

@staticmethod
Expand Down
1 change: 1 addition & 0 deletions browser_use/agent/message_manager/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@ def test_token_overflow_handling_with_real_flow(message_manager: MessageManager,

output = AgentOutput(
current_state=AgentBrain(
page_summary=f'Thought process from step {i}',
evaluation_previous_goal=f'Success in step {i}',
memory=f'Memory from step {i}',
next_goal=f'Goal for step {i + 1}',
Expand Down
19 changes: 14 additions & 5 deletions browser_use/agent/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ def important_rules(self) -> str:
1. RESPONSE FORMAT: You must ALWAYS respond with valid JSON in this exact format:
{
"current_state": {
"evaluation_previous_goal": "Success|Failed|Unknown - Analyze the current elements and the image to check if the previous goals/actions are successful like intended by the task. Ignore the action result. The website is the ground truth. Also mention if something unexpected happened like new suggestions in an input field. Shortly state why/why not",
"page_summary": "Quick detailed summary of new information from the current page which is not yet in the task history memory. Be specific with details which are important for the task. This is not on the meta level, but should be facts. If all the information is already in the task history memory, leave this empty.",
"evaluation_previous_goal": "Success|Failed|Unknown - Analyze the current elements and the image to check if the previous goals/actions are successful like intended by the task. Ignore the action result. The website is the ground truth. Also mention if something unexpected happened like new suggestions in an input field. Shortly state why/why not",
"memory": "Description of what has been done and what you need to remember. Be very specific. Count here ALWAYS how many times you have done something and how many remain. E.g. 0 out of 10 websites analyzed. Continue with abc and xyz",
"next_goal": "What needs to be done with the next actions"
},
Expand Down Expand Up @@ -57,7 +58,7 @@ def important_rules(self) -> str:

4. NAVIGATION & ERROR HANDLING:
- If no suitable elements exist, use other functions to complete the task
- If stuck, try alternative approaches
- If stuck, try alternative approaches - like going back to a previous page, new search, new tab etc.
- Handle popups/cookies by accepting or closing them
- Use scroll to find elements you are looking for

Expand Down Expand Up @@ -88,6 +89,12 @@ def important_rules(self) -> str:
- Try to be efficient, e.g. fill forms at once, or chain actions where nothing changes on the page like saving, extracting, checkboxes...
- only use multiple actions if it makes sense.

9. Long tasks:
- If the task is long keep track of the status in the memory. If the ultimate task requires multiple subinformation, keep track of the status in the memory.
- If you get stuck, try alternative approaches - like going back to a previous page, new search, new tab etc.

10. Extraction:
- If your task is to find information or do research - call extract_page_content on the specific pages to get and store the information.

"""
text += f' - use maximum {self.max_actions_per_step} actions per sequence'
Expand Down Expand Up @@ -124,8 +131,8 @@ def get_system_message(self) -> SystemMessage:

AGENT_PROMPT = f"""You are a precise browser automation agent that interacts with websites through structured commands. Your role is to:
1. Analyze the provided webpage elements and structure
2. Plan a sequence of actions to accomplish the given task
3. Respond with valid JSON containing your action sequence and state assessment
2. Use the given information to accomplish the ultimate task
3. Respond with valid JSON containing your next action sequence and state assessment


{self.input_format()}
Expand Down Expand Up @@ -190,7 +197,9 @@ def get_user_message(self, use_vision: bool = True) -> HumanMessage:
step_info_description += f'Current date and time: {time_str}'

state_description = f"""

[Task history memory ends here]
[Current state starts here]
You will see the following only once - if you need to remember it and you don't know it yet, write it down in the memory:
Current url: {self.state.url}
Available tabs:
{self.state.tabs}
Expand Down
2 changes: 1 addition & 1 deletion browser_use/agent/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,7 +412,7 @@ def _log_response(self, response: AgentOutput) -> None:
emoji = '⚠'
else:
emoji = '🤷'

logger.debug(f'🤖 {emoji} Page summary: {response.current_state.page_summary}')
logger.info(f'{emoji} Eval: {response.current_state.evaluation_previous_goal}')
logger.info(f'🧠 Memory: {response.current_state.memory}')
logger.info(f'🎯 Next goal: {response.current_state.next_goal}')
Expand Down
8 changes: 7 additions & 1 deletion browser_use/agent/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,12 @@ def sample_history(action_registry):
histories = [
AgentHistory(
model_output=AgentOutput(
current_state=AgentBrain(evaluation_previous_goal='None', memory='Started task', next_goal='Click button'),
current_state=AgentBrain(
page_summary='I need to find the founders of browser-use',
evaluation_previous_goal='None',
memory='Started task',
next_goal='Click button',
),
action=[click_action],
),
result=[ActionResult(is_done=False)],
Expand Down Expand Up @@ -107,6 +112,7 @@ def sample_history(action_registry):
AgentHistory(
model_output=AgentOutput(
current_state=AgentBrain(
page_summary='I found out that the founders are John Doe and Jane Smith. I need to draft them a message.',
evaluation_previous_goal='Extracted content',
memory='Content extracted',
next_goal='Finish task',
Expand Down
1 change: 1 addition & 0 deletions browser_use/agent/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ class ActionResult(BaseModel):
class AgentBrain(BaseModel):
"""Current state of the agent"""

page_summary: str
evaluation_previous_goal: str
memory: str
next_goal: str
Expand Down
14 changes: 3 additions & 11 deletions examples/features/small_model_for_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,10 @@

load_dotenv()

# Initialize the model
llm = ChatOpenAI(
model='gpt-4o',
temperature=0.0,
)
task = 'Find the founders of browser-use in ycombinator, extract all links and open the links one by one'


# smaller model to extract content
llm = ChatOpenAI(model='gpt-4o', temperature=0.0)
small_llm = ChatOpenAI(model='gpt-4o-mini', temperature=0.0)

agent = Agent(task=task, llm=llm, max_actions_per_step=2, page_extraction_llm=small_llm)
task = 'Find the founders of browser-use in ycombinator, extract all links and open the links one by one'
agent = Agent(task=task, llm=llm, page_extraction_llm=small_llm)


async def main():
Expand Down