Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Lib 41 fix extraction for text fields in iframe bodies #764

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
0ccbb65
Extraction test script
MagMueller Feb 13, 2025
7e40b6d
Exclude label as interactive, include iframe bodies
MagMueller Feb 13, 2025
1147b05
Ignore json
MagMueller Feb 13, 2025
723a988
Fix default disable_security
MagMueller Feb 13, 2025
7a88f1a
Fix send_keys for words in natural language
MagMueller Feb 13, 2025
c452f93
Simplify input text
MagMueller Feb 13, 2025
6dd84bc
Include canvas in extraction
MagMueller Feb 13, 2025
483b589
Wrap wait in try catch
MagMueller Feb 13, 2025
8052b07
Merge branch 'main' into LIB-41-fix-extraction-for-text-fields-in-ifr…
MagMueller Feb 13, 2025
cfa41e1
Separate isTop and is in viewport
MagMueller Feb 14, 2025
4ec5fe4
Include in viewport parameter
MagMueller Feb 15, 2025
e5fe568
Only show last line of error
MagMueller Feb 15, 2025
dad3892
Don't show element to click (takes too long)
MagMueller Feb 15, 2025
2571332
Include top layer description
MagMueller Feb 18, 2025
874e150
Remove tabindex
MagMueller Feb 18, 2025
5b4f297
Include escape
MagMueller Feb 18, 2025
11103d6
Fix for isOnTop isInViewport - scroll as fallback
MagMueller Feb 18, 2025
887f177
Fixed highlights and scrolling for multiple scrollers
MagMueller Feb 18, 2025
f31d573
Simplify test
MagMueller Feb 18, 2025
ecb749a
Remove link
MagMueller Feb 18, 2025
52d8200
Check if text node is visible
MagMueller Feb 18, 2025
ebee83e
Separate checks if text is on top
MagMueller Feb 18, 2025
9824b67
Check scrolling for text nodes
MagMueller Feb 18, 2025
d34242c
Fix bounding box for iframes
MagMueller Feb 18, 2025
9b3331b
Split _build_dom_tree into 2 functions
MagMueller Feb 18, 2025
65703c4
Remove page from test
MagMueller Feb 18, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -183,4 +183,6 @@ gcp-login.json
.idea
*.txt
*.pdf
*.csv
*.csv
*.json
*.jsonl
13 changes: 4 additions & 9 deletions browser_use/agent/message_manager/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,8 @@

import json
import logging
from datetime import datetime
from typing import Dict, List, Optional, Type

from langchain_anthropic import ChatAnthropic
from langchain_core.language_models import BaseChatModel
from langchain_core.messages import (
AIMessage,
Expand All @@ -14,7 +12,6 @@
SystemMessage,
ToolMessage,
)
from langchain_openai import ChatOpenAI

from browser_use.agent.message_manager.views import MessageHistory, MessageMetadata
from browser_use.agent.prompts import AgentMessagePrompt, SystemPrompt
Expand All @@ -35,7 +32,6 @@ def __init__(
estimated_characters_per_token: int = 3,
image_tokens: int = 800,
include_attributes: list[str] = [],
max_error_length: int = 400,
max_actions_per_step: int = 10,
message_context: Optional[str] = None,
sensitive_data: Optional[Dict[str, str]] = None,
Expand All @@ -49,7 +45,6 @@ def __init__(
self.estimated_characters_per_token = estimated_characters_per_token
self.IMG_TOKENS = image_tokens
self.include_attributes = include_attributes
self.max_error_length = max_error_length
self.message_context = message_context
self.sensitive_data = sensitive_data
system_message = self.system_prompt_class(
Expand Down Expand Up @@ -95,12 +90,12 @@ def __init__(
]

example_tool_call = AIMessage(
content=f'',
content='',
tool_calls=tool_calls,
)
self._add_message_with_tokens(example_tool_call)
tool_message = ToolMessage(
content=f'Browser started',
content='Browser started',
tool_call_id=str(self.tool_id),
)
self._add_message_with_tokens(tool_message)
Expand Down Expand Up @@ -147,7 +142,8 @@ def add_state_message(
msg = HumanMessage(content='Action result: ' + str(r.extracted_content))
self._add_message_with_tokens(msg)
if r.error:
msg = HumanMessage(content='Action error: ' + str(r.error)[-self.max_error_length :])
last_line = r.error.split('\n')[-1]
msg = HumanMessage(content='Action error: ' + last_line)
self._add_message_with_tokens(msg)
result = None # if result in history, we dont want to add it again

Expand All @@ -156,7 +152,6 @@ def add_state_message(
state,
result,
include_attributes=self.include_attributes,
max_error_length=self.max_error_length,
step_info=step_info,
).get_user_message(use_vision)
self._add_message_with_tokens(state_message)
Expand Down
8 changes: 3 additions & 5 deletions browser_use/agent/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,12 +160,10 @@ def __init__(
state: BrowserState,
result: Optional[List[ActionResult]] = None,
include_attributes: list[str] = [],
max_error_length: int = 400,
step_info: Optional[AgentStepInfo] = None,
):
self.state = state
self.result = result
self.max_error_length = max_error_length
self.include_attributes = include_attributes
self.step_info = step_info

Expand Down Expand Up @@ -205,7 +203,7 @@ def get_user_message(self, use_vision: bool = True) -> HumanMessage:
Current url: {self.state.url}
Available tabs:
{self.state.tabs}
Interactive elements from current page:
Interactive elements from top layer of the current page inside the viewport:
{elements_text}
{step_info_description}
"""
Expand All @@ -215,8 +213,8 @@ def get_user_message(self, use_vision: bool = True) -> HumanMessage:
if result.extracted_content:
state_description += f'\nAction result {i + 1}/{len(self.result)}: {result.extracted_content}'
if result.error:
# only use last 300 characters of error
error = result.error[-self.max_error_length :]
# only use last line of error
error = result.error.split('\n')[-1]
state_description += f'\nAction error {i + 1}/{len(self.result)}: ...{error}'

if self.state.screenshot and use_vision == True:
Expand Down
17 changes: 7 additions & 10 deletions browser_use/agent/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,14 +87,13 @@ def __init__(
'type',
'name',
'role',
'tabindex',
'aria-label',
'placeholder',
'value',
'alt',
'aria-expanded',
'data-date-format',
],
max_error_length: int = 400,
max_actions_per_step: int = 10,
tool_call_in_content: bool = True,
initial_actions: Optional[List[Dict[str, Dict[str, Any]]]] = None,
Expand Down Expand Up @@ -123,7 +122,6 @@ def __init__(
self.save_conversation_path_encoding = save_conversation_path_encoding
self._last_result = None
self.include_attributes = include_attributes
self.max_error_length = max_error_length
self.generate_gif = generate_gif

# Initialize planner
Expand Down Expand Up @@ -173,7 +171,6 @@ def __init__(
system_prompt_class=self.system_prompt_class,
max_input_tokens=self.max_input_tokens,
include_attributes=self.include_attributes,
max_error_length=self.max_error_length,
max_actions_per_step=self.max_actions_per_step,
message_context=self.message_context,
sensitive_data=self.sensitive_data,
Expand Down Expand Up @@ -221,15 +218,15 @@ def _set_version_and_source(self) -> None:

def _set_model_names(self) -> None:
self.chat_model_library = self.llm.__class__.__name__
self.model_name = "Unknown"
self.model_name = 'Unknown'
# Check for 'model_name' attribute first
if hasattr(self.llm, "model_name"):
if hasattr(self.llm, 'model_name'):
model = self.llm.model_name
self.model_name = model if model is not None else "Unknown"
self.model_name = model if model is not None else 'Unknown'
# Fallback to 'model' attribute if needed
elif hasattr(self.llm, "model"):
elif hasattr(self.llm, 'model'):
model = self.llm.model
self.model_name = model if model is not None else "Unknown"
self.model_name = model if model is not None else 'Unknown'

if self.planner_llm:
if hasattr(self.planner_llm, 'model_name'):
Expand Down Expand Up @@ -635,7 +632,6 @@ async def _validate_output(self) -> bool:
state=state,
result=self._last_result,
include_attributes=self.include_attributes,
max_error_length=self.max_error_length,
)
msg = [SystemMessage(content=system_msg), content.get_user_message(self.use_vision)]
else:
Expand All @@ -646,6 +642,7 @@ class ValidationResult(BaseModel):
"""
Validation results.
"""

is_valid: bool
reason: str

Expand Down
2 changes: 1 addition & 1 deletion browser_use/browser/browser.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class BrowserConfig:
headless: True
Whether to run browser in headless mode

disable_security: False
disable_security: True
Disable browser security features

extra_chromium_args: []
Expand Down
33 changes: 15 additions & 18 deletions browser_use/browser/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ class BrowserContextConfig:
cookies_file: None
Path to cookies file for persistence

disable_security: False
disable_security: True
Disable browser security features

minimum_wait_page_load_time: 0.5
Expand Down Expand Up @@ -115,7 +115,7 @@ class BrowserContextConfig:
maximum_wait_page_load_time: float = 5
wait_between_actions: float = 1

disable_security: bool = False
disable_security: bool = True

browser_window_size: BrowserContextWindowSize = field(default_factory=lambda: {'width': 1280, 'height': 1100})
no_viewport: Optional[bool] = None
Expand Down Expand Up @@ -939,33 +939,30 @@ async def _input_text_element_node(self, element_node: DOMElementNode, text: str
"""
try:
# Highlight before typing
if element_node.highlight_index is not None:
await self._update_state(focus_element=element_node.highlight_index)
# if element_node.highlight_index is not None:
# await self._update_state(focus_element=element_node.highlight_index)

page = await self.get_current_page()
element_handle = await self.get_locate_element(element_node)

if element_handle is None:
raise BrowserError(f'Element: {repr(element_node)} not found')

# Ensure element is ready for input
await element_handle.wait_for_element_state('stable', timeout=2000)
await element_handle.scroll_into_view_if_needed(timeout=2100)
try:
await element_handle.wait_for_element_state('stable', timeout=1000)
await element_handle.scroll_into_view_if_needed(timeout=1000)
except Exception:
pass

# Get element properties to determine input method
is_contenteditable = await element_handle.get_property('isContentEditable')

# Different handling for contenteditable vs input fields
try:
if await is_contenteditable.json_value():
await element_handle.evaluate('el => el.textContent = ""')
await element_handle.type(text, delay=5)
else:
await element_handle.fill(text)
except Exception:
logger.debug('Could not type text into element. Trying to click and type.')
await element_handle.click()
if await is_contenteditable.json_value():
await element_handle.evaluate('el => el.textContent = ""')
await element_handle.type(text, delay=5)
else:
await element_handle.fill(text)

except Exception as e:
logger.debug(f'Failed to input text into element: {repr(element_node)}. Error: {str(e)}')
Expand All @@ -979,8 +976,8 @@ async def _click_element_node(self, element_node: DOMElementNode) -> Optional[st

try:
# Highlight before clicking
if element_node.highlight_index is not None:
await self._update_state(focus_element=element_node.highlight_index)
# if element_node.highlight_index is not None:
# await self._update_state(focus_element=element_node.highlight_index)

element_handle = await self.get_locate_element(element_node)

Expand Down
16 changes: 14 additions & 2 deletions browser_use/controller/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,13 +221,25 @@ async def scroll_up(params: ScrollAction, browser: BrowserContext):

# send keys
@self.registry.action(
'Send strings of special keys like Backspace, Insert, PageDown, Delete, Enter, Shortcuts such as `Control+o`, `Control+Shift+T` are supported as well. This gets used in keyboard.press. Be aware of different operating systems and their shortcuts',
'Send strings of special keys like Escape,Backspace, Insert, PageDown, Delete, Enter, Shortcuts such as `Control+o`, `Control+Shift+T` are supported as well. This gets used in keyboard.press. ',
param_model=SendKeysAction,
)
async def send_keys(params: SendKeysAction, browser: BrowserContext):
page = await browser.get_current_page()

await page.keyboard.press(params.keys)
try:
await page.keyboard.press(params.keys)
except Exception as e:
if 'Unknown key' in str(e):
# loop over the keys and try to send each one
for key in params.keys:
try:
await page.keyboard.press(key)
except Exception as e:
logger.debug(f'Error sending key {key}: {str(e)}')
raise e
else:
raise e
msg = f'⌨️ Sent keys: {params.keys}'
logger.info(msg)
return ActionResult(extracted_content=msg, include_in_memory=True)
Expand Down
Loading