Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/file upload dict #565

Merged
merged 2 commits into from
Feb 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -181,3 +181,6 @@ gcp-login.json
.vscode
.ruff_cache
.idea
*.txt
*.pdf
*.csv
5 changes: 5 additions & 0 deletions browser_use/agent/message_manager/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,11 @@ def task_instructions(task: str) -> HumanMessage:
content = f'Your ultimate task is: """{task}""". If you achieved your ultimate task, stop everything and use the done action in the next step to complete the task. If not, continue as usual.'
return HumanMessage(content=content)

def add_file_paths(self, file_paths: list[str]) -> None:
    """Announce the usable file paths to the model as a human message.

    Args:
        file_paths: Paths to advertise; the list is interpolated into the
            message text as-is (its ``repr``).
    """
    # Build the message inline and hand it to the token-counting add helper.
    self._add_message_with_tokens(
        HumanMessage(content=f'Here are file paths you can use: {file_paths}')
    )

def add_new_task(self, new_task: str) -> None:
content = f'Your new ultimate task is: """{new_task}""". Take the previous context into account and finish your new ultimate task. '
msg = HumanMessage(content=content)
Expand Down
9 changes: 7 additions & 2 deletions browser_use/agent/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ def __init__(
message_context: Optional[str] = None,
generate_gif: bool | str = True,
sensitive_data: Optional[Dict[str, str]] = None,
available_file_paths: Optional[list[str]] = None,
include_attributes: list[str] = [
'title',
'type',
Expand Down Expand Up @@ -111,7 +112,7 @@ def __init__(
self.page_extraction_llm = llm
else:
self.page_extraction_llm = page_extraction_llm

self.available_file_paths = available_file_paths
self.task = task
self.use_vision = use_vision
self.use_vision_for_planner = use_vision_for_planner
Expand Down Expand Up @@ -175,7 +176,8 @@ def __init__(
message_context=self.message_context,
sensitive_data=self.sensitive_data,
)

if self.available_file_paths:
self.message_manager.add_file_paths(self.available_file_paths)
# Step callback
self.register_new_step_callback = register_new_step_callback
self.register_done_callback = register_done_callback
Expand Down Expand Up @@ -311,6 +313,7 @@ async def step(self, step_info: Optional[AgentStepInfo] = None) -> None:
page_extraction_llm=self.page_extraction_llm,
sensitive_data=self.sensitive_data,
check_break_if_paused=lambda: self._check_if_stopped_or_paused(),
available_file_paths=self.available_file_paths,
)
self._last_result = result

Expand Down Expand Up @@ -539,6 +542,7 @@ async def run(self, max_steps: int = 100) -> AgentHistoryList:
check_for_new_elements=False,
page_extraction_llm=self.page_extraction_llm,
check_break_if_paused=lambda: self._check_if_stopped_or_paused(),
available_file_paths=self.available_file_paths,
)
self._last_result = result

Expand Down Expand Up @@ -677,6 +681,7 @@ async def rerun_history(
check_for_new_elements=False,
page_extraction_llm=self.page_extraction_llm,
check_break_if_paused=lambda: self._check_if_stopped_or_paused(),
available_file_paths=self.available_file_paths,
)

results = []
Expand Down
2 changes: 1 addition & 1 deletion browser_use/browser/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -1049,7 +1049,7 @@ async def get_element_by_index(self, index: int) -> ElementHandle | None:
element_handle = await self.get_locate_element(selector_map[index])
return element_handle

async def get_dom_element_by_index(self, index: int) -> DOMElementNode | None:
async def get_dom_element_by_index(self, index: int) -> DOMElementNode:
    """Return the DOM node registered under ``index`` in the selector map.

    The lookup is a plain subscript, so a missing index propagates whatever
    error the selector map raises (presumably ``KeyError`` -- TODO confirm
    the map is a dict) instead of returning ``None``.
    """
    return (await self.get_selector_map())[index]

Expand Down
41 changes: 15 additions & 26 deletions browser_use/controller/registry/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def _create_param_model(self, function: Callable) -> Type[BaseModel]:
params = {
name: (param.annotation, ... if param.default == param.empty else param.default)
for name, param in sig.parameters.items()
if name != 'browser' and name != 'page_extraction_llm'
if name != 'browser' and name != 'page_extraction_llm' and name != 'available_file_paths'
}
# TODO: make the types here work
return create_model(
Expand Down Expand Up @@ -88,6 +88,7 @@ async def execute_action(
browser: Optional[BrowserContext] = None,
page_extraction_llm: Optional[BaseChatModel] = None,
sensitive_data: Optional[Dict[str, str]] = None,
available_file_paths: Optional[list[str]] = None,
) -> Any:
"""Execute a registered action"""
if action_name not in self.registry.actions:
Expand All @@ -107,35 +108,23 @@ async def execute_action(
if sensitive_data:
validated_params = self._replace_sensitive_data(validated_params, sensitive_data)

if 'browser' in parameter_names and not browser:
raise ValueError(f'Action {action_name} requires browser but none provided.')
if 'page_extraction_llm' in parameter_names and not page_extraction_llm:
raise ValueError(f'Action {action_name} requires page_extraction_llm but none provided.')
if 'available_file_paths' in parameter_names and not available_file_paths:
raise ValueError(f'Action {action_name} requires available_file_paths but none provided.')
# Prepare arguments based on parameter type
if 'browser' in parameter_names and 'page_extraction_llm' in parameter_names:
if not browser:
raise ValueError(f'Action {action_name} requires browser but none provided.')
if not page_extraction_llm:
raise ValueError(f'Action {action_name} requires page_extraction_llm but none provided.')
if is_pydantic:
return await action.function(validated_params, browser=browser, page_extraction_llm=page_extraction_llm)
return await action.function(
**validated_params.model_dump(), browser=browser, page_extraction_llm=page_extraction_llm
)

extra_args = {}
if 'browser' in parameter_names:
if not browser:
raise ValueError(f'Action {action_name} requires browser but none provided.')
if is_pydantic:
return await action.function(validated_params, browser=browser)
return await action.function(**validated_params.model_dump(), browser=browser)

extra_args['browser'] = browser
if 'page_extraction_llm' in parameter_names:
if not page_extraction_llm:
raise ValueError(f'Action {action_name} requires page_extraction_llm but none provided.')
if is_pydantic:
return await action.function(validated_params, page_extraction_llm=page_extraction_llm)
return await action.function(**validated_params.model_dump(), page_extraction_llm=page_extraction_llm)

extra_args['page_extraction_llm'] = page_extraction_llm
if 'available_file_paths' in parameter_names:
extra_args['available_file_paths'] = available_file_paths
if is_pydantic:
return await action.function(validated_params)
return await action.function(**validated_params.model_dump())
return await action.function(validated_params, **extra_args)
return await action.function(**validated_params.model_dump(), **extra_args)

except Exception as e:
raise RuntimeError(f'Error executing action {action_name}: {str(e)}') from e
Expand Down
5 changes: 4 additions & 1 deletion browser_use/controller/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,7 @@ async def multi_act(
check_for_new_elements: bool = True,
page_extraction_llm: Optional[BaseChatModel] = None,
sensitive_data: Optional[Dict[str, str]] = None,
available_file_paths: Optional[list[str]] = None,
) -> list[ActionResult]:
"""Execute multiple actions"""
results = []
Expand Down Expand Up @@ -477,7 +478,7 @@ async def multi_act(

check_break_if_paused()

results.append(await self.act(action, browser_context, page_extraction_llm, sensitive_data))
results.append(await self.act(action, browser_context, page_extraction_llm, sensitive_data, available_file_paths))

logger.debug(f'Executed action {i + 1} / {len(actions)}')
if results[-1].is_done or results[-1].error or i == len(actions) - 1:
Expand All @@ -495,6 +496,7 @@ async def act(
browser_context: BrowserContext,
page_extraction_llm: Optional[BaseChatModel] = None,
sensitive_data: Optional[Dict[str, str]] = None,
available_file_paths: Optional[list[str]] = None,
) -> ActionResult:
"""Execute an action"""

Expand All @@ -515,6 +517,7 @@ async def act(
browser=browser_context,
page_extraction_llm=page_extraction_llm,
sensitive_data=sensitive_data,
available_file_paths=available_file_paths,
)

Laminar.set_span_output(result)
Expand Down
62 changes: 41 additions & 21 deletions examples/custom-functions/file_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,14 @@

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import asyncio
import logging

from langchain_openai import ChatOpenAI

from browser_use import Agent, Controller
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import BrowserContext

CV = Path.cwd() / 'examples/test_cv.txt'
import logging

logger = logging.getLogger(__name__)

# Initialize controller first
Expand All @@ -29,52 +27,74 @@


@controller.action(
'Upload file to element ',
'Upload file to interactive element with file path ',
)
async def upload_file(index: int, browser: BrowserContext):
path = str(CV.absolute())
dom_el = await browser.get_dom_element_by_index(index)
async def upload_file(index: int, path: str, browser: BrowserContext, available_file_paths: list[str]):
if path not in available_file_paths:
return ActionResult(error=f'File path {path} is not available')

if dom_el is None:
return ActionResult(error=f'No element found at index {index}')
if not os.path.exists(path):
return ActionResult(error=f'File {path} does not exist')

dom_el = await browser.get_dom_element_by_index(index)

file_upload_dom_el = dom_el.get_file_upload_element()

if file_upload_dom_el is None:
logger.info(f'No file upload element found at index {index}')
return ActionResult(error=f'No file upload element found at index {index}')
msg = f'No file upload element found at index {index}'
logger.info(msg)
return ActionResult(error=msg)

file_upload_el = await browser.get_locate_element(file_upload_dom_el)

if file_upload_el is None:
logger.info(f'No file upload element found at index {index}')
return ActionResult(error=f'No file upload element found at index {index}')
msg = f'No file upload element found at index {index}'
logger.info(msg)
return ActionResult(error=msg)

try:
await file_upload_el.set_input_files(path)
msg = f'Successfully uploaded file to index {index}'
logger.info(msg)
return ActionResult(extracted_content=msg)
return ActionResult(extracted_content=msg, include_in_memory=True)
except Exception as e:
logger.debug(f'Error in set_input_files: {str(e)}')
return ActionResult(error=f'Failed to upload file to index {index}')
msg = f'Failed to upload file to index {index}: {str(e)}'
logger.info(msg)
return ActionResult(error=msg)


@controller.action('Read the file content of a file given a path')
async def read_file(path: str, available_file_paths: list[str]):
if path not in available_file_paths:
return ActionResult(error=f'File path {path} is not available')

@controller.action('Close file dialog')
async def close_file_dialog(browser: BrowserContext):
page = await browser.get_current_page()
await page.keyboard.press('Escape')
with open(path, 'r') as f:
content = f.read()
msg = f'File content: {content}'
logger.info(msg)
return ActionResult(extracted_content=msg, include_in_memory=True)


def create_file(file_type: str = 'txt'):
    """Write a small placeholder file and return its path as a string.

    The file is named ``tmp.<file_type>``, is created relative to the current
    working directory, and contains the literal text ``test``.

    Args:
        file_type: Extension (without the dot) used for the file name.

    Returns:
        The path of the created file, built from ``Path.cwd()``, as ``str``.
    """
    file_name = f'tmp.{file_type}'
    with open(file_name, 'w') as handle:
        handle.write('test')

    # Report and return the location anchored at the working directory.
    created_path = Path.cwd() / file_name
    logger.info(f'Created file: {created_path}')
    return str(created_path)


async def main():
task = f'go to https://kzmpmkh2zfk1ojnpxfn1.lite.vusercontent.net/ and upload to each upload field my file'
task = f'Go to https://kzmpmkh2zfk1ojnpxfn1.lite.vusercontent.net/ and - read the file content and upload them to fields'

available_file_paths = [create_file('txt'), create_file('pdf'), create_file('csv')]

model = ChatOpenAI(model='gpt-4o')
agent = Agent(
task=task,
llm=model,
controller=controller,
browser=browser,
available_file_paths=available_file_paths,
)

await agent.run()
Expand Down