browser-use/eval/service.py at main · bytes-code/browser-use

History

1955 lines (1722 loc) · 72.4 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

446

447

448

449

450

451

452

453

454

455

456

457

458

459

460

461

462

463

464

465

466

467

468

469

470

471

472

473

474

475

476

477

478

479

480

481

482

483

484

485

486

487

488

489

490

491

492

493

494

495

496

497

498

499

500

501

502

503

504

505

506

507

508

509

510

511

512

513

514

515

516

517

518

519

520

521

522

523

524

525

526

527

528

529

530

531

532

533

534

535

536

537

538

539

540

541

542

543

544

545

546

547

548

549

550

551

552

553

554

555

556

557

558

559

560

561

562

563

564

565

566

567

568

569

570

571

572

573

574

575

576

577

578

579

580

581

582

583

584

585

586

587

588

589

590

591

592

593

594

595

596

597

598

599

600

601

602

603

604

605

606

607

608

609

610

611

612

613

614

615

616

617

618

619

620

621

622

623

624

625

626

627

628

629

630

631

632

633

634

635

636

637

638

639

640

641

642

643

644

645

646

647

648

649

650

651

652

653

654

655

656

657

658

659

660

661

662

663

664

665

666

667

668

669

670

671

672

673

674

675

676

677

678

679

680

681

682

683

684

685

686

687

688

689

690

691

692

693

694

695

696

697

698

699

700

701

702

703

704

705

706

707

708

709

710

711

712

713

714

715

716

717

718

719

720

721

722

723

724

725

726

727

728

729

730

731

732

733

734

735

736

737

738

739

740

741

742

743

744

745

746

747

748

749

750

751

752

753

754

755

756

757

758

759

760

761

762

763

764

765

766

767

768

769

770

771

772

773

774

775

776

777

778

779

780

781

782

783

784

785

786

787

788

789

790

791

792

793

794

795

796

797

798

799

800

801

802

803

804

805

806

807

808

809

810

811

812

813

814

815

816

817

818

819

820

821

822

823

824

825

826

827

828

829

830

831

832

833

834

835

836

837

838

839

840

841

842

843

844

845

846

847

848

849

850

851

852

853

854

855

856

857

858

859

860

861

862

863

864

865

866

867

868

869

870

871

872

873

874

875

876

877

878

879

880

881

882

883

884

885

886

887

888

889

890

891

892

893

894

895

896

897

898

899

900

901

902

903

904

905

906

907

908

909

910

911

912

913

914

915

916

917

918

919

920

921

922

923

924

925

926

927

928

929

930

931

932

933

934

935

936

937

938

939

940

941

942

943

944

945

946

947

948

949

950

951

952

953

954

955

956

957

958

959

960

961

962

963

964

965

966

967

968

969

970

971

972

973

974

975

976

977

978

979

980

981

982

983

984

985

986

987

988

989

990

991

992

993

994

995

996

997

998

999

1000

# ================================================

# Imports

# ================================================

import argparse

import asyncio

import base64

import http.client

import json

import logging

import os

import time

from pathlib import Path

from uuid import UUID

import anyio

from dotenv import load_dotenv

from lmnr import AsyncLaminarClient, Instruments, Laminar

from pydantic import BaseModel

from browser_use import ActionResult, Agent, BrowserSession, Controller

from browser_use.agent.views import AgentHistoryList

from browser_use.llm.base import BaseChatModel

from browser_use.observability import observe, observe_debug

MAX_IMAGE = 5

from eval.browsers import (

ANCHOR_BROWSER_API_KEY,

BRIGHTDATA_CDP_URL,

BROWSERBASE_API_KEY,

BROWSERBASE_PROJECT_ID,

HYPERBROWSER_API_KEY,

setup_browser_session,

)

from eval.comprehensive_judge import evaluate_task_with_comprehensive_judge

from eval.cookie_judge import check_login_cookie_at_step, evaluate_task_with_login_cookie, save_login_cookie_tracking

from eval.models import SUPPORTED_MODELS, get_llm

from eval.resource_monitoring import (

get_system_resources,

log_system_resources,

setup_signal_handlers,

start_resource_monitoring,

stop_resource_monitoring,

)

from eval.server import (

fetch_auth_distribution_from_server,

fetch_tasks_from_server,

format_auth_info_for_agent,

save_task_result_to_server,

send_progress_update,

start_new_run,

)

from eval.task_types import Stage, StageError, Task, TaskResult

from eval.utils import get_git_info

from eval.web_judge import Online_Mind2Web_eval_with_retry

# ================================================

# Setup Logging

# ================================================

# Configure root logging once for the whole evaluation process; every record
# carries a timestamp, level and logger name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(name)s: %(message)s')
logger = logging.getLogger(__name__)

# ================================================
# Environment variables
# ================================================

# Load variables from a .env file (via python-dotenv) before reading them below.
load_dotenv()

# The Serper API key is optional: without it the module still loads, but the
# `search_web` action returns a "Search unavailable" result instead of querying.
SERPER_API_KEY = os.getenv('SERPER_API_KEY')
if not SERPER_API_KEY:
    logger.warning('SERPER_API_KEY is not set. Search functionality will not be available.')

# ================================================
# Tracking and Observations
# ================================================

# Initialise Laminar with the BROWSER_USE instrument disabled and batching turned off.
Laminar.initialize(disabled_instruments={Instruments.BROWSER_USE}, disable_batch=True)
# Shared async client; used later in this file to create evaluation datapoints.
laminar_client = AsyncLaminarClient()

# Resource monitoring functions moved to resource_monitoring.py module

# ================================================

# Custom Controllers

# ================================================

def create_controller_with_serp_search(output_model: type[BaseModel] | None = None):
    """Create a controller with SERP search instead of Google search.

    The built-in ``search_google`` action is excluded and a ``search_web``
    action backed by the Serper API (``google.serper.dev``) is registered.

    Args:
        output_model: Optional structured-output model forwarded to ``Controller``.

    Returns:
        The configured ``Controller`` instance.
    """
    controller = Controller(exclude_actions=['search_google'], output_model=output_model)

    def _fetch_serp_results(query: str) -> dict:
        """POST ``query`` to Serper and return the decoded JSON body (blocking call)."""
        # The timeout keeps a stalled request from hanging an agent step indefinitely.
        conn = http.client.HTTPSConnection('google.serper.dev', timeout=30)
        try:
            payload = json.dumps({'q': query})
            headers = {'X-API-KEY': SERPER_API_KEY, 'Content-Type': 'application/json'}
            conn.request('POST', '/search', payload, headers)
            res = conn.getresponse()
            status = res.status
            data = res.read()
        finally:
            # Always release the socket, including when the request or read fails.
            conn.close()

        if status >= 400:
            # Surface HTTP failures as errors rather than handing an error body to the agent.
            raise RuntimeError(f'Serper API returned HTTP {status}')
        return json.loads(data.decode('utf-8'))

    @controller.registry.action('Search the web for a specific query')
    async def search_web(query: str):
        """Search the web using Serper API"""
        if not SERPER_API_KEY:
            return ActionResult(extracted_content='Search unavailable: SERPER_API_KEY not configured', include_in_memory=True)

        try:
            # http.client is synchronous; run it in a worker thread so the event loop
            # (and any concurrently running tasks) is not blocked during the request.
            serp_data = await asyncio.to_thread(_fetch_serp_results, query)

            # Exclude searchParameters and credits to reduce noise
            serp_data = {k: v for k, v in serp_data.items() if k not in ['searchParameters', 'credits']}

            # Log the search data for debugging
            logger.debug(f"SERP search for '{query}': {json.dumps(serp_data, indent=2)}")

            # Convert to string for the agent
            serp_data_str = json.dumps(serp_data)

            return ActionResult(
                extracted_content=serp_data_str, include_in_memory=False, include_extracted_content_only_once=True
            )
        except Exception as e:
            # Best-effort action: report the failure to the agent instead of raising.
            logger.error(f'Error in SERP search: {type(e).__name__}: {e}')
            return ActionResult(error=f'Search error: {str(e)}')

    return controller

def create_controller(
    use_serp: bool = False,
    output_model: type[BaseModel] | None = None,
    gmail_tokens_dict: dict[str, str] | None = None,
    task: 'Task | None' = None,
):
    """Create a controller, optionally with SERP search and Gmail 2FA support.

    Args:
        use_serp: When true, build the controller through
            ``create_controller_with_serp_search``; otherwise use a plain ``Controller``.
        output_model: Optional structured-output model forwarded to the controller.
        gmail_tokens_dict: Access tokens looked up by the part of the task's
            email address before the ``@``.
        task: The task being run. Gmail actions are only registered when its
            ``login_type`` is ``'OTP'``.

    Returns:
        The controller, with Gmail actions registered when a matching token was found.
    """
    import re

    if use_serp:
        controller = create_controller_with_serp_search(output_model=output_model)
    else:
        controller = Controller(output_model=output_model)

    # Gmail 2FA support needs both a tokens dict and a task; explain why it is skipped otherwise.
    if not (gmail_tokens_dict and task):
        logger.info('No Gmail 2FA tokens provided or no task, running without Gmail integration')
        return controller

    if getattr(task, 'login_type', None) != 'OTP':
        logger.info(f'Task login_type is "{getattr(task, "login_type", "None")}", not OTP - skipping Gmail integration')
        return controller

    try:
        # Prefer an explicit username on the task; otherwise look for an email in the task text.
        username = getattr(task, 'username', None) or None
        if not username and hasattr(task, 'confirmed_task') and '@' in task.confirmed_task:
            # The TLD class is [A-Za-z]; a '|' inside a character class would be matched literally.
            email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
            match = re.search(email_pattern, task.confirmed_task)
            if match:
                username = match.group(0)

        if not username:
            logger.info('No email found in OTP task, running without Gmail integration')
        else:
            # Extract user ID (part before @) and look up its access token
            user_id = username.split('@')[0]
            access_token = gmail_tokens_dict.get(user_id)

            if access_token:
                from browser_use.integrations.gmail import register_gmail_actions

                # Register Gmail actions using the access token
                controller = register_gmail_actions(controller, access_token=access_token)
                logger.info(f'Gmail 2FA integration registered successfully for user {user_id} (OTP task)')
            else:
                logger.info(f'No Gmail 2FA token found for user {user_id}, running without Gmail integration')
    except Exception as e:
        # Gmail support is optional: log and continue with the controller built so far.
        logger.error(f'Failed to setup Gmail integration: {e}')

    return controller

# ================================================

# Formatting results

# ================================================

def clean_action_dict(action_dict: dict) -> dict:
    """Return a copy of ``action_dict`` with every ``None``-valued key removed.

    Nested dictionaries are cleaned recursively; values of any other type
    (including lists) are kept as they are.
    """
    cleaned = {}
    for key, value in action_dict.items():
        if value is None:
            continue
        if isinstance(value, dict):
            value = clean_action_dict(value)
        cleaned[key] = value
    return cleaned

async def reformat_agent_history(
    agent_history: AgentHistoryList,
    task_id: str,
    run_id: str,
    task: str,
    last_message: str,
    base_path: str = 'saved_trajectories',
    include_result: bool = False,
    agent_execution_time: float | None = None,
) -> dict:
    """Write an agent run to ``<base_path>/<task_id>`` and return its summary dict.

    Screenshots are decoded from base64 into
    ``trajectory_with_highlights/step_<n>.png`` and the summary is written to
    ``result.json`` in the task directory.

    Args:
        agent_history: History of the agent run to reformat.
        task_id: Task identifier; also the name of the output directory.
        run_id: Identifier of the evaluation run, copied into the result.
        task: Task description, copied into the result.
        last_message: Last message sent to the model, copied into the result.
        base_path: Directory under which the task directory is created.
        include_result: When true, a non-blank final result is appended to
            ``action_history``.
        agent_execution_time: Wall-clock duration of the run; when ``None`` the
            duration is derived from the first and last step metadata.

    Returns:
        The results dictionary that was written to ``result.json``.
    """
    # Update directory name
    task_dir = Path(base_path) / task_id
    trajectory_with_highlights_dir = task_dir / 'trajectory_with_highlights'

    # Create directories
    task_dir.mkdir(parents=True, exist_ok=True)
    trajectory_with_highlights_dir.mkdir(parents=True, exist_ok=True)

    # Collect screenshot paths and action history
    screenshot_paths = []
    action_history = []
    final_result = None
    self_report_completed = False
    self_report_success = None
    complete_history = []
    total_tokens_used = 0  # Initialize token counter

    # Process history items
    for step_num, history_item in enumerate(agent_history.history):
        # Save screenshot
        if history_item.state and history_item.state.screenshot:
            screenshot_path = trajectory_with_highlights_dir / f'step_{step_num}.png'
            screenshot_paths.append(str(screenshot_path))
            # Save the actual screenshot
            screenshot_data = base64.b64decode(history_item.state.screenshot)
            async with await anyio.open_file(screenshot_path, 'wb') as f:
                await f.write(screenshot_data)

        # Get action result content
        if history_item.result:
            for result in history_item.result:
                # We don't want to include the final result in the action history as per the evaluation criteria
                if result.extracted_content and result.extracted_content != 'None' and not result.is_done:
                    action_history.append(result.extracted_content)
                # Check if this is the final result
                if result.is_done:
                    final_result = result.extracted_content
                    self_report_completed = True
                    self_report_success = result.success

        # Build complete history entry with cleaned model output
        model_output = None
        if history_item.model_output:
            model_output = history_item.model_output.model_dump()
            if 'action' in model_output:
                # Clean each action in the action list
                model_output['action'] = [clean_action_dict(action) for action in model_output['action']]

        step_metadata = history_item.metadata.model_dump() if history_item.metadata else {}
        step_info = {
            'step_number': step_num,
            'model_output': model_output,
            'result': [r.model_dump() for r in history_item.result] if history_item.result else None,
            'state': {
                'url': history_item.state.url if history_item.state else None,
                'title': history_item.state.title if history_item.state else None,
            },
            # Top-level key: the duration calculation below reads step_info['metadata'].
            'metadata': step_metadata,  # Use dumped metadata
        }
        complete_history.append(step_info)

        # Sum up tokens from metadata
        if step_metadata and 'input_tokens' in step_metadata:
            try:
                total_tokens_used += int(step_metadata['input_tokens'])
            except (ValueError, TypeError):
                logger.warning(
                    f"Task {task_id}, Step {step_num}: Could not parse input_tokens '{step_metadata['input_tokens']}' as integer."
                )

    # Calculate task duration from metadata (step-based timing)
    step_based_duration = None
    if complete_history:
        first_step = complete_history[0].get('metadata', {})
        last_step = complete_history[-1].get('metadata', {})
        if first_step and last_step:
            start_time = first_step.get('step_start_time')
            end_time = last_step.get('step_end_time')
            if start_time and end_time:
                # Ensure timestamps are floats before subtracting
                try:
                    step_based_duration = float(end_time) - float(start_time)
                except (ValueError, TypeError) as e:
                    logger.warning(f'Could not calculate step-based duration due to invalid timestamp format: {e}')

    # Use agent execution time if provided (wall-clock timing around run_agent), otherwise fall back to step-based
    task_duration = agent_execution_time if agent_execution_time is not None else step_based_duration

    # Conditionally include the final result in action history
    if include_result and final_result and final_result.strip():
        action_history = action_history + [final_result]

    # Extract usage data from agent history. getattr keeps the diagnostic logging
    # from raising when the history object has no `usage` attribute at all.
    usage_data = None
    usage = getattr(agent_history, 'usage', None)
    logger.info(f'Agent history usage object: {usage}')
    logger.info(f'Agent history usage type: {type(usage)}')
    if usage:
        usage_data = usage.model_dump()
        logger.info(f'Agent history usage model_dump: {usage_data}')
    else:
        logger.warning('Agent history has no usage data or usage is empty/None')

    # Create results structure with new fields
    results = {
        'task_id': task_id,
        'run_id': run_id,
        'task': task,
        'action_history': action_history,
        'screenshot_paths': screenshot_paths,
        'final_result_response': final_result,
        'last_message': last_message,
        'self_report_completed': self_report_completed,
        'self_report_success': self_report_success,
        'complete_history': complete_history,
        'task_duration': task_duration,
        'steps': len(complete_history),
        'tokensUsed': total_tokens_used,  # Add total tokens used
        'usage': usage_data,  # Add usage data
    }

    # Save results file
    results_path = task_dir / 'result.json'
    async with await anyio.open_file(results_path, 'w') as f:
        # default=str stringifies values json cannot encode natively (e.g. Path)
        await f.write(json.dumps(results, indent=2, default=str))

    return results

# ================================================

# Judge task result

# ================================================

def _failed_judgement(task_id: str, judgement: str, error: str) -> dict:
    """Build the result dictionary used for every failed or unavailable judgement."""
    return {
        'task_id': task_id,
        'judgement': judgement,
        'success': False,
        'error': error,
        'score': 0.0,
    }


def _comprehensive_judgement(task_id: str, evaluation: dict) -> dict:
    """Convert a comprehensive-judge evaluation into the common judgement dictionary."""
    return {
        'task_id': task_id,
        'judgement': evaluation.get('reasoning', 'Comprehensive evaluation completed'),
        'success': evaluation.get('passed', False),
        'error': None,
        'score': evaluation.get('final_score', 0) / 100.0,  # Convert to 0-1 scale
        'comprehensive_evaluation': evaluation,
    }


@observe_debug()
async def judge_task_result(
    model, task_folder: Path, score_threshold: float = 3, use_mind2web: bool = False, judge_repeat_count: int = 1
) -> dict:
    """
    Judge a single task result using the comprehensive judge system by default,
    with optional fallback to the original Online_Mind2Web evaluation.

    Evaluations already stored in ``result.json`` are returned without
    re-running the judge. Failures never raise: they are returned as a
    dictionary with ``success=False`` and an ``error`` message.

    Args:
        model: The model to use for evaluation
        task_folder: Path to the task result folder
        score_threshold: Score threshold for image filtering (used only for Mind2Web)
        use_mind2web: If True, use the original Online_Mind2Web evaluation instead
        judge_repeat_count: Number of times to repeat the judge evaluation (averages over multiple judgments)

    Returns:
        Dictionary containing judgment results
    """
    task_id = task_folder.name
    result_file = task_folder / 'result.json'
    if not result_file.exists():
        return _failed_judgement(task_id, 'No result.json found', 'No result.json found')

    try:
        async with await anyio.open_file(result_file) as f:
            result = json.loads(await f.read())

        # Check if we should use the original Mind2Web evaluation
        if use_mind2web:
            logger.info(f'Task {task_folder.name}: Using original Online_Mind2Web evaluation')

            # If a Online_Mind2Web_evaluation is already saved, we can skip the eval
            cached_evaluation = result.get('Online_Mind2Web_evaluation')
            if cached_evaluation:
                return cached_evaluation

            # Get the screenshot paths, task description, and action history
            screenshot_paths = result.get('screenshot_paths', [])
            task_description = result.get('task')
            action_history = result.get('action_history', [])

            # Use the retry wrapper for evaluation
            try:
                eval_result = await Online_Mind2Web_eval_with_retry(
                    task_description, action_history, screenshot_paths, model, score_threshold
                )

                if eval_result is None:
                    raise Exception('Evaluation failed after all retries')

                # Only the messages are needed here; the unpacking still checks the tuple shape.
                messages, _text, _system_msg, _record, _key_points = eval_result

                # Final steps to get judgement - use async invoke directly
                judgement_response = await model.ainvoke(messages)
                judgement = judgement_response.completion

                status_sections = judgement.lower().split('status:')
                if len(status_sections) < 2:
                    # Explicit error instead of an opaque IndexError from indexing the split.
                    raise ValueError("Judge response does not contain a 'Status:' section")

                # This is the official criteria for success / failure
                passed = 'success' in status_sections[1]
                evaluation = {
                    'task_id': task_id,
                    'judgement': judgement,
                    'success': passed,
                    'error': None,
                    'score': 1.0 if passed else 0.0,
                }

                # Save the Online_Mind2Web_evaluation into the result.json file
                result['Online_Mind2Web_evaluation'] = evaluation
                async with await anyio.open_file(result_file, 'w') as f:
                    await f.write(json.dumps(result, indent=2))

                return evaluation

            except Exception as err:
                return _failed_judgement(
                    task_id,
                    f'Mind2Web evaluation failed: {type(err).__name__}: {err}',
                    f'{type(err).__name__}: {err}',
                )

        # Use the new comprehensive judge system (default)
        logger.info(f'Task {task_folder.name}: Using comprehensive judge evaluation with {judge_repeat_count} repetition(s)')

        # Check if comprehensive judge result already exists
        existing_eval = result.get('comprehensive_judge_evaluation')
        if existing_eval:
            return _comprehensive_judgement(task_id, existing_eval)

        try:
            # Run comprehensive judge evaluation (with repeat and averaging handled in comprehensive_judge.py)
            comprehensive_result = await asyncio.wait_for(
                evaluate_task_with_comprehensive_judge(
                    task_folder=task_folder, model=model, max_images=10, judge_repeat_count=judge_repeat_count
                ),
                timeout=180 * judge_repeat_count,  # Increase timeout based on repeat count
            )

            if comprehensive_result.get('error'):
                return _failed_judgement(
                    task_id,
                    f'Comprehensive evaluation failed: {comprehensive_result["error"]}',
                    comprehensive_result['error'],
                )

            comp_eval = comprehensive_result.get('comprehensive_judge')
            if comp_eval:
                return _comprehensive_judgement(task_id, comp_eval)

            return _failed_judgement(
                task_id,
                'Comprehensive judge failed to return results',
                'Comprehensive judge failed to return results',
            )

        except Exception as err:
            logger.error(f'Comprehensive judge evaluation failed for {task_folder.name}: {err}')
            message = f'Comprehensive judge error: {type(err).__name__}: {err}'
            return _failed_judgement(task_id, message, message)

    except Exception as err:
        # Covers unreadable / invalid result.json and malformed cached evaluations.
        return _failed_judgement(
            task_id,
            f'Evaluation failed: {type(err).__name__}: {err}',
            f'{type(err).__name__}: {err}',
        )

# ================================================

# Main Evaluation Functions

# ================================================

@observe(name='executor', span_type='EXECUTOR')  # type: ignore[arg-type]
async def run_agent_with_browser(
    browser_session: BrowserSession,
    task: Task,
    llm: BaseChatModel,
    max_steps: int,
    use_vision: bool,
    use_serp: bool = False,
    enable_memory: bool = False,
    memory_interval: int = 10,
    max_actions_per_step: int = 10,
    validate_output: bool = False,
    planner_llm: BaseChatModel | None = None,
    planner_interval: int = 1,
    use_thinking: bool = True,
    gmail_tokens_dict: dict[str, str] | None = None,
    images_per_step: int = 1,
) -> tuple[AgentHistoryList, str]:
    """Run agent with the browser session.

    Args:
        browser_session: Browser session the agent drives.
        task: Task to run; ``task.confirmed_task`` is the agent prompt and
            ``task.output_model`` the optional structured output model.
        llm: Model used by the agent.
        max_steps: Maximum number of agent steps.
        use_vision: Forwarded to ``Agent``.
        use_serp: Use the Serper-backed search action instead of Google search.
        enable_memory: Deprecated; passing a true value raises ``ValueError``.
        memory_interval: Accepted for compatibility; not used by this function.
        max_actions_per_step: Forwarded to ``Agent``.
        validate_output: Forwarded to ``Agent``.
        planner_llm: Forwarded to ``Agent``.
        planner_interval: Forwarded to ``Agent``.
        use_thinking: Forwarded to ``Agent``.
        gmail_tokens_dict: Tokens used to register Gmail 2FA actions for OTP tasks.
        images_per_step: Forwarded to ``Agent``.

    Returns:
        A tuple of the agent history and the text of the last input message
        (an empty string when the agent recorded no input messages).

    Raises:
        ValueError: If ``enable_memory`` is true.
    """
    # Check for deprecated memory parameters before doing any setup work
    if enable_memory:
        raise ValueError(
            'Memory support has been removed as of version 0.3.2. '
            'The agent context for memory is significantly improved and no longer requires the old memory system. '
            "Please remove the 'enable_memory' parameter."
        )

    # Create controller, optionally with SERP search, structured output, and Gmail 2FA support
    controller = create_controller(
        use_serp=use_serp, output_model=task.output_model, gmail_tokens_dict=gmail_tokens_dict, task=task
    )

    # Set up login cookie monitoring if this is a login task
    is_login_task = bool(getattr(task, 'login_cookie', None))
    new_step_callback = None

    if is_login_task:
        logger.info(f'🔐 Setting up login cookie monitoring for task {task.task_id}')

        async def login_cookie_step_callback(browser_state_summary, agent_output, step_number):
            """Callback to check login cookie after each step"""
            try:
                if task.login_cookie is not None:
                    await check_login_cookie_at_step(
                        browser_session=browser_session, task_id=task.task_id, login_cookie=task.login_cookie, step=step_number
                    )
                else:
                    logger.warning(f'❌ Task {task.task_id} Step {step_number}: login_cookie is None, skipping check')
            except Exception as e:
                # Monitoring is best-effort and must never abort the agent run.
                logger.warning(f'❌ Error checking login cookie at step {step_number}: {type(e).__name__}: {e}')

        new_step_callback = login_cookie_step_callback

    agent = Agent(
        task=task.confirmed_task,
        llm=llm,
        controller=controller,
        browser_session=browser_session,
        use_vision=use_vision,
        max_actions_per_step=max_actions_per_step,
        validate_output=validate_output,
        planner_llm=planner_llm,
        planner_interval=planner_interval,
        use_thinking=use_thinking,
        images_per_step=images_per_step,
        source='eval_platform',
        calculate_cost=True,
        register_new_step_callback=new_step_callback,
    )

    await agent.run(max_steps=max_steps)

    # Get last message. Guard the empty case so an agent that recorded no input
    # messages still returns its history instead of raising IndexError.
    # NOTE(review): whether the list can really be empty depends on Agent internals - confirm.
    last_input_messages = agent.message_manager.last_input_messages
    last_message = last_input_messages[-1].text if last_input_messages else ''

    # Nothing is saved here for login-cookie tracking; no task folder is available in this function.
    return agent.state.history, last_message

@observe(name='evaluate_task_result', span_type='EVALUATOR')  # type: ignore[arg-type]
async def evaluate_task_result(
    eval_model: BaseChatModel,
    task_folder: Path,
    task: Task | None = None,
    use_mind2web: bool = False,
    judge_repeat_count: int = 1,
) -> dict:
    """Evaluate the task result.

    The judge evaluation always runs. For login tasks (tasks with a truthy
    ``login_cookie``) the cookie-based evaluation runs afterwards and its
    ``score``, ``success`` and ``error`` replace the judge's values.
    """
    uses_cookie_check = bool(task and hasattr(task, 'login_cookie') and task.login_cookie)
    if uses_cookie_check:
        logger.info(f'Using combined cookie-based and judge evaluation for login task {task.task_id}')

    # The judge provides the comprehensive feedback in both modes.
    verdict = await judge_task_result(
        eval_model, task_folder, score_threshold=3, use_mind2web=use_mind2web, judge_repeat_count=judge_repeat_count
    )
    if not uses_cookie_check:
        return verdict

    # The cookie check decides the actual outcome for login tasks.
    cookie_result = await evaluate_task_with_login_cookie(task.login_cookie, task_folder)
    for field in ('score', 'success', 'error'):
        verdict[field] = cookie_result[field]

    # Keep the embedded comprehensive evaluation consistent with the overridden outcome.
    comprehensive = verdict.get('comprehensive_evaluation')
    if comprehensive:
        comprehensive['passed'] = cookie_result['success']
        # Comprehensive judge scores are on a 0-100 scale, cookie scores on 0-1.
        comprehensive['final_score'] = int(cookie_result['score'] * 100)

    return verdict

@observe_debug()
async def cleanup_browser_safe(browser_session: BrowserSession):
    """Kill the browser session, waiting at most 30 seconds.

    Timeouts and any other error are logged as warnings and swallowed, so
    cleanup never raises into the caller.
    """
    try:
        logger.debug('Browser cleanup: Starting close operation for session')
        shutdown = browser_session.kill()
        await asyncio.wait_for(shutdown, timeout=30)
        logger.debug('Browser cleanup: Close operation completed successfully')
    except TimeoutError:
        logger.warning('Browser cleanup: Timed out after 30 seconds')
    except Exception as exc:
        logger.warning(f'Browser cleanup: Failed with error: {type(exc).__name__}: {exc}')

# ================================================

# Stage runner and related functions

# ================================================

def save_result_to_server(convex_url: str, secret_key: str, payload: dict) -> bool:
    """Synchronous pass-through to ``save_task_result_to_server``.

    Kept as a plain function so it can be run with ``asyncio.to_thread``.
    Returns whatever ``save_task_result_to_server`` returns.
    """
    was_saved = save_task_result_to_server(convex_url, secret_key, payload)
    return was_saved

async def run_stage(stage: Stage, stage_func, timeout: int | None = None):
    """Await ``stage_func()`` and return its result, optionally bounded by a timeout.

    Args:
        stage: The stage being run; not used by the runner itself.
        stage_func: Zero-argument callable returning an awaitable.
        timeout: Seconds to wait. A falsy value (``None`` or ``0``) means no timeout.

    Raises:
        TimeoutError: From ``asyncio.wait_for`` when the stage exceeds ``timeout``.
    """
    pending = stage_func()
    if not timeout:
        return await pending
    return await asyncio.wait_for(pending, timeout)

def determine_current_stage(completed_stages: set) -> Stage:
    """Return the furthest pipeline stage present in ``completed_stages``.

    Stages are checked from last to first; ``Stage.SETUP_BROWSER`` is
    returned when none of them has completed (default starting stage).
    """
    latest_first = (
        Stage.SAVE_SERVER,
        Stage.EVALUATE,
        Stage.FORMAT_HISTORY,
        Stage.RUN_AGENT,
        Stage.SETUP_BROWSER,
    )
    for candidate in latest_first:
        if candidate in completed_stages:
            return candidate
    return Stage.SETUP_BROWSER  # Default starting stage

@observe(name='evaluation', span_type='EVALUATION') # type: ignore[arg-type]

async def run_task_with_semaphore(

task: Task,

run_id: str,

lmnr_run_id: str | None,

laminar_eval_link: str | None,

convex_url: str,

secret_key: str,

eval_model: BaseChatModel,

llm: BaseChatModel,

max_steps_per_task: int,

headless: bool,

use_vision: bool,

semaphore_runs: asyncio.Semaphore, # Pass semaphore as argument

auth_distribution: dict | None = None, # Pre-fetched auth distribution

github_workflow_url: str | None = None,

use_serp: bool = False,

browser: str = 'local',

enable_memory: bool = False,

memory_interval: int = 10,

max_actions_per_step: int = 10,

validate_output: bool = False,

planner_llm: BaseChatModel | None = None,

planner_interval: int = 1,

include_result: bool = False,

highlight_elements: bool = True,

use_mind2web_judge: bool = False,

use_thinking: bool = True,

gmail_tokens_dict: dict[str, str] | None = None,

judge_repeat_count: int = 1,

images_per_step: int = 1,

default_navigation_timeout: int | None = None,

default_timeout: int | None = None,

minimum_wait_page_load_time: float | None = None,

wait_for_network_idle_page_load_time: float | None = None,

maximum_wait_page_load_time: float | None = None,

wait_between_actions: float | None = None,

stealth: bool = False,

) -> dict:

"""Clean pipeline approach for running tasks"""

task_start_time = time.time()

logger.info(f'🚀 Task {task.task_id}: Starting execution pipeline')

logger.info(f'📊 Task {task.task_id}: Waiting to acquire semaphore (current available: ~{semaphore_runs._value})')

log_system_resources(f'TASK_START_{task.task_id}')

semaphore_acquired_time = None

async with semaphore_runs:

semaphore_acquired_time = time.time()

wait_time = semaphore_acquired_time - task_start_time

logger.info(

f'✅ Task {task.task_id}: Semaphore acquired after {wait_time:.2f}s (remaining slots: ~{semaphore_runs._value})'

)

log_system_resources(f'SEMAPHORE_ACQUIRED_{task.task_id}')

task_result = None

browser_session = None

laminar_task_link = None

datapoint_id = None

agent_execution_time = None # Track agent execution time separately

try:

if lmnr_run_id:

try:

datapoint_id = await laminar_client.evals.create_datapoint(

eval_id=UUID(lmnr_run_id),

data={

'task_id': task.task_id,

'confirmed_task': task.confirmed_task,

'website': task.website,

'reference_length': task.reference_length,

'level': task.level,

'cluster_id': task.cluster_id,

'category': task.category,

metadata={

'use_vision': str(use_vision),

'use_serp': str(use_serp),

'enable_memory': str(enable_memory),

'memory_interval': str(memory_interval),

'max_actions_per_step': str(max_actions_per_step),

'validate_output': str(validate_output),

'planner_model': str(planner_llm),

'planner_interval': str(planner_interval),

'include_result': str(include_result),

trace_id=Laminar.get_trace_id(),

)

# Only create task-specific link if we have the evaluation link

if laminar_eval_link:

laminar_task_link = f'{laminar_eval_link}?traceId={Laminar.get_trace_id()}&datapointId={datapoint_id}'

logger.info(f'Task {task.task_id}: Laminar link: {laminar_task_link}')

else:

logger.debug(f'Task {task.task_id}: No Laminar evaluation link available, task link not created')

except Exception as e:

logger.warning(f'Task {task.task_id}: Failed to create Laminar datapoint: {type(e).__name__}: {e}')

else:

logger.debug(f'Task {task.task_id}: No Laminar run ID available, skipping datapoint creation')

# Initialize task result and basic setup

task_result = TaskResult(

task.task_id, run_id, task.confirmed_task, task, max_steps_per_task, laminar_task_link, github_workflow_url

)

task_folder = Path(f'saved_trajectories/{task.task_id}')

logger.info(f'Task {task.task_id}: Starting execution pipeline.')

# Send initial progress update to show task is starting

send_progress_update(convex_url, secret_key, run_id, task.task_id, 'starting', 'active', github_workflow_url)

try:

agent_history = None # Initialize to track agent execution

# Stage 1: Setup browser

try:

logger.info(f'Task {task.task_id}: Browser setup starting.')

# Send progress update for starting browser setup

send_progress_update(

convex_url, secret_key, run_id, task.task_id, 'setup_browser', 'active', github_workflow_url

)

browser_session = await run_stage(

Stage.SETUP_BROWSER,

lambda: setup_browser_session(

task,

headless,

highlight_elements,

browser,

default_navigation_timeout,

default_timeout,

minimum_wait_page_load_time,

wait_for_network_idle_page_load_time,

maximum_wait_page_load_time,

wait_between_actions,

stealth,

timeout=120,

)

task_result.stage_completed(Stage.SETUP_BROWSER)

logger.info(f'Task {task.task_id}: Browser session started successfully.')

# Send progress update for completed browser setup

send_progress_update(

convex_url, secret_key, run_id, task.task_id, 'browser_ready', 'active', github_workflow_url

)

except Exception as e:

error = StageError(Stage.SETUP_BROWSER, 'exception', str(e))

task_result.stage_failed(Stage.SETUP_BROWSER, error)

logger.error(f'Task {task.task_id}: Browser setup failed: {str(e)}')

# Send error progress update

send_progress_update(

convex_url, secret_key, run_id, task.task_id, 'setup_browser', 'failed', github_workflow_url, None, str(e)

)

# Continue to server save instead of early return

# Stage 2: Run agent

if browser_session: # Only run agent if browser setup succeeded

try:

logger.info(f'Task {task.task_id}: Agent run starting.')

# Send progress update for starting agent run

send_progress_update(

convex_url, secret_key, run_id, task.task_id, 'run_agent', 'active', github_workflow_url

)

# Handle auth information if task requires it

task_with_auth = task

if hasattr(task, 'auth_keys') and task.auth_keys:

# Validate auth_keys is a list

if isinstance(task.auth_keys, list) and len(task.auth_keys) > 0:

if auth_distribution:

logger.info(

f'Task {task.task_id}: Using pre-fetched auth distribution for auth_keys: {task.auth_keys}'

)

auth_info_text = format_auth_info_for_agent(auth_distribution, task.auth_keys)

if auth_info_text:

# Create a modified task with auth info appended

class TaskWithAuth(Task):
    """Task variant whose ``confirmed_task`` has the auth info text appended.

    The instance is populated by mirroring an existing task's attributes;
    ``Task.__init__`` is not invoked here.
    """

    def __init__(self, original_task: Task, auth_text: str):
        # Mirror everything dir() reports on the source task, skipping only
        # names that begin with a double underscore.
        copyable_names = (name for name in dir(original_task) if not name.startswith('__'))
        for name in copyable_names:
            setattr(self, name, getattr(original_task, name))

        # Set the description last so the copy above cannot overwrite it.
        self.confirmed_task = original_task.confirmed_task + auth_text

task_with_auth = TaskWithAuth(task, auth_info_text)

logger.info(f'Task {task.task_id}: Auth info added to task description')

else:

logger.warning(

f'Task {task.task_id}: No matching auth info found for keys: {task.auth_keys}'

)

else:

logger.warning(f'Task {task.task_id}: Auth keys specified but no auth distribution available')

else:

logger.warning(f'Task {task.task_id}: auth_keys is not a valid list: {task.auth_keys}')

# Start timing for agent execution only

agent_start_time = time.time()

agent_history, last_message = await run_stage(

Stage.RUN_AGENT,

lambda: run_agent_with_browser(

browser_session,

task_with_auth,

llm,

max_steps_per_task,

use_vision,

use_serp,

enable_memory,

memory_interval,

max_actions_per_step,

validate_output,

planner_llm,

planner_interval,

use_thinking,

gmail_tokens_dict,

images_per_step,

timeout=1000,

)

# End timing for agent execution only

agent_end_time = time.time()

agent_execution_time = agent_end_time - agent_start_time

task_result.stage_completed(Stage.RUN_AGENT)

logger.info(f'Task {task.task_id}: Agent run completed in {agent_execution_time:.2f}s.')

# Save login cookie tracking data if this was a login task

if hasattr(task, 'login_cookie') and task.login_cookie:

try:

await save_login_cookie_tracking(task_folder, task.task_id)

except Exception as e:

logger.warning(

f'Failed to save login cookie tracking for task {task.task_id}: {type(e).__name__}: {e}'

)

# Send progress update for completed agent run

send_progress_update(

convex_url, secret_key, run_id, task.task_id, 'agent_completed', 'active', github_workflow_url

)

except Exception as e:

error = StageError(Stage.RUN_AGENT, 'exception', str(e))

task_result.stage_failed(Stage.RUN_AGENT, error)

logger.error(f'Task {task.task_id}: Agent run failed: {str(e) + " " + str(e.__traceback__)}')

# Send error progress update

send_progress_update(

convex_url, secret_key, run_id, task.task_id, 'run_agent', 'failed', github_workflow_url, None, str(e)

)

# Continue to server save instead of early return

# Stage 3: Format history (MOVED OUTSIDE browser_session block)

if agent_history is not None: # Only format if agent ran successfully

try:

logger.info(f'Task {task.task_id}: History formatting starting.')

formatted_data = await run_stage(

Stage.FORMAT_HISTORY,

lambda: reformat_agent_history(

agent_history,

task.task_id,

run_id,

task.confirmed_task,

last_message,

include_result=include_result,

agent_execution_time=agent_execution_time, # Pass agent execution time

)

task_result.stage_completed(Stage.FORMAT_HISTORY, formatted_data)

logger.info(f'Task {task.task_id}: Agent history formatted.')

except Exception as e:

error = StageError(Stage.FORMAT_HISTORY, 'exception', str(e))

task_result.stage_failed(Stage.FORMAT_HISTORY, error)

logger.error(f'Task {task.task_id}: History formatting failed: {str(e)}')

# Continue to server save instead of early return

# Stage 4: Evaluate (MOVED OUTSIDE browser_session block)

if task_result.has_execution_data() and Stage.EVALUATE not in task_result.completed_stages:

try:

logger.info(f'Task {task.task_id}: Evaluation starting.')

evaluation = await run_stage(

Stage.EVALUATE,

lambda: evaluate_task_result(eval_model, task_folder, task, use_mind2web_judge, judge_repeat_count),

timeout=300 * judge_repeat_count, # Increase timeout based on repeat count

)

task_result.stage_completed(Stage.EVALUATE, evaluation)

logger.info(f'Task {task.task_id}: Evaluation completed.')

if lmnr_run_id and datapoint_id:

await laminar_client.evals.update_datapoint(

eval_id=UUID(lmnr_run_id),

datapoint_id=datapoint_id,

scores={

'accuracy': evaluation['score'],

)

except Exception as e:

error = StageError(Stage.EVALUATE, 'exception', str(e))

task_result.stage_failed(Stage.EVALUATE, error)

logger.error(f'Task {task.task_id}: Evaluation failed: {str(e)}')

# Stage 5: Save to server (MOVED OUTSIDE browser_session block - ALWAYS attempt)

try:

logger.info(f'Task {task.task_id}: Saving result to server.')

# Only save to server if URLs are provided (skip for single task mode)

if convex_url and secret_key:

await run_stage(

Stage.SAVE_SERVER,

lambda: asyncio.to_thread(

save_result_to_server,

View remainder of file in raw view

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

service.py

Latest commit

History

service.py

File metadata and controls