browseruse_bench.utils.eval_utils

Evaluation-related utility functions and classes.

Import

from browseruse_bench.utils import (
    EvaluationModel,
    load_evaluation_model,
    encode_image,
    extract_score_from_response,
    calculate_success,
    normalized_results_file,
)

EvaluationModel

OpenAI model wrapper class for task evaluation.
class EvaluationModel:
    def __init__(
        self,
        model: str = "gpt-4o",
        api_key: str = None,
        base_url: str = None
    )
model (str, default: "gpt-4o")
    Model name.
api_key (str, default: None)
    API key; falls back to an environment variable if not provided.
base_url (str, default: None)
    API base URL; falls back to an environment variable if not provided.

generate

Generate an evaluation response, retrying automatically on failure.
def generate(
    self,
    messages: List[Dict],
    max_tokens: int = 2048,
    temperature: float = 0.3,
    **kwargs
) -> str
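
A minimal usage sketch, assuming the wrapper accepts OpenAI-style chat messages; the prompt text and generation parameters below are illustrative only.

from browseruse_bench.utils import EvaluationModel

# api_key and base_url fall back to environment variables when omitted
evaluator = EvaluationModel(model="gpt-4o")

# Illustrative OpenAI-style chat messages; the real evaluation prompt is built elsewhere
messages = [
    {"role": "system", "content": "You are a strict task evaluator."},
    {"role": "user", "content": "Rate the agent's result from 0 to 100 and explain why."},
]

response = evaluator.generate(messages, max_tokens=512, temperature=0.0)
print(response)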

load_evaluation_model

Load the evaluation model, falling back to environment variables for any arguments not provided.
def load_evaluation_model(
    model: str = None,
    api_key: str = None,
    base_url: str = None
) -> EvaluationModel

Environment Variables

Variable              Description
EVAL_MODEL_NAME       Model name (fallback: gpt-4o)
EVAL_MODEL_API_KEY    API key (fallback: OPENAI_API_KEY)
EVAL_MODEL_BASE_URL   Base URL (fallback: OPENAI_BASE_URL)
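
A sketch of the fallback behavior described above; the environment values are placeholders set here only for illustration.

import os
from browseruse_bench.utils import load_evaluation_model

# Placeholder values; in practice these would already be set in the shell
os.environ["EVAL_MODEL_NAME"] = "gpt-4o"
os.environ["EVAL_MODEL_API_KEY"] = "sk-..."  # otherwise OPENAI_API_KEY is used

# With no arguments, the loader reads the environment variables above
evaluator = load_evaluation_model()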

encode_image

Convert a PIL image to a base64-encoded string.
def encode_image(
    image: PIL.Image,
    scale_factor: float = 1.0
) -> str
image (PIL.Image, required)
    PIL Image object.
scale_factor (float, default: 1.0)
    Image scale factor between 0.0 and 1.0; e.g., 0.5 means 50% of the original size.
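
A short sketch; the screenshot path is a placeholder, and the example assumes the returned string is raw base64 without a data-URI prefix.

from PIL import Image
from browseruse_bench.utils import encode_image

# Placeholder path for illustration
image = Image.open("screenshot.png")

# Downscale to 50% before encoding to keep the payload small
b64 = encode_image(image, scale_factor=0.5)

# Example use in an OpenAI-style vision message
content = [{
    "type": "image_url",
    "image_url": {"url": f"data:image/png;base64,{b64}"},
}]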

extract_score_from_response

Extract the numerical score from an evaluation response.
def extract_score_from_response(response: str) -> int
response (str, required)
    Evaluation response text.
Returns: int
    Extracted score (0 if no score is found).
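
An illustrative call; the exact phrasing the parser expects depends on the evaluation prompt, so the sample response below is hypothetical.

from browseruse_bench.utils import extract_score_from_response

# Hypothetical evaluator output
response = "Reasoning: the agent completed most sub-tasks.\nScore: 85"
score = extract_score_from_response(response)  # 85, if a score is found in the text
print(score)

# 0 is returned when no score can be extracted
print(extract_score_from_response("no score present"))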

calculate_success

Determine whether a task is successful based on a score threshold.
def calculate_success(
    score: int,
    threshold: int = 60
) -> bool
score (int, required)
    Task score.
threshold (int, default: 60)
    Success threshold.
Returns: bool
    True if the score meets or exceeds the threshold.
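
A small example using the documented default threshold of 60; the score value is illustrative.

from browseruse_bench.utils import calculate_success

score = 72  # e.g., as returned by extract_score_from_response
print(calculate_success(score))                 # True: 72 >= default threshold of 60
print(calculate_success(score, threshold=80))   # False: 72 < 80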

normalized_results_file

Context manager that yields a path to a results file guaranteed to be in JSONL format.
@contextmanager
def normalized_results_file(results_file: Path) -> Generator[Path, None, None]
results_file (Path, required)
    Results file path.

Usage Example

import json
from pathlib import Path

from browseruse_bench.utils import normalized_results_file

with normalized_results_file(Path("results.json")) as jsonl_path:
    # jsonl_path is guaranteed to point to a JSONL-format file
    with open(jsonl_path) as f:
        for line in f:
            record = json.loads(line)