browseruse_bench.utils.stats_utils

Statistics-related utility functions.

Import

from browseruse_bench.utils import (
    calculate_metric_stats,
    calculate_all_metrics_stats,
    filter_tasks_by_label,
    generate_evaluation_summary,
)

calculate_metric_stats

Calculate statistics for a specific metric.

def calculate_metric_stats(
    tasks: List[Dict[str, Any]],
    metric: str,
    path: str = "evaluation_details"
) -> Dict[str, float]

tasks

list of dict

required

List of task results

metric

str

required

Metric name, e.g., ttft_ms, end_to_end_ms, steps

path

str

default:"evaluation_details"

Path to metric in task dictionary

return

dict

Statistics dictionary containing count, mean, min, max, median

calculate_all_metrics_stats

Calculate statistics for multiple metrics.

def calculate_all_metrics_stats(
    tasks: List[Dict[str, Any]],
    metrics: Optional[List[str]] = None,
    path: str = "evaluation_details"
) -> Dict[str, Dict[str, float]]

metrics

list of str

default: None (a default set of metrics is used; see Return Structure below)

List of metric names

Return Structure

{
    "ttft_ms": {"count": 10, "mean": 1234.5, ...},
    "end_to_end_ms": {"count": 10, "mean": 5678.9, ...},
    "steps": {"count": 10, "mean": 3.2, ...},
    "usage": {
        "total_tokens": {"count": 10, "mean": 1500, ...},
        "total_cost": {"count": 10, "mean": 0.05, ...},
        ...
    }
}

filter_tasks_by_label

Filter tasks by label.

def filter_tasks_by_label(
    tasks: List[Dict[str, Any]],
    key: str = "predicted_label",
    val: int = 1
) -> List[Dict[str, Any]]

tasks

list of dict

required

List of task results

key

str

default:"predicted_label"

Label key name

val

int

default: 1

Label value (1 = success, 0 = failure)

generate_evaluation_summary

Generate evaluation summary.

def generate_evaluation_summary(
    results: List[Dict[str, Any]],
    total: int,
    metrics: Optional[List[str]] = None
) -> Dict[str, Any]

results

list of dict

required

List of evaluation results

total

int

required

Total number of tasks

metrics

list of str

default: None (a default set of metrics is used; see metrics_statistics in Return Structure below)

List of metrics to calculate

Return Structure

{
    "overall_statistics": {
        "total_tasks": 100,
        "evaluated_tasks": 95,
        "successful_tasks": 70,
        "failed_tasks": 25,
        "success_rate": 73.68,
        "failure_rate": 26.32
    },
    "metrics_statistics": {...},
    "successful_tasks_metrics": {...},
    "failed_tasks_metrics": {...},
    "failure_category_statistics": {...},
    "task_list": {
        "successful_task_ids": [...],
        "failed_task_ids": [...]
    }
}

Overview

Utils

stats_utils

browseruse_bench.utils.stats_utils

Import

calculate_metric_stats

calculate_all_metrics_stats

Return Structure

filter_tasks_by_label

generate_evaluation_summary

Return Structure

Overview

Utils

browseruse_bench.utils.stats_utils

Import

calculate_metric_stats

calculate_all_metrics_stats

Return Structure

filter_tasks_by_label

generate_evaluation_summary

Return Structure

browseruse_bench.utils.stats_utils

Import

calculate_metric_stats

calculate_all_metrics_stats

Return Structure

filter_tasks_by_label

generate_evaluation_summary

Return Structure