跳转到主要内容

browseruse_bench.utils.stats_utils

Statistics-related utility functions.

Import

from browseruse_bench.utils import (
    calculate_metric_stats,
    calculate_all_metrics_stats,
    filter_tasks_by_label,
    generate_evaluation_summary,
)

calculate_metric_stats

Calculate statistics for a specific metric.
def calculate_metric_stats(
    tasks: List[Dict[str, Any]],
    metric: str,
    path: str = "evaluation_details"
) -> Dict[str, float]
tasks
list of dict
必填
List of task results
metric
str
必填
Metric name, e.g., ttft_ms, end_to_end_ms, steps
path
str
默认值:"evaluation_details"
Path to metric in task dictionary
return
dict
Statistics dictionary containing count, mean, min, max, median

calculate_all_metrics_stats

Calculate statistics for multiple metrics.
def calculate_all_metrics_stats(
    tasks: List[Dict[str, Any]],
    metrics: Optional[List[str]] = None,
    path: str = "evaluation_details"
) -> Dict[str, Dict[str, float]]
metrics
list of str
默认值:None (see Return Structure below)
List of metric names

Return Structure

{
    "ttft_ms": {"count": 10, "mean": 1234.5, ...},
    "end_to_end_ms": {"count": 10, "mean": 5678.9, ...},
    "steps": {"count": 10, "mean": 3.2, ...},
    "usage": {
        "total_tokens": {"count": 10, "mean": 1500, ...},
        "total_cost": {"count": 10, "mean": 0.05, ...},
        ...
    }
}

filter_tasks_by_label

Filter tasks by label.
def filter_tasks_by_label(
    tasks: List[Dict[str, Any]],
    key: str = "predicted_label",
    val: int = 1
) -> List[Dict[str, Any]]
tasks
list of dict
必填
List of task results
key
str
默认值:"predicted_label"
Label key name
val
int
默认值:1
Label value (1 = success, 0 = failure)

generate_evaluation_summary

Generate evaluation summary.
def generate_evaluation_summary(
    results: List[Dict[str, Any]],
    total: int,
    metrics: Optional[List[str]] = None
) -> Dict[str, Any]
results
list of dict
必填
List of evaluation results
total
int
必填
Total number of tasks
metrics
list of str
默认值:None (see Return Structure below)
List of metrics to calculate

Return Structure

{
    "overall_statistics": {
        "total_tasks": 100,
        "evaluated_tasks": 95,
        "successful_tasks": 70,
        "failed_tasks": 25,
        "success_rate": 73.68,
        "failure_rate": 26.32
    },
    "metrics_statistics": {...},
    "successful_tasks_metrics": {...},
    "failed_tasks_metrics": {...},
    "failure_category_statistics": {...},
    "task_list": {
        "successful_task_ids": [...],
        "failed_task_ids": [...]
    }
}