Skip to content

Commit 7666c65

Browse files
authored
bug fix - html rendering when multiple percentiles are the same (#515)
## Summary Fixes a bug when an HTML benchmark report is generated. The HTML report crashes in the browser with e.g. `Error: xs[2] == xs[3]` when rendering distributions with consecutive duplicate percentile values. This occurs when benchmarks have limited data points (e.g., < 10 requests), causing multiple percentiles to collapse to the same value. The visualization library expects strictly increasing x-values and fails when encountering duplicates. ### Root Cause When a benchmark has very few successful requests (e.g., 3 requests), percentile calculations result in many duplicate values: ```python # Before fix - causes visualization error { "p001": 15.288, "p01": 15.288, # duplicate "p05": 15.288, # duplicate "p10": 15.288, # duplicate "p25": 15.288, # duplicate "p50": 16.413, # unique "p75": 16.413, # duplicate "p90": 17.035, # unique ... } # After fix - renders successfully { "p001": 15.288, "p50": 16.413, "p90": 17.035 } ``` ## Details - Added `_filter_duplicate_percentiles()` helper function to remove consecutive duplicate percentile values - Override `model_dump()` in `_TabularDistributionSummary` to automatically apply filtering during HTML generation - Added comprehensive unit tests (8 test cases) covering all edge cases ## Test Plan ### Unit Tests ```bash uv run pytest tests/unit/benchmark/test_html_output.py -v # 8/8 tests passing ``` ### Related - Replaces #490 --- - [x] "I certify that all code in this PR is my own, except as noted below." ## Use of AI - [x] Includes AI-assisted code completion - [x] Includes code generated by an AI application - [x] Includes AI-generated tests (NOTE: AI written tests should have a docstring that includes `## WRITTEN BY AI ##`)
2 parents 9a08c65 + 65a6023 commit 7666c65

File tree

2 files changed

+216
-1
lines changed

2 files changed

+216
-1
lines changed

src/guidellm/benchmark/outputs/html.py

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
GenerativeBenchmark,
3030
GenerativeBenchmarksReport,
3131
)
32-
from guidellm.schemas import DistributionSummary
32+
from guidellm.schemas import DistributionSummary, Percentiles
3333
from guidellm.settings import settings
3434
from guidellm.utils import camelize_str, recursive_key_update
3535
from guidellm.utils.text import load_text
@@ -190,6 +190,24 @@ def percentile_rows(self) -> list[dict[str, str | float]]:
190190
filter(lambda row: row["percentile"] in ["p50", "p90", "p95", "p99"], rows)
191191
)
192192

193+
def model_dump(self, **kwargs) -> dict:
194+
"""
195+
Override model_dump to filter duplicate consecutive percentile values.
196+
197+
This prevents visualization errors when distributions have limited data
198+
points causing multiple percentiles to collapse to the same value.
199+
200+
:param kwargs: Arguments to pass to parent model_dump
201+
:return: Dictionary with filtered percentiles
202+
"""
203+
data = super().model_dump(**kwargs)
204+
205+
if "percentiles" in data and data["percentiles"]:
206+
filtered_percentiles = _filter_duplicate_percentiles(data["percentiles"])
207+
data["percentiles"] = filtered_percentiles
208+
209+
return data
210+
193211
@classmethod
194212
def from_distribution_summary(
195213
cls, distribution: DistributionSummary
@@ -222,6 +240,39 @@ def _create_html_report(js_data: dict[str, str], output_path: Path) -> Path:
222240
return output_path
223241

224242

243+
def _filter_duplicate_percentiles(percentiles: dict[str, float]) -> dict[str, float]:
    """
    Drop all but the highest percentile of each run of equal values.

    Distributions computed from very few samples can assign the same value to
    several neighboring percentiles, which breaks visualization libraries that
    expect strictly increasing x-values. For each run of consecutive equal
    values only the highest percentile is kept, which is more mathematically
    accurate as higher percentiles carry greater statistical significance.

    :param percentiles: Mapping of percentile names to values.
    :return: Mapping with consecutive duplicates removed, in percentile order.
    """
    if not percentiles:
        return percentiles

    # Present keys, in canonical percentile order (p001 ... p999).
    ordered_keys = [key for key in Percentiles.model_fields if key in percentiles]
    last_index = len(ordered_keys) - 1

    # Keep a key only when it closes a run of equal consecutive values, i.e.
    # it is the final present percentile or its value differs from the next
    # present percentile's value. Building the dict forward preserves order.
    return {
        key: percentiles[key]
        for index, key in enumerate(ordered_keys)
        if index == last_index
        or percentiles[key] != percentiles[ordered_keys[index + 1]]
    }
274+
275+
225276
def _inject_data(js_data: dict[str, str], html: str) -> str:
226277
"""
227278
Inject JavaScript data into HTML head section.
Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
## WRITTEN BY AI ##
2+
from guidellm.benchmark.outputs.html import _filter_duplicate_percentiles
3+
from guidellm.schemas import Percentiles
4+
5+
6+
def test_filter_all_same_values():
    """All percentiles identical: only the highest (p999) survives."""
    value = 15.288091352804853
    keys = [
        "p001", "p01", "p05", "p10", "p25", "p50",
        "p75", "p90", "p95", "p99", "p999",
    ]
    percentiles = {key: value for key in keys}

    filtered = _filter_duplicate_percentiles(percentiles)

    # The largest percentile is kept for mathematical accuracy.
    assert filtered == {"p999": value}
26+
27+
28+
def test_filter_consecutive_duplicates():
    """Each run of equal neighboring values keeps only its highest percentile."""
    low = 15.288091352804853
    mid = 16.41327511776994
    high = 17.03541629998259
    percentiles = {
        "p001": low,
        "p01": low,
        "p05": low,
        "p10": low,
        "p25": low,
        "p50": mid,
        "p75": mid,
        "p90": high,
        "p95": high,
        "p99": high,
        "p999": high,
    }

    filtered = _filter_duplicate_percentiles(percentiles)

    # Highest percentile of each duplicate run survives.
    assert filtered == {"p25": low, "p75": mid, "p999": high}
52+
53+
54+
def test_no_duplicates():
    """
    Mostly-unique values: every unique value is preserved, while the two
    duplicate runs (p001/p01 and p95/p99/p999) each collapse to their
    highest percentile.

    NOTE: the original docstring claimed "unique values are all preserved",
    but the fixture deliberately contains duplicate runs as well; the
    docstring now matches the data and the expected output.
    """
    percentiles = {
        "p001": 13.181080445834912,
        "p01": 13.181080445834912,  # Same as p001
        "p05": 13.530595573836457,  # Different
        "p10": 13.843972502554365,
        "p25": 14.086376978251748,
        "p50": 14.403258051191058,
        "p75": 14.738608817056042,
        "p90": 15.18136631856698,
        "p95": 15.7213110894772,
        "p99": 15.7213110894772,  # Same as p95
        "p999": 15.7213110894772,  # Same as p99
    }

    filtered = _filter_duplicate_percentiles(percentiles)

    # Should keep largest of each duplicate group (e.g. p999 instead of p95)
    assert filtered == {
        "p01": 13.181080445834912,
        "p05": 13.530595573836457,
        "p10": 13.843972502554365,
        "p25": 14.086376978251748,
        "p50": 14.403258051191058,
        "p75": 14.738608817056042,
        "p90": 15.18136631856698,
        "p999": 15.7213110894772,
    }
83+
84+
85+
def test_empty_percentiles():
    """An empty mapping passes through untouched."""
    assert _filter_duplicate_percentiles({}) == {}
89+
90+
91+
def test_single_percentile():
    """A lone percentile is always kept."""
    single = {"p50": 14.403258051191058}

    assert _filter_duplicate_percentiles(single) == {"p50": 14.403258051191058}
96+
97+
98+
def test_two_different_values():
    """Two distinct values: nothing is filtered out."""
    percentiles = {
        "p25": 14.086376978251748,
        "p50": 14.403258051191058,
    }

    result = _filter_duplicate_percentiles(percentiles)

    assert result == percentiles
106+
107+
108+
def test_partial_percentiles():
    """Keys supplied out of order come back in canonical percentile order."""
    percentiles = {
        "p50": 16.41327511776994,
        "p10": 15.288091352804853,
        "p90": 17.03541629998259,
    }

    filtered = _filter_duplicate_percentiles(percentiles)

    # Output ordering follows the Percentiles field order, not input order.
    assert list(filtered.keys()) == ["p10", "p50", "p90"]
120+
121+
122+
def test_model_dump_filters_duplicates():
    """model_dump() collapses duplicate percentile runs for small datasets."""
    from guidellm.benchmark.outputs.html import _TabularDistributionSummary

    low = 15.288091352804853
    mid = 16.41327511776994
    high = 17.03541629998259

    # Distribution with duplicate percentiles, typical of very small datasets.
    summary = _TabularDistributionSummary(
        mean=15.5,
        median=low,
        mode=low,
        variance=0.1,
        std_dev=0.316,
        min=low,
        max=high,
        count=3,
        total_sum=46.5,
        percentiles=Percentiles(
            p001=low,
            p01=low,
            p05=low,
            p10=low,
            p25=low,
            p50=mid,
            p75=mid,
            p90=high,
            p95=high,
            p99=high,
            p999=high,
        ),
    )

    data = summary.model_dump()

    # Duplicate runs collapse to their highest percentile.
    assert data["percentiles"] == {
        "p25": low,
        "p75": mid,
        "p999": high,
    }

    # Non-percentile fields pass through unchanged.
    assert data["mean"] == 15.5
    assert data["median"] == low
    assert data["count"] == 3

0 commit comments

Comments
 (0)