Skip to content

Commit 7666c65

Browse files
authored
bug fix - html rendering when multiple percentiles are the same (#515)
## Summary Fixes a bug when an HTML benchmark report is generated. The HTML report crashes in the browser with e.g. `Error: xs[2] == xs[3]` when rendering distributions with consecutive duplicate percentile values. This occurs when benchmarks have limited data points (e.g., < 10 requests), causing multiple percentiles to collapse to the same value. The visualization library expects strictly increasing x-values and fails when encountering duplicates. ### Root Cause When a benchmark has very few successful requests (e.g., 3 requests), percentile calculations result in many duplicate values: ```python # Before fix - causes visualization error { "p001": 15.288, "p01": 15.288, # duplicate "p05": 15.288, # duplicate "p10": 15.288, # duplicate "p25": 15.288, # duplicate "p50": 16.413, # unique "p75": 16.413, # duplicate "p90": 17.035, # unique ... } # After fix - renders successfully { "p001": 15.288, "p50": 16.413, "p90": 17.035 } ``` ## Details - Added `_filter_duplicate_percentiles()` helper function to remove consecutive duplicate percentile values - Override `model_dump()` in `_TabularDistributionSummary` to automatically apply filtering during HTML generation - Added comprehensive unit tests (8 test cases) covering all edge cases ## Test Plan ### Unit Tests ```bash uv run pytest tests/unit/benchmark/test_html_output.py -v # 8/8 tests passing ``` ### Related - Replaces #490 --- - [x] "I certify that all code in this PR is my own, except as noted below." ## Use of AI - [x] Includes AI-assisted code completion - [x] Includes code generated by an AI application - [x] Includes AI-generated tests (NOTE: AI written tests should have a docstring that includes `## WRITTEN BY AI ##`)
2 parents 9a08c65 + 65a6023 commit 7666c65

File tree

2 files changed

+216
-1
lines changed

2 files changed

+216
-1
lines changed

src/guidellm/benchmark/outputs/html.py

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
GenerativeBenchmark,
3030
GenerativeBenchmarksReport,
3131
)
32-
from guidellm.schemas import DistributionSummary
32+
from guidellm.schemas import DistributionSummary, Percentiles
3333
from guidellm.settings import settings
3434
from guidellm.utils import camelize_str, recursive_key_update
3535
from guidellm.utils.text import load_text
@@ -190,6 +190,24 @@ def percentile_rows(self) -> list[dict[str, str | float]]:
190190
filter(lambda row: row["percentile"] in ["p50", "p90", "p95", "p99"], rows)
191191
)
192192

193+
def model_dump(self, **kwargs) -> dict:
194+
"""
195+
Override model_dump to filter duplicate consecutive percentile values.
196+
197+
This prevents visualization errors when distributions have limited data
198+
points causing multiple percentiles to collapse to the same value.
199+
200+
:param kwargs: Arguments to pass to parent model_dump
201+
:return: Dictionary with filtered percentiles
202+
"""
203+
data = super().model_dump(**kwargs)
204+
205+
if "percentiles" in data and data["percentiles"]:
206+
filtered_percentiles = _filter_duplicate_percentiles(data["percentiles"])
207+
data["percentiles"] = filtered_percentiles
208+
209+
return data
210+
193211
@classmethod
194212
def from_distribution_summary(
195213
cls, distribution: DistributionSummary
@@ -222,6 +240,39 @@ def _create_html_report(js_data: dict[str, str], output_path: Path) -> Path:
222240
return output_path
223241

224242

243+
def _filter_duplicate_percentiles(percentiles: dict[str, float]) -> dict[str, float]:
    """
    Drop all but the highest percentile of each run of equal values.

    Distributions computed from very few samples can assign the same value to
    several neighboring percentiles, which breaks visualization libraries that
    expect strictly increasing x-values. For each run of consecutive equal
    values only the highest percentile is kept, which is more mathematically
    accurate as higher percentiles carry greater statistical significance.

    :param percentiles: Mapping of percentile names to values.
    :return: Mapping with consecutive duplicates removed, in percentile order.
    """
    if not percentiles:
        return percentiles

    # Present keys, in canonical percentile order (p001 ... p999).
    ordered_keys = [key for key in Percentiles.model_fields if key in percentiles]
    last_index = len(ordered_keys) - 1

    # Keep a key only when it closes a run of equal consecutive values, i.e.
    # it is the final present percentile or its value differs from the next
    # present percentile's value. Building the dict forward preserves order.
    return {
        key: percentiles[key]
        for index, key in enumerate(ordered_keys)
        if index == last_index
        or percentiles[key] != percentiles[ordered_keys[index + 1]]
    }
274+
275+
225276
def _inject_data(js_data: dict[str, str], html: str) -> str:
226277
"""
227278
Inject JavaScript data into HTML head section.
Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
## WRITTEN BY AI ##
2+
from guidellm.benchmark.outputs.html import _filter_duplicate_percentiles
3+
from guidellm.schemas import Percentiles
4+
5+
6+
def test_filter_all_same_values():
    """All percentiles identical: only the highest (p999) survives."""
    value = 15.288091352804853
    keys = [
        "p001", "p01", "p05", "p10", "p25", "p50",
        "p75", "p90", "p95", "p99", "p999",
    ]
    percentiles = {key: value for key in keys}

    filtered = _filter_duplicate_percentiles(percentiles)

    # The largest percentile is kept for mathematical accuracy.
    assert filtered == {"p999": value}
26+
27+
28+
def test_filter_consecutive_duplicates():
    """Each run of equal neighboring values keeps only its highest percentile."""
    low = 15.288091352804853
    mid = 16.41327511776994
    high = 17.03541629998259
    percentiles = {
        "p001": low,
        "p01": low,
        "p05": low,
        "p10": low,
        "p25": low,
        "p50": mid,
        "p75": mid,
        "p90": high,
        "p95": high,
        "p99": high,
        "p999": high,
    }

    filtered = _filter_duplicate_percentiles(percentiles)

    # Highest percentile of each duplicate run survives.
    assert filtered == {"p25": low, "p75": mid, "p999": high}
52+
53+
54+
def test_no_duplicates():
    """
    Mostly-unique values: every unique value is preserved, while the two
    duplicate runs (p001/p01 and p95/p99/p999) each collapse to their
    highest percentile.

    NOTE: the original docstring claimed "unique values are all preserved",
    but the fixture deliberately contains duplicate runs as well; the
    docstring now matches the data and the expected output.
    """
    percentiles = {
        "p001": 13.181080445834912,
        "p01": 13.181080445834912,  # Same as p001
        "p05": 13.530595573836457,  # Different
        "p10": 13.843972502554365,
        "p25": 14.086376978251748,
        "p50": 14.403258051191058,
        "p75": 14.738608817056042,
        "p90": 15.18136631856698,
        "p95": 15.7213110894772,
        "p99": 15.7213110894772,  # Same as p95
        "p999": 15.7213110894772,  # Same as p99
    }

    filtered = _filter_duplicate_percentiles(percentiles)

    # Should keep largest of each duplicate group (e.g. p999 instead of p95)
    assert filtered == {
        "p01": 13.181080445834912,
        "p05": 13.530595573836457,
        "p10": 13.843972502554365,
        "p25": 14.086376978251748,
        "p50": 14.403258051191058,
        "p75": 14.738608817056042,
        "p90": 15.18136631856698,
        "p999": 15.7213110894772,
    }
83+
84+
85+
def test_empty_percentiles():
    """An empty mapping passes through untouched."""
    assert _filter_duplicate_percentiles({}) == {}
89+
90+
91+
def test_single_percentile():
    """A lone percentile is always kept."""
    single = {"p50": 14.403258051191058}

    assert _filter_duplicate_percentiles(single) == {"p50": 14.403258051191058}
96+
97+
98+
def test_two_different_values():
    """Two distinct values: nothing is filtered out."""
    percentiles = {
        "p25": 14.086376978251748,
        "p50": 14.403258051191058,
    }

    result = _filter_duplicate_percentiles(percentiles)

    assert result == percentiles
106+
107+
108+
def test_partial_percentiles():
    """Keys supplied out of order come back in canonical percentile order."""
    percentiles = {
        "p50": 16.41327511776994,
        "p10": 15.288091352804853,
        "p90": 17.03541629998259,
    }

    filtered = _filter_duplicate_percentiles(percentiles)

    # Output ordering follows the Percentiles field order, not input order.
    assert list(filtered.keys()) == ["p10", "p50", "p90"]
120+
121+
122+
def test_model_dump_filters_duplicates():
    """model_dump() collapses duplicate percentile runs for small datasets."""
    from guidellm.benchmark.outputs.html import _TabularDistributionSummary

    low = 15.288091352804853
    mid = 16.41327511776994
    high = 17.03541629998259

    # Distribution with duplicate percentiles, typical of very small datasets.
    summary = _TabularDistributionSummary(
        mean=15.5,
        median=low,
        mode=low,
        variance=0.1,
        std_dev=0.316,
        min=low,
        max=high,
        count=3,
        total_sum=46.5,
        percentiles=Percentiles(
            p001=low,
            p01=low,
            p05=low,
            p10=low,
            p25=low,
            p50=mid,
            p75=mid,
            p90=high,
            p95=high,
            p99=high,
            p999=high,
        ),
    )

    data = summary.model_dump()

    # Duplicate runs collapse to their highest percentile.
    assert data["percentiles"] == {
        "p25": low,
        "p75": mid,
        "p999": high,
    }

    # Non-percentile fields pass through unchanged.
    assert data["mean"] == 15.5
    assert data["median"] == low
    assert data["count"] == 3

0 commit comments

Comments
 (0)