Skip to content

Commit 1f09760

Browse files
tukwilasjmonson
authored and committed
UT for src/guidellm/data/deserializers/huggingface.py
Signed-off-by: guangli.bao <[email protected]>
1 parent 5f25b9b commit 1f09760

File tree

1 file changed

+61
-0
lines changed

1 file changed

+61
-0
lines changed
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
import pytest
2+
from datasets import Dataset
3+
4+
from guidellm.data.deserializers.huggingface import (
5+
HuggingFaceDatasetDeserializer,
6+
)
7+
8+
9+
@pytest.fixture
def processor_factory() -> None:
    """Supply the processor-factory argument used in the deserializer calls.

    The tests in this module pass this value as the second positional
    argument to the deserializer; no processor is provided, so it is ``None``.
    """
    no_processor = None
    return no_processor
12+
13+
14+
@pytest.fixture
def deserializer():
    """Return a new ``HuggingFaceDatasetDeserializer`` built with no arguments."""
    instance = HuggingFaceDatasetDeserializer()
    return instance
17+
18+
19+
def test_hf_dataset_direct_return(deserializer, processor_factory):
    """An in-memory ``Dataset`` given to the deserializer is returned as-is.

    The check is on identity (``is``), not equality: the very same object
    must come back.
    """
    # build one simple HF dataset
    source = Dataset.from_dict({"text": ["hello", "world"]})
    returned = deserializer(source, processor_factory, random_seed=42)
    assert returned is source, "return original Dataset object"
24+
25+
26+
def test_local_hf_directory_dataset(deserializer, processor_factory, tmp_path):
    """Round-trip a dataset through ``save_to_disk`` and the deserializer.

    A two-row dataset is written under pytest's ``tmp_path`` and the
    resulting directory path is handed to the deserializer. The result must
    be a ``Dataset`` whose columns hold the values that were saved.
    """
    # --- 1. build one simple HF dataset ---
    dataset = Dataset.from_dict({"id": [1, 2], "text": ["a", "b"]})
    # --- 2. Save to a local directory ---
    dataset_dir = tmp_path / "local_hf_dataset"
    dataset.save_to_disk(dataset_dir)

    # --- 3. call HF DatasetDeserializer ---
    result = deserializer(
        dataset_dir,
        processor_factory,
        random_seed=123,
    )

    # --- 4. assertion ---
    assert isinstance(result, Dataset)
    # Materialise each column before comparing so the check does not depend on
    # the container type that column access returns.
    # NOTE(review): newer `datasets` releases may return a lazy column object
    # rather than a plain list -- confirm against the pinned version.
    assert list(result["text"]) == ["a", "b"]
    assert list(result["id"]) == [1, 2]
43+
44+
45+
@pytest.mark.parametrize(
    "internal_ds_name",
    [
        "mnist",
        "imdb",
    ],
)
def test_hf_internal_dataset(deserializer, processor_factory, internal_ds_name):
    """Load a dataset by name and check the shape of what comes back.

    The deserializer is called with a bare dataset name. The result must be
    either a single ``Dataset`` or a dict-like mapping that contains both a
    ``"train"`` and a ``"test"`` entry.

    NOTE(review): resolving these names presumably requires access to the
    Hugging Face Hub (or a populated local cache) -- confirm how CI provides
    that.
    """
    result = deserializer(internal_ds_name, processor_factory, random_seed=42)

    # Tuple form is equivalent to the union form and also works on
    # interpreters that predate ``isinstance`` support for ``X | Y``.
    assert isinstance(result, (Dataset, dict)), "HF dataset loading failed"

    # Evaluate the cheap type check first: a membership test against a single
    # ``Dataset`` would otherwise be attempted before the ``isinstance``
    # short-circuit (and presumably iterates rows -- TODO confirm).
    is_single_dataset = isinstance(result, Dataset)
    assert is_single_dataset or "train" in result, (
        "Expected 'train' split in the loaded dataset"
    )
    assert is_single_dataset or "test" in result, (
        "Expected 'test' split in the loaded dataset"
    )

0 commit comments

Comments
 (0)