|
| 1 | +import pytest |
| 2 | +from datasets import Dataset |
| 3 | + |
| 4 | +from guidellm.data.deserializers.huggingface import ( |
| 5 | + HuggingFaceDatasetDeserializer, |
| 6 | +) |
| 7 | + |
| 8 | + |
@pytest.fixture
def processor_factory() -> None:
    """Supply the ``processor_factory`` argument used by the tests in this module.

    The tests forward this value unchanged as the second positional argument of
    the deserializer call; here it is always ``None``.
    """
    return None
| 12 | + |
| 13 | + |
@pytest.fixture
def deserializer() -> HuggingFaceDatasetDeserializer:
    """Return a newly constructed ``HuggingFaceDatasetDeserializer``."""
    instance = HuggingFaceDatasetDeserializer()
    return instance
| 17 | + |
| 18 | + |
def test_hf_dataset_direct_return(deserializer, processor_factory):
    """An in-memory ``Dataset`` given to the deserializer comes back unchanged."""
    in_memory = Dataset.from_dict({"text": ["hello", "world"]})

    returned = deserializer(in_memory, processor_factory, random_seed=42)

    # Identity, not equality: the very same object must be handed back.
    assert returned is in_memory, "return original Dataset object"
| 24 | + |
| 25 | + |
def test_local_hf_directory_dataset(deserializer, processor_factory, tmp_path):
    """A dataset written with ``save_to_disk`` is read back from its directory."""
    # Arrange: write a two-row dataset under pytest's temporary directory.
    saved_path = tmp_path / "local_hf_dataset"
    Dataset.from_dict({"id": [1, 2], "text": ["a", "b"]}).save_to_disk(saved_path)

    # Act: hand the directory path (not the Dataset object) to the deserializer.
    loaded = deserializer(saved_path, processor_factory, random_seed=123)

    # Assert: a Dataset comes back whose "text" column matches what was saved.
    assert isinstance(loaded, Dataset)
    assert loaded["text"] == ["a", "b"]
| 43 | + |
| 44 | + |
@pytest.mark.parametrize("internal_ds_name", ["mnist", "imdb"])
def test_hf_internal_dataset(deserializer, processor_factory, internal_ds_name):
    """Deserialize a dataset referenced by name and check the expected splits.

    The deserializer is called with a bare dataset name (``"mnist"`` or
    ``"imdb"``) instead of an object or a local path.
    NOTE(review): resolving these names presumably needs network access to the
    Hugging Face Hub -- confirm before running this in an offline CI job.

    A single ``Dataset`` result is accepted as-is; any other result must be a
    ``dict`` that contains both a ``"train"`` and a ``"test"`` entry.
    """
    result = deserializer(internal_ds_name, processor_factory, random_seed=42)

    assert isinstance(result, Dataset | dict), "HF dataset loading failed"

    # Test the type first so the membership checks below only ever run against
    # a dict; a single Dataset has no named splits to look up.
    if isinstance(result, Dataset):
        return

    for split in ("train", "test"):
        assert split in result, f"Expected '{split}' split in the loaded dataset"
0 commit comments