Skip to content

Commit 113c2c8

Browse files
[CI] Enhance CI testing to cover both v6e and v7x by parameterizing the test queues, step labels, and step keys.
Signed-off-by: dennis yeh <[email protected]>
1 parent b72c41a commit 113c2c8

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

52 files changed

+1183
-961
lines changed
Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,44 +1,48 @@
11
# Collective Communication Matmul
22
# kernel support matrix
33
steps:
4-
- label: "Correctness tests for Collective Communication Matmul"
5-
key: "Collective_Communication_Matmul_CorrectnessTest"
4+
- label: "${TEST_LABEL_PREFIX} Correctness tests for Collective Communication Matmul"
5+
key: "${TEST_KEY_PREFIX}Collective_Communication_Matmul_CorrectnessTest"
66
soft_fail: true
77
agents:
8-
queue: tpu_v6e_queue
8+
queue: "${TPU_SMALL_CORE_QUEUE}"
9+
env:
10+
IS_FOR_V7X: "${IS_FOR_V7X}"
911
commands:
1012
- .buildkite/scripts/run_in_docker.sh python3 -m pytest -s -v /workspace/tpu_inference/tests/kernels/collectives/all_gather_matmul_kernel_test.py
11-
- label: "Record correctness test result for Collective Communication Matmul"
12-
key: "record_Collective_Communication_Matmul_CorrectnessTest"
13-
depends_on: "Collective_Communication_Matmul_CorrectnessTest"
13+
- label: "${TEST_LABEL_PREFIX} Record correctness test result for Collective Communication Matmul"
14+
key: "${TEST_KEY_PREFIX}record_Collective_Communication_Matmul_CorrectnessTest"
15+
depends_on: "${TEST_KEY_PREFIX}Collective_Communication_Matmul_CorrectnessTest"
1416
env:
15-
CI_TARGET: "Collective Communication Matmul"
17+
CI_TARGET: "${TEST_LABEL_PREFIX}Collective Communication Matmul"
1618
CI_STAGE: "CorrectnessTest"
1719
CI_CATEGORY: "kernel support matrix"
1820
agents:
1921
queue: cpu
2022
commands:
2123
- |
22-
.buildkite/scripts/record_step_result.sh Collective_Communication_Matmul_CorrectnessTest
24+
.buildkite/scripts/record_step_result.sh ${TEST_KEY_PREFIX}Collective_Communication_Matmul_CorrectnessTest
2325
24-
- label: "Performance tests for Collective Communication Matmul"
25-
key: "Collective_Communication_Matmul_PerformanceTest"
26-
depends_on: "record_Collective_Communication_Matmul_CorrectnessTest"
26+
- label: "${TEST_LABEL_PREFIX} Performance tests for Collective Communication Matmul"
27+
key: "${TEST_KEY_PREFIX}Collective_Communication_Matmul_PerformanceTest"
28+
depends_on: "${TEST_KEY_PREFIX}record_Collective_Communication_Matmul_CorrectnessTest"
2729
soft_fail: true
2830
agents:
29-
queue: tpu_v6e_queue
31+
queue: "${TPU_SMALL_CORE_QUEUE}"
32+
env:
33+
IS_FOR_V7X: "${IS_FOR_V7X}"
3034
commands:
3135
- |
3236
buildkite-agent meta-data set "Collective_Communication_Matmul_PerformanceTest" "unverified"
33-
- label: "Record performance test result for Collective Communication Matmul"
34-
key: "record_Collective_Communication_Matmul_PerformanceTest"
35-
depends_on: "Collective_Communication_Matmul_PerformanceTest"
37+
- label: "${TEST_LABEL_PREFIX} Record performance test result for Collective Communication Matmul"
38+
key: "${TEST_KEY_PREFIX}record_Collective_Communication_Matmul_PerformanceTest"
39+
depends_on: "${TEST_KEY_PREFIX}Collective_Communication_Matmul_PerformanceTest"
3640
env:
37-
CI_TARGET: "Collective Communication Matmul"
41+
CI_TARGET: "${TEST_LABEL_PREFIX}Collective Communication Matmul"
3842
CI_STAGE: "PerformanceTest"
3943
CI_CATEGORY: "kernel support matrix"
4044
agents:
4145
queue: cpu
4246
commands:
4347
- |
44-
.buildkite/scripts/record_step_result.sh Collective_Communication_Matmul_PerformanceTest
48+
.buildkite/scripts/record_step_result.sh ${TEST_KEY_PREFIX}Collective_Communication_Matmul_PerformanceTest
Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,49 @@
11
# DCN-based P/D disaggregation
22
# feature support matrix
33
steps:
4-
- label: "Correctness tests for DCN-based P/D disaggregation"
5-
key: "DCN_based_P-D_disaggregation_CorrectnessTest"
4+
- label: "${TEST_LABEL_PREFIX} Correctness tests for DCN-based P/D disaggregation"
5+
key: "${TEST_KEY_PREFIX}DCN_based_P-D_disaggregation_CorrectnessTest"
66
soft_fail: true
77
agents:
8-
queue: tpu_v6e_queue
8+
queue: "${TPU_SMALL_CORE_QUEUE}"
9+
env:
10+
IS_FOR_V7X: "${IS_FOR_V7X}"
911
commands:
1012
- |
1113
buildkite-agent meta-data set "DCN_based_P-D_disaggregation_CorrectnessTest" "unverified"
12-
- label: "Record correctness test result for DCN-based P/D disaggregation"
13-
key: "record_DCN_based_P-D_disaggregation_CorrectnessTest"
14-
depends_on: "DCN_based_P-D_disaggregation_CorrectnessTest"
14+
- label: "${TEST_LABEL_PREFIX} Record correctness test result for DCN-based P/D disaggregation"
15+
key: "${TEST_KEY_PREFIX}record_DCN_based_P-D_disaggregation_CorrectnessTest"
16+
depends_on: "${TEST_KEY_PREFIX}DCN_based_P-D_disaggregation_CorrectnessTest"
1517
env:
16-
CI_TARGET: "DCN-based P/D disaggregation"
18+
CI_TARGET: "${TEST_LABEL_PREFIX}DCN-based P/D disaggregation"
1719
CI_STAGE: "CorrectnessTest"
1820
CI_CATEGORY: "feature support matrix"
1921
agents:
2022
queue: cpu
2123
commands:
2224
- |
23-
.buildkite/scripts/record_step_result.sh DCN_based_P-D_disaggregation_CorrectnessTest
25+
.buildkite/scripts/record_step_result.sh ${TEST_KEY_PREFIX}DCN_based_P-D_disaggregation_CorrectnessTest
2426
25-
- label: "Performance tests for DCN-based P/D disaggregation"
26-
key: "DCN_based_P-D_disaggregation_PerformanceTest"
27-
depends_on: "record_DCN_based_P-D_disaggregation_CorrectnessTest"
27+
- label: "${TEST_LABEL_PREFIX} Performance tests for DCN-based P/D disaggregation"
28+
key: "${TEST_KEY_PREFIX}DCN_based_P-D_disaggregation_PerformanceTest"
29+
depends_on: "${TEST_KEY_PREFIX}record_DCN_based_P-D_disaggregation_CorrectnessTest"
2830
soft_fail: true
2931
agents:
30-
queue: tpu_v6e_queue
32+
queue: "${TPU_SMALL_CORE_QUEUE}"
33+
env:
34+
IS_FOR_V7X: "${IS_FOR_V7X}"
3135
commands:
3236
- |
3337
buildkite-agent meta-data set "DCN_based_P-D_disaggregation_PerformanceTest" "unverified"
34-
- label: "Record performance test result for DCN-based P/D disaggregation"
35-
key: "record_DCN_based_P-D_disaggregation_PerformanceTest"
36-
depends_on: "DCN_based_P-D_disaggregation_PerformanceTest"
38+
- label: "${TEST_LABEL_PREFIX} Record performance test result for DCN-based P/D disaggregation"
39+
key: "${TEST_KEY_PREFIX}record_DCN_based_P-D_disaggregation_PerformanceTest"
40+
depends_on: "${TEST_KEY_PREFIX}DCN_based_P-D_disaggregation_PerformanceTest"
3741
env:
38-
CI_TARGET: "DCN-based P/D disaggregation"
42+
CI_TARGET: "${TEST_LABEL_PREFIX}DCN-based P/D disaggregation"
3943
CI_STAGE: "PerformanceTest"
4044
CI_CATEGORY: "feature support matrix"
4145
agents:
4246
queue: cpu
4347
commands:
4448
- |
45-
.buildkite/scripts/record_step_result.sh DCN_based_P-D_disaggregation_PerformanceTest
49+
.buildkite/scripts/record_step_result.sh ${TEST_KEY_PREFIX}DCN_based_P-D_disaggregation_PerformanceTest
Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,49 @@
11
# KV cache host offloading
22
# feature support matrix
33
steps:
4-
- label: "Correctness tests for KV cache host offloading"
5-
key: "KV_Cache_Host_Offloading_CorrectnessTest"
4+
- label: "${TEST_LABEL_PREFIX} Correctness tests for KV cache host offloading"
5+
key: "${TEST_KEY_PREFIX}KV_Cache_Host_Offloading_CorrectnessTest"
66
soft_fail: true
77
agents:
8-
queue: tpu_v6e_queue
8+
queue: "${TPU_SMALL_CORE_QUEUE}"
9+
env:
10+
IS_FOR_V7X: "${IS_FOR_V7X}"
911
commands:
1012
- |
1113
buildkite-agent meta-data set "KV_Cache_Host_Offloading_CorrectnessTest" "unverified"
12-
- label: "Record correctness test result for KV cache host offloading"
13-
key: "record_KV_Cache_Host_Offloading_CorrectnessTest"
14-
depends_on: "KV_Cache_Host_Offloading_CorrectnessTest"
14+
- label: "${TEST_LABEL_PREFIX} Record correctness test result for KV cache host offloading"
15+
key: "${TEST_KEY_PREFIX}record_KV_Cache_Host_Offloading_CorrectnessTest"
16+
depends_on: "${TEST_KEY_PREFIX}KV_Cache_Host_Offloading_CorrectnessTest"
1517
env:
16-
CI_TARGET: "KV cache host offloading"
18+
CI_TARGET: "${TEST_LABEL_PREFIX}KV cache host offloading"
1719
CI_STAGE: "CorrectnessTest"
1820
CI_CATEGORY: "feature support matrix"
1921
agents:
2022
queue: cpu
2123
commands:
2224
- |
23-
.buildkite/scripts/record_step_result.sh KV_Cache_Host_Offloading_CorrectnessTest
25+
.buildkite/scripts/record_step_result.sh ${TEST_KEY_PREFIX}KV_Cache_Host_Offloading_CorrectnessTest
2426
25-
- label: "Performance tests for KV cache host offloading"
26-
key: "KV_Cache_Host_Offloading_PerformanceTest"
27-
depends_on: "record_KV_Cache_Host_Offloading_CorrectnessTest"
27+
- label: "${TEST_LABEL_PREFIX} Performance tests for KV cache host offloading"
28+
key: "${TEST_KEY_PREFIX}KV_Cache_Host_Offloading_PerformanceTest"
29+
depends_on: "${TEST_KEY_PREFIX}record_KV_Cache_Host_Offloading_CorrectnessTest"
2830
soft_fail: true
2931
agents:
30-
queue: tpu_v6e_queue
32+
queue: "${TPU_SMALL_CORE_QUEUE}"
33+
env:
34+
IS_FOR_V7X: "${IS_FOR_V7X}"
3135
commands:
3236
- |
3337
buildkite-agent meta-data set "KV_Cache_Host_Offloading_PerformanceTest" "unverified"
34-
- label: "Record performance test result for KV cache host offloading"
35-
key: "record_KV_Cache_Host_Offloading_PerformanceTest"
36-
depends_on: "KV_Cache_Host_Offloading_PerformanceTest"
38+
- label: "${TEST_LABEL_PREFIX} Record performance test result for KV cache host offloading"
39+
key: "${TEST_KEY_PREFIX}record_KV_Cache_Host_Offloading_PerformanceTest"
40+
depends_on: "${TEST_KEY_PREFIX}KV_Cache_Host_Offloading_PerformanceTest"
3741
env:
38-
CI_TARGET: "KV cache host offloading"
42+
CI_TARGET: "${TEST_LABEL_PREFIX}KV cache host offloading"
3943
CI_STAGE: "PerformanceTest"
4044
CI_CATEGORY: "feature support matrix"
4145
agents:
4246
queue: cpu
4347
commands:
4448
- |
45-
.buildkite/scripts/record_step_result.sh KV_Cache_Host_Offloading_PerformanceTest
49+
.buildkite/scripts/record_step_result.sh ${TEST_KEY_PREFIX}KV_Cache_Host_Offloading_PerformanceTest

.buildkite/features/LoRA_Torch.yml

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,47 +1,51 @@
11
# LoRA_Torch
22
# feature support matrix
33
steps:
4-
- label: "Correctness tests for LoRA_Torch"
5-
key: "LoRA_Torch_CorrectnessTest"
4+
- label: "${TEST_LABEL_PREFIX} Correctness tests for LoRA_Torch"
5+
key: "${TEST_KEY_PREFIX}LoRA_Torch_CorrectnessTest"
66
soft_fail: true
77
agents:
8-
queue: tpu_v6e_queue
8+
queue: "${TPU_SMALL_CORE_QUEUE}"
9+
env:
10+
IS_FOR_V7X: "${IS_FOR_V7X}"
911
commands:
1012
- |
1113
.buildkite/scripts/run_in_docker.sh \
1214
bash -c 'MODEL_IMPL_TYPE=vllm TPU_BACKEND_TYPE=jax python3 -m pytest -s -v -x /workspace/tpu_inference/tests/lora/test_lora.py'
13-
- label: "Record correctness test result for LoRA_Torch"
14-
key: "record_LoRA_Torch_CorrectnessTest"
15-
depends_on: "LoRA_Torch_CorrectnessTest"
15+
- label: "${TEST_LABEL_PREFIX} Record correctness test result for LoRA_Torch"
16+
key: "${TEST_KEY_PREFIX}record_LoRA_Torch_CorrectnessTest"
17+
depends_on: "${TEST_KEY_PREFIX}LoRA_Torch_CorrectnessTest"
1618
env:
17-
CI_TARGET: "LoRA_Torch"
19+
CI_TARGET: "${TEST_LABEL_PREFIX}LoRA_Torch"
1820
CI_STAGE: "CorrectnessTest"
1921
CI_CATEGORY: "feature support matrix"
2022
agents:
2123
queue: cpu
2224
commands:
2325
- |
24-
.buildkite/scripts/record_step_result.sh LoRA_Torch_CorrectnessTest
26+
.buildkite/scripts/record_step_result.sh ${TEST_KEY_PREFIX}LoRA_Torch_CorrectnessTest
2527
26-
- label: "Performance tests for LoRA_Torch"
27-
key: "LoRA_Torch_PerformanceTest"
28-
depends_on: "record_LoRA_Torch_CorrectnessTest"
28+
- label: "${TEST_LABEL_PREFIX} Performance tests for LoRA_Torch"
29+
key: "${TEST_KEY_PREFIX}LoRA_Torch_PerformanceTest"
30+
depends_on: "${TEST_KEY_PREFIX}record_LoRA_Torch_CorrectnessTest"
2931
soft_fail: true
3032
agents:
31-
queue: tpu_v6e_queue
33+
queue: "${TPU_SMALL_CORE_QUEUE}"
34+
env:
35+
IS_FOR_V7X: "${IS_FOR_V7X}"
3236
commands:
3337
- |
3438
.buildkite/scripts/run_in_docker.sh \
3539
bash -c 'MODEL_IMPL_TYPE=vllm TPU_BACKEND_TYPE=jax python3 -m pytest -s -v -x /workspace/tpu_inference/tests/lora/test_lora_perf.py'
36-
- label: "Record performance test result for LoRA_Torch"
37-
key: "record_LoRA_Torch_PerformanceTest"
38-
depends_on: "LoRA_Torch_PerformanceTest"
40+
- label: "${TEST_LABEL_PREFIX} Record performance test result for LoRA_Torch"
41+
key: "${TEST_KEY_PREFIX}record_LoRA_Torch_PerformanceTest"
42+
depends_on: "${TEST_KEY_PREFIX}LoRA_Torch_PerformanceTest"
3943
env:
40-
CI_TARGET: "LoRA_Torch"
44+
CI_TARGET: "${TEST_LABEL_PREFIX}LoRA_Torch"
4145
CI_STAGE: "PerformanceTest"
4246
CI_CATEGORY: "feature support matrix"
4347
agents:
4448
queue: cpu
4549
commands:
4650
- |
47-
.buildkite/scripts/record_step_result.sh LoRA_Torch_PerformanceTest
51+
.buildkite/scripts/record_step_result.sh ${TEST_KEY_PREFIX}LoRA_Torch_PerformanceTest

.buildkite/features/MLA.yml

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,49 @@
11
# MLA
22
# kernel support matrix
33
steps:
4-
- label: "Correctness tests for MLA"
5-
key: "MLA_CorrectnessTest"
4+
- label: "${TEST_LABEL_PREFIX} Correctness tests for MLA"
5+
key: "${TEST_KEY_PREFIX}MLA_CorrectnessTest"
66
soft_fail: true
77
agents:
8-
queue: tpu_v6e_queue
8+
queue: "${TPU_SMALL_CORE_QUEUE}"
9+
env:
10+
IS_FOR_V7X: "${IS_FOR_V7X}"
911
commands:
1012
- |
1113
buildkite-agent meta-data set "MLA_CorrectnessTest" "unverified"
12-
- label: "Record correctness test result for MLA"
13-
key: "record_MLA_CorrectnessTest"
14-
depends_on: "MLA_CorrectnessTest"
14+
- label: "${TEST_LABEL_PREFIX} Record correctness test result for MLA"
15+
key: "${TEST_KEY_PREFIX}record_MLA_CorrectnessTest"
16+
depends_on: "${TEST_KEY_PREFIX}MLA_CorrectnessTest"
1517
env:
16-
CI_TARGET: "MLA"
18+
CI_TARGET: "${TEST_LABEL_PREFIX}MLA"
1719
CI_STAGE: "CorrectnessTest"
1820
CI_CATEGORY: "kernel support matrix"
1921
agents:
2022
queue: cpu
2123
commands:
2224
- |
23-
.buildkite/scripts/record_step_result.sh MLA_CorrectnessTest
25+
.buildkite/scripts/record_step_result.sh ${TEST_KEY_PREFIX}MLA_CorrectnessTest
2426
25-
- label: "Performance tests for MLA"
26-
key: "MLA_PerformanceTest"
27-
depends_on: "record_MLA_CorrectnessTest"
27+
- label: "${TEST_LABEL_PREFIX} Performance tests for MLA"
28+
key: "${TEST_KEY_PREFIX}MLA_PerformanceTest"
29+
depends_on: "${TEST_KEY_PREFIX}record_MLA_CorrectnessTest"
2830
soft_fail: true
2931
agents:
30-
queue: tpu_v6e_queue
32+
queue: "${TPU_SMALL_CORE_QUEUE}"
33+
env:
34+
IS_FOR_V7X: "${IS_FOR_V7X}"
3135
commands:
3236
- |
3337
buildkite-agent meta-data set "MLA_PerformanceTest" "unverified"
34-
- label: "Record performance test result for MLA"
35-
key: "record_MLA_PerformanceTest"
36-
depends_on: "MLA_PerformanceTest"
38+
- label: "${TEST_LABEL_PREFIX} Record performance test result for MLA"
39+
key: "${TEST_KEY_PREFIX}record_MLA_PerformanceTest"
40+
depends_on: "${TEST_KEY_PREFIX}MLA_PerformanceTest"
3741
env:
38-
CI_TARGET: "MLA"
42+
CI_TARGET: "${TEST_LABEL_PREFIX}MLA"
3943
CI_STAGE: "PerformanceTest"
4044
CI_CATEGORY: "kernel support matrix"
4145
agents:
4246
queue: cpu
4347
commands:
4448
- |
45-
.buildkite/scripts/record_step_result.sh MLA_PerformanceTest
49+
.buildkite/scripts/record_step_result.sh ${TEST_KEY_PREFIX}MLA_PerformanceTest

0 commit comments

Comments
 (0)