|
771 | 771 | "metadata": { |
772 | 772 | "id": "_dGO4a7eTbFX" |
773 | 773 | }, |
774 | | - "execution_count": 1, |
| 774 | + "execution_count": null, |
775 | 775 | "outputs": [] |
776 | 776 | }, |
777 | 777 | { |
|
797 | 797 | "id": "ms0ps6ZCT2xs", |
798 | 798 | "outputId": "6a5ffafe-73c6-45f9-f16b-36e400cc2ded" |
799 | 799 | }, |
800 | | - "execution_count": 2, |
| 800 | + "execution_count": null, |
801 | 801 | "outputs": [ |
802 | 802 | { |
803 | 803 | "output_type": "stream", |
|
865 | 865 | "metadata": { |
866 | 866 | "id": "8DCCwexcU6gO" |
867 | 867 | }, |
868 | | - "execution_count": 4, |
| 868 | + "execution_count": null, |
869 | 869 | "outputs": [] |
870 | 870 | }, |
871 | 871 | { |
|
949 | 949 | "id": "PtOufRspVekp", |
950 | 950 | "outputId": "711403bd-2543-4582-de1b-1e6b369045af" |
951 | 951 | }, |
952 | | - "execution_count": 5, |
| 952 | + "execution_count": null, |
953 | 953 | "outputs": [ |
954 | 954 | { |
955 | 955 | "output_type": "stream", |
|
1037 | 1037 | "id": "hQkMUyB0lROT", |
1038 | 1038 | "outputId": "01fc243a-7911-4762-f44b-4652f748e138" |
1039 | 1039 | }, |
1040 | | - "execution_count": 6, |
| 1040 | + "execution_count": null, |
1041 | 1041 | "outputs": [ |
1042 | 1042 | { |
1043 | 1043 | "output_type": "stream", |
|
1074 | 1074 | { |
1075 | 1075 | "cell_type": "markdown", |
1076 | 1076 | "source": [ |
1077 | | - "## Train RF-DETR on custom dataset" |
| 1077 | + "## Train RF-DETR on custom dataset\n", |
| 1078 | + "\n", |
| 1079 | + "### Choose the right `batch_size`\n", |
| 1080 | + "\n", |
| 1081 | + "Different GPUs have different amounts of VRAM (video memory), which limits how much data they can handle at once during training. To make training work well on any machine, you can adjust two settings: `batch_size` and `grad_accum_steps`. These control how many samples are processed at a time. The key is to keep their product equal to 16 — that’s our recommended total batch size. For example, on powerful GPUs like the A100, set `batch_size=16` and `grad_accum_steps=1`. On smaller GPUs like the T4, use `batch_size=4` and `grad_accum_steps=4`. We use a method called gradient accumulation, which lets the model simulate training with a larger batch size by gradually collecting updates before adjusting the weights.\n", |
| 1082 | + "\n", |
| 1083 | + "### Train with multiple GPUs\n", |
| 1084 | + "\n", |
| 1085 | + "You can fine-tune RF-DETR on multiple GPUs using PyTorch’s Distributed Data Parallel (DDP). Create a `main.py` script that initializes your model and calls `.train()` as usual, then run it in a terminal.\n", |
| 1086 | + "\n", |
| 1087 | + "```bash\n", |
| 1088 | + "python -m torch.distributed.launch \\\n", |
| 1089 | + " --nproc_per_node=8 \\\n", |
| 1090 | + " --use_env \\\n", |
| 1091 | + " main.py\n", |
| 1092 | + "```\n", |
| 1093 | + "\n", |
| 1094 | + "Replace `8` in the `--nproc_per_node` argument with the number of GPUs you want to use. This approach creates one training process per GPU and splits the workload automatically. Note that your effective batch size is multiplied by the number of GPUs, so you may need to adjust your `batch_size` and `grad_accum_steps` to maintain the same overall batch size." |
1078 | 1095 | ], |
1079 | 1096 | "metadata": { |
1080 | 1097 | "id": "vmT8f_bAq3zX" |
|
1143 | 1160 | "id": "7UEX3GVCmYaq", |
1144 | 1161 | "outputId": "a579ba09-43ab-4863-d2b6-94dbd77bbb6b" |
1145 | 1162 | }, |
1146 | | - "execution_count": 8, |
| 1163 | + "execution_count": null, |
1147 | 1164 | "outputs": [ |
1148 | 1165 | { |
1149 | 1166 | "output_type": "display_data", |
|
1203 | 1220 | "id": "gcjxmZeqqAdv", |
1204 | 1221 | "outputId": "356c1fea-316e-4b3d-888c-fe869e8adffd" |
1205 | 1222 | }, |
1206 | | - "execution_count": 9, |
| 1223 | + "execution_count": null, |
1207 | 1224 | "outputs": [ |
1208 | 1225 | { |
1209 | 1226 | "output_type": "display_data", |
|
1249 | 1266 | "metadata": { |
1250 | 1267 | "id": "xm-lmRWLswO4" |
1251 | 1268 | }, |
1252 | | - "execution_count": 10, |
| 1269 | + "execution_count": null, |
1253 | 1270 | "outputs": [] |
1254 | 1271 | }, |
1255 | 1272 | { |
|
1304 | 1321 | "id": "msor_5HgAkm3", |
1305 | 1322 | "outputId": "3c43efc2-c7d8-42b8-8d6d-c235f537222a" |
1306 | 1323 | }, |
1307 | | - "execution_count": 11, |
| 1324 | + "execution_count": null, |
1308 | 1325 | "outputs": [ |
1309 | 1326 | { |
1310 | 1327 | "output_type": "display_data", |
|
1365 | 1382 | "id": "RFEgIOz1YDCe", |
1366 | 1383 | "outputId": "8f885c5c-5122-44a6-d8ed-3783e442cc69" |
1367 | 1384 | }, |
1368 | | - "execution_count": 14, |
| 1385 | + "execution_count": null, |
1369 | 1386 | "outputs": [ |
1370 | 1387 | { |
1371 | 1388 | "output_type": "display_data", |
|
1412 | 1429 | "id": "szxs3PZsBVxa", |
1413 | 1430 | "outputId": "46b8ab4b-41a0-4fc3-ff63-bb28bad0b442" |
1414 | 1431 | }, |
1415 | | - "execution_count": 20, |
| 1432 | + "execution_count": null, |
1416 | 1433 | "outputs": [ |
1417 | 1434 | { |
1418 | 1435 | "output_type": "stream", |
|
1439 | 1456 | "id": "fxqvXOQcsRF2", |
1440 | 1457 | "outputId": "240e349f-4051-4c7a-eb74-2c521ac03e2b" |
1441 | 1458 | }, |
1442 | | - "execution_count": 21, |
| 1459 | + "execution_count": null, |
1443 | 1460 | "outputs": [ |
1444 | 1461 | { |
1445 | 1462 | "output_type": "display_data", |
|
1472 | 1489 | "id": "WuiNB-UM1xsJ", |
1473 | 1490 | "outputId": "822fea57-acb9-4d09-ad34-02aa640ad808" |
1474 | 1491 | }, |
1475 | | - "execution_count": 24, |
| 1492 | + "execution_count": null, |
1476 | 1493 | "outputs": [ |
1477 | 1494 | { |
1478 | 1495 | "output_type": "display_data", |
|
0 commit comments