Change DS-Chat script flags for deployment type (deepspeedai#291)

mrwyattii · yaozhewei · web-flow · commit d570b2cc8a8f · 2023-04-13T19:26:46.000-04:00
* refactor num-gpus flag to deployment-type

* update docs

* improve error message

---------

Co-authored-by: Zhewei Yao <zheweiy@berkeley.edu>
diff --git a/applications/DeepSpeed-Chat/README.md b/applications/DeepSpeed-Chat/README.md
@@ -116,7 +116,7 @@ pip install -r requirements.txt
 If you only have around **1-2 hour** for coffee or lunch break, you can also try to train a small/toy model with DeepSpeed-Chat. For example, we prepared a training example for a **1.3B** model with a single dataset to test our framework on your consumer-grade GPUs. The best part is that you will have your model checkpoint ready to play with when you are back from your lunch break!
 
   ```bash
-  python train.py --actor-model facebook/opt-1.3b --reward-model facebook/opt-350m --num-gpus 1
+  python train.py --actor-model facebook/opt-1.3b --reward-model facebook/opt-350m --deployment-type single_gpu
   ```
 
   See the following table for the E2E time breakdown for training a 1.3 billion parameter ChatGPT model via DeepSpeed-Chat on a single commodity NVIDIA A6000 GPU with 48GB memory.
@@ -136,7 +136,7 @@ If you only have around **1-2 hour** for coffee or lunch break, you can also try
 If you only have around **half a day** and only a single server node, we suggest to use an example of pretrained **OPT-13B** as the actor model and OPT-350M as the reward model in the following single script to generate a final 13B ChatGPT-style model:
 
   ```bash
-  python train.py --actor-model facebook/opt-13b --reward-model facebook/opt-350m --num-gpus 8
+  python train.py --actor-model facebook/opt-13b --reward-model facebook/opt-350m --deployment-type single_node
   ```
 
   See the following table for the E2E time breakdown for training a 13 billion parameter ChatGPT model via DeepSpeed-Chat on a single DGX node with 8 NVIDIA A100-40G GPUs.
@@ -155,7 +155,7 @@ If you only have around **half a day** and only a single server node, we suggest
 Want to try different model sizes and configurations? You got it! With DeepSpeed-Chat, users can easily do that. For example, if you have access to multi-nodes cluster or cloud resources and prefer to train a larger and higher-quality model for your research or business, you can simply use a similar script with your desired model sizes, e.g., **66B** and GPU counts=64:
 
   ```bash
-  python train.py --actor-model facebook/opt-66b --reward-model facebook/opt-350m --num-gpus 64
+  python train.py --actor-model facebook/opt-66b --reward-model facebook/opt-350m --deployment-type multi_node
   ```
 
   See the following table for E2E time breakdown for training a 66 billion parameter ChatGPT model via DeepSpeed-Chat on 8 DGX nodes with 8 NVIDIA A100-80G GPUs/node.
diff --git a/applications/DeepSpeed-Chat/train.py b/applications/DeepSpeed-Chat/train.py
@@ -40,16 +40,7 @@
     3: "training/step3_rlhf_finetuning",
 }
 model_type = {1: "actor", 2: "reward", 3: "step3"}
-default_zero_stage = {
-    "single_node": {
-        "1.3b": 2,
-        "6.7b": 3,
-        "13b": 3
-    },
-    "multi_node": {
-        "66b": 3
-    },
-}
+dse_url = "https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/"
 
 
 def parse_args():
@@ -97,10 +88,10 @@ def parse_args():
         help="Directory for output of each step",
     )
     parser.add_argument(
-        "--num-gpus",
-        type=int,
-        default=1,
-        choices=(1, 8, 64),
+        "--deployment-type",
+        type=str,
+        default="single_gpu",
+        choices=("single_gpu", "single_node", "multi_node"),
         help="Number of GPUs to run the actor/reward models on",
     )
     args = parser.parse_args()
@@ -110,15 +101,6 @@ def parse_args():
             "Non-default zero stages may result in OOM errors or worse performance."
         )
 
-    if args.num_gpus == 1:
-        args.script_type = "single_gpu"
-    elif args.num_gpus == 8:
-        args.script_type = "single_node"
-    elif args.num_gpus == 64:
-        args.script_type = "multi_node"
-    else:
-        raise NotImplementedError(
-            f"{args.num_gpus} GPUs not supported by this script")
     return args
 
 
@@ -146,7 +128,7 @@ def get_script(args, step_num):
         os.getcwd(),
         step_dirs[step_num],
         "training_scripts",
-        args.script_type,
+        args.deployment_type,
         f"run_{model_size}.sh",
     )
     assert os.path.isfile(
@@ -184,13 +166,23 @@ def get_cmd(args, step_num):
     return cmd
 
 
-def launch_cmd(cmd, step_num):
+def launch_cmd(args, step_num, cmd):
     working_dir = step_dirs[step_num]
+    print(f"Running:\n{cmd}")
     p = subprocess.Popen(cmd, cwd=working_dir, shell=True)
     p.wait()
     if p.returncode != 0:
-        raise RuntimeError(
-            f"Step {step_num} exited with non-zero status {p.returncode}")
+        raise RuntimeError('\n\n'.join((
+            f"Step {step_num} exited with non-zero status {p.returncode}",
+            f"Launch command: {cmd}",
+            f"Log output: {os.path.join(get_output_dir(args, step_num), 'training.log')}",
+            f"Please see our tutorial at {dse_url}{step_dirs[step_num]}",
+            "Please check that you have installed our requirements: `pip install -r requirements.txt`",
+            f"If you are seeing an OOM error, try modifying {get_script(args, step_num)}:",
+            "  - Reduce `--per_device_*_batch_size`",
+            "  - Increase `--zero_stage {0,1,2,3}` on multi-gpu setups",
+            "  - Enable `--gradient_checkpointing` or `--only_optimizer_lora`"
+        )))
 
 
 def main(args):
@@ -200,7 +192,7 @@ def main(args):
         step_start_time = time.time()
 
         cmd = get_cmd(args, step_num)
-        launch_cmd(cmd, step_num)
+        launch_cmd(args, step_num, cmd)
 
         step_time = int(time.time() - start_time)
         time_str = str(datetime.timedelta(seconds=step_time))