deepspeedai · RezaYazdaniAminabadi · Nov 12, 2022 · Nov 11, 2022 · Nov 11, 2022 · Nov 12, 2022
@@ -1,6 +1,13 @@
 
 # DeepSpeed Huggingface Text Generation Examples
 
+# Contents
+   * [Setup](#setup)
+   * [Usage](#usage)
+   * [Single-batch Example](#single-batch-example)
+   * [Multi-batch Example](#multi-batch-example)
+   * [`DSPipeline` utility class](#dspipeline-utility-class)
+
 # Setup
 Python dependencies:
 <pre>
@@ -9,12 +16,13 @@ pip install -r requirements.txt
 
 # Usage
 Examples can be run as follows:
-<pre>deepspeed --num_gpus [number of GPUs] inference_test.py --name [model name/path] --batch_size [batch] --dtype [data type] 
+<pre>deepspeed --num_gpus [number of GPUs] inference-test.py --name [model name/path] --batch_size [batch] --dtype [data type]
 </pre>
+
 # Single-batch Example
 Command:
 <pre>
-deepspeed --num_gpus 1 inference_test.py --name facebook/opt-125m
+deepspeed --num_gpus 1 inference-test.py --name facebook/opt-125m
 </pre>
 
 Output:
@@ -27,7 +35,7 @@ out=DeepSpeed is a machine learning framework based on TensorFlow. It was first
 # Multi-batch Example
 Command:
 <pre>
-deepspeed --num_gpus 1 inference_test.py --name bigscience/bloom-3b --batch_size 2
+deepspeed --num_gpus 1 inference-test.py --name bigscience/bloom-3b --batch_size 2
 </pre>
 
 Output:
@@ -40,4 +48,16 @@ in=He is working on
 out=He is working on the new video game 'Bloodborne's' expansion pack. Check out the trailer here: Bloodborne's expansion pack includes a complete remaster of the original game, including over 120 maps, playable characters, new quests, and the possibility
  to bring Blood
 ------------------------------------------------------------     
-</pre>
+</pre>
+
+# `DSPipeline` utility class
+The text-generation examples use the [`DSPipeline`](utils.py) utility class, which helps with loading DeepSpeed meta tensors and is meant to mimic the Hugging Face Transformers pipeline.
+
+The BLOOM model is quite large, and the way DeepSpeed loads checkpoints for this model differs slightly from how it loads other HF models. Specifically, we use meta tensors to initialize the model before loading the weights:
+
+<pre>
+with deepspeed.OnDevice(dtype=self.dtype, device="meta"):
+</pre>
+
+This reduces the total system/GPU memory needed to load the model across multiple GPUs and makes the checkpoint loading faster.
+The `DSPipeline` class helps load the model and run inference on it, given these differences.
@@ -4,7 +4,7 @@
 import math
 import os
 import torch
-from utils import Pipeline
+from utils import DSPipeline
 
 
 parser = ArgumentParser()
@@ -18,24 +18,29 @@
 parser.add_argument("--greedy", default=False, type=bool, help="greedy generation mode")
 parser.add_argument("--use_meta_tensor", default=False, type=bool, help="use the meta tensors to initialize model")
 parser.add_argument("--use_cache", default=True, type=bool, help="use cache for generation")
+parser.add_argument("--local_rank", type=int, default=0, help="local rank")
 args = parser.parse_args()
 
-local_rank = int(os.getenv('LOCAL_RANK', '0'))
 world_size = int(os.getenv('WORLD_SIZE', '1'))
 
 data_type = getattr(torch, args.dtype)
-pipe = Pipeline(model_name=args.name,
-                dtype=data_type,
-                is_meta=args.use_meta_tensor,
-                device=local_rank
-)
+pipe = DSPipeline(model_name=args.name,
+                  dtype=data_type,
+                  is_meta=args.use_meta_tensor,
+                  device=args.local_rank)
+
+if args.use_meta_tensor:
+    ds_kwargs = dict(base_dir=pipe.repo_root, checkpoint=pipe.checkpoints_json)
+else:
+    ds_kwargs = dict()
 
 if args.ds_inference:
     pipe.model = deepspeed.init_inference(pipe.model,
                                     dtype=data_type,
                                     mp_size=world_size,
                                     replace_with_kernel_inject=True,
-                                    max_tokens=args.max_tokens
+                                    max_tokens=args.max_tokens,
+                                    **ds_kwargs
                                     )
 
 input_sentences = [
@@ -55,10 +60,9 @@
 
 inputs = input_sentences[:args.batch_size]
 
-outputs = pipe(inputs, 
-              num_tokens=args.max_new_tokens, 
-              do_sample=(not args.greedy),
-              use_cache=args.use_cache)
+outputs = pipe(inputs,
+              num_tokens=args.max_new_tokens,
+              do_sample=(not args.greedy))
 
 for i, o in zip(inputs, outputs):
     print(f"\nin={i}\nout={o}\n{'-'*60}")

@@ -11,8 +11,11 @@
 from huggingface_hub import snapshot_download
 from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
 
-class Pipeline():
-    '''Example helper class, meant to mimic HF pipelines'''
+class DSPipeline():
+    '''
+    Example helper class for comprehending DeepSpeed Meta Tensors, meant to mimic HF pipelines.
+    The DSPipeline can run with and without meta tensors.
+    '''
     def __init__(self,
                  model_name='bigscience/bloom-3b',
                  dtype=torch.float16,