Add BLOOM huggingface inference example (deepspeedai#211)

awan-10 · lekurile · web-flow · commit 7d0260bdc06a · 2022-10-28T11:03:06.000-07:00
This PR adds a BLOOM inference example (bigscience/bloom-3b) and a corresponding helper Pipeline class that mimics the functionality and API of the Hugging Face pipelines. The class was added to handle BLOOM meta tensors and checkpoint loading in a more organized way that closely matches the existing examples.

This PR also cleans up extra whitespace across the inference examples.

Co-authored-by: Lev Kurilenko <lekurile@microsoft.com>
diff --git a/inference/huggingface/automatic-speech-recognition/test-wav2vec2.py b/inference/huggingface/automatic-speech-recognition/test-wav2vec2.py
@@ -27,7 +27,7 @@
                                  dtype=torch.float,
                                  injection_policy={Wav2Vec2EncoderLayer: ('attention.out_proj','feed_forward.output_dense')},
                                  replace_with_kernel_inject=False)
-model.to(f'cuda:{local_rank}')                                 
+model.to(f'cuda:{local_rank}')
 def map_to_array(batch):
     speech, _ = sf.read(batch["file"])
     batch["speech"] = speech
diff --git a/inference/huggingface/text-generation/run-generation-script/test-run-generation.py b/inference/huggingface/text-generation/run-generation-script/test-run-generation.py
@@ -193,7 +193,7 @@ def main():
         required=False,
         help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
     )
-    
+
     parser.add_argument("--prompt", type=str, default="")
     parser.add_argument("--length", type=int, default=20)
     parser.add_argument("--stop_token", type=str, default=None, help="Token at which text generation is stopped")
@@ -214,7 +214,7 @@ def main():
     parser.add_argument("--padding_text", type=str, default="", help="Deprecated, the use of `--prefix` is preferred.")
     parser.add_argument("--xlm_language", type=str, default="", help="Optional language when used with the XLM model.")
 
-    parser.add_argument("--local_rank", type=int, default=0, help="local rank")  
+    parser.add_argument("--local_rank", type=int, default=0, help="local rank")
     parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
     parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
     parser.add_argument("--num_return_sequences", type=int, default=1, help="The number of samples to generate.")
@@ -235,7 +235,7 @@ def main():
         args.n_gpu,
         args.fp16,
     )
-    
+
     set_seed(args)
 
     # Initialize the model and tokenizer
@@ -256,9 +256,9 @@ def main():
     if args.ds_inference:
         import deepspeed.module_inject as module_inject
         import deepspeed
-        injection_policy={gpt2_transformer: 
+        injection_policy={gpt2_transformer:
                           module_inject.replace_policy.HFGPT2LayerPolicy}
-        model = deepspeed.init_inference(model, 
+        model = deepspeed.init_inference(model,
                                          mp_size=1,
                                          dtype=(torch.half if args.fp16 else torch.float),
                                          injection_policy=injection_policy,
@@ -293,7 +293,7 @@ def main():
         prefix = args.prefix if args.prefix else args.padding_text
         for ppt in prompt_text:
             eprompt.append(tokenizer.encode(prefix + ppt, add_special_tokens=False, return_tensors="pt"))
-    
+
     latencies = []
     for encoded_prompt, ppt in zip(eprompt, prompt_text):
         encoded_prompt = encoded_prompt.to(args.device)
@@ -302,10 +302,10 @@ def main():
             input_ids = None
         else:
             input_ids = encoded_prompt
-            
+
         torch.cuda.synchronize()
         t0 = time.time()
-        
+
         output_sequences = model.generate(
             input_ids=input_ids,
             max_length=args.length + len(encoded_prompt[0]),
diff --git a/inference/huggingface/text-generation/test-bloom.py b/inference/huggingface/text-generation/test-bloom.py
@@ -0,0 +1,31 @@
+import os
+import torch
+import deepspeed
+import transformers
+
+# Pipeline class to mimic HF pipeline (defined in utils.py next to this script)
+from utils import Pipeline
+
+model_name = 'bigscience/bloom-3b'
+dtype = torch.float16  # used for both the Pipeline and init_inference below
+num_tokens = 100  # forwarded to the pipeline call at the end of the script
+
+# Get local gpu rank from torch.distributed/deepspeed launcher
+local_rank = int(os.getenv('LOCAL_RANK', '0'))  # not referenced again in this script
+world_size = int(os.getenv('WORLD_SIZE', '1'))  # falls back to 1; used as mp_size below
+
+pipe = Pipeline(model_name=model_name,  # is_meta is left at the Pipeline default
+                dtype=dtype
+)
+
+pipe.model = deepspeed.init_inference(  # replaces the model held by the pipeline
+                pipe.model,
+                mp_size=world_size,
+                dtype=dtype,
+                replace_with_kernel_inject=True,
+                base_dir=pipe.repo_root,  # set in Pipeline.__init__ from generate_json()
+                checkpoint=pipe.checkpoints_json  # path of the JSON written by generate_json()
+)
+
+output = pipe('DeepSpeed is', num_tokens=num_tokens, do_sample=False)  # single prompt, no sampling
+print(output)
diff --git a/inference/huggingface/text-generation/test-gpt2.py b/inference/huggingface/text-generation/test-gpt2.py
@@ -27,6 +27,6 @@
                                  mp_size=world_size,
                                  dtype=torch.half,
                                  replace_with_kernel_inject=True)
-                                 
+
 string = generator("DeepSpeed is", min_length=50, max_length=50, do_sample=True, use_cache=True)
 print(string)
diff --git a/inference/huggingface/text-generation/test-gptj.py b/inference/huggingface/text-generation/test-gptj.py
@@ -27,6 +27,6 @@
                                  mp_size=world_size,
                                  dtype=torch.half,
                                  replace_with_kernel_inject=True)
-                                 
+
 string = generator("DeepSpeed is", min_length=50, max_length=50, do_sample=True, use_cache=True)
 print(string)
diff --git a/inference/huggingface/text-generation/utils.py b/inference/huggingface/text-generation/utils.py
@@ -0,0 +1,77 @@
+'''
+Helper classes and functions for examples
+'''
+
+import io
+from pathlib import Path
+import json
+import deepspeed
+import torch
+from huggingface_hub import snapshot_download
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+
+class Pipeline():
+    '''Example helper class, meant to mimic HF pipelines.
+
+    model_name and dtype select the model and its torch dtype. With
+    is_meta=True the model is created on the "meta" device and repo_root /
+    checkpoints_json come from generate_json(); with is_meta=False the
+    weights are loaded directly and those attributes stay None.
+    '''
+    def __init__(self,
+                 model_name='bigscience/bloom-3b',
+                 dtype=torch.float16,
+                 is_meta=True
+                 ):
+        self.model_name = model_name
+        self.dtype = dtype
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        # Define these on every path so callers never hit AttributeError.
+        self.config = self.repo_root = self.checkpoints_json = None
+
+        if is_meta:
+            # When meta tensors enabled, use checkpoints
+            self.config = AutoConfig.from_pretrained(self.model_name)
+            self.repo_root, self.checkpoints_json = self.generate_json()
+            with deepspeed.OnDevice(dtype=self.dtype, device="meta"):
+                self.model = AutoModelForCausalLM.from_config(self.config, torch_dtype=self.dtype)
+        else:
+            # Honor the requested dtype here too, as the meta branch does.
+            self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype=self.dtype)
+        self.model.eval()
+
+    def __call__(self, inputs=None, num_tokens=100, do_sample=False):
+        '''Generate text for one prompt (str) or a list of prompts.'''
+        if inputs is None:
+            inputs = ["test"]  # same default as before, but built per call
+        input_list = [inputs] if isinstance(inputs, str) else inputs
+        return self.generate_outputs(input_list, num_tokens=num_tokens, do_sample=do_sample)
+
+    def generate_json(self):
+        '''Download the model snapshot and write "checkpoints.json" for it.
+
+        Returns (repo_root, checkpoints_json); the JSON path is relative.
+        '''
+        repo_root = snapshot_download(self.model_name, allow_patterns=["*"], local_files_only=False, revision=None)
+        checkpoints_json = "checkpoints.json"
+        # NOTE: this glob only matches three-character suffixes such as
+        # ".bin"; a two-character ".pt" suffix is not matched by it.
+        file_list = [str(entry) for entry in Path(repo_root).rglob("*.[bp][it][n]") if entry.is_file()]
+        data = {"type": self.config.model_type, "checkpoints": file_list, "version": 1.0}
+        with open(checkpoints_json, "w", encoding="utf-8") as f:
+            json.dump(data, f)
+        return repo_root, checkpoints_json
+
+    def generate_outputs(self, inputs=None, num_tokens=100, do_sample=False):
+        '''Tokenize inputs, generate up to num_tokens new tokens, decode.'''
+        if inputs is None:
+            inputs = ["test"]
+        generate_kwargs = dict(max_new_tokens=num_tokens, do_sample=do_sample)
+        input_tokens = self.tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True)
+        # Move every tensor field to the current CUDA device.
+        for t in input_tokens:
+            if torch.is_tensor(input_tokens[t]):
+                input_tokens[t] = input_tokens[t].to(torch.cuda.current_device())
+
+        outputs = self.model.generate(**input_tokens, **generate_kwargs)
+        return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)