Add device comprehension to BLOOM Pipeline utility class (deepspeedai#217)

lekurile · web-flow · commit e0db814cb9ca · 2022-11-10T13:24:12.000-08:00
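This PR adds device comprehension (explicit device handling) to the BLOOM `Pipeline` utility class. A new `device` argument accepts a `torch.device`, a device string, or an integer index (a negative index, the default, selects the CPU; any other index selects `cuda:<index>`), and the input tokens and the model are moved to that device before generation. This expands device support and also covers the case where the DeepSpeed `init_inference` API isn't used.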
diff --git a/inference/huggingface/text-generation/test-bloom.py b/inference/huggingface/text-generation/test-bloom.py
@@ -15,7 +15,9 @@
 world_size = int(os.getenv('WORLD_SIZE', '1'))
 
 pipe = Pipeline(model_name=model_name,
-                dtype=dtype
+                dtype=dtype,
+                is_meta=True,
+                device=local_rank
 )
 
 pipe.model = deepspeed.init_inference(
diff --git a/inference/huggingface/text-generation/utils.py b/inference/huggingface/text-generation/utils.py
@@ -16,11 +16,21 @@ class Pipeline():
     def __init__(self,
                  model_name='bigscience/bloom-3b',
                  dtype=torch.float16,
-                 is_meta=True
+                 is_meta=True,
+                 device=-1
                  ):
         self.model_name = model_name
         self.dtype = dtype
 
+        if isinstance(device, torch.device):
+            self.device = device
+        elif isinstance(device, str):
+            self.device = torch.device(device)
+        elif device < 0:
+            self.device = torch.device("cpu")
+        else:
+            self.device = torch.device(f"cuda:{device}")
+
         # the Deepspeed team made these so it's super fast to load (~1 minute), rather than wait 10-20min loading time.
         self.tp_presharded_models = ["microsoft/bloom-deepspeed-inference-int8", "microsoft/bloom-deepspeed-inference-fp16"]
 
@@ -78,7 +88,9 @@ def generate_outputs(self,
         input_tokens = self.tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True)
         for t in input_tokens:
             if torch.is_tensor(input_tokens[t]):
-                input_tokens[t] = input_tokens[t].to(torch.cuda.current_device())
+                input_tokens[t] = input_tokens[t].to(self.device)
+
+        self.model.cuda().to(self.device)
 
         outputs = self.model.generate(**input_tokens, **generate_kwargs)
         outputs = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)

Original file line number	Diff line number	Diff line change
`@@ -15,7 +15,9 @@`
`15`	`15`	`world_size = int(os.getenv('WORLD_SIZE', '1'))`
`16`	`16`
`17`	`17`	`pipe = Pipeline(model_name=model_name,`
`18`		`- dtype=dtype`
	`18`	`+ dtype=dtype,`
	`19`	`+ is_meta=True,`
	`20`	`+ device=local_rank`
`19`	`21`	`)`
`20`	`22`
`21`	`23`	`pipe.model = deepspeed.init_inference(`