From 1331ff310c03663db5534f27437c0b2085adf8ea Mon Sep 17 00:00:00 2001 From: Pareesa Date: Thu, 6 Apr 2023 19:33:21 +0000 Subject: [PATCH 01/11] test --- .../stable-diffusion/test-stable-diffusion.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/inference/huggingface/stable-diffusion/test-stable-diffusion.py b/inference/huggingface/stable-diffusion/test-stable-diffusion.py index 02bd2387e..41b72e662 100644 --- a/inference/huggingface/stable-diffusion/test-stable-diffusion.py +++ b/inference/huggingface/stable-diffusion/test-stable-diffusion.py @@ -21,13 +21,13 @@ # NOTE: DeepSpeed inference supports local CUDA graphs for replaced SD modules. # Local CUDA graphs for replaced SD modules will only be enabled when `mp_size==1` -pipe = deepspeed.init_inference( - pipe, - mp_size=world_size, - dtype=torch.half, - replace_with_kernel_inject=True, - enable_cuda_graph=True if world_size==1 else False, -) +#pipe = deepspeed.init_inference( +# pipe, +# mp_size=world_size, +# dtype=torch.half, +# replace_with_kernel_inject=True, +# enable_cuda_graph=True if world_size==1 else False, +#) generator.manual_seed(0xABEDABE7) deepspeed_image = pipe(prompt, guidance_scale=7.5, generator=generator).images[0] From c66710f5bf3e6efe057352f20843470c5118ad54 Mon Sep 17 00:00:00 2001 From: Pareesa Date: Thu, 6 Apr 2023 19:41:05 +0000 Subject: [PATCH 02/11] test --- .../huggingface/stable-diffusion/test-stable-diffusion.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/inference/huggingface/stable-diffusion/test-stable-diffusion.py b/inference/huggingface/stable-diffusion/test-stable-diffusion.py index 41b72e662..6bea376b5 100644 --- a/inference/huggingface/stable-diffusion/test-stable-diffusion.py +++ b/inference/huggingface/stable-diffusion/test-stable-diffusion.py @@ -29,6 +29,6 @@ # enable_cuda_graph=True if world_size==1 else False, #) -generator.manual_seed(0xABEDABE7) -deepspeed_image = pipe(prompt, guidance_scale=7.5, 
generator=generator).images[0] -deepspeed_image.save(f"deepspeed.png") +#generator.manual_seed(0xABEDABE7) +#deepspeed_image = pipe(prompt, guidance_scale=7.5, generator=generator).images[0] +#deepspeed_image.save(f"deepspeed.png") From 6fac0daaf527764111cbbebf70180e2af4e0eae0 Mon Sep 17 00:00:00 2001 From: PareesaMS Date: Thu, 6 Apr 2023 23:19:19 +0000 Subject: [PATCH 03/11] Add local_pipeline --- .../local_pipeline_stable_diffusion.py | 714 ++++++++++++++++++ .../stable-diffusion/test-stable-diffusion.py | 6 +- 2 files changed, 718 insertions(+), 2 deletions(-) create mode 100644 inference/huggingface/stable-diffusion/local_pipeline_stable_diffusion.py diff --git a/inference/huggingface/stable-diffusion/local_pipeline_stable_diffusion.py b/inference/huggingface/stable-diffusion/local_pipeline_stable_diffusion.py new file mode 100644 index 000000000..504479798 --- /dev/null +++ b/inference/huggingface/stable-diffusion/local_pipeline_stable_diffusion.py @@ -0,0 +1,714 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import torch +from packaging import version +from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer + +from ...configuration_utils import FrozenDict +from ...models import AutoencoderKL, UNet2DConditionModel +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + deprecate, + is_accelerate_available, + is_accelerate_version, + logging, + randn_tensor, + replace_example_docstring, +) +from ..pipeline_utils import DiffusionPipeline +from . import StableDiffusionPipelineOutput +from .safety_checker import StableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import StableDiffusionPipeline + + >>> pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16) + >>> pipe = pipe.to("cuda") + + >>> prompt = "a photo of an astronaut riding a horse on mars" + >>> image = pipe(prompt).images[0] + ``` +""" + + +class StableDiffusionPipeline(DiffusionPipeline): + r""" + Pipeline for text-to-image generation using Stable Diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. 
+ tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. + feature_extractor ([`CLIPFeatureExtractor`]): + Model that extracts features from generated images to be used as inputs for the `safety_checker`. + """ + _optional_components = ["safety_checker", "feature_extractor"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + requires_safety_checker: bool = True, + ): + super().__init__() + + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" + f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " + "to update the config accordingly as leaving `steps_offset` might led to incorrect results" + " in future versions. 
If you have downloaded this checkpoint from the Hugging Face Hub," + " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["steps_offset"] = 1 + scheduler._internal_dict = FrozenDict(new_config) + + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." + " `clip_sample` should be set to False in the configuration file. Please make sure to update the" + " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" + " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" + " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" + ) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["clip_sample"] = False + scheduler._internal_dict = FrozenDict(new_config) + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." 
+ ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse( + version.parse(unet.config._diffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 + if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: + deprecation_message = ( + "The configuration file of the unet has set the default `sample_size` to smaller than" + " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the" + " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" + " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" + " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" + " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" + " in the config might lead to incorrect results in future versions. 
If you have downloaded this" + " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(unet.config) + new_config["sample_size"] = 64 + unet._internal_dict = FrozenDict(new_config) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + def enable_vae_slicing(self): + r""" + Enable sliced VAE decoding. + + When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several + steps. This is useful to save some memory and allow larger batch sizes. + """ + self.vae.enable_slicing() + + def disable_vae_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to + computing decoding in one step. + """ + self.vae.disable_slicing() + + def enable_vae_tiling(self): + r""" + Enable tiled VAE decoding. + + When this option is enabled, the VAE will split the input tensor into tiles to compute decoding and encoding in + several steps. This is useful to save a large amount of memory and to allow the processing of larger images. + """ + self.vae.enable_tiling() + + def disable_vae_tiling(self): + r""" + Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to + computing decoding in one step. + """ + self.vae.disable_tiling() + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. 
When called, unet, + text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a + `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called. + Note that offloading happens on a submodule basis. Memory savings are higher than with + `enable_model_cpu_offload`, but performance is lower. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"): + from accelerate import cpu_offload + else: + raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: + cpu_offload(cpu_offloaded_model, device) + + if self.safety_checker is not None: + cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) + + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. 
+ """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + @property + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. 
+ + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. 
+ """ + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to 
`prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. 
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + return prompt_embeds + + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + else: + has_nsfw_concept = None + return image, has_nsfw_concept + + def decode_latents(self, latents): + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents).sample + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
+ ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + 
cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds` + instead. + height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is not greater than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. 
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will be generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. 
+ cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). + + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`. + When returning a tuple, the first element is a list with the generated images, and the second element is a + list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, according to the `safety_checker`. + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds + ) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + prompt_embeds = self._encode_prompt( + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ) + + # 4. 
Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Prepare latent variables + num_channels_latents = self.unet.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + ).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, latents) + + if output_type == "latent": + image = latents + has_nsfw_concept = None + elif output_type == "pil": + # 8. Post-processing + image = self.decode_latents(latents) + + # 9. 
Run safety checker + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # 10. Convert to PIL + image = self.numpy_to_pil(image) + else: + # 8. Post-processing + image = self.decode_latents(latents) + + # 9. Run safety checker + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/inference/huggingface/stable-diffusion/test-stable-diffusion.py b/inference/huggingface/stable-diffusion/test-stable-diffusion.py index 6bea376b5..9e344258d 100644 --- a/inference/huggingface/stable-diffusion/test-stable-diffusion.py +++ b/inference/huggingface/stable-diffusion/test-stable-diffusion.py @@ -1,8 +1,9 @@ import deepspeed import torch import os - +import pdb from diffusers import DiffusionPipeline +import local_pipeline_stable_diffusion prompt = "a dog on a rocket" @@ -12,12 +13,13 @@ world_size = int(os.getenv('WORLD_SIZE', '1')) generator = torch.Generator(device=torch.cuda.current_device()) +pdb.set_trace() pipe = DiffusionPipeline.from_pretrained(model, torch_dtype=torch.half) pipe = pipe.to(device) generator.manual_seed(0xABEDABE7) baseline_image = pipe(prompt, guidance_scale=7.5, generator=generator).images[0] -baseline_image.save(f"baseline.png") +#baseline_image.save(f"baseline.png") # NOTE: DeepSpeed inference supports local CUDA graphs for replaced SD modules. 
# Local CUDA graphs for replaced SD modules will only be enabled when `mp_size==1` From 5d80ec97fefa767ce04e225c7f75e802e79b22b0 Mon Sep 17 00:00:00 2001 From: PareesaMS Date: Thu, 6 Apr 2023 23:22:12 +0000 Subject: [PATCH 04/11] Use local version --- .../huggingface/stable-diffusion/test-stable-diffusion.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/inference/huggingface/stable-diffusion/test-stable-diffusion.py b/inference/huggingface/stable-diffusion/test-stable-diffusion.py index 9e344258d..ac23a91df 100644 --- a/inference/huggingface/stable-diffusion/test-stable-diffusion.py +++ b/inference/huggingface/stable-diffusion/test-stable-diffusion.py @@ -14,7 +14,8 @@ generator = torch.Generator(device=torch.cuda.current_device()) pdb.set_trace() -pipe = DiffusionPipeline.from_pretrained(model, torch_dtype=torch.half) +#pipe = DiffusionPipeline.from_pretrained(model, torch_dtype=torch.half) +pipe = local_pipeline_stable_diffusion.from_pretrained(model, torch_dtype=torch.half) pipe = pipe.to(device) generator.manual_seed(0xABEDABE7) From 06e85ef82b1eafcadceb79bcb3e46dc46ec0a6ca Mon Sep 17 00:00:00 2001 From: PareesaMS Date: Thu, 6 Apr 2023 23:39:29 +0000 Subject: [PATCH 05/11] Calling from the local copy --- .../local_pipeline_stable_diffusion.py | 29 ++++++++++++++----- .../stable-diffusion/test-stable-diffusion.py | 6 ++-- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/inference/huggingface/stable-diffusion/local_pipeline_stable_diffusion.py b/inference/huggingface/stable-diffusion/local_pipeline_stable_diffusion.py index 504479798..7e29c9a1f 100644 --- a/inference/huggingface/stable-diffusion/local_pipeline_stable_diffusion.py +++ b/inference/huggingface/stable-diffusion/local_pipeline_stable_diffusion.py @@ -16,13 +16,21 @@ from typing import Any, Callable, Dict, List, Optional, Union import torch +import pdb from packaging import version from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer -from 
...configuration_utils import FrozenDict -from ...models import AutoencoderKL, UNet2DConditionModel -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import ( +#from ...configuration_utils import FrozenDict +from diffusers.configuration_utils import FrozenDict + +#from ...models import AutoencoderKL, UNet2DConditionModel +from diffusers.models import AutoencoderKL, UNet2DConditionModel + +#from ...schedulers import KarrasDiffusionSchedulers +from diffusers.schedulers import KarrasDiffusionSchedulers + +#from ...utils import ( +from diffusers.utils import ( deprecate, is_accelerate_available, is_accelerate_version, @@ -30,9 +38,13 @@ randn_tensor, replace_example_docstring, ) -from ..pipeline_utils import DiffusionPipeline -from . import StableDiffusionPipelineOutput -from .safety_checker import StableDiffusionSafetyChecker +#from ..pipeline_utils import DiffusionPipeline +from diffusers.pipeline_utils import DiffusionPipeline + +#from . import StableDiffusionPipelineOutput +from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput +#from .safety_checker import StableDiffusionSafetyChecker +from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -605,6 +617,9 @@ def __call__( height = height or self.unet.config.sample_size * self.vae_scale_factor width = width or self.unet.config.sample_size * self.vae_scale_factor + print("MY VERSION") + pdb.set_trace() + # 1. Check inputs. 
Raise error if not correct self.check_inputs( prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds diff --git a/inference/huggingface/stable-diffusion/test-stable-diffusion.py b/inference/huggingface/stable-diffusion/test-stable-diffusion.py index ac23a91df..1ce03d361 100644 --- a/inference/huggingface/stable-diffusion/test-stable-diffusion.py +++ b/inference/huggingface/stable-diffusion/test-stable-diffusion.py @@ -2,8 +2,8 @@ import torch import os import pdb -from diffusers import DiffusionPipeline -import local_pipeline_stable_diffusion +#from diffusers import DiffusionPipeline +from local_pipeline_stable_diffusion import StableDiffusionPipeline prompt = "a dog on a rocket" @@ -15,7 +15,7 @@ pdb.set_trace() #pipe = DiffusionPipeline.from_pretrained(model, torch_dtype=torch.half) -pipe = local_pipeline_stable_diffusion.from_pretrained(model, torch_dtype=torch.half) +pipe = StableDiffusionPipeline.from_pretrained(model, torch_dtype=torch.half) pipe = pipe.to(device) generator.manual_seed(0xABEDABE7) From dfe73a27b4041422517558877cccb23b9a20b36a Mon Sep 17 00:00:00 2001 From: PareesaMS Date: Mon, 10 Apr 2023 17:21:20 +0000 Subject: [PATCH 06/11] Added optimization to the local file and included the local file --- .../local_pipeline_stable_diffusion.py | 27 +++++++++---------- .../stable-diffusion/test-stable-diffusion.py | 23 ++++++++-------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/inference/huggingface/stable-diffusion/local_pipeline_stable_diffusion.py b/inference/huggingface/stable-diffusion/local_pipeline_stable_diffusion.py index 7e29c9a1f..77d1b179c 100644 --- a/inference/huggingface/stable-diffusion/local_pipeline_stable_diffusion.py +++ b/inference/huggingface/stable-diffusion/local_pipeline_stable_diffusion.py @@ -16,7 +16,6 @@ from typing import Any, Callable, Dict, List, Optional, Union import torch -import pdb from packaging import version from transformers import CLIPFeatureExtractor, 
CLIPTextModel, CLIPTokenizer @@ -531,6 +530,7 @@ def __call__( width: Optional[int] = None, num_inference_steps: int = 50, guidance_scale: float = 7.5, + optimized_iterations: float = 0, negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, @@ -616,9 +616,6 @@ def __call__( # 0. Default height and width to unet height = height or self.unet.config.sample_size * self.vae_scale_factor width = width or self.unet.config.sample_size * self.vae_scale_factor - - print("MY VERSION") - pdb.set_trace() # 1. Check inputs. Raise error if not correct self.check_inputs( @@ -671,25 +668,27 @@ def __call__( extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. Denoising loop + max_iter = len(timesteps) + start_opt_iter = max_iter * (1 - optimized_iterations) num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + _ , prompt_embeds_text = prompt_embeds.chunk(2) with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - ).sample - # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + # Apply optimization + if i > start_opt_iter: + # Compute only the conditional noise + noise_pred = self.unet(latents, t, encoder_hidden_states=prompt_embeds_text).sample + # No optimization + else: + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample + noise_pred_uncond, 
noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample diff --git a/inference/huggingface/stable-diffusion/test-stable-diffusion.py b/inference/huggingface/stable-diffusion/test-stable-diffusion.py index 1ce03d361..aa0c01187 100644 --- a/inference/huggingface/stable-diffusion/test-stable-diffusion.py +++ b/inference/huggingface/stable-diffusion/test-stable-diffusion.py @@ -1,8 +1,6 @@ import deepspeed import torch import os -import pdb -#from diffusers import DiffusionPipeline from local_pipeline_stable_diffusion import StableDiffusionPipeline prompt = "a dog on a rocket" @@ -13,14 +11,17 @@ world_size = int(os.getenv('WORLD_SIZE', '1')) generator = torch.Generator(device=torch.cuda.current_device()) -pdb.set_trace() -#pipe = DiffusionPipeline.from_pretrained(model, torch_dtype=torch.half) pipe = StableDiffusionPipeline.from_pretrained(model, torch_dtype=torch.half) pipe = pipe.to(device) -generator.manual_seed(0xABEDABE7) -baseline_image = pipe(prompt, guidance_scale=7.5, generator=generator).images[0] -#baseline_image.save(f"baseline.png") +seed = 0xABEDABE7 +generator.manual_seed(seed) +baseline_image = pipe(prompt, guidance_scale=7.5, generator=generator, optimized_iterations=0).images[0] +baseline_image.save(f"baseline.png") + +#generator.manual_seed(seed) +#saving_image = pipe(prompt, guidance_scale=7.5, generator=generator, optimized_iterations=0.20).images[0] +#baseline_image.save(f"saving-20percent.png") # NOTE: DeepSpeed inference supports local CUDA graphs for replaced SD modules. 
# Local CUDA graphs for replaced SD modules will only be enabled when `mp_size==1` @@ -30,8 +31,8 @@ # dtype=torch.half, # replace_with_kernel_inject=True, # enable_cuda_graph=True if world_size==1 else False, -#) +# ) -#generator.manual_seed(0xABEDABE7) -#deepspeed_image = pipe(prompt, guidance_scale=7.5, generator=generator).images[0] -#deepspeed_image.save(f"deepspeed.png") +generator.manual_seed(seed) +deepspeed_image = pipe(prompt, guidance_scale=7.5, generator=generator, optimized_iterations=0.20).images[0] +deepspeed_image.save(f"deepspeed.png") From ca523f04be01a06afbec21319d58b74678c2573d Mon Sep 17 00:00:00 2001 From: PareesaMS Date: Mon, 10 Apr 2023 21:53:28 +0000 Subject: [PATCH 07/11] clean-up --- .../stable-diffusion/test-stable-diffusion.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/inference/huggingface/stable-diffusion/test-stable-diffusion.py b/inference/huggingface/stable-diffusion/test-stable-diffusion.py index aa0c01187..ab9303275 100644 --- a/inference/huggingface/stable-diffusion/test-stable-diffusion.py +++ b/inference/huggingface/stable-diffusion/test-stable-diffusion.py @@ -19,19 +19,16 @@ baseline_image = pipe(prompt, guidance_scale=7.5, generator=generator, optimized_iterations=0).images[0] baseline_image.save(f"baseline.png") -#generator.manual_seed(seed) -#saving_image = pipe(prompt, guidance_scale=7.5, generator=generator, optimized_iterations=0.20).images[0] -#baseline_image.save(f"saving-20percent.png") # NOTE: DeepSpeed inference supports local CUDA graphs for replaced SD modules. 
# Local CUDA graphs for replaced SD modules will only be enabled when `mp_size==1` -#pipe = deepspeed.init_inference( -# pipe, -# mp_size=world_size, -# dtype=torch.half, -# replace_with_kernel_inject=True, -# enable_cuda_graph=True if world_size==1 else False, -# ) +pipe = deepspeed.init_inference( + pipe, + mp_size=world_size, + dtype=torch.half, + replace_with_kernel_inject=True, + enable_cuda_graph=False if world_size==1 else False, + ) generator.manual_seed(seed) deepspeed_image = pipe(prompt, guidance_scale=7.5, generator=generator, optimized_iterations=0.20).images[0] From 230ad38c944f3826e23a67ea985aeee00ac516cb Mon Sep 17 00:00:00 2001 From: PareesaMS Date: Mon, 10 Apr 2023 22:06:39 +0000 Subject: [PATCH 08/11] Enable the cuda graph --- inference/huggingface/stable-diffusion/test-stable-diffusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference/huggingface/stable-diffusion/test-stable-diffusion.py b/inference/huggingface/stable-diffusion/test-stable-diffusion.py index ab9303275..b91cebf28 100644 --- a/inference/huggingface/stable-diffusion/test-stable-diffusion.py +++ b/inference/huggingface/stable-diffusion/test-stable-diffusion.py @@ -27,7 +27,7 @@ mp_size=world_size, dtype=torch.half, replace_with_kernel_inject=True, - enable_cuda_graph=False if world_size==1 else False, + enable_cuda_graph=True if world_size==1 else False, ) generator.manual_seed(seed) From 610c21188ef4d4a6ee284f17094d8fffce620c64 Mon Sep 17 00:00:00 2001 From: Pareesa Date: Tue, 13 Jun 2023 17:09:55 +0000 Subject: [PATCH 09/11] Applying comments on local file --- .../stable-diffusion/local_pipeline_stable_diffusion.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/inference/huggingface/stable-diffusion/local_pipeline_stable_diffusion.py b/inference/huggingface/stable-diffusion/local_pipeline_stable_diffusion.py index 77d1b179c..cc41f530c 100644 --- a/inference/huggingface/stable-diffusion/local_pipeline_stable_diffusion.py +++ 
b/inference/huggingface/stable-diffusion/local_pipeline_stable_diffusion.py @@ -14,7 +14,6 @@ import inspect from typing import Any, Callable, Dict, List, Optional, Union - import torch from packaging import version from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer @@ -530,7 +529,7 @@ def __call__( width: Optional[int] = None, num_inference_steps: int = 50, guidance_scale: float = 7.5, - optimized_iterations: float = 0, + opt_percentage: int = 0, negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, @@ -616,7 +615,7 @@ def __call__( # 0. Default height and width to unet height = height or self.unet.config.sample_size * self.vae_scale_factor width = width or self.unet.config.sample_size * self.vae_scale_factor - + # 1. Check inputs. Raise error if not correct self.check_inputs( prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds @@ -669,7 +668,7 @@ def __call__( # 7. 
Denoising loop max_iter = len(timesteps) - start_opt_iter = max_iter * (1 - optimized_iterations) + start_opt_iter = round(max_iter * (1 - opt_percentage / 100.0)) num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order _ , prompt_embeds_text = prompt_embeds.chunk(2) with self.progress_bar(total=num_inference_steps) as progress_bar: @@ -681,7 +680,7 @@ def __call__( # perform guidance if do_classifier_free_guidance: # Apply optimization - if i > start_opt_iter: + if i >= start_opt_iter: # Compute only the conditional noise noise_pred = self.unet(latents, t, encoder_hidden_states=prompt_embeds_text).sample # No optimization From bb0050dd41ac9facc59c50ab1500f7bdedace501 Mon Sep 17 00:00:00 2001 From: Pareesa Date: Tue, 13 Jun 2023 17:11:04 +0000 Subject: [PATCH 10/11] Applying comments to the main python file --- .../stable-diffusion/test-stable-diffusion.py | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/inference/huggingface/stable-diffusion/test-stable-diffusion.py b/inference/huggingface/stable-diffusion/test-stable-diffusion.py index b91cebf28..82b282840 100644 --- a/inference/huggingface/stable-diffusion/test-stable-diffusion.py +++ b/inference/huggingface/stable-diffusion/test-stable-diffusion.py @@ -2,8 +2,23 @@ import torch import os from local_pipeline_stable_diffusion import StableDiffusionPipeline +import argparse -prompt = "a dog on a rocket" +# In this example the SD inference pipeline is optimized based on recommendations in the research paper +# titled "Selective Guidance: Are All the Denoising Steps of Guided Diffusion Important?"(https://arxiv.org/abs/2305.09847). +# In this method, the last iterations of the denoising loop are optimized by simplifying the noise computation. In the guided diffusion +# process of the SD pipeline, the computed noise consists of two components: conditional noise and unconditional noise. 
+# However, including both components of noise doubles the computational requirements for running the denoising loop. +# To address this issue, the optimized iterations in this approach eliminate the unconditional noise, thereby reducing the +# compute cost associated with the generation process. +# The user has the flexibility to choose the percentage of iterations they want to optimize (opt_percentage). As well as +# setting the prompt and the guidance_scale. + +parser = argparse.ArgumentParser() +parser.add_argument('--prompt', type=str, default='A dog on a rocket', help='path to dataset (default: imagenet)') +parser.add_argument('--guidance_scale', type=float, default=7.5, help='Guidance Scale') +parser.add_argument('--opt_percentage', type=int, default=0, help='The percentage of the last iterations to be optimized') +args = parser.parse_args() model = "prompthero/midjourney-v4-diffusion" local_rank = int(os.getenv("LOCAL_RANK", "0")) @@ -16,7 +31,7 @@ seed = 0xABEDABE7 generator.manual_seed(seed) -baseline_image = pipe(prompt, guidance_scale=7.5, generator=generator, optimized_iterations=0).images[0] +baseline_image = pipe(args.prompt, guidance_scale=args.guidance_scale, generator=generator, opt_percentage=0).images[0] baseline_image.save(f"baseline.png") @@ -31,5 +46,5 @@ ) generator.manual_seed(seed) -deepspeed_image = pipe(prompt, guidance_scale=7.5, generator=generator, optimized_iterations=0.20).images[0] +deepspeed_image = pipe(args.prompt, guidance_scale=args.guidance_scale, generator=generator, opt_percentage=args.opt_percentage).images[0] deepspeed_image.save(f"deepspeed.png") From e88997110f6b1b9367dce0fd369c5cc235d3ac1f Mon Sep 17 00:00:00 2001 From: Pareesa Date: Wed, 14 Jun 2023 23:15:34 +0000 Subject: [PATCH 11/11] Fix a comment and some clean-up --- .../stable-diffusion/local_pipeline_stable_diffusion.py | 9 +-------- .../stable-diffusion/test-stable-diffusion.py | 2 +- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git 
a/inference/huggingface/stable-diffusion/local_pipeline_stable_diffusion.py b/inference/huggingface/stable-diffusion/local_pipeline_stable_diffusion.py index cc41f530c..ab0b708e6 100644 --- a/inference/huggingface/stable-diffusion/local_pipeline_stable_diffusion.py +++ b/inference/huggingface/stable-diffusion/local_pipeline_stable_diffusion.py @@ -18,16 +18,12 @@ from packaging import version from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer -#from ...configuration_utils import FrozenDict from diffusers.configuration_utils import FrozenDict -#from ...models import AutoencoderKL, UNet2DConditionModel from diffusers.models import AutoencoderKL, UNet2DConditionModel -#from ...schedulers import KarrasDiffusionSchedulers from diffusers.schedulers import KarrasDiffusionSchedulers -#from ...utils import ( from diffusers.utils import ( deprecate, is_accelerate_available, @@ -36,12 +32,9 @@ randn_tensor, replace_example_docstring, ) -#from ..pipeline_utils import DiffusionPipeline -from diffusers.pipeline_utils import DiffusionPipeline -#from . import StableDiffusionPipelineOutput +from diffusers.pipeline_utils import DiffusionPipeline from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput -#from .safety_checker import StableDiffusionSafetyChecker from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker diff --git a/inference/huggingface/stable-diffusion/test-stable-diffusion.py b/inference/huggingface/stable-diffusion/test-stable-diffusion.py index 82b282840..3abbc7456 100644 --- a/inference/huggingface/stable-diffusion/test-stable-diffusion.py +++ b/inference/huggingface/stable-diffusion/test-stable-diffusion.py @@ -15,7 +15,7 @@ # setting the prompt and the guidance_scale. 
parser = argparse.ArgumentParser() -parser.add_argument('--prompt', type=str, default='A dog on a rocket', help='path to dataset (default: imagenet)') +parser.add_argument('--prompt', type=str, default='A dog on a rocket', help='The prompt to use in creating the image') parser.add_argument('--guidance_scale', type=float, default=7.5, help='Guidance Scale') parser.add_argument('--opt_percentage', type=int, default=0, help='The percentage of the last iterations to be optimized') args = parser.parse_args()