Skip to content

Commit 86494a1

Browse files
Merge branch 'main' into main
2 parents 90281cb + abc47de commit 86494a1

9 files changed

Lines changed: 103 additions & 10 deletions

File tree

docs/source/en/using-diffusers/sdxl.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -397,6 +397,8 @@ image = pipeline(prompt=prompt, prompt_2=prompt_2).images[0]
397397
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl-double-prompt.png" alt="generated image of an astronaut in a jungle in the style of a van gogh painting"/>
398398
</div>
399399

400+
The dual text-encoders also support textual inversion embeddings that need to be loaded separately as explained in the [SDXL textual inversion](textual_inversion_inference#stable-diffusion-xl) section.
401+
400402
## Optimizations
401403

402404
SDXL is a large model, and you may need to optimize memory to get it to run on your hardware. Here are some tips to save memory and speed up inference.
@@ -426,4 +428,4 @@ SDXL is a large model, and you may need to optimize memory to get it to run on y
426428

427429
## Other resources
428430

429-
If you're interested in experimenting with a minimal version of the [`UNet2DConditionModel`] used in SDXL, take a look at the [minSDXL](https://github.com/cloneofsimo/minSDXL) implementation which is written in PyTorch and directly compatible with 🤗 Diffusers.
431+
If you're interested in experimenting with a minimal version of the [`UNet2DConditionModel`] used in SDXL, take a look at the [minSDXL](https://github.com/cloneofsimo/minSDXL) implementation which is written in PyTorch and directly compatible with 🤗 Diffusers.

docs/source/en/using-diffusers/textual_inversion_inference.md

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ from diffusers.utils import make_image_grid
2828
from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
2929
```
3030

31+
## Stable Diffusion 1 and 2
32+
3133
Pick a Stable Diffusion checkpoint and a pre-learned concept from the [Stable Diffusion Conceptualizer](https://huggingface.co/spaces/sd-concepts-library/stable-diffusion-conceptualizer):
3234

3335
```py
@@ -69,3 +71,50 @@ grid
6971
<div class="flex justify-center">
7072
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/textual_inversion_inference.png">
7173
</div>
74+
75+
76+
## Stable Diffusion XL
77+
78+
Stable Diffusion XL (SDXL) can also use textual inversion vectors for inference. In contrast to Stable Diffusion 1 and 2, SDXL has two text encoders so you'll need two textual inversion embeddings - one for each text encoder model.
79+
80+
Let's download the SDXL textual inversion embeddings and have a closer look at their structure:
81+
82+
```py
83+
from huggingface_hub import hf_hub_download
84+
from safetensors.torch import load_file
85+
86+
file = hf_hub_download("dn118/unaestheticXL", filename="unaestheticXLv31.safetensors")
87+
state_dict = load_file(file)
88+
state_dict
89+
```
90+
91+
```
92+
{'clip_g': tensor([[ 0.0077, -0.0112, 0.0065, ..., 0.0195, 0.0159, 0.0275],
93+
...,
94+
[-0.0170, 0.0213, 0.0143, ..., -0.0302, -0.0240, -0.0362]]),
95+
'clip_l': tensor([[ 0.0023, 0.0192, 0.0213, ..., -0.0385, 0.0048, -0.0011],
96+
...,
97+
[ 0.0475, -0.0508, -0.0145, ..., 0.0070, -0.0089, -0.0163]])}
98+
```
99+
100+
There are two tensors, `"clip_g"` and `"clip_l"`.
101+
`"clip_g"` corresponds to the bigger text encoder in SDXL and refers to
102+
`pipe.text_encoder_2` and `"clip_l"` refers to `pipe.text_encoder`.
103+
104+
Now you can load each tensor separately by passing them along with the correct text encoder and tokenizer
105+
to [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`]:
106+
107+
```py
108+
from diffusers import AutoPipelineForText2Image
109+
import torch
110+
111+
pipe = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", variant="fp16", torch_dtype=torch.float16)
112+
pipe.to("cuda")
113+
114+
pipe.load_textual_inversion(state_dict["clip_g"], token="unaestheticXLv31", text_encoder=pipe.text_encoder_2, tokenizer=pipe.tokenizer_2)
115+
pipe.load_textual_inversion(state_dict["clip_l"], token="unaestheticXLv31", text_encoder=pipe.text_encoder, tokenizer=pipe.tokenizer)
116+
117+
# the embedding should be used as a negative embedding, so we pass it as a negative prompt
118+
generator = torch.Generator().manual_seed(33)
119+
image = pipe("a woman standing in front of a mountain", negative_prompt="unaestheticXLv31", generator=generator).images[0]
120+
```

examples/community/run_onnx_controlnet.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import numpy as np
99
import PIL.Image
1010
import torch
11-
from diffuser.utils.torch_utils import randn_tensor
11+
from diffusers.utils.torch_utils import randn_tensor
1212
from PIL import Image
1313
from transformers import CLIPTokenizer
1414

examples/community/run_tensorrt_controlnet.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
import pycuda.driver as cuda
1212
import tensorrt as trt
1313
import torch
14-
from diffuser.utils.torch_utils import randn_tensor
14+
from diffusers.utils.torch_utils import randn_tensor
1515
from PIL import Image
1616
from pycuda.tools import make_default_context
1717
from transformers import CLIPTokenizer

scripts/convert_consistency_to_diffusers.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
"ResnetUpsampleBlock2D",
2828
],
2929
"resnet_time_scale_shift": "scale_shift",
30+
"attn_norm_num_groups": 32,
3031
"upsample_type": "resnet",
3132
"downsample_type": "resnet",
3233
}
@@ -52,6 +53,7 @@
5253
"ResnetUpsampleBlock2D",
5354
],
5455
"resnet_time_scale_shift": "scale_shift",
56+
"attn_norm_num_groups": 32,
5557
"upsample_type": "resnet",
5658
"downsample_type": "resnet",
5759
}

src/diffusers/models/unet_2d.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,10 @@ class UNet2DModel(ModelMixin, ConfigMixin):
7474
act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
7575
attention_head_dim (`int`, *optional*, defaults to `8`): The attention head dimension.
7676
norm_num_groups (`int`, *optional*, defaults to `32`): The number of groups for normalization.
77+
attn_norm_num_groups (`int`, *optional*, defaults to `None`):
78+
If set to an integer, a group norm layer will be created in the mid block's [`Attention`] layer with the
79+
given number of groups. If left as `None`, the group norm layer will only be created if
80+
`resnet_time_scale_shift` is set to `default`, and if created will have `norm_num_groups` groups.
7781
norm_eps (`float`, *optional*, defaults to `1e-5`): The epsilon for normalization.
7882
resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config
7983
for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`.
@@ -107,6 +111,7 @@ def __init__(
107111
act_fn: str = "silu",
108112
attention_head_dim: Optional[int] = 8,
109113
norm_num_groups: int = 32,
114+
attn_norm_num_groups: Optional[int] = None,
110115
norm_eps: float = 1e-5,
111116
resnet_time_scale_shift: str = "default",
112117
add_attention: bool = True,
@@ -192,6 +197,7 @@ def __init__(
192197
resnet_time_scale_shift=resnet_time_scale_shift,
193198
attention_head_dim=attention_head_dim if attention_head_dim is not None else block_out_channels[-1],
194199
resnet_groups=norm_num_groups,
200+
attn_groups=attn_norm_num_groups,
195201
add_attention=add_attention,
196202
)
197203

src/diffusers/models/unet_2d_blocks.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -485,6 +485,7 @@ def __init__(
485485
resnet_time_scale_shift: str = "default", # default, spatial
486486
resnet_act_fn: str = "swish",
487487
resnet_groups: int = 32,
488+
attn_groups: Optional[int] = None,
488489
resnet_pre_norm: bool = True,
489490
add_attention: bool = True,
490491
attention_head_dim=1,
@@ -494,6 +495,9 @@ def __init__(
494495
resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
495496
self.add_attention = add_attention
496497

498+
if attn_groups is None:
499+
attn_groups = resnet_groups if resnet_time_scale_shift == "default" else None
500+
497501
# there is always at least one resnet
498502
resnets = [
499503
ResnetBlock2D(
@@ -526,7 +530,7 @@ def __init__(
526530
dim_head=attention_head_dim,
527531
rescale_output_factor=output_scale_factor,
528532
eps=resnet_eps,
529-
norm_num_groups=resnet_groups if resnet_time_scale_shift == "default" else None,
533+
norm_num_groups=attn_groups,
530534
spatial_norm_dim=temb_channels if resnet_time_scale_shift == "spatial" else None,
531535
residual_connection=True,
532536
bias=True,

tests/models/test_models_unet_2d.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,36 @@ def prepare_init_args_and_inputs_for_common(self):
7474
inputs_dict = self.dummy_input
7575
return init_dict, inputs_dict
7676

77+
def test_mid_block_attn_groups(self):
78+
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
79+
80+
init_dict["norm_num_groups"] = 16
81+
init_dict["add_attention"] = True
82+
init_dict["attn_norm_num_groups"] = 8
83+
84+
model = self.model_class(**init_dict)
85+
model.to(torch_device)
86+
model.eval()
87+
88+
self.assertIsNotNone(
89+
model.mid_block.attentions[0].group_norm, "Mid block Attention group norm should exist but does not."
90+
)
91+
self.assertEqual(
92+
model.mid_block.attentions[0].group_norm.num_groups,
93+
init_dict["attn_norm_num_groups"],
94+
"Mid block Attention group norm does not have the expected number of groups.",
95+
)
96+
97+
with torch.no_grad():
98+
output = model(**inputs_dict)
99+
100+
if isinstance(output, dict):
101+
output = output.to_tuple()[0]
102+
103+
self.assertIsNotNone(output)
104+
expected_shape = inputs_dict["sample"].shape
105+
self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
106+
77107

78108
class UNetLDMModelTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase):
79109
model_class = UNet2DModel

tests/pipelines/consistency_models/test_consistency_models.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -216,9 +216,9 @@ def test_consistency_model_cd_multistep(self):
216216

217217
image_slice = image[0, -3:, -3:, -1]
218218

219-
expected_slice = np.array([0.0888, 0.0881, 0.0666, 0.0479, 0.0292, 0.0195, 0.0201, 0.0163, 0.0254])
219+
expected_slice = np.array([0.0146, 0.0158, 0.0092, 0.0086, 0.0000, 0.0000, 0.0000, 0.0000, 0.0058])
220220

221-
assert np.abs(image_slice.flatten() - expected_slice).max() < 2e-2
221+
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
222222

223223
def test_consistency_model_cd_onestep(self):
224224
unet = UNet2DModel.from_pretrained("diffusers/consistency_models", subfolder="diffusers_cd_imagenet64_l2")
@@ -239,9 +239,9 @@ def test_consistency_model_cd_onestep(self):
239239

240240
image_slice = image[0, -3:, -3:, -1]
241241

242-
expected_slice = np.array([0.0340, 0.0152, 0.0063, 0.0267, 0.0221, 0.0107, 0.0416, 0.0186, 0.0217])
242+
expected_slice = np.array([0.0059, 0.0003, 0.0000, 0.0023, 0.0052, 0.0007, 0.0165, 0.0081, 0.0095])
243243

244-
assert np.abs(image_slice.flatten() - expected_slice).max() < 2e-2
244+
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
245245

246246
@require_torch_2
247247
def test_consistency_model_cd_multistep_flash_attn(self):
@@ -263,7 +263,7 @@ def test_consistency_model_cd_multistep_flash_attn(self):
263263

264264
image_slice = image[0, -3:, -3:, -1]
265265

266-
expected_slice = np.array([0.1875, 0.1428, 0.1289, 0.2151, 0.2092, 0.1477, 0.1877, 0.1641, 0.1353])
266+
expected_slice = np.array([0.1845, 0.1371, 0.1211, 0.2035, 0.1954, 0.1323, 0.1773, 0.1593, 0.1314])
267267

268268
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
269269

@@ -289,6 +289,6 @@ def test_consistency_model_cd_onestep_flash_attn(self):
289289

290290
image_slice = image[0, -3:, -3:, -1]
291291

292-
expected_slice = np.array([0.1663, 0.1948, 0.2275, 0.1680, 0.1204, 0.1245, 0.1858, 0.1338, 0.2095])
292+
expected_slice = np.array([0.1623, 0.2009, 0.2387, 0.1731, 0.1168, 0.1202, 0.2031, 0.1327, 0.2447])
293293

294294
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3

0 commit comments

Comments
 (0)