diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 368ea30a26..704fb4d529 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -132,6 +132,8 @@
   - sections:
     - local: api/models
       title: Models
+    - local: api/attnprocessor
+      title: Attention Processor
     - local: api/diffusion_pipeline
       title: Diffusion Pipeline
     - local: api/logging
diff --git a/docs/source/en/api/attnprocessor.mdx b/docs/source/en/api/attnprocessor.mdx
new file mode 100644
index 0000000000..ead639feff
--- /dev/null
+++ b/docs/source/en/api/attnprocessor.mdx
@@ -0,0 +1,39 @@
+# Attention Processor
+
+An attention processor is a class for applying different types of attention mechanisms.
+
+## AttnProcessor
+[[autodoc]] models.attention_processor.AttnProcessor
+
+## AttnProcessor2_0
+[[autodoc]] models.attention_processor.AttnProcessor2_0
+
+## LoRAAttnProcessor
+[[autodoc]] models.attention_processor.LoRAAttnProcessor
+
+## CustomDiffusionAttnProcessor
+[[autodoc]] models.attention_processor.CustomDiffusionAttnProcessor
+
+## AttnAddedKVProcessor
+[[autodoc]] models.attention_processor.AttnAddedKVProcessor
+
+## AttnAddedKVProcessor2_0
+[[autodoc]] models.attention_processor.AttnAddedKVProcessor2_0
+
+## LoRAAttnAddedKVProcessor
+[[autodoc]] models.attention_processor.LoRAAttnAddedKVProcessor
+
+## XFormersAttnProcessor
+[[autodoc]] models.attention_processor.XFormersAttnProcessor
+
+## LoRAXFormersAttnProcessor
+[[autodoc]] models.attention_processor.LoRAXFormersAttnProcessor
+
+## CustomDiffusionXFormersAttnProcessor
+[[autodoc]] models.attention_processor.CustomDiffusionXFormersAttnProcessor
+
+## SlicedAttnProcessor
+[[autodoc]] models.attention_processor.SlicedAttnProcessor
+
+## SlicedAttnAddedKVProcessor
+[[autodoc]] models.attention_processor.SlicedAttnAddedKVProcessor
\ No newline at end of file
diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py
index acdee10c76..4b65d164bd 100644
--- a/src/diffusers/models/attention_processor.py
+++ b/src/diffusers/models/attention_processor.py
@@ -431,6 +431,10 @@ class Attention(nn.Module):
 
 
 class AttnProcessor:
+    r"""
+    Default processor for performing attention-related computations.
+    """
+
     def __call__(
         self,
         attn: Attention,
@@ -516,6 +520,18 @@ class LoRALinearLayer(nn.Module):
 
 
 class LoRAAttnProcessor(nn.Module):
+    r"""
+    Processor for implementing the LoRA attention mechanism.
+
+    Args:
+        hidden_size (`int`, *optional*):
+            The hidden size of the attention layer.
+        cross_attention_dim (`int`, *optional*):
+            The number of channels in the `encoder_hidden_states`.
+        rank (`int`, defaults to 4):
+            The dimension of the LoRA update matrices.
+    """
+
     def __init__(self, hidden_size, cross_attention_dim=None, rank=4):
         super().__init__()
 
@@ -580,6 +596,24 @@ class LoRAAttnProcessor(nn.Module):
 
 
 class CustomDiffusionAttnProcessor(nn.Module):
+    r"""
+    Processor for implementing attention for the Custom Diffusion method.
+
+    Args:
+        train_kv (`bool`, defaults to `True`):
+            Whether to newly train the key and value matrices corresponding to the text features.
+        train_q_out (`bool`, defaults to `True`):
+            Whether to newly train the query matrices corresponding to the latent image features.
+        hidden_size (`int`, *optional*, defaults to `None`):
+            The hidden size of the attention layer.
+        cross_attention_dim (`int`, *optional*, defaults to `None`):
+            The number of channels in the `encoder_hidden_states`.
+        out_bias (`bool`, defaults to `True`):
+            Whether to include the bias parameter in `train_q_out`.
+        dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability to use.
+    """
+
     def __init__(
         self,
         train_kv=True,
@@ -658,6 +692,11 @@ class CustomDiffusionAttnProcessor(nn.Module):
 
 
 class AttnAddedKVProcessor:
+    r"""
+    Processor for performing attention-related computations with extra learnable key and value matrices for the text
+    encoder.
+    """
+
     def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None):
         residual = hidden_states
         hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2)
@@ -707,6 +746,11 @@ class AttnAddedKVProcessor:
 
 
 class AttnAddedKVProcessor2_0:
+    r"""
+    Processor for performing scaled dot-product attention (enabled by default if you're using PyTorch 2.0), with extra
+    learnable key and value matrices for the text encoder.
+    """
+
     def __init__(self):
         if not hasattr(F, "scaled_dot_product_attention"):
             raise ImportError(
@@ -765,6 +809,19 @@ class AttnAddedKVProcessor2_0:
 
 
 class LoRAAttnAddedKVProcessor(nn.Module):
+    r"""
+    Processor for implementing the LoRA attention mechanism with extra learnable key and value matrices for the text
+    encoder.
+
+    Args:
+        hidden_size (`int`, *optional*):
+            The hidden size of the attention layer.
+        cross_attention_dim (`int`, *optional*, defaults to `None`):
+            The number of channels in the `encoder_hidden_states`.
+        rank (`int`, defaults to 4):
+            The dimension of the LoRA update matrices.
+    """
+
     def __init__(self, hidden_size, cross_attention_dim=None, rank=4):
         super().__init__()
 
@@ -832,6 +889,17 @@ class LoRAAttnAddedKVProcessor(nn.Module):
 
 
 class XFormersAttnProcessor:
+    r"""
+    Processor for implementing memory efficient attention using xFormers.
+
+    Args:
+        attention_op (`Callable`, *optional*, defaults to `None`):
+            The base
+            [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to
+            use as the attention operator. It is recommended to set this to `None` and allow xFormers to choose the best
+            operator.
+    """
+
     def __init__(self, attention_op: Optional[Callable] = None):
         self.attention_op = attention_op
 
@@ -905,6 +973,10 @@ class XFormersAttnProcessor:
 
 
 class AttnProcessor2_0:
+    r"""
+    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
+    """
+
     def __init__(self):
         if not hasattr(F, "scaled_dot_product_attention"):
             raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
@@ -983,6 +1055,23 @@ class AttnProcessor2_0:
 
 
 class LoRAXFormersAttnProcessor(nn.Module):
+    r"""
+    Processor for implementing the LoRA attention mechanism with memory efficient attention using xFormers.
+
+    Args:
+        hidden_size (`int`, *optional*):
+            The hidden size of the attention layer.
+        cross_attention_dim (`int`, *optional*):
+            The number of channels in the `encoder_hidden_states`.
+        rank (`int`, defaults to 4):
+            The dimension of the LoRA update matrices.
+        attention_op (`Callable`, *optional*, defaults to `None`):
+            The base
+            [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to
+            use as the attention operator. It is recommended to set this to `None` and allow xFormers to choose the best
+            operator.
+ """ + def __init__(self, hidden_size, cross_attention_dim, rank=4, attention_op: Optional[Callable] = None): super().__init__() @@ -1049,6 +1138,28 @@ class LoRAXFormersAttnProcessor(nn.Module): class CustomDiffusionXFormersAttnProcessor(nn.Module): + r""" + Processor for implementing memory efficient attention using xFormers for the Custom Diffusion method. + + Args: + train_kv (`bool`, defaults to `True`): + Whether to newly train the key and value matrices corresponding to the text features. + train_q_out (`bool`, defaults to `True`): + Whether to newly train query matrices corresponding to the latent image features. + hidden_size (`int`, *optional*, defaults to `None`): + The hidden size of the attention layer. + cross_attention_dim (`int`, *optional*, defaults to `None`): + The number of channels in the `encoder_hidden_states`. + out_bias (`bool`, defaults to `True`): + Whether to include the bias parameter in `train_q_out`. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probability to use. + attention_op (`Callable`, *optional*, defaults to `None`): + The base + [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to use + as the attention operator. It is recommended to set to `None`, and allow xFormers to choose the best operator. + """ + def __init__( self, train_kv=True, @@ -1134,6 +1245,15 @@ class CustomDiffusionXFormersAttnProcessor(nn.Module): class SlicedAttnProcessor: + r""" + Processor for implementing sliced attention. + + Args: + slice_size (`int`, *optional*): + The number of steps to compute attention. Uses as many slices as `attention_head_dim // slice_size`, and + `attention_head_dim` must be a multiple of the `slice_size`. + """ + def __init__(self, slice_size): self.slice_size = slice_size @@ -1206,6 +1326,15 @@ class SlicedAttnProcessor: class SlicedAttnAddedKVProcessor: + r""" + Processor for implementing sliced attention with extra learnable key and value matrices for the text encoder. + + Args: + slice_size (`int`, *optional*): + The number of steps to compute attention. Uses as many slices as `attention_head_dim // slice_size`, and + `attention_head_dim` must be a multiple of the `slice_size`. + """ + def __init__(self, slice_size): self.slice_size = slice_size