
[modular] some small fix (#12307)

* fix

* add mellon node registry

* style

* update docstring to include more info!

* support custom node mellon

* HTTPError -> HfHubHTTPError

* up

* Update src/diffusers/modular_pipelines/qwenimage/node_utils.py

Author: YiYi Xu
Date: 2025-09-29 11:42:34 -10:00
Committed by: GitHub
Parent: c07fcf780a
Commit: 76d4e416bc
20 changed files with 1107 additions and 741 deletions

View File

@@ -32,6 +32,8 @@ class FluxModularPipeline(ModularPipeline, FluxLoraLoaderMixin, TextualInversion
</Tip>
"""
default_blocks_name = "FluxAutoBlocks"
@property
def default_height(self):
return self.default_sample_size * self.vae_scale_factor
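# Illustrative (values assumed): with default_sample_size=128 and vae_scale_factor=8,
# default_height resolves to 128 * 8 = 1024 pixels.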

View File

@@ -0,0 +1,763 @@
import json
import logging
import os
# Simple typed wrapper for parameter overrides
from dataclasses import asdict, dataclass
from typing import Any, Dict, List, Optional, Tuple, Union
from huggingface_hub import create_repo, hf_hub_download
from huggingface_hub.utils import (
EntryNotFoundError,
HfHubHTTPError,
RepositoryNotFoundError,
RevisionNotFoundError,
validate_hf_hub_args,
)
from ..utils import HUGGINGFACE_CO_RESOLVE_ENDPOINT, PushToHubMixin, extract_commit_hash
from .modular_pipeline import ModularPipelineBlocks
logger = logging.getLogger(__name__)
SUPPORTED_NODE_TYPES = {"controlnet", "vae_encoder", "denoise", "text_encoder", "decoder"}
# Mellon Input Parameters (runtime parameters, not models)
MELLON_INPUT_PARAMS = {
# controlnet
"control_image": {
"label": "Control Image",
"type": "image",
"display": "input",
},
"controlnet_conditioning_scale": {
"label": "Scale",
"type": "float",
"default": 0.5,
"min": 0,
"max": 1,
},
"control_guidance_end": {
"label": "End",
"type": "float",
"default": 1.0,
"min": 0,
"max": 1,
},
"control_guidance_start": {
"label": "Start",
"type": "float",
"default": 0.0,
"min": 0,
"max": 1,
},
"controlnet": {
"label": "Controlnet",
"type": "custom_controlnet",
"display": "input",
},
"embeddings": {
"label": "Text Embeddings",
"display": "input",
"type": "embeddings",
},
"image": {
"label": "Image",
"type": "image",
"display": "input",
},
"negative_prompt": {
"label": "Negative Prompt",
"type": "string",
"default": "",
"display": "textarea",
},
"prompt": {
"label": "Prompt",
"type": "string",
"default": "",
"display": "textarea",
},
"guidance_scale": {
"label": "Guidance Scale",
"type": "float",
"display": "slider",
"default": 5,
"min": 1.0,
"max": 30.0,
"step": 0.1,
},
"height": {
"label": "Height",
"type": "int",
"default": 1024,
"min": 64,
"step": 8,
},
"image_latents": {
"label": "Image Latents",
"type": "latents",
"display": "input",
"onChange": {False: ["height", "width"], True: ["strength"]},
},
"latents": {
"label": "Latents",
"type": "latents",
"display": "input",
},
"num_inference_steps": {
"label": "Steps",
"type": "int",
"display": "slider",
"default": 25,
"min": 1,
"max": 100,
},
"seed": {
"label": "Seed",
"type": "int",
"display": "random",
"default": 0,
"min": 0,
"max": 4294967295,
},
"strength": {
"label": "Strength",
"type": "float",
"default": 0.5,
"min": 0.0,
"max": 1.0,
"step": 0.01,
},
"width": {
"label": "Width",
"type": "int",
"default": 1024,
"min": 64,
"step": 8,
},
"ip_adapter": {
"label": "IP Adapter",
"type": "custom_ip_adapter",
"display": "input",
},
}
# Mellon Model Parameters (diffusers_auto_model types)
MELLON_MODEL_PARAMS = {
"scheduler": {
"label": "Scheduler",
"display": "input",
"type": "diffusers_auto_model",
},
"text_encoders": {
"label": "Text Encoders",
"type": "diffusers_auto_models",
"display": "input",
},
"unet": {
"label": "Unet",
"display": "input",
"type": "diffusers_auto_model",
"onSignal": {
"action": "signal",
"target": "guider",
},
},
"guider": {
"label": "Guider",
"display": "input",
"type": "custom_guider",
"onChange": {False: ["guidance_scale"], True: []},
},
"vae": {
"label": "VAE",
"display": "input",
"type": "diffusers_auto_model",
},
"controlnet": {
"label": "Controlnet Model",
"type": "diffusers_auto_model",
"display": "input",
},
}
# Mellon Output Parameters (display = "output")
MELLON_OUTPUT_PARAMS = {
"embeddings": {
"label": "Text Embeddings",
"display": "output",
"type": "embeddings",
},
"images": {
"label": "Images",
"type": "image",
"display": "output",
},
"image_latents": {
"label": "Image Latents",
"type": "latents",
"display": "output",
},
"latents": {
"label": "Latents",
"type": "latents",
"display": "output",
},
"latents_preview": {
"label": "Latents Preview",
"display": "output",
"type": "latent",
},
"controlnet_out": {
"label": "Controlnet",
"display": "output",
"type": "controlnet",
},
}
# Default param selections per supported node_type
# from MELLON_INPUT_PARAMS / MELLON_MODEL_PARAMS / MELLON_OUTPUT_PARAMS.
NODE_TYPE_PARAMS_MAP = {
"controlnet": {
"inputs": [
"control_image",
"controlnet_conditioning_scale",
"control_guidance_start",
"control_guidance_end",
"height",
"width",
],
"model_inputs": [
"controlnet",
"vae",
],
"outputs": [
"controlnet",
],
"block_names": ["controlnet_vae_encoder"],
},
"denoise": {
"inputs": [
"embeddings",
"width",
"height",
"seed",
"num_inference_steps",
"guidance_scale",
"image_latents",
"strength",
# custom adapters coming in as inputs
"controlnet",
# ip_adapter is optional and custom; include if available
"ip_adapter",
],
"model_inputs": [
"unet",
"guider",
"scheduler",
],
"outputs": [
"latents",
"latents_preview",
],
"block_names": ["denoise"],
},
"vae_encoder": {
"inputs": [
"image",
"width",
"height",
],
"model_inputs": [
"vae",
],
"outputs": [
"image_latents",
],
"block_names": ["vae_encoder"],
},
"text_encoder": {
"inputs": [
"prompt",
"negative_prompt",
# optional image prompt input supported in embeddings node
"image",
],
"model_inputs": [
"text_encoders",
],
"outputs": [
"embeddings",
],
"block_names": ["text_encoder"],
},
"decoder": {
"inputs": [
"latents",
],
"model_inputs": [
"vae",
],
"outputs": [
"images",
],
"block_names": ["decode"],
},
}
@dataclass(frozen=True)
class MellonParam:
name: str
label: str
type: str
display: Optional[str] = None
default: Any = None
min: Optional[float] = None
max: Optional[float] = None
step: Optional[float] = None
options: Any = None
value: Any = None
fieldOptions: Optional[Dict[str, Any]] = None
onChange: Any = None
onSignal: Any = None
_map_to_input: Any = None # the block input name this parameter maps to
def to_dict(self) -> Dict[str, Any]:
data = asdict(self)
return {k: v for k, v in data.items() if not k.startswith("_") and v is not None}
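# Illustrative: MellonParam(name="strength", label="Strength", type="float", default=0.5).to_dict()
# returns {"name": "strength", "label": "Strength", "type": "float", "default": 0.5};
# None-valued fields and underscore-prefixed fields such as _map_to_input are dropped.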
@dataclass
class MellonNodeConfig(PushToHubMixin):
"""
A MellonNodeConfig is a base class for building Mellon node UIs with Modular Diffusers.
<Tip warning={true}>
This is an experimental feature and is likely to change in the future.
</Tip>
"""
inputs: List[Union[str, MellonParam]]
model_inputs: List[Union[str, MellonParam]]
outputs: List[Union[str, MellonParam]]
blocks_names: list[str]
node_type: str
config_name = "mellon_config.json"
def __post_init__(self):
if isinstance(self.inputs, list):
self.inputs = self._resolve_params_list(self.inputs, MELLON_INPUT_PARAMS)
if isinstance(self.model_inputs, list):
self.model_inputs = self._resolve_params_list(self.model_inputs, MELLON_MODEL_PARAMS)
if isinstance(self.outputs, list):
self.outputs = self._resolve_params_list(self.outputs, MELLON_OUTPUT_PARAMS)
@staticmethod
def _resolve_params_list(
params: List[Union[str, MellonParam]], default_map: Dict[str, Dict[str, Any]]
) -> Dict[str, Dict[str, Any]]:
def _resolve_param(
param: Union[str, MellonParam], default_params_map: Dict[str, Dict[str, Any]]
) -> Tuple[str, Dict[str, Any]]:
if isinstance(param, str):
if param not in default_params_map:
raise ValueError(f"Unknown param '{param}', please define a `MellonParam` object instead")
return param, default_params_map[param].copy()
elif isinstance(param, MellonParam):
param_dict = param.to_dict()
param_name = param_dict.pop("name")
return param_name, param_dict
else:
raise ValueError(
f"Unknown param type '{type(param)}', please use a string or a `MellonParam` object instead"
)
resolved = {}
for p in params:
logger.info(f" Resolving param: {p}")
name, cfg = _resolve_param(p, default_map)
if name in resolved:
raise ValueError(f"Duplicate param '{name}'")
resolved[name] = cfg
return resolved
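# Illustrative: _resolve_params_list(["prompt"], MELLON_INPUT_PARAMS) returns
# {"prompt": {"label": "Prompt", "type": "string", "default": "", "display": "textarea"}};
# a MellonParam entry instead contributes {param.name: param.to_dict() without "name"}.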
@classmethod
@validate_hf_hub_args
def load_mellon_config(
cls,
pretrained_model_name_or_path: Union[str, os.PathLike],
return_unused_kwargs=False,
return_commit_hash=False,
**kwargs,
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
r"""
Load a Mellon node configuration.
Parameters:
pretrained_model_name_or_path (`str` or `os.PathLike`):
Can be either:
- A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
the Hub.
- A path to a *directory* (for example `./my_model_directory`) containing a Mellon node config saved
with [`~MellonNodeConfig.save_mellon_config`].
cache_dir (`Union[str, os.PathLike]`, *optional*):
Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
is not used.
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
local_files_only (`bool`, *optional*, defaults to `False`):
Whether to only load local model weights and configuration files or not. If set to `True`, the model
won't be downloaded from the Hub.
token (`str` or *bool*, *optional*):
The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
`diffusers-cli login` (stored in `~/.huggingface`) is used.
revision (`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
allowed by Git.
subfolder (`str`, *optional*, defaults to `""`):
The subfolder location of a model file within a larger model repository on the Hub or locally.
return_unused_kwargs (`bool`, *optional*, defaults to `False`):
Whether unused keyword arguments of the config are returned.
return_commit_hash (`bool`, *optional*, defaults to `False`):
Whether the `commit_hash` of the loaded configuration is returned.
Returns:
`dict`:
A dictionary of all the parameters stored in a JSON configuration file.
"""
cache_dir = kwargs.pop("cache_dir", None)
local_dir = kwargs.pop("local_dir", None)
local_dir_use_symlinks = kwargs.pop("local_dir_use_symlinks", "auto")
force_download = kwargs.pop("force_download", False)
proxies = kwargs.pop("proxies", None)
token = kwargs.pop("token", None)
local_files_only = kwargs.pop("local_files_only", False)
revision = kwargs.pop("revision", None)
pretrained_model_name_or_path = str(pretrained_model_name_or_path)
if cls.config_name is None:
raise ValueError(
"`self.config_name` is not defined. Note that one should not load a config from "
"`ConfigMixin`. Please make sure to define `config_name` in a class inheriting from `ConfigMixin`"
)
if os.path.isfile(pretrained_model_name_or_path):
config_file = pretrained_model_name_or_path
elif os.path.isdir(pretrained_model_name_or_path):
if os.path.isfile(os.path.join(pretrained_model_name_or_path, cls.config_name)):
# Load from a local directory
config_file = os.path.join(pretrained_model_name_or_path, cls.config_name)
else:
raise EnvironmentError(
f"Error no file named {cls.config_name} found in directory {pretrained_model_name_or_path}."
)
else:
try:
# Load from URL or cache if already cached
config_file = hf_hub_download(
pretrained_model_name_or_path,
filename=cls.config_name,
cache_dir=cache_dir,
force_download=force_download,
proxies=proxies,
local_files_only=local_files_only,
token=token,
revision=revision,
local_dir=local_dir,
local_dir_use_symlinks=local_dir_use_symlinks,
)
except RepositoryNotFoundError:
raise EnvironmentError(
f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier"
" listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a"
" token having permission to this repo with `token` or log in with `hf auth login`."
)
except RevisionNotFoundError:
raise EnvironmentError(
f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for"
" this model name. Check the model page at"
f" 'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions."
)
except EntryNotFoundError:
raise EnvironmentError(
f"{pretrained_model_name_or_path} does not appear to have a file named {cls.config_name}."
)
except HfHubHTTPError as err:
raise EnvironmentError(
"There was a specific connection error when trying to load"
f" {pretrained_model_name_or_path}:\n{err}"
)
except ValueError:
raise EnvironmentError(
f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it"
f" in the cached files and it looks like {pretrained_model_name_or_path} is not the path to a"
f" directory containing a {cls.config_name} file.\nCheckout your internet connection or see how to"
" run the library in offline mode at"
" 'https://huggingface.co/docs/diffusers/installation#offline-mode'."
)
except EnvironmentError:
raise EnvironmentError(
f"Can't load config for '{pretrained_model_name_or_path}'. If you were trying to load it from "
"'https://huggingface.co/models', make sure you don't have a local directory with the same name. "
f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
f"containing a {cls.config_name} file"
)
try:
with open(config_file, "r", encoding="utf-8") as reader:
text = reader.read()
config_dict = json.loads(text)
commit_hash = extract_commit_hash(config_file)
except (json.JSONDecodeError, UnicodeDecodeError):
raise EnvironmentError(f"It looks like the config file at '{config_file}' is not a valid JSON file.")
if not (return_unused_kwargs or return_commit_hash):
return config_dict
outputs = (config_dict,)
if return_unused_kwargs:
outputs += (kwargs,)
if return_commit_hash:
outputs += (commit_hash,)
return outputs
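# Illustrative usage (repo id assumed):
# config_dict = MellonNodeConfig.load_mellon_config("some-user/my-mellon-node")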
def save_mellon_config(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
"""
Save the Mellon node definition to a JSON file.
Args:
save_directory (`str` or `os.PathLike`):
Directory where the configuration JSON file is saved (will be created if it does not exist).
push_to_hub (`bool`, *optional*, defaults to `False`):
Whether or not to push your model to the Hugging Face Hub after saving it. You can specify the
repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
namespace).
kwargs (`Dict[str, Any]`, *optional*):
Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
"""
if os.path.isfile(save_directory):
raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
os.makedirs(save_directory, exist_ok=True)
# If we save using the predefined names, we can load using `load_mellon_config`
output_config_file = os.path.join(save_directory, self.config_name)
self.to_json_file(output_config_file)
logger.info(f"Mellon node definition saved in {output_config_file}")
if push_to_hub:
commit_message = kwargs.pop("commit_message", None)
private = kwargs.pop("private", None)
create_pr = kwargs.pop("create_pr", False)
token = kwargs.pop("token", None)
repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
repo_id = create_repo(repo_id, exist_ok=True, private=private, token=token).repo_id
subfolder = kwargs.pop("subfolder", None)
self._upload_folder(
save_directory,
repo_id,
token=token,
commit_message=commit_message,
create_pr=create_pr,
subfolder=subfolder,
)
def to_json_file(self, json_file_path: Union[str, os.PathLike]):
"""
Save the Mellon schema dictionary to a JSON file.
Args:
json_file_path (`str` or `os.PathLike`):
Path to the JSON file to save a configuration instance's parameters.
"""
with open(json_file_path, "w", encoding="utf-8") as writer:
writer.write(self.to_json_string())
def to_json_string(self) -> str:
"""
Serializes this instance to a JSON string of the Mellon schema dict.
Returns:
`str`: String containing all the attributes that make up this configuration instance in JSON format.
"""
mellon_dict = self.to_mellon_dict()
return json.dumps(mellon_dict, indent=2, sort_keys=True) + "\n"
def to_mellon_dict(self) -> Dict[str, Any]:
"""Return a JSON-serializable dict focusing on the Mellon schema fields only.
params is a single flat dict composed as: {**inputs, **model_inputs, **outputs}.
"""
# inputs/model_inputs/outputs are already normalized dicts
merged_params = {}
merged_params.update(self.inputs or {})
merged_params.update(self.model_inputs or {})
merged_params.update(self.outputs or {})
return {
"node_type": self.node_type,
"blocks_names": self.blocks_names,
"params": merged_params,
}
@classmethod
def from_mellon_dict(cls, mellon_dict: Dict[str, Any]) -> "MellonNodeConfig":
"""Create a config from a Mellon schema dict produced by to_mellon_dict().
Splits the flat params dict back into inputs/model_inputs/outputs using the known key spaces from
MELLON_INPUT_PARAMS / MELLON_MODEL_PARAMS / MELLON_OUTPUT_PARAMS. Unknown keys are treated as inputs by
default.
"""
flat_params = mellon_dict.get("params", {})
inputs: Dict[str, Any] = {}
model_inputs: Dict[str, Any] = {}
outputs: Dict[str, Any] = {}
for param_name, param_dict in flat_params.items():
if param_dict.get("display", "") == "output":
outputs[param_name] = param_dict
elif param_dict.get("type", "") in ("diffusers_auto_model", "diffusers_auto_models"):
model_inputs[param_name] = param_dict
else:
inputs[param_name] = param_dict
return cls(
inputs=inputs,
model_inputs=model_inputs,
outputs=outputs,
blocks_names=mellon_dict.get("blocks_names", []),
node_type=mellon_dict.get("node_type"),
)
# YiYi Notes: not used yet
@classmethod
def from_blocks(cls, blocks: ModularPipelineBlocks, node_type: str) -> "MellonNodeConfig":
"""
Create an instance from a ModularPipelineBlocks object. If a preset exists in NODE_TYPE_PARAMS_MAP for the
node_type, use it; otherwise fall back to deriving the lists from the blocks' expected inputs/components/outputs.
"""
if node_type not in NODE_TYPE_PARAMS_MAP:
raise ValueError(f"Node type {node_type} not supported")
blocks_names = list(blocks.sub_blocks.keys())
default_node_config = NODE_TYPE_PARAMS_MAP[node_type]
inputs_list: List[Union[str, MellonParam]] = default_node_config.get("inputs", [])
model_inputs_list: List[Union[str, MellonParam]] = default_node_config.get("model_inputs", [])
outputs_list: List[Union[str, MellonParam]] = default_node_config.get("outputs", [])
for required_input_name in blocks.required_inputs:
if required_input_name not in inputs_list:
inputs_list.append(
MellonParam(
name=required_input_name, label=required_input_name, type=required_input_name, display="input"
)
)
for component_spec in blocks.expected_components:
if component_spec.name not in model_inputs_list:
model_inputs_list.append(
MellonParam(
name=component_spec.name,
label=component_spec.name,
type="diffusers_auto_model",
display="input",
)
)
return cls(
inputs=inputs_list,
model_inputs=model_inputs_list,
outputs=outputs_list,
blocks_names=blocks_names,
node_type=node_type,
)
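# Illustrative (hypothetical `blocks` object): MellonNodeConfig.from_blocks(blocks, node_type="denoise")
# starts from the "denoise" preset above, then appends any remaining required inputs and expected components.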
# Minimal modular registry for Mellon node configs
class ModularMellonNodeRegistry:
"""Registry mapping (pipeline class, blocks_name) -> list of MellonNodeConfig."""
def __init__(self):
self._registry = {}
self._initialized = False
def register(self, pipeline_cls: type, node_params: Dict[str, MellonNodeConfig]):
if not self._initialized:
_initialize_registry(self)
self._registry[pipeline_cls] = node_params
def get(self, pipeline_cls: type) -> Dict[str, MellonNodeConfig]:
if not self._initialized:
_initialize_registry(self)
return self._registry.get(pipeline_cls, None)
def get_all(self) -> Dict[type, Dict[str, MellonNodeConfig]]:
if not self._initialized:
_initialize_registry(self)
return self._registry
def _register_preset_node_types(
pipeline_cls, params_map: Dict[str, Dict[str, Any]], registry: ModularMellonNodeRegistry
):
"""Register all node-type presets for a given pipeline class from a params map."""
node_configs = {}
for node_type, spec in params_map.items():
node_config = MellonNodeConfig(
inputs=spec.get("inputs", []),
model_inputs=spec.get("model_inputs", []),
outputs=spec.get("outputs", []),
blocks_names=spec.get("block_names", []),
node_type=node_type,
)
node_configs[node_type] = node_config
registry.register(pipeline_cls, node_configs)
def _initialize_registry(registry: ModularMellonNodeRegistry):
"""Initialize the registry and register all available pipeline configs."""
print("Initializing registry")
registry._initialized = True
try:
from .qwenimage.modular_pipeline import QwenImageModularPipeline
from .qwenimage.node_utils import QwenImage_NODE_TYPES_PARAMS_MAP
_register_preset_node_types(QwenImageModularPipeline, QwenImage_NODE_TYPES_PARAMS_MAP, registry)
except Exception as e:
raise Exception("Failed to register QwenImageModularPipeline") from e
try:
from .stable_diffusion_xl.modular_pipeline import StableDiffusionXLModularPipeline
from .stable_diffusion_xl.node_utils import SDXL_NODE_TYPES_PARAMS_MAP
_register_preset_node_types(StableDiffusionXLModularPipeline, SDXL_NODE_TYPES_PARAMS_MAP, registry)
except Exception as e:
raise Exception("Failed to register StableDiffusionXLModularPipeline") from e

View File

@@ -51,6 +51,7 @@ if is_accelerate_available():
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
# map regular pipeline to modular pipeline class name
MODULAR_PIPELINE_MAPPING = OrderedDict(
[
("stable-diffusion-xl", "StableDiffusionXLModularPipeline"),
@@ -61,16 +62,6 @@ MODULAR_PIPELINE_MAPPING = OrderedDict(
]
)
MODULAR_PIPELINE_BLOCKS_MAPPING = OrderedDict(
[
("StableDiffusionXLModularPipeline", "StableDiffusionXLAutoBlocks"),
("WanModularPipeline", "WanAutoBlocks"),
("FluxModularPipeline", "FluxAutoBlocks"),
("QwenImageModularPipeline", "QwenImageAutoBlocks"),
("QwenImageEditModularPipeline", "QwenImageEditAutoBlocks"),
]
)
@dataclass
class PipelineState:
@@ -423,7 +414,7 @@ class ModularPipelineBlocks(ConfigMixin, PushToHubMixin):
state.set(input_param.name, param, input_param.kwargs_type)
elif input_param.kwargs_type:
# if it is a kwargs type, e.g. "guider_input_fields", it is likely to be a list of parameters
# if it is a kwargs type, e.g. "denoiser_input_fields", it is likely to be a list of parameters
# we need to first find out which inputs are and loop through them.
intermediate_kwargs = state.get_by_kwargs(input_param.kwargs_type)
for param_name, current_value in intermediate_kwargs.items():
@@ -1454,6 +1445,7 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
config_name = "modular_model_index.json"
hf_device_map = None
default_blocks_name = None
# YiYi TODO: add warning for passing multiple ComponentSpec/ConfigSpec with the same name
def __init__(
@@ -1514,7 +1506,7 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
`_blocks_class_name` in the config dict
"""
if blocks is None:
blocks_class_name = MODULAR_PIPELINE_BLOCKS_MAPPING.get(self.__class__.__name__)
blocks_class_name = self.default_blocks_name
if blocks_class_name is not None:
diffusers_module = importlib.import_module("diffusers")
blocks_class = getattr(diffusers_module, blocks_class_name)

View File

@@ -1,665 +0,0 @@
import json
import logging
import os
from pathlib import Path
from typing import List, Optional, Tuple, Union
import numpy as np
import PIL
import torch
from ..configuration_utils import ConfigMixin
from ..image_processor import PipelineImageInput
from .modular_pipeline import ModularPipelineBlocks, SequentialPipelineBlocks
from .modular_pipeline_utils import InputParam
logger = logging.getLogger(__name__)
# YiYi Notes: this is actually for SDXL, put it here for now
SDXL_INPUTS_SCHEMA = {
"prompt": InputParam(
"prompt", type_hint=Union[str, List[str]], description="The prompt or prompts to guide the image generation"
),
"prompt_2": InputParam(
"prompt_2",
type_hint=Union[str, List[str]],
description="The prompt or prompts to be sent to the tokenizer_2 and text_encoder_2",
),
"negative_prompt": InputParam(
"negative_prompt",
type_hint=Union[str, List[str]],
description="The prompt or prompts not to guide the image generation",
),
"negative_prompt_2": InputParam(
"negative_prompt_2",
type_hint=Union[str, List[str]],
description="The negative prompt or prompts for text_encoder_2",
),
"cross_attention_kwargs": InputParam(
"cross_attention_kwargs",
type_hint=Optional[dict],
description="Kwargs dictionary passed to the AttentionProcessor",
),
"clip_skip": InputParam(
"clip_skip", type_hint=Optional[int], description="Number of layers to skip in CLIP text encoder"
),
"image": InputParam(
"image",
type_hint=PipelineImageInput,
required=True,
description="The image(s) to modify for img2img or inpainting",
),
"mask_image": InputParam(
"mask_image",
type_hint=PipelineImageInput,
required=True,
description="Mask image for inpainting, white pixels will be repainted",
),
"generator": InputParam(
"generator",
type_hint=Optional[Union[torch.Generator, List[torch.Generator]]],
description="Generator(s) for deterministic generation",
),
"height": InputParam("height", type_hint=Optional[int], description="Height in pixels of the generated image"),
"width": InputParam("width", type_hint=Optional[int], description="Width in pixels of the generated image"),
"num_images_per_prompt": InputParam(
"num_images_per_prompt", type_hint=int, default=1, description="Number of images to generate per prompt"
),
"num_inference_steps": InputParam(
"num_inference_steps", type_hint=int, default=50, description="Number of denoising steps"
),
"timesteps": InputParam(
"timesteps", type_hint=Optional[torch.Tensor], description="Custom timesteps for the denoising process"
),
"sigmas": InputParam(
"sigmas", type_hint=Optional[torch.Tensor], description="Custom sigmas for the denoising process"
),
"denoising_end": InputParam(
"denoising_end",
type_hint=Optional[float],
description="Fraction of denoising process to complete before termination",
),
# YiYi Notes: img2img defaults to 0.3, inpainting defaults to 0.9999
"strength": InputParam(
"strength", type_hint=float, default=0.3, description="How much to transform the reference image"
),
"denoising_start": InputParam(
"denoising_start", type_hint=Optional[float], description="Starting point of the denoising process"
),
"latents": InputParam(
"latents", type_hint=Optional[torch.Tensor], description="Pre-generated noisy latents for image generation"
),
"padding_mask_crop": InputParam(
"padding_mask_crop",
type_hint=Optional[Tuple[int, int]],
description="Size of margin in crop for image and mask",
),
"original_size": InputParam(
"original_size",
type_hint=Optional[Tuple[int, int]],
description="Original size of the image for SDXL's micro-conditioning",
),
"target_size": InputParam(
"target_size", type_hint=Optional[Tuple[int, int]], description="Target size for SDXL's micro-conditioning"
),
"negative_original_size": InputParam(
"negative_original_size",
type_hint=Optional[Tuple[int, int]],
description="Negative conditioning based on image resolution",
),
"negative_target_size": InputParam(
"negative_target_size",
type_hint=Optional[Tuple[int, int]],
description="Negative conditioning based on target resolution",
),
"crops_coords_top_left": InputParam(
"crops_coords_top_left",
type_hint=Tuple[int, int],
default=(0, 0),
description="Top-left coordinates for SDXL's micro-conditioning",
),
"negative_crops_coords_top_left": InputParam(
"negative_crops_coords_top_left",
type_hint=Tuple[int, int],
default=(0, 0),
description="Negative conditioning crop coordinates",
),
"aesthetic_score": InputParam(
"aesthetic_score", type_hint=float, default=6.0, description="Simulates aesthetic score of generated image"
),
"negative_aesthetic_score": InputParam(
"negative_aesthetic_score", type_hint=float, default=2.0, description="Simulates negative aesthetic score"
),
"eta": InputParam("eta", type_hint=float, default=0.0, description="Parameter η in the DDIM paper"),
"output_type": InputParam(
"output_type", type_hint=str, default="pil", description="Output format (pil/tensor/np.array)"
),
"ip_adapter_image": InputParam(
"ip_adapter_image",
type_hint=PipelineImageInput,
required=True,
description="Image(s) to be used as IP adapter",
),
"control_image": InputParam(
"control_image", type_hint=PipelineImageInput, required=True, description="ControlNet input condition"
),
"control_guidance_start": InputParam(
"control_guidance_start",
type_hint=Union[float, List[float]],
default=0.0,
description="When ControlNet starts applying",
),
"control_guidance_end": InputParam(
"control_guidance_end",
type_hint=Union[float, List[float]],
default=1.0,
description="When ControlNet stops applying",
),
"controlnet_conditioning_scale": InputParam(
"controlnet_conditioning_scale",
type_hint=Union[float, List[float]],
default=1.0,
description="Scale factor for ControlNet outputs",
),
"guess_mode": InputParam(
"guess_mode",
type_hint=bool,
default=False,
description="Enables ControlNet encoder to recognize input without prompts",
),
"control_mode": InputParam(
"control_mode", type_hint=List[int], required=True, description="Control mode for union controlnet"
),
}
SDXL_INTERMEDIATE_INPUTS_SCHEMA = {
"prompt_embeds": InputParam(
"prompt_embeds",
type_hint=torch.Tensor,
required=True,
description="Text embeddings used to guide image generation",
),
"negative_prompt_embeds": InputParam(
"negative_prompt_embeds", type_hint=torch.Tensor, description="Negative text embeddings"
),
"pooled_prompt_embeds": InputParam(
"pooled_prompt_embeds", type_hint=torch.Tensor, required=True, description="Pooled text embeddings"
),
"negative_pooled_prompt_embeds": InputParam(
"negative_pooled_prompt_embeds", type_hint=torch.Tensor, description="Negative pooled text embeddings"
),
"batch_size": InputParam("batch_size", type_hint=int, required=True, description="Number of prompts"),
"dtype": InputParam("dtype", type_hint=torch.dtype, description="Data type of model tensor inputs"),
"preprocess_kwargs": InputParam(
"preprocess_kwargs", type_hint=Optional[dict], description="Kwargs for ImageProcessor"
),
"latents": InputParam(
"latents", type_hint=torch.Tensor, required=True, description="Initial latents for denoising process"
),
"timesteps": InputParam("timesteps", type_hint=torch.Tensor, required=True, description="Timesteps for inference"),
"num_inference_steps": InputParam(
"num_inference_steps", type_hint=int, required=True, description="Number of denoising steps"
),
"latent_timestep": InputParam(
"latent_timestep", type_hint=torch.Tensor, required=True, description="Initial noise level timestep"
),
"image_latents": InputParam(
"image_latents", type_hint=torch.Tensor, required=True, description="Latents representing reference image"
),
"mask": InputParam("mask", type_hint=torch.Tensor, required=True, description="Mask for inpainting"),
"masked_image_latents": InputParam(
"masked_image_latents", type_hint=torch.Tensor, description="Masked image latents for inpainting"
),
"add_time_ids": InputParam(
"add_time_ids", type_hint=torch.Tensor, required=True, description="Time ids for conditioning"
),
"negative_add_time_ids": InputParam(
"negative_add_time_ids", type_hint=torch.Tensor, description="Negative time ids"
),
"timestep_cond": InputParam("timestep_cond", type_hint=torch.Tensor, description="Timestep conditioning for LCM"),
"noise": InputParam("noise", type_hint=torch.Tensor, description="Noise added to image latents"),
"crops_coords": InputParam("crops_coords", type_hint=Optional[Tuple[int]], description="Crop coordinates"),
"ip_adapter_embeds": InputParam(
"ip_adapter_embeds", type_hint=List[torch.Tensor], description="Image embeddings for IP-Adapter"
),
"negative_ip_adapter_embeds": InputParam(
"negative_ip_adapter_embeds",
type_hint=List[torch.Tensor],
description="Negative image embeddings for IP-Adapter",
),
"images": InputParam(
"images",
type_hint=Union[List[PIL.Image.Image], List[torch.Tensor], List[np.array]],
required=True,
description="Generated images",
),
}
SDXL_PARAM_SCHEMA = {**SDXL_INPUTS_SCHEMA, **SDXL_INTERMEDIATE_INPUTS_SCHEMA}
DEFAULT_PARAM_MAPS = {
"prompt": {
"label": "Prompt",
"type": "string",
"default": "a bear sitting in a chair drinking a milkshake",
"display": "textarea",
},
"negative_prompt": {
"label": "Negative Prompt",
"type": "string",
"default": "deformed, ugly, wrong proportion, low res, bad anatomy, worst quality, low quality",
"display": "textarea",
},
"num_inference_steps": {
"label": "Steps",
"type": "int",
"default": 25,
"min": 1,
"max": 1000,
},
"seed": {
"label": "Seed",
"type": "int",
"default": 0,
"min": 0,
"display": "random",
},
"width": {
"label": "Width",
"type": "int",
"display": "text",
"default": 1024,
"min": 8,
"max": 8192,
"step": 8,
"group": "dimensions",
},
"height": {
"label": "Height",
"type": "int",
"display": "text",
"default": 1024,
"min": 8,
"max": 8192,
"step": 8,
"group": "dimensions",
},
"images": {
"label": "Images",
"type": "image",
"display": "output",
},
"image": {
"label": "Image",
"type": "image",
"display": "input",
},
}
DEFAULT_TYPE_MAPS = {
"int": {
"type": "int",
"default": 0,
"min": 0,
},
"float": {
"type": "float",
"default": 0.0,
"min": 0.0,
},
"str": {
"type": "string",
"default": "",
},
"bool": {
"type": "boolean",
"default": False,
},
"image": {
"type": "image",
},
}
DEFAULT_MODEL_KEYS = ["unet", "vae", "text_encoder", "tokenizer", "controlnet", "transformer", "image_encoder"]
DEFAULT_CATEGORY = "Modular Diffusers"
DEFAULT_EXCLUDE_MODEL_KEYS = ["processor", "feature_extractor", "safety_checker"]
DEFAULT_PARAMS_GROUPS_KEYS = {
"text_encoders": ["text_encoder", "tokenizer"],
"ip_adapter_embeds": ["ip_adapter_embeds"],
"prompt_embeddings": ["prompt_embeds"],
}
def get_group_name(name, group_params_keys=DEFAULT_PARAMS_GROUPS_KEYS):
"""
Get the group name for a given parameter name; if it is not part of a group, return None. e.g. "prompt_embeds" ->
"prompt_embeddings", "text_encoder" -> "text_encoders", "prompt" -> None
"""
if name is None:
return None
for group_name, group_keys in group_params_keys.items():
for group_key in group_keys:
if group_key in name:
return group_name
return None
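# For example, with DEFAULT_PARAMS_GROUPS_KEYS above:
# get_group_name("text_encoder") -> "text_encoders"
# get_group_name("prompt_embeds") -> "prompt_embeddings"
# get_group_name("prompt") -> None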
class ModularNode(ConfigMixin):
"""
A ModularNode is a base class for building UI nodes with diffusers; currently only Mellon is supported. It is a
wrapper around a ModularPipelineBlocks object.
<Tip warning={true}>
This is an experimental feature and is likely to change in the future.
</Tip>
"""
config_name = "node_config.json"
@classmethod
def from_pretrained(
cls,
pretrained_model_name_or_path: str,
trust_remote_code: Optional[bool] = None,
**kwargs,
):
blocks = ModularPipelineBlocks.from_pretrained(
pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
)
return cls(blocks, **kwargs)
def __init__(self, blocks, category=DEFAULT_CATEGORY, label=None, **kwargs):
self.blocks = blocks
if label is None:
label = self.blocks.__class__.__name__
# blocks param name -> mellon param name
self.name_mapping = {}
input_params = {}
# pass or create a default param dict for each input
# e.g. for prompt,
# prompt = {
# "name": "text_input", # the name of the input in node definition, could be different from the input name in diffusers
# "label": "Prompt",
# "type": "string",
# "default": "a bear sitting in a chair drinking a milkshake",
# "display": "textarea"}
# if type is not specified, it'll be a "custom" param of its own type
# e.g. you can pass ModularNode(scheduler = {name :"scheduler"})
# it will get this spec in node definition {"scheduler": {"label": "Scheduler", "type": "scheduler", "display": "input"}}
# name can be a dict, in that case, it is part of a "dict" input in mellon nodes, e.g. text_encoder= {name: {"text_encoders": "text_encoder"}}
inputs = self.blocks.inputs + self.blocks.intermediate_inputs
for inp in inputs:
param = kwargs.pop(inp.name, None)
if param:
# user can pass a param dict for all inputs, e.g. ModularNode(prompt = {...})
input_params[inp.name] = param
mellon_name = param.pop("name", inp.name)
if mellon_name != inp.name:
self.name_mapping[inp.name] = mellon_name
continue
if inp.name not in DEFAULT_PARAM_MAPS and not inp.required and not get_group_name(inp.name):
continue
if inp.name in DEFAULT_PARAM_MAPS:
# first check if it's in the default param map, if so, directly use that
param = DEFAULT_PARAM_MAPS[inp.name].copy()
elif get_group_name(inp.name):
param = get_group_name(inp.name)
if inp.name not in self.name_mapping:
self.name_mapping[inp.name] = param
else:
# if not, check if it's in the SDXL input schema, if so,
# 1. use the type hint to determine the type
# 2. use the default param dict for the type e.g. if "steps" is a "int" type, {"steps": {"type": "int", "default": 0, "min": 0}}
if inp.type_hint is not None:
type_str = str(inp.type_hint).lower()
else:
inp_spec = SDXL_PARAM_SCHEMA.get(inp.name, None)
type_str = str(inp_spec.type_hint).lower() if inp_spec else ""
for type_key, type_param in DEFAULT_TYPE_MAPS.items():
if type_key in type_str:
param = type_param.copy()
param["label"] = inp.name
param["display"] = "input"
break
else:
param = inp.name
# add the param dict to the inp_params dict
input_params[inp.name] = param
component_params = {}
for comp in self.blocks.expected_components:
param = kwargs.pop(comp.name, None)
if param:
component_params[comp.name] = param
mellon_name = param.pop("name", comp.name)
if mellon_name != comp.name:
self.name_mapping[comp.name] = mellon_name
continue
to_exclude = False
for exclude_key in DEFAULT_EXCLUDE_MODEL_KEYS:
if exclude_key in comp.name:
to_exclude = True
break
if to_exclude:
continue
if get_group_name(comp.name):
param = get_group_name(comp.name)
if comp.name not in self.name_mapping:
self.name_mapping[comp.name] = param
elif comp.name in DEFAULT_MODEL_KEYS:
param = {"label": comp.name, "type": "diffusers_auto_model", "display": "input"}
else:
param = comp.name
# add the param dict to the model_params dict
component_params[comp.name] = param
output_params = {}
if isinstance(self.blocks, SequentialPipelineBlocks):
last_block_name = list(self.blocks.sub_blocks.keys())[-1]
outputs = self.blocks.sub_blocks[last_block_name].intermediate_outputs
else:
outputs = self.blocks.intermediate_outputs
for out in outputs:
param = kwargs.pop(out.name, None)
if param:
output_params[out.name] = param
mellon_name = param.pop("name", out.name)
if mellon_name != out.name:
self.name_mapping[out.name] = mellon_name
continue
if out.name in DEFAULT_PARAM_MAPS:
param = DEFAULT_PARAM_MAPS[out.name].copy()
param["display"] = "output"
else:
group_name = get_group_name(out.name)
if group_name:
param = group_name
if out.name not in self.name_mapping:
self.name_mapping[out.name] = param
else:
param = out.name
# add the param dict to the outputs dict
output_params[out.name] = param
if len(kwargs) > 0:
logger.warning(f"Unused kwargs: {kwargs}")
register_dict = {
"category": category,
"label": label,
"input_params": input_params,
"component_params": component_params,
"output_params": output_params,
"name_mapping": self.name_mapping,
}
self.register_to_config(**register_dict)
def setup(self, components_manager, collection=None):
self.pipeline = self.blocks.init_pipeline(components_manager=components_manager, collection=collection)
self._components_manager = components_manager
@property
def mellon_config(self):
return self._convert_to_mellon_config()
def _convert_to_mellon_config(self):
node = {}
node["label"] = self.config.label
node["category"] = self.config.category
node_param = {}
for inp_name, inp_param in self.config.input_params.items():
if inp_name in self.name_mapping:
mellon_name = self.name_mapping[inp_name]
else:
mellon_name = inp_name
if isinstance(inp_param, str):
param = {
"label": inp_param,
"type": inp_param,
"display": "input",
}
else:
param = inp_param
if mellon_name not in node_param:
node_param[mellon_name] = param
else:
logger.debug(f"Input param {mellon_name} already exists in node_param, skipping {inp_name}")
for comp_name, comp_param in self.config.component_params.items():
if comp_name in self.name_mapping:
mellon_name = self.name_mapping[comp_name]
else:
mellon_name = comp_name
if isinstance(comp_param, str):
param = {
"label": comp_param,
"type": comp_param,
"display": "input",
}
else:
param = comp_param
if mellon_name not in node_param:
node_param[mellon_name] = param
else:
logger.debug(f"Component param {comp_param} already exists in node_param, skipping {comp_name}")
for out_name, out_param in self.config.output_params.items():
if out_name in self.name_mapping:
mellon_name = self.name_mapping[out_name]
else:
mellon_name = out_name
if isinstance(out_param, str):
param = {
"label": out_param,
"type": out_param,
"display": "output",
}
else:
param = out_param
if mellon_name not in node_param:
node_param[mellon_name] = param
else:
logger.debug(f"Output param {out_param} already exists in node_param, skipping {out_name}")
node["params"] = node_param
return node
def save_mellon_config(self, file_path):
"""
Save the Mellon configuration to a JSON file.
Args:
file_path (str or Path): Path where the JSON file will be saved
Returns:
Path: Path to the saved config file
"""
file_path = Path(file_path)
# Create directory if it doesn't exist
os.makedirs(file_path.parent, exist_ok=True)
# Create a combined dictionary with module definition and name mapping
config = {"module": self.mellon_config, "name_mapping": self.name_mapping}
# Save the config to file
with open(file_path, "w", encoding="utf-8") as f:
json.dump(config, f, indent=2)
logger.info(f"Mellon config and name mapping saved to {file_path}")
return file_path
@classmethod
def load_mellon_config(cls, file_path):
"""
Load a Mellon configuration from a JSON file.
Args:
file_path (str or Path): Path to the JSON file containing Mellon config
Returns:
dict: The loaded combined configuration containing 'module' and 'name_mapping'
"""
file_path = Path(file_path)
if not file_path.exists():
raise FileNotFoundError(f"Config file not found: {file_path}")
with open(file_path, "r", encoding="utf-8") as f:
config = json.load(f)
logger.info(f"Mellon config loaded from {file_path}")
return config
def process_inputs(self, **kwargs):
params_components = {}
for comp_name, comp_param in self.config.component_params.items():
logger.debug(f"component: {comp_name}")
mellon_comp_name = self.name_mapping.get(comp_name, comp_name)
if mellon_comp_name in kwargs:
if isinstance(kwargs[mellon_comp_name], dict) and comp_name in kwargs[mellon_comp_name]:
comp = kwargs[mellon_comp_name].pop(comp_name)
else:
comp = kwargs.pop(mellon_comp_name)
if comp:
params_components[comp_name] = self._components_manager.get_one(comp["model_id"])
params_run = {}
for inp_name, inp_param in self.config.input_params.items():
logger.debug(f"input: {inp_name}")
mellon_inp_name = self.name_mapping.get(inp_name, inp_name)
if mellon_inp_name in kwargs:
if isinstance(kwargs[mellon_inp_name], dict) and inp_name in kwargs[mellon_inp_name]:
inp = kwargs[mellon_inp_name].pop(inp_name)
else:
inp = kwargs.pop(mellon_inp_name)
if inp is not None:
params_run[inp_name] = inp
return_output_names = list(self.config.output_params.keys())
return params_components, params_run, return_output_names
def execute(self, **kwargs):
params_components, params_run, return_output_names = self.process_inputs(**kwargs)
self.pipeline.update_components(**params_components)
output = self.pipeline(**params_run, output=return_output_names)
return output

View File

@@ -577,9 +577,8 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks):
def inputs(self) -> List[InputParam]:
return [
InputParam(name="batch_size", required=True),
InputParam(
name="resized_image", required=True, type_hint=torch.Tensor, description="The resized image input"
),
InputParam(name="image_height", required=True),
InputParam(name="image_width", required=True),
InputParam(name="height", required=True),
InputParam(name="width", required=True),
InputParam(name="prompt_embeds_mask"),
@@ -612,10 +611,6 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks):
block_state = self.get_block_state(state)
# for edit, image size can be different from the target size (height/width)
image = (
block_state.resized_image[0] if isinstance(block_state.resized_image, list) else block_state.resized_image
)
image_width, image_height = image.size
block_state.img_shapes = [
[
@@ -624,7 +619,11 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks):
block_state.height // components.vae_scale_factor // 2,
block_state.width // components.vae_scale_factor // 2,
),
(1, image_height // components.vae_scale_factor // 2, image_width // components.vae_scale_factor // 2),
(
1,
block_state.image_height // components.vae_scale_factor // 2,
block_state.image_width // components.vae_scale_factor // 2,
),
]
] * block_state.batch_size
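# Illustrative (values assumed): with vae_scale_factor=8 and height=width=1024,
# 1024 // 8 // 2 == 64, so each latent grid entry is (1, 64, 64), repeated batch_size times.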

View File

@@ -496,7 +496,7 @@ class QwenImageEditTextEncoderStep(ModularPipelineBlocks):
)
if components.requires_unconditional_embeds:
negative_prompt = block_state.negative_prompt or ""
negative_prompt = block_state.negative_prompt or " "
block_state.negative_prompt_embeds, block_state.negative_prompt_embeds_mask = get_qwen_prompt_embeds_edit(
components.text_encoder,
components.processor,

View File

@@ -307,6 +307,13 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
return inputs
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam(name="image_height", type_hint=int, description="The height of the image latents"),
OutputParam(name="image_width", type_hint=int, description="The width of the image latents"),
]
@property
def expected_components(self) -> List[ComponentSpec]:
return [
@@ -327,6 +334,11 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
block_state.height = block_state.height or height
block_state.width = block_state.width or width
if not hasattr(block_state, "image_height"):
block_state.image_height = height
if not hasattr(block_state, "image_width"):
block_state.image_width = width
# 2. Patchify the image latent tensor
image_latent_tensor = components.pachifier.pack_latents(image_latent_tensor)

View File

@@ -511,17 +511,42 @@ class QwenImageAutoDecodeStep(AutoPipelineBlocks):
)
class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
model_name = "qwenimage"
block_classes = [
QwenImageAutoInputStep,
QwenImageOptionalControlNetInputStep,
QwenImageAutoBeforeDenoiseStep,
QwenImageOptionalControlNetBeforeDenoiseStep,
QwenImageAutoDenoiseStep,
]
block_names = ["input", "controlnet_input", "before_denoise", "controlnet_before_denoise", "denoise", "decode"]
@property
def description(self):
return (
"Core step that performs the denoising process. \n"
+ " - `QwenImageAutoInputStep` (input) standardizes the inputs for the denoising step.\n"
+ " - `QwenImageOptionalControlNetInputStep` (controlnet_input) prepares the controlnet input.\n"
+ " - `QwenImageAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
+ " - `QwenImageOptionalControlNetBeforeDenoiseStep` (controlnet_before_denoise) prepares the controlnet input for the denoising step.\n"
+ " - `QwenImageAutoDenoiseStep` (denoise) iteratively denoises the latents.\n"
+ " - `QwenImageAutoDecodeStep` (decode) decodes the latents into images.\n\n"
+ "This step support text-to-image, image-to-image, inpainting, and controlnet tasks for QwenImage:\n"
+ " - for image-to-image generation, you need to provide `image_latents`\n"
+ " - for inpainting, you need to provide `processed_mask_image` and `image_latents`\n"
+ " - to run the controlnet workflow, you need to provide `control_image_latents`\n"
+ " - for text-to-image generation, all you need to provide is prompt embeddings"
)
## 1.10 QwenImage/auto block & presets
AUTO_BLOCKS = InsertableDict(
[
("text_encoder", QwenImageTextEncoderStep()),
("vae_encoder", QwenImageAutoVaeEncoderStep()),
("controlnet_vae_encoder", QwenImageOptionalControlNetVaeEncoderStep()),
("input", QwenImageAutoInputStep()),
("controlnet_input", QwenImageOptionalControlNetInputStep()),
("before_denoise", QwenImageAutoBeforeDenoiseStep()),
("controlnet_before_denoise", QwenImageOptionalControlNetBeforeDenoiseStep()),
("denoise", QwenImageAutoDenoiseStep()),
("denoise", QwenImageCoreDenoiseStep()),
("decode", QwenImageAutoDecodeStep()),
]
)
@@ -699,7 +724,7 @@ class QwenImageEditAutoVaeEncoderStep(AutoPipelineBlocks):
class QwenImageEditAutoInputStep(AutoPipelineBlocks):
block_classes = [QwenImageInpaintInputStep, QwenImageEditInputStep]
block_names = ["edit_inpaint", "edit"]
block_trigger_inputs = ["processed_mask_image", "image"]
block_trigger_inputs = ["processed_mask_image", "image_latents"]
@property
def description(self):
@@ -800,13 +825,34 @@ class QwenImageEditAutoDenoiseStep(AutoPipelineBlocks):
## 2.7 QwenImage-Edit/auto blocks & presets
class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks):
model_name = "qwenimage-edit"
block_classes = [
QwenImageEditAutoInputStep,
QwenImageEditAutoBeforeDenoiseStep,
QwenImageEditAutoDenoiseStep,
]
block_names = ["input", "before_denoise", "denoise"]
@property
def description(self):
return (
"Core step that performs the denoising process. \n"
+ " - `QwenImageEditAutoInputStep` (input) standardizes the inputs for the denoising step.\n"
+ " - `QwenImageEditAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
+ " - `QwenImageEditAutoDenoiseStep` (denoise) iteratively denoises the latents.\n\n"
+ "This step support edit (img2img) and edit inpainting workflow for QwenImage Edit:\n"
+ " - When `processed_mask_image` is provided, it will be used for edit inpainting task.\n"
+ " - When `image_latents` is provided, it will be used for edit (img2img) task.\n"
)
EDIT_AUTO_BLOCKS = InsertableDict(
[
("text_encoder", QwenImageEditVLEncoderStep()),
("vae_encoder", QwenImageEditAutoVaeEncoderStep()),
("input", QwenImageEditAutoInputStep()),
("before_denoise", QwenImageEditAutoBeforeDenoiseStep()),
("denoise", QwenImageEditAutoDenoiseStep()),
("denoise", QwenImageEditCoreDenoiseStep()),
("decode", QwenImageAutoDecodeStep()),
]
)

View File

@@ -104,6 +104,8 @@ class QwenImageModularPipeline(ModularPipeline, QwenImageLoraLoaderMixin):
</Tip>
"""
default_blocks_name = "QwenImageAutoBlocks"
@property
def default_height(self):
return self.default_sample_size * self.vae_scale_factor
@@ -158,6 +160,8 @@ class QwenImageEditModularPipeline(ModularPipeline, QwenImageLoraLoaderMixin):
</Tip>
"""
default_blocks_name = "QwenImageEditAutoBlocks"
# YiYi TODO: qwen edit should not provide default height/width, should be derived from the resized input image (after adjustment) produced by the resize step.
@property
def default_height(self):

View File

@@ -0,0 +1,95 @@
# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# mellon nodes
QwenImage_NODE_TYPES_PARAMS_MAP = {
"controlnet": {
"inputs": [
"control_image",
"controlnet_conditioning_scale",
"control_guidance_start",
"control_guidance_end",
"height",
"width",
],
"model_inputs": [
"controlnet",
"vae",
],
"outputs": [
"controlnet_out",
],
"block_names": ["controlnet_vae_encoder"],
},
"denoise": {
"inputs": [
"embeddings",
"width",
"height",
"seed",
"num_inference_steps",
"guidance_scale",
"image_latents",
"strength",
"controlnet",
],
"model_inputs": [
"unet",
"guider",
"scheduler",
],
"outputs": [
"latents",
"latents_preview",
],
"block_names": ["denoise"],
},
"vae_encoder": {
"inputs": [
"image",
"width",
"height",
],
"model_inputs": [
"vae",
],
"outputs": [
"image_latents",
],
},
"text_encoder": {
"inputs": [
"prompt",
"negative_prompt",
],
"model_inputs": [
"text_encoders",
],
"outputs": [
"embeddings",
],
},
"decoder": {
"inputs": [
"latents",
],
"model_inputs": [
"vae",
],
"outputs": [
"images",
],
},
}
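
This preset map is consumed by the Mellon node registry added in this commit. A minimal lookup sketch (import paths are assumptions; the class names come from the registry code above):

from diffusers.modular_pipelines.mellon_node_utils import ModularMellonNodeRegistry  # path assumed
from diffusers.modular_pipelines.qwenimage.modular_pipeline import QwenImageModularPipeline

registry = ModularMellonNodeRegistry()
node_configs = registry.get(QwenImageModularPipeline)  # lazily initializes and registers the presets
denoise_config = node_configs["denoise"]  # a MellonNodeConfig built from the map above
print(sorted(denoise_config.inputs))  # resolved runtime params: ['controlnet', 'embeddings', 'guidance_scale', ...]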

View File

@@ -262,37 +262,37 @@ class StableDiffusionXLInputStep(ModularPipelineBlocks):
OutputParam(
"prompt_embeds",
type_hint=torch.Tensor,
kwargs_type="guider_input_fields", # already in intermedites state but declare here again for guider_input_fields
kwargs_type="denoiser_input_fields", # already in intermedites state but declare here again for denoiser_input_fields
description="text embeddings used to guide the image generation",
),
OutputParam(
"negative_prompt_embeds",
type_hint=torch.Tensor,
kwargs_type="guider_input_fields", # already in intermedites state but declare here again for guider_input_fields
kwargs_type="denoiser_input_fields", # already in intermedites state but declare here again for denoiser_input_fields
description="negative text embeddings used to guide the image generation",
),
OutputParam(
"pooled_prompt_embeds",
type_hint=torch.Tensor,
kwargs_type="guider_input_fields", # already in intermedites state but declare here again for guider_input_fields
kwargs_type="denoiser_input_fields", # already in intermedites state but declare here again for denoiser_input_fields
description="pooled text embeddings used to guide the image generation",
),
OutputParam(
"negative_pooled_prompt_embeds",
type_hint=torch.Tensor,
kwargs_type="guider_input_fields", # already in intermedites state but declare here again for guider_input_fields
kwargs_type="denoiser_input_fields", # already in intermedites state but declare here again for denoiser_input_fields
description="negative pooled text embeddings used to guide the image generation",
),
OutputParam(
"ip_adapter_embeds",
type_hint=List[torch.Tensor],
kwargs_type="guider_input_fields", # already in intermedites state but declare here again for guider_input_fields
kwargs_type="denoiser_input_fields", # already in intermedites state but declare here again for denoiser_input_fields
description="image embeddings for IP-Adapter",
),
OutputParam(
"negative_ip_adapter_embeds",
type_hint=List[torch.Tensor],
kwargs_type="guider_input_fields", # already in intermedites state but declare here again for guider_input_fields
kwargs_type="denoiser_input_fields", # already in intermedites state but declare here again for denoiser_input_fields
description="negative image embeddings for IP-Adapter",
),
]
@@ -1120,13 +1120,13 @@ class StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep(ModularPipelineB
OutputParam(
"add_time_ids",
type_hint=torch.Tensor,
kwargs_type="guider_input_fields",
kwargs_type="denoiser_input_fields",
description="The time ids to condition the denoising process",
),
OutputParam(
"negative_add_time_ids",
type_hint=torch.Tensor,
kwargs_type="guider_input_fields",
kwargs_type="denoiser_input_fields",
description="The negative time ids to condition the denoising process",
),
OutputParam("timestep_cond", type_hint=torch.Tensor, description="The timestep cond to use for LCM"),
@@ -1331,13 +1331,13 @@ class StableDiffusionXLPrepareAdditionalConditioningStep(ModularPipelineBlocks):
OutputParam(
"add_time_ids",
type_hint=torch.Tensor,
kwargs_type="guider_input_fields",
kwargs_type="denoiser_input_fields",
description="The time ids to condition the denoising process",
),
OutputParam(
"negative_add_time_ids",
type_hint=torch.Tensor,
kwargs_type="guider_input_fields",
kwargs_type="denoiser_input_fields",
description="The negative time ids to condition the denoising process",
),
OutputParam("timestep_cond", type_hint=torch.Tensor, description="The timestep cond to use for LCM"),

View File

@@ -183,14 +183,14 @@ class StableDiffusionXLLoopDenoiser(ModularPipelineBlocks):
description="The guidance scale embedding to use for Latent Consistency Models(LCMs). Can be generated in prepare_additional_conditioning step.",
),
InputParam(
kwargs_type="guider_input_fields",
kwargs_type="denoiser_input_fields",
description=(
"All conditional model inputs that need to be prepared with guider. "
"It should contain prompt_embeds/negative_prompt_embeds, "
"add_time_ids/negative_add_time_ids, "
"pooled_prompt_embeds/negative_pooled_prompt_embeds, "
"and ip_adapter_embeds/negative_ip_adapter_embeds (optional)."
"please add `kwargs_type=guider_input_fields` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state"
"please add `kwargs_type=denoiser_input_fields` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state"
),
),
]
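
For context, a hedged sketch of what a loop denoiser does with the tagged fields: each positive tensor is paired with its negative_ counterpart so the guider can assemble conditional/unconditional batches. The helper below is illustrative only, not the actual diffusers internals:

import torch

def split_denoiser_input_fields(fields: dict) -> tuple[dict, dict]:
    # Split tagged state entries into conditional and unconditional groups;
    # names prefixed with "negative_" feed the unconditional branch.
    cond, uncond = {}, {}
    for name, value in fields.items():
        if name.startswith("negative_"):
            uncond[name.removeprefix("negative_")] = value
        else:
            cond[name] = value
    return cond, uncond

fields = {
    "prompt_embeds": torch.zeros(1, 77, 2048),
    "negative_prompt_embeds": torch.zeros(1, 77, 2048),
}
cond, uncond = split_denoiser_input_fields(fields)  # both keyed by "prompt_embeds"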
@@ -307,14 +307,14 @@ class StableDiffusionXLControlNetLoopDenoiser(ModularPipelineBlocks):
description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
),
InputParam(
kwargs_type="guider_input_fields",
kwargs_type="denoiser_input_fields",
description=(
"All conditional model inputs that need to be prepared with guider. "
"It should contain prompt_embeds/negative_prompt_embeds, "
"add_time_ids/negative_add_time_ids, "
"pooled_prompt_embeds/negative_pooled_prompt_embeds, "
"and ip_adapter_embeds/negative_ip_adapter_embeds (optional)."
"please add `kwargs_type=guider_input_fields` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state"
"please add `kwargs_type=denoiser_input_fields` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state"
),
),
InputParam(

View File

@@ -258,25 +258,25 @@ class StableDiffusionXLTextEncoderStep(ModularPipelineBlocks):
OutputParam(
"prompt_embeds",
type_hint=torch.Tensor,
kwargs_type="guider_input_fields",
kwargs_type="denoiser_input_fields",
description="text embeddings used to guide the image generation",
),
OutputParam(
"negative_prompt_embeds",
type_hint=torch.Tensor,
kwargs_type="guider_input_fields",
kwargs_type="denoiser_input_fields",
description="negative text embeddings used to guide the image generation",
),
OutputParam(
"pooled_prompt_embeds",
type_hint=torch.Tensor,
kwargs_type="guider_input_fields",
kwargs_type="denoiser_input_fields",
description="pooled text embeddings used to guide the image generation",
),
OutputParam(
"negative_pooled_prompt_embeds",
type_hint=torch.Tensor,
kwargs_type="guider_input_fields",
kwargs_type="denoiser_input_fields",
description="negative pooled text embeddings used to guide the image generation",
),
]

View File

@@ -82,19 +82,17 @@ class StableDiffusionXLAutoIPAdapterStep(AutoPipelineBlocks):
# before_denoise: text2img
class StableDiffusionXLBeforeDenoiseStep(SequentialPipelineBlocks):
block_classes = [
StableDiffusionXLInputStep,
StableDiffusionXLSetTimestepsStep,
StableDiffusionXLPrepareLatentsStep,
StableDiffusionXLPrepareAdditionalConditioningStep,
]
block_names = ["input", "set_timesteps", "prepare_latents", "prepare_add_cond"]
block_names = ["set_timesteps", "prepare_latents", "prepare_add_cond"]
@property
def description(self):
return (
"Before denoise step that prepare the inputs for the denoise step.\n"
+ "This is a sequential pipeline blocks:\n"
+ " - `StableDiffusionXLInputStep` is used to adjust the batch size of the model inputs\n"
+ " - `StableDiffusionXLSetTimestepsStep` is used to set the timesteps\n"
+ " - `StableDiffusionXLPrepareLatentsStep` is used to prepare the latents\n"
+ " - `StableDiffusionXLPrepareAdditionalConditioningStep` is used to prepare the additional conditioning\n"
@@ -104,19 +102,17 @@ class StableDiffusionXLBeforeDenoiseStep(SequentialPipelineBlocks):
# before_denoise: img2img
class StableDiffusionXLImg2ImgBeforeDenoiseStep(SequentialPipelineBlocks):
block_classes = [
StableDiffusionXLInputStep,
StableDiffusionXLImg2ImgSetTimestepsStep,
StableDiffusionXLImg2ImgPrepareLatentsStep,
StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep,
]
block_names = ["input", "set_timesteps", "prepare_latents", "prepare_add_cond"]
block_names = ["set_timesteps", "prepare_latents", "prepare_add_cond"]
@property
def description(self):
return (
"Before denoise step that prepare the inputs for the denoise step for img2img task.\n"
+ "This is a sequential pipeline blocks:\n"
+ " - `StableDiffusionXLInputStep` is used to adjust the batch size of the model inputs\n"
+ " - `StableDiffusionXLImg2ImgSetTimestepsStep` is used to set the timesteps\n"
+ " - `StableDiffusionXLImg2ImgPrepareLatentsStep` is used to prepare the latents\n"
+ " - `StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep` is used to prepare the additional conditioning\n"
@@ -126,19 +122,17 @@ class StableDiffusionXLImg2ImgBeforeDenoiseStep(SequentialPipelineBlocks):
# before_denoise: inpainting
class StableDiffusionXLInpaintBeforeDenoiseStep(SequentialPipelineBlocks):
block_classes = [
StableDiffusionXLInputStep,
StableDiffusionXLImg2ImgSetTimestepsStep,
StableDiffusionXLInpaintPrepareLatentsStep,
StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep,
]
block_names = ["input", "set_timesteps", "prepare_latents", "prepare_add_cond"]
block_names = ["set_timesteps", "prepare_latents", "prepare_add_cond"]
@property
def description(self):
return (
"Before denoise step that prepare the inputs for the denoise step for inpainting task.\n"
+ "This is a sequential pipeline blocks:\n"
+ " - `StableDiffusionXLInputStep` is used to adjust the batch size of the model inputs\n"
+ " - `StableDiffusionXLImg2ImgSetTimestepsStep` is used to set the timesteps\n"
+ " - `StableDiffusionXLInpaintPrepareLatentsStep` is used to prepare the latents\n"
+ " - `StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep` is used to prepare the additional conditioning\n"
@@ -255,25 +249,48 @@ class StableDiffusionXLAutoDecodeStep(AutoPipelineBlocks):
)
class StableDiffusionXLCoreDenoiseStep(SequentialPipelineBlocks):
block_classes = [
StableDiffusionXLInputStep,
StableDiffusionXLAutoBeforeDenoiseStep,
StableDiffusionXLAutoControlNetInputStep,
StableDiffusionXLAutoDenoiseStep,
]
block_names = ["input", "before_denoise", "controlnet_input", "denoise"]
@property
def description(self):
return (
"Core step that performs the denoising process. \n"
+ " - `StableDiffusionXLInputStep` (input) standardizes the inputs for the denoising step.\n"
+ " - `StableDiffusionXLAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
+ " - `StableDiffusionXLAutoControlNetInputStep` (controlnet_input) prepares the controlnet input.\n"
+ " - `StableDiffusionXLAutoDenoiseStep` (denoise) iteratively denoises the latents.\n\n"
+ "This step support text-to-image, image-to-image, inpainting, with or without controlnet/controlnet_union/ip_adapter for Stable Diffusion XL:\n"
+ "- for image-to-image generation, you need to provide `image_latents`\n"
+ "- for inpainting, you need to provide `mask_image` and `image_latents`\n"
+ "- to run the controlnet workflow, you need to provide `control_image`\n"
+ "- to run the controlnet_union workflow, you need to provide `control_image` and `control_mode`\n"
+ "- to run the ip_adapter workflow, you need to load ip_adapter into your unet and provide `ip_adapter_embeds`\n"
+ "- for text-to-image generation, all you need to provide is prompt embeddings\n"
)
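
In practice the auto blocks dispatch on which runtime inputs you pass. A usage sketch under the assumptions that the import path and method names below match the modular pipeline docs and that the repo id points at a modular checkpoint (all of these are assumptions, not taken from this commit):

from diffusers.modular_pipelines.stable_diffusion_xl import StableDiffusionXLAutoBlocks

pipe = StableDiffusionXLAutoBlocks().init_pipeline("stabilityai/stable-diffusion-xl-base-1.0")
pipe.load_default_components()

# text-to-image: prompt embeddings are all that is required
image = pipe(prompt="a cat", num_inference_steps=25, output="images")

# image-to-image: supplying an input image routes the same blocks
# through the img2img branch (strength controls the noising level)
image = pipe(prompt="a cat", image=image, strength=0.5, output="images")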
# ip-adapter, controlnet, text2img, img2img, inpainting
class StableDiffusionXLAutoBlocks(SequentialPipelineBlocks):
block_classes = [
StableDiffusionXLTextEncoderStep,
StableDiffusionXLAutoIPAdapterStep,
StableDiffusionXLAutoVaeEncoderStep,
StableDiffusionXLAutoBeforeDenoiseStep,
StableDiffusionXLAutoControlNetInputStep,
StableDiffusionXLAutoDenoiseStep,
StableDiffusionXLCoreDenoiseStep,
StableDiffusionXLAutoDecodeStep,
]
block_names = [
"text_encoder",
"ip_adapter",
"image_encoder",
"before_denoise",
"controlnet_input",
"vae_encoder",
"denoise",
"decoder",
"decode",
]
@property
@@ -321,7 +338,7 @@ TEXT2IMAGE_BLOCKS = InsertableDict(
IMAGE2IMAGE_BLOCKS = InsertableDict(
[
("text_encoder", StableDiffusionXLTextEncoderStep),
("image_encoder", StableDiffusionXLVaeEncoderStep),
("vae_encoder", StableDiffusionXLVaeEncoderStep),
("input", StableDiffusionXLInputStep),
("set_timesteps", StableDiffusionXLImg2ImgSetTimestepsStep),
("prepare_latents", StableDiffusionXLImg2ImgPrepareLatentsStep),
@@ -334,7 +351,7 @@ IMAGE2IMAGE_BLOCKS = InsertableDict(
INPAINT_BLOCKS = InsertableDict(
[
("text_encoder", StableDiffusionXLTextEncoderStep),
("image_encoder", StableDiffusionXLInpaintVaeEncoderStep),
("vae_encoder", StableDiffusionXLInpaintVaeEncoderStep),
("input", StableDiffusionXLInputStep),
("set_timesteps", StableDiffusionXLImg2ImgSetTimestepsStep),
("prepare_latents", StableDiffusionXLInpaintPrepareLatentsStep),
@@ -361,10 +378,8 @@ AUTO_BLOCKS = InsertableDict(
[
("text_encoder", StableDiffusionXLTextEncoderStep),
("ip_adapter", StableDiffusionXLAutoIPAdapterStep),
("image_encoder", StableDiffusionXLAutoVaeEncoderStep),
("before_denoise", StableDiffusionXLAutoBeforeDenoiseStep),
("controlnet_input", StableDiffusionXLAutoControlNetInputStep),
("denoise", StableDiffusionXLAutoDenoiseStep),
("vae_encoder", StableDiffusionXLAutoVaeEncoderStep),
("denoise", StableDiffusionXLCoreDenoiseStep),
("decode", StableDiffusionXLAutoDecodeStep),
]
)
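
Note that the preset dicts now use "vae_encoder" as the key for the VAE encode step (previously "image_encoder"), so custom code that looks blocks up by name must follow the rename. A sketch of composing a pipeline from a preset, assuming from_blocks_dict and sub_blocks behave as in the modular pipeline docs and using a hypothetical repo id:

from diffusers.modular_pipelines import SequentialPipelineBlocks
from diffusers.modular_pipelines.stable_diffusion_xl import IMAGE2IMAGE_BLOCKS

# The dict keys become the block names on the composed pipeline.
blocks = SequentialPipelineBlocks.from_blocks_dict(IMAGE2IMAGE_BLOCKS)
vae_block = blocks.sub_blocks["vae_encoder"]  # was "image_encoder" before this commit
pipe = blocks.init_pipeline("YiYiXu/modular-demo")  # hypothetical repo id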

View File

@@ -54,6 +54,8 @@ class StableDiffusionXLModularPipeline(
</Tip>
"""
default_blocks_name = "StableDiffusionXLAutoBlocks"
@property
def default_height(self):
return self.default_sample_size * self.vae_scale_factor
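
Together with default_blocks_name above, these properties let the pipeline derive its default resolution from the loaded components. For SDXL the arithmetic works out as below (constants assume the stock SDXL UNet and VAE configs):

# Stock SDXL: UNet config sample_size = 128, VAE downscale factor = 8.
default_sample_size = 128
vae_scale_factor = 8
assert default_sample_size * vae_scale_factor == 1024  # default height/width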

View File

@@ -0,0 +1,99 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
SDXL_NODE_TYPES_PARAMS_MAP = {
"controlnet": {
"inputs": [
"control_image",
"controlnet_conditioning_scale",
"control_guidance_start",
"control_guidance_end",
"height",
"width",
],
"model_inputs": [
"controlnet",
],
"outputs": [
"controlnet_out",
],
"block_names": [None],
},
"denoise": {
"inputs": [
"embeddings",
"width",
"height",
"seed",
"num_inference_steps",
"guidance_scale",
"image_latents",
"strength",
# custom adapters coming in as inputs
"controlnet",
# ip_adapter is optional and custom; include if available
"ip_adapter",
],
"model_inputs": [
"unet",
"guider",
"scheduler",
],
"outputs": [
"latents",
"latents_preview",
],
"block_names": ["denoise"],
},
"vae_encoder": {
"inputs": [
"image",
"width",
"height",
],
"model_inputs": [
"vae",
],
"outputs": [
"image_latents",
],
"block_names": ["vae_encoder"],
},
"text_encoder": {
"inputs": [
"prompt",
"negative_prompt",
],
"model_inputs": [
"text_encoders",
],
"outputs": [
"embeddings",
],
"block_names": ["text_encoder"],
},
"decoder": {
"inputs": [
"latents",
],
"model_inputs": [
"vae",
],
"outputs": [
"images",
],
"block_names": ["decode"],
},
}
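
These per-node specs are meant to be resolved against the shared Mellon parameter tables defined elsewhere in this commit to build each node's UI schema. A self-contained sketch of that join; the helper and the stand-in widget values are illustrative, not part of the commit:

# Stand-in widget table; the real definitions live in the Mellon param tables.
WIDGET_PARAMS = {
    "prompt": {"label": "Prompt", "type": "string"},
    "negative_prompt": {"label": "Negative Prompt", "type": "string"},
}

NODE_TYPES = {  # excerpt of SDXL_NODE_TYPES_PARAMS_MAP above
    "text_encoder": {"inputs": ["prompt", "negative_prompt"]},
}

def build_node_inputs(node_type: str) -> dict:
    # Join a node type's declared inputs with their widget specs.
    return {name: WIDGET_PARAMS[name] for name in NODE_TYPES[node_type]["inputs"]}

print(build_node_inputs("text_encoder"))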

View File

@@ -146,13 +146,13 @@ class WanInputStep(ModularPipelineBlocks):
OutputParam(
"prompt_embeds",
type_hint=torch.Tensor,
kwargs_type="guider_input_fields", # already in intermedites state but declare here again for guider_input_fields
kwargs_type="denoiser_input_fields", # already in intermedites state but declare here again for denoiser_input_fields
description="text embeddings used to guide the image generation",
),
OutputParam(
"negative_prompt_embeds",
type_hint=torch.Tensor,
kwargs_type="guider_input_fields", # already in intermedites state but declare here again for guider_input_fields
kwargs_type="denoiser_input_fields", # already in intermedites state but declare here again for denoiser_input_fields
description="negative text embeddings used to guide the image generation",
),
]

View File

@@ -79,11 +79,11 @@ class WanLoopDenoiser(ModularPipelineBlocks):
description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
),
InputParam(
kwargs_type="guider_input_fields",
kwargs_type="denoiser_input_fields",
description=(
"All conditional model inputs that need to be prepared with guider. "
"It should contain prompt_embeds/negative_prompt_embeds. "
"Please add `kwargs_type=guider_input_fields` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state"
"Please add `kwargs_type=denoiser_input_fields` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state"
),
),
]

View File

@@ -89,13 +89,13 @@ class WanTextEncoderStep(ModularPipelineBlocks):
OutputParam(
"prompt_embeds",
type_hint=torch.Tensor,
kwargs_type="guider_input_fields",
kwargs_type="denoiser_input_fields",
description="text embeddings used to guide the image generation",
),
OutputParam(
"negative_prompt_embeds",
type_hint=torch.Tensor,
kwargs_type="guider_input_fields",
kwargs_type="denoiser_input_fields",
description="negative text embeddings used to guide the image generation",
),
]

View File

@@ -37,6 +37,8 @@ class WanModularPipeline(
</Tip>
"""
default_blocks_name = "WanAutoBlocks"
@property
def default_height(self):
return self.default_sample_height * self.vae_scale_factor_spatial