diff --git a/tests/modular_pipelines/flux/test_modular_pipeline_flux.py b/tests/modular_pipelines/flux/test_modular_pipeline_flux.py
index 854b5218c6..a112f6464f 100644
--- a/tests/modular_pipelines/flux/test_modular_pipeline_flux.py
+++ b/tests/modular_pipelines/flux/test_modular_pipeline_flux.py
@@ -37,9 +37,14 @@ class TestFluxModularPipelineFast(ModularPipelineTesterMixin):
     pipeline_class = FluxModularPipeline
     pipeline_blocks_class = FluxAutoBlocks
     pretrained_model_name_or_path = "hf-internal-testing/tiny-flux-modular"
+    default_repo_id = "black-forest-labs/FLUX.1-dev"
     params = frozenset(["prompt", "height", "width", "guidance_scale"])
     batch_params = frozenset(["prompt"])
+    # should be chosen from the dict returned by `get_dummy_inputs`
+    text_encoder_block_params = frozenset(["prompt", "max_sequence_length"])
+    decode_block_params = frozenset(["output_type"])
+    vae_encoder_block_params = None  # None if vae_encoder is not supported
 
     def get_dummy_inputs(self, seed=0):
         generator = self.get_generator(seed)
@@ -63,10 +68,16 @@ class TestFluxImg2ImgModularPipelineFast(ModularPipelineTesterMixin):
     pipeline_class = FluxModularPipeline
     pipeline_blocks_class = FluxAutoBlocks
     pretrained_model_name_or_path = "hf-internal-testing/tiny-flux-modular"
+    default_repo_id = "black-forest-labs/FLUX.1-dev"
     params = frozenset(["prompt", "height", "width", "guidance_scale", "image"])
     batch_params = frozenset(["prompt", "image"])
+    # should be chosen from the dict returned by `get_dummy_inputs`
+    text_encoder_block_params = frozenset(["prompt", "max_sequence_length"])
+    decode_block_params = frozenset(["output_type"])
+    vae_encoder_block_params = frozenset(["image", "height", "width"])
+
     def get_pipeline(self, components_manager=None, torch_dtype=torch.float32):
         pipeline = super().get_pipeline(components_manager, torch_dtype)
@@ -129,9 +140,13 @@ class TestFluxKontextModularPipelineFast(ModularPipelineTesterMixin):
     pipeline_class = FluxKontextModularPipeline
     pipeline_blocks_class = FluxKontextAutoBlocks
     pretrained_model_name_or_path = "hf-internal-testing/tiny-flux-kontext-pipe"
+    default_repo_id = "black-forest-labs/FLUX.1-Kontext-dev"
     params = frozenset(["prompt", "height", "width", "guidance_scale", "image"])
     batch_params = frozenset(["prompt", "image"])
+    text_encoder_block_params = frozenset(["prompt", "max_sequence_length"])
+    decode_block_params = frozenset(["latents"])
+    vae_encoder_block_params = frozenset(["image", "height", "width"])
 
     def get_dummy_inputs(self, seed=0):
         generator = self.get_generator(seed)
diff --git a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2.py b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2.py
index 8fd529e97e..99b2b0e6c4 100644
--- a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2.py
+++ b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2.py
@@ -32,9 +32,16 @@ class TestFlux2ModularPipelineFast(ModularPipelineTesterMixin):
     pipeline_class = Flux2ModularPipeline
     pipeline_blocks_class = Flux2AutoBlocks
     pretrained_model_name_or_path = "hf-internal-testing/tiny-flux2-modular"
+    default_repo_id = "black-forest-labs/FLUX.2-dev"
     params = frozenset(["prompt", "height", "width", "guidance_scale"])
     batch_params = frozenset(["prompt"])
+    # should be chosen from the dict returned by `get_dummy_inputs`
+    text_encoder_block_params = frozenset(["prompt", "max_sequence_length", "text_encoder_out_layers"])
+    decode_block_params = frozenset(["output_type"])
+    vae_encoder_block_params = None
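+    # no `vae_encoder` sub-block in the text-to-image blocks, so
+    # `test_modular_inference` skips the vae-encoder node when this is None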
 
     def get_dummy_inputs(self, seed=0):
         generator = self.get_generator(seed)
@@ -63,6 +70,10 @@ class TestFlux2ImageConditionedModularPipelineFast(ModularPipelineTesterMixin):
     params = frozenset(["prompt", "height", "width", "guidance_scale", "image"])
     batch_params = frozenset(["prompt", "image"])
+    # should be chosen from the dict returned by `get_dummy_inputs`
+    text_encoder_block_params = frozenset(["prompt", "max_sequence_length", "text_encoder_out_layers"])
+    decode_block_params = frozenset(["output_type"])
+    vae_encoder_block_params = frozenset(["image", "height", "width"])
 
     def get_dummy_inputs(self, seed=0):
         generator = self.get_generator(seed)
diff --git a/tests/modular_pipelines/qwen/test_modular_pipeline_qwenimage.py b/tests/modular_pipelines/qwen/test_modular_pipeline_qwenimage.py
index f4bd27b7ea..b0bf3eaa86 100644
--- a/tests/modular_pipelines/qwen/test_modular_pipeline_qwenimage.py
+++ b/tests/modular_pipelines/qwen/test_modular_pipeline_qwenimage.py
@@ -34,10 +34,16 @@ class TestQwenImageModularPipelineFast(ModularPipelineTesterMixin, ModularGuiderTesterMixin):
     pipeline_class = QwenImageModularPipeline
     pipeline_blocks_class = QwenImageAutoBlocks
     pretrained_model_name_or_path = "hf-internal-testing/tiny-qwenimage-modular"
+    default_repo_id = "Qwen/Qwen-Image"
     params = frozenset(["prompt", "height", "width", "negative_prompt", "attention_kwargs", "image", "mask_image"])
     batch_params = frozenset(["prompt", "negative_prompt", "image", "mask_image"])
+    # should be chosen from the dict returned by `get_dummy_inputs`
+    text_encoder_block_params = frozenset(["prompt", "negative_prompt", "max_sequence_length"])
+    decode_block_params = frozenset(["output_type"])
+    vae_encoder_block_params = None  # None if vae_encoder is not supported
+
     def get_dummy_inputs(self):
         generator = self.get_generator()
         inputs = {
@@ -60,10 +66,17 @@ class TestQwenImageEditModularPipelineFast(ModularPipelineTesterMixin, ModularGuiderTesterMixin):
     pipeline_class = QwenImageEditModularPipeline
     pipeline_blocks_class = QwenImageEditAutoBlocks
     pretrained_model_name_or_path = "hf-internal-testing/tiny-qwenimage-edit-modular"
+    default_repo_id = "Qwen/Qwen-Image-Edit"
     params = frozenset(["prompt", "height", "width", "negative_prompt", "attention_kwargs", "image", "mask_image"])
     batch_params = frozenset(["prompt", "negative_prompt", "image", "mask_image"])
+    # should be chosen from the dict returned by `get_dummy_inputs`
+    text_encoder_block_params = frozenset(["prompt", "negative_prompt", "max_sequence_length"])
+    decode_block_params = frozenset(["output_type"])
+    vae_encoder_block_params = frozenset(["image", "height", "width"])
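+    # the edit pipeline conditions on an input image, so `test_modular_inference`
+    # routes `image` (plus the target height/width) through the vae-encoder node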
params = frozenset(["prompt", "height", "width", "negative_prompt", "attention_kwargs", "image"]) diff --git a/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py b/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py index 7b55933e4c..9907bee46a 100644 --- a/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py +++ b/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py @@ -279,6 +279,8 @@ class TestSDXLModularPipelineFast( pipeline_class = StableDiffusionXLModularPipeline pipeline_blocks_class = StableDiffusionXLAutoBlocks pretrained_model_name_or_path = "hf-internal-testing/tiny-sdxl-modular" + default_repo_id = "stabilityai/stable-diffusion-xl-base-1.0" + params = frozenset( [ "prompt", @@ -291,6 +293,11 @@ class TestSDXLModularPipelineFast( batch_params = frozenset(["prompt", "negative_prompt"]) expected_image_output_shape = (1, 3, 64, 64) + # should choose from the dict returned by `get_dummy_inputs` + text_encoder_block_params = frozenset(["prompt"]) + decode_block_params = frozenset(["output_type"]) + vae_encoder_block_params = None # None if vae_encoder is not supported + def get_dummy_inputs(self, seed=0): generator = self.get_generator(seed) inputs = { @@ -339,6 +346,11 @@ class TestSDXLImg2ImgModularPipelineFast( batch_params = frozenset(["prompt", "negative_prompt", "image"]) expected_image_output_shape = (1, 3, 64, 64) + # should choose from the dict returned by `get_dummy_inputs` + text_encoder_block_params = frozenset(["prompt"]) + decode_block_params = frozenset(["output_type"]) + vae_encoder_block_params = frozenset(["image"]) + def get_dummy_inputs(self, seed=0): generator = self.get_generator(seed) inputs = { diff --git a/tests/modular_pipelines/test_modular_pipelines_common.py b/tests/modular_pipelines/test_modular_pipelines_common.py index 661fcc2537..9e469ac26d 100644 --- a/tests/modular_pipelines/test_modular_pipelines_common.py +++ b/tests/modular_pipelines/test_modular_pipelines_common.py @@ -48,6 +48,12 @@ class ModularPipelineTesterMixin: "You need to set the attribute `pretrained_model_name_or_path` in the child test class. See existing pipeline tests for reference." ) + @property + def default_repo_id(self) -> str: + raise NotImplementedError( + "You need to set the attribute `default_repo_id` in the child test class. See existing pipeline tests for reference." + ) + @property def pipeline_blocks_class(self) -> Union[Callable, ModularPipelineBlocks]: raise NotImplementedError( @@ -90,6 +96,30 @@ class ModularPipelineTesterMixin: "See existing pipeline tests for reference." ) + def text_encoder_block_params(self) -> frozenset: + raise NotImplementedError( + "You need to set the attribute `text_encoder_block_params` in the child test class. " + "`text_encoder_block_params` are the parameters required to be passed to the text encoder block. " + " if should be a subset of the parameters returned by `get_dummy_inputs`" + "See existing pipeline tests for reference." + ) + + def decode_block_params(self) -> frozenset: + raise NotImplementedError( + "You need to set the attribute `decode_block_params` in the child test class. " + "`decode_block_params` are the parameters required to be passed to the decode block. " + " if should be a subset of the parameters returned by `get_dummy_inputs`" + "See existing pipeline tests for reference." 
+    @property
+    def text_encoder_block_params(self) -> frozenset:
+        raise NotImplementedError(
+            "You need to set the attribute `text_encoder_block_params` in the child test class. "
+            "`text_encoder_block_params` are the parameters required to be passed to the text encoder block. "
+            "It should be a subset of the parameters returned by `get_dummy_inputs`. "
+            "See existing pipeline tests for reference."
+        )
+
+    @property
+    def decode_block_params(self) -> frozenset:
+        raise NotImplementedError(
+            "You need to set the attribute `decode_block_params` in the child test class. "
+            "`decode_block_params` are the parameters required to be passed to the decode block. "
+            "It should be a subset of the parameters returned by `get_dummy_inputs`. "
+            "See existing pipeline tests for reference."
+        )
+
+    @property
+    def vae_encoder_block_params(self) -> frozenset:
+        raise NotImplementedError(
+            "You need to set the attribute `vae_encoder_block_params` in the child test class. "
+            "`vae_encoder_block_params` are the parameters required to be passed to the vae encoder block. "
+            "It should be a subset of the parameters returned by `get_dummy_inputs`. "
+            "See existing pipeline tests for reference."
+        )
+
     def setup_method(self):
         # clean up the VRAM before each test
         torch.compiler.reset()
@@ -124,6 +159,98 @@
         _check_for_parameters(self.params, input_parameters, "input")
         _check_for_parameters(self.optional_params, optional_parameters, "optional")
 
+    def test_loading_from_default_repo(self):
+        if self.default_repo_id is None:
+            return
+
+        try:
+            pipe = ModularPipeline.from_pretrained(self.default_repo_id)
+        except Exception as e:
+            assert False, f"Failed to load pipeline from default repo: {e}"
+        assert pipe.blocks.__class__ == self.pipeline_blocks_class
+
+    def test_modular_inference(self):
+        # run the pipeline end to end to get the base output for comparison
+        pipe = self.get_pipeline()
+        pipe.to(torch_device, torch.float32)
+
+        inputs = self.get_dummy_inputs()
+        standard_output = pipe(**inputs, output="images")
+
+        # create text, denoise, decode (and optional vae encoder) nodes
+        blocks = self.pipeline_blocks_class()
+
+        assert "text_encoder" in blocks.sub_blocks, "`text_encoder` block is not present in the pipeline"
+        assert "denoise" in blocks.sub_blocks, "`denoise` block is not present in the pipeline"
+        assert "decode" in blocks.sub_blocks, "`decode` block is not present in the pipeline"
+        if self.vae_encoder_block_params is not None:
+            assert "vae_encoder" in blocks.sub_blocks, "`vae_encoder` block is not present in the pipeline"
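+        # each sub-block above is initialized as a standalone pipeline node below, and
+        # the nodes are chained manually: text_encoder -> (vae_encoder) -> denoise -> decode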
+
+        # manually set the components on each node: a hack to work around the fact that
+        # the default pipeline properties are often incorrect for testing cases,
+        # e.g. vae_scale_factor is usually not 8 because the vae is configured to be smaller for testing
+        def manually_set_all_components(pipe: ModularPipeline, sub_pipe: ModularPipeline):
+            for n, comp in pipe.components.items():
+                if not hasattr(sub_pipe, n):
+                    setattr(sub_pipe, n, comp)
+
+        text_node = blocks.sub_blocks["text_encoder"].init_pipeline(self.pretrained_model_name_or_path)
+        text_node.load_components(torch_dtype=torch.float32)
+        text_node.to(torch_device)
+        manually_set_all_components(pipe, text_node)
+
+        denoise_node = blocks.sub_blocks["denoise"].init_pipeline(self.pretrained_model_name_or_path)
+        denoise_node.load_components(torch_dtype=torch.float32)
+        denoise_node.to(torch_device)
+        manually_set_all_components(pipe, denoise_node)
+
+        decoder_node = blocks.sub_blocks["decode"].init_pipeline(self.pretrained_model_name_or_path)
+        decoder_node.load_components(torch_dtype=torch.float32)
+        decoder_node.to(torch_device)
+        manually_set_all_components(pipe, decoder_node)
+
+        if self.vae_encoder_block_params is not None:
+            vae_encoder_node = blocks.sub_blocks["vae_encoder"].init_pipeline(self.pretrained_model_name_or_path)
+            vae_encoder_node.load_components(torch_dtype=torch.float32)
+            vae_encoder_node.to(torch_device)
+            manually_set_all_components(pipe, vae_encoder_node)
+        else:
+            vae_encoder_node = None
+
+        # prepare inputs for each node
+        inputs = self.get_dummy_inputs()
+
+        def get_block_inputs(inputs: dict, block_params: frozenset) -> tuple[dict, dict]:
+            # pop the block-specific params out of `inputs`; whatever remains goes to the denoise node
+            block_inputs = {}
+            for name in block_params:
+                if name in inputs:
+                    block_inputs[name] = inputs.pop(name)
+            return block_inputs, inputs
+
+        text_inputs, inputs = get_block_inputs(inputs, self.text_encoder_block_params)
+        decoder_inputs, inputs = get_block_inputs(inputs, self.decode_block_params)
+        if vae_encoder_node is not None:
+            vae_encoder_inputs, inputs = get_block_inputs(inputs, self.vae_encoder_block_params)
+
+        # this also makes sure pipelines mark their text outputs as `denoiser_input_fields`
+        text_output = text_node(**text_inputs).get_by_kwargs("denoiser_input_fields")
+        if vae_encoder_node is not None:
+            vae_encoder_output = vae_encoder_node(**vae_encoder_inputs).values
+            denoise_inputs = {**text_output, **vae_encoder_output, **inputs}
+        else:
+            denoise_inputs = {**text_output, **inputs}
+
+        # denoise node output should be "latents"
+        latents = denoise_node(**denoise_inputs).latents
+        # decoder node input should be "latents" and output should be "images"
+        modular_output = decoder_node(**decoder_inputs, latents=latents).images
+
+        assert modular_output.shape == standard_output.shape, f"Modular output should have same shape as standard output {standard_output.shape}, but got {modular_output.shape}"
+
     def test_inference_batch_consistent(self, batch_sizes=[2], batch_generator=True):
         pipe = self.get_pipeline().to(torch_device)