sdnext/modules/processing_diffusers.py

import inspect
import typing
import torch
import modules.devices as devices
import modules.shared as shared
import modules.sd_samplers as sd_samplers
import modules.sd_models as sd_models
import modules.sd_vae as sd_vae
import modules.taesd.sd_vae_taesd as sd_vae_taesd
import modules.images as images
from modules.lora_diffusers import lora_state, unload_diffusers_lora
from modules.processing import StableDiffusionProcessing
import modules.prompt_parser_diffusers as prompt_parser_diffusers


# diffusers is imported defensively: an import failure is logged instead of raised,
# so importing this module still succeeds. Code below that references `diffusers`
# will then fail with NameError at call time.
try:
    import diffusers
except Exception as ex:
    shared.log.error(f'Failed to import diffusers: {ex}')


def process_diffusers(p: StableDiffusionProcessing, seeds, prompts, negative_prompts):
    """Run the diffusers pipeline(s) for one batch and return the resulting images.

    Stages, in order:
      1. base pass on `shared.sd_model`
      2. optional hires pass: decode, upscale, then image-to-image on `shared.sd_model`
      3. optional refiner pass on `shared.sd_refiner`, one image at a time
      4. VAE decode of the final latents (done per image inside the refiner loop when
         the refiner is enabled)

    Args:
        p: processing job. Read for generation settings and mutated along the way
            (`is_hr_pass`, `init_images`, `width`, `height`, `ops`, `extra_generation_params`).
        seeds: one seed per image; used for the torch generators and for intermediate saves.
        prompts: positive prompts for the batch.
        negative_prompts: negative prompts; padded IN PLACE to the length of `prompts`.

    Returns:
        The images produced by `vae_decode` (or the collected refiner images). Early exits
        on interrupt/skip return whatever has been collected so far, which is an empty
        list before the refiner loop starts.
    """
    results = []
    # NOTE: from here on `p.is_hr_pass` means "a hires pass will run for this job";
    # it is already True while the base pass executes.
    if p.enable_hr and p.hr_upscaler != 'None' and p.denoising_strength > 0 and len(getattr(p, 'init_images', [])) == 0:
        p.is_hr_pass = True
    is_refiner_enabled = p.enable_hr and shared.sd_refiner is not None

    def hires_resize(latents): # input=latents output=pil
        """Upscale first-pass latents to the hires size and store the result in `p.init_images`.

        A latent upscaler resizes in latent space before decoding; any other upscaler
        resizes the decoded PIL images. Also updates `p.width` / `p.height`.
        """
        latent_upscaler = shared.latent_upscale_modes.get(p.hr_upscaler, None)
        shared.log.info(f'Diffusers Hires: upscaler={p.hr_upscaler} width={p.hr_upscale_to_x} height={p.hr_upscale_to_y} images={latents.shape[0]}')
        if latent_upscaler is not None:
            latents = torch.nn.functional.interpolate(latents, size=(p.hr_upscale_to_y // 8, p.hr_upscale_to_x // 8), mode=latent_upscaler["mode"], antialias=latent_upscaler["antialias"])
        first_pass_images = vae_decode(latents=latents, model=shared.sd_model, full_quality=True, output_type='pil')
        p.init_images = []
        for first_pass_image in first_pass_images:
            init_image = images.resize_image(1, first_pass_image, p.hr_upscale_to_x, p.hr_upscale_to_y, upscaler_name=p.hr_upscaler) if latent_upscaler is None else first_pass_image
            p.init_images.append(init_image)
        p.width = p.hr_upscale_to_x
        p.height = p.hr_upscale_to_y

    def save_intermediate(latents, suffix):
        """Decode `latents` once and save each image with its own infotext and `suffix`."""
        from modules.processing import create_infotext
        decoded = vae_decode(latents=latents, model=shared.sd_model, output_type='pil', full_quality=p.full_quality)
        for i, image in enumerate(decoded):
            info = create_infotext(p, p.all_prompts, p.all_seeds, p.all_subseeds, [], iteration=p.iteration, position_in_batch=i)
            images.save_image(image, path=p.outpath_samples, basename="", seed=seeds[i], prompt=prompts[i], extension=shared.opts.samples_format, info=info, p=p, suffix=suffix)

    def diffusers_callback(_step: int, _timestep: int, latents: torch.FloatTensor):
        """Per-step pipeline callback: update progress counters and the preview latent."""
        shared.state.sampling_step += 1
        shared.state.sampling_steps = p.steps
        if p.is_hr_pass:
            shared.state.sampling_steps += p.hr_second_pass_steps
        shared.state.current_latent = latents

    def full_vae_decode(latents, model):
        """Decode latents with `model.vae`, optionally parking the UNet on CPU meanwhile."""
        shared.log.debug(f'Diffusers VAE decode: name={sd_vae.loaded_vae_file if sd_vae.loaded_vae_file is not None else "baked"} dtype={model.vae.dtype} upcast={model.vae.config.get("force_upcast", None)} images={latents.shape[0]}')
        move_unet = shared.opts.diffusers_move_unet and not model.has_accelerate
        unet_device = None
        if move_unet:
            shared.log.debug('Diffusers: Moving UNet to CPU')
            unet_device = model.unet.device
            model.unet.to(devices.cpu)
            devices.torch_gc()
        try:
            model.vae.to(devices.device)
            # Tensor.to() returns a new tensor, so the result must be assigned
            latents = latents.to(model.vae.device)
            decoded = model.vae.decode(latents / model.vae.config.scaling_factor, return_dict=False)[0]
        finally:
            # restore the UNet even when decoding fails so the model stays usable
            if move_unet:
                model.unet.to(unet_device)
        return decoded

    def taesd_vae_decode(latents):
        """Decode latents one by one with TAESD and rescale the output via `* 2.0 - 1.0`."""
        shared.log.debug(f'Diffusers VAE decode: name=TAESD images={latents.shape[0]}')
        decoded = torch.zeros((len(latents), 3, p.height, p.width), dtype=devices.dtype_vae, device=devices.device)
        for i, latent in enumerate(latents):
            decoded[i] = (sd_vae_taesd.decode(latent) * 2.0) - 1.0
        return decoded

    def vae_decode(latents, model, output_type='np', full_quality=True):
        """Turn latents into images using the full VAE or TAESD.

        Non-tensor input is returned unchanged (treated as already decoded). Returns an
        empty list for an empty batch, on interrupt/skip, or when `model` has no VAE.
        """
        if not torch.is_tensor(latents): # already decoded
            return latents
        if latents.shape[0] == 0:
            shared.log.error(f'VAE nothing to decode: {latents.shape}')
            return []
        if shared.state.interrupted or shared.state.skipped:
            return []
        if not hasattr(model, 'vae'):
            shared.log.error('VAE not found in model')
            return []
        if len(latents.shape) == 3: # lost a batch dim in hires
            latents = latents.unsqueeze(0)
        if full_quality:
            # NOTE(review): always decodes with the base model's VAE, even when `model` is the
            # refiner; only post-processing uses `model`. Confirm this is intentional.
            decoded = full_vae_decode(latents=latents, model=shared.sd_model)
        else:
            decoded = taesd_vae_decode(latents=latents)
        imgs = model.image_processor.postprocess(decoded, output_type=output_type)
        return imgs

    def fix_prompts(prompts, negative_prompts, prompts_2, negative_prompts_2):
        """Normalize prompt arguments to lists and pad the shorter ones.

        Strings are wrapped in a list. Lists are padded IN PLACE by repeating their last
        entry; later code indexes the caller's lists per image and depends on this.
        `prompts_2` / `negative_prompts_2` are left untouched when they are not str/list.
        """
        def pad(items, target):
            while len(items) < target:
                # an empty list has no last entry to repeat, fall back to an empty prompt
                items.append(items[-1] if len(items) > 0 else '')

        if isinstance(prompts, str):
            prompts = [prompts]
        if isinstance(negative_prompts, str):
            negative_prompts = [negative_prompts]
        pad(negative_prompts, len(prompts))
        if isinstance(prompts_2, str):
            prompts_2 = [prompts_2]
        if isinstance(prompts_2, list):
            pad(prompts_2, len(prompts))
        if isinstance(negative_prompts_2, str):
            negative_prompts_2 = [negative_prompts_2]
        if isinstance(negative_prompts_2, list):
            # without a secondary prompt list, match the primary prompts instead
            pad(negative_prompts_2, len(prompts_2) if prompts_2 is not None else len(prompts))
        return prompts, negative_prompts, prompts_2, negative_prompts_2

    def set_pipeline_args(model, prompts: list, negative_prompts: list, prompts_2: typing.Optional[list]=None, negative_prompts_2: typing.Optional[list]=None, is_refiner: bool=False, desc:str='', **kwargs):
        """Build the keyword arguments for calling `model`.

        Only parameters present in the pipeline's `__call__` signature are set. Values in
        `kwargs` are applied last and override the defaults chosen here; unsupported
        kwargs are silently dropped. Logs a summary of the resulting arguments.

        Returns:
            dict of arguments to pass to `model(**args)`.
        """
        if hasattr(model, "set_progress_bar_config"):
            model.set_progress_bar_config(bar_format='Progress {rate_fmt}{postfix} {bar} {percentage:3.0f}% {n_fmt}/{total_fmt} {elapsed} {remaining} '+desc, ncols=80, colour='#327fba')
        args = {}
        pipeline = model
        signature = inspect.signature(type(pipeline).__call__)
        possible = signature.parameters.keys()
        generator_device = devices.cpu if shared.opts.diffusers_generator_device == "cpu" else shared.device
        # NOTE(review): one generator per seed of the whole batch, also when the pipeline is
        # called with a single image (refiner loop) - confirm the pipeline accepts that.
        generator = [torch.Generator(generator_device).manual_seed(s) for s in seeds]
        prompt_embed = None
        pooled = None
        negative_embed = None
        negative_pooled = None
        prompts, negative_prompts, prompts_2, negative_prompts_2 = fix_prompts(prompts, negative_prompts, prompts_2, negative_prompts_2)
        if shared.opts.prompt_attention in {'Compel parser', 'Full parser'}:
            # clip_skip is consumed by the prompt encoder here; otherwise it stays in kwargs
            prompt_embed, pooled, negative_embed, negative_pooled = prompt_parser_diffusers.compel_encode_prompts(
                model, prompts, negative_prompts, prompts_2, negative_prompts_2, is_refiner, kwargs.pop("clip_skip", None))
        if 'prompt' in possible:
            if hasattr(model, 'text_encoder') and 'prompt_embeds' in possible and prompt_embed is not None:
                args['prompt_embeds'] = prompt_embed
                if shared.sd_model_type == "sdxl":
                    args['pooled_prompt_embeds'] = pooled
                    args['prompt_2'] = None #Cannot pass prompts when passing embeds
            else:
                args['prompt'] = prompts
        if 'negative_prompt' in possible:
            if hasattr(model, 'text_encoder') and 'negative_prompt_embeds' in possible and negative_embed is not None:
                args['negative_prompt_embeds'] = negative_embed
                if shared.sd_model_type == "sdxl":
                    args['negative_pooled_prompt_embeds'] = negative_pooled
                    args['negative_prompt_2'] = None
            else:
                args['negative_prompt'] = negative_prompts
        if 'num_inference_steps' in possible:
            # default only: every caller passes the step count for its own pass via kwargs
            args['num_inference_steps'] = p.steps
        if 'guidance_scale' in possible:
            args['guidance_scale'] = p.cfg_scale
        if 'generator' in possible:
            args['generator'] = generator
        if 'output_type' in possible:
            args['output_type'] = 'np'
        if 'callback_steps' in possible:
            args['callback_steps'] = 1
        if 'callback' in possible:
            args['callback'] = diffusers_callback
        if 'cross_attention_kwargs' in possible and lora_state['active'] and shared.opts.diffusers_lora_loader == "diffusers default":
            args['cross_attention_kwargs'] = { 'scale': lora_state['multiplier'][0]}
        for arg, value in kwargs.items():
            if arg in possible:
                args[arg] = value

        # compact copy for logging: replace bulky values with their type/length/shape
        clean = args.copy()
        clean.pop('callback', None)
        clean.pop('callback_steps', None)
        for key in ('image', 'mask_image'):
            if key in clean:
                clean[key] = type(clean[key])
        for key in ('prompt', 'negative_prompt'):
            if key in clean:
                clean[key] = len(clean[key])
        for key in ('prompt_embeds', 'pooled_prompt_embeds', 'negative_prompt_embeds', 'negative_pooled_prompt_embeds'):
            if key in clean:
                clean[key] = clean[key].shape
        clean['generator'] = generator_device
        shared.log.debug(f'Diffuser pipeline: {pipeline.__class__.__name__} task={sd_models.get_diffusers_task(model)} set={clean}')
        return args

    is_karras_compatible = shared.sd_model.__class__.__init__.__annotations__.get("scheduler", None) == diffusers.schedulers.scheduling_utils.KarrasDiffusionSchedulers
    use_sampler = p.sampler_name if not p.is_hr_pass else p.latent_sampler
    # NOTE(review): `and` binds tighter than `or`, so a scheduler without a `name` attribute
    # always triggers sampler creation regardless of the other conditions - confirm intent.
    if (not hasattr(shared.sd_model.scheduler, 'name')) or (shared.sd_model.scheduler.name != use_sampler) and (use_sampler != 'Default') and is_karras_compatible:
        sampler = sd_samplers.all_samplers_map.get(use_sampler, None)
        if sampler is None:
            sampler = sd_samplers.all_samplers_map.get("UniPC")
        sd_samplers.create_sampler(sampler.name, shared.sd_model) # TODO(Patrick): For wrapped pipelines this is currently a no-op
        # collect non-default scheduler settings for the infotext, space separated
        option_parts = []
        if shared.opts.schedulers_prediction_type != 'default':
            option_parts.append(f'type:{shared.opts.schedulers_prediction_type}')
        if not shared.opts.schedulers_use_karras:
            option_parts.append('no_karras')
        if not shared.opts.schedulers_use_loworder:
            option_parts.append('no_low_order')
        if shared.opts.schedulers_use_thresholding:
            option_parts.append('dynamic_thresholding')
        if shared.opts.schedulers_dpm_solver != 'sde-dpmsolver++':
            option_parts.append(f'solver:{shared.opts.schedulers_dpm_solver}')
        if shared.opts.schedulers_beta_schedule != 'default':
            option_parts.append(f'beta:{shared.opts.schedulers_beta_schedule}:{shared.opts.schedulers_beta_start}:{shared.opts.schedulers_beta_end}')
        sampler_options = ' '.join(option_parts)
        p.extra_generation_params['Sampler options'] = sampler_options if len(sampler_options) > 0 else None
        p.extra_generation_params['Pipeline'] = shared.sd_model.__class__.__name__

    # repeat the last init image so there is one per prompt
    if len(getattr(p, 'init_images', [])) > 0:
        while len(p.init_images) < len(prompts):
            p.init_images.append(p.init_images[-1])

    task = sd_models.get_diffusers_task(shared.sd_model)
    task_specific_kwargs = {}
    if task == sd_models.DiffusersTaskType.TEXT_2_IMAGE:
        p.ops.append('txt2img')
        task_specific_kwargs = {"height": p.height, "width": p.width}
    elif task == sd_models.DiffusersTaskType.IMAGE_2_IMAGE:
        p.ops.append('img2img')
        task_specific_kwargs = {"image": p.init_images, "strength": p.denoising_strength}
    elif task == sd_models.DiffusersTaskType.INPAINTING:
        p.ops.append('inpaint')
        task_specific_kwargs = {"image": p.init_images, "mask_image": p.mask, "strength": p.denoising_strength, "height": p.height, "width": p.width}

    if shared.state.interrupted or shared.state.skipped:
        unload_diffusers_lora()
        return results

    if shared.opts.diffusers_move_base and not shared.sd_model.has_accelerate:
        shared.sd_model.to(devices.device)

    use_refiner_range = is_refiner_enabled and p.refiner_start > 0 and p.refiner_start < 1
    base_args = set_pipeline_args(
        model=shared.sd_model,
        prompts=prompts,
        negative_prompts=negative_prompts,
        prompts_2=[p.refiner_prompt] if len(p.refiner_prompt) > 0 else prompts,
        negative_prompts_2=[p.refiner_negative] if len(p.refiner_negative) > 0 else negative_prompts,
        # the base pass always runs the first-pass step count, even when hires is scheduled
        num_inference_steps=p.steps,
        eta=shared.opts.eta_ddim,
        guidance_rescale=p.diffusers_guidance_rescale,
        denoising_start=0 if use_refiner_range else None,
        denoising_end=p.refiner_start if use_refiner_range else None,
        output_type='latent' if hasattr(shared.sd_model, 'vae') else 'np',
        is_refiner=False,
        clip_skip=p.clip_skip,
        desc='Base',
        **task_specific_kwargs
    )
    p.extra_generation_params['CFG rescale'] = p.diffusers_guidance_rescale
    p.extra_generation_params["Eta DDIM"] = shared.opts.eta_ddim if shared.opts.eta_ddim is not None and shared.opts.eta_ddim > 0 else None
    output = shared.sd_model(**base_args) # pylint: disable=not-callable

    if lora_state['active']:
        p.extra_generation_params['LoRA method'] = shared.opts.diffusers_lora_loader
        unload_diffusers_lora()

    if shared.state.interrupted or shared.state.skipped:
        return results

    # optional hires pass
    if p.is_hr_pass:
        p.init_hr()
        if p.width != p.hr_upscale_to_x or p.height != p.hr_upscale_to_y:
            if shared.opts.save and not p.do_not_save_samples and shared.opts.save_images_before_highres_fix and hasattr(shared.sd_model, 'vae'):
                save_intermediate(latents=output.images, suffix="-before-hires")
            hires_resize(latents=output.images)
            sd_models.set_diffuser_pipe(shared.sd_model, sd_models.DiffusersTaskType.IMAGE_2_IMAGE)
            p.ops.append('hires')
            hires_args = set_pipeline_args(
                model=shared.sd_model,
                prompts=prompts,
                negative_prompts=negative_prompts,
                prompts_2=[p.refiner_prompt] if len(p.refiner_prompt) > 0 else prompts,
                negative_prompts_2=[p.refiner_negative] if len(p.refiner_negative) > 0 else negative_prompts,
                num_inference_steps=p.hr_second_pass_steps,
                eta=shared.opts.eta_ddim,
                guidance_rescale=p.diffusers_guidance_rescale,
                output_type='latent' if hasattr(shared.sd_model, 'vae') else 'np',
                is_refiner=False,
                clip_skip=p.clip_skip,
                image=p.init_images,
                strength=p.denoising_strength,
                desc='Hires',
            )
            output = shared.sd_model(**hires_args) # pylint: disable=not-callable

    # optional refiner pass or decode
    if is_refiner_enabled:
        if shared.opts.save and not p.do_not_save_samples and shared.opts.save_images_before_refiner and hasattr(shared.sd_model, 'vae'):
            save_intermediate(latents=output.images, suffix="-before-refiner")
        if shared.opts.diffusers_move_base and not shared.sd_model.has_accelerate:
            shared.log.debug('Diffusers: Moving base model to CPU')
            shared.sd_model.to(devices.cpu)
            devices.torch_gc()

        # NOTE(review): same `or`/`and` precedence as the base sampler check, and the last term
        # tests p.sampler_name although p.latent_sampler is the sampler being selected.
        if (not hasattr(shared.sd_refiner.scheduler, 'name')) or (shared.sd_refiner.scheduler.name != p.latent_sampler) and (p.sampler_name != 'Default'):
            sampler = sd_samplers.all_samplers_map.get(p.latent_sampler, None)
            if sampler is None:
                sampler = sd_samplers.all_samplers_map.get("UniPC")
            sd_samplers.create_sampler(sampler.name, shared.sd_refiner) # TODO(Patrick): For wrapped pipelines this is currently a no-op

        if shared.state.interrupted or shared.state.skipped:
            return results

        if shared.opts.diffusers_move_refiner and not shared.sd_refiner.has_accelerate:
            shared.sd_refiner.to(devices.device)
        p.ops.append('refine')
        refiner_range = p.refiner_start > 0 and p.refiner_start < 1
        for i, image in enumerate(output.images):
            # the refiner is run one image at a time
            refiner_args = set_pipeline_args(
                model=shared.sd_refiner,
                prompts=[p.refiner_prompt] if len(p.refiner_prompt) > 0 else prompts[i],
                negative_prompts=[p.refiner_negative] if len(p.refiner_negative) > 0 else negative_prompts[i],
                num_inference_steps=p.hr_second_pass_steps,
                eta=shared.opts.eta_ddim,
                strength=p.denoising_strength,
                guidance_scale=p.image_cfg_scale if p.image_cfg_scale is not None else p.cfg_scale,
                guidance_rescale=p.diffusers_guidance_rescale,
                denoising_start=p.refiner_start if refiner_range else None,
                denoising_end=1 if refiner_range else None,
                image=image,
                output_type='latent' if hasattr(shared.sd_refiner, 'vae') else 'np',
                is_refiner=True,
                clip_skip=p.clip_skip,
                desc='Refiner',
            )
            refiner_output = shared.sd_refiner(**refiner_args) # pylint: disable=not-callable
            p.extra_generation_params['Image CFG scale'] = p.image_cfg_scale
            p.extra_generation_params['Refiner start'] = p.refiner_start
            p.extra_generation_params["Hires steps"] = p.hr_second_pass_steps

            if not shared.state.interrupted and not shared.state.skipped:
                refiner_images = vae_decode(latents=refiner_output.images, model=shared.sd_refiner, full_quality=True)
                results.extend(refiner_images)

        if shared.opts.diffusers_move_refiner and not shared.sd_refiner.has_accelerate:
            shared.log.debug('Diffusers: Moving refiner model to CPU')
            shared.sd_refiner.to(devices.cpu)
            devices.torch_gc()

    # final decode since there is no refiner
    if not is_refiner_enabled:
        results = vae_decode(latents=output.images, model=shared.sd_model, full_quality=p.full_quality)

    return results