From b9eea06e9fd0d00aedd1948db972a13f7110367d Mon Sep 17 00:00:00 2001
From: Kane Wallmann <57159130+kanewallmann@users.noreply.github.com>
Date: Wed, 5 Oct 2022 20:22:07 +1000
Subject: [PATCH] Include CLIPTextModel parameters in conversion (#695)

---
 ...t_original_stable_diffusion_to_diffusers.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/scripts/convert_original_stable_diffusion_to_diffusers.py b/scripts/convert_original_stable_diffusion_to_diffusers.py
index ee7fc33543..db1b307369 100644
--- a/scripts/convert_original_stable_diffusion_to_diffusers.py
+++ b/scripts/convert_original_stable_diffusion_to_diffusers.py
@@ -595,6 +595,22 @@ def convert_ldm_bert_checkpoint(checkpoint, config):
     return hf_model
 
 
+def convert_ldm_clip_checkpoint(checkpoint):
+    text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
+
+    keys = list(checkpoint.keys())
+
+    text_model_dict = {}
+
+    for key in keys:
+        if key.startswith("cond_stage_model.transformer"):
+            text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key]
+
+    text_model.load_state_dict(text_model_dict)
+
+    return text_model
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
 
@@ -668,7 +684,7 @@ if __name__ == "__main__":
     # Convert the text model.
     text_model_type = original_config.model.params.cond_stage_config.target.split(".")[-1]
     if text_model_type == "FrozenCLIPEmbedder":
-        text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
+        text_model = convert_ldm_clip_checkpoint(checkpoint)
         tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
         safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker")
         feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker")
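
Note for reviewers: before this patch, the conversion script instantiated a stock openai/clip-vit-large-patch14 text encoder and silently dropped any fine-tuned text-encoder weights stored in the original checkpoint; the new helper strips the "cond_stage_model.transformer." prefix from those keys and loads them into CLIPTextModel. Below is a minimal, self-contained sketch for verifying the helper locally. The checkpoint filename and the "state_dict" key are assumptions about how original-format Stable Diffusion checkpoints are commonly packaged, not something this patch prescribes.

import torch
from transformers import CLIPTextModel


def convert_ldm_clip_checkpoint(checkpoint):
    # Same logic as the patch: start from the pretrained CLIP text encoder,
    # then overwrite its weights with the values stored in the LDM checkpoint.
    text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")

    text_model_dict = {}
    for key in checkpoint.keys():
        # LDM prefixes the text-encoder weights with
        # "cond_stage_model.transformer."; stripping that prefix makes the
        # keys line up with CLIPTextModel's own state dict.
        if key.startswith("cond_stage_model.transformer"):
            text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key]

    text_model.load_state_dict(text_model_dict)
    return text_model


if __name__ == "__main__":
    # "sd-v1-4.ckpt" is a placeholder path; CompVis releases nest the
    # weights under a "state_dict" key.
    state_dict = torch.load("sd-v1-4.ckpt", map_location="cpu")["state_dict"]
    text_model = convert_ldm_clip_checkpoint(state_dict)
    # Rough sanity check: confirm the expected module type came back.
    print(type(text_model).__name__)  # CLIPTextModel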