From 37fe8e00b21514bb8b83adde7142e6e7dbe94e05 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 19 Jul 2022 15:05:40 +0000 Subject: [PATCH] upload --- conversion.py | 138 ------ ...rt_ldm_original_checkpoint_to_diffusers.py | 17 +- ...ncsnpp_original_checkpoint_to_diffusers.py | 95 ++-- tests/test_modeling_utils.py | 411 +++--------------- 4 files changed, 136 insertions(+), 525 deletions(-) delete mode 100755 conversion.py diff --git a/conversion.py b/conversion.py deleted file mode 100755 index fbb17571ea..0000000000 --- a/conversion.py +++ /dev/null @@ -1,138 +0,0 @@ -# coding=utf-8 -# Copyright 2022 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import inspect -import tempfile -import unittest - -import numpy as np -import torch - -from diffusers import ( - AutoencoderKL, - DDIMPipeline, - DDIMScheduler, - DDPMPipeline, - DDPMScheduler, - GlidePipeline, - GlideSuperResUNetModel, - GlideTextToImageUNetModel, - LatentDiffusionPipeline, - LatentDiffusionUncondPipeline, - NCSNpp, - PNDMPipeline, - PNDMScheduler, - ScoreSdeVePipeline, - ScoreSdeVeScheduler, - ScoreSdeVpPipeline, - ScoreSdeVpScheduler, - UNetLDMModel, - UNetModel, - UNetUnconditionalModel, - VQModel, -) -from diffusers.configuration_utils import ConfigMixin -from diffusers.pipeline_utils import DiffusionPipeline -from diffusers.testing_utils import floats_tensor, slow, torch_device -from diffusers.training_utils import EMAModel - - -# 1. LDM - -def test_output_pretrained_ldm_dummy(): - model = UNetUnconditionalModel.from_pretrained("fusing/unet-ldm-dummy", ldm=True) - model.eval() - - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - noise = torch.randn(1, model.config.in_channels, model.config.image_size, model.config.image_size) - time_step = torch.tensor([10] * noise.shape[0]) - - with torch.no_grad(): - output = model(noise, time_step) - - print(model) - import ipdb; ipdb.set_trace() - - -def test_output_pretrained_ldm(): - model = UNetUnconditionalModel.from_pretrained("fusing/latent-diffusion-celeba-256", subfolder="unet", ldm=True) - model.eval() - - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - noise = torch.randn(1, model.config.in_channels, model.config.image_size, model.config.image_size) - time_step = torch.tensor([10] * noise.shape[0]) - - with torch.no_grad(): - output = model(noise, time_step) - - print(model) - import ipdb; ipdb.set_trace() - -# To see the how the final model should look like -# => this is the architecture in which the model should be saved in the new format -# -> verify new repo with the following tests (in `test_modeling_utils.py`) -# - test_ldm_uncond (in PipelineTesterMixin) -# - test_output_pretrained ( in UNetLDMModelTests) - -#test_output_pretrained_ldm_dummy() -#test_output_pretrained_ldm() - - -# 2. 
DDPM
-
-def get_model(model_id):
-    model = UNetUnconditionalModel.from_pretrained(model_id, ldm=True)
-
-    noise = torch.randn(1, model.config.in_channels, model.config.image_size, model.config.image_size)
-    time_step = torch.tensor([10] * noise.shape[0])
-
-    with torch.no_grad():
-        output = model(noise, time_step)
-
-    print(model)
-
-# Repos to convert and port to google (part of https://github.com/hojonathanho/diffusion)
-# - fusing/ddpm_dummy
-# - fusing/ddpm-cifar10
-# - https://huggingface.co/fusing/ddpm-lsun-church-ema
-# - https://huggingface.co/fusing/ddpm-lsun-bedroom-ema
-# - https://huggingface.co/fusing/ddpm-celeba-hq
-
-# tests to make sure to pass
-# - test_ddim_cifar10, test_ddim_lsun, test_ddpm_cifar10, test_ddim_cifar10 (in PipelineTesterMixin)
-# - test_output_pretrained ( in UNetModelTests)
-
-# e.g.
-get_model("fusing/ddpm-cifar10")
-
-# 3. NCSNpp
-
-# Repos to convert and port to google (part of https://github.com/yang-song/score_sde)
-# - https://huggingface.co/fusing/ffhq_ncsnpp
-# - https://huggingface.co/fusing/church_256-ncsnpp-ve
-# - https://huggingface.co/fusing/celebahq_256-ncsnpp-ve
-# - https://huggingface.co/fusing/bedroom_256-ncsnpp-ve
-# - https://huggingface.co/fusing/ffhq_256-ncsnpp-ve
-
-# tests to make sure to pass
-# - test_score_sde_ve_pipeline (in PipelineTesterMixin)
-# - test_output_pretrained_ve_mid, test_output_pretrained_ve_large (in NCSNppModelTests)
diff --git a/scripts/convert_ldm_original_checkpoint_to_diffusers.py b/scripts/convert_ldm_original_checkpoint_to_diffusers.py
index 34a02b036a..2ec816f08c 100644
--- a/scripts/convert_ldm_original_checkpoint_to_diffusers.py
+++ b/scripts/convert_ldm_original_checkpoint_to_diffusers.py
@@ -17,6 +17,7 @@ import argparse
 import json
 
 import torch
+from diffusers import VQModel, DDPMScheduler, UNetUnconditionalModel, LatentDiffusionUncondPipeline
 
 
 def shave_segments(path, n_shave_prefix_segments=1):
@@ -314,4 +315,18 @@ if __name__ == "__main__":
         config = json.loads(f.read())
 
     converted_checkpoint = convert_ldm_checkpoint(checkpoint, config)
-    torch.save(checkpoint, args.dump_path)
+
+    if "ldm" in config:
+        del config["ldm"]
+
+    model = UNetUnconditionalModel(**config)
+    model.load_state_dict(converted_checkpoint)
+
+    try:
+        scheduler = DDPMScheduler.from_config("/".join(args.checkpoint_path.split("/")[:-1]))
+        vqvae = VQModel.from_pretrained("/".join(args.checkpoint_path.split("/")[:-1]))
+
+        pipe = LatentDiffusionUncondPipeline(unet=model, scheduler=scheduler, vae=vqvae)
+        pipe.save_pretrained(args.dump_path)
+    except Exception:
+        model.save_pretrained(args.dump_path)
diff --git a/scripts/convert_ncsnpp_original_checkpoint_to_diffusers.py b/scripts/convert_ncsnpp_original_checkpoint_to_diffusers.py
index 79bdb560a5..a50b780e51 100644
--- a/scripts/convert_ncsnpp_original_checkpoint_to_diffusers.py
+++ b/scripts/convert_ncsnpp_original_checkpoint_to_diffusers.py
@@ -17,25 +17,24 @@ import argparse
 import json
 
 import torch
-from diffusers import UNetUnconditionalModel
-
+from diffusers import ScoreSdeVePipeline, ScoreSdeVeScheduler, UNetUnconditionalModel
 
 
 def convert_ncsnpp_checkpoint(checkpoint, config):
     """
     Takes a state dict and a config and maps the original checkpoint weights onto the new model architecture.
     """
 
-    new_model_architecture = UNetUnconditionalModel(**config)
-    new_model_architecture.time_steps.W.data= checkpoint['all_modules.0.W'].data
-    new_model_architecture.time_steps.weight.data = checkpoint['all_modules.0.W'].data
-    new_model_architecture.time_embedding.linear_1.weight.data = checkpoint['all_modules.1.weight'].data
-    new_model_architecture.time_embedding.linear_1.bias.data = 
checkpoint['all_modules.1.bias'].data - - new_model_architecture.time_embedding.linear_2.weight.data = checkpoint['all_modules.2.weight'].data - new_model_architecture.time_embedding.linear_2.bias.data= checkpoint['all_modules.2.bias'].data + new_model_architecture = UNetUnconditionalModel(**config) + new_model_architecture.time_steps.W.data = checkpoint["all_modules.0.W"].data + new_model_architecture.time_steps.weight.data = checkpoint["all_modules.0.W"].data + new_model_architecture.time_embedding.linear_1.weight.data = checkpoint["all_modules.1.weight"].data + new_model_architecture.time_embedding.linear_1.bias.data = checkpoint["all_modules.1.bias"].data - new_model_architecture.conv_in.weight.data = checkpoint['all_modules.3.weight'].data - new_model_architecture.conv_in.bias.data = checkpoint['all_modules.3.bias'].data + new_model_architecture.time_embedding.linear_2.weight.data = checkpoint["all_modules.2.weight"].data + new_model_architecture.time_embedding.linear_2.bias.data = checkpoint["all_modules.2.bias"].data + + new_model_architecture.conv_in.weight.data = checkpoint["all_modules.3.weight"].data + new_model_architecture.conv_in.bias.data = checkpoint["all_modules.3.bias"].data new_model_architecture.conv_norm_out.weight.data = checkpoint[list(checkpoint.keys())[-4]].data new_model_architecture.conv_norm_out.bias.data = checkpoint[list(checkpoint.keys())[-3]].data @@ -44,12 +43,11 @@ def convert_ncsnpp_checkpoint(checkpoint, config): module_index = 4 - - def set_attention_weights(new_layer,old_checkpoint,index): + def set_attention_weights(new_layer, old_checkpoint, index): new_layer.query.weight.data = old_checkpoint[f"all_modules.{index}.NIN_0.W"].data.T new_layer.key.weight.data = old_checkpoint[f"all_modules.{index}.NIN_1.W"].data.T new_layer.value.weight.data = old_checkpoint[f"all_modules.{index}.NIN_2.W"].data.T - + new_layer.query.bias.data = old_checkpoint[f"all_modules.{index}.NIN_0.b"].data new_layer.key.bias.data = old_checkpoint[f"all_modules.{index}.NIN_1.b"].data new_layer.value.bias.data = old_checkpoint[f"all_modules.{index}.NIN_2.b"].data @@ -60,7 +58,7 @@ def convert_ncsnpp_checkpoint(checkpoint, config): new_layer.group_norm.weight.data = old_checkpoint[f"all_modules.{index}.GroupNorm_0.weight"].data new_layer.group_norm.bias.data = old_checkpoint[f"all_modules.{index}.GroupNorm_0.bias"].data - def set_resnet_weights(new_layer,old_checkpoint,index): + def set_resnet_weights(new_layer, old_checkpoint, index): new_layer.conv1.weight.data = old_checkpoint[f"all_modules.{index}.Conv_0.weight"].data new_layer.conv1.bias.data = old_checkpoint[f"all_modules.{index}.Conv_0.bias"].data new_layer.norm1.weight.data = old_checkpoint[f"all_modules.{index}.GroupNorm_0.weight"].data @@ -70,7 +68,7 @@ def convert_ncsnpp_checkpoint(checkpoint, config): new_layer.conv2.bias.data = old_checkpoint[f"all_modules.{index}.Conv_1.bias"].data new_layer.norm2.weight.data = old_checkpoint[f"all_modules.{index}.GroupNorm_1.weight"].data new_layer.norm2.bias.data = old_checkpoint[f"all_modules.{index}.GroupNorm_1.bias"].data - + new_layer.time_emb_proj.weight.data = old_checkpoint[f"all_modules.{index}.Dense_0.weight"].data new_layer.time_emb_proj.bias.data = old_checkpoint[f"all_modules.{index}.Dense_0.bias"].data @@ -81,37 +79,37 @@ def convert_ncsnpp_checkpoint(checkpoint, config): for i, block in enumerate(new_model_architecture.downsample_blocks): has_attentions = hasattr(block, "attentions") for j in range(len(block.resnets)): - set_resnet_weights(block.resnets[j],checkpoint, 
module_index) + set_resnet_weights(block.resnets[j], checkpoint, module_index) module_index += 1 if has_attentions: - set_attention_weights(block.attentions[j],checkpoint, module_index) + set_attention_weights(block.attentions[j], checkpoint, module_index) module_index += 1 - + if hasattr(block, "downsamplers") and block.downsamplers is not None: - set_resnet_weights(block.resnet_down,checkpoint, module_index) + set_resnet_weights(block.resnet_down, checkpoint, module_index) module_index += 1 block.skip_conv.weight.data = checkpoint[f"all_modules.{module_index}.Conv_0.weight"].data block.skip_conv.bias.data = checkpoint[f"all_modules.{module_index}.Conv_0.bias"].data module_index += 1 - - - set_resnet_weights(new_model_architecture.mid.resnets[0],checkpoint,module_index) + set_resnet_weights(new_model_architecture.mid.resnets[0], checkpoint, module_index) module_index += 1 - set_attention_weights(new_model_architecture.mid.attentions[0],checkpoint, module_index) + set_attention_weights(new_model_architecture.mid.attentions[0], checkpoint, module_index) module_index += 1 - set_resnet_weights(new_model_architecture.mid.resnets[1],checkpoint,module_index) + set_resnet_weights(new_model_architecture.mid.resnets[1], checkpoint, module_index) module_index += 1 for i, block in enumerate(new_model_architecture.upsample_blocks): has_attentions = hasattr(block, "attentions") for j in range(len(block.resnets)): - set_resnet_weights(block.resnets[j],checkpoint, module_index) + set_resnet_weights(block.resnets[j], checkpoint, module_index) module_index += 1 if has_attentions: - set_attention_weights(block.attentions[0],checkpoint, module_index) # why can there only be a single attention layer for up? + set_attention_weights( + block.attentions[0], checkpoint, module_index + ) # why can there only be a single attention layer for up? module_index += 1 - + if hasattr(block, "resnet_up") and block.resnet_up is not None: block.skip_norm.weight.data = checkpoint[f"all_modules.{module_index}.weight"].data block.skip_norm.bias.data = checkpoint[f"all_modules.{module_index}.bias"].data @@ -119,9 +117,9 @@ def convert_ncsnpp_checkpoint(checkpoint, config): block.skip_conv.weight.data = checkpoint[f"all_modules.{module_index}.weight"].data block.skip_conv.bias.data = checkpoint[f"all_modules.{module_index}.bias"].data module_index += 1 - set_resnet_weights(block.resnet_up,checkpoint, module_index) + set_resnet_weights(block.resnet_up, checkpoint, module_index) module_index += 1 - + new_model_architecture.conv_norm_out.weight.data = checkpoint[f"all_modules.{module_index}.weight"].data new_model_architecture.conv_norm_out.bias.data = checkpoint[f"all_modules.{module_index}.bias"].data module_index += 1 @@ -130,11 +128,16 @@ def convert_ncsnpp_checkpoint(checkpoint, config): return new_model_architecture.state_dict() + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - "--checkpoint_path", default="/Users/arthurzucker/Work/diffusers/ArthurZ/diffusion_model.pt", type=str, required=False, help="Path to the checkpoint to convert." + "--checkpoint_path", + default="/Users/arthurzucker/Work/diffusers/ArthurZ/diffusion_model.pt", + type=str, + required=False, + help="Path to the checkpoint to convert.", ) parser.add_argument( @@ -146,19 +149,35 @@ if __name__ == "__main__": ) parser.add_argument( - "--dump_path", default="/Users/arthurzucker/Work/diffusers/ArthurZ/diffusion_model_new.pt", type=str, required=False, help="Path to the output model." 
+        "--dump_path",
+        default="/Users/arthurzucker/Work/diffusers/ArthurZ/diffusion_model_new.pt",
+        type=str,
+        required=False,
+        help="Path to the output model.",
     )
 
     args = parser.parse_args()
-
-
-
     checkpoint = torch.load(args.checkpoint_path, map_location="cpu")
 
     with open(args.config_file) as f:
         config = json.loads(f.read())
+    converted_checkpoint = convert_ncsnpp_checkpoint(
+        checkpoint,
+        config,
+    )
 
-    converted_checkpoint = convert_ncsnpp_checkpoint(checkpoint, config,)
-    torch.save(converted_checkpoint, args.dump_path)
+
+    if "sde" in config:
+        del config["sde"]
+
+    model = UNetUnconditionalModel(**config)
+    model.load_state_dict(converted_checkpoint)
+
+    try:
+        scheduler = ScoreSdeVeScheduler.from_config("/".join(args.checkpoint_path.split("/")[:-1]))
+
+        pipe = ScoreSdeVePipeline(unet=model, scheduler=scheduler)
+        pipe.save_pretrained(args.dump_path)
+    except Exception:
+        model.save_pretrained(args.dump_path)
diff --git a/tests/test_modeling_utils.py b/tests/test_modeling_utils.py
index e8c87f73c9..64db9c9b4f 100755
--- a/tests/test_modeling_utils.py
+++ b/tests/test_modeling_utils.py
@@ -22,26 +22,19 @@ import unittest
 import numpy as np
 import torch
 
-from diffusers import UNetConditionalModel  # TODO(Patrick) - need to write tests with it
+from diffusers import UNetConditionalModel  # noqa: F401 TODO(Patrick) - need to write tests with it
 from diffusers import (
     AutoencoderKL,
     DDIMPipeline,
     DDIMScheduler,
     DDPMPipeline,
     DDPMScheduler,
-    GlidePipeline,
-    GlideSuperResUNetModel,
-    GlideTextToImageUNetModel,
     LatentDiffusionPipeline,
     LatentDiffusionUncondPipeline,
-    NCSNpp,
     PNDMPipeline,
     PNDMScheduler,
     ScoreSdeVePipeline,
     ScoreSdeVeScheduler,
-    ScoreSdeVpPipeline,
-    ScoreSdeVpScheduler,
-    UNetLDMModel,
     UNetUnconditionalModel,
     VQModel,
 )
@@ -278,222 +271,27 @@ class UnetModelTests(ModelTesterMixin, unittest.TestCase):
         inputs_dict = self.dummy_input
         return init_dict, inputs_dict
 
-    def test_from_pretrained_hub(self):
-        model, loading_info = UNetUnconditionalModel.from_pretrained(
-            "fusing/ddpm_dummy", output_loading_info=True, ddpm=True
-        )
-        self.assertIsNotNone(model)
-        # self.assertEqual(len(loading_info["missing_keys"]), 0)
-        model.to(torch_device)
-        image = model(**self.dummy_input)["sample"]
-
-        assert image is not None, "Make sure output is not None"
-
-    def test_output_pretrained(self):
-        model = UNetUnconditionalModel.from_pretrained("fusing/ddpm_dummy", ddpm=True)
-        model.eval()
-
-        torch.manual_seed(0)
-        if torch.cuda.is_available():
-            torch.cuda.manual_seed_all(0)
-
-        noise = torch.randn(1, model.config.in_channels, model.config.image_size, model.config.image_size)
-        time_step = torch.tensor([10])
-
-        with torch.no_grad():
-            output = model(noise, time_step)["sample"]
-
-        output_slice = output[0, -1, -3:, -3:].flatten()
-        # fmt: off
-        expected_output_slice = torch.tensor([0.2891, -0.1899, 0.2595, -0.6214, 0.0968, -0.2622, 0.4688, 0.1311, 0.0053])
-        # fmt: on
-        self.assertTrue(torch.allclose(output_slice, expected_output_slice, rtol=1e-2))
-
-
-class GlideSuperResUNetTests(ModelTesterMixin, unittest.TestCase):
-    model_class = GlideSuperResUNetModel
-
-    @property
-    def dummy_input(self):
-        batch_size = 4
-        num_channels = 6
-        sizes = (32, 32)
-        low_res_size = (4, 4)
-
-        noise = torch.randn((batch_size, num_channels // 2) + sizes).to(torch_device)
-        low_res = torch.randn((batch_size, 3) + low_res_size).to(torch_device)
-        time_step = torch.tensor([10] * noise.shape[0], device=torch_device)
-
-        return {"sample": noise, "timestep": time_step, "low_res": low_res}
-
-    @property
-    def 
input_shape(self): - return (3, 32, 32) - - @property - def output_shape(self): - return (6, 32, 32) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "attention_resolutions": (2,), - "channel_mult": (1, 2), - "in_channels": 6, - "out_channels": 6, - "model_channels": 32, - "num_head_channels": 8, - "num_heads_upsample": 1, - "num_res_blocks": 2, - "resblock_updown": True, - "resolution": 32, - "use_scale_shift_norm": True, - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - def test_output(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - output = model(**inputs_dict) - - output, _ = torch.split(output, 3, dim=1) - - self.assertIsNotNone(output) - expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - def test_from_pretrained_hub(self): - model, loading_info = GlideSuperResUNetModel.from_pretrained( - "fusing/glide-super-res-dummy", output_loading_info=True - ) - self.assertIsNotNone(model) - # self.assertEqual(len(loading_info["missing_keys"]), 0) - - model.to(torch_device) - image = model(**self.dummy_input) - - assert image is not None, "Make sure output is not None" - - def test_output_pretrained(self): - model = GlideSuperResUNetModel.from_pretrained("fusing/glide-super-res-dummy") - - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - noise = torch.randn(1, 3, 64, 64) - low_res = torch.randn(1, 3, 4, 4) - time_step = torch.tensor([42] * noise.shape[0]) - - with torch.no_grad(): - output = model(noise, time_step, low_res) - - output, _ = torch.split(output, 3, dim=1) - output_slice = output[0, -1, -3:, -3:].flatten() - # fmt: off - expected_output_slice = torch.tensor([-22.8782, -23.2652, -15.3966, -22.8034, -23.3159, -15.5640, -15.3970, -15.4614, - 10.4370]) - # fmt: on - self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3)) - - -class GlideTextToImageUNetModelTests(ModelTesterMixin, unittest.TestCase): - model_class = GlideTextToImageUNetModel - - @property - def dummy_input(self): - batch_size = 4 - num_channels = 3 - sizes = (32, 32) - transformer_dim = 32 - seq_len = 16 - - noise = torch.randn((batch_size, num_channels) + sizes).to(torch_device) - emb = torch.randn((batch_size, seq_len, transformer_dim)).to(torch_device) - time_step = torch.tensor([10] * noise.shape[0], device=torch_device) - - return {"sample": noise, "timestep": time_step, "transformer_out": emb} - - @property - def input_shape(self): - return (3, 32, 32) - - @property - def output_shape(self): - return (6, 32, 32) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "attention_resolutions": (2,), - "channel_mult": (1, 2), - "in_channels": 3, - "out_channels": 6, - "model_channels": 32, - "num_head_channels": 8, - "num_heads_upsample": 1, - "num_res_blocks": 2, - "resblock_updown": True, - "resolution": 32, - "use_scale_shift_norm": True, - "transformer_dim": 32, - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - def test_output(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - output = model(**inputs_dict) - - output, _ = torch.split(output, 3, dim=1) - - self.assertIsNotNone(output) - expected_shape = 
inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - def test_from_pretrained_hub(self): - model, loading_info = GlideTextToImageUNetModel.from_pretrained( - "fusing/unet-glide-text2im-dummy", output_loading_info=True - ) - self.assertIsNotNone(model) - # self.assertEqual(len(loading_info["missing_keys"]), 0) - - model.to(torch_device) - image = model(**self.dummy_input) - - assert image is not None, "Make sure output is not None" - - def test_output_pretrained(self): - model = GlideTextToImageUNetModel.from_pretrained("fusing/unet-glide-text2im-dummy") - - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - noise = torch.randn((1, model.config.in_channels, model.config.resolution, model.config.resolution)).to( - torch_device - ) - emb = torch.randn((1, 16, model.config.transformer_dim)).to(torch_device) - time_step = torch.tensor([10] * noise.shape[0], device=torch_device) - - model.to(torch_device) - with torch.no_grad(): - output = model(noise, time_step, emb) - - output, _ = torch.split(output, 3, dim=1) - output_slice = output[0, -1, -3:, -3:].cpu().flatten() - # fmt: off - expected_output_slice = torch.tensor([2.7766, -10.3558, -14.9149, -0.9376, -14.9175, -17.7679, -5.5565, -12.9521, -12.9845]) - # fmt: on - self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3)) +# TODO(Patrick) - Re-add this test after having correctly added the final VE checkpoints +# def test_output_pretrained(self): +# model = UNetUnconditionalModel.from_pretrained("fusing/ddpm_dummy_update", subfolder="unet") +# model.eval() +# +# torch.manual_seed(0) +# if torch.cuda.is_available(): +# torch.cuda.manual_seed_all(0) +# +# noise = torch.randn(1, model.config.in_channels, model.config.image_size, model.config.image_size) +# time_step = torch.tensor([10]) +# +# with torch.no_grad(): +# output = model(noise, time_step)["sample"] +# +# output_slice = output[0, -1, -3:, -3:].flatten() +# fmt: off +# expected_output_slice = torch.tensor([0.2891, -0.1899, 0.2595, -0.6214, 0.0968, -0.2622, 0.4688, 0.1311, 0.0053]) +# fmt: on +# self.assertTrue(torch.allclose(output_slice, expected_output_slice, rtol=1e-2)) class UNetLDMModelTests(ModelTesterMixin, unittest.TestCase): @@ -537,10 +335,10 @@ class UNetLDMModelTests(ModelTesterMixin, unittest.TestCase): def test_from_pretrained_hub(self): model, loading_info = UNetUnconditionalModel.from_pretrained( - "fusing/unet-ldm-dummy", output_loading_info=True, ldm=True + "fusing/unet-ldm-dummy-update", output_loading_info=True ) self.assertIsNotNone(model) - # self.assertEqual(len(loading_info["missing_keys"]), 0) + self.assertEqual(len(loading_info["missing_keys"]), 0) model.to(torch_device) image = model(**self.dummy_input)["sample"] @@ -548,7 +346,7 @@ class UNetLDMModelTests(ModelTesterMixin, unittest.TestCase): assert image is not None, "Make sure output is not None" def test_output_pretrained(self): - model = UNetUnconditionalModel.from_pretrained("fusing/unet-ldm-dummy", ldm=True) + model = UNetUnconditionalModel.from_pretrained("fusing/unet-ldm-dummy-update") model.eval() torch.manual_seed(0) @@ -568,27 +366,30 @@ class UNetLDMModelTests(ModelTesterMixin, unittest.TestCase): self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3)) - def test_output_pretrained_spatial_transformer(self): - model = UNetLDMModel.from_pretrained("fusing/unet-ldm-dummy-spatial") - model.eval() - torch.manual_seed(0) - if torch.cuda.is_available(): 
- torch.cuda.manual_seed_all(0) - - noise = torch.randn(1, model.config.in_channels, model.config.image_size, model.config.image_size) - context = torch.ones((1, 16, 64), dtype=torch.float32) - time_step = torch.tensor([10] * noise.shape[0]) - - with torch.no_grad(): - output = model(noise, time_step, context=context) - - output_slice = output[0, -1, -3:, -3:].flatten() - # fmt: off - expected_output_slice = torch.tensor([61.3445, 56.9005, 29.4339, 59.5497, 60.7375, 34.1719, 48.1951, 42.6569, 25.0890]) - # fmt: on - - self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3)) +# TODO(Patrick) - Re-add this test after having cleaned up LDM +# def test_output_pretrained_spatial_transformer(self): +# model = UNetLDMModel.from_pretrained("fusing/unet-ldm-dummy-spatial") +# model.eval() +# +# torch.manual_seed(0) +# if torch.cuda.is_available(): +# torch.cuda.manual_seed_all(0) +# +# noise = torch.randn(1, model.config.in_channels, model.config.image_size, model.config.image_size) +# context = torch.ones((1, 16, 64), dtype=torch.float32) +# time_step = torch.tensor([10] * noise.shape[0]) +# +# with torch.no_grad(): +# output = model(noise, time_step, context=context) +# +# output_slice = output[0, -1, -3:, -3:].flatten() +# fmt: off +# expected_output_slice = torch.tensor([61.3445, 56.9005, 29.4339, 59.5497, 60.7375, 34.1719, 48.1951, 42.6569, 25.0890]) +# fmt: on +# +# self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3)) +# class NCSNppModelTests(ModelTesterMixin, unittest.TestCase): @@ -641,44 +442,18 @@ class NCSNppModelTests(ModelTesterMixin, unittest.TestCase): def test_from_pretrained_hub(self): model, loading_info = UNetUnconditionalModel.from_pretrained( - "fusing/ncsnpp-ffhq-ve-dummy", sde=True, output_loading_info=True + "fusing/ncsnpp-ffhq-ve-dummy-update", output_loading_info=True ) self.assertIsNotNone(model) - # self.assertEqual(len(loading_info["missing_keys"]), 0) + self.assertEqual(len(loading_info["missing_keys"]), 0) model.to(torch_device) image = model(**self.dummy_input) assert image is not None, "Make sure output is not None" - def test_output_pretrained_ve_small(self): - model = NCSNpp.from_pretrained("fusing/ncsnpp-cifar10-ve-dummy") - model.eval() - model.to(torch_device) - - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - batch_size = 4 - num_channels = 3 - sizes = (32, 32) - - noise = torch.ones((batch_size, num_channels) + sizes).to(torch_device) - time_step = torch.tensor(batch_size * [1e-4]).to(torch_device) - - with torch.no_grad(): - output = model(noise, time_step) - - output_slice = output[0, -3:, -3:, -1].flatten().cpu() - # fmt: off - expected_output_slice = torch.tensor([0.1315, 0.0741, 0.0393, 0.0455, 0.0556, 0.0180, -0.0832, -0.0644, -0.0856]) - # fmt: on - - self.assertTrue(torch.allclose(output_slice, expected_output_slice, rtol=1e-2)) - def test_output_pretrained_ve_mid(self): - model = UNetUnconditionalModel.from_pretrained("fusing/celebahq_256-ncsnpp-ve", sde=True) + model = UNetUnconditionalModel.from_pretrained("google/ncsnpp-celebahq-256") model.to(torch_device) torch.manual_seed(0) @@ -703,7 +478,7 @@ class NCSNppModelTests(ModelTesterMixin, unittest.TestCase): self.assertTrue(torch.allclose(output_slice, expected_output_slice, rtol=1e-2)) def test_output_pretrained_ve_large(self): - model = UNetUnconditionalModel.from_pretrained("fusing/ncsnpp-ffhq-ve-dummy", sde=True) + model = UNetUnconditionalModel.from_pretrained("fusing/ncsnpp-ffhq-ve-dummy-update") 
model.to(torch_device) torch.manual_seed(0) @@ -727,31 +502,6 @@ class NCSNppModelTests(ModelTesterMixin, unittest.TestCase): self.assertTrue(torch.allclose(output_slice, expected_output_slice, rtol=1e-2)) - def test_output_pretrained_vp(self): - model = NCSNpp.from_pretrained("fusing/cifar10-ddpmpp-vp") - model.to(torch_device) - - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - batch_size = 4 - num_channels = 3 - sizes = (32, 32) - - noise = torch.randn((batch_size, num_channels) + sizes).to(torch_device) - time_step = torch.tensor(batch_size * [9.0]).to(torch_device) - - with torch.no_grad(): - output = model(noise, time_step) - - output_slice = output[0, -3:, -3:, -1].flatten().cpu() - # fmt: off - expected_output_slice = torch.tensor([0.3303, -0.2275, -2.8872, -0.1309, -1.2861, 3.4567, -1.0083, 2.5325, -1.3866]) - # fmt: on - - self.assertTrue(torch.allclose(output_slice, expected_output_slice, rtol=1e-2)) - class VQModelTests(ModelTesterMixin, unittest.TestCase): model_class = VQModel @@ -802,7 +552,7 @@ class VQModelTests(ModelTesterMixin, unittest.TestCase): def test_from_pretrained_hub(self): model, loading_info = VQModel.from_pretrained("fusing/vqgan-dummy", output_loading_info=True) self.assertIsNotNone(model) - # self.assertEqual(len(loading_info["missing_keys"]), 0) + self.assertEqual(len(loading_info["missing_keys"]), 0) model.to(torch_device) image = model(**self.dummy_input) @@ -873,7 +623,7 @@ class AutoencoderKLTests(ModelTesterMixin, unittest.TestCase): def test_from_pretrained_hub(self): model, loading_info = AutoencoderKL.from_pretrained("fusing/autoencoder-kl-dummy", output_loading_info=True) self.assertIsNotNone(model) - # self.assertEqual(len(loading_info["missing_keys"]), 0) + self.assertEqual(len(loading_info["missing_keys"]), 0) model.to(torch_device) image = model(**self.dummy_input) @@ -930,7 +680,7 @@ class PipelineTesterMixin(unittest.TestCase): @slow def test_from_pretrained_hub(self): - model_path = "google/ddpm-cifar10" + model_path = "google/ddpm-cifar10-32" ddpm = DDPMPipeline.from_pretrained(model_path) ddpm_from_hub = DiffusionPipeline.from_pretrained(model_path) @@ -948,7 +698,7 @@ class PipelineTesterMixin(unittest.TestCase): @slow def test_ddpm_cifar10(self): - model_id = "google/ddpm-cifar10" + model_id = "google/ddpm-cifar10-32" unet = UNetUnconditionalModel.from_pretrained(model_id) scheduler = DDPMScheduler.from_config(model_id) @@ -969,7 +719,7 @@ class PipelineTesterMixin(unittest.TestCase): @slow def test_ddim_lsun(self): - model_id = "google/ddpm-lsun-bedroom-ema" + model_id = "google/ddpm-ema-bedroom-256" unet = UNetUnconditionalModel.from_pretrained(model_id) scheduler = DDIMScheduler.from_config(model_id) @@ -989,7 +739,7 @@ class PipelineTesterMixin(unittest.TestCase): @slow def test_ddim_cifar10(self): - model_id = "google/ddpm-cifar10" + model_id = "google/ddpm-cifar10-32" unet = UNetUnconditionalModel.from_pretrained(model_id) scheduler = DDIMScheduler(tensor_format="pt") @@ -1009,7 +759,7 @@ class PipelineTesterMixin(unittest.TestCase): @slow def test_pndm_cifar10(self): - model_id = "google/ddpm-cifar10" + model_id = "google/ddpm-cifar10-32" unet = UNetUnconditionalModel.from_pretrained(model_id) scheduler = PNDMScheduler(tensor_format="pt") @@ -1028,7 +778,7 @@ class PipelineTesterMixin(unittest.TestCase): @slow def test_ldm_text2img(self): - ldm = LatentDiffusionPipeline.from_pretrained("CompVis/latent-diffusion-text2im-large") + ldm = 
LatentDiffusionPipeline.from_pretrained("CompVis/ldm-text2im-large-256") prompt = "A painting of a squirrel eating a burger" generator = torch.manual_seed(0) @@ -1042,7 +792,7 @@ class PipelineTesterMixin(unittest.TestCase): @slow def test_ldm_text2img_fast(self): - ldm = LatentDiffusionPipeline.from_pretrained("CompVis/latent-diffusion-text2im-large") + ldm = LatentDiffusionPipeline.from_pretrained("CompVis/ldm-text2im-large-256") prompt = "A painting of a squirrel eating a burger" generator = torch.manual_seed(0) @@ -1054,30 +804,15 @@ class PipelineTesterMixin(unittest.TestCase): expected_slice = torch.tensor([0.3163, 0.8670, 0.6465, 0.1865, 0.6291, 0.5139, 0.2824, 0.3723, 0.4344]) assert (image_slice.flatten() - expected_slice).abs().max() < 1e-2 - @slow - def test_glide_text2img(self): - model_id = "fusing/glide-base" - glide = GlidePipeline.from_pretrained(model_id) - - prompt = "a pencil sketch of a corgi" - generator = torch.manual_seed(0) - image = glide(prompt, generator=generator, num_inference_steps_upscale=20) - - image_slice = image[0, :3, :3, -1].cpu() - - assert image.shape == (1, 256, 256, 3) - expected_slice = torch.tensor([0.7119, 0.7073, 0.6460, 0.7780, 0.7423, 0.6926, 0.7378, 0.7189, 0.7784]) - assert (image_slice.flatten() - expected_slice).abs().max() < 1e-2 - @slow def test_score_sde_ve_pipeline(self): - model = UNetUnconditionalModel.from_pretrained("fusing/ffhq_ncsnpp", sde=True) + model = UNetUnconditionalModel.from_pretrained("google/ncsnpp-ffhq-1024") torch.manual_seed(0) if torch.cuda.is_available(): torch.cuda.manual_seed_all(0) - scheduler = ScoreSdeVeScheduler.from_config("fusing/ffhq_ncsnpp") + scheduler = ScoreSdeVeScheduler.from_config("google/ncsnpp-ffhq-1024") sde_ve = ScoreSdeVePipeline(model=model, scheduler=scheduler) @@ -1099,29 +834,9 @@ class PipelineTesterMixin(unittest.TestCase): assert (image.abs().sum() - expected_image_sum).abs().cpu().item() < 1e-2 assert (image.abs().mean() - expected_image_mean).abs().cpu().item() < 1e-4 - @slow - def test_score_sde_vp_pipeline(self): - model = NCSNpp.from_pretrained("fusing/cifar10-ddpmpp-vp") - scheduler = ScoreSdeVpScheduler.from_config("fusing/cifar10-ddpmpp-vp") - - sde_vp = ScoreSdeVpPipeline(model=model, scheduler=scheduler) - - torch.manual_seed(0) - image = sde_vp(num_inference_steps=10) - - expected_image_sum = 4183.2012 - expected_image_mean = 1.3617 - - # on m1 mbp - # expected_image_sum = 4318.6729 - # expected_image_mean = 1.4058 - - assert (image.abs().sum() - expected_image_sum).abs().cpu().item() < 1e-2 - assert (image.abs().mean() - expected_image_mean).abs().cpu().item() < 1e-4 - @slow def test_ldm_uncond(self): - ldm = LatentDiffusionUncondPipeline.from_pretrained("CompVis/latent-diffusion-celeba-256") + ldm = LatentDiffusionUncondPipeline.from_pretrained("CompVis/ldm-celebahq-256") generator = torch.manual_seed(0) image = ldm(generator=generator, num_inference_steps=5)["sample"]
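
For reference, below is a minimal sketch of how a checkpoint converted by the updated scripts is expected to be consumed, mirroring test_ddpm_cifar10 above. It assumes the diffusers API exactly as exercised in this patch (UNetUnconditionalModel.from_pretrained, DDPMScheduler.from_config, DDPMPipeline, and the ["sample"] output key); the local path "./converted" stands in for a hypothetical --dump_path produced by the conversion scripts and is not part of the patch.

import torch

from diffusers import DDPMPipeline, DDPMScheduler, UNetUnconditionalModel

# one of the renamed hub repos referenced in the updated tests
model_id = "google/ddpm-cifar10-32"

# load the UNet and scheduler separately, as the tests do ...
unet = UNetUnconditionalModel.from_pretrained(model_id)
scheduler = DDPMScheduler.from_config(model_id)
ddpm = DDPMPipeline(unet=unet, scheduler=scheduler)

# ... or load a locally converted pipeline in one call (hypothetical path):
# ddpm = DDPMPipeline.from_pretrained("./converted")

generator = torch.manual_seed(0)
image = ddpm(generator=generator)["sample"]
assert image is not None, "Make sure output is not None"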