From fa9e35fca4f32436f4c6bb890a1b3dfcefa465f7 Mon Sep 17 00:00:00 2001
From: Isamu Isozaki
Date: Thu, 4 May 2023 21:42:32 +0900
Subject: [PATCH] Added input perturbation (#3292)

* Added input perturbation

* Fixed spelling
---
 examples/text_to_image/train_text_to_image.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/examples/text_to_image/train_text_to_image.py b/examples/text_to_image/train_text_to_image.py
index 1d62cb7f81..f9592e5adc 100644
--- a/examples/text_to_image/train_text_to_image.py
+++ b/examples/text_to_image/train_text_to_image.py
@@ -112,6 +112,9 @@ def log_validation(vae, text_encoder, tokenizer, unet, args, accelerator, weight
 
 def parse_args():
     parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--input_pertubation", type=float, default=0, help="The scale of input perturbation. Recommended 0.1."
+    )
     parser.add_argument(
         "--pretrained_model_name_or_path",
         type=str,
@@ -801,7 +804,8 @@ def main():
                     noise += args.noise_offset * torch.randn(
                         (latents.shape[0], latents.shape[1], 1, 1), device=latents.device
                     )
-
+                if args.input_pertubation:
+                    new_noise = noise + args.input_pertubation * torch.randn_like(noise)
                 bsz = latents.shape[0]
                 # Sample a random timestep for each image
                 timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
@@ -809,7 +813,10 @@ def main():
 
                 # Add noise to the latents according to the noise magnitude at each timestep
                 # (this is the forward diffusion process)
-                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+                if args.input_pertubation:
+                    noisy_latents = noise_scheduler.add_noise(latents, new_noise, timesteps)
+                else:
+                    noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
 
                 # Get the text embedding for conditioning
                 encoder_hidden_states = text_encoder(batch["input_ids"])[0]
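
Note for reviewers: below is a minimal, self-contained sketch of what the new flag does, in isolation from the rest of the training script. The technique appears to follow the input-perturbation idea of "Input Perturbation Reduces Exposure Bias in Diffusion Models" (Ning et al., 2023). The sketch assumes diffusers' DDPMScheduler; the tiny Conv2d standing in for the UNet, the random "latents", and the `input_perturbation` variable are illustrative placeholders, not part of this patch. Only the lines marked "input perturbation" mirror the diff above.

import torch
import torch.nn.functional as F
from diffusers import DDPMScheduler

noise_scheduler = DDPMScheduler(num_train_timesteps=1000)
model = torch.nn.Conv2d(4, 4, kernel_size=3, padding=1)   # toy stand-in for the UNet
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

input_perturbation = 0.1  # plays the role of args.input_pertubation; 0 disables it

for _ in range(2):  # two dummy optimization steps
    latents = torch.randn(2, 4, 8, 8)   # stand-in for VAE-encoded image latents
    noise = torch.randn_like(latents)

    # input perturbation: add extra Gaussian noise to the noise that corrupts the latents
    new_noise = noise + input_perturbation * torch.randn_like(noise)

    bsz = latents.shape[0]
    timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,)).long()

    # input perturbation: the model input is built from the *perturbed* noise ...
    noisy_latents = noise_scheduler.add_noise(latents, new_noise, timesteps)

    # ... but the regression target stays the original, unperturbed noise, exactly as in
    # the training script, so the model learns to denoise slightly "off" inputs
    model_pred = model(noisy_latents)
    loss = F.mse_loss(model_pred.float(), noise.float(), reduction="mean")

    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

The important detail is the asymmetry: `new_noise` is only used to build `noisy_latents`, while the loss is still computed against `noise`. That mismatch between training input and target is what the paper uses to reduce exposure bias at sampling time.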