diff --git a/labml_nn/diffusion/stable_diffusion/scripts/image_to_image.py b/labml_nn/diffusion/stable_diffusion/scripts/image_to_image.py
index ef3aab4d..6572f5b7 100644
--- a/labml_nn/diffusion/stable_diffusion/scripts/image_to_image.py
+++ b/labml_nn/diffusion/stable_diffusion/scripts/image_to_image.py
@@ -12,6 +12,7 @@
 from pathlib import Path
 
 import torch
+from torchvision import transforms
 
 from labml import lab, monit
 from labml_nn.diffusion.stable_diffusion.sampler.ddim import DDIMSampler
diff --git a/labml_nn/diffusion/stable_diffusion/util.py b/labml_nn/diffusion/stable_diffusion/util.py
index fe1c2f09..6e7e03f8 100644
--- a/labml_nn/diffusion/stable_diffusion/util.py
+++ b/labml_nn/diffusion/stable_diffusion/util.py
@@ -115,9 +115,9 @@ def load_img(path: str):
     image = Image.open(path).convert("RGB")
     # Get image size
     w, h = image.size
-    # Resize to a multiple of 32
-    w = w - w % 32
-    h = h - h % 32
+    # Resize to a multiple of 64
+    w = w - w % 64
+    h = h - h % 64
     image = image.resize((w, h), resample=PIL.Image.LANCZOS)
     # Convert to numpy and map to `[-1, 1]` for `[0, 255]`
     image = np.array(image).astype(np.float32) * (2. / 255.0) - 1
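
Note for reviewers: below is a minimal standalone sketch of `load_img` after this change, to show the effect of snapping both sides to a multiple of 64. The lines up to the normalisation mirror the hunk above; the final batch/transpose step is an assumption, since the diff truncates the function there, and the reasoning in the comments (the autoencoder's factor-8 downsampling plus the U-Net's further downsampling of the latents) is the usual motivation for a multiple-of-64 constraint rather than something stated in this patch.

import numpy as np
import PIL
import torch
from PIL import Image

def load_img_sketch(path: str) -> torch.Tensor:
    # Open the image and drop any alpha channel
    image = Image.open(path).convert("RGB")
    w, h = image.size
    # Snap both sides down to the nearest multiple of 64,
    # e.g. a 1000x700 input becomes 960x640 (the old multiple-of-32
    # code gave 992x672). The autoencoder downsamples by a factor of 8
    # and the U-Net halves the latents a few more times, so multiples
    # of 64 keep every intermediate feature map at an integer size.
    w = w - w % 64
    h = h - h % 64
    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
    # Map pixel values from `[0, 255]` to `[-1, 1]`
    image = np.array(image).astype(np.float32) * (2. / 255.0) - 1
    # Assumed ending (truncated in the hunk above): add a batch dimension
    # and reorder to `[batch_size, channels, height, width]`
    return torch.from_numpy(image[None].transpose(0, 3, 1, 2))

A quick check, assuming an image at the hypothetical path `inputs/demo.png`: `load_img_sketch('inputs/demo.png').shape` should come out as `(1, 3, h, w)` with `h` and `w` both divisible by 64, which is what keeps the `repeat`-ed latents in `image_to_image.py` aligned with the U-Net.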