From b82be61c9e86751ddcb3a87c3b84c843320f5636 Mon Sep 17 00:00:00 2001 From: CatManJr <138437716+CatManJr@users.noreply.github.com> Date: Thu, 28 Nov 2024 13:47:08 +0800 Subject: [PATCH 1/3] Update image_to_image.py --- .../stable_diffusion/scripts/image_to_image.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/labml_nn/diffusion/stable_diffusion/scripts/image_to_image.py b/labml_nn/diffusion/stable_diffusion/scripts/image_to_image.py index ef3aab4d..8138c778 100644 --- a/labml_nn/diffusion/stable_diffusion/scripts/image_to_image.py +++ b/labml_nn/diffusion/stable_diffusion/scripts/image_to_image.py @@ -12,6 +12,7 @@ from pathlib import Path import torch +from torchvision import transforms from labml import lab, monit from labml_nn.diffusion.stable_diffusion.sampler.ddim import DDIMSampler @@ -65,8 +66,17 @@ def __call__(self, *, """ # Make a batch of prompts prompts = batch_size * [prompt] - # Load image - orig_image = load_img(orig_img).to(self.device) + + # Load and resize the image to a multiple of 64 + image = Image.open(orig_img) + width, height = image.size + new_width = width - width % 64 + new_height = height - height % 64 + image = image.resize((new_width, new_height), Image.LANCZOS) + + # Convert the image to tensor and move to device + orig_image = transforms.ToTensor()(image).unsqueeze(0).to(self.device) + # Encode the image in the latent space and make `batch_size` copies of it orig = self.model.autoencoder_encode(orig_image).repeat(batch_size, 1, 1, 1) From d1a341b9cc4539f29b665f5ee3a696861763b2e1 Mon Sep 17 00:00:00 2001 From: CatManJr <138437716+CatManJr@users.noreply.github.com> Date: Thu, 28 Nov 2024 13:52:55 +0800 Subject: [PATCH 2/3] Update image_to_image.py --- .../stable_diffusion/scripts/image_to_image.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/labml_nn/diffusion/stable_diffusion/scripts/image_to_image.py 
b/labml_nn/diffusion/stable_diffusion/scripts/image_to_image.py index 8138c778..6572f5b7 100644 --- a/labml_nn/diffusion/stable_diffusion/scripts/image_to_image.py +++ b/labml_nn/diffusion/stable_diffusion/scripts/image_to_image.py @@ -66,17 +66,10 @@ def __call__(self, *, """ # Make a batch of prompts prompts = batch_size * [prompt] - - # Load and resize the image to a multiple of 64 - image = Image.open(orig_img) - width, height = image.size - new_width = width - width % 64 - new_height = height - height % 64 - image = image.resize((new_width, new_height), Image.LANCZOS) - - # Convert the image to tensor and move to device - orig_image = transforms.ToTensor()(image).unsqueeze(0).to(self.device) - + # Load image + orig_image = load_img(orig_img).to(self.device) + # Encode the image in the latent space and make `batch_size` copies of it + orig = self.model.autoencoder_encode(orig_image).repeat(batch_size, 1, 1, 1) # Encode the image in the latent space and make `batch_size` copies of it orig = self.model.autoencoder_encode(orig_image).repeat(batch_size, 1, 1, 1) From 46e4c3e64fcc59d6604f6cd073cfdf3f24e61baa Mon Sep 17 00:00:00 2001 From: CatManJr <138437716+CatManJr@users.noreply.github.com> Date: Thu, 28 Nov 2024 14:00:13 +0800 Subject: [PATCH 3/3] Update util.py in stable diffusion to avoid Sizes of tensors error In my experiments, I found that when I directly load an image, both in_paint.py and image_to_image.py encounter an error: File "/root/autodl-tmp/SD/model/unet.py", line 178, in forward x = torch.cat([x, x_input_block.pop()], dim=1) RuntimeError: Sizes of tensors must match except in dimension 1. Expected size 34 but got size 33 for tensor number 1 in the list. This error can be avoided by modifying the load_img function in util.py, specifically by changing the original multiple of 32 to a multiple of 64. This issue might be caused by the sd-v1-4.ckpt checkpoint. I apologize if I mistakenly made two modifications on the master branch. 
Please disregard those changes; this pull request contains the actual fix. --- labml_nn/diffusion/stable_diffusion/util.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/labml_nn/diffusion/stable_diffusion/util.py b/labml_nn/diffusion/stable_diffusion/util.py index fe1c2f09..6e7e03f8 100644 --- a/labml_nn/diffusion/stable_diffusion/util.py +++ b/labml_nn/diffusion/stable_diffusion/util.py @@ -115,9 +115,9 @@ def load_img(path: str): image = Image.open(path).convert("RGB") # Get image size w, h = image.size - # Resize to a multiple of 32 - w = w - w % 32 - h = h - h % 32 + # Resize to a multiple of 64 + w = w - w % 64 + h = h - h % 64 image = image.resize((w, h), resample=PIL.Image.LANCZOS) # Convert to numpy and map to `[-1, 1]` for `[0, 255]` image = np.array(image).astype(np.float32) * (2. / 255.0) - 1