From b82be61c9e86751ddcb3a87c3b84c843320f5636 Mon Sep 17 00:00:00 2001
From: CatManJr <138437716+CatManJr@users.noreply.github.com>
Date: Thu, 28 Nov 2024 13:47:08 +0800
Subject: [PATCH 1/3] Update image_to_image.py

---
 .../stable_diffusion/scripts/image_to_image.py     | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/labml_nn/diffusion/stable_diffusion/scripts/image_to_image.py b/labml_nn/diffusion/stable_diffusion/scripts/image_to_image.py
index ef3aab4d..8138c778 100644
--- a/labml_nn/diffusion/stable_diffusion/scripts/image_to_image.py
+++ b/labml_nn/diffusion/stable_diffusion/scripts/image_to_image.py
@@ -12,6 +12,7 @@
 from pathlib import Path
 
 import torch
+from torchvision import transforms
 
 from labml import lab, monit
 from labml_nn.diffusion.stable_diffusion.sampler.ddim import DDIMSampler
@@ -65,8 +66,17 @@ def __call__(self, *,
         """
         # Make a batch of prompts
         prompts = batch_size * [prompt]
-        # Load image
-        orig_image = load_img(orig_img).to(self.device)
+        
+        # Load and resize the image to a multiple of 64
+        image = Image.open(orig_img)
+        width, height = image.size
+        new_width = width - width % 64
+        new_height = height - height % 64
+        image = image.resize((new_width, new_height), Image.LANCZOS)
+
+        # Convert the image to tensor and move to device
+        orig_image = transforms.ToTensor()(image).unsqueeze(0).to(self.device)
+                  
         # Encode the image in the latent space and make `batch_size` copies of it
         orig = self.model.autoencoder_encode(orig_image).repeat(batch_size, 1, 1, 1)
 

From d1a341b9cc4539f29b665f5ee3a696861763b2e1 Mon Sep 17 00:00:00 2001
From: CatManJr <138437716+CatManJr@users.noreply.github.com>
Date: Thu, 28 Nov 2024 13:52:55 +0800
Subject: [PATCH 2/3] Update image_to_image.py

---
 .../stable_diffusion/scripts/image_to_image.py    | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/labml_nn/diffusion/stable_diffusion/scripts/image_to_image.py b/labml_nn/diffusion/stable_diffusion/scripts/image_to_image.py
index 8138c778..6572f5b7 100644
--- a/labml_nn/diffusion/stable_diffusion/scripts/image_to_image.py
+++ b/labml_nn/diffusion/stable_diffusion/scripts/image_to_image.py
@@ -66,17 +66,10 @@ def __call__(self, *,
         """
         # Make a batch of prompts
         prompts = batch_size * [prompt]
-        
-        # Load and resize the image to a multiple of 64
-        image = Image.open(orig_img)
-        width, height = image.size
-        new_width = width - width % 64
-        new_height = height - height % 64
-        image = image.resize((new_width, new_height), Image.LANCZOS)
-
-        # Convert the image to tensor and move to device
-        orig_image = transforms.ToTensor()(image).unsqueeze(0).to(self.device)
-                  
+        # Load image
+        orig_image = load_img(orig_img).to(self.device)
+        # Encode the image in the latent space and make `batch_size` copies of it
+        orig = self.model.autoencoder_encode(orig_image).repeat(batch_size, 1, 1, 1)
         # Encode the image in the latent space and make `batch_size` copies of it
         orig = self.model.autoencoder_encode(orig_image).repeat(batch_size, 1, 1, 1)
 

From 46e4c3e64fcc59d6604f6cd073cfdf3f24e61baa Mon Sep 17 00:00:00 2001
From: CatManJr <138437716+CatManJr@users.noreply.github.com>
Date: Thu, 28 Nov 2024 14:00:13 +0800
Subject: [PATCH 3/3] Update util.py in stable diffusion to avoid Sizes of
 tensors error

In my experiments, I found that when I load an image directly, both in_paint.py and image_to_image.py fail with the following error:

File "/root/autodl-tmp/SD/model/unet.py", line 178, in forward
    x = torch.cat([x, x_input_block.pop()], dim=1)
RuntimeError: Sizes of tensors must match except in dimension 1. Expected size 34 but got size 33 for tensor number 1 in the list.

This error can be avoided by modifying the load_img function in util.py so that the image is resized down to a multiple of 64 instead of a multiple of 32. This issue might be caused by the sd-v1-4.ckpt checkpoint.

I apologize for the two earlier modifications to image_to_image.py, which I mistakenly made on the master branch. Please disregard those changes; this pull request contains the actual fix.
---
 labml_nn/diffusion/stable_diffusion/util.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/labml_nn/diffusion/stable_diffusion/util.py b/labml_nn/diffusion/stable_diffusion/util.py
index fe1c2f09..6e7e03f8 100644
--- a/labml_nn/diffusion/stable_diffusion/util.py
+++ b/labml_nn/diffusion/stable_diffusion/util.py
@@ -115,9 +115,9 @@ def load_img(path: str):
     image = Image.open(path).convert("RGB")
     # Get image size
     w, h = image.size
-    # Resize to a multiple of 32
-    w = w - w % 32
-    h = h - h % 32
+    # Resize to a multiple of 64
+    w = w - w % 64
+    h = h - h % 64
     image = image.resize((w, h), resample=PIL.Image.LANCZOS)
     # Convert to numpy and map to `[-1, 1]` for `[0, 255]`
     image = np.array(image).astype(np.float32) * (2. / 255.0) - 1