
Use Spandrel for upscaling and face restoration architectures (aside from GFPGAN and LDSR)

Aarni Koskela 1 year ago
commit b0f5934234
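
For context: this commit deletes the bundled per-architecture network code and routes model loading through the Spandrel library, which identifies the architecture from a checkpoint's state dict. Only the call sites of modules.modelloader.load_spandrel_model appear in the hunks below; the helper itself is not part of this diff. A minimal sketch of what such a helper could look like, assuming Spandrel's documented ModelLoader API (example only, not the actual implementation):

# Example sketch only; the real modules.modelloader.load_spandrel_model is not shown in this diff.
from spandrel import ModelLoader


def load_spandrel_model(path, *, device, dtype=None):
    # Spandrel inspects the checkpoint and instantiates the matching
    # architecture (SCUNet, SwinIR, Swin2SR, ...), so no per-extension
    # architecture code is needed.
    descriptor = ModelLoader().load_from_file(path)
    descriptor.to(device)
    descriptor.eval()
    if dtype is not None:
        descriptor.model.to(dtype=dtype)  # assumption: cast the wrapped nn.Module
    return descriptor  # an image model descriptor; calling it runs inference

With a helper along these lines, the ScuNET and SwinIR loaders below collapse to a single call each, as the hunks show.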

+ 2 - 11
extensions-builtin/ScuNET/scripts/scunet_model.py

@@ -7,9 +7,7 @@ from tqdm import tqdm
 
 import modules.upscaler
 from modules import devices, modelloader, script_callbacks, errors
-from scunet_model_arch import SCUNet
 
-from modules.modelloader import load_file_from_url
 from modules.shared import opts
 
 
@@ -120,17 +118,10 @@ class UpscalerScuNET(modules.upscaler.Upscaler):
         device = devices.get_device_for('scunet')
         if path.startswith("http"):
             # TODO: this doesn't use `path` at all?
-            filename = load_file_from_url(self.model_url, model_dir=self.model_download_path, file_name=f"{self.name}.pth")
+            filename = modelloader.load_file_from_url(self.model_url, model_dir=self.model_download_path, file_name=f"{self.name}.pth")
         else:
             filename = path
-        model = SCUNet(in_nc=3, config=[4, 4, 4, 4, 4, 4, 4], dim=64)
-        model.load_state_dict(torch.load(filename), strict=True)
-        model.eval()
-        for _, v in model.named_parameters():
-            v.requires_grad = False
-        model = model.to(device)
-
-        return model
+        return modelloader.load_spandrel_model(filename, device=device)
 
 
 def on_ui_settings():

+ 0 - 268
extensions-builtin/ScuNET/scunet_model_arch.py

@@ -1,268 +0,0 @@
-# -*- coding: utf-8 -*-
-import numpy as np
-import torch
-import torch.nn as nn
-from einops import rearrange
-from einops.layers.torch import Rearrange
-from timm.models.layers import trunc_normal_, DropPath
-
-
-class WMSA(nn.Module):
-    """ Self-attention module in Swin Transformer
-    """
-
-    def __init__(self, input_dim, output_dim, head_dim, window_size, type):
-        super(WMSA, self).__init__()
-        self.input_dim = input_dim
-        self.output_dim = output_dim
-        self.head_dim = head_dim
-        self.scale = self.head_dim ** -0.5
-        self.n_heads = input_dim // head_dim
-        self.window_size = window_size
-        self.type = type
-        self.embedding_layer = nn.Linear(self.input_dim, 3 * self.input_dim, bias=True)
-
-        self.relative_position_params = nn.Parameter(
-            torch.zeros((2 * window_size - 1) * (2 * window_size - 1), self.n_heads))
-
-        self.linear = nn.Linear(self.input_dim, self.output_dim)
-
-        trunc_normal_(self.relative_position_params, std=.02)
-        self.relative_position_params = torch.nn.Parameter(
-            self.relative_position_params.view(2 * window_size - 1, 2 * window_size - 1, self.n_heads).transpose(1,
-                                                                                                                 2).transpose(
-                0, 1))
-
-    def generate_mask(self, h, w, p, shift):
-        """ generating the mask of SW-MSA
-        Args:
-            shift: shift parameters in CyclicShift.
-        Returns:
-            attn_mask: should be (1 1 w p p),
-        """
-        # supporting square.
-        attn_mask = torch.zeros(h, w, p, p, p, p, dtype=torch.bool, device=self.relative_position_params.device)
-        if self.type == 'W':
-            return attn_mask
-
-        s = p - shift
-        attn_mask[-1, :, :s, :, s:, :] = True
-        attn_mask[-1, :, s:, :, :s, :] = True
-        attn_mask[:, -1, :, :s, :, s:] = True
-        attn_mask[:, -1, :, s:, :, :s] = True
-        attn_mask = rearrange(attn_mask, 'w1 w2 p1 p2 p3 p4 -> 1 1 (w1 w2) (p1 p2) (p3 p4)')
-        return attn_mask
-
-    def forward(self, x):
-        """ Forward pass of Window Multi-head Self-attention module.
-        Args:
-            x: input tensor with shape of [b h w c];
-            attn_mask: attention mask, fill -inf where the value is True;
-        Returns:
-            output: tensor shape [b h w c]
-        """
-        if self.type != 'W':
-            x = torch.roll(x, shifts=(-(self.window_size // 2), -(self.window_size // 2)), dims=(1, 2))
-
-        x = rearrange(x, 'b (w1 p1) (w2 p2) c -> b w1 w2 p1 p2 c', p1=self.window_size, p2=self.window_size)
-        h_windows = x.size(1)
-        w_windows = x.size(2)
-        # square validation
-        # assert h_windows == w_windows
-
-        x = rearrange(x, 'b w1 w2 p1 p2 c -> b (w1 w2) (p1 p2) c', p1=self.window_size, p2=self.window_size)
-        qkv = self.embedding_layer(x)
-        q, k, v = rearrange(qkv, 'b nw np (threeh c) -> threeh b nw np c', c=self.head_dim).chunk(3, dim=0)
-        sim = torch.einsum('hbwpc,hbwqc->hbwpq', q, k) * self.scale
-        # Adding learnable relative embedding
-        sim = sim + rearrange(self.relative_embedding(), 'h p q -> h 1 1 p q')
-        # Using Attn Mask to distinguish different subwindows.
-        if self.type != 'W':
-            attn_mask = self.generate_mask(h_windows, w_windows, self.window_size, shift=self.window_size // 2)
-            sim = sim.masked_fill_(attn_mask, float("-inf"))
-
-        probs = nn.functional.softmax(sim, dim=-1)
-        output = torch.einsum('hbwij,hbwjc->hbwic', probs, v)
-        output = rearrange(output, 'h b w p c -> b w p (h c)')
-        output = self.linear(output)
-        output = rearrange(output, 'b (w1 w2) (p1 p2) c -> b (w1 p1) (w2 p2) c', w1=h_windows, p1=self.window_size)
-
-        if self.type != 'W':
-            output = torch.roll(output, shifts=(self.window_size // 2, self.window_size // 2), dims=(1, 2))
-
-        return output
-
-    def relative_embedding(self):
-        cord = torch.tensor(np.array([[i, j] for i in range(self.window_size) for j in range(self.window_size)]))
-        relation = cord[:, None, :] - cord[None, :, :] + self.window_size - 1
-        # negative is allowed
-        return self.relative_position_params[:, relation[:, :, 0].long(), relation[:, :, 1].long()]
-
-
-class Block(nn.Module):
-    def __init__(self, input_dim, output_dim, head_dim, window_size, drop_path, type='W', input_resolution=None):
-        """ SwinTransformer Block
-        """
-        super(Block, self).__init__()
-        self.input_dim = input_dim
-        self.output_dim = output_dim
-        assert type in ['W', 'SW']
-        self.type = type
-        if input_resolution <= window_size:
-            self.type = 'W'
-
-        self.ln1 = nn.LayerNorm(input_dim)
-        self.msa = WMSA(input_dim, input_dim, head_dim, window_size, self.type)
-        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
-        self.ln2 = nn.LayerNorm(input_dim)
-        self.mlp = nn.Sequential(
-            nn.Linear(input_dim, 4 * input_dim),
-            nn.GELU(),
-            nn.Linear(4 * input_dim, output_dim),
-        )
-
-    def forward(self, x):
-        x = x + self.drop_path(self.msa(self.ln1(x)))
-        x = x + self.drop_path(self.mlp(self.ln2(x)))
-        return x
-
-
-class ConvTransBlock(nn.Module):
-    def __init__(self, conv_dim, trans_dim, head_dim, window_size, drop_path, type='W', input_resolution=None):
-        """ SwinTransformer and Conv Block
-        """
-        super(ConvTransBlock, self).__init__()
-        self.conv_dim = conv_dim
-        self.trans_dim = trans_dim
-        self.head_dim = head_dim
-        self.window_size = window_size
-        self.drop_path = drop_path
-        self.type = type
-        self.input_resolution = input_resolution
-
-        assert self.type in ['W', 'SW']
-        if self.input_resolution <= self.window_size:
-            self.type = 'W'
-
-        self.trans_block = Block(self.trans_dim, self.trans_dim, self.head_dim, self.window_size, self.drop_path,
-                                 self.type, self.input_resolution)
-        self.conv1_1 = nn.Conv2d(self.conv_dim + self.trans_dim, self.conv_dim + self.trans_dim, 1, 1, 0, bias=True)
-        self.conv1_2 = nn.Conv2d(self.conv_dim + self.trans_dim, self.conv_dim + self.trans_dim, 1, 1, 0, bias=True)
-
-        self.conv_block = nn.Sequential(
-            nn.Conv2d(self.conv_dim, self.conv_dim, 3, 1, 1, bias=False),
-            nn.ReLU(True),
-            nn.Conv2d(self.conv_dim, self.conv_dim, 3, 1, 1, bias=False)
-        )
-
-    def forward(self, x):
-        conv_x, trans_x = torch.split(self.conv1_1(x), (self.conv_dim, self.trans_dim), dim=1)
-        conv_x = self.conv_block(conv_x) + conv_x
-        trans_x = Rearrange('b c h w -> b h w c')(trans_x)
-        trans_x = self.trans_block(trans_x)
-        trans_x = Rearrange('b h w c -> b c h w')(trans_x)
-        res = self.conv1_2(torch.cat((conv_x, trans_x), dim=1))
-        x = x + res
-
-        return x
-
-
-class SCUNet(nn.Module):
-    # def __init__(self, in_nc=3, config=[2, 2, 2, 2, 2, 2, 2], dim=64, drop_path_rate=0.0, input_resolution=256):
-    def __init__(self, in_nc=3, config=None, dim=64, drop_path_rate=0.0, input_resolution=256):
-        super(SCUNet, self).__init__()
-        if config is None:
-            config = [2, 2, 2, 2, 2, 2, 2]
-        self.config = config
-        self.dim = dim
-        self.head_dim = 32
-        self.window_size = 8
-
-        # drop path rate for each layer
-        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(config))]
-
-        self.m_head = [nn.Conv2d(in_nc, dim, 3, 1, 1, bias=False)]
-
-        begin = 0
-        self.m_down1 = [ConvTransBlock(dim // 2, dim // 2, self.head_dim, self.window_size, dpr[i + begin],
-                                       'W' if not i % 2 else 'SW', input_resolution)
-                        for i in range(config[0])] + \
-                       [nn.Conv2d(dim, 2 * dim, 2, 2, 0, bias=False)]
-
-        begin += config[0]
-        self.m_down2 = [ConvTransBlock(dim, dim, self.head_dim, self.window_size, dpr[i + begin],
-                                       'W' if not i % 2 else 'SW', input_resolution // 2)
-                        for i in range(config[1])] + \
-                       [nn.Conv2d(2 * dim, 4 * dim, 2, 2, 0, bias=False)]
-
-        begin += config[1]
-        self.m_down3 = [ConvTransBlock(2 * dim, 2 * dim, self.head_dim, self.window_size, dpr[i + begin],
-                                       'W' if not i % 2 else 'SW', input_resolution // 4)
-                        for i in range(config[2])] + \
-                       [nn.Conv2d(4 * dim, 8 * dim, 2, 2, 0, bias=False)]
-
-        begin += config[2]
-        self.m_body = [ConvTransBlock(4 * dim, 4 * dim, self.head_dim, self.window_size, dpr[i + begin],
-                                      'W' if not i % 2 else 'SW', input_resolution // 8)
-                       for i in range(config[3])]
-
-        begin += config[3]
-        self.m_up3 = [nn.ConvTranspose2d(8 * dim, 4 * dim, 2, 2, 0, bias=False), ] + \
-                     [ConvTransBlock(2 * dim, 2 * dim, self.head_dim, self.window_size, dpr[i + begin],
-                                     'W' if not i % 2 else 'SW', input_resolution // 4)
-                      for i in range(config[4])]
-
-        begin += config[4]
-        self.m_up2 = [nn.ConvTranspose2d(4 * dim, 2 * dim, 2, 2, 0, bias=False), ] + \
-                     [ConvTransBlock(dim, dim, self.head_dim, self.window_size, dpr[i + begin],
-                                     'W' if not i % 2 else 'SW', input_resolution // 2)
-                      for i in range(config[5])]
-
-        begin += config[5]
-        self.m_up1 = [nn.ConvTranspose2d(2 * dim, dim, 2, 2, 0, bias=False), ] + \
-                     [ConvTransBlock(dim // 2, dim // 2, self.head_dim, self.window_size, dpr[i + begin],
-                                     'W' if not i % 2 else 'SW', input_resolution)
-                      for i in range(config[6])]
-
-        self.m_tail = [nn.Conv2d(dim, in_nc, 3, 1, 1, bias=False)]
-
-        self.m_head = nn.Sequential(*self.m_head)
-        self.m_down1 = nn.Sequential(*self.m_down1)
-        self.m_down2 = nn.Sequential(*self.m_down2)
-        self.m_down3 = nn.Sequential(*self.m_down3)
-        self.m_body = nn.Sequential(*self.m_body)
-        self.m_up3 = nn.Sequential(*self.m_up3)
-        self.m_up2 = nn.Sequential(*self.m_up2)
-        self.m_up1 = nn.Sequential(*self.m_up1)
-        self.m_tail = nn.Sequential(*self.m_tail)
-        # self.apply(self._init_weights)
-
-    def forward(self, x0):
-
-        h, w = x0.size()[-2:]
-        paddingBottom = int(np.ceil(h / 64) * 64 - h)
-        paddingRight = int(np.ceil(w / 64) * 64 - w)
-        x0 = nn.ReplicationPad2d((0, paddingRight, 0, paddingBottom))(x0)
-
-        x1 = self.m_head(x0)
-        x2 = self.m_down1(x1)
-        x3 = self.m_down2(x2)
-        x4 = self.m_down3(x3)
-        x = self.m_body(x4)
-        x = self.m_up3(x + x4)
-        x = self.m_up2(x + x3)
-        x = self.m_up1(x + x2)
-        x = self.m_tail(x + x1)
-
-        x = x[..., :h, :w]
-
-        return x
-
-    def _init_weights(self, m):
-        if isinstance(m, nn.Linear):
-            trunc_normal_(m.weight, std=.02)
-            if m.bias is not None:
-                nn.init.constant_(m.bias, 0)
-        elif isinstance(m, nn.LayerNorm):
-            nn.init.constant_(m.bias, 0)
-            nn.init.constant_(m.weight, 1.0)

+ 60 - 66
extensions-builtin/SwinIR/scripts/swinir_model.py

@@ -1,5 +1,5 @@
+import logging
 import sys
-import platform
 
 import numpy as np
 import torch
@@ -8,13 +8,11 @@ from tqdm import tqdm
 
 from modules import modelloader, devices, script_callbacks, shared
 from modules.shared import opts, state
-from swinir_model_arch import SwinIR
-from swinir_model_arch_v2 import Swin2SR
 from modules.upscaler import Upscaler, UpscalerData
 
 SWINIR_MODEL_URL = "https://github.com/JingyunLiang/SwinIR/releases/download/v0.0/003_realSR_BSRGAN_DFOWMFC_s64w8_SwinIR-L_x4_GAN.pth"
 
-device_swinir = devices.get_device_for('swinir')
+logger = logging.getLogger(__name__)
 
 
 class UpscalerSwinIR(Upscaler):
@@ -37,26 +35,29 @@ class UpscalerSwinIR(Upscaler):
             scalers.append(model_data)
         self.scalers = scalers
 
-    def do_upscale(self, img, model_file):
-        use_compile = hasattr(opts, 'SWIN_torch_compile') and opts.SWIN_torch_compile \
-            and int(torch.__version__.split('.')[0]) >= 2 and platform.system() != "Windows"
+    def do_upscale(self, img: Image.Image, model_file: str) -> Image.Image:
         current_config = (model_file, opts.SWIN_tile)
 
-        if use_compile and self._cached_model_config == current_config:
+        device = self._get_device()
+
+        if self._cached_model_config == current_config:
             model = self._cached_model
         else:
-            self._cached_model = None
             try:
                 model = self.load_model(model_file)
             except Exception as e:
                 print(f"Failed loading SwinIR model {model_file}: {e}", file=sys.stderr)
                 return img
-            model = model.to(device_swinir, dtype=devices.dtype)
-            if use_compile:
-                model = torch.compile(model)
-                self._cached_model = model
-                self._cached_model_config = current_config
-        img = upscale(img, model)
+            self._cached_model = model
+            self._cached_model_config = current_config
+
+        img = upscale(
+            img,
+            model,
+            tile=opts.SWIN_tile,
+            tile_overlap=opts.SWIN_tile_overlap,
+            device=device,
+        )
         devices.torch_gc()
         return img
 
@@ -69,69 +70,54 @@ class UpscalerSwinIR(Upscaler):
             )
         else:
             filename = path
-        if filename.endswith(".v2.pth"):
-            model = Swin2SR(
-                upscale=scale,
-                in_chans=3,
-                img_size=64,
-                window_size=8,
-                img_range=1.0,
-                depths=[6, 6, 6, 6, 6, 6],
-                embed_dim=180,
-                num_heads=[6, 6, 6, 6, 6, 6],
-                mlp_ratio=2,
-                upsampler="nearest+conv",
-                resi_connection="1conv",
-            )
-            params = None
-        else:
-            model = SwinIR(
-                upscale=scale,
-                in_chans=3,
-                img_size=64,
-                window_size=8,
-                img_range=1.0,
-                depths=[6, 6, 6, 6, 6, 6, 6, 6, 6],
-                embed_dim=240,
-                num_heads=[8, 8, 8, 8, 8, 8, 8, 8, 8],
-                mlp_ratio=2,
-                upsampler="nearest+conv",
-                resi_connection="3conv",
-            )
-            params = "params_ema"
 
-        pretrained_model = torch.load(filename)
-        if params is not None:
-            model.load_state_dict(pretrained_model[params], strict=True)
-        else:
-            model.load_state_dict(pretrained_model, strict=True)
+        model = modelloader.load_spandrel_model(
+            filename,
+            device=self._get_device(),
+            dtype=devices.dtype,
+        )
+        if getattr(opts, 'SWIN_torch_compile', False):
+            try:
+                model = torch.compile(model)
+            except Exception:
+                logger.warning("Failed to compile SwinIR model, fallback to JIT", exc_info=True)
         return model
 
+    def _get_device(self):
+        return devices.get_device_for('swinir')
+
 
 def upscale(
-        img,
-        model,
-        tile=None,
-        tile_overlap=None,
-        window_size=8,
-        scale=4,
+    img,
+    model,
+    *,
+    tile: int,
+    tile_overlap: int,
+    window_size=8,
+    scale=4,
+    device,
 ):
-    tile = tile or opts.SWIN_tile
-    tile_overlap = tile_overlap or opts.SWIN_tile_overlap
-
 
     img = np.array(img)
     img = img[:, :, ::-1]
     img = np.moveaxis(img, 2, 0) / 255
     img = torch.from_numpy(img).float()
-    img = img.unsqueeze(0).to(device_swinir, dtype=devices.dtype)
+    img = img.unsqueeze(0).to(device, dtype=devices.dtype)
     with torch.no_grad(), devices.autocast():
         _, _, h_old, w_old = img.size()
         h_pad = (h_old // window_size + 1) * window_size - h_old
         w_pad = (w_old // window_size + 1) * window_size - w_old
         img = torch.cat([img, torch.flip(img, [2])], 2)[:, :, : h_old + h_pad, :]
         img = torch.cat([img, torch.flip(img, [3])], 3)[:, :, :, : w_old + w_pad]
-        output = inference(img, model, tile, tile_overlap, window_size, scale)
+        output = inference(
+            img,
+            model,
+            tile=tile,
+            tile_overlap=tile_overlap,
+            window_size=window_size,
+            scale=scale,
+            device=device,
+        )
         output = output[..., : h_old * scale, : w_old * scale]
         output = output.data.squeeze().float().cpu().clamp_(0, 1).numpy()
         if output.ndim == 3:
@@ -142,7 +128,16 @@ def upscale(
         return Image.fromarray(output, "RGB")
 
 
-def inference(img, model, tile, tile_overlap, window_size, scale):
+def inference(
+    img,
+    model,
+    *,
+    tile: int,
+    tile_overlap: int,
+    window_size: int,
+    scale: int,
+    device,
+):
     # test the image tile by tile
     b, c, h, w = img.size()
     tile = min(tile, h, w)
@@ -152,8 +147,8 @@ def inference(img, model, tile, tile_overlap, window_size, scale):
     stride = tile - tile_overlap
     h_idx_list = list(range(0, h - tile, stride)) + [h - tile]
     w_idx_list = list(range(0, w - tile, stride)) + [w - tile]
-    E = torch.zeros(b, c, h * sf, w * sf, dtype=devices.dtype, device=device_swinir).type_as(img)
-    W = torch.zeros_like(E, dtype=devices.dtype, device=device_swinir)
+    E = torch.zeros(b, c, h * sf, w * sf, dtype=devices.dtype, device=device).type_as(img)
+    W = torch.zeros_like(E, dtype=devices.dtype, device=device)
 
     with tqdm(total=len(h_idx_list) * len(w_idx_list), desc="SwinIR tiles") as pbar:
         for h_idx in h_idx_list:
@@ -185,8 +180,7 @@ def on_ui_settings():
 
     shared.opts.add_option("SWIN_tile", shared.OptionInfo(192, "Tile size for all SwinIR.", gr.Slider, {"minimum": 16, "maximum": 512, "step": 16}, section=('upscaling', "Upscaling")))
     shared.opts.add_option("SWIN_tile_overlap", shared.OptionInfo(8, "Tile overlap, in pixels for SwinIR. Low values = visible seam.", gr.Slider, {"minimum": 0, "maximum": 48, "step": 1}, section=('upscaling', "Upscaling")))
-    if int(torch.__version__.split('.')[0]) >= 2 and platform.system() != "Windows":    # torch.compile() require pytorch 2.0 or above, and not on Windows
-        shared.opts.add_option("SWIN_torch_compile", shared.OptionInfo(False, "Use torch.compile to accelerate SwinIR.", gr.Checkbox, {"interactive": True}, section=('upscaling', "Upscaling")).info("Takes longer on first run"))
+    shared.opts.add_option("SWIN_torch_compile", shared.OptionInfo(False, "Use torch.compile to accelerate SwinIR.", gr.Checkbox, {"interactive": True}, section=('upscaling', "Upscaling")).info("Takes longer on first run"))
 
 
 script_callbacks.on_ui_settings(on_ui_settings)
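
The inference refactor above only threads tile, tile_overlap and device through explicitly; the tiling scheme itself is unchanged: tile origins are spaced stride = tile - tile_overlap apart, a final tile is placed flush with the image edge, and overlapping outputs are averaged via the E and W accumulators. A small standalone illustration of that index math (example only, not webui code):

# Example only: reproduces the h_idx_list / w_idx_list computation from inference().
def tile_indices(length, tile, tile_overlap):
    stride = tile - tile_overlap
    # regular grid of tile origins, plus one last tile flush with the edge
    return list(range(0, length - tile, stride)) + [length - tile]


print(tile_indices(500, 192, 8))  # [0, 184, 308]: tiles cover 0-192, 184-376, 308-500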

+ 0 - 867
extensions-builtin/SwinIR/swinir_model_arch.py

@@ -1,867 +0,0 @@
-# -----------------------------------------------------------------------------------
-# SwinIR: Image Restoration Using Swin Transformer, https://arxiv.org/abs/2108.10257
-# Originally Written by Ze Liu, Modified by Jingyun Liang.
-# -----------------------------------------------------------------------------------
-
-import math
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import torch.utils.checkpoint as checkpoint
-from timm.models.layers import DropPath, to_2tuple, trunc_normal_
-
-
-class Mlp(nn.Module):
-    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
-        super().__init__()
-        out_features = out_features or in_features
-        hidden_features = hidden_features or in_features
-        self.fc1 = nn.Linear(in_features, hidden_features)
-        self.act = act_layer()
-        self.fc2 = nn.Linear(hidden_features, out_features)
-        self.drop = nn.Dropout(drop)
-
-    def forward(self, x):
-        x = self.fc1(x)
-        x = self.act(x)
-        x = self.drop(x)
-        x = self.fc2(x)
-        x = self.drop(x)
-        return x
-
-
-def window_partition(x, window_size):
-    """
-    Args:
-        x: (B, H, W, C)
-        window_size (int): window size
-
-    Returns:
-        windows: (num_windows*B, window_size, window_size, C)
-    """
-    B, H, W, C = x.shape
-    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
-    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
-    return windows
-
-
-def window_reverse(windows, window_size, H, W):
-    """
-    Args:
-        windows: (num_windows*B, window_size, window_size, C)
-        window_size (int): Window size
-        H (int): Height of image
-        W (int): Width of image
-
-    Returns:
-        x: (B, H, W, C)
-    """
-    B = int(windows.shape[0] / (H * W / window_size / window_size))
-    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
-    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
-    return x
-
-
-class WindowAttention(nn.Module):
-    r""" Window based multi-head self attention (W-MSA) module with relative position bias.
-    It supports both of shifted and non-shifted window.
-
-    Args:
-        dim (int): Number of input channels.
-        window_size (tuple[int]): The height and width of the window.
-        num_heads (int): Number of attention heads.
-        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
-        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
-        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
-        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
-    """
-
-    def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):
-
-        super().__init__()
-        self.dim = dim
-        self.window_size = window_size  # Wh, Ww
-        self.num_heads = num_heads
-        head_dim = dim // num_heads
-        self.scale = qk_scale or head_dim ** -0.5
-
-        # define a parameter table of relative position bias
-        self.relative_position_bias_table = nn.Parameter(
-            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads))  # 2*Wh-1 * 2*Ww-1, nH
-
-        # get pair-wise relative position index for each token inside the window
-        coords_h = torch.arange(self.window_size[0])
-        coords_w = torch.arange(self.window_size[1])
-        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
-        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
-        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
-        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
-        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
-        relative_coords[:, :, 1] += self.window_size[1] - 1
-        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
-        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
-        self.register_buffer("relative_position_index", relative_position_index)
-
-        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
-        self.attn_drop = nn.Dropout(attn_drop)
-        self.proj = nn.Linear(dim, dim)
-
-        self.proj_drop = nn.Dropout(proj_drop)
-
-        trunc_normal_(self.relative_position_bias_table, std=.02)
-        self.softmax = nn.Softmax(dim=-1)
-
-    def forward(self, x, mask=None):
-        """
-        Args:
-            x: input features with shape of (num_windows*B, N, C)
-            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
-        """
-        B_, N, C = x.shape
-        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
-        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)
-
-        q = q * self.scale
-        attn = (q @ k.transpose(-2, -1))
-
-        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
-            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)  # Wh*Ww,Wh*Ww,nH
-        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
-        attn = attn + relative_position_bias.unsqueeze(0)
-
-        if mask is not None:
-            nW = mask.shape[0]
-            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
-            attn = attn.view(-1, self.num_heads, N, N)
-            attn = self.softmax(attn)
-        else:
-            attn = self.softmax(attn)
-
-        attn = self.attn_drop(attn)
-
-        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
-        x = self.proj(x)
-        x = self.proj_drop(x)
-        return x
-
-    def extra_repr(self) -> str:
-        return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}'
-
-    def flops(self, N):
-        # calculate flops for 1 window with token length of N
-        flops = 0
-        # qkv = self.qkv(x)
-        flops += N * self.dim * 3 * self.dim
-        # attn = (q @ k.transpose(-2, -1))
-        flops += self.num_heads * N * (self.dim // self.num_heads) * N
-        #  x = (attn @ v)
-        flops += self.num_heads * N * N * (self.dim // self.num_heads)
-        # x = self.proj(x)
-        flops += N * self.dim * self.dim
-        return flops
-
-
-class SwinTransformerBlock(nn.Module):
-    r""" Swin Transformer Block.
-
-    Args:
-        dim (int): Number of input channels.
-        input_resolution (tuple[int]): Input resolution.
-        num_heads (int): Number of attention heads.
-        window_size (int): Window size.
-        shift_size (int): Shift size for SW-MSA.
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
-        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
-        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
-        drop (float, optional): Dropout rate. Default: 0.0
-        attn_drop (float, optional): Attention dropout rate. Default: 0.0
-        drop_path (float, optional): Stochastic depth rate. Default: 0.0
-        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
-        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
-    """
-
-    def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0,
-                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
-                 act_layer=nn.GELU, norm_layer=nn.LayerNorm):
-        super().__init__()
-        self.dim = dim
-        self.input_resolution = input_resolution
-        self.num_heads = num_heads
-        self.window_size = window_size
-        self.shift_size = shift_size
-        self.mlp_ratio = mlp_ratio
-        if min(self.input_resolution) <= self.window_size:
-            # if window size is larger than input resolution, we don't partition windows
-            self.shift_size = 0
-            self.window_size = min(self.input_resolution)
-        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"
-
-        self.norm1 = norm_layer(dim)
-        self.attn = WindowAttention(
-            dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
-            qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
-
-        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
-        self.norm2 = norm_layer(dim)
-        mlp_hidden_dim = int(dim * mlp_ratio)
-        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
-
-        if self.shift_size > 0:
-            attn_mask = self.calculate_mask(self.input_resolution)
-        else:
-            attn_mask = None
-
-        self.register_buffer("attn_mask", attn_mask)
-
-    def calculate_mask(self, x_size):
-        # calculate attention mask for SW-MSA
-        H, W = x_size
-        img_mask = torch.zeros((1, H, W, 1))  # 1 H W 1
-        h_slices = (slice(0, -self.window_size),
-                    slice(-self.window_size, -self.shift_size),
-                    slice(-self.shift_size, None))
-        w_slices = (slice(0, -self.window_size),
-                    slice(-self.window_size, -self.shift_size),
-                    slice(-self.shift_size, None))
-        cnt = 0
-        for h in h_slices:
-            for w in w_slices:
-                img_mask[:, h, w, :] = cnt
-                cnt += 1
-
-        mask_windows = window_partition(img_mask, self.window_size)  # nW, window_size, window_size, 1
-        mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
-        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
-        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
-
-        return attn_mask
-
-    def forward(self, x, x_size):
-        H, W = x_size
-        B, L, C = x.shape
-        # assert L == H * W, "input feature has wrong size"
-
-        shortcut = x
-        x = self.norm1(x)
-        x = x.view(B, H, W, C)
-
-        # cyclic shift
-        if self.shift_size > 0:
-            shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
-        else:
-            shifted_x = x
-
-        # partition windows
-        x_windows = window_partition(shifted_x, self.window_size)  # nW*B, window_size, window_size, C
-        x_windows = x_windows.view(-1, self.window_size * self.window_size, C)  # nW*B, window_size*window_size, C
-
-        # W-MSA/SW-MSA (to be compatible for testing on images whose shapes are the multiple of window size
-        if self.input_resolution == x_size:
-            attn_windows = self.attn(x_windows, mask=self.attn_mask)  # nW*B, window_size*window_size, C
-        else:
-            attn_windows = self.attn(x_windows, mask=self.calculate_mask(x_size).to(x.device))
-
-        # merge windows
-        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
-        shifted_x = window_reverse(attn_windows, self.window_size, H, W)  # B H' W' C
-
-        # reverse cyclic shift
-        if self.shift_size > 0:
-            x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
-        else:
-            x = shifted_x
-        x = x.view(B, H * W, C)
-
-        # FFN
-        x = shortcut + self.drop_path(x)
-        x = x + self.drop_path(self.mlp(self.norm2(x)))
-
-        return x
-
-    def extra_repr(self) -> str:
-        return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \
-               f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}"
-
-    def flops(self):
-        flops = 0
-        H, W = self.input_resolution
-        # norm1
-        flops += self.dim * H * W
-        # W-MSA/SW-MSA
-        nW = H * W / self.window_size / self.window_size
-        flops += nW * self.attn.flops(self.window_size * self.window_size)
-        # mlp
-        flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio
-        # norm2
-        flops += self.dim * H * W
-        return flops
-
-
-class PatchMerging(nn.Module):
-    r""" Patch Merging Layer.
-
-    Args:
-        input_resolution (tuple[int]): Resolution of input feature.
-        dim (int): Number of input channels.
-        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
-    """
-
-    def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
-        super().__init__()
-        self.input_resolution = input_resolution
-        self.dim = dim
-        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
-        self.norm = norm_layer(4 * dim)
-
-    def forward(self, x):
-        """
-        x: B, H*W, C
-        """
-        H, W = self.input_resolution
-        B, L, C = x.shape
-        assert L == H * W, "input feature has wrong size"
-        assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) are not even."
-
-        x = x.view(B, H, W, C)
-
-        x0 = x[:, 0::2, 0::2, :]  # B H/2 W/2 C
-        x1 = x[:, 1::2, 0::2, :]  # B H/2 W/2 C
-        x2 = x[:, 0::2, 1::2, :]  # B H/2 W/2 C
-        x3 = x[:, 1::2, 1::2, :]  # B H/2 W/2 C
-        x = torch.cat([x0, x1, x2, x3], -1)  # B H/2 W/2 4*C
-        x = x.view(B, -1, 4 * C)  # B H/2*W/2 4*C
-
-        x = self.norm(x)
-        x = self.reduction(x)
-
-        return x
-
-    def extra_repr(self) -> str:
-        return f"input_resolution={self.input_resolution}, dim={self.dim}"
-
-    def flops(self):
-        H, W = self.input_resolution
-        flops = H * W * self.dim
-        flops += (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim
-        return flops
-
-
-class BasicLayer(nn.Module):
-    """ A basic Swin Transformer layer for one stage.
-
-    Args:
-        dim (int): Number of input channels.
-        input_resolution (tuple[int]): Input resolution.
-        depth (int): Number of blocks.
-        num_heads (int): Number of attention heads.
-        window_size (int): Local window size.
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
-        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
-        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
-        drop (float, optional): Dropout rate. Default: 0.0
-        attn_drop (float, optional): Attention dropout rate. Default: 0.0
-        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
-        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
-        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
-        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
-    """
-
-    def __init__(self, dim, input_resolution, depth, num_heads, window_size,
-                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0.,
-                 drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False):
-
-        super().__init__()
-        self.dim = dim
-        self.input_resolution = input_resolution
-        self.depth = depth
-        self.use_checkpoint = use_checkpoint
-
-        # build blocks
-        self.blocks = nn.ModuleList([
-            SwinTransformerBlock(dim=dim, input_resolution=input_resolution,
-                                 num_heads=num_heads, window_size=window_size,
-                                 shift_size=0 if (i % 2 == 0) else window_size // 2,
-                                 mlp_ratio=mlp_ratio,
-                                 qkv_bias=qkv_bias, qk_scale=qk_scale,
-                                 drop=drop, attn_drop=attn_drop,
-                                 drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
-                                 norm_layer=norm_layer)
-            for i in range(depth)])
-
-        # patch merging layer
-        if downsample is not None:
-            self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer)
-        else:
-            self.downsample = None
-
-    def forward(self, x, x_size):
-        for blk in self.blocks:
-            if self.use_checkpoint:
-                x = checkpoint.checkpoint(blk, x, x_size)
-            else:
-                x = blk(x, x_size)
-        if self.downsample is not None:
-            x = self.downsample(x)
-        return x
-
-    def extra_repr(self) -> str:
-        return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"
-
-    def flops(self):
-        flops = 0
-        for blk in self.blocks:
-            flops += blk.flops()
-        if self.downsample is not None:
-            flops += self.downsample.flops()
-        return flops
-
-
-class RSTB(nn.Module):
-    """Residual Swin Transformer Block (RSTB).
-
-    Args:
-        dim (int): Number of input channels.
-        input_resolution (tuple[int]): Input resolution.
-        depth (int): Number of blocks.
-        num_heads (int): Number of attention heads.
-        window_size (int): Local window size.
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
-        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
-        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
-        drop (float, optional): Dropout rate. Default: 0.0
-        attn_drop (float, optional): Attention dropout rate. Default: 0.0
-        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
-        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
-        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
-        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
-        img_size: Input image size.
-        patch_size: Patch size.
-        resi_connection: The convolutional block before residual connection.
-    """
-
-    def __init__(self, dim, input_resolution, depth, num_heads, window_size,
-                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0.,
-                 drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False,
-                 img_size=224, patch_size=4, resi_connection='1conv'):
-        super(RSTB, self).__init__()
-
-        self.dim = dim
-        self.input_resolution = input_resolution
-
-        self.residual_group = BasicLayer(dim=dim,
-                                         input_resolution=input_resolution,
-                                         depth=depth,
-                                         num_heads=num_heads,
-                                         window_size=window_size,
-                                         mlp_ratio=mlp_ratio,
-                                         qkv_bias=qkv_bias, qk_scale=qk_scale,
-                                         drop=drop, attn_drop=attn_drop,
-                                         drop_path=drop_path,
-                                         norm_layer=norm_layer,
-                                         downsample=downsample,
-                                         use_checkpoint=use_checkpoint)
-
-        if resi_connection == '1conv':
-            self.conv = nn.Conv2d(dim, dim, 3, 1, 1)
-        elif resi_connection == '3conv':
-            # to save parameters and memory
-            self.conv = nn.Sequential(nn.Conv2d(dim, dim // 4, 3, 1, 1), nn.LeakyReLU(negative_slope=0.2, inplace=True),
-                                      nn.Conv2d(dim // 4, dim // 4, 1, 1, 0),
-                                      nn.LeakyReLU(negative_slope=0.2, inplace=True),
-                                      nn.Conv2d(dim // 4, dim, 3, 1, 1))
-
-        self.patch_embed = PatchEmbed(
-            img_size=img_size, patch_size=patch_size, in_chans=0, embed_dim=dim,
-            norm_layer=None)
-
-        self.patch_unembed = PatchUnEmbed(
-            img_size=img_size, patch_size=patch_size, in_chans=0, embed_dim=dim,
-            norm_layer=None)
-
-    def forward(self, x, x_size):
-        return self.patch_embed(self.conv(self.patch_unembed(self.residual_group(x, x_size), x_size))) + x
-
-    def flops(self):
-        flops = 0
-        flops += self.residual_group.flops()
-        H, W = self.input_resolution
-        flops += H * W * self.dim * self.dim * 9
-        flops += self.patch_embed.flops()
-        flops += self.patch_unembed.flops()
-
-        return flops
-
-
-class PatchEmbed(nn.Module):
-    r""" Image to Patch Embedding
-
-    Args:
-        img_size (int): Image size.  Default: 224.
-        patch_size (int): Patch token size. Default: 4.
-        in_chans (int): Number of input image channels. Default: 3.
-        embed_dim (int): Number of linear projection output channels. Default: 96.
-        norm_layer (nn.Module, optional): Normalization layer. Default: None
-    """
-
-    def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
-        super().__init__()
-        img_size = to_2tuple(img_size)
-        patch_size = to_2tuple(patch_size)
-        patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
-        self.img_size = img_size
-        self.patch_size = patch_size
-        self.patches_resolution = patches_resolution
-        self.num_patches = patches_resolution[0] * patches_resolution[1]
-
-        self.in_chans = in_chans
-        self.embed_dim = embed_dim
-
-        if norm_layer is not None:
-            self.norm = norm_layer(embed_dim)
-        else:
-            self.norm = None
-
-    def forward(self, x):
-        x = x.flatten(2).transpose(1, 2)  # B Ph*Pw C
-        if self.norm is not None:
-            x = self.norm(x)
-        return x
-
-    def flops(self):
-        flops = 0
-        H, W = self.img_size
-        if self.norm is not None:
-            flops += H * W * self.embed_dim
-        return flops
-
-
-class PatchUnEmbed(nn.Module):
-    r""" Image to Patch Unembedding
-
-    Args:
-        img_size (int): Image size.  Default: 224.
-        patch_size (int): Patch token size. Default: 4.
-        in_chans (int): Number of input image channels. Default: 3.
-        embed_dim (int): Number of linear projection output channels. Default: 96.
-        norm_layer (nn.Module, optional): Normalization layer. Default: None
-    """
-
-    def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
-        super().__init__()
-        img_size = to_2tuple(img_size)
-        patch_size = to_2tuple(patch_size)
-        patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
-        self.img_size = img_size
-        self.patch_size = patch_size
-        self.patches_resolution = patches_resolution
-        self.num_patches = patches_resolution[0] * patches_resolution[1]
-
-        self.in_chans = in_chans
-        self.embed_dim = embed_dim
-
-    def forward(self, x, x_size):
-        B, HW, C = x.shape
-        x = x.transpose(1, 2).view(B, self.embed_dim, x_size[0], x_size[1])  # B Ph*Pw C
-        return x
-
-    def flops(self):
-        flops = 0
-        return flops
-
-
-class Upsample(nn.Sequential):
-    """Upsample module.
-
-    Args:
-        scale (int): Scale factor. Supported scales: 2^n and 3.
-        num_feat (int): Channel number of intermediate features.
-    """
-
-    def __init__(self, scale, num_feat):
-        m = []
-        if (scale & (scale - 1)) == 0:  # scale = 2^n
-            for _ in range(int(math.log(scale, 2))):
-                m.append(nn.Conv2d(num_feat, 4 * num_feat, 3, 1, 1))
-                m.append(nn.PixelShuffle(2))
-        elif scale == 3:
-            m.append(nn.Conv2d(num_feat, 9 * num_feat, 3, 1, 1))
-            m.append(nn.PixelShuffle(3))
-        else:
-            raise ValueError(f'scale {scale} is not supported. ' 'Supported scales: 2^n and 3.')
-        super(Upsample, self).__init__(*m)
-
-
-class UpsampleOneStep(nn.Sequential):
-    """UpsampleOneStep module (the difference with Upsample is that it always only has 1conv + 1pixelshuffle)
-       Used in lightweight SR to save parameters.
-
-    Args:
-        scale (int): Scale factor. Supported scales: 2^n and 3.
-        num_feat (int): Channel number of intermediate features.
-
-    """
-
-    def __init__(self, scale, num_feat, num_out_ch, input_resolution=None):
-        self.num_feat = num_feat
-        self.input_resolution = input_resolution
-        m = []
-        m.append(nn.Conv2d(num_feat, (scale ** 2) * num_out_ch, 3, 1, 1))
-        m.append(nn.PixelShuffle(scale))
-        super(UpsampleOneStep, self).__init__(*m)
-
-    def flops(self):
-        H, W = self.input_resolution
-        flops = H * W * self.num_feat * 3 * 9
-        return flops
-
-
-class SwinIR(nn.Module):
-    r""" SwinIR
-        A PyTorch impl of : `SwinIR: Image Restoration Using Swin Transformer`, based on Swin Transformer.
-
-    Args:
-        img_size (int | tuple(int)): Input image size. Default 64
-        patch_size (int | tuple(int)): Patch size. Default: 1
-        in_chans (int): Number of input image channels. Default: 3
-        embed_dim (int): Patch embedding dimension. Default: 96
-        depths (tuple(int)): Depth of each Swin Transformer layer.
-        num_heads (tuple(int)): Number of attention heads in different layers.
-        window_size (int): Window size. Default: 7
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
-        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
-        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None
-        drop_rate (float): Dropout rate. Default: 0
-        attn_drop_rate (float): Attention dropout rate. Default: 0
-        drop_path_rate (float): Stochastic depth rate. Default: 0.1
-        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
-        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
-        patch_norm (bool): If True, add normalization after patch embedding. Default: True
-        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False
-        upscale: Upscale factor. 2/3/4/8 for image SR, 1 for denoising and compress artifact reduction
-        img_range: Image range. 1. or 255.
-        upsampler: The reconstruction reconstruction module. 'pixelshuffle'/'pixelshuffledirect'/'nearest+conv'/None
-        resi_connection: The convolutional block before residual connection. '1conv'/'3conv'
-    """
-
-    def __init__(self, img_size=64, patch_size=1, in_chans=3,
-                 embed_dim=96, depths=(6, 6, 6, 6), num_heads=(6, 6, 6, 6),
-                 window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None,
-                 drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1,
-                 norm_layer=nn.LayerNorm, ape=False, patch_norm=True,
-                 use_checkpoint=False, upscale=2, img_range=1., upsampler='', resi_connection='1conv',
-                 **kwargs):
-        super(SwinIR, self).__init__()
-        num_in_ch = in_chans
-        num_out_ch = in_chans
-        num_feat = 64
-        self.img_range = img_range
-        if in_chans == 3:
-            rgb_mean = (0.4488, 0.4371, 0.4040)
-            self.mean = torch.Tensor(rgb_mean).view(1, 3, 1, 1)
-        else:
-            self.mean = torch.zeros(1, 1, 1, 1)
-        self.upscale = upscale
-        self.upsampler = upsampler
-        self.window_size = window_size
-
-        #####################################################################################################
-        ################################### 1, shallow feature extraction ###################################
-        self.conv_first = nn.Conv2d(num_in_ch, embed_dim, 3, 1, 1)
-
-        #####################################################################################################
-        ################################### 2, deep feature extraction ######################################
-        self.num_layers = len(depths)
-        self.embed_dim = embed_dim
-        self.ape = ape
-        self.patch_norm = patch_norm
-        self.num_features = embed_dim
-        self.mlp_ratio = mlp_ratio
-
-        # split image into non-overlapping patches
-        self.patch_embed = PatchEmbed(
-            img_size=img_size, patch_size=patch_size, in_chans=embed_dim, embed_dim=embed_dim,
-            norm_layer=norm_layer if self.patch_norm else None)
-        num_patches = self.patch_embed.num_patches
-        patches_resolution = self.patch_embed.patches_resolution
-        self.patches_resolution = patches_resolution
-
-        # merge non-overlapping patches into image
-        self.patch_unembed = PatchUnEmbed(
-            img_size=img_size, patch_size=patch_size, in_chans=embed_dim, embed_dim=embed_dim,
-            norm_layer=norm_layer if self.patch_norm else None)
-
-        # absolute position embedding
-        if self.ape:
-            self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
-            trunc_normal_(self.absolute_pos_embed, std=.02)
-
-        self.pos_drop = nn.Dropout(p=drop_rate)
-
-        # stochastic depth
-        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
-
-        # build Residual Swin Transformer blocks (RSTB)
-        self.layers = nn.ModuleList()
-        for i_layer in range(self.num_layers):
-            layer = RSTB(dim=embed_dim,
-                         input_resolution=(patches_resolution[0],
-                                           patches_resolution[1]),
-                         depth=depths[i_layer],
-                         num_heads=num_heads[i_layer],
-                         window_size=window_size,
-                         mlp_ratio=self.mlp_ratio,
-                         qkv_bias=qkv_bias, qk_scale=qk_scale,
-                         drop=drop_rate, attn_drop=attn_drop_rate,
-                         drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],  # no impact on SR results
-                         norm_layer=norm_layer,
-                         downsample=None,
-                         use_checkpoint=use_checkpoint,
-                         img_size=img_size,
-                         patch_size=patch_size,
-                         resi_connection=resi_connection
-
-                         )
-            self.layers.append(layer)
-        self.norm = norm_layer(self.num_features)
-
-        # build the last conv layer in deep feature extraction
-        if resi_connection == '1conv':
-            self.conv_after_body = nn.Conv2d(embed_dim, embed_dim, 3, 1, 1)
-        elif resi_connection == '3conv':
-            # to save parameters and memory
-            self.conv_after_body = nn.Sequential(nn.Conv2d(embed_dim, embed_dim // 4, 3, 1, 1),
-                                                 nn.LeakyReLU(negative_slope=0.2, inplace=True),
-                                                 nn.Conv2d(embed_dim // 4, embed_dim // 4, 1, 1, 0),
-                                                 nn.LeakyReLU(negative_slope=0.2, inplace=True),
-                                                 nn.Conv2d(embed_dim // 4, embed_dim, 3, 1, 1))
-
-        #####################################################################################################
-        ################################ 3, high quality image reconstruction ################################
-        if self.upsampler == 'pixelshuffle':
-            # for classical SR
-            self.conv_before_upsample = nn.Sequential(nn.Conv2d(embed_dim, num_feat, 3, 1, 1),
-                                                      nn.LeakyReLU(inplace=True))
-            self.upsample = Upsample(upscale, num_feat)
-            self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
-        elif self.upsampler == 'pixelshuffledirect':
-            # for lightweight SR (to save parameters)
-            self.upsample = UpsampleOneStep(upscale, embed_dim, num_out_ch,
-                                            (patches_resolution[0], patches_resolution[1]))
-        elif self.upsampler == 'nearest+conv':
-            # for real-world SR (less artifacts)
-            self.conv_before_upsample = nn.Sequential(nn.Conv2d(embed_dim, num_feat, 3, 1, 1),
-                                                      nn.LeakyReLU(inplace=True))
-            self.conv_up1 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
-            if self.upscale == 4:
-                self.conv_up2 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
-            self.conv_hr = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
-            self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
-            self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
-        else:
-            # for image denoising and JPEG compression artifact reduction
-            self.conv_last = nn.Conv2d(embed_dim, num_out_ch, 3, 1, 1)
-
-        self.apply(self._init_weights)
-
-    def _init_weights(self, m):
-        if isinstance(m, nn.Linear):
-            trunc_normal_(m.weight, std=.02)
-            if isinstance(m, nn.Linear) and m.bias is not None:
-                nn.init.constant_(m.bias, 0)
-        elif isinstance(m, nn.LayerNorm):
-            nn.init.constant_(m.bias, 0)
-            nn.init.constant_(m.weight, 1.0)
-
-    @torch.jit.ignore
-    def no_weight_decay(self):
-        return {'absolute_pos_embed'}
-
-    @torch.jit.ignore
-    def no_weight_decay_keywords(self):
-        return {'relative_position_bias_table'}
-
-    def check_image_size(self, x):
-        _, _, h, w = x.size()
-        mod_pad_h = (self.window_size - h % self.window_size) % self.window_size
-        mod_pad_w = (self.window_size - w % self.window_size) % self.window_size
-        x = F.pad(x, (0, mod_pad_w, 0, mod_pad_h), 'reflect')
-        return x
-
-    def forward_features(self, x):
-        x_size = (x.shape[2], x.shape[3])
-        x = self.patch_embed(x)
-        if self.ape:
-            x = x + self.absolute_pos_embed
-        x = self.pos_drop(x)
-
-        for layer in self.layers:
-            x = layer(x, x_size)
-
-        x = self.norm(x)  # B L C
-        x = self.patch_unembed(x, x_size)
-
-        return x
-
-    def forward(self, x):
-        H, W = x.shape[2:]
-        x = self.check_image_size(x)
-
-        self.mean = self.mean.type_as(x)
-        x = (x - self.mean) * self.img_range
-
-        if self.upsampler == 'pixelshuffle':
-            # for classical SR
-            x = self.conv_first(x)
-            x = self.conv_after_body(self.forward_features(x)) + x
-            x = self.conv_before_upsample(x)
-            x = self.conv_last(self.upsample(x))
-        elif self.upsampler == 'pixelshuffledirect':
-            # for lightweight SR
-            x = self.conv_first(x)
-            x = self.conv_after_body(self.forward_features(x)) + x
-            x = self.upsample(x)
-        elif self.upsampler == 'nearest+conv':
-            # for real-world SR
-            x = self.conv_first(x)
-            x = self.conv_after_body(self.forward_features(x)) + x
-            x = self.conv_before_upsample(x)
-            x = self.lrelu(self.conv_up1(torch.nn.functional.interpolate(x, scale_factor=2, mode='nearest')))
-            if self.upscale == 4:
-                x = self.lrelu(self.conv_up2(torch.nn.functional.interpolate(x, scale_factor=2, mode='nearest')))
-            x = self.conv_last(self.lrelu(self.conv_hr(x)))
-        else:
-            # for image denoising and JPEG compression artifact reduction
-            x_first = self.conv_first(x)
-            res = self.conv_after_body(self.forward_features(x_first)) + x_first
-            x = x + self.conv_last(res)
-
-        x = x / self.img_range + self.mean
-
-        return x[:, :, :H*self.upscale, :W*self.upscale]
-
-    def flops(self):
-        flops = 0
-        H, W = self.patches_resolution
-        flops += H * W * 3 * self.embed_dim * 9
-        flops += self.patch_embed.flops()
-        for layer in self.layers:
-            flops += layer.flops()
-        flops += H * W * 3 * self.embed_dim * self.embed_dim
-        flops += self.upsample.flops()
-        return flops
-
-
-if __name__ == '__main__':
-    upscale = 4
-    window_size = 8
-    height = (1024 // upscale // window_size + 1) * window_size
-    width = (720 // upscale // window_size + 1) * window_size
-    model = SwinIR(upscale=2, img_size=(height, width),
-                   window_size=window_size, img_range=1., depths=[6, 6, 6, 6],
-                   embed_dim=60, num_heads=[6, 6, 6, 6], mlp_ratio=2, upsampler='pixelshuffledirect')
-    print(model)
-    print(height, width, model.flops() / 1e9)
-
-    x = torch.randn((1, 3, height, width))
-    x = model(x)
-    print(x.shape)
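
With this bundled SwinIR implementation removed, SwinIR checkpoints can instead be instantiated through the spandrel library, which infers the architecture from the checkpoint's state dict. A minimal sketch, assuming spandrel's ModelLoader API and a hypothetical checkpoint path:

import torch
from spandrel import ModelLoader

# "models/SwinIR/SwinIR_4x.pth" is a hypothetical path; spandrel detects the
# SwinIR architecture from the state dict and builds the matching module.
descriptor = ModelLoader().load_from_file("models/SwinIR/SwinIR_4x.pth")
model = descriptor.model.eval()

with torch.no_grad():
    lr = torch.rand(1, 3, 64, 64)   # dummy low-resolution input
    sr = model(lr)                  # e.g. 1x3x256x256 for a 4x checkpoint
print(sr.shape)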

+ 0 - 1017
extensions-builtin/SwinIR/swinir_model_arch_v2.py

@@ -1,1017 +0,0 @@
-# -----------------------------------------------------------------------------------
-# Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration, https://arxiv.org/abs/
-# Written by Conde and Choi et al.
-# -----------------------------------------------------------------------------------
-
-import math
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import torch.utils.checkpoint as checkpoint
-from timm.models.layers import DropPath, to_2tuple, trunc_normal_
-
-
-class Mlp(nn.Module):
-    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
-        super().__init__()
-        out_features = out_features or in_features
-        hidden_features = hidden_features or in_features
-        self.fc1 = nn.Linear(in_features, hidden_features)
-        self.act = act_layer()
-        self.fc2 = nn.Linear(hidden_features, out_features)
-        self.drop = nn.Dropout(drop)
-
-    def forward(self, x):
-        x = self.fc1(x)
-        x = self.act(x)
-        x = self.drop(x)
-        x = self.fc2(x)
-        x = self.drop(x)
-        return x
-
-
-def window_partition(x, window_size):
-    """
-    Args:
-        x: (B, H, W, C)
-        window_size (int): window size
-    Returns:
-        windows: (num_windows*B, window_size, window_size, C)
-    """
-    B, H, W, C = x.shape
-    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
-    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
-    return windows
-
-
-def window_reverse(windows, window_size, H, W):
-    """
-    Args:
-        windows: (num_windows*B, window_size, window_size, C)
-        window_size (int): Window size
-        H (int): Height of image
-        W (int): Width of image
-    Returns:
-        x: (B, H, W, C)
-    """
-    B = int(windows.shape[0] / (H * W / window_size / window_size))
-    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
-    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
-    return x
-
-class WindowAttention(nn.Module):
-    r""" Window based multi-head self attention (W-MSA) module with relative position bias.
-    It supports both shifted and non-shifted windows.
-    Args:
-        dim (int): Number of input channels.
-        window_size (tuple[int]): The height and width of the window.
-        num_heads (int): Number of attention heads.
-        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
-        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
-        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
-        pretrained_window_size (tuple[int]): The height and width of the window in pre-training.
-    """
-
-    def __init__(self, dim, window_size, num_heads, qkv_bias=True, attn_drop=0., proj_drop=0.,
-                 pretrained_window_size=(0, 0)):
-
-        super().__init__()
-        self.dim = dim
-        self.window_size = window_size  # Wh, Ww
-        self.pretrained_window_size = pretrained_window_size
-        self.num_heads = num_heads
-
-        self.logit_scale = nn.Parameter(torch.log(10 * torch.ones((num_heads, 1, 1))), requires_grad=True)
-
-        # mlp to generate continuous relative position bias
-        self.cpb_mlp = nn.Sequential(nn.Linear(2, 512, bias=True),
-                                     nn.ReLU(inplace=True),
-                                     nn.Linear(512, num_heads, bias=False))
-
-        # get relative_coords_table
-        relative_coords_h = torch.arange(-(self.window_size[0] - 1), self.window_size[0], dtype=torch.float32)
-        relative_coords_w = torch.arange(-(self.window_size[1] - 1), self.window_size[1], dtype=torch.float32)
-        relative_coords_table = torch.stack(
-            torch.meshgrid([relative_coords_h,
-                            relative_coords_w])).permute(1, 2, 0).contiguous().unsqueeze(0)  # 1, 2*Wh-1, 2*Ww-1, 2
-        if pretrained_window_size[0] > 0:
-            relative_coords_table[:, :, :, 0] /= (pretrained_window_size[0] - 1)
-            relative_coords_table[:, :, :, 1] /= (pretrained_window_size[1] - 1)
-        else:
-            relative_coords_table[:, :, :, 0] /= (self.window_size[0] - 1)
-            relative_coords_table[:, :, :, 1] /= (self.window_size[1] - 1)
-        relative_coords_table *= 8  # normalize to -8, 8
-        relative_coords_table = torch.sign(relative_coords_table) * torch.log2(
-            torch.abs(relative_coords_table) + 1.0) / np.log2(8)
-
-        self.register_buffer("relative_coords_table", relative_coords_table)
-
-        # get pair-wise relative position index for each token inside the window
-        coords_h = torch.arange(self.window_size[0])
-        coords_w = torch.arange(self.window_size[1])
-        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
-        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
-        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
-        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
-        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
-        relative_coords[:, :, 1] += self.window_size[1] - 1
-        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
-        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
-        self.register_buffer("relative_position_index", relative_position_index)
-
-        self.qkv = nn.Linear(dim, dim * 3, bias=False)
-        if qkv_bias:
-            self.q_bias = nn.Parameter(torch.zeros(dim))
-            self.v_bias = nn.Parameter(torch.zeros(dim))
-        else:
-            self.q_bias = None
-            self.v_bias = None
-        self.attn_drop = nn.Dropout(attn_drop)
-        self.proj = nn.Linear(dim, dim)
-        self.proj_drop = nn.Dropout(proj_drop)
-        self.softmax = nn.Softmax(dim=-1)
-
-    def forward(self, x, mask=None):
-        """
-        Args:
-            x: input features with shape of (num_windows*B, N, C)
-            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
-        """
-        B_, N, C = x.shape
-        qkv_bias = None
-        if self.q_bias is not None:
-            qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias))
-        qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
-        qkv = qkv.reshape(B_, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
-        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)
-
-        # cosine attention
-        attn = (F.normalize(q, dim=-1) @ F.normalize(k, dim=-1).transpose(-2, -1))
-        logit_scale = torch.clamp(self.logit_scale, max=torch.log(torch.tensor(1. / 0.01)).to(self.logit_scale.device)).exp()
-        attn = attn * logit_scale
-
-        relative_position_bias_table = self.cpb_mlp(self.relative_coords_table).view(-1, self.num_heads)
-        relative_position_bias = relative_position_bias_table[self.relative_position_index.view(-1)].view(
-            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)  # Wh*Ww,Wh*Ww,nH
-        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
-        relative_position_bias = 16 * torch.sigmoid(relative_position_bias)
-        attn = attn + relative_position_bias.unsqueeze(0)
-
-        if mask is not None:
-            nW = mask.shape[0]
-            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
-            attn = attn.view(-1, self.num_heads, N, N)
-            attn = self.softmax(attn)
-        else:
-            attn = self.softmax(attn)
-
-        attn = self.attn_drop(attn)
-
-        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
-        x = self.proj(x)
-        x = self.proj_drop(x)
-        return x
-
-    def extra_repr(self) -> str:
-        return f'dim={self.dim}, window_size={self.window_size}, ' \
-               f'pretrained_window_size={self.pretrained_window_size}, num_heads={self.num_heads}'
-
-    def flops(self, N):
-        # calculate flops for 1 window with token length of N
-        flops = 0
-        # qkv = self.qkv(x)
-        flops += N * self.dim * 3 * self.dim
-        # attn = (q @ k.transpose(-2, -1))
-        flops += self.num_heads * N * (self.dim // self.num_heads) * N
-        #  x = (attn @ v)
-        flops += self.num_heads * N * N * (self.dim // self.num_heads)
-        # x = self.proj(x)
-        flops += N * self.dim * self.dim
-        return flops
-
-class SwinTransformerBlock(nn.Module):
-    r""" Swin Transformer Block.
-    Args:
-        dim (int): Number of input channels.
-        input_resolution (tuple[int]): Input resolution.
-        num_heads (int): Number of attention heads.
-        window_size (int): Window size.
-        shift_size (int): Shift size for SW-MSA.
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
-        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
-        drop (float, optional): Dropout rate. Default: 0.0
-        attn_drop (float, optional): Attention dropout rate. Default: 0.0
-        drop_path (float, optional): Stochastic depth rate. Default: 0.0
-        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
-        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
-        pretrained_window_size (int): Window size in pre-training.
-    """
-
-    def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0,
-                 mlp_ratio=4., qkv_bias=True, drop=0., attn_drop=0., drop_path=0.,
-                 act_layer=nn.GELU, norm_layer=nn.LayerNorm, pretrained_window_size=0):
-        super().__init__()
-        self.dim = dim
-        self.input_resolution = input_resolution
-        self.num_heads = num_heads
-        self.window_size = window_size
-        self.shift_size = shift_size
-        self.mlp_ratio = mlp_ratio
-        if min(self.input_resolution) <= self.window_size:
-            # if window size is larger than input resolution, we don't partition windows
-            self.shift_size = 0
-            self.window_size = min(self.input_resolution)
-        assert 0 <= self.shift_size < self.window_size, "shift_size must be in 0-window_size"
-
-        self.norm1 = norm_layer(dim)
-        self.attn = WindowAttention(
-            dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
-            qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop,
-            pretrained_window_size=to_2tuple(pretrained_window_size))
-
-        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
-        self.norm2 = norm_layer(dim)
-        mlp_hidden_dim = int(dim * mlp_ratio)
-        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
-
-        if self.shift_size > 0:
-            attn_mask = self.calculate_mask(self.input_resolution)
-        else:
-            attn_mask = None
-
-        self.register_buffer("attn_mask", attn_mask)
-
-    def calculate_mask(self, x_size):
-        # calculate attention mask for SW-MSA
-        H, W = x_size
-        img_mask = torch.zeros((1, H, W, 1))  # 1 H W 1
-        h_slices = (slice(0, -self.window_size),
-                    slice(-self.window_size, -self.shift_size),
-                    slice(-self.shift_size, None))
-        w_slices = (slice(0, -self.window_size),
-                    slice(-self.window_size, -self.shift_size),
-                    slice(-self.shift_size, None))
-        cnt = 0
-        for h in h_slices:
-            for w in w_slices:
-                img_mask[:, h, w, :] = cnt
-                cnt += 1
-
-        mask_windows = window_partition(img_mask, self.window_size)  # nW, window_size, window_size, 1
-        mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
-        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
-        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
-
-        return attn_mask
-
-    def forward(self, x, x_size):
-        H, W = x_size
-        B, L, C = x.shape
-        #assert L == H * W, "input feature has wrong size"
-
-        shortcut = x
-        x = x.view(B, H, W, C)
-
-        # cyclic shift
-        if self.shift_size > 0:
-            shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
-        else:
-            shifted_x = x
-
-        # partition windows
-        x_windows = window_partition(shifted_x, self.window_size)  # nW*B, window_size, window_size, C
-        x_windows = x_windows.view(-1, self.window_size * self.window_size, C)  # nW*B, window_size*window_size, C
-
-        # W-MSA/SW-MSA (to be compatible with testing on images whose shapes are multiples of the window size)
-        if self.input_resolution == x_size:
-            attn_windows = self.attn(x_windows, mask=self.attn_mask)  # nW*B, window_size*window_size, C
-        else:
-            attn_windows = self.attn(x_windows, mask=self.calculate_mask(x_size).to(x.device))
-
-        # merge windows
-        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
-        shifted_x = window_reverse(attn_windows, self.window_size, H, W)  # B H' W' C
-
-        # reverse cyclic shift
-        if self.shift_size > 0:
-            x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
-        else:
-            x = shifted_x
-        x = x.view(B, H * W, C)
-        x = shortcut + self.drop_path(self.norm1(x))
-
-        # FFN
-        x = x + self.drop_path(self.norm2(self.mlp(x)))
-
-        return x
-
-    def extra_repr(self) -> str:
-        return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \
-               f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}"
-
-    def flops(self):
-        flops = 0
-        H, W = self.input_resolution
-        # norm1
-        flops += self.dim * H * W
-        # W-MSA/SW-MSA
-        nW = H * W / self.window_size / self.window_size
-        flops += nW * self.attn.flops(self.window_size * self.window_size)
-        # mlp
-        flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio
-        # norm2
-        flops += self.dim * H * W
-        return flops
-
-class PatchMerging(nn.Module):
-    r""" Patch Merging Layer.
-    Args:
-        input_resolution (tuple[int]): Resolution of input feature.
-        dim (int): Number of input channels.
-        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
-    """
-
-    def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
-        super().__init__()
-        self.input_resolution = input_resolution
-        self.dim = dim
-        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
-        self.norm = norm_layer(2 * dim)
-
-    def forward(self, x):
-        """
-        x: B, H*W, C
-        """
-        H, W = self.input_resolution
-        B, L, C = x.shape
-        assert L == H * W, "input feature has wrong size"
-        assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) is not even."
-
-        x = x.view(B, H, W, C)
-
-        x0 = x[:, 0::2, 0::2, :]  # B H/2 W/2 C
-        x1 = x[:, 1::2, 0::2, :]  # B H/2 W/2 C
-        x2 = x[:, 0::2, 1::2, :]  # B H/2 W/2 C
-        x3 = x[:, 1::2, 1::2, :]  # B H/2 W/2 C
-        x = torch.cat([x0, x1, x2, x3], -1)  # B H/2 W/2 4*C
-        x = x.view(B, -1, 4 * C)  # B H/2*W/2 4*C
-
-        x = self.reduction(x)
-        x = self.norm(x)
-
-        return x
-
-    def extra_repr(self) -> str:
-        return f"input_resolution={self.input_resolution}, dim={self.dim}"
-
-    def flops(self):
-        H, W = self.input_resolution
-        flops = (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim
-        flops += H * W * self.dim // 2
-        return flops
-
-class BasicLayer(nn.Module):
-    """ A basic Swin Transformer layer for one stage.
-    Args:
-        dim (int): Number of input channels.
-        input_resolution (tuple[int]): Input resolution.
-        depth (int): Number of blocks.
-        num_heads (int): Number of attention heads.
-        window_size (int): Local window size.
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
-        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
-        drop (float, optional): Dropout rate. Default: 0.0
-        attn_drop (float, optional): Attention dropout rate. Default: 0.0
-        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
-        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
-        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
-        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
-        pretrained_window_size (int): Local window size in pre-training.
-    """
-
-    def __init__(self, dim, input_resolution, depth, num_heads, window_size,
-                 mlp_ratio=4., qkv_bias=True, drop=0., attn_drop=0.,
-                 drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False,
-                 pretrained_window_size=0):
-
-        super().__init__()
-        self.dim = dim
-        self.input_resolution = input_resolution
-        self.depth = depth
-        self.use_checkpoint = use_checkpoint
-
-        # build blocks
-        self.blocks = nn.ModuleList([
-            SwinTransformerBlock(dim=dim, input_resolution=input_resolution,
-                                 num_heads=num_heads, window_size=window_size,
-                                 shift_size=0 if (i % 2 == 0) else window_size // 2,
-                                 mlp_ratio=mlp_ratio,
-                                 qkv_bias=qkv_bias,
-                                 drop=drop, attn_drop=attn_drop,
-                                 drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
-                                 norm_layer=norm_layer,
-                                 pretrained_window_size=pretrained_window_size)
-            for i in range(depth)])
-
-        # patch merging layer
-        if downsample is not None:
-            self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer)
-        else:
-            self.downsample = None
-
-    def forward(self, x, x_size):
-        for blk in self.blocks:
-            if self.use_checkpoint:
-                x = checkpoint.checkpoint(blk, x, x_size)
-            else:
-                x = blk(x, x_size)
-        if self.downsample is not None:
-            x = self.downsample(x)
-        return x
-
-    def extra_repr(self) -> str:
-        return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"
-
-    def flops(self):
-        flops = 0
-        for blk in self.blocks:
-            flops += blk.flops()
-        if self.downsample is not None:
-            flops += self.downsample.flops()
-        return flops
-
-    def _init_respostnorm(self):
-        for blk in self.blocks:
-            nn.init.constant_(blk.norm1.bias, 0)
-            nn.init.constant_(blk.norm1.weight, 0)
-            nn.init.constant_(blk.norm2.bias, 0)
-            nn.init.constant_(blk.norm2.weight, 0)
-
-class PatchEmbed(nn.Module):
-    r""" Image to Patch Embedding
-    Args:
-        img_size (int): Image size.  Default: 224.
-        patch_size (int): Patch token size. Default: 4.
-        in_chans (int): Number of input image channels. Default: 3.
-        embed_dim (int): Number of linear projection output channels. Default: 96.
-        norm_layer (nn.Module, optional): Normalization layer. Default: None
-    """
-
-    def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
-        super().__init__()
-        img_size = to_2tuple(img_size)
-        patch_size = to_2tuple(patch_size)
-        patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
-        self.img_size = img_size
-        self.patch_size = patch_size
-        self.patches_resolution = patches_resolution
-        self.num_patches = patches_resolution[0] * patches_resolution[1]
-
-        self.in_chans = in_chans
-        self.embed_dim = embed_dim
-
-        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
-        if norm_layer is not None:
-            self.norm = norm_layer(embed_dim)
-        else:
-            self.norm = None
-
-    def forward(self, x):
-        B, C, H, W = x.shape
-        # FIXME look at relaxing size constraints
-        # assert H == self.img_size[0] and W == self.img_size[1],
-        #     f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
-        x = self.proj(x).flatten(2).transpose(1, 2)  # B Ph*Pw C
-        if self.norm is not None:
-            x = self.norm(x)
-        return x
-
-    def flops(self):
-        Ho, Wo = self.patches_resolution
-        flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
-        if self.norm is not None:
-            flops += Ho * Wo * self.embed_dim
-        return flops
-
-class RSTB(nn.Module):
-    """Residual Swin Transformer Block (RSTB).
-
-    Args:
-        dim (int): Number of input channels.
-        input_resolution (tuple[int]): Input resolution.
-        depth (int): Number of blocks.
-        num_heads (int): Number of attention heads.
-        window_size (int): Local window size.
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
-        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
-        drop (float, optional): Dropout rate. Default: 0.0
-        attn_drop (float, optional): Attention dropout rate. Default: 0.0
-        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
-        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
-        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
-        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
-        img_size: Input image size.
-        patch_size: Patch size.
-        resi_connection: The convolutional block before residual connection.
-    """
-
-    def __init__(self, dim, input_resolution, depth, num_heads, window_size,
-                 mlp_ratio=4., qkv_bias=True, drop=0., attn_drop=0.,
-                 drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False,
-                 img_size=224, patch_size=4, resi_connection='1conv'):
-        super(RSTB, self).__init__()
-
-        self.dim = dim
-        self.input_resolution = input_resolution
-
-        self.residual_group = BasicLayer(dim=dim,
-                                         input_resolution=input_resolution,
-                                         depth=depth,
-                                         num_heads=num_heads,
-                                         window_size=window_size,
-                                         mlp_ratio=mlp_ratio,
-                                         qkv_bias=qkv_bias,
-                                         drop=drop, attn_drop=attn_drop,
-                                         drop_path=drop_path,
-                                         norm_layer=norm_layer,
-                                         downsample=downsample,
-                                         use_checkpoint=use_checkpoint)
-
-        if resi_connection == '1conv':
-            self.conv = nn.Conv2d(dim, dim, 3, 1, 1)
-        elif resi_connection == '3conv':
-            # to save parameters and memory
-            self.conv = nn.Sequential(nn.Conv2d(dim, dim // 4, 3, 1, 1), nn.LeakyReLU(negative_slope=0.2, inplace=True),
-                                      nn.Conv2d(dim // 4, dim // 4, 1, 1, 0),
-                                      nn.LeakyReLU(negative_slope=0.2, inplace=True),
-                                      nn.Conv2d(dim // 4, dim, 3, 1, 1))
-
-        self.patch_embed = PatchEmbed(
-            img_size=img_size, patch_size=patch_size, in_chans=dim, embed_dim=dim,
-            norm_layer=None)
-
-        self.patch_unembed = PatchUnEmbed(
-            img_size=img_size, patch_size=patch_size, in_chans=dim, embed_dim=dim,
-            norm_layer=None)
-
-    def forward(self, x, x_size):
-        return self.patch_embed(self.conv(self.patch_unembed(self.residual_group(x, x_size), x_size))) + x
-
-    def flops(self):
-        flops = 0
-        flops += self.residual_group.flops()
-        H, W = self.input_resolution
-        flops += H * W * self.dim * self.dim * 9
-        flops += self.patch_embed.flops()
-        flops += self.patch_unembed.flops()
-
-        return flops
-
-class PatchUnEmbed(nn.Module):
-    r""" Image to Patch Unembedding
-
-    Args:
-        img_size (int): Image size.  Default: 224.
-        patch_size (int): Patch token size. Default: 4.
-        in_chans (int): Number of input image channels. Default: 3.
-        embed_dim (int): Number of linear projection output channels. Default: 96.
-        norm_layer (nn.Module, optional): Normalization layer. Default: None
-    """
-
-    def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
-        super().__init__()
-        img_size = to_2tuple(img_size)
-        patch_size = to_2tuple(patch_size)
-        patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
-        self.img_size = img_size
-        self.patch_size = patch_size
-        self.patches_resolution = patches_resolution
-        self.num_patches = patches_resolution[0] * patches_resolution[1]
-
-        self.in_chans = in_chans
-        self.embed_dim = embed_dim
-
-    def forward(self, x, x_size):
-        B, HW, C = x.shape
-        x = x.transpose(1, 2).view(B, self.embed_dim, x_size[0], x_size[1])  # B Ph*Pw C
-        return x
-
-    def flops(self):
-        flops = 0
-        return flops
-
-
-class Upsample(nn.Sequential):
-    """Upsample module.
-
-    Args:
-        scale (int): Scale factor. Supported scales: 2^n and 3.
-        num_feat (int): Channel number of intermediate features.
-    """
-
-    def __init__(self, scale, num_feat):
-        m = []
-        if (scale & (scale - 1)) == 0:  # scale = 2^n
-            for _ in range(int(math.log(scale, 2))):
-                m.append(nn.Conv2d(num_feat, 4 * num_feat, 3, 1, 1))
-                m.append(nn.PixelShuffle(2))
-        elif scale == 3:
-            m.append(nn.Conv2d(num_feat, 9 * num_feat, 3, 1, 1))
-            m.append(nn.PixelShuffle(3))
-        else:
-            raise ValueError(f'scale {scale} is not supported. ' 'Supported scales: 2^n and 3.')
-        super(Upsample, self).__init__(*m)
-
-class Upsample_hf(nn.Sequential):
-    """Upsample module.
-
-    Args:
-        scale (int): Scale factor. Supported scales: 2^n and 3.
-        num_feat (int): Channel number of intermediate features.
-    """
-
-    def __init__(self, scale, num_feat):
-        m = []
-        if (scale & (scale - 1)) == 0:  # scale = 2^n
-            for _ in range(int(math.log(scale, 2))):
-                m.append(nn.Conv2d(num_feat, 4 * num_feat, 3, 1, 1))
-                m.append(nn.PixelShuffle(2))
-        elif scale == 3:
-            m.append(nn.Conv2d(num_feat, 9 * num_feat, 3, 1, 1))
-            m.append(nn.PixelShuffle(3))
-        else:
-            raise ValueError(f'scale {scale} is not supported. ' 'Supported scales: 2^n and 3.')
-        super(Upsample_hf, self).__init__(*m)
-
-
-class UpsampleOneStep(nn.Sequential):
-    """UpsampleOneStep module (the difference with Upsample is that it always only has 1conv + 1pixelshuffle)
-       Used in lightweight SR to save parameters.
-
-    Args:
-        scale (int): Scale factor. Supported scales: 2^n and 3.
-        num_feat (int): Channel number of intermediate features.
-
-    """
-
-    def __init__(self, scale, num_feat, num_out_ch, input_resolution=None):
-        self.num_feat = num_feat
-        self.input_resolution = input_resolution
-        m = []
-        m.append(nn.Conv2d(num_feat, (scale ** 2) * num_out_ch, 3, 1, 1))
-        m.append(nn.PixelShuffle(scale))
-        super(UpsampleOneStep, self).__init__(*m)
-
-    def flops(self):
-        H, W = self.input_resolution
-        flops = H * W * self.num_feat * 3 * 9
-        return flops
-
-
-
-class Swin2SR(nn.Module):
-    r""" Swin2SR
-        A PyTorch impl of: `Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration`.
-
-    Args:
-        img_size (int | tuple(int)): Input image size. Default 64
-        patch_size (int | tuple(int)): Patch size. Default: 1
-        in_chans (int): Number of input image channels. Default: 3
-        embed_dim (int): Patch embedding dimension. Default: 96
-        depths (tuple(int)): Depth of each Swin Transformer layer.
-        num_heads (tuple(int)): Number of attention heads in different layers.
-        window_size (int): Window size. Default: 7
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
-        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
-        drop_rate (float): Dropout rate. Default: 0
-        attn_drop_rate (float): Attention dropout rate. Default: 0
-        drop_path_rate (float): Stochastic depth rate. Default: 0.1
-        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
-        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
-        patch_norm (bool): If True, add normalization after patch embedding. Default: True
-        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False
-        upscale: Upscale factor. 2/3/4/8 for image SR, 1 for denoising and compression artifact reduction
-        img_range: Image range. 1. or 255.
-        upsampler: The reconstruction module. 'pixelshuffle'/'pixelshuffledirect'/'nearest+conv'/None
-        resi_connection: The convolutional block before residual connection. '1conv'/'3conv'
-    """
-
-    def __init__(self, img_size=64, patch_size=1, in_chans=3,
-                 embed_dim=96, depths=(6, 6, 6, 6), num_heads=(6, 6, 6, 6),
-                 window_size=7, mlp_ratio=4., qkv_bias=True,
-                 drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1,
-                 norm_layer=nn.LayerNorm, ape=False, patch_norm=True,
-                 use_checkpoint=False, upscale=2, img_range=1., upsampler='', resi_connection='1conv',
-                 **kwargs):
-        super(Swin2SR, self).__init__()
-        num_in_ch = in_chans
-        num_out_ch = in_chans
-        num_feat = 64
-        self.img_range = img_range
-        if in_chans == 3:
-            rgb_mean = (0.4488, 0.4371, 0.4040)
-            self.mean = torch.Tensor(rgb_mean).view(1, 3, 1, 1)
-        else:
-            self.mean = torch.zeros(1, 1, 1, 1)
-        self.upscale = upscale
-        self.upsampler = upsampler
-        self.window_size = window_size
-
-        #####################################################################################################
-        ################################### 1, shallow feature extraction ###################################
-        self.conv_first = nn.Conv2d(num_in_ch, embed_dim, 3, 1, 1)
-
-        #####################################################################################################
-        ################################### 2, deep feature extraction ######################################
-        self.num_layers = len(depths)
-        self.embed_dim = embed_dim
-        self.ape = ape
-        self.patch_norm = patch_norm
-        self.num_features = embed_dim
-        self.mlp_ratio = mlp_ratio
-
-        # split image into non-overlapping patches
-        self.patch_embed = PatchEmbed(
-            img_size=img_size, patch_size=patch_size, in_chans=embed_dim, embed_dim=embed_dim,
-            norm_layer=norm_layer if self.patch_norm else None)
-        num_patches = self.patch_embed.num_patches
-        patches_resolution = self.patch_embed.patches_resolution
-        self.patches_resolution = patches_resolution
-
-        # merge non-overlapping patches into image
-        self.patch_unembed = PatchUnEmbed(
-            img_size=img_size, patch_size=patch_size, in_chans=embed_dim, embed_dim=embed_dim,
-            norm_layer=norm_layer if self.patch_norm else None)
-
-        # absolute position embedding
-        if self.ape:
-            self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
-            trunc_normal_(self.absolute_pos_embed, std=.02)
-
-        self.pos_drop = nn.Dropout(p=drop_rate)
-
-        # stochastic depth
-        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
-
-        # build Residual Swin Transformer blocks (RSTB)
-        self.layers = nn.ModuleList()
-        for i_layer in range(self.num_layers):
-            layer = RSTB(dim=embed_dim,
-                         input_resolution=(patches_resolution[0],
-                                           patches_resolution[1]),
-                         depth=depths[i_layer],
-                         num_heads=num_heads[i_layer],
-                         window_size=window_size,
-                         mlp_ratio=self.mlp_ratio,
-                         qkv_bias=qkv_bias,
-                         drop=drop_rate, attn_drop=attn_drop_rate,
-                         drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],  # no impact on SR results
-                         norm_layer=norm_layer,
-                         downsample=None,
-                         use_checkpoint=use_checkpoint,
-                         img_size=img_size,
-                         patch_size=patch_size,
-                         resi_connection=resi_connection
-
-                         )
-            self.layers.append(layer)
-
-        if self.upsampler == 'pixelshuffle_hf':
-            self.layers_hf = nn.ModuleList()
-            for i_layer in range(self.num_layers):
-                layer = RSTB(dim=embed_dim,
-                             input_resolution=(patches_resolution[0],
-                                               patches_resolution[1]),
-                             depth=depths[i_layer],
-                             num_heads=num_heads[i_layer],
-                             window_size=window_size,
-                             mlp_ratio=self.mlp_ratio,
-                             qkv_bias=qkv_bias,
-                             drop=drop_rate, attn_drop=attn_drop_rate,
-                             drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],  # no impact on SR results
-                             norm_layer=norm_layer,
-                             downsample=None,
-                             use_checkpoint=use_checkpoint,
-                             img_size=img_size,
-                             patch_size=patch_size,
-                             resi_connection=resi_connection
-
-                             )
-                self.layers_hf.append(layer)
-
-        self.norm = norm_layer(self.num_features)
-
-        # build the last conv layer in deep feature extraction
-        if resi_connection == '1conv':
-            self.conv_after_body = nn.Conv2d(embed_dim, embed_dim, 3, 1, 1)
-        elif resi_connection == '3conv':
-            # to save parameters and memory
-            self.conv_after_body = nn.Sequential(nn.Conv2d(embed_dim, embed_dim // 4, 3, 1, 1),
-                                                 nn.LeakyReLU(negative_slope=0.2, inplace=True),
-                                                 nn.Conv2d(embed_dim // 4, embed_dim // 4, 1, 1, 0),
-                                                 nn.LeakyReLU(negative_slope=0.2, inplace=True),
-                                                 nn.Conv2d(embed_dim // 4, embed_dim, 3, 1, 1))
-
-        #####################################################################################################
-        ################################ 3, high quality image reconstruction ################################
-        if self.upsampler == 'pixelshuffle':
-            # for classical SR
-            self.conv_before_upsample = nn.Sequential(nn.Conv2d(embed_dim, num_feat, 3, 1, 1),
-                                                      nn.LeakyReLU(inplace=True))
-            self.upsample = Upsample(upscale, num_feat)
-            self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
-        elif self.upsampler == 'pixelshuffle_aux':
-            self.conv_bicubic = nn.Conv2d(num_in_ch, num_feat, 3, 1, 1)
-            self.conv_before_upsample = nn.Sequential(
-                nn.Conv2d(embed_dim, num_feat, 3, 1, 1),
-                nn.LeakyReLU(inplace=True))
-            self.conv_aux = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
-            self.conv_after_aux = nn.Sequential(
-                nn.Conv2d(3, num_feat, 3, 1, 1),
-                nn.LeakyReLU(inplace=True))
-            self.upsample = Upsample(upscale, num_feat)
-            self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
-
-        elif self.upsampler == 'pixelshuffle_hf':
-            self.conv_before_upsample = nn.Sequential(nn.Conv2d(embed_dim, num_feat, 3, 1, 1),
-                                                      nn.LeakyReLU(inplace=True))
-            self.upsample = Upsample(upscale, num_feat)
-            self.upsample_hf = Upsample_hf(upscale, num_feat)
-            self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
-            self.conv_first_hf = nn.Sequential(nn.Conv2d(num_feat, embed_dim, 3, 1, 1),
-                                                      nn.LeakyReLU(inplace=True))
-            self.conv_after_body_hf = nn.Conv2d(embed_dim, embed_dim, 3, 1, 1)
-            self.conv_before_upsample_hf = nn.Sequential(
-                nn.Conv2d(embed_dim, num_feat, 3, 1, 1),
-                nn.LeakyReLU(inplace=True))
-            self.conv_last_hf = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
-
-        elif self.upsampler == 'pixelshuffledirect':
-            # for lightweight SR (to save parameters)
-            self.upsample = UpsampleOneStep(upscale, embed_dim, num_out_ch,
-                                            (patches_resolution[0], patches_resolution[1]))
-        elif self.upsampler == 'nearest+conv':
-            # for real-world SR (fewer artifacts)
-            assert self.upscale == 4, 'only support x4 now.'
-            self.conv_before_upsample = nn.Sequential(nn.Conv2d(embed_dim, num_feat, 3, 1, 1),
-                                                      nn.LeakyReLU(inplace=True))
-            self.conv_up1 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
-            self.conv_up2 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
-            self.conv_hr = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
-            self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
-            self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
-        else:
-            # for image denoising and JPEG compression artifact reduction
-            self.conv_last = nn.Conv2d(embed_dim, num_out_ch, 3, 1, 1)
-
-        self.apply(self._init_weights)
-
-    def _init_weights(self, m):
-        if isinstance(m, nn.Linear):
-            trunc_normal_(m.weight, std=.02)
-            if isinstance(m, nn.Linear) and m.bias is not None:
-                nn.init.constant_(m.bias, 0)
-        elif isinstance(m, nn.LayerNorm):
-            nn.init.constant_(m.bias, 0)
-            nn.init.constant_(m.weight, 1.0)
-
-    @torch.jit.ignore
-    def no_weight_decay(self):
-        return {'absolute_pos_embed'}
-
-    @torch.jit.ignore
-    def no_weight_decay_keywords(self):
-        return {'relative_position_bias_table'}
-
-    def check_image_size(self, x):
-        _, _, h, w = x.size()
-        mod_pad_h = (self.window_size - h % self.window_size) % self.window_size
-        mod_pad_w = (self.window_size - w % self.window_size) % self.window_size
-        x = F.pad(x, (0, mod_pad_w, 0, mod_pad_h), 'reflect')
-        return x
-
-    def forward_features(self, x):
-        x_size = (x.shape[2], x.shape[3])
-        x = self.patch_embed(x)
-        if self.ape:
-            x = x + self.absolute_pos_embed
-        x = self.pos_drop(x)
-
-        for layer in self.layers:
-            x = layer(x, x_size)
-
-        x = self.norm(x)  # B L C
-        x = self.patch_unembed(x, x_size)
-
-        return x
-
-    def forward_features_hf(self, x):
-        x_size = (x.shape[2], x.shape[3])
-        x = self.patch_embed(x)
-        if self.ape:
-            x = x + self.absolute_pos_embed
-        x = self.pos_drop(x)
-
-        for layer in self.layers_hf:
-            x = layer(x, x_size)
-
-        x = self.norm(x)  # B L C
-        x = self.patch_unembed(x, x_size)
-
-        return x
-
-    def forward(self, x):
-        H, W = x.shape[2:]
-        x = self.check_image_size(x)
-
-        self.mean = self.mean.type_as(x)
-        x = (x - self.mean) * self.img_range
-
-        if self.upsampler == 'pixelshuffle':
-            # for classical SR
-            x = self.conv_first(x)
-            x = self.conv_after_body(self.forward_features(x)) + x
-            x = self.conv_before_upsample(x)
-            x = self.conv_last(self.upsample(x))
-        elif self.upsampler == 'pixelshuffle_aux':
-            bicubic = F.interpolate(x, size=(H * self.upscale, W * self.upscale), mode='bicubic', align_corners=False)
-            bicubic = self.conv_bicubic(bicubic)
-            x = self.conv_first(x)
-            x = self.conv_after_body(self.forward_features(x)) + x
-            x = self.conv_before_upsample(x)
-            aux = self.conv_aux(x) # b, 3, LR_H, LR_W
-            x = self.conv_after_aux(aux)
-            x = self.upsample(x)[:, :, :H * self.upscale, :W * self.upscale] + bicubic[:, :, :H * self.upscale, :W * self.upscale]
-            x = self.conv_last(x)
-            aux = aux / self.img_range + self.mean
-        elif self.upsampler == 'pixelshuffle_hf':
-            # for classical SR with HF
-            x = self.conv_first(x)
-            x = self.conv_after_body(self.forward_features(x)) + x
-            x_before = self.conv_before_upsample(x)
-            x_out = self.conv_last(self.upsample(x_before))
-
-            x_hf = self.conv_first_hf(x_before)
-            x_hf = self.conv_after_body_hf(self.forward_features_hf(x_hf)) + x_hf
-            x_hf = self.conv_before_upsample_hf(x_hf)
-            x_hf = self.conv_last_hf(self.upsample_hf(x_hf))
-            x = x_out + x_hf
-            x_hf = x_hf / self.img_range + self.mean
-
-        elif self.upsampler == 'pixelshuffledirect':
-            # for lightweight SR
-            x = self.conv_first(x)
-            x = self.conv_after_body(self.forward_features(x)) + x
-            x = self.upsample(x)
-        elif self.upsampler == 'nearest+conv':
-            # for real-world SR
-            x = self.conv_first(x)
-            x = self.conv_after_body(self.forward_features(x)) + x
-            x = self.conv_before_upsample(x)
-            x = self.lrelu(self.conv_up1(torch.nn.functional.interpolate(x, scale_factor=2, mode='nearest')))
-            x = self.lrelu(self.conv_up2(torch.nn.functional.interpolate(x, scale_factor=2, mode='nearest')))
-            x = self.conv_last(self.lrelu(self.conv_hr(x)))
-        else:
-            # for image denoising and JPEG compression artifact reduction
-            x_first = self.conv_first(x)
-            res = self.conv_after_body(self.forward_features(x_first)) + x_first
-            x = x + self.conv_last(res)
-
-        x = x / self.img_range + self.mean
-        if self.upsampler == "pixelshuffle_aux":
-            return x[:, :, :H*self.upscale, :W*self.upscale], aux
-
-        elif self.upsampler == "pixelshuffle_hf":
-            x_out = x_out / self.img_range + self.mean
-            return x_out[:, :, :H*self.upscale, :W*self.upscale], x[:, :, :H*self.upscale, :W*self.upscale], x_hf[:, :, :H*self.upscale, :W*self.upscale]
-
-        else:
-            return x[:, :, :H*self.upscale, :W*self.upscale]
-
-    def flops(self):
-        flops = 0
-        H, W = self.patches_resolution
-        flops += H * W * 3 * self.embed_dim * 9
-        flops += self.patch_embed.flops()
-        for layer in self.layers:
-            flops += layer.flops()
-        flops += H * W * 3 * self.embed_dim * self.embed_dim
-        flops += self.upsample.flops()
-        return flops
-
-
-if __name__ == '__main__':
-    upscale = 4
-    window_size = 8
-    height = (1024 // upscale // window_size + 1) * window_size
-    width = (720 // upscale // window_size + 1) * window_size
-    model = Swin2SR(upscale=2, img_size=(height, width),
-                   window_size=window_size, img_range=1., depths=[6, 6, 6, 6],
-                   embed_dim=60, num_heads=[6, 6, 6, 6], mlp_ratio=2, upsampler='pixelshuffledirect')
-    print(model)
-    print(height, width, model.flops() / 1e9)
-
-    x = torch.randn((1, 3, height, width))
-    x = model(x)
-    print(x.shape)
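
Both of the deleted SwinIR variants wrap the transformer body in the same pre-/post-processing: check_image_size() reflect-pads the input up to a multiple of window_size, and forward() crops the output back to H*upscale x W*upscale. A self-contained sketch of that padding arithmetic, with made-up input sizes:

import torch
import torch.nn.functional as F

def pad_to_window_multiple(x: torch.Tensor, window_size: int) -> torch.Tensor:
    # Same arithmetic as check_image_size(): round H and W up to the next
    # multiple of window_size using reflection padding on the right/bottom.
    _, _, h, w = x.size()
    mod_pad_h = (window_size - h % window_size) % window_size
    mod_pad_w = (window_size - w % window_size) % window_size
    return F.pad(x, (0, mod_pad_w, 0, mod_pad_h), "reflect")

x = torch.rand(1, 3, 70, 50)                 # arbitrary non-multiple sizes
print(pad_to_window_multiple(x, 8).shape)    # torch.Size([1, 3, 72, 56])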

+ 0 - 276
modules/codeformer/codeformer_arch.py

@@ -1,276 +0,0 @@
-# this file is copied from CodeFormer repository. Please see comment in modules/codeformer_model.py
-
-import math
-import torch
-from torch import nn, Tensor
-import torch.nn.functional as F
-from typing import Optional
-
-from modules.codeformer.vqgan_arch import VQAutoEncoder, ResBlock
-from basicsr.utils.registry import ARCH_REGISTRY
-
-def calc_mean_std(feat, eps=1e-5):
-    """Calculate mean and std for adaptive_instance_normalization.
-
-    Args:
-        feat (Tensor): 4D tensor.
-        eps (float): A small value added to the variance to avoid
-            divide-by-zero. Default: 1e-5.
-    """
-    size = feat.size()
-    assert len(size) == 4, 'The input feature should be 4D tensor.'
-    b, c = size[:2]
-    feat_var = feat.view(b, c, -1).var(dim=2) + eps
-    feat_std = feat_var.sqrt().view(b, c, 1, 1)
-    feat_mean = feat.view(b, c, -1).mean(dim=2).view(b, c, 1, 1)
-    return feat_mean, feat_std
-
-
-def adaptive_instance_normalization(content_feat, style_feat):
-    """Adaptive instance normalization.
-
-    Adjust the reference features to have color and illumination similar
-    to those of the degraded features.
-
-    Args:
-        content_feat (Tensor): The reference feature.
-        style_feat (Tensor): The degraded features.
-    """
-    size = content_feat.size()
-    style_mean, style_std = calc_mean_std(style_feat)
-    content_mean, content_std = calc_mean_std(content_feat)
-    normalized_feat = (content_feat - content_mean.expand(size)) / content_std.expand(size)
-    return normalized_feat * style_std.expand(size) + style_mean.expand(size)
-
-
-class PositionEmbeddingSine(nn.Module):
-    """
-    This is a more standard version of the position embedding, very similar to the one
-    used by the Attention is all you need paper, generalized to work on images.
-    """
-
-    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
-        super().__init__()
-        self.num_pos_feats = num_pos_feats
-        self.temperature = temperature
-        self.normalize = normalize
-        if scale is not None and normalize is False:
-            raise ValueError("normalize should be True if scale is passed")
-        if scale is None:
-            scale = 2 * math.pi
-        self.scale = scale
-
-    def forward(self, x, mask=None):
-        if mask is None:
-            mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool)
-        not_mask = ~mask
-        y_embed = not_mask.cumsum(1, dtype=torch.float32)
-        x_embed = not_mask.cumsum(2, dtype=torch.float32)
-        if self.normalize:
-            eps = 1e-6
-            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
-            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
-
-        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
-        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
-
-        pos_x = x_embed[:, :, :, None] / dim_t
-        pos_y = y_embed[:, :, :, None] / dim_t
-        pos_x = torch.stack(
-            (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4
-        ).flatten(3)
-        pos_y = torch.stack(
-            (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4
-        ).flatten(3)
-        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
-        return pos
-
-def _get_activation_fn(activation):
-    """Return an activation function given a string"""
-    if activation == "relu":
-        return F.relu
-    if activation == "gelu":
-        return F.gelu
-    if activation == "glu":
-        return F.glu
-    raise RuntimeError(F"activation should be relu/gelu, not {activation}.")
-
-
-class TransformerSALayer(nn.Module):
-    def __init__(self, embed_dim, nhead=8, dim_mlp=2048, dropout=0.0, activation="gelu"):
-        super().__init__()
-        self.self_attn = nn.MultiheadAttention(embed_dim, nhead, dropout=dropout)
-        # Implementation of Feedforward model - MLP
-        self.linear1 = nn.Linear(embed_dim, dim_mlp)
-        self.dropout = nn.Dropout(dropout)
-        self.linear2 = nn.Linear(dim_mlp, embed_dim)
-
-        self.norm1 = nn.LayerNorm(embed_dim)
-        self.norm2 = nn.LayerNorm(embed_dim)
-        self.dropout1 = nn.Dropout(dropout)
-        self.dropout2 = nn.Dropout(dropout)
-
-        self.activation = _get_activation_fn(activation)
-
-    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
-        return tensor if pos is None else tensor + pos
-
-    def forward(self, tgt,
-                tgt_mask: Optional[Tensor] = None,
-                tgt_key_padding_mask: Optional[Tensor] = None,
-                query_pos: Optional[Tensor] = None):
-
-        # self attention
-        tgt2 = self.norm1(tgt)
-        q = k = self.with_pos_embed(tgt2, query_pos)
-        tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask,
-                              key_padding_mask=tgt_key_padding_mask)[0]
-        tgt = tgt + self.dropout1(tgt2)
-
-        # ffn
-        tgt2 = self.norm2(tgt)
-        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
-        tgt = tgt + self.dropout2(tgt2)
-        return tgt
-
-class Fuse_sft_block(nn.Module):
-    def __init__(self, in_ch, out_ch):
-        super().__init__()
-        self.encode_enc = ResBlock(2*in_ch, out_ch)
-
-        self.scale = nn.Sequential(
-                    nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1),
-                    nn.LeakyReLU(0.2, True),
-                    nn.Conv2d(out_ch, out_ch, kernel_size=3, padding=1))
-
-        self.shift = nn.Sequential(
-                    nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1),
-                    nn.LeakyReLU(0.2, True),
-                    nn.Conv2d(out_ch, out_ch, kernel_size=3, padding=1))
-
-    def forward(self, enc_feat, dec_feat, w=1):
-        enc_feat = self.encode_enc(torch.cat([enc_feat, dec_feat], dim=1))
-        scale = self.scale(enc_feat)
-        shift = self.shift(enc_feat)
-        residual = w * (dec_feat * scale + shift)
-        out = dec_feat + residual
-        return out
-
-
-@ARCH_REGISTRY.register()
-class CodeFormer(VQAutoEncoder):
-    def __init__(self, dim_embd=512, n_head=8, n_layers=9,
-                codebook_size=1024, latent_size=256,
-                connect_list=('32', '64', '128', '256'),
-                fix_modules=('quantize', 'generator')):
-        super(CodeFormer, self).__init__(512, 64, [1, 2, 2, 4, 4, 8], 'nearest', 2, [16], codebook_size)
-
-        if fix_modules is not None:
-            for module in fix_modules:
-                for param in getattr(self, module).parameters():
-                    param.requires_grad = False
-
-        self.connect_list = connect_list
-        self.n_layers = n_layers
-        self.dim_embd = dim_embd
-        self.dim_mlp = dim_embd*2
-
-        self.position_emb = nn.Parameter(torch.zeros(latent_size, self.dim_embd))
-        self.feat_emb = nn.Linear(256, self.dim_embd)
-
-        # transformer
-        self.ft_layers = nn.Sequential(*[TransformerSALayer(embed_dim=dim_embd, nhead=n_head, dim_mlp=self.dim_mlp, dropout=0.0)
-                                    for _ in range(self.n_layers)])
-
-        # logits_predict head
-        self.idx_pred_layer = nn.Sequential(
-            nn.LayerNorm(dim_embd),
-            nn.Linear(dim_embd, codebook_size, bias=False))
-
-        self.channels = {
-            '16': 512,
-            '32': 256,
-            '64': 256,
-            '128': 128,
-            '256': 128,
-            '512': 64,
-        }
-
-        # after second residual block for > 16, before attn layer for ==16
-        self.fuse_encoder_block = {'512':2, '256':5, '128':8, '64':11, '32':14, '16':18}
-        # after first residual block for > 16, before attn layer for ==16
-        self.fuse_generator_block = {'16':6, '32': 9, '64':12, '128':15, '256':18, '512':21}
-
-        # fuse_convs_dict
-        self.fuse_convs_dict = nn.ModuleDict()
-        for f_size in self.connect_list:
-            in_ch = self.channels[f_size]
-            self.fuse_convs_dict[f_size] = Fuse_sft_block(in_ch, in_ch)
-
-    def _init_weights(self, module):
-        if isinstance(module, (nn.Linear, nn.Embedding)):
-            module.weight.data.normal_(mean=0.0, std=0.02)
-            if isinstance(module, nn.Linear) and module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.LayerNorm):
-            module.bias.data.zero_()
-            module.weight.data.fill_(1.0)
-
-    def forward(self, x, w=0, detach_16=True, code_only=False, adain=False):
-        # ################### Encoder #####################
-        enc_feat_dict = {}
-        out_list = [self.fuse_encoder_block[f_size] for f_size in self.connect_list]
-        for i, block in enumerate(self.encoder.blocks):
-            x = block(x)
-            if i in out_list:
-                enc_feat_dict[str(x.shape[-1])] = x.clone()
-
-        lq_feat = x
-        # ################# Transformer ###################
-        # quant_feat, codebook_loss, quant_stats = self.quantize(lq_feat)
-        pos_emb = self.position_emb.unsqueeze(1).repeat(1,x.shape[0],1)
-        # BCHW -> BC(HW) -> (HW)BC
-        feat_emb = self.feat_emb(lq_feat.flatten(2).permute(2,0,1))
-        query_emb = feat_emb
-        # Transformer encoder
-        for layer in self.ft_layers:
-            query_emb = layer(query_emb, query_pos=pos_emb)
-
-        # output logits
-        logits = self.idx_pred_layer(query_emb) # (hw)bn
-        logits = logits.permute(1,0,2) # (hw)bn -> b(hw)n
-
-        if code_only: # for training stage II
-          # logits doesn't need softmax before cross_entropy loss
-            return logits, lq_feat
-
-        # ################# Quantization ###################
-        # if self.training:
-        #     quant_feat = torch.einsum('btn,nc->btc', [soft_one_hot, self.quantize.embedding.weight])
-        #     # b(hw)c -> bc(hw) -> bchw
-        #     quant_feat = quant_feat.permute(0,2,1).view(lq_feat.shape)
-        # ------------
-        soft_one_hot = F.softmax(logits, dim=2)
-        _, top_idx = torch.topk(soft_one_hot, 1, dim=2)
-        quant_feat = self.quantize.get_codebook_feat(top_idx, shape=[x.shape[0],16,16,256])
-        # preserve gradients
-        # quant_feat = lq_feat + (quant_feat - lq_feat).detach()
-
-        if detach_16:
-            quant_feat = quant_feat.detach() # for training stage III
-        if adain:
-            quant_feat = adaptive_instance_normalization(quant_feat, lq_feat)
-
-        # ################## Generator ####################
-        x = quant_feat
-        fuse_list = [self.fuse_generator_block[f_size] for f_size in self.connect_list]
-
-        for i, block in enumerate(self.generator.blocks):
-            x = block(x)
-            if i in fuse_list: # fuse after i-th block
-                f_size = str(x.shape[-1])
-                if w>0:
-                    x = self.fuse_convs_dict[f_size](enc_feat_dict[f_size].detach(), x, w)
-        out = x
-        # logits doesn't need softmax before cross_entropy loss
-        return out, logits, lq_feat

+ 0 - 435
modules/codeformer/vqgan_arch.py

@@ -1,435 +0,0 @@
-# this file is copied from CodeFormer repository. Please see comment in modules/codeformer_model.py
-
-'''
-VQGAN code, adapted from the original created by the Unleashing Transformers authors:
-https://github.com/samb-t/unleashing-transformers/blob/master/models/vqgan.py
-
-'''
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from basicsr.utils import get_root_logger
-from basicsr.utils.registry import ARCH_REGISTRY
-
-def normalize(in_channels):
-    return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
-
-
-@torch.jit.script
-def swish(x):
-    return x*torch.sigmoid(x)
-
-
-#  Define VQVAE classes
-class VectorQuantizer(nn.Module):
-    def __init__(self, codebook_size, emb_dim, beta):
-        super(VectorQuantizer, self).__init__()
-        self.codebook_size = codebook_size  # number of embeddings
-        self.emb_dim = emb_dim  # dimension of embedding
-        self.beta = beta  # commitment cost used in loss term, beta * ||z_e(x)-sg[e]||^2
-        self.embedding = nn.Embedding(self.codebook_size, self.emb_dim)
-        self.embedding.weight.data.uniform_(-1.0 / self.codebook_size, 1.0 / self.codebook_size)
-
-    def forward(self, z):
-        # reshape z -> (batch, height, width, channel) and flatten
-        z = z.permute(0, 2, 3, 1).contiguous()
-        z_flattened = z.view(-1, self.emb_dim)
-
-        # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
-        d = (z_flattened ** 2).sum(dim=1, keepdim=True) + (self.embedding.weight**2).sum(1) - \
-            2 * torch.matmul(z_flattened, self.embedding.weight.t())
-
-        mean_distance = torch.mean(d)
-        # find closest encodings
-        # min_encoding_indices = torch.argmin(d, dim=1).unsqueeze(1)
-        min_encoding_scores, min_encoding_indices = torch.topk(d, 1, dim=1, largest=False)
-        # [0-1], higher score, higher confidence
-        min_encoding_scores = torch.exp(-min_encoding_scores/10)
-
-        min_encodings = torch.zeros(min_encoding_indices.shape[0], self.codebook_size).to(z)
-        min_encodings.scatter_(1, min_encoding_indices, 1)
-
-        # get quantized latent vectors
-        z_q = torch.matmul(min_encodings, self.embedding.weight).view(z.shape)
-        # compute loss for embedding
-        loss = torch.mean((z_q.detach()-z)**2) + self.beta * torch.mean((z_q - z.detach()) ** 2)
-        # preserve gradients
-        z_q = z + (z_q - z).detach()
-
-        # perplexity
-        e_mean = torch.mean(min_encodings, dim=0)
-        perplexity = torch.exp(-torch.sum(e_mean * torch.log(e_mean + 1e-10)))
-        # reshape back to match original input shape
-        z_q = z_q.permute(0, 3, 1, 2).contiguous()
-
-        return z_q, loss, {
-            "perplexity": perplexity,
-            "min_encodings": min_encodings,
-            "min_encoding_indices": min_encoding_indices,
-            "min_encoding_scores": min_encoding_scores,
-            "mean_distance": mean_distance
-            }
-
-    def get_codebook_feat(self, indices, shape):
-        # input indices: batch*token_num -> (batch*token_num)*1
-        # shape: batch, height, width, channel
-        indices = indices.view(-1,1)
-        min_encodings = torch.zeros(indices.shape[0], self.codebook_size).to(indices)
-        min_encodings.scatter_(1, indices, 1)
-        # get quantized latent vectors
-        z_q = torch.matmul(min_encodings.float(), self.embedding.weight)
-
-        if shape is not None:  # reshape back to match original input shape
-            z_q = z_q.view(shape).permute(0, 3, 1, 2).contiguous()
-
-        return z_q
-
-
-class GumbelQuantizer(nn.Module):
-    def __init__(self, codebook_size, emb_dim, num_hiddens, straight_through=False, kl_weight=5e-4, temp_init=1.0):
-        super().__init__()
-        self.codebook_size = codebook_size  # number of embeddings
-        self.emb_dim = emb_dim  # dimension of embedding
-        self.straight_through = straight_through
-        self.temperature = temp_init
-        self.kl_weight = kl_weight
-        self.proj = nn.Conv2d(num_hiddens, codebook_size, 1)  # projects last encoder layer to quantized logits
-        self.embed = nn.Embedding(codebook_size, emb_dim)
-
-    def forward(self, z):
-        hard = self.straight_through if self.training else True
-
-        logits = self.proj(z)
-
-        soft_one_hot = F.gumbel_softmax(logits, tau=self.temperature, dim=1, hard=hard)
-
-        z_q = torch.einsum("b n h w, n d -> b d h w", soft_one_hot, self.embed.weight)
-
-        # + kl divergence to the prior loss
-        qy = F.softmax(logits, dim=1)
-        diff = self.kl_weight * torch.sum(qy * torch.log(qy * self.codebook_size + 1e-10), dim=1).mean()
-        min_encoding_indices = soft_one_hot.argmax(dim=1)
-
-        return z_q, diff, {
-            "min_encoding_indices": min_encoding_indices
-        }
-
-
-class Downsample(nn.Module):
-    def __init__(self, in_channels):
-        super().__init__()
-        self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
-
-    def forward(self, x):
-        pad = (0, 1, 0, 1)
-        x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
-        x = self.conv(x)
-        return x
-
-
-class Upsample(nn.Module):
-    def __init__(self, in_channels):
-        super().__init__()
-        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
-
-    def forward(self, x):
-        x = F.interpolate(x, scale_factor=2.0, mode="nearest")
-        x = self.conv(x)
-
-        return x
-
-
-class ResBlock(nn.Module):
-    def __init__(self, in_channels, out_channels=None):
-        super(ResBlock, self).__init__()
-        self.in_channels = in_channels
-        self.out_channels = in_channels if out_channels is None else out_channels
-        self.norm1 = normalize(in_channels)
-        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
-        self.norm2 = normalize(out_channels)
-        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
-        if self.in_channels != self.out_channels:
-            self.conv_out = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
-
-    def forward(self, x_in):
-        x = x_in
-        x = self.norm1(x)
-        x = swish(x)
-        x = self.conv1(x)
-        x = self.norm2(x)
-        x = swish(x)
-        x = self.conv2(x)
-        if self.in_channels != self.out_channels:
-            x_in = self.conv_out(x_in)
-
-        return x + x_in
-
-
-class AttnBlock(nn.Module):
-    def __init__(self, in_channels):
-        super().__init__()
-        self.in_channels = in_channels
-
-        self.norm = normalize(in_channels)
-        self.q = torch.nn.Conv2d(
-            in_channels,
-            in_channels,
-            kernel_size=1,
-            stride=1,
-            padding=0
-        )
-        self.k = torch.nn.Conv2d(
-            in_channels,
-            in_channels,
-            kernel_size=1,
-            stride=1,
-            padding=0
-        )
-        self.v = torch.nn.Conv2d(
-            in_channels,
-            in_channels,
-            kernel_size=1,
-            stride=1,
-            padding=0
-        )
-        self.proj_out = torch.nn.Conv2d(
-            in_channels,
-            in_channels,
-            kernel_size=1,
-            stride=1,
-            padding=0
-        )
-
-    def forward(self, x):
-        h_ = x
-        h_ = self.norm(h_)
-        q = self.q(h_)
-        k = self.k(h_)
-        v = self.v(h_)
-
-        # compute attention
-        b, c, h, w = q.shape
-        q = q.reshape(b, c, h*w)
-        q = q.permute(0, 2, 1)
-        k = k.reshape(b, c, h*w)
-        w_ = torch.bmm(q, k)
-        w_ = w_ * (int(c)**(-0.5))
-        w_ = F.softmax(w_, dim=2)
-
-        # attend to values
-        v = v.reshape(b, c, h*w)
-        w_ = w_.permute(0, 2, 1)
-        h_ = torch.bmm(v, w_)
-        h_ = h_.reshape(b, c, h, w)
-
-        h_ = self.proj_out(h_)
-
-        return x+h_
-
-
-class Encoder(nn.Module):
-    def __init__(self, in_channels, nf, emb_dim, ch_mult, num_res_blocks, resolution, attn_resolutions):
-        super().__init__()
-        self.nf = nf
-        self.num_resolutions = len(ch_mult)
-        self.num_res_blocks = num_res_blocks
-        self.resolution = resolution
-        self.attn_resolutions = attn_resolutions
-
-        curr_res = self.resolution
-        in_ch_mult = (1,)+tuple(ch_mult)
-
-        blocks = []
-        # initial convolution
-        blocks.append(nn.Conv2d(in_channels, nf, kernel_size=3, stride=1, padding=1))
-
-        # residual and downsampling blocks, with attention on smaller res (16x16)
-        for i in range(self.num_resolutions):
-            block_in_ch = nf * in_ch_mult[i]
-            block_out_ch = nf * ch_mult[i]
-            for _ in range(self.num_res_blocks):
-                blocks.append(ResBlock(block_in_ch, block_out_ch))
-                block_in_ch = block_out_ch
-                if curr_res in attn_resolutions:
-                    blocks.append(AttnBlock(block_in_ch))
-
-            if i != self.num_resolutions - 1:
-                blocks.append(Downsample(block_in_ch))
-                curr_res = curr_res // 2
-
-        # non-local attention block
-        blocks.append(ResBlock(block_in_ch, block_in_ch))
-        blocks.append(AttnBlock(block_in_ch))
-        blocks.append(ResBlock(block_in_ch, block_in_ch))
-
-        # normalise and convert to latent size
-        blocks.append(normalize(block_in_ch))
-        blocks.append(nn.Conv2d(block_in_ch, emb_dim, kernel_size=3, stride=1, padding=1))
-        self.blocks = nn.ModuleList(blocks)
-
-    def forward(self, x):
-        for block in self.blocks:
-            x = block(x)
-
-        return x
-
-
-class Generator(nn.Module):
-    def __init__(self, nf, emb_dim, ch_mult, res_blocks, img_size, attn_resolutions):
-        super().__init__()
-        self.nf = nf
-        self.ch_mult = ch_mult
-        self.num_resolutions = len(self.ch_mult)
-        self.num_res_blocks = res_blocks
-        self.resolution = img_size
-        self.attn_resolutions = attn_resolutions
-        self.in_channels = emb_dim
-        self.out_channels = 3
-        block_in_ch = self.nf * self.ch_mult[-1]
-        curr_res = self.resolution // 2 ** (self.num_resolutions-1)
-
-        blocks = []
-        # initial conv
-        blocks.append(nn.Conv2d(self.in_channels, block_in_ch, kernel_size=3, stride=1, padding=1))
-
-        # non-local attention block
-        blocks.append(ResBlock(block_in_ch, block_in_ch))
-        blocks.append(AttnBlock(block_in_ch))
-        blocks.append(ResBlock(block_in_ch, block_in_ch))
-
-        for i in reversed(range(self.num_resolutions)):
-            block_out_ch = self.nf * self.ch_mult[i]
-
-            for _ in range(self.num_res_blocks):
-                blocks.append(ResBlock(block_in_ch, block_out_ch))
-                block_in_ch = block_out_ch
-
-                if curr_res in self.attn_resolutions:
-                    blocks.append(AttnBlock(block_in_ch))
-
-            if i != 0:
-                blocks.append(Upsample(block_in_ch))
-                curr_res = curr_res * 2
-
-        blocks.append(normalize(block_in_ch))
-        blocks.append(nn.Conv2d(block_in_ch, self.out_channels, kernel_size=3, stride=1, padding=1))
-
-        self.blocks = nn.ModuleList(blocks)
-
-
-    def forward(self, x):
-        for block in self.blocks:
-            x = block(x)
-
-        return x
-
-
-@ARCH_REGISTRY.register()
-class VQAutoEncoder(nn.Module):
-    def __init__(self, img_size, nf, ch_mult, quantizer="nearest", res_blocks=2, attn_resolutions=None, codebook_size=1024, emb_dim=256,
-                beta=0.25, gumbel_straight_through=False, gumbel_kl_weight=1e-8, model_path=None):
-        super().__init__()
-        logger = get_root_logger()
-        self.in_channels = 3
-        self.nf = nf
-        self.n_blocks = res_blocks
-        self.codebook_size = codebook_size
-        self.embed_dim = emb_dim
-        self.ch_mult = ch_mult
-        self.resolution = img_size
-        self.attn_resolutions = attn_resolutions or [16]
-        self.quantizer_type = quantizer
-        self.encoder = Encoder(
-            self.in_channels,
-            self.nf,
-            self.embed_dim,
-            self.ch_mult,
-            self.n_blocks,
-            self.resolution,
-            self.attn_resolutions
-        )
-        if self.quantizer_type == "nearest":
-            self.beta = beta #0.25
-            self.quantize = VectorQuantizer(self.codebook_size, self.embed_dim, self.beta)
-        elif self.quantizer_type == "gumbel":
-            self.gumbel_num_hiddens = emb_dim
-            self.straight_through = gumbel_straight_through
-            self.kl_weight = gumbel_kl_weight
-            self.quantize = GumbelQuantizer(
-                self.codebook_size,
-                self.embed_dim,
-                self.gumbel_num_hiddens,
-                self.straight_through,
-                self.kl_weight
-            )
-        self.generator = Generator(
-            self.nf,
-            self.embed_dim,
-            self.ch_mult,
-            self.n_blocks,
-            self.resolution,
-            self.attn_resolutions
-        )
-
-        if model_path is not None:
-            chkpt = torch.load(model_path, map_location='cpu')
-            if 'params_ema' in chkpt:
-                self.load_state_dict(torch.load(model_path, map_location='cpu')['params_ema'])
-                logger.info(f'vqgan is loaded from: {model_path} [params_ema]')
-            elif 'params' in chkpt:
-                self.load_state_dict(torch.load(model_path, map_location='cpu')['params'])
-                logger.info(f'vqgan is loaded from: {model_path} [params]')
-            else:
-                raise ValueError('Wrong params!')
-
-
-    def forward(self, x):
-        x = self.encoder(x)
-        quant, codebook_loss, quant_stats = self.quantize(x)
-        x = self.generator(quant)
-        return x, codebook_loss, quant_stats
-
-
-
-# patch based discriminator
-@ARCH_REGISTRY.register()
-class VQGANDiscriminator(nn.Module):
-    def __init__(self, nc=3, ndf=64, n_layers=4, model_path=None):
-        super().__init__()
-
-        layers = [nn.Conv2d(nc, ndf, kernel_size=4, stride=2, padding=1), nn.LeakyReLU(0.2, True)]
-        ndf_mult = 1
-        ndf_mult_prev = 1
-        for n in range(1, n_layers):  # gradually increase the number of filters
-            ndf_mult_prev = ndf_mult
-            ndf_mult = min(2 ** n, 8)
-            layers += [
-                nn.Conv2d(ndf * ndf_mult_prev, ndf * ndf_mult, kernel_size=4, stride=2, padding=1, bias=False),
-                nn.BatchNorm2d(ndf * ndf_mult),
-                nn.LeakyReLU(0.2, True)
-            ]
-
-        ndf_mult_prev = ndf_mult
-        ndf_mult = min(2 ** n_layers, 8)
-
-        layers += [
-            nn.Conv2d(ndf * ndf_mult_prev, ndf * ndf_mult, kernel_size=4, stride=1, padding=1, bias=False),
-            nn.BatchNorm2d(ndf * ndf_mult),
-            nn.LeakyReLU(0.2, True)
-        ]
-
-        layers += [
-            nn.Conv2d(ndf * ndf_mult, 1, kernel_size=4, stride=1, padding=1)]  # output 1 channel prediction map
-        self.main = nn.Sequential(*layers)
-
-        if model_path is not None:
-            chkpt = torch.load(model_path, map_location='cpu')
-            if 'params_d' in chkpt:
-                self.load_state_dict(torch.load(model_path, map_location='cpu')['params_d'])
-            elif 'params' in chkpt:
-                self.load_state_dict(torch.load(model_path, map_location='cpu')['params'])
-            else:
-                raise ValueError('Wrong params!')
-
-    def forward(self, x):
-        return self.main(x)

+ 102 - 93
modules/codeformer_model.py

@@ -8,9 +8,6 @@ import modules.shared
 from modules import shared, devices, modelloader, errors
 from modules.paths import models_path
 
-# codeformer people made a choice to include modified basicsr library to their project which makes
-# it utterly impossible to use it alongside with other libraries that also use basicsr, like GFPGAN.
-# I am making a choice to include some files from codeformer to work around this issue.
 model_dir = "Codeformer"
 model_path = os.path.join(models_path, model_dir)
 model_url = 'https://github.com/sczhou/CodeFormer/releases/download/v0.1.0/codeformer.pth'
@@ -18,115 +15,127 @@ model_url = 'https://github.com/sczhou/CodeFormer/releases/download/v0.1.0/codef
 codeformer = None
 
 
-def setup_model(dirname):
-    os.makedirs(model_path, exist_ok=True)
-
-    path = modules.paths.paths.get("CodeFormer", None)
-    if path is None:
-        return
-
-    try:
+class FaceRestorerCodeFormer(modules.face_restoration.FaceRestoration):
+    def name(self):
+        return "CodeFormer"
+
+    def __init__(self, dirname):
+        self.net = None
+        self.face_helper = None
+        self.cmd_dir = dirname
+
+    def create_models(self):
+        from facexlib.detection import retinaface
+        from facexlib.utils.face_restoration_helper import FaceRestoreHelper
+
+        if self.net is not None and self.face_helper is not None:
+            self.net.to(devices.device_codeformer)
+            return self.net, self.face_helper
+        model_paths = modelloader.load_models(
+            model_path,
+            model_url,
+            self.cmd_dir,
+            download_name='codeformer-v0.1.0.pth',
+            ext_filter=['.pth'],
+        )
+
+        if len(model_paths) != 0:
+            ckpt_path = model_paths[0]
+        else:
+            print("Unable to load codeformer model.")
+            return None, None
+        net = modelloader.load_spandrel_model(ckpt_path, device=devices.device_codeformer)
+
+        if hasattr(retinaface, 'device'):
+            retinaface.device = devices.device_codeformer
+
+        face_helper = FaceRestoreHelper(
+            upscale_factor=1,
+            face_size=512,
+            crop_ratio=(1, 1),
+            det_model='retinaface_resnet50',
+            save_ext='png',
+            use_parse=True,
+            device=devices.device_codeformer,
+        )
+
+        self.net = net
+        self.face_helper = face_helper
+
+    def send_model_to(self, device):
+        self.net.to(device)
+        self.face_helper.face_det.to(device)
+        self.face_helper.face_parse.to(device)
+
+    def restore(self, np_image, w=None):
         from torchvision.transforms.functional import normalize
-        from modules.codeformer.codeformer_arch import CodeFormer
         from basicsr.utils import img2tensor, tensor2img
-        from facelib.utils.face_restoration_helper import FaceRestoreHelper
-        from facelib.detection.retinaface import retinaface
-
-        net_class = CodeFormer
-
-        class FaceRestorerCodeFormer(modules.face_restoration.FaceRestoration):
-            def name(self):
-                return "CodeFormer"
-
-            def __init__(self, dirname):
-                self.net = None
-                self.face_helper = None
-                self.cmd_dir = dirname
-
-            def create_models(self):
-
-                if self.net is not None and self.face_helper is not None:
-                    self.net.to(devices.device_codeformer)
-                    return self.net, self.face_helper
-                model_paths = modelloader.load_models(model_path, model_url, self.cmd_dir, download_name='codeformer-v0.1.0.pth', ext_filter=['.pth'])
-                if len(model_paths) != 0:
-                    ckpt_path = model_paths[0]
-                else:
-                    print("Unable to load codeformer model.")
-                    return None, None
-                net = net_class(dim_embd=512, codebook_size=1024, n_head=8, n_layers=9, connect_list=['32', '64', '128', '256']).to(devices.device_codeformer)
-                checkpoint = torch.load(ckpt_path)['params_ema']
-                net.load_state_dict(checkpoint)
-                net.eval()
+        np_image = np_image[:, :, ::-1]
 
-                if hasattr(retinaface, 'device'):
-                    retinaface.device = devices.device_codeformer
-                face_helper = FaceRestoreHelper(1, face_size=512, crop_ratio=(1, 1), det_model='retinaface_resnet50', save_ext='png', use_parse=True, device=devices.device_codeformer)
+        original_resolution = np_image.shape[0:2]
 
-                self.net = net
-                self.face_helper = face_helper
+        self.create_models()
+        if self.net is None or self.face_helper is None:
+            return np_image
 
-                return net, face_helper
+        self.send_model_to(devices.device_codeformer)
 
-            def send_model_to(self, device):
-                self.net.to(device)
-                self.face_helper.face_det.to(device)
-                self.face_helper.face_parse.to(device)
+        self.face_helper.clean_all()
+        self.face_helper.read_image(np_image)
+        self.face_helper.get_face_landmarks_5(only_center_face=False, resize=640, eye_dist_threshold=5)
+        self.face_helper.align_warp_face()
 
-            def restore(self, np_image, w=None):
-                np_image = np_image[:, :, ::-1]
+        for cropped_face in self.face_helper.cropped_faces:
+            cropped_face_t = img2tensor(cropped_face / 255., bgr2rgb=True, float32=True)
+            normalize(cropped_face_t, (0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True)
+            cropped_face_t = cropped_face_t.unsqueeze(0).to(devices.device_codeformer)
 
-                original_resolution = np_image.shape[0:2]
+            try:
+                with torch.no_grad():
+                    res = self.net(cropped_face_t, w=w if w is not None else shared.opts.code_former_weight, adain=True)
+                    if isinstance(res, tuple):
+                        output = res[0]
+                    else:
+                        output = res
+                    if not isinstance(res, torch.Tensor):
+                        raise TypeError(f"Expected torch.Tensor, got {type(res)}")
+                    restored_face = tensor2img(output, rgb2bgr=True, min_max=(-1, 1))
+                del output
+                devices.torch_gc()
+            except Exception:
+                errors.report('Failed inference for CodeFormer', exc_info=True)
+                restored_face = tensor2img(cropped_face_t, rgb2bgr=True, min_max=(-1, 1))
 
-                self.create_models()
-                if self.net is None or self.face_helper is None:
-                    return np_image
+            restored_face = restored_face.astype('uint8')
+            self.face_helper.add_restored_face(restored_face)
 
-                self.send_model_to(devices.device_codeformer)
+        self.face_helper.get_inverse_affine(None)
 
-                self.face_helper.clean_all()
-                self.face_helper.read_image(np_image)
-                self.face_helper.get_face_landmarks_5(only_center_face=False, resize=640, eye_dist_threshold=5)
-                self.face_helper.align_warp_face()
+        restored_img = self.face_helper.paste_faces_to_input_image()
+        restored_img = restored_img[:, :, ::-1]
 
-                for cropped_face in self.face_helper.cropped_faces:
-                    cropped_face_t = img2tensor(cropped_face / 255., bgr2rgb=True, float32=True)
-                    normalize(cropped_face_t, (0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True)
-                    cropped_face_t = cropped_face_t.unsqueeze(0).to(devices.device_codeformer)
+        if original_resolution != restored_img.shape[0:2]:
+            restored_img = cv2.resize(
+                restored_img,
+                (0, 0),
+                fx=original_resolution[1]/restored_img.shape[1],
+                fy=original_resolution[0]/restored_img.shape[0],
+                interpolation=cv2.INTER_LINEAR,
+            )
 
-                    try:
-                        with torch.no_grad():
-                            output = self.net(cropped_face_t, w=w if w is not None else shared.opts.code_former_weight, adain=True)[0]
-                            restored_face = tensor2img(output, rgb2bgr=True, min_max=(-1, 1))
-                        del output
-                        devices.torch_gc()
-                    except Exception:
-                        errors.report('Failed inference for CodeFormer', exc_info=True)
-                        restored_face = tensor2img(cropped_face_t, rgb2bgr=True, min_max=(-1, 1))
+        self.face_helper.clean_all()
 
-                    restored_face = restored_face.astype('uint8')
-                    self.face_helper.add_restored_face(restored_face)
+        if shared.opts.face_restoration_unload:
+            self.send_model_to(devices.cpu)
 
-                self.face_helper.get_inverse_affine(None)
+        return restored_img
 
-                restored_img = self.face_helper.paste_faces_to_input_image()
-                restored_img = restored_img[:, :, ::-1]
-
-                if original_resolution != restored_img.shape[0:2]:
-                    restored_img = cv2.resize(restored_img, (0, 0), fx=original_resolution[1]/restored_img.shape[1], fy=original_resolution[0]/restored_img.shape[0], interpolation=cv2.INTER_LINEAR)
-
-                self.face_helper.clean_all()
-
-                if shared.opts.face_restoration_unload:
-                    self.send_model_to(devices.cpu)
-
-                return restored_img
 
+def setup_model(dirname):
+    os.makedirs(model_path, exist_ok=True)
+    try:
         global codeformer
         codeformer = FaceRestorerCodeFormer(dirname)
         shared.face_restorers.append(codeformer)
-
     except Exception:
         errors.report("Error setting up CodeFormer", exc_info=True)
-
-   # sys.path = stored_sys_path
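
With this refactor, `FaceRestorerCodeFormer` becomes a plain module-level class and its weights are loaded through `modelloader.load_spandrel_model` instead of the bundled CodeFormer architecture code. A minimal usage sketch of the refactored module, assuming the web UI's `shared` state is already initialized; the model directory and input image below are placeholders, not values from this diff:

```python
import numpy as np

from modules import codeformer_model

# setup_model() registers the global `codeformer` restorer in shared.face_restorers
codeformer_model.setup_model("models/Codeformer")   # placeholder directory

# restore() takes an H x W x 3 uint8 image; w is the CodeFormer fidelity weight
image = np.zeros((512, 512, 3), dtype=np.uint8)     # placeholder input
restored = codeformer_model.codeformer.restore(image, w=0.5)
```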

+ 8 - 145
modules/esrgan_model.py

@@ -1,122 +1,9 @@
-import sys
-
-import torch
-
-import modules.esrgan_model_arch as arch
-from modules import modelloader, devices
+from modules import modelloader, devices, errors
 from modules.shared import opts
 from modules.upscaler import Upscaler, UpscalerData
 from modules.upscaler_utils import upscale_with_model
 
 
-def mod2normal(state_dict):
-    # this code is copied from https://github.com/victorca25/iNNfer
-    if 'conv_first.weight' in state_dict:
-        crt_net = {}
-        items = list(state_dict)
-
-        crt_net['model.0.weight'] = state_dict['conv_first.weight']
-        crt_net['model.0.bias'] = state_dict['conv_first.bias']
-
-        for k in items.copy():
-            if 'RDB' in k:
-                ori_k = k.replace('RRDB_trunk.', 'model.1.sub.')
-                if '.weight' in k:
-                    ori_k = ori_k.replace('.weight', '.0.weight')
-                elif '.bias' in k:
-                    ori_k = ori_k.replace('.bias', '.0.bias')
-                crt_net[ori_k] = state_dict[k]
-                items.remove(k)
-
-        crt_net['model.1.sub.23.weight'] = state_dict['trunk_conv.weight']
-        crt_net['model.1.sub.23.bias'] = state_dict['trunk_conv.bias']
-        crt_net['model.3.weight'] = state_dict['upconv1.weight']
-        crt_net['model.3.bias'] = state_dict['upconv1.bias']
-        crt_net['model.6.weight'] = state_dict['upconv2.weight']
-        crt_net['model.6.bias'] = state_dict['upconv2.bias']
-        crt_net['model.8.weight'] = state_dict['HRconv.weight']
-        crt_net['model.8.bias'] = state_dict['HRconv.bias']
-        crt_net['model.10.weight'] = state_dict['conv_last.weight']
-        crt_net['model.10.bias'] = state_dict['conv_last.bias']
-        state_dict = crt_net
-    return state_dict
-
-
-def resrgan2normal(state_dict, nb=23):
-    # this code is copied from https://github.com/victorca25/iNNfer
-    if "conv_first.weight" in state_dict and "body.0.rdb1.conv1.weight" in state_dict:
-        re8x = 0
-        crt_net = {}
-        items = list(state_dict)
-
-        crt_net['model.0.weight'] = state_dict['conv_first.weight']
-        crt_net['model.0.bias'] = state_dict['conv_first.bias']
-
-        for k in items.copy():
-            if "rdb" in k:
-                ori_k = k.replace('body.', 'model.1.sub.')
-                ori_k = ori_k.replace('.rdb', '.RDB')
-                if '.weight' in k:
-                    ori_k = ori_k.replace('.weight', '.0.weight')
-                elif '.bias' in k:
-                    ori_k = ori_k.replace('.bias', '.0.bias')
-                crt_net[ori_k] = state_dict[k]
-                items.remove(k)
-
-        crt_net[f'model.1.sub.{nb}.weight'] = state_dict['conv_body.weight']
-        crt_net[f'model.1.sub.{nb}.bias'] = state_dict['conv_body.bias']
-        crt_net['model.3.weight'] = state_dict['conv_up1.weight']
-        crt_net['model.3.bias'] = state_dict['conv_up1.bias']
-        crt_net['model.6.weight'] = state_dict['conv_up2.weight']
-        crt_net['model.6.bias'] = state_dict['conv_up2.bias']
-
-        if 'conv_up3.weight' in state_dict:
-            # modification supporting: https://github.com/ai-forever/Real-ESRGAN/blob/main/RealESRGAN/rrdbnet_arch.py
-            re8x = 3
-            crt_net['model.9.weight'] = state_dict['conv_up3.weight']
-            crt_net['model.9.bias'] = state_dict['conv_up3.bias']
-
-        crt_net[f'model.{8+re8x}.weight'] = state_dict['conv_hr.weight']
-        crt_net[f'model.{8+re8x}.bias'] = state_dict['conv_hr.bias']
-        crt_net[f'model.{10+re8x}.weight'] = state_dict['conv_last.weight']
-        crt_net[f'model.{10+re8x}.bias'] = state_dict['conv_last.bias']
-
-        state_dict = crt_net
-    return state_dict
-
-
-def infer_params(state_dict):
-    # this code is copied from https://github.com/victorca25/iNNfer
-    scale2x = 0
-    scalemin = 6
-    n_uplayer = 0
-    plus = False
-
-    for block in list(state_dict):
-        parts = block.split(".")
-        n_parts = len(parts)
-        if n_parts == 5 and parts[2] == "sub":
-            nb = int(parts[3])
-        elif n_parts == 3:
-            part_num = int(parts[1])
-            if (part_num > scalemin
-                and parts[0] == "model"
-                and parts[2] == "weight"):
-                scale2x += 1
-            if part_num > n_uplayer:
-                n_uplayer = part_num
-                out_nc = state_dict[block].shape[0]
-        if not plus and "conv1x1" in block:
-            plus = True
-
-    nf = state_dict["model.0.weight"].shape[0]
-    in_nc = state_dict["model.0.weight"].shape[1]
-    out_nc = out_nc
-    scale = 2 ** scale2x
-
-    return in_nc, out_nc, nf, nb, plus, scale
-
-
 class UpscalerESRGAN(Upscaler):
     def __init__(self, dirname):
         self.name = "ESRGAN"
@@ -142,12 +29,11 @@ class UpscalerESRGAN(Upscaler):
     def do_upscale(self, img, selected_model):
         try:
             model = self.load_model(selected_model)
-        except Exception as e:
-            print(f"Unable to load ESRGAN model {selected_model}: {e}", file=sys.stderr)
+        except Exception:
+            errors.report(f"Unable to load ESRGAN model {selected_model}", exc_info=True)
             return img
         model.to(devices.device_esrgan)
-        img = esrgan_upscale(model, img)
-        return img
+        return esrgan_upscale(model, img)
 
     def load_model(self, path: str):
         if path.startswith("http"):
@@ -160,33 +46,10 @@ class UpscalerESRGAN(Upscaler):
         else:
             filename = path
 
-        state_dict = torch.load(filename, map_location='cpu' if devices.device_esrgan.type == 'mps' else None)
-
-        if "params_ema" in state_dict:
-            state_dict = state_dict["params_ema"]
-        elif "params" in state_dict:
-            state_dict = state_dict["params"]
-            num_conv = 16 if "realesr-animevideov3" in filename else 32
-            model = arch.SRVGGNetCompact(num_in_ch=3, num_out_ch=3, num_feat=64, num_conv=num_conv, upscale=4, act_type='prelu')
-            model.load_state_dict(state_dict)
-            model.eval()
-            return model
-
-        if "body.0.rdb1.conv1.weight" in state_dict and "conv_first.weight" in state_dict:
-            nb = 6 if "RealESRGAN_x4plus_anime_6B" in filename else 23
-            state_dict = resrgan2normal(state_dict, nb)
-        elif "conv_first.weight" in state_dict:
-            state_dict = mod2normal(state_dict)
-        elif "model.0.weight" not in state_dict:
-            raise Exception("The file is not a recognized ESRGAN model.")
-
-        in_nc, out_nc, nf, nb, plus, mscale = infer_params(state_dict)
-
-        model = arch.RRDBNet(in_nc=in_nc, out_nc=out_nc, nf=nf, nb=nb, upscale=mscale, plus=plus)
-        model.load_state_dict(state_dict)
-        model.eval()
-
-        return model
+        return modelloader.load_spandrel_model(
+            filename,
+            device=('cpu' if devices.device_esrgan.type == 'mps' else None),
+        )
 
 
 def esrgan_upscale(model, img):
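
With Spandrel handling model loading, `UpscalerESRGAN.load_model` no longer needs the hand-written converters removed above (`mod2normal`, `resrgan2normal`, `infer_params`): spandrel inspects the checkpoint's state-dict keys and instantiates the matching architecture (RRDBNet-style ESRGAN, SRVGGNetCompact, and so on) itself. A rough sketch of that behaviour, assuming spandrel's model descriptor exposes `architecture` and `scale` attributes; the checkpoint path is a placeholder:

```python
import spandrel

# spandrel picks the architecture from the state-dict keys, replacing the manual
# key remapping and parameter inference that used to live in this module
descriptor = spandrel.ModelLoader().load_from_file("path/to/esrgan_checkpoint.pth")  # placeholder path
print(descriptor.architecture, descriptor.scale)  # e.g. an RRDBNet-based ESRGAN at 4x
```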

+ 0 - 465
modules/esrgan_model_arch.py

@@ -1,465 +0,0 @@
-# this file is adapted from https://github.com/victorca25/iNNfer
-
-from collections import OrderedDict
-import math
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-####################
-# RRDBNet Generator
-####################
-
-class RRDBNet(nn.Module):
-    def __init__(self, in_nc, out_nc, nf, nb, nr=3, gc=32, upscale=4, norm_type=None,
-            act_type='leakyrelu', mode='CNA', upsample_mode='upconv', convtype='Conv2D',
-            finalact=None, gaussian_noise=False, plus=False):
-        super(RRDBNet, self).__init__()
-        n_upscale = int(math.log(upscale, 2))
-        if upscale == 3:
-            n_upscale = 1
-
-        self.resrgan_scale = 0
-        if in_nc % 16 == 0:
-            self.resrgan_scale = 1
-        elif in_nc != 4 and in_nc % 4 == 0:
-            self.resrgan_scale = 2
-
-        fea_conv = conv_block(in_nc, nf, kernel_size=3, norm_type=None, act_type=None, convtype=convtype)
-        rb_blocks = [RRDB(nf, nr, kernel_size=3, gc=32, stride=1, bias=1, pad_type='zero',
-            norm_type=norm_type, act_type=act_type, mode='CNA', convtype=convtype,
-            gaussian_noise=gaussian_noise, plus=plus) for _ in range(nb)]
-        LR_conv = conv_block(nf, nf, kernel_size=3, norm_type=norm_type, act_type=None, mode=mode, convtype=convtype)
-
-        if upsample_mode == 'upconv':
-            upsample_block = upconv_block
-        elif upsample_mode == 'pixelshuffle':
-            upsample_block = pixelshuffle_block
-        else:
-            raise NotImplementedError(f'upsample mode [{upsample_mode}] is not found')
-        if upscale == 3:
-            upsampler = upsample_block(nf, nf, 3, act_type=act_type, convtype=convtype)
-        else:
-            upsampler = [upsample_block(nf, nf, act_type=act_type, convtype=convtype) for _ in range(n_upscale)]
-        HR_conv0 = conv_block(nf, nf, kernel_size=3, norm_type=None, act_type=act_type, convtype=convtype)
-        HR_conv1 = conv_block(nf, out_nc, kernel_size=3, norm_type=None, act_type=None, convtype=convtype)
-
-        outact = act(finalact) if finalact else None
-
-        self.model = sequential(fea_conv, ShortcutBlock(sequential(*rb_blocks, LR_conv)),
-            *upsampler, HR_conv0, HR_conv1, outact)
-
-    def forward(self, x, outm=None):
-        if self.resrgan_scale == 1:
-            feat = pixel_unshuffle(x, scale=4)
-        elif self.resrgan_scale == 2:
-            feat = pixel_unshuffle(x, scale=2)
-        else:
-            feat = x
-
-        return self.model(feat)
-
-
-class RRDB(nn.Module):
-    """
-    Residual in Residual Dense Block
-    (ESRGAN: Enhanced Super-Resolution Generative Adversarial Networks)
-    """
-
-    def __init__(self, nf, nr=3, kernel_size=3, gc=32, stride=1, bias=1, pad_type='zero',
-            norm_type=None, act_type='leakyrelu', mode='CNA', convtype='Conv2D',
-            spectral_norm=False, gaussian_noise=False, plus=False):
-        super(RRDB, self).__init__()
-        # This is for backwards compatibility with existing models
-        if nr == 3:
-            self.RDB1 = ResidualDenseBlock_5C(nf, kernel_size, gc, stride, bias, pad_type,
-                    norm_type, act_type, mode, convtype, spectral_norm=spectral_norm,
-                    gaussian_noise=gaussian_noise, plus=plus)
-            self.RDB2 = ResidualDenseBlock_5C(nf, kernel_size, gc, stride, bias, pad_type,
-                    norm_type, act_type, mode, convtype, spectral_norm=spectral_norm,
-                    gaussian_noise=gaussian_noise, plus=plus)
-            self.RDB3 = ResidualDenseBlock_5C(nf, kernel_size, gc, stride, bias, pad_type,
-                    norm_type, act_type, mode, convtype, spectral_norm=spectral_norm,
-                    gaussian_noise=gaussian_noise, plus=plus)
-        else:
-            RDB_list = [ResidualDenseBlock_5C(nf, kernel_size, gc, stride, bias, pad_type,
-                                              norm_type, act_type, mode, convtype, spectral_norm=spectral_norm,
-                                              gaussian_noise=gaussian_noise, plus=plus) for _ in range(nr)]
-            self.RDBs = nn.Sequential(*RDB_list)
-
-    def forward(self, x):
-        if hasattr(self, 'RDB1'):
-            out = self.RDB1(x)
-            out = self.RDB2(out)
-            out = self.RDB3(out)
-        else:
-            out = self.RDBs(x)
-        return out * 0.2 + x
-
-
-class ResidualDenseBlock_5C(nn.Module):
-    """
-    Residual Dense Block
-    The core module of paper: (Residual Dense Network for Image Super-Resolution, CVPR 18)
-    Modified options that can be used:
-        - "Partial Convolution based Padding" arXiv:1811.11718
-        - "Spectral normalization" arXiv:1802.05957
-        - "ICASSP 2020 - ESRGAN+ : Further Improving ESRGAN" N. C.
-            {Rakotonirina} and A. {Rasoanaivo}
-    """
-
-    def __init__(self, nf=64, kernel_size=3, gc=32, stride=1, bias=1, pad_type='zero',
-            norm_type=None, act_type='leakyrelu', mode='CNA', convtype='Conv2D',
-            spectral_norm=False, gaussian_noise=False, plus=False):
-        super(ResidualDenseBlock_5C, self).__init__()
-
-        self.noise = GaussianNoise() if gaussian_noise else None
-        self.conv1x1 = conv1x1(nf, gc) if plus else None
-
-        self.conv1 = conv_block(nf, gc, kernel_size, stride, bias=bias, pad_type=pad_type,
-            norm_type=norm_type, act_type=act_type, mode=mode, convtype=convtype,
-            spectral_norm=spectral_norm)
-        self.conv2 = conv_block(nf+gc, gc, kernel_size, stride, bias=bias, pad_type=pad_type,
-            norm_type=norm_type, act_type=act_type, mode=mode, convtype=convtype,
-            spectral_norm=spectral_norm)
-        self.conv3 = conv_block(nf+2*gc, gc, kernel_size, stride, bias=bias, pad_type=pad_type,
-            norm_type=norm_type, act_type=act_type, mode=mode, convtype=convtype,
-            spectral_norm=spectral_norm)
-        self.conv4 = conv_block(nf+3*gc, gc, kernel_size, stride, bias=bias, pad_type=pad_type,
-            norm_type=norm_type, act_type=act_type, mode=mode, convtype=convtype,
-            spectral_norm=spectral_norm)
-        if mode == 'CNA':
-            last_act = None
-        else:
-            last_act = act_type
-        self.conv5 = conv_block(nf+4*gc, nf, 3, stride, bias=bias, pad_type=pad_type,
-            norm_type=norm_type, act_type=last_act, mode=mode, convtype=convtype,
-            spectral_norm=spectral_norm)
-
-    def forward(self, x):
-        x1 = self.conv1(x)
-        x2 = self.conv2(torch.cat((x, x1), 1))
-        if self.conv1x1:
-            x2 = x2 + self.conv1x1(x)
-        x3 = self.conv3(torch.cat((x, x1, x2), 1))
-        x4 = self.conv4(torch.cat((x, x1, x2, x3), 1))
-        if self.conv1x1:
-            x4 = x4 + x2
-        x5 = self.conv5(torch.cat((x, x1, x2, x3, x4), 1))
-        if self.noise:
-            return self.noise(x5.mul(0.2) + x)
-        else:
-            return x5 * 0.2 + x
-
-
-####################
-# ESRGANplus
-####################
-
-class GaussianNoise(nn.Module):
-    def __init__(self, sigma=0.1, is_relative_detach=False):
-        super().__init__()
-        self.sigma = sigma
-        self.is_relative_detach = is_relative_detach
-        self.noise = torch.tensor(0, dtype=torch.float)
-
-    def forward(self, x):
-        if self.training and self.sigma != 0:
-            self.noise = self.noise.to(x.device)
-            scale = self.sigma * x.detach() if self.is_relative_detach else self.sigma * x
-            sampled_noise = self.noise.repeat(*x.size()).normal_() * scale
-            x = x + sampled_noise
-        return x
-
-def conv1x1(in_planes, out_planes, stride=1):
-    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
-
-
-####################
-# SRVGGNetCompact
-####################
-
-class SRVGGNetCompact(nn.Module):
-    """A compact VGG-style network structure for super-resolution.
-    This class is copied from https://github.com/xinntao/Real-ESRGAN
-    """
-
-    def __init__(self, num_in_ch=3, num_out_ch=3, num_feat=64, num_conv=16, upscale=4, act_type='prelu'):
-        super(SRVGGNetCompact, self).__init__()
-        self.num_in_ch = num_in_ch
-        self.num_out_ch = num_out_ch
-        self.num_feat = num_feat
-        self.num_conv = num_conv
-        self.upscale = upscale
-        self.act_type = act_type
-
-        self.body = nn.ModuleList()
-        # the first conv
-        self.body.append(nn.Conv2d(num_in_ch, num_feat, 3, 1, 1))
-        # the first activation
-        if act_type == 'relu':
-            activation = nn.ReLU(inplace=True)
-        elif act_type == 'prelu':
-            activation = nn.PReLU(num_parameters=num_feat)
-        elif act_type == 'leakyrelu':
-            activation = nn.LeakyReLU(negative_slope=0.1, inplace=True)
-        self.body.append(activation)
-
-        # the body structure
-        for _ in range(num_conv):
-            self.body.append(nn.Conv2d(num_feat, num_feat, 3, 1, 1))
-            # activation
-            if act_type == 'relu':
-                activation = nn.ReLU(inplace=True)
-            elif act_type == 'prelu':
-                activation = nn.PReLU(num_parameters=num_feat)
-            elif act_type == 'leakyrelu':
-                activation = nn.LeakyReLU(negative_slope=0.1, inplace=True)
-            self.body.append(activation)
-
-        # the last conv
-        self.body.append(nn.Conv2d(num_feat, num_out_ch * upscale * upscale, 3, 1, 1))
-        # upsample
-        self.upsampler = nn.PixelShuffle(upscale)
-
-    def forward(self, x):
-        out = x
-        for i in range(0, len(self.body)):
-            out = self.body[i](out)
-
-        out = self.upsampler(out)
-        # add the nearest upsampled image, so that the network learns the residual
-        base = F.interpolate(x, scale_factor=self.upscale, mode='nearest')
-        out += base
-        return out
-
-
-####################
-# Upsampler
-####################
-
-class Upsample(nn.Module):
-    r"""Upsamples a given multi-channel 1D (temporal), 2D (spatial) or 3D (volumetric) data.
-    The input data is assumed to be of the form
-    `minibatch x channels x [optional depth] x [optional height] x width`.
-    """
-
-    def __init__(self, size=None, scale_factor=None, mode="nearest", align_corners=None):
-        super(Upsample, self).__init__()
-        if isinstance(scale_factor, tuple):
-            self.scale_factor = tuple(float(factor) for factor in scale_factor)
-        else:
-            self.scale_factor = float(scale_factor) if scale_factor else None
-        self.mode = mode
-        self.size = size
-        self.align_corners = align_corners
-
-    def forward(self, x):
-        return nn.functional.interpolate(x, size=self.size, scale_factor=self.scale_factor, mode=self.mode, align_corners=self.align_corners)
-
-    def extra_repr(self):
-        if self.scale_factor is not None:
-            info = f'scale_factor={self.scale_factor}'
-        else:
-            info = f'size={self.size}'
-        info += f', mode={self.mode}'
-        return info
-
-
-def pixel_unshuffle(x, scale):
-    """ Pixel unshuffle.
-    Args:
-        x (Tensor): Input feature with shape (b, c, hh, hw).
-        scale (int): Downsample ratio.
-    Returns:
-        Tensor: the pixel unshuffled feature.
-    """
-    b, c, hh, hw = x.size()
-    out_channel = c * (scale**2)
-    assert hh % scale == 0 and hw % scale == 0
-    h = hh // scale
-    w = hw // scale
-    x_view = x.view(b, c, h, scale, w, scale)
-    return x_view.permute(0, 1, 3, 5, 2, 4).reshape(b, out_channel, h, w)
-
-
-def pixelshuffle_block(in_nc, out_nc, upscale_factor=2, kernel_size=3, stride=1, bias=True,
-                        pad_type='zero', norm_type=None, act_type='relu', convtype='Conv2D'):
-    """
-    Pixel shuffle layer
-    (Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional
-    Neural Network, CVPR17)
-    """
-    conv = conv_block(in_nc, out_nc * (upscale_factor ** 2), kernel_size, stride, bias=bias,
-                        pad_type=pad_type, norm_type=None, act_type=None, convtype=convtype)
-    pixel_shuffle = nn.PixelShuffle(upscale_factor)
-
-    n = norm(norm_type, out_nc) if norm_type else None
-    a = act(act_type) if act_type else None
-    return sequential(conv, pixel_shuffle, n, a)
-
-
-def upconv_block(in_nc, out_nc, upscale_factor=2, kernel_size=3, stride=1, bias=True,
-                pad_type='zero', norm_type=None, act_type='relu', mode='nearest', convtype='Conv2D'):
-    """ Upconv layer """
-    upscale_factor = (1, upscale_factor, upscale_factor) if convtype == 'Conv3D' else upscale_factor
-    upsample = Upsample(scale_factor=upscale_factor, mode=mode)
-    conv = conv_block(in_nc, out_nc, kernel_size, stride, bias=bias,
-                        pad_type=pad_type, norm_type=norm_type, act_type=act_type, convtype=convtype)
-    return sequential(upsample, conv)
-
-
-
-
-
-
-
-
-####################
-# Basic blocks
-####################
-
-
-def make_layer(basic_block, num_basic_block, **kwarg):
-    """Make layers by stacking the same blocks.
-    Args:
-        basic_block (nn.module): nn.module class for basic block. (block)
-        num_basic_block (int): number of blocks. (n_layers)
-    Returns:
-        nn.Sequential: Stacked blocks in nn.Sequential.
-    """
-    layers = []
-    for _ in range(num_basic_block):
-        layers.append(basic_block(**kwarg))
-    return nn.Sequential(*layers)
-
-
-def act(act_type, inplace=True, neg_slope=0.2, n_prelu=1, beta=1.0):
-    """ activation helper """
-    act_type = act_type.lower()
-    if act_type == 'relu':
-        layer = nn.ReLU(inplace)
-    elif act_type in ('leakyrelu', 'lrelu'):
-        layer = nn.LeakyReLU(neg_slope, inplace)
-    elif act_type == 'prelu':
-        layer = nn.PReLU(num_parameters=n_prelu, init=neg_slope)
-    elif act_type == 'tanh':  # [-1, 1] range output
-        layer = nn.Tanh()
-    elif act_type == 'sigmoid':  # [0, 1] range output
-        layer = nn.Sigmoid()
-    else:
-        raise NotImplementedError(f'activation layer [{act_type}] is not found')
-    return layer
-
-
-class Identity(nn.Module):
-    def __init__(self, *kwargs):
-        super(Identity, self).__init__()
-
-    def forward(self, x, *kwargs):
-        return x
-
-
-def norm(norm_type, nc):
-    """ Return a normalization layer """
-    norm_type = norm_type.lower()
-    if norm_type == 'batch':
-        layer = nn.BatchNorm2d(nc, affine=True)
-    elif norm_type == 'instance':
-        layer = nn.InstanceNorm2d(nc, affine=False)
-    elif norm_type == 'none':
-        def norm_layer(x): return Identity()
-    else:
-        raise NotImplementedError(f'normalization layer [{norm_type}] is not found')
-    return layer
-
-
-def pad(pad_type, padding):
-    """ padding layer helper """
-    pad_type = pad_type.lower()
-    if padding == 0:
-        return None
-    if pad_type == 'reflect':
-        layer = nn.ReflectionPad2d(padding)
-    elif pad_type == 'replicate':
-        layer = nn.ReplicationPad2d(padding)
-    elif pad_type == 'zero':
-        layer = nn.ZeroPad2d(padding)
-    else:
-        raise NotImplementedError(f'padding layer [{pad_type}] is not implemented')
-    return layer
-
-
-def get_valid_padding(kernel_size, dilation):
-    kernel_size = kernel_size + (kernel_size - 1) * (dilation - 1)
-    padding = (kernel_size - 1) // 2
-    return padding
-
-
-class ShortcutBlock(nn.Module):
-    """ Elementwise sum the output of a submodule to its input """
-    def __init__(self, submodule):
-        super(ShortcutBlock, self).__init__()
-        self.sub = submodule
-
-    def forward(self, x):
-        output = x + self.sub(x)
-        return output
-
-    def __repr__(self):
-        return 'Identity + \n|' + self.sub.__repr__().replace('\n', '\n|')
-
-
-def sequential(*args):
-    """ Flatten Sequential. It unwraps nn.Sequential. """
-    if len(args) == 1:
-        if isinstance(args[0], OrderedDict):
-            raise NotImplementedError('sequential does not support OrderedDict input.')
-        return args[0]  # No sequential is needed.
-    modules = []
-    for module in args:
-        if isinstance(module, nn.Sequential):
-            for submodule in module.children():
-                modules.append(submodule)
-        elif isinstance(module, nn.Module):
-            modules.append(module)
-    return nn.Sequential(*modules)
-
-
-def conv_block(in_nc, out_nc, kernel_size, stride=1, dilation=1, groups=1, bias=True,
-               pad_type='zero', norm_type=None, act_type='relu', mode='CNA', convtype='Conv2D',
-               spectral_norm=False):
-    """ Conv layer with padding, normalization, activation """
-    assert mode in ['CNA', 'NAC', 'CNAC'], f'Wrong conv mode [{mode}]'
-    padding = get_valid_padding(kernel_size, dilation)
-    p = pad(pad_type, padding) if pad_type and pad_type != 'zero' else None
-    padding = padding if pad_type == 'zero' else 0
-
-    if convtype=='PartialConv2D':
-        from torchvision.ops import PartialConv2d  # this is definitely not going to work, but PartialConv2d doesn't work anyway and this shuts up static analyzer
-        c = PartialConv2d(in_nc, out_nc, kernel_size=kernel_size, stride=stride, padding=padding,
-               dilation=dilation, bias=bias, groups=groups)
-    elif convtype=='DeformConv2D':
-        from torchvision.ops import DeformConv2d  # not tested
-        c = DeformConv2d(in_nc, out_nc, kernel_size=kernel_size, stride=stride, padding=padding,
-               dilation=dilation, bias=bias, groups=groups)
-    elif convtype=='Conv3D':
-        c = nn.Conv3d(in_nc, out_nc, kernel_size=kernel_size, stride=stride, padding=padding,
-                dilation=dilation, bias=bias, groups=groups)
-    else:
-        c = nn.Conv2d(in_nc, out_nc, kernel_size=kernel_size, stride=stride, padding=padding,
-                dilation=dilation, bias=bias, groups=groups)
-
-    if spectral_norm:
-        c = nn.utils.spectral_norm(c)
-
-    a = act(act_type) if act_type else None
-    if 'CNA' in mode:
-        n = norm(norm_type, out_nc) if norm_type else None
-        return sequential(p, c, n, a)
-    elif mode == 'NAC':
-        if norm_type is None and act_type is not None:
-            a = act(act_type, inplace=False)
-        n = norm(norm_type, in_nc) if norm_type else None
-        return sequential(n, a, p, c)

+ 7 - 6
modules/gfpgan_model.py

@@ -1,8 +1,5 @@
 import os
 
-import facexlib
-import gfpgan
-
 import modules.face_restoration
 from modules import paths, shared, devices, modelloader, errors
 
@@ -41,6 +38,8 @@ def gfpgann():
         print("Unable to load gfpgan model!")
         return None
 
+    import facexlib.detection.retinaface
+
     if hasattr(facexlib.detection.retinaface, 'device'):
         facexlib.detection.retinaface.device = devices.device_gfpgan
     model_file_path = model_file
@@ -81,8 +80,10 @@ gfpgan_constructor = None
 def setup_model(dirname):
     try:
         os.makedirs(model_path, exist_ok=True)
-        from gfpgan import GFPGANer
-        from facexlib import detection, parsing  # noqa: F401
+        import gfpgan
+        import facexlib.detection
+        import facexlib.parsing
+
         global user_path
         global have_gfpgan
         global gfpgan_constructor
@@ -111,7 +112,7 @@ def setup_model(dirname):
         facexlib.parsing.load_file_from_url = facex_load_file_from_url2
         user_path = dirname
         have_gfpgan = True
-        gfpgan_constructor = GFPGANer
+        gfpgan_constructor = gfpgan.GFPGANer
 
         class FaceRestorerGFPGAN(modules.face_restoration.FaceRestoration):
             def name(self):

+ 0 - 7
modules/launch_utils.py

@@ -345,13 +345,11 @@ def prepare_environment():
     stable_diffusion_repo = os.environ.get('STABLE_DIFFUSION_REPO', "https://github.com/Stability-AI/stablediffusion.git")
     stable_diffusion_xl_repo = os.environ.get('STABLE_DIFFUSION_XL_REPO', "https://github.com/Stability-AI/generative-models.git")
     k_diffusion_repo = os.environ.get('K_DIFFUSION_REPO', 'https://github.com/crowsonkb/k-diffusion.git')
-    codeformer_repo = os.environ.get('CODEFORMER_REPO', 'https://github.com/sczhou/CodeFormer.git')
     blip_repo = os.environ.get('BLIP_REPO', 'https://github.com/salesforce/BLIP.git')
 
     stable_diffusion_commit_hash = os.environ.get('STABLE_DIFFUSION_COMMIT_HASH', "cf1d67a6fd5ea1aa600c4df58e5b47da45f6bdbf")
     stable_diffusion_xl_commit_hash = os.environ.get('STABLE_DIFFUSION_XL_COMMIT_HASH', "45c443b316737a4ab6e40413d7794a7f5657c19f")
     k_diffusion_commit_hash = os.environ.get('K_DIFFUSION_COMMIT_HASH', "ab527a9a6d347f364e3d185ba6d714e22d80cb3c")
-    codeformer_commit_hash = os.environ.get('CODEFORMER_COMMIT_HASH', "c5b4593074ba6214284d6acd5f1719b6c5d739af")
     blip_commit_hash = os.environ.get('BLIP_COMMIT_HASH', "48211a1594f1321b00f14c9f7a5b4813144b2fb9")
 
     try:
@@ -408,15 +406,10 @@ def prepare_environment():
     git_clone(stable_diffusion_repo, repo_dir('stable-diffusion-stability-ai'), "Stable Diffusion", stable_diffusion_commit_hash)
     git_clone(stable_diffusion_xl_repo, repo_dir('generative-models'), "Stable Diffusion XL", stable_diffusion_xl_commit_hash)
     git_clone(k_diffusion_repo, repo_dir('k-diffusion'), "K-diffusion", k_diffusion_commit_hash)
-    git_clone(codeformer_repo, repo_dir('CodeFormer'), "CodeFormer", codeformer_commit_hash)
     git_clone(blip_repo, repo_dir('BLIP'), "BLIP", blip_commit_hash)
 
     startup_timer.record("clone repositores")
 
-    if not is_installed("lpips"):
-        run_pip(f"install -r \"{os.path.join(repo_dir('CodeFormer'), 'requirements.txt')}\"", "requirements for CodeFormer")
-        startup_timer.record("install CodeFormer requirements")
-
     if not os.path.isfile(requirements_file):
         requirements_file = os.path.join(script_path, requirements_file)
 

+ 16 - 0
modules/modelloader.py

@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import logging
 import os
 import shutil
 import importlib
@@ -10,6 +11,9 @@ from modules.upscaler import Upscaler, UpscalerLanczos, UpscalerNearest, Upscale
 from modules.paths import script_path, models_path
 
 
+logger = logging.getLogger(__name__)
+
+
 def load_file_from_url(
     url: str,
     *,
@@ -177,3 +181,15 @@ def load_upscalers():
         # Special case for UpscalerNone keeps it at the beginning of the list.
         key=lambda x: x.name.lower() if not isinstance(x.scaler, (UpscalerNone, UpscalerLanczos, UpscalerNearest)) else ""
     )
+
+
+def load_spandrel_model(path, *, device, half: bool = False, dtype=None):
+    import spandrel
+    model = spandrel.ModelLoader(device=device).load_from_file(path)
+    if half:
+        model = model.model.half()
+    if dtype:
+        model = model.model.to(dtype=dtype)
+    model.eval()
+    logger.debug("Loaded %s from %s (device=%s, half=%s, dtype=%s)", model, path, device, half, dtype)
+    return model
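
The new load_spandrel_model helper is what the upscaler code elsewhere in this commit calls into. Note that when half or dtype is set it rebinds model to the underlying torch module (model.model.half() / model.model.to(...)), so the caller receives the bare module rather than the Spandrel descriptor in that case, and passing both flags together would then look up .model on the plain module. A rough usage sketch (the checkpoint path is a placeholder):

    from modules import devices, modelloader

    model = modelloader.load_spandrel_model(
        "models/ESRGAN/4x_foo.pth",               # placeholder checkpoint path
        device=devices.get_device_for("esrgan"),  # or any torch.device
        half=False,
    )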

+ 0 - 1
modules/paths.py

@@ -38,7 +38,6 @@ mute_sdxl_imports()
 path_dirs = [
     (sd_path, 'ldm', 'Stable Diffusion', []),
     (os.path.join(sd_path, '../generative-models'), 'sgm', 'Stable Diffusion XL', ["sgm"]),
-    (os.path.join(sd_path, '../CodeFormer'), 'inference_codeformer.py', 'CodeFormer', []),
     (os.path.join(sd_path, '../BLIP'), 'models/blip.py', 'BLIP', []),
     (os.path.join(sd_path, '../k-diffusion'), 'k_diffusion/sampling.py', 'k_diffusion', ["atstart"]),
 ]

+ 62 - 91
modules/realesrgan_model.py

@@ -1,9 +1,6 @@
 import os
 
-import numpy as np
-from PIL import Image
-from realesrgan import RealESRGANer
-
+from modules.upscaler_utils import upscale_with_model
 from modules.upscaler import Upscaler, UpscalerData
 from modules.shared import cmd_opts, opts
 from modules import modelloader, errors
@@ -14,29 +11,20 @@ class UpscalerRealESRGAN(Upscaler):
         self.name = "RealESRGAN"
         self.user_path = path
         super().__init__()
-        try:
-            from basicsr.archs.rrdbnet_arch import RRDBNet  # noqa: F401
-            from realesrgan import RealESRGANer  # noqa: F401
-            from realesrgan.archs.srvgg_arch import SRVGGNetCompact  # noqa: F401
-            self.enable = True
-            self.scalers = []
-            scalers = self.load_models(path)
+        self.enable = True
+        self.scalers = []
+        scalers = get_realesrgan_models(self)
 
-            local_model_paths = self.find_models(ext_filter=[".pth"])
-            for scaler in scalers:
-                if scaler.local_data_path.startswith("http"):
-                    filename = modelloader.friendly_name(scaler.local_data_path)
-                    local_model_candidates = [local_model for local_model in local_model_paths if local_model.endswith(f"{filename}.pth")]
-                    if local_model_candidates:
-                        scaler.local_data_path = local_model_candidates[0]
+        local_model_paths = self.find_models(ext_filter=[".pth"])
+        for scaler in scalers:
+            if scaler.local_data_path.startswith("http"):
+                filename = modelloader.friendly_name(scaler.local_data_path)
+                local_model_candidates = [local_model for local_model in local_model_paths if local_model.endswith(f"{filename}.pth")]
+                if local_model_candidates:
+                    scaler.local_data_path = local_model_candidates[0]
 
-                if scaler.name in opts.realesrgan_enabled_models:
-                    self.scalers.append(scaler)
-
-        except Exception:
-            errors.report("Error importing Real-ESRGAN", exc_info=True)
-            self.enable = False
-            self.scalers = []
+            if scaler.name in opts.realesrgan_enabled_models:
+                self.scalers.append(scaler)
 
     def do_upscale(self, img, path):
         if not self.enable:
@@ -48,20 +36,18 @@ class UpscalerRealESRGAN(Upscaler):
             errors.report(f"Unable to load RealESRGAN model {path}", exc_info=True)
             return img
 
-        upsampler = RealESRGANer(
-            scale=info.scale,
-            model_path=info.local_data_path,
-            model=info.model(),
-            half=not cmd_opts.no_half and not cmd_opts.upcast_sampling,
-            tile=opts.ESRGAN_tile,
-            tile_pad=opts.ESRGAN_tile_overlap,
+        mod = modelloader.load_spandrel_model(
+            info.local_data_path,
             device=self.device,
+            half=(not cmd_opts.no_half and not cmd_opts.upcast_sampling),
+        )
+        return upscale_with_model(
+            mod,
+            img,
+            tile_size=opts.ESRGAN_tile,
+            tile_overlap=opts.ESRGAN_tile_overlap,
+            # TODO: `outscale`?
         )
-
-        upsampled = upsampler.enhance(np.array(img), outscale=info.scale)[0]
-
-        image = Image.fromarray(upsampled)
-        return image
 
     def load_model(self, path):
         for scaler in self.scalers:
@@ -76,58 +62,43 @@ class UpscalerRealESRGAN(Upscaler):
                 return scaler
         raise ValueError(f"Unable to find model info: {path}")
 
-    def load_models(self, _):
-        return get_realesrgan_models(self)
-
 
-def get_realesrgan_models(scaler):
-    try:
-        from basicsr.archs.rrdbnet_arch import RRDBNet
-        from realesrgan.archs.srvgg_arch import SRVGGNetCompact
-        models = [
-            UpscalerData(
-                name="R-ESRGAN General 4xV3",
-                path="https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.5.0/realesr-general-x4v3.pth",
-                scale=4,
-                upscaler=scaler,
-                model=lambda: SRVGGNetCompact(num_in_ch=3, num_out_ch=3, num_feat=64, num_conv=32, upscale=4, act_type='prelu')
-            ),
-            UpscalerData(
-                name="R-ESRGAN General WDN 4xV3",
-                path="https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.5.0/realesr-general-wdn-x4v3.pth",
-                scale=4,
-                upscaler=scaler,
-                model=lambda: SRVGGNetCompact(num_in_ch=3, num_out_ch=3, num_feat=64, num_conv=32, upscale=4, act_type='prelu')
-            ),
-            UpscalerData(
-                name="R-ESRGAN AnimeVideo",
-                path="https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.5.0/realesr-animevideov3.pth",
-                scale=4,
-                upscaler=scaler,
-                model=lambda: SRVGGNetCompact(num_in_ch=3, num_out_ch=3, num_feat=64, num_conv=16, upscale=4, act_type='prelu')
-            ),
-            UpscalerData(
-                name="R-ESRGAN 4x+",
-                path="https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth",
-                scale=4,
-                upscaler=scaler,
-                model=lambda: RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=4)
-            ),
-            UpscalerData(
-                name="R-ESRGAN 4x+ Anime6B",
-                path="https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth",
-                scale=4,
-                upscaler=scaler,
-                model=lambda: RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=6, num_grow_ch=32, scale=4)
-            ),
-            UpscalerData(
-                name="R-ESRGAN 2x+",
-                path="https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.1/RealESRGAN_x2plus.pth",
-                scale=2,
-                upscaler=scaler,
-                model=lambda: RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=2)
-            ),
-        ]
-        return models
-    except Exception:
-        errors.report("Error making Real-ESRGAN models list", exc_info=True)
+def get_realesrgan_models(scaler: UpscalerRealESRGAN):
+    return [
+        UpscalerData(
+            name="R-ESRGAN General 4xV3",
+            path="https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.5.0/realesr-general-x4v3.pth",
+            scale=4,
+            upscaler=scaler,
+        ),
+        UpscalerData(
+            name="R-ESRGAN General WDN 4xV3",
+            path="https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.5.0/realesr-general-wdn-x4v3.pth",
+            scale=4,
+            upscaler=scaler,
+        ),
+        UpscalerData(
+            name="R-ESRGAN AnimeVideo",
+            path="https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.5.0/realesr-animevideov3.pth",
+            scale=4,
+            upscaler=scaler,
+        ),
+        UpscalerData(
+            name="R-ESRGAN 4x+",
+            path="https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth",
+            scale=4,
+            upscaler=scaler,
+        ),
+        UpscalerData(
+            name="R-ESRGAN 4x+ Anime6B",
+            path="https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth",
+            scale=4,
+            upscaler=scaler,
+        ),
+        UpscalerData(
+            name="R-ESRGAN 2x+",
+            path="https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.1/RealESRGAN_x2plus.pth",
+            scale=2,
+            upscaler=scaler,
+        ),
+    ]
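
Here RealESRGANer.enhance(), which handled tiling internally, is replaced by loading the checkpoint through Spandrel and delegating to upscale_with_model from modules.upscaler_utils, reusing the existing ESRGAN_tile / ESRGAN_tile_overlap options. For orientation, a deliberately simplified sketch of what tile-based upscaling amounts to (the real helper also blends the overlapping tile borders, which is omitted here):

    from PIL import Image

    def naive_tiled_upscale(upscale_tile, img: Image.Image, scale: int, tile_size: int) -> Image.Image:
        # upscale_tile: any callable that upscales a single PIL tile by `scale`
        out = Image.new("RGB", (img.width * scale, img.height * scale))
        for y in range(0, img.height, tile_size):
            for x in range(0, img.width, tile_size):
                tile = img.crop((x, y, min(x + tile_size, img.width), min(y + tile_size, img.height)))
                out.paste(upscale_tile(tile), (x * scale, y * scale))
        return out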

+ 0 - 2
modules/sysinfo.py

@@ -26,11 +26,9 @@ environment_whitelist = {
     "OPENCLIP_PACKAGE",
     "STABLE_DIFFUSION_REPO",
     "K_DIFFUSION_REPO",
-    "CODEFORMER_REPO",
     "BLIP_REPO",
     "STABLE_DIFFUSION_COMMIT_HASH",
     "K_DIFFUSION_COMMIT_HASH",
-    "CODEFORMER_COMMIT_HASH",
     "BLIP_COMMIT_HASH",
     "COMMANDLINE_ARGS",
     "IGNORE_CMD_ARGS_ERRORS",

+ 3 - 0
modules/upscaler.py

@@ -98,6 +98,9 @@ class UpscalerData:
         self.scale = scale
         self.model = model
 
+    def __repr__(self):
+        return f"<UpscalerData name={self.name} path={self.data_path} scale={self.scale}>"
+
 
 class UpscalerNone(Upscaler):
     name = "None"
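
The added __repr__ makes UpscalerData entries readable in log output (for example via the logger.debug call added to modelloader.py above). Assuming the constructor's remaining arguments are optional and that `path` is stored as `data_path`, an entry from the Real-ESRGAN list would print roughly as:

    data = UpscalerData(
        name="R-ESRGAN 2x+",
        path="https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.1/RealESRGAN_x2plus.pth",
        scale=2,
    )
    print(data)
    # <UpscalerData name=R-ESRGAN 2x+ path=https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.1/RealESRGAN_x2plus.pth scale=2>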

+ 1 - 2
requirements.txt

@@ -6,6 +6,7 @@ basicsr
 blendmodes
 clean-fid
 einops
+facexlib
 fastapi>=0.90.1
 gfpgan
 gradio==3.41.2
@@ -20,13 +21,11 @@ open-clip-torch
 piexif
 psutil
 pytorch_lightning
-realesrgan
 requests
 resize-right
 
 safetensors
 scikit-image>=0.19
-timm
 tomesd
 torch
 torchdiffeq

+ 2 - 2
requirements_versions.txt

@@ -5,6 +5,7 @@ basicsr==1.4.2
 blendmodes==2022
 clean-fid==0.1.35
 einops==0.4.1
+facexlib==0.3.0
 fastapi==0.94.0
 gfpgan==1.3.8
 gradio==3.41.2
@@ -19,11 +20,10 @@ open-clip-torch==2.20.0
 piexif==1.1.3
 psutil==5.9.5
 pytorch_lightning==1.9.4
-realesrgan==0.3.0
 resize-right==0.0.2
 safetensors==0.3.1
 scikit-image==0.21.0
-timm==0.9.2
+spandrel==0.1.6
 tomesd==0.1.3
 torch
 torchdiffeq==0.2.3