cyclegan, progan

2026-04-10 12:33:44 +00:00 · 2021-03-11 15:50:44 +01:00
parent 91b1fd156c
commit 2c53205f12
27 changed files with 276 additions and 238 deletions
--- a/ML/Pytorch/GANs/ProGAN/model.py
+++ b/ML/Pytorch/GANs/ProGAN/model.py
@@ -22,7 +22,7 @@ so specifically the first 5 layers the channels stay the same,
 whereas when we increase the img_size (towards the later layers)
 we decrease the number of chanels by 1/2, 1/4, etc.
 """
-factors = [1, 1, 1, 1, 1/2, 1/4, 1/4, 1/8, 1/16]
+factors = [1, 1, 1, 1, 1 / 2, 1 / 4, 1 / 8, 1 / 16, 1 / 32]


 class WSConv2d(nn.Module):
@@ -31,7 +31,7 @@ class WSConv2d(nn.Module):
    Note that input is multiplied rather than changing weights
    this will have the same result.

-    Inspired by:
+    Inspired and looked at:
    https://github.com/nvnbny/progressive_growing_of_gans/blob/master/modelUtils.py
    """

@@ -39,17 +39,17 @@ class WSConv2d(nn.Module):
        self, in_channels, out_channels, kernel_size=3, stride=1, padding=1, gain=2
    ):
        super(WSConv2d, self).__init__()
-        self.conv = nn.Conv2d(
-            in_channels, out_channels, kernel_size, stride, padding
-        )
-        self.scale = (gain / (self.conv.weight[0].numel())) ** 0.5
+        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding)
+        self.scale = (gain / (in_channels * (kernel_size ** 2))) ** 0.5
+        self.bias = self.conv.bias
+        self.conv.bias = None

        # initialize conv layer
        nn.init.normal_(self.conv.weight)
-        nn.init.zeros_(self.conv.bias)
+        nn.init.zeros_(self.bias)

    def forward(self, x):
-        return self.conv(x * self.scale)
+        return self.conv(x * self.scale) + self.bias.view(1, self.bias.shape[0], 1, 1)


 class PixelNorm(nn.Module):
@@ -58,9 +58,7 @@ class PixelNorm(nn.Module):
        self.epsilon = 1e-8

    def forward(self, x):
-        return x / torch.sqrt(
-            torch.mean(x ** 2, dim=1, keepdim=True) + self.epsilon
-        )
+        return x / torch.sqrt(torch.mean(x ** 2, dim=1, keepdim=True) + self.epsilon)


 class ConvBlock(nn.Module):
@@ -81,42 +79,48 @@ class ConvBlock(nn.Module):


 class Generator(nn.Module):
-    def __init__(self, z_dim, in_channels, img_size, img_channels=3):
+    def __init__(self, z_dim, in_channels, img_channels=3):
        super(Generator, self).__init__()
-        self.prog_blocks, self.rgb_layers = nn.ModuleList([]), nn.ModuleList([])

        # initial takes 1x1 -> 4x4
        self.initial = nn.Sequential(
+            PixelNorm(),
            nn.ConvTranspose2d(z_dim, in_channels, 4, 1, 0),
            nn.LeakyReLU(0.2),
+            WSConv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1),
+            nn.LeakyReLU(0.2),
            PixelNorm(),
        )

-        # Create progression blocks and rgb layers
-        channels = in_channels
+        self.initial_rgb = WSConv2d(
+            in_channels, img_channels, kernel_size=1, stride=1, padding=0
+        )
+        self.prog_blocks, self.rgb_layers = (
+            nn.ModuleList([]),
+            nn.ModuleList([self.initial_rgb]),
+        )

-        # we need to double img for log2(img_size/4) and
-        # +1 in loop for initial 4x4
-        for idx in range(int(log2(img_size/4)) + 1):
-            conv_in = channels
-            conv_out = int(in_channels*factors[idx])
-            self.prog_blocks.append(ConvBlock(conv_in, conv_out))
-            self.rgb_layers.append(WSConv2d(conv_out, img_channels, kernel_size=1, stride=1, padding=0))
-            channels = conv_out
+        for i in range(
+            len(factors) - 1
+        ):  # -1 to prevent index error because of factors[i+1]
+            conv_in_c = int(in_channels * factors[i])
+            conv_out_c = int(in_channels * factors[i + 1])
+            self.prog_blocks.append(ConvBlock(conv_in_c, conv_out_c))
+            self.rgb_layers.append(
+                WSConv2d(conv_out_c, img_channels, kernel_size=1, stride=1, padding=0)
+            )

    def fade_in(self, alpha, upscaled, generated):
-        #assert 0 <= alpha <= 1, "Alpha not between 0 and 1"
-        #assert upscaled.shape == generated.shape
+        # alpha should be scalar within [0, 1], and upscale.shape == generated.shape
        return torch.tanh(alpha * generated + (1 - alpha) * upscaled)

    def forward(self, x, alpha, steps):
-        upscaled = self.initial(x)
-        out = self.prog_blocks[0](upscaled)
+        out = self.initial(x)

        if steps == 0:
-            return self.rgb_layers[0](out)
+            return self.initial_rgb(out)

-        for step in range(1, steps+1):
+        for step in range(steps):
            upscaled = F.interpolate(out, scale_factor=2, mode="nearest")
            out = self.prog_blocks[step](upscaled)

@@ -130,76 +134,101 @@ class Generator(nn.Module):


 class Discriminator(nn.Module):
-    def __init__(self, img_size, z_dim, in_channels, img_channels=3):
+    def __init__(self, z_dim, in_channels, img_channels=3):
        super(Discriminator, self).__init__()
        self.prog_blocks, self.rgb_layers = nn.ModuleList([]), nn.ModuleList([])
+        self.leaky = nn.LeakyReLU(0.2)

-        # Create progression blocks and rgb layers
-        channels = in_channels
-        for idx in range(int(log2(img_size/4)) + 1):
-            conv_in = int(in_channels * factors[idx])
-            conv_out = channels
-            self.rgb_layers.append(WSConv2d(img_channels, conv_in, kernel_size=1, stride=1, padding=0))
+        # here we work back ways from factors because the discriminator
+        # should be mirrored from the generator. So the first prog_block and
+        # rgb layer we append will work for input size 1024x1024, then 512->256-> etc
+        for i in range(len(factors) - 1, 0, -1):
+            conv_in = int(in_channels * factors[i])
+            conv_out = int(in_channels * factors[i - 1])
            self.prog_blocks.append(ConvBlock(conv_in, conv_out, use_pixelnorm=False))
-            channels = conv_in
+            self.rgb_layers.append(
+                WSConv2d(img_channels, conv_in, kernel_size=1, stride=1, padding=0)
+            )

-        self.avg_pool = nn.AvgPool2d(kernel_size=2, stride=2)
-        # +1 to in_channels because we concatenate from minibatch std
-        self.conv = WSConv2d(in_channels + 1, z_dim, kernel_size=4, stride=1, padding=0)
-        self.linear = nn.Linear(z_dim, 1)
+        # perhaps confusing name "initial_rgb" this is just the RGB layer for 4x4 input size
+        # did this to "mirror" the generator initial_rgb
+        self.initial_rgb = WSConv2d(
+            img_channels, in_channels, kernel_size=1, stride=1, padding=0
+        )
+        self.rgb_layers.append(self.initial_rgb)
+        self.avg_pool = nn.AvgPool2d(
+            kernel_size=2, stride=2
+        )  # down sampling using avg pool
+
+        # this is the block for 4x4 input size
+        self.final_block = nn.Sequential(
+            # +1 to in_channels because we concatenate from MiniBatch std
+            WSConv2d(in_channels + 1, in_channels, kernel_size=3, padding=1),
+            nn.LeakyReLU(0.2),
+            WSConv2d(in_channels, in_channels, kernel_size=4, padding=0, stride=1),
+            nn.LeakyReLU(0.2),
+            WSConv2d(
+                in_channels, 1, kernel_size=1, padding=0, stride=1
+            ),  # we use this instead of linear layer
+        )

    def fade_in(self, alpha, downscaled, out):
-        """Used to fade in downscaled using avgpooling and output from CNN"""
-        #assert 0 <= alpha <= 1, "Alpha needs to be between [0, 1]"
-        #assert downscaled.shape == out.shape
+        """Used to fade in downscaled using avg pooling and output from CNN"""
+        # alpha should be scalar within [0, 1], and upscale.shape == generated.shape
        return alpha * out + (1 - alpha) * downscaled

    def minibatch_std(self, x):
        batch_statistics = (
-            torch.std(x, dim=0)
-            .mean()
-            .repeat(x.shape[0], 1, x.shape[2], x.shape[3])
+            torch.std(x, dim=0).mean().repeat(x.shape[0], 1, x.shape[2], x.shape[3])
        )
+        # we take the std for each example (across all channels, and pixels) then we repeat it
+        # for a single channel and concatenate it with the image. In this way the discriminator
+        # will get information about the variation in the batch/image
        return torch.cat([x, batch_statistics], dim=1)

    def forward(self, x, alpha, steps):
-        out = self.rgb_layers[steps](x) # convert from rgb as initial step
+        # where we should start in the list of prog_blocks, maybe a bit confusing but
+        # the last is for the 4x4. So example let's say steps=1, then we should start
+        # at the second to last because input_size will be 8x8. If steps==0 we just
+        # use the final block
+        cur_step = len(self.prog_blocks) - steps

-        if steps == 0: # i.e, image is 4x4
+        # convert from rgb as initial step, this will depend on
+        # the image size (each will have it's on rgb layer)
+        out = self.leaky(self.rgb_layers[cur_step](x))
+
+        if steps == 0:  # i.e, image is 4x4
            out = self.minibatch_std(out)
-            out = self.conv(out)
-            return self.linear(out.view(-1, out.shape[1]))
+            return self.final_block(out).view(out.shape[0], -1)

-        # index steps which has the "reverse" fade_in
-        downscaled = self.rgb_layers[steps - 1](self.avg_pool(x))
-        out = self.avg_pool(self.prog_blocks[steps](out))
+        # because prog_blocks might change the channels, for down scale we use rgb_layer
+        # from previous/smaller size which in our case correlates to +1 in the indexing
+        downscaled = self.leaky(self.rgb_layers[cur_step + 1](self.avg_pool(x)))
+        out = self.avg_pool(self.prog_blocks[cur_step](out))
+
+        # the fade_in is done first between the downscaled and the input
+        # this is opposite from the generator
        out = self.fade_in(alpha, downscaled, out)

-        for step in range(steps - 1, 0, -1):
-            downscaled = self.avg_pool(out)
-            out = self.prog_blocks[step](downscaled)
+        for step in range(cur_step + 1, len(self.prog_blocks)):
+            out = self.prog_blocks[step](out)
+            out = self.avg_pool(out)

        out = self.minibatch_std(out)
-        out = self.conv(out)
-        return self.linear(out.view(-1, out.shape[1]))
+        return self.final_block(out).view(out.shape[0], -1)


 if __name__ == "__main__":
-    import time
    Z_DIM = 100
-    IN_CHANNELS = 16
-    img_size = 512
-    num_steps = int(log2(img_size / 4))
-    x = torch.randn((5, Z_DIM, 1, 1))
-    gen = Generator(Z_DIM, IN_CHANNELS, img_size=img_size)
-    disc = Discriminator(img_size, Z_DIM, IN_CHANNELS)
-    start = time.time()
-    with torch.autograd.profiler.profile(use_cuda=True) as prof:
-        z = gen(x, alpha=0.5, steps=num_steps)
-    print(prof)
-    gen_time = time.time()-start
-    t = time.time()
-    out = disc(z, 0.01, num_steps)
-    disc_time = time.time()-t
-    print(gen_time, disc_time)
-    #print(disc(z, 0.01, num_steps).shape)
+    IN_CHANNELS = 256
+    gen = Generator(Z_DIM, IN_CHANNELS, img_channels=3)
+    critic = Discriminator(Z_DIM, IN_CHANNELS, img_channels=3)
+
+    for img_size in [4, 8, 16, 32, 64, 128, 256, 512, 1024]:
+        num_steps = int(log2(img_size / 4))
+        x = torch.randn((1, Z_DIM, 1, 1))
+        z = gen(x, 0.5, steps=num_steps)
+        assert z.shape == (1, 3, img_size, img_size)
+        out = critic(z, alpha=0.5, steps=num_steps)
+        assert out.shape == (1, 1)
+        print(f"Success! At img size: {img_size}")