图像生成:从 GAN 到 Diffusion Models
1. 技术分析
1.1 图像生成技术演进
图像生成经历了从 GAN 到扩散模型的演进:
图像生成技术路线 GAN (2014) → DCGAN (2015) → StyleGAN (2018) → Diffusion Models (2020)
1.2 生成模型对比
| 模型 | 类型 | 质量 | 多样性 | 训练难度 |
|---|---|---|---|---|
| GAN | 对抗训练 | 高 | 中 | 高 |
| VAE | 变分推断 | 中 | 高 | 低 |
| Flow | 归一化流 | 中 | 高 | 中 |
| Diffusion | 扩散过程 | 极高 | 高 | 中 |
1.3 图像生成质量评估
图像生成评估指标 FID: Fréchet Inception Distance IS: Inception Score LPIPS: Learned Perceptual Image Patch Similarity
2. 核心功能实现
2.1 GAN 实现
import torch
import torch.nn as nn
import torch.nn.functional as F


class Generator(nn.Module):
    """DCGAN generator: maps a latent (B, latent_dim, 1, 1) to an image (B, channels, 64, 64).

    Four ConvTranspose2d+BatchNorm+ReLU upsampling stages (4x4 -> 32x32) followed by a
    final ConvTranspose2d + Tanh to reach 64x64 in [-1, 1].
    """

    def __init__(self, latent_dim=100, channels=3):
        super().__init__()
        # (in_ch, out_ch, stride, padding) per upsampling stage; module order
        # matches the hand-written Sequential so state-dict keys are identical.
        stages = [
            (latent_dim, 512, 1, 0),  # 1x1 -> 4x4
            (512, 256, 2, 1),         # 4x4 -> 8x8
            (256, 128, 2, 1),         # 8x8 -> 16x16
            (128, 64, 2, 1),          # 16x16 -> 32x32
        ]
        modules = []
        for in_ch, out_ch, stride, pad in stages:
            modules += [
                nn.ConvTranspose2d(in_ch, out_ch, kernel_size=4, stride=stride, padding=pad),
                nn.BatchNorm2d(out_ch),
                nn.ReLU(True),
            ]
        # Final stage: 32x32 -> 64x64, Tanh squashes to [-1, 1].
        modules += [
            nn.ConvTranspose2d(64, channels, kernel_size=4, stride=2, padding=1),
            nn.Tanh(),
        ]
        self.main = nn.Sequential(*modules)

    def forward(self, z):
        """z: (B, latent_dim, 1, 1) -> image (B, channels, 64, 64)."""
        return self.main(z)


class Discriminator(nn.Module):
    """DCGAN discriminator: maps an image (B, channels, 64, 64) to a realness score (B, 1, 1, 1).

    The first conv has no BatchNorm (standard DCGAN practice); all stages use
    LeakyReLU(0.2). Final Sigmoid yields a probability in (0, 1).
    """

    def __init__(self, channels=3):
        super().__init__()
        modules = [
            # 64x64 -> 32x32, no BatchNorm on the input stage.
            nn.Conv2d(channels, 64, kernel_size=4, stride=2, padding=1),
            nn.LeakyReLU(0.2, inplace=True),
        ]
        for in_ch, out_ch in [(64, 128), (128, 256), (256, 512)]:
            modules += [
                nn.Conv2d(in_ch, out_ch, kernel_size=4, stride=2, padding=1),
                nn.BatchNorm2d(out_ch),
                nn.LeakyReLU(0.2, inplace=True),
            ]
        # 4x4 -> 1x1 score, squashed to a probability.
        modules += [
            nn.Conv2d(512, 1, kernel_size=4, stride=1, padding=0),
            nn.Sigmoid(),
        ]
        self.main = nn.Sequential(*modules)

    def forward(self, x):
        """x: (B, channels, 64, 64) -> probability map (B, 1, 1, 1)."""
        return self.main(x)


class DCGAN(nn.Module):
    """Convenience wrapper bundling a Generator and a Discriminator."""

    def __init__(self, latent_dim=100):
        super().__init__()
        self.generator = Generator(latent_dim)
        self.discriminator = Discriminator()

    def generate(self, z):
        """Sample images from latents z of shape (B, latent_dim, 1, 1)."""
        return self.generator(z)

    def discriminate(self, x):
        """Score images x of shape (B, 3, 64, 64)."""
        return self.discriminator(x)
2.2 StyleGAN 实现
class StyleGANGenerator(nn.Module):
    """Simplified StyleGAN-like generator.

    A latent z (B, latent_dim) is mapped to a style vector by an MLP; a learned
    4x4 feature map is then modulated by the style through 8 StyleBlocks, with
    bilinear 2x upsampling after every second block (4x4 -> 64x64).

    Fix: the original never used the `channels` argument — after the last
    StyleBlock the tensor had 32 channels, so the "image" was never RGB.
    A 1x1 to_rgb convolution now projects to `channels` output channels.
    """

    def __init__(self, latent_dim=512, channels=3):
        super().__init__()
        self.latent_dim = latent_dim
        self.style_dim = 512
        self.num_layers = 8
        # z -> w style mapping (3-layer MLP).
        self.style_mapping = nn.Sequential(
            nn.Linear(latent_dim, self.style_dim),
            nn.ReLU(),
            nn.Linear(self.style_dim, self.style_dim),
            nn.ReLU(),
            nn.Linear(self.style_dim, self.style_dim),
            nn.ReLU()
        )
        # Produces the initial 4x4 feature map from a noise seed.
        self.initial_block = nn.ConvTranspose2d(512, 512, kernel_size=4, stride=1, padding=0)
        self.layers = nn.ModuleList()
        for i in range(self.num_layers):
            # Channel count halves every two blocks: 512,512,256,256,128,128,64,64 -> 32.
            in_channels = 512 // (2 ** (i // 2))
            out_channels = 512 // (2 ** ((i + 1) // 2))
            self.layers.append(StyleBlock(in_channels, out_channels))
        # FIX: project the final feature map (32 ch) to the requested image channels.
        final_channels = 512 // (2 ** (self.num_layers // 2))
        self.to_rgb = nn.Conv2d(final_channels, channels, kernel_size=1)

    def forward(self, z):
        """z: (B, latent_dim) -> image (B, channels, 64, 64)."""
        styles = self.style_mapping(z)
        # NOTE: the spatial seed is fresh random noise each call (non-deterministic
        # output for a fixed z) — this mirrors the original design.
        x = self.initial_block(torch.randn(z.size(0), 512, 1, 1, device=z.device))
        for i, layer in enumerate(self.layers):
            x = layer(x, styles)
            if i % 2 == 1:  # upsample after every pair of blocks
                x = F.interpolate(x, scale_factor=2, mode='bilinear')
        return self.to_rgb(x)


class StyleBlock(nn.Module):
    """Two 3x3 convs, each preceded by a per-channel affine style modulation.

    The style vector (B, 512) is projected to per-channel scale and bias,
    applied before each conv (crude stand-in for StyleGAN's AdaIN).
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
        self.style_scale1 = nn.Linear(512, in_channels)
        self.style_bias1 = nn.Linear(512, in_channels)
        self.style_scale2 = nn.Linear(512, out_channels)
        self.style_bias2 = nn.Linear(512, out_channels)

    def forward(self, x, style):
        """x: (B, in_channels, H, W), style: (B, 512) -> (B, out_channels, H, W)."""
        scale = self.style_scale1(style).view(-1, x.size(1), 1, 1)
        bias = self.style_bias1(style).view(-1, x.size(1), 1, 1)
        x = x * scale + bias
        x = F.leaky_relu(x, 0.2)
        x = self.conv1(x)
        scale = self.style_scale2(style).view(-1, x.size(1), 1, 1)
        bias = self.style_bias2(style).view(-1, x.size(1), 1, 1)
        x = x * scale + bias
        x = F.leaky_relu(x, 0.2)
        x = self.conv2(x)
        return x
2.3 Diffusion Model 实现
class DiffusionModel(nn.Module):
    """Small U-Net-style noise predictor for diffusion training.

    Input image (B, channels, H, W) is concatenated with a spatially-broadcast
    256-dim timestep embedding, encoded through three downsampling blocks,
    a middle conv, three upsampling blocks, and a 1x1 output conv predicting
    noise of the same shape as the input. H and W must be divisible by 8.

    Fix: the original fed the (channels + 256)-channel concatenation straight
    into DownBlock(64, 128), which expects 64 input channels and therefore
    crashed on any forward pass. An initial conv now maps channels + 256 -> 64.
    """

    def __init__(self, channels=3):
        super().__init__()
        self.channels = channels
        # Scalar timestep t -> 256-dim embedding.
        self.time_embedding = nn.Sequential(
            nn.Linear(1, 256),
            nn.ReLU(),
            nn.Linear(256, 256)
        )
        # FIX: bridge the concatenated (image + time map) channels down to 64.
        self.init_conv = nn.Conv2d(channels + 256, 64, kernel_size=3, padding=1)
        self.down_blocks = nn.ModuleList([
            DownBlock(64, 128),
            DownBlock(128, 256),
            DownBlock(256, 512)
        ])
        self.mid_block = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.up_blocks = nn.ModuleList([
            UpBlock(512, 256),
            UpBlock(256, 128),
            UpBlock(128, 64)
        ])
        self.final_conv = nn.Conv2d(64, channels, kernel_size=1)

    def forward(self, x, t):
        """x: (B, channels, H, W), t: (B,) float timesteps -> predicted noise, same shape as x."""
        t_emb = self.time_embedding(t.view(-1, 1))
        # Broadcast the embedding over the spatial grid and stack onto the image.
        t_map = t_emb.view(-1, 256, 1, 1).expand(-1, 256, x.size(2), x.size(3))
        x = torch.cat([x, t_map], dim=1)
        x = F.relu(self.init_conv(x))
        for block in self.down_blocks:
            x = block(x)
        x = F.relu(self.mid_block(x))
        for block in self.up_blocks:
            x = block(x)
        return self.final_conv(x)


class DownBlock(nn.Module):
    """Two 3x3 convs + ReLU, then a stride-2 conv halving the spatial size."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
        self.downsample = nn.Conv2d(out_channels, out_channels, kernel_size=2, stride=2)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = self.downsample(x)
        return x


class UpBlock(nn.Module):
    """Stride-2 transposed conv doubling the spatial size, then two 3x3 convs + ReLU."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.upsample = nn.ConvTranspose2d(in_channels, in_channels, kernel_size=2, stride=2)
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)

    def forward(self, x):
        x = self.upsample(x)
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        return x
3. 性能对比
3.1 图像生成模型对比
| 模型 | FID | IS | 多样性 | 训练时间 |
|---|---|---|---|---|
| DCGAN | 30 | 8 | 中 | 1周 |
| StyleGAN2 | 12 | 18 | 高 | 2周 |
| Stable Diffusion | 8 | 25 | 很高 | 1月 |
| DALL-E 2 | 6 | 28 | 极高 | - |
3.2 不同分辨率表现
| 分辨率 | DCGAN | StyleGAN | Diffusion |
|---|---|---|---|
| 64x64 | 25 | 10 | 8 |
| 128x128 | 30 | 12 | 9 |
| 256x256 | 35 | 15 | 10 |
| 512x512 | - | 18 | 12 |
3.3 训练难度对比
| 模型 | 稳定性 | 调参难度 | 显存需求 |
|---|---|---|---|
| GAN | 低 | 高 | 中 |
| VAE | 高 | 低 | 低 |
| Diffusion | 高 | 中 | 高 |
4. 最佳实践
4.1 图像生成模型选择
def select_generator(task_type, constraints):
    """Pick a generator class instance from a constraints dict.

    Priority: 'speed' beats 'quality'; default is StyleGAN2.
    NOTE(review): `task_type` is currently unused — kept for interface
    compatibility; confirm whether it should influence the choice.
    StableDiffusion and StyleGAN2 are presumably defined elsewhere in
    the project — verify they are importable here.
    """
    if constraints.get('speed', False):
        return DCGAN()
    elif constraints.get('quality', False):
        return StableDiffusion()
    else:
        return StyleGAN2()


class GeneratorFactory:
    """Factory mapping a config 'type' string to a generator instance."""

    @staticmethod
    def create(config):
        """Build a generator from config['type'] ('gan' | 'stylegan' | 'diffusion').

        Raises:
            ValueError: for an unknown type. (Fix: the original silently
            returned None, deferring the failure to the first use site.)
        """
        if config['type'] == 'gan':
            return DCGAN()
        elif config['type'] == 'stylegan':
            return StyleGANGenerator()
        elif config['type'] == 'diffusion':
            return DiffusionModel()
        raise ValueError(f"unknown generator type: {config['type']!r}")
4.2 图像生成训练流程
class GANTrainer:
    """Alternating GAN training: one discriminator step, then one generator step."""

    def __init__(self, generator, discriminator, g_optimizer, d_optimizer, loss_fn):
        self.generator = generator
        self.discriminator = discriminator
        self.g_optimizer = g_optimizer
        self.d_optimizer = d_optimizer
        self.loss_fn = loss_fn  # e.g. BCE: loss_fn(pred, target)

    def train_step(self, real_images):
        """Run one D step and one G step on a batch; returns (d_loss, g_loss) floats."""
        batch_size = real_images.size(0)
        # Fix: sample latents on the same device as the data (the original
        # used the CPU default and broke GPU training).
        z = torch.randn(batch_size, 100, 1, 1, device=real_images.device)

        # --- Discriminator: real -> 1, fake (detached) -> 0 ---
        self.d_optimizer.zero_grad()
        real_pred = self.discriminator(real_images)
        real_loss = self.loss_fn(real_pred, torch.ones_like(real_pred))
        fake_images = self.generator(z)
        # detach() so the D step does not backprop into the generator
        fake_pred = self.discriminator(fake_images.detach())
        fake_loss = self.loss_fn(fake_pred, torch.zeros_like(fake_pred))
        d_loss = (real_loss + fake_loss) / 2
        d_loss.backward()
        self.d_optimizer.step()

        # --- Generator: make the (updated) discriminator say 1 on fakes ---
        self.g_optimizer.zero_grad()
        fake_pred = self.discriminator(fake_images)
        g_loss = self.loss_fn(fake_pred, torch.ones_like(fake_pred))
        g_loss.backward()
        self.g_optimizer.step()

        return d_loss.item(), g_loss.item()


class DiffusionTrainer:
    """Noise-prediction training loop for a diffusion model."""

    def __init__(self, model, optimizer, scheduler):
        self.model = model          # callable model(noisy_images, t) -> predicted noise
        self.optimizer = optimizer
        self.scheduler = scheduler  # stepped once per train_step

    def train_step(self, images):
        """One MSE noise-prediction step on a batch; returns the loss as a float."""
        self.optimizer.zero_grad()
        # Fix: sample timesteps on the same device as the images.
        t = torch.randint(0, 1000, (images.size(0),), device=images.device).float()
        noise = torch.randn_like(images)
        noisy_images = self._add_noise(images, t, noise)
        noise_pred = self.model(noisy_images, t)
        loss = F.mse_loss(noise_pred, noise)
        loss.backward()
        self.optimizer.step()
        self.scheduler.step()
        return loss.item()

    def _add_noise(self, x, t, noise):
        """Mix x with noise at level beta(t): sqrt(1-beta)*x + sqrt(beta)*noise.

        Fix: beta has shape (B,); it must be reshaped to (B, 1, 1, 1) to
        broadcast over (B, C, H, W) — the original's (B,) * (B, C, H, W)
        product raised a shape error.
        NOTE(review): this uses the per-step beta directly as the noise level;
        standard DDPM uses the cumulative product alpha_bar(t). Kept as a
        simplified schedule to preserve the original design — confirm intent.
        """
        beta = self._beta(t).view(-1, 1, 1, 1)
        return torch.sqrt(1 - beta) * x + torch.sqrt(beta) * noise

    def _beta(self, t):
        """Linear beta schedule from 1e-4 (t=0) to ~0.02 (t=1000)."""
        return 0.0001 + 0.02 * t / 1000
5. 总结
图像生成技术取得巨大进步:
- GAN:对抗训练,生成质量高但训练不稳定
- StyleGAN:风格控制能力强,生成高清图像
- Diffusion Models:当前最先进,质量和多样性都很好
- 选择建议:根据需求和资源选择合适模型
对比数据如下:
- Diffusion Models 在 FID 和多样性上领先
- GAN 训练难度最高但推理速度最快
- StyleGAN 在人脸生成上表现出色
- 推荐在大多数场景下使用 Diffusion Models