图像生成:从 GAN 到 Diffusion Models
1. 技术分析
1.1 图像生成技术演进
图像生成经历了从 GAN 到扩散模型的演进:
图像生成技术路线 GAN (2014) → DCGAN (2015) → StyleGAN (2018) → Diffusion Models (2020)
1.2 生成模型对比
| 模型 | 类型 | 质量 | 多样性 | 训练难度 |
|---|---|---|---|---|
| GAN | 对抗训练 | 高 | 中 | 高 |
| VAE | 变分推断 | 中 | 高 | 低 |
| Flow | 归一化流 | 中 | 高 | 中 |
| Diffusion | 扩散过程 | 极高 | 高 | 中 |
1.3 图像生成质量评估
图像生成评估指标 FID: Fréchet Inception Distance IS: Inception Score LPIPS: Learned Perceptual Image Patch Similarity
2. 核心功能实现
2.1 GAN 实现
import torch
import torch.nn as nn
import torch.nn.functional as F


class Generator(nn.Module):
    """DCGAN generator: maps a latent (B, latent_dim, 1, 1) to an image (B, channels, 64, 64).

    Four ConvTranspose2d+BatchNorm+ReLU upsampling stages (4x4 -> 32x32) followed by a
    final ConvTranspose2d + Tanh to reach 64x64 in [-1, 1].
    """

    def __init__(self, latent_dim=100, channels=3):
        super().__init__()
        # (in_ch, out_ch, stride, padding) per upsampling stage; module order
        # matches the hand-written Sequential so state-dict keys are identical.
        stages = [
            (latent_dim, 512, 1, 0),  # 1x1 -> 4x4
            (512, 256, 2, 1),         # 4x4 -> 8x8
            (256, 128, 2, 1),         # 8x8 -> 16x16
            (128, 64, 2, 1),          # 16x16 -> 32x32
        ]
        modules = []
        for in_ch, out_ch, stride, pad in stages:
            modules += [
                nn.ConvTranspose2d(in_ch, out_ch, kernel_size=4, stride=stride, padding=pad),
                nn.BatchNorm2d(out_ch),
                nn.ReLU(True),
            ]
        # Final stage: 32x32 -> 64x64, Tanh squashes to [-1, 1].
        modules += [
            nn.ConvTranspose2d(64, channels, kernel_size=4, stride=2, padding=1),
            nn.Tanh(),
        ]
        self.main = nn.Sequential(*modules)

    def forward(self, z):
        """z: (B, latent_dim, 1, 1) -> image (B, channels, 64, 64)."""
        return self.main(z)


class Discriminator(nn.Module):
    """DCGAN discriminator: maps an image (B, channels, 64, 64) to a realness score (B, 1, 1, 1).

    The first conv has no BatchNorm (standard DCGAN practice); all stages use
    LeakyReLU(0.2). Final Sigmoid yields a probability in (0, 1).
    """

    def __init__(self, channels=3):
        super().__init__()
        modules = [
            # 64x64 -> 32x32, no BatchNorm on the input stage.
            nn.Conv2d(channels, 64, kernel_size=4, stride=2, padding=1),
            nn.LeakyReLU(0.2, inplace=True),
        ]
        for in_ch, out_ch in [(64, 128), (128, 256), (256, 512)]:
            modules += [
                nn.Conv2d(in_ch, out_ch, kernel_size=4, stride=2, padding=1),
                nn.BatchNorm2d(out_ch),
                nn.LeakyReLU(0.2, inplace=True),
            ]
        # 4x4 -> 1x1 score, squashed to a probability.
        modules += [
            nn.Conv2d(512, 1, kernel_size=4, stride=1, padding=0),
            nn.Sigmoid(),
        ]
        self.main = nn.Sequential(*modules)

    def forward(self, x):
        """x: (B, channels, 64, 64) -> probability map (B, 1, 1, 1)."""
        return self.main(x)


class DCGAN(nn.Module):
    """Convenience wrapper bundling a Generator and a Discriminator."""

    def __init__(self, latent_dim=100):
        super().__init__()
        self.generator = Generator(latent_dim)
        self.discriminator = Discriminator()

    def generate(self, z):
        """Sample images from latents z of shape (B, latent_dim, 1, 1)."""
        return self.generator(z)

    def discriminate(self, x):
        """Score images x of shape (B, 3, 64, 64)."""
        return self.discriminator(x)
2.2 StyleGAN 实现
class StyleGANGenerator(nn.Module):
    """Simplified StyleGAN-like generator.

    A latent z (B, latent_dim) is mapped to a style vector by an MLP; a learned
    4x4 feature map is then modulated by the style through 8 StyleBlocks, with
    bilinear 2x upsampling after every second block (4x4 -> 64x64).

    Fix: the original never used the `channels` argument — after the last
    StyleBlock the tensor had 32 channels, so the "image" was never RGB.
    A 1x1 to_rgb convolution now projects to `channels` output channels.
    """

    def __init__(self, latent_dim=512, channels=3):
        super().__init__()
        self.latent_dim = latent_dim
        self.style_dim = 512
        self.num_layers = 8
        # z -> w style mapping (3-layer MLP).
        self.style_mapping = nn.Sequential(
            nn.Linear(latent_dim, self.style_dim),
            nn.ReLU(),
            nn.Linear(self.style_dim, self.style_dim),
            nn.ReLU(),
            nn.Linear(self.style_dim, self.style_dim),
            nn.ReLU()
        )
        # Produces the initial 4x4 feature map from a noise seed.
        self.initial_block = nn.ConvTranspose2d(512, 512, kernel_size=4, stride=1, padding=0)
        self.layers = nn.ModuleList()
        for i in range(self.num_layers):
            # Channel count halves every two blocks: 512,512,256,256,128,128,64,64 -> 32.
            in_channels = 512 // (2 ** (i // 2))
            out_channels = 512 // (2 ** ((i + 1) // 2))
            self.layers.append(StyleBlock(in_channels, out_channels))
        # FIX: project the final feature map (32 ch) to the requested image channels.
        final_channels = 512 // (2 ** (self.num_layers // 2))
        self.to_rgb = nn.Conv2d(final_channels, channels, kernel_size=1)

    def forward(self, z):
        """z: (B, latent_dim) -> image (B, channels, 64, 64)."""
        styles = self.style_mapping(z)
        # NOTE: the spatial seed is fresh random noise each call (non-deterministic
        # output for a fixed z) — this mirrors the original design.
        x = self.initial_block(torch.randn(z.size(0), 512, 1, 1, device=z.device))
        for i, layer in enumerate(self.layers):
            x = layer(x, styles)
            if i % 2 == 1:  # upsample after every pair of blocks
                x = F.interpolate(x, scale_factor=2, mode='bilinear')
        return self.to_rgb(x)


class StyleBlock(nn.Module):
    """Two 3x3 convs, each preceded by a per-channel affine style modulation.

    The style vector (B, 512) is projected to per-channel scale and bias,
    applied before each conv (crude stand-in for StyleGAN's AdaIN).
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
        self.style_scale1 = nn.Linear(512, in_channels)
        self.style_bias1 = nn.Linear(512, in_channels)
        self.style_scale2 = nn.Linear(512, out_channels)
        self.style_bias2 = nn.Linear(512, out_channels)

    def forward(self, x, style):
        """x: (B, in_channels, H, W), style: (B, 512) -> (B, out_channels, H, W)."""
        scale = self.style_scale1(style).view(-1, x.size(1), 1, 1)
        bias = self.style_bias1(style).view(-1, x.size(1), 1, 1)
        x = x * scale + bias
        x = F.leaky_relu(x, 0.2)
        x = self.conv1(x)
        scale = self.style_scale2(style).view(-1, x.size(1), 1, 1)
        bias = self.style_bias2(style).view(-1, x.size(1), 1, 1)
        x = x * scale + bias
        x = F.leaky_relu(x, 0.2)
        x = self.conv2(x)
        return x
2.3 Diffusion Model 实现
class DiffusionModel(nn.Module):
    """Small U-Net-style noise predictor for diffusion training.

    Input image (B, channels, H, W) is concatenated with a spatially-broadcast
    256-dim timestep embedding, encoded through three downsampling blocks,
    a middle conv, three upsampling blocks, and a 1x1 output conv predicting
    noise of the same shape as the input. H and W must be divisible by 8.

    Fix: the original fed the (channels + 256)-channel concatenation straight
    into DownBlock(64, 128), which expects 64 input channels and therefore
    crashed on any forward pass. An initial conv now maps channels + 256 -> 64.
    """

    def __init__(self, channels=3):
        super().__init__()
        self.channels = channels
        # Scalar timestep t -> 256-dim embedding.
        self.time_embedding = nn.Sequential(
            nn.Linear(1, 256),
            nn.ReLU(),
            nn.Linear(256, 256)
        )
        # FIX: bridge the concatenated (image + time map) channels down to 64.
        self.init_conv = nn.Conv2d(channels + 256, 64, kernel_size=3, padding=1)
        self.down_blocks = nn.ModuleList([
            DownBlock(64, 128),
            DownBlock(128, 256),
            DownBlock(256, 512)
        ])
        self.mid_block = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.up_blocks = nn.ModuleList([
            UpBlock(512, 256),
            UpBlock(256, 128),
            UpBlock(128, 64)
        ])
        self.final_conv = nn.Conv2d(64, channels, kernel_size=1)

    def forward(self, x, t):
        """x: (B, channels, H, W), t: (B,) float timesteps -> predicted noise, same shape as x."""
        t_emb = self.time_embedding(t.view(-1, 1))
        # Broadcast the embedding over the spatial grid and stack onto the image.
        t_map = t_emb.view(-1, 256, 1, 1).expand(-1, 256, x.size(2), x.size(3))
        x = torch.cat([x, t_map], dim=1)
        x = F.relu(self.init_conv(x))
        for block in self.down_blocks:
            x = block(x)
        x = F.relu(self.mid_block(x))
        for block in self.up_blocks:
            x = block(x)
        return self.final_conv(x)


class DownBlock(nn.Module):
    """Two 3x3 convs + ReLU, then a stride-2 conv halving the spatial size."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
        self.downsample = nn.Conv2d(out_channels, out_channels, kernel_size=2, stride=2)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = self.downsample(x)
        return x


class UpBlock(nn.Module):
    """Stride-2 transposed conv doubling the spatial size, then two 3x3 convs + ReLU."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.upsample = nn.ConvTranspose2d(in_channels, in_channels, kernel_size=2, stride=2)
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)

    def forward(self, x):
        x = self.upsample(x)
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        return x
3. 性能对比
3.1 图像生成模型对比
| 模型 | FID | IS | 多样性 | 训练时间 |
|---|---|---|---|---|
| DCGAN | 30 | 8 | 中 | 1周 |
| StyleGAN2 | 12 | 18 | 高 | 2周 |
| Stable Diffusion | 8 | 25 | 很高 | 1月 |
| DALL-E 2 | 6 | 28 | 极高 | - |
3.2 不同分辨率表现
| 分辨率 | DCGAN | StyleGAN | Diffusion |
|---|---|---|---|
| 64x64 | 25 | 10 | 8 |
| 128x128 | 30 | 12 | 9 |
| 256x256 | 35 | 15 | 10 |
| 512x512 | - | 18 | 12 |
3.3 训练难度对比
| 模型 | 稳定性 | 调参难度 | 显存需求 |
|---|---|---|---|
| GAN | 低 | 高 | 中 |
| VAE | 高 | 低 | 低 |
| Diffusion | 高 | 中 | 高 |
4. 最佳实践
4.1 图像生成模型选择
def select_generator(task_type, constraints):
    """Pick a generator class instance from a constraints dict.

    Priority: 'speed' beats 'quality'; default is StyleGAN2.
    NOTE(review): `task_type` is currently unused — kept for interface
    compatibility; confirm whether it should influence the choice.
    StableDiffusion and StyleGAN2 are presumably defined elsewhere in
    the project — verify they are importable here.
    """
    if constraints.get('speed', False):
        return DCGAN()
    elif constraints.get('quality', False):
        return StableDiffusion()
    else:
        return StyleGAN2()


class GeneratorFactory:
    """Factory mapping a config 'type' string to a generator instance."""

    @staticmethod
    def create(config):
        """Build a generator from config['type'] ('gan' | 'stylegan' | 'diffusion').

        Raises:
            ValueError: for an unknown type. (Fix: the original silently
            returned None, deferring the failure to the first use site.)
        """
        if config['type'] == 'gan':
            return DCGAN()
        elif config['type'] == 'stylegan':
            return StyleGANGenerator()
        elif config['type'] == 'diffusion':
            return DiffusionModel()
        raise ValueError(f"unknown generator type: {config['type']!r}")
4.2 图像生成训练流程
class GANTrainer:
    """Alternating GAN training: one discriminator step, then one generator step."""

    def __init__(self, generator, discriminator, g_optimizer, d_optimizer, loss_fn):
        self.generator = generator
        self.discriminator = discriminator
        self.g_optimizer = g_optimizer
        self.d_optimizer = d_optimizer
        self.loss_fn = loss_fn  # e.g. BCE: loss_fn(pred, target)

    def train_step(self, real_images):
        """Run one D step and one G step on a batch; returns (d_loss, g_loss) floats."""
        batch_size = real_images.size(0)
        # Fix: sample latents on the same device as the data (the original
        # used the CPU default and broke GPU training).
        z = torch.randn(batch_size, 100, 1, 1, device=real_images.device)

        # --- Discriminator: real -> 1, fake (detached) -> 0 ---
        self.d_optimizer.zero_grad()
        real_pred = self.discriminator(real_images)
        real_loss = self.loss_fn(real_pred, torch.ones_like(real_pred))
        fake_images = self.generator(z)
        # detach() so the D step does not backprop into the generator
        fake_pred = self.discriminator(fake_images.detach())
        fake_loss = self.loss_fn(fake_pred, torch.zeros_like(fake_pred))
        d_loss = (real_loss + fake_loss) / 2
        d_loss.backward()
        self.d_optimizer.step()

        # --- Generator: make the (updated) discriminator say 1 on fakes ---
        self.g_optimizer.zero_grad()
        fake_pred = self.discriminator(fake_images)
        g_loss = self.loss_fn(fake_pred, torch.ones_like(fake_pred))
        g_loss.backward()
        self.g_optimizer.step()

        return d_loss.item(), g_loss.item()


class DiffusionTrainer:
    """Noise-prediction training loop for a diffusion model."""

    def __init__(self, model, optimizer, scheduler):
        self.model = model          # callable model(noisy_images, t) -> predicted noise
        self.optimizer = optimizer
        self.scheduler = scheduler  # stepped once per train_step

    def train_step(self, images):
        """One MSE noise-prediction step on a batch; returns the loss as a float."""
        self.optimizer.zero_grad()
        # Fix: sample timesteps on the same device as the images.
        t = torch.randint(0, 1000, (images.size(0),), device=images.device).float()
        noise = torch.randn_like(images)
        noisy_images = self._add_noise(images, t, noise)
        noise_pred = self.model(noisy_images, t)
        loss = F.mse_loss(noise_pred, noise)
        loss.backward()
        self.optimizer.step()
        self.scheduler.step()
        return loss.item()

    def _add_noise(self, x, t, noise):
        """Mix x with noise at level beta(t): sqrt(1-beta)*x + sqrt(beta)*noise.

        Fix: beta has shape (B,); it must be reshaped to (B, 1, 1, 1) to
        broadcast over (B, C, H, W) — the original's (B,) * (B, C, H, W)
        product raised a shape error.
        NOTE(review): this uses the per-step beta directly as the noise level;
        standard DDPM uses the cumulative product alpha_bar(t). Kept as a
        simplified schedule to preserve the original design — confirm intent.
        """
        beta = self._beta(t).view(-1, 1, 1, 1)
        return torch.sqrt(1 - beta) * x + torch.sqrt(beta) * noise

    def _beta(self, t):
        """Linear beta schedule from 1e-4 (t=0) to ~0.02 (t=1000)."""
        return 0.0001 + 0.02 * t / 1000
5. 总结
图像生成技术取得巨大进步:
- GAN:对抗训练,生成质量高但训练不稳定
- StyleGAN:风格控制能力强,生成高清图像
- Diffusion Models:当前最先进,质量和多样性都很好
- 选择建议:根据需求和资源选择合适模型
对比数据如下:
- Diffusion Models 在 FID 和多样性上领先
- GAN 训练难度最高但推理速度最快
- StyleGAN 在人脸生成上表现出色
- 推荐在大多数场景下使用 Diffusion Models