From fc35d0c8c2cbd9cad5cbd79cd201b6e324302f5f Mon Sep 17 00:00:00 2001 From: chenjian Date: Fri, 19 Aug 2022 13:19:39 +0800 Subject: [PATCH] Add disco_diffusion_clip_vitb32 model --- .../disco_diffusion_clip_vitb32/README.md | 114 ++ .../clip/README.md | 2 + .../clip/clip/__init__.py | 1 + .../clip/clip/layers.py | 182 +++ .../clip/clip/model.py | 227 +++ .../clip/clip/simple_tokenizer.py | 135 ++ .../clip/clip/utils.py | 122 ++ .../disco_diffusion_clip_vitb32/module.py | 441 ++++++ .../requirements.txt | 8 + .../resize_right/README.md | 3 + .../resize_right/__init__.py | 0 .../resize_right/interp_methods.py | 70 + .../resize_right/resize_right.py | 403 ++++++ .../reverse_diffusion/README.md | 2 + .../reverse_diffusion/__init__.py | 156 +++ .../reverse_diffusion/config.py | 77 ++ .../reverse_diffusion/helper.py | 137 ++ .../reverse_diffusion/model/__init__.py | 3 + .../model/gaussian_diffusion.py | 1214 +++++++++++++++++ .../reverse_diffusion/model/losses.py | 86 ++ .../reverse_diffusion/model/make_cutouts.py | 177 +++ .../reverse_diffusion/model/nn.py | 127 ++ .../reverse_diffusion/model/perlin_noises.py | 78 ++ .../reverse_diffusion/model/respace.py | 123 ++ .../reverse_diffusion/model/script_util.py | 201 +++ .../reverse_diffusion/model/sec_diff.py | 135 ++ .../reverse_diffusion/model/transforms.py | 757 ++++++++++ .../reverse_diffusion/model/unet.py | 838 ++++++++++++ .../reverse_diffusion/resources/default.yml | 47 + .../resources/docstrings.yml | 103 ++ .../reverse_diffusion/runner.py | 285 ++++ 31 files changed, 6254 insertions(+) create mode 100644 modules/image/text_to_image/disco_diffusion_clip_vitb32/README.md create mode 100644 modules/image/text_to_image/disco_diffusion_clip_vitb32/clip/README.md create mode 100755 modules/image/text_to_image/disco_diffusion_clip_vitb32/clip/clip/__init__.py create mode 100755 modules/image/text_to_image/disco_diffusion_clip_vitb32/clip/clip/layers.py create mode 100755 modules/image/text_to_image/disco_diffusion_clip_vitb32/clip/clip/model.py create mode 100755 modules/image/text_to_image/disco_diffusion_clip_vitb32/clip/clip/simple_tokenizer.py create mode 100755 modules/image/text_to_image/disco_diffusion_clip_vitb32/clip/clip/utils.py create mode 100755 modules/image/text_to_image/disco_diffusion_clip_vitb32/module.py create mode 100755 modules/image/text_to_image/disco_diffusion_clip_vitb32/requirements.txt create mode 100644 modules/image/text_to_image/disco_diffusion_clip_vitb32/resize_right/README.md create mode 100755 modules/image/text_to_image/disco_diffusion_clip_vitb32/resize_right/__init__.py create mode 100755 modules/image/text_to_image/disco_diffusion_clip_vitb32/resize_right/interp_methods.py create mode 100755 modules/image/text_to_image/disco_diffusion_clip_vitb32/resize_right/resize_right.py create mode 100644 modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/README.md create mode 100755 modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/__init__.py create mode 100755 modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/config.py create mode 100755 modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/helper.py create mode 100755 modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/__init__.py create mode 100755 modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/gaussian_diffusion.py create mode 100755 
modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/losses.py create mode 100755 modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/make_cutouts.py create mode 100755 modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/nn.py create mode 100755 modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/perlin_noises.py create mode 100755 modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/respace.py create mode 100755 modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/script_util.py create mode 100755 modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/sec_diff.py create mode 100755 modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/transforms.py create mode 100755 modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/unet.py create mode 100755 modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/resources/default.yml create mode 100755 modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/resources/docstrings.yml create mode 100755 modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/runner.py diff --git a/modules/image/text_to_image/disco_diffusion_clip_vitb32/README.md b/modules/image/text_to_image/disco_diffusion_clip_vitb32/README.md new file mode 100644 index 00000000..1a42914c --- /dev/null +++ b/modules/image/text_to_image/disco_diffusion_clip_vitb32/README.md @@ -0,0 +1,114 @@ +# disco_diffusion_clip_vitb32 + +|模型名称|disco_diffusion_clip_vitb32| +| :--- | :---: | +|类别|图像-文图生成| +|网络|dd+clip ViTB32| +|数据集|-| +|是否支持Fine-tuning|否| +|模型大小|3.1GB| +|最新更新日期|2022-08-02| +|数据指标|-| + +## 一、模型基本信息 + +### 应用效果展示 + + - 输入文本 "A beautiful painting of a singular lighthouse, shining its light across a tumultuous sea of blood by greg rutkowski and thomas kinkade, Trending on artstation." + + - 输出图像 +

+
+  - 生成过程
+
+ +### 模型介绍 + +disco_diffusion_clip_vitb32 是一个文图生成模型,可以通过输入一段文字来生成符合该句子语义的图像。该模型由两部分组成,一部分是扩散模型,是一种生成模型,可以从噪声输入中重建出原始图像。另一部分是多模态预训练模型(CLIP), 可以将文本和图像表示在同一个特征空间,相近语义的文本和图像在该特征空间里距离会更相近。在该文图生成模型中,扩散模型负责从初始噪声或者指定初始图像中来生成目标图像,CLIP负责引导生成图像的语义和输入的文本的语义尽可能接近,随着扩散模型在CLIP的引导下不断的迭代生成新图像,最终能够生成文本所描述内容的图像。该模块中使用的CLIP模型结构为ViTB32。 + +更多详情请参考论文:[Diffusion Models Beat GANs on Image Synthesis](https://arxiv.org/abs/2105.05233) 以及 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) + +## 二、安装 + +- ### 1、环境依赖 + + - paddlepaddle >= 2.0.0 + + - paddlehub >= 2.2.0 | [如何安装PaddleHub](../../../../docs/docs_ch/get_start/installation.rst) + +- ### 2、安装 + + - ```shell + $ hub install disco_diffusion_clip_vitb32 + ``` + - 如您安装时遇到问题,可参考:[零基础windows安装](../../../../docs/docs_ch/get_start/windows_quickstart.md) + | [零基础Linux安装](../../../../docs/docs_ch/get_start/linux_quickstart.md) | [零基础MacOS安装](../../../../docs/docs_ch/get_start/mac_quickstart.md) + + +## 三、模型API预测 + +- ### 1、命令行预测 + + - ```shell + $ hub run disco_diffusion_clip_vitb32 --text_prompts "A beautiful painting of a singular lighthouse, shining its light across a tumultuous sea of blood by greg rutkowski and thomas kinkade, Trending on artstation." --output_dir disco_diffusion_clip_vitb32_out + ``` + +- ### 2、预测代码示例 + + - ```python + import paddlehub as hub + + module = hub.Module(name="disco_diffusion_clip_vitb32") + text_prompts = ["A beautiful painting of a singular lighthouse, shining its light across a tumultuous sea of blood by greg rutkowski and thomas kinkade, Trending on artstation."] + # 生成图像, 默认会在disco_diffusion_clip_vitb32_out目录保存图像 + # 返回的da是一个DocumentArray对象,保存了所有的结果,包括最终结果和迭代过程的中间结果 + # 可以通过操作DocumentArray对象对生成的图像做后处理,保存或者分析 + da = module.generate_image(text_prompts=text_prompts, output_dir='./disco_diffusion_clip_vitb32_out/') + # 手动将最终生成的图像保存到指定路径 + da[0].save_uri_to_file('disco_diffusion_clip_vitb32_out-result.png') + # 展示所有的中间结果 + da[0].chunks.plot_image_sprites(skip_empty=True, show_index=True, keep_aspect_ratio=True) + # 将整个生成过程保存为一个动态图gif + da[0].chunks.save_gif('disco_diffusion_clip_vitb32_out-result.gif', show_index=True, inline_display=True, size_ratio=0.5) + ``` + +- ### 3、API + + - ```python + def generate_image( + text_prompts, + style: Optional[str] = None, + artist: Optional[str] = None, + width_height: Optional[List[int]] = [1280, 768], + seed: Optional[int] = None, + output_dir: Optional[str] = 'disco_diffusion_clip_vitb32_out'): + ``` + + - 文图生成API,生成文本描述内容的图像。 + + - **参数** + + - text_prompts(str): 输入的语句,描述想要生成的图像的内容。通常比较有效的构造方式为 "一段描述性的文字内容" + "指定艺术家的名字",如"a beautiful painting of Chinese architecture, by krenz, sunny, super wide angle, artstation."。prompt的构造可以参考[网站](https://docs.google.com/document/d/1XUT2G9LmkZataHFzmuOtRXnuWBfhvXDAo8DkS--8tec/edit#)。 + - style(Optional[str]): 指定绘画的风格,如'watercolor','Chinese painting'等。当不指定时,风格完全由您所填写的prompt决定。 + - artist(Optional[str]): 指定特定的艺术家,如Greg Rutkowsk、krenz,将会生成所指定艺术家的绘画风格。当不指定时,风格完全由您所填写的prompt决定。各种艺术家的风格可以参考[网站](https://weirdwonderfulai.art/resources/disco-diffusion-70-plus-artist-studies/)。 + - width_height(Optional[List[int]]): 指定最终输出图像的宽高,宽和高都需要是64的倍数,生成的图像越大,所需要的计算时间越长。 + - seed(Optional[int]): 随机种子,由于输入默认是随机高斯噪声,设置不同的随机种子会由不同的初始输入,从而最终生成不同的结果,可以设置该参数来获得不同的输出图像。 + - output_dir(Optional[str]): 保存输出图像的目录,默认为"disco_diffusion_clip_vitb32_out"。 + + + - **返回** + - ra(DocumentArray): DocumentArray对象, 
包含`n_batches`个Documents,其中每个Document都保存了迭代过程的所有中间结果。详细可参考[DocumentArray使用文档](https://docarray.jina.ai/fundamentals/documentarray/index.html)。 + +## 四、更新历史 + +* 1.0.0 + + 初始发布 + + ```shell + $ hub install disco_diffusion_clip_vitb32 == 1.0.0 + ``` diff --git a/modules/image/text_to_image/disco_diffusion_clip_vitb32/clip/README.md b/modules/image/text_to_image/disco_diffusion_clip_vitb32/clip/README.md new file mode 100644 index 00000000..317214d8 --- /dev/null +++ b/modules/image/text_to_image/disco_diffusion_clip_vitb32/clip/README.md @@ -0,0 +1,2 @@ +# OpenAI CLIP implemented in Paddle. +The original implementation repo is [ranchlai/clip.paddle](https://github.com/ranchlai/clip.paddle). We copy this repo here for guided diffusion. diff --git a/modules/image/text_to_image/disco_diffusion_clip_vitb32/clip/clip/__init__.py b/modules/image/text_to_image/disco_diffusion_clip_vitb32/clip/clip/__init__.py new file mode 100755 index 00000000..5657b56e --- /dev/null +++ b/modules/image/text_to_image/disco_diffusion_clip_vitb32/clip/clip/__init__.py @@ -0,0 +1 @@ +from .utils import * diff --git a/modules/image/text_to_image/disco_diffusion_clip_vitb32/clip/clip/layers.py b/modules/image/text_to_image/disco_diffusion_clip_vitb32/clip/clip/layers.py new file mode 100755 index 00000000..286f35ab --- /dev/null +++ b/modules/image/text_to_image/disco_diffusion_clip_vitb32/clip/clip/layers.py @@ -0,0 +1,182 @@ +from typing import Optional + +import paddle +import paddle.nn as nn +from paddle import Tensor +from paddle.nn import functional as F +from paddle.nn import Linear + +__all__ = ['ResidualAttentionBlock', 'AttentionPool2d', 'multi_head_attention_forward', 'MultiHeadAttention'] + + +def multi_head_attention_forward(x: Tensor, + num_heads: int, + q_proj: Linear, + k_proj: Linear, + v_proj: Linear, + c_proj: Linear, + attn_mask: Optional[Tensor] = None): + max_len, batch_size, emb_dim = x.shape + head_dim = emb_dim // num_heads + scaling = float(head_dim)**-0.5 + q = q_proj(x) # L, N, E + k = k_proj(x) # L, N, E + v = v_proj(x) # L, N, E + #k = k.con + v = v.reshape((-1, batch_size * num_heads, head_dim)).transpose((1, 0, 2)) + k = k.reshape((-1, batch_size * num_heads, head_dim)).transpose((1, 0, 2)) + q = q.reshape((-1, batch_size * num_heads, head_dim)).transpose((1, 0, 2)) + + q = q * scaling + qk = paddle.bmm(q, k.transpose((0, 2, 1))) + if attn_mask is not None: + if attn_mask.ndim == 2: + attn_mask.unsqueeze_(0) + #assert str(attn_mask.dtype) == 'VarType.FP32' and attn_mask.ndim == 3 + assert attn_mask.shape[0] == 1 and attn_mask.shape[1] == max_len and attn_mask.shape[2] == max_len + qk += attn_mask + + qk = paddle.nn.functional.softmax(qk, axis=-1) + atten = paddle.bmm(qk, v) + atten = atten.transpose((1, 0, 2)) + atten = atten.reshape((max_len, batch_size, emb_dim)) + atten = c_proj(atten) + return atten + + +class MultiHeadAttention(nn.Layer): # without attention mask + + def __init__(self, emb_dim: int, num_heads: int): + super().__init__() + self.q_proj = nn.Linear(emb_dim, emb_dim, bias_attr=True) + self.k_proj = nn.Linear(emb_dim, emb_dim, bias_attr=True) + self.v_proj = nn.Linear(emb_dim, emb_dim, bias_attr=True) + self.c_proj = nn.Linear(emb_dim, emb_dim, bias_attr=True) + self.head_dim = emb_dim // num_heads + self.emb_dim = emb_dim + self.num_heads = num_heads + assert self.head_dim * num_heads == emb_dim, "embed_dim must be divisible by num_heads" + #self.scaling = float(self.head_dim) ** -0.5 + + def forward(self, x, attn_mask=None): # x is in shape[max_len,batch_size,emb_dim] + 
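+        # Shape notes (derived from multi_head_attention_forward above): x is
+        # [max_len, batch_size, emb_dim]; the q/k/v projections are reshaped to
+        # [max_len, batch_size * num_heads, head_dim] and transposed so scaled
+        # dot-product attention runs per head, then the per-head context is merged
+        # back to [max_len, batch_size, emb_dim] and passed through c_proj.
+        # attn_mask, when provided, is broadcast-added to the pre-softmax scores.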
+ atten = multi_head_attention_forward(x, + self.num_heads, + self.q_proj, + self.k_proj, + self.v_proj, + self.c_proj, + attn_mask=attn_mask) + + return atten + + +class Identity(nn.Layer): + + def __init__(self): + super().__init__() + + def forward(self, x): + return x + + +class Bottleneck(nn.Layer): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1): + super().__init__() + + # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1 + self.conv1 = nn.Conv2D(inplanes, planes, 1, bias_attr=False) + self.bn1 = nn.BatchNorm2D(planes) + + self.conv2 = nn.Conv2D(planes, planes, 3, padding=1, bias_attr=False) + self.bn2 = nn.BatchNorm2D(planes) + + self.avgpool = nn.AvgPool2D(stride) if stride > 1 else Identity() + + self.conv3 = nn.Conv2D(planes, planes * self.expansion, 1, bias_attr=False) + self.bn3 = nn.BatchNorm2D(planes * self.expansion) + + self.relu = nn.ReLU() + self.downsample = None + self.stride = stride + + if stride > 1 or inplanes != planes * Bottleneck.expansion: + self.downsample = nn.Sequential( + ("-1", nn.AvgPool2D(stride)), + ("0", nn.Conv2D(inplanes, planes * self.expansion, 1, stride=1, bias_attr=False)), + ("1", nn.BatchNorm2D(planes * self.expansion))) + + def forward(self, x): + identity = x + + out = self.relu(self.bn1(self.conv1(x))) + out = self.relu(self.bn2(self.conv2(out))) + out = self.avgpool(out) + out = self.bn3(self.conv3(out)) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + return out + + +class AttentionPool2d(nn.Layer): + + def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None): + super().__init__() + + self.positional_embedding = paddle.create_parameter((spacial_dim**2 + 1, embed_dim), dtype='float32') + + self.q_proj = nn.Linear(embed_dim, embed_dim, bias_attr=True) + self.k_proj = nn.Linear(embed_dim, embed_dim, bias_attr=True) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias_attr=True) + self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim, bias_attr=True) + self.num_heads = num_heads + + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" + + def forward(self, x): + + x = x.reshape((x.shape[0], x.shape[1], x.shape[2] * x.shape[3])).transpose((2, 0, 1)) # NCHW -> (HW)NC + max_len, batch_size, emb_dim = x.shape + head_dim = self.head_dim + x = paddle.concat([paddle.mean(x, axis=0, keepdim=True), x], axis=0) + x = x + paddle.unsqueeze(self.positional_embedding, 1) + out = multi_head_attention_forward(x, self.num_heads, self.q_proj, self.k_proj, self.v_proj, self.c_proj) + + return out[0] + + +class QuickGELU(nn.Layer): + + def forward(self, x): + return x * paddle.nn.functional.sigmoid(1.702 * x) + + +class ResidualAttentionBlock(nn.Layer): + + def __init__(self, d_model: int, n_head: int, attn_mask=None): + super().__init__() + + self.attn = MultiHeadAttention(d_model, n_head) + self.ln_1 = nn.LayerNorm(d_model) + self.mlp = nn.Sequential(("c_fc", nn.Linear(d_model, d_model * 4)), ("gelu", QuickGELU()), + ("c_proj", nn.Linear(d_model * 4, d_model))) + self.ln_2 = nn.LayerNorm(d_model) + self.attn_mask = attn_mask + + def attention(self, x): + x = self.attn(x, self.attn_mask) + assert isinstance(x, paddle.Tensor) # not tuble here + return x + + def forward(self, x): + + x = x + self.attention(self.ln_1(x)) + x = x + self.mlp(self.ln_2(x)) + return x diff --git 
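A minimal, illustrative shape check for the attention layers defined in `clip/clip/layers.py` above. It is not part of the patch: the import path assumes the installed PaddleHub module layout, and the sizes are toy values chosen only to show the expected `[max_len, batch_size, emb_dim]` layout.

```python
import paddle

# Hypothetical import path, assuming the hub module is installed as laid out in this patch.
from disco_diffusion_clip_vitb32.clip.clip.layers import MultiHeadAttention, ResidualAttentionBlock

x = paddle.randn([77, 2, 512])  # [max_len, batch_size, emb_dim]

attn = MultiHeadAttention(emb_dim=512, num_heads=8)
print(attn(x).shape)  # [77, 2, 512], same layout in and out

# Causal mask in the same form CLIP.build_attention_mask() produces later in this patch.
mask = paddle.triu(paddle.ones([77, 77]) * float("-inf"), diagonal=1)
block = ResidualAttentionBlock(d_model=512, n_head=8, attn_mask=mask)
print(block(x).shape)  # [77, 2, 512]
```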
a/modules/image/text_to_image/disco_diffusion_clip_vitb32/clip/clip/model.py b/modules/image/text_to_image/disco_diffusion_clip_vitb32/clip/clip/model.py new file mode 100755 index 00000000..63d1835c --- /dev/null +++ b/modules/image/text_to_image/disco_diffusion_clip_vitb32/clip/clip/model.py @@ -0,0 +1,227 @@ +from typing import Tuple +from typing import Union + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import nn + +from .layers import AttentionPool2d +from .layers import Bottleneck +from .layers import MultiHeadAttention +from .layers import ResidualAttentionBlock + + +class ModifiedResNet(nn.Layer): + """ + A ResNet class that is similar to torchvision's but contains the following changes: + - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool. + - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1 + - The final pooling layer is a QKV attention instead of an average pool + """ + + def __init__(self, layers, output_dim, heads, input_resolution=224, width=64): + super().__init__() + self.output_dim = output_dim + self.input_resolution = input_resolution + + # the 3-layer stem + self.conv1 = nn.Conv2D(3, width // 2, kernel_size=3, stride=2, padding=1, bias_attr=False) + self.bn1 = nn.BatchNorm2D(width // 2) + self.conv2 = nn.Conv2D(width // 2, width // 2, kernel_size=3, padding=1, bias_attr=False) + self.bn2 = nn.BatchNorm2D(width // 2) + self.conv3 = nn.Conv2D(width // 2, width, kernel_size=3, padding=1, bias_attr=False) + self.bn3 = nn.BatchNorm2D(width) + self.avgpool = nn.AvgPool2D(2) + self.relu = nn.ReLU() + + # residual layers + self._inplanes = width # this is a *mutable* variable used during construction + self.layer1 = self._make_layer(width, layers[0]) + self.layer2 = self._make_layer(width * 2, layers[1], stride=2) + self.layer3 = self._make_layer(width * 4, layers[2], stride=2) + self.layer4 = self._make_layer(width * 8, layers[3], stride=2) + + embed_dim = width * 32 # the ResNet feature dimension + self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, heads, output_dim) + + def _make_layer(self, planes, blocks, stride=1): + layers = [Bottleneck(self._inplanes, planes, stride)] + + self._inplanes = planes * Bottleneck.expansion + for _ in range(1, blocks): + layers.append(Bottleneck(self._inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x): + + def stem(x): + for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2), (self.conv3, self.bn3)]: + x = self.relu(bn(conv(x))) + x = self.avgpool(x) + return x + + #x = x.type(self.conv1.weight.dtype) + x = stem(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + x = self.attnpool(x) + + return x + + +class Transformer(nn.Layer): + + def __init__(self, width: int, layers: int, heads: int, attn_mask=None): + super().__init__() + self.width = width + self.layers = layers + self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)]) + + def forward(self, x): + return self.resblocks(x) + + +class VisualTransformer(nn.Layer): + + def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, output_dim: int): + super().__init__() + self.input_resolution = input_resolution + self.output_dim = output_dim + # used patch_size x patch_size, stride patch_size to do linear projection + self.conv1 = nn.Conv2D(in_channels=3, + out_channels=width, 
+ kernel_size=patch_size, + stride=patch_size, + bias_attr=False) + + # scale = width ** -0.5 + self.class_embedding = paddle.create_parameter((width, ), 'float32') + + self.positional_embedding = paddle.create_parameter(((input_resolution // patch_size)**2 + 1, width), 'float32') + + self.ln_pre = nn.LayerNorm(width) + + self.transformer = Transformer(width, layers, heads) + + self.ln_post = nn.LayerNorm(width) + self.proj = paddle.create_parameter((width, output_dim), 'float32') + + def forward(self, x): + + x = self.conv1(x) + x = x.reshape((x.shape[0], x.shape[1], -1)) + x = x.transpose((0, 2, 1)) + x = paddle.concat([self.class_embedding + paddle.zeros((x.shape[0], 1, x.shape[-1]), dtype=x.dtype), x], axis=1) + + x = x + self.positional_embedding + x = self.ln_pre(x) + x = x.transpose((1, 0, 2)) + x = self.transformer(x) + x = x.transpose((1, 0, 2)) + x = self.ln_post(x[:, 0, :]) + if self.proj is not None: + x = paddle.matmul(x, self.proj) + + return x + + +class CLIP(nn.Layer): + + def __init__( + self, + embed_dim: int, + # vision + image_resolution: int, + vision_layers: Union[Tuple[int, int, int, int], int], + vision_width: int, + vision_patch_size: int, + # text + context_length: int, + vocab_size: int, + transformer_width: int, + transformer_heads: int, + transformer_layers: int): + super().__init__() + + self.context_length = context_length + if isinstance(vision_layers, (tuple, list)): + vision_heads = vision_width * 32 // 64 + self.visual = ModifiedResNet(layers=vision_layers, + output_dim=embed_dim, + heads=vision_heads, + input_resolution=image_resolution, + width=vision_width) + else: + vision_heads = vision_width // 64 + self.visual = VisualTransformer(input_resolution=image_resolution, + patch_size=vision_patch_size, + width=vision_width, + layers=vision_layers, + heads=vision_heads, + output_dim=embed_dim) + + self.transformer = Transformer(width=transformer_width, + layers=transformer_layers, + heads=transformer_heads, + attn_mask=self.build_attention_mask()) + + self.vocab_size = vocab_size + self.token_embedding = nn.Embedding(vocab_size, transformer_width) + self.positional_embedding = paddle.create_parameter((self.context_length, transformer_width), 'float32') + self.ln_final = nn.LayerNorm(transformer_width) + + self.text_projection = paddle.create_parameter((transformer_width, embed_dim), 'float32') + self.logit_scale = paddle.create_parameter((1, ), 'float32') + + def build_attention_mask(self): + # lazily create causal attention mask, with full attention between the vision tokens + # mask = paddle.empty((self.context_length, self.context_length),dtype='float32') + # mask.fill_(float("-inf")) + #mask.triu_(1) # zero out the lower diagonal + + mask = paddle.ones((self.context_length, self.context_length)) * float("-inf") + mask = paddle.triu(mask, diagonal=1) + + return mask + + def encode_image(self, image): + return self.visual(image) + + def encode_text(self, text): + x = self.token_embedding(text) # [batch_size, n_ctx, d_model] + # print(x.shape) + + x = x + self.positional_embedding + #print(x.shape) + + x = x.transpose((1, 0, 2)) # NLD -> LND + x = self.transformer(x) + x = x.transpose((1, 0, 2)) # LND -> NLD + x = self.ln_final(x) + + idx = text.numpy().argmax(-1) + idx = list(idx) + x = [x[i:i + 1, int(j), :] for i, j in enumerate(idx)] + x = paddle.concat(x, 0) + x = paddle.matmul(x, self.text_projection) + return x + + def forward(self, image, text): + image_features = self.encode_image(image) + text_features = self.encode_text(text) + + # normalized 
features + image_features = image_features / image_features.norm(dim=-1, keepdim=True) + text_features = text_features / text_features.norm(dim=-1, keepdim=True) + + # cosine similarity as logits + logit_scale = self.logit_scale.exp() + logits_per_image = paddle.matmul(logit_scale * image_features, text_features.t()) + logits_per_text = paddle.matmul(logit_scale * text_features, image_features.t()) + + # shape = [global_batch_size, global_batch_size] + return logits_per_image, logits_per_text diff --git a/modules/image/text_to_image/disco_diffusion_clip_vitb32/clip/clip/simple_tokenizer.py b/modules/image/text_to_image/disco_diffusion_clip_vitb32/clip/clip/simple_tokenizer.py new file mode 100755 index 00000000..4eaf82e9 --- /dev/null +++ b/modules/image/text_to_image/disco_diffusion_clip_vitb32/clip/clip/simple_tokenizer.py @@ -0,0 +1,135 @@ +import gzip +import html +import os +from functools import lru_cache + +import ftfy +import regex as re + + +@lru_cache() +def default_bpe(): + return os.path.join(os.path.dirname(os.path.abspath(__file__)), "../assets/bpe_simple_vocab_16e6.txt.gz") + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """Return set of symbol pairs in a word. + Word is represented as tuple of symbols (symbols being variable-length strings). 
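+    e.g. get_pairs(('b', 'a', 'n', 'a', 'n', 'a')) -> {('b', 'a'), ('a', 'n'), ('n', 'a')}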
+ """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +def basic_clean(text): + text = ftfy.fix_text(text) + text = html.unescape(html.unescape(text)) + return text.strip() + + +def whitespace_clean(text): + text = re.sub(r'\s+', ' ', text) + text = text.strip() + return text + + +class SimpleTokenizer(object): + + def __init__(self, bpe_path: str = default_bpe()): + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + merges = gzip.open(bpe_path).read().decode("utf-8").split('\n') + merges = merges[1:49152 - 256 - 2 + 1] + merges = [tuple(merge.split()) for merge in merges] + vocab = list(bytes_to_unicode().values()) + vocab = vocab + [v + '' for v in vocab] + for merge in merges: + vocab.append(''.join(merge)) + vocab.extend(['<|startoftext|>', '<|endoftext|>']) + self.encoder = dict(zip(vocab, range(len(vocab)))) + self.decoder = {v: k for k, v in self.encoder.items()} + self.bpe_ranks = dict(zip(merges, range(len(merges)))) + self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'} + self.pat = re.compile( + r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", + re.IGNORECASE) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token[:-1]) + (token[-1] + '', ) + pairs = get_pairs(word) + + if not pairs: + return token + '' + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = ' '.join(word) + self.cache[token] = word + return word + + def encode(self, text): + bpe_tokens = [] + text = whitespace_clean(basic_clean(text)).lower() + for token in re.findall(self.pat, text): + token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) + bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) + return bpe_tokens + + def decode(self, tokens): + text = ''.join([self.decoder[token] for token in tokens]) + text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('', ' ') + return text diff --git a/modules/image/text_to_image/disco_diffusion_clip_vitb32/clip/clip/utils.py b/modules/image/text_to_image/disco_diffusion_clip_vitb32/clip/clip/utils.py new file mode 100755 index 00000000..8ea90914 --- /dev/null +++ b/modules/image/text_to_image/disco_diffusion_clip_vitb32/clip/clip/utils.py @@ -0,0 +1,122 @@ +import os +from typing import List +from typing import Union + +import numpy as np +import paddle +from paddle.utils import download +from paddle.vision.transforms import CenterCrop +from paddle.vision.transforms import Compose +from paddle.vision.transforms import Normalize +from paddle.vision.transforms import Resize +from paddle.vision.transforms import ToTensor + +from .model import CLIP +from .simple_tokenizer import SimpleTokenizer + +__all__ = ['transform', 'tokenize', 'build_model'] + +MODEL_NAMES = ['RN50', 'RN101', 
'VIT32'] + +URL = { + 'RN50': os.path.join(os.path.dirname(__file__), 'pre_trained', 'RN50.pdparams'), + 'RN101': os.path.join(os.path.dirname(__file__), 'pre_trained', 'RN101.pdparams'), + 'VIT32': os.path.join(os.path.dirname(__file__), 'pre_trained', 'ViT-B-32.pdparams') +} + +MEAN, STD = (0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711) +_tokenizer = SimpleTokenizer() + +transform = Compose([ + Resize(224, interpolation='bicubic'), + CenterCrop(224), lambda image: image.convert('RGB'), + ToTensor(), + Normalize(mean=MEAN, std=STD), lambda t: t.unsqueeze_(0) +]) + + +def tokenize(texts: Union[str, List[str]], context_length: int = 77): + """ + Returns the tokenized representation of given input string(s) + + Parameters + ---------- + texts : Union[str, List[str]] + An input string or a list of input strings to tokenize + + context_length : int + The context length to use; all CLIP models use 77 as the context length + + Returns + ------- + A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length] + """ + if isinstance(texts, str): + texts = [texts] + + sot_token = _tokenizer.encoder["<|startoftext|>"] + eot_token = _tokenizer.encoder["<|endoftext|>"] + all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts] + result = paddle.zeros((len(all_tokens), context_length), dtype='int64') + + for i, tokens in enumerate(all_tokens): + if len(tokens) > context_length: + raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}") + result[i, :len(tokens)] = paddle.Tensor(np.array(tokens)) + + return result + + +def build_model(name='VIT32'): + assert name in MODEL_NAMES, f"model name must be one of {MODEL_NAMES}" + name2model = {'RN101': build_rn101_model, 'VIT32': build_vit_model, 'RN50': build_rn50_model} + model = name2model[name]() + weight = URL[name] + sd = paddle.load(weight) + model.load_dict(sd) + model.eval() + return model + + +def build_vit_model(): + + model = CLIP(embed_dim=512, + image_resolution=224, + vision_layers=12, + vision_width=768, + vision_patch_size=32, + context_length=77, + vocab_size=49408, + transformer_width=512, + transformer_heads=8, + transformer_layers=12) + return model + + +def build_rn101_model(): + model = CLIP( + embed_dim=512, + image_resolution=224, + vision_layers=(3, 4, 23, 3), + vision_width=64, + vision_patch_size=0, #Not used in resnet + context_length=77, + vocab_size=49408, + transformer_width=512, + transformer_heads=8, + transformer_layers=12) + return model + + +def build_rn50_model(): + model = CLIP(embed_dim=1024, + image_resolution=224, + vision_layers=(3, 4, 6, 3), + vision_width=64, + vision_patch_size=None, + context_length=77, + vocab_size=49408, + transformer_width=512, + transformer_heads=8, + transformer_layers=12) + return model diff --git a/modules/image/text_to_image/disco_diffusion_clip_vitb32/module.py b/modules/image/text_to_image/disco_diffusion_clip_vitb32/module.py new file mode 100755 index 00000000..fb025bfc --- /dev/null +++ b/modules/image/text_to_image/disco_diffusion_clip_vitb32/module.py @@ -0,0 +1,441 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
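For orientation, a minimal sketch of how the helpers in `clip/clip/utils.py` above are intended to be driven. This is illustrative only: the import path assumes the installed module layout, and `build_model()` expects the pretrained `ViT-B-32.pdparams` weights at the `pre_trained/` location that `URL['VIT32']` points to.

```python
import paddle

# Hypothetical import path, assuming the hub module is installed as laid out in this patch.
from disco_diffusion_clip_vitb32.clip.clip.utils import build_model, tokenize

model = build_model('VIT32')                       # CLIP with a ViT-B/32 visual tower
tokens = tokenize(['a painting of a lighthouse'])  # int64 tensor of shape [1, 77]

with paddle.no_grad():
    text_features = model.encode_text(tokens)      # shape [1, 512]
print(text_features.shape)
```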
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import ast +import os +import sys +from functools import partial +from typing import List +from typing import Optional + +import disco_diffusion_clip_vitb32.clip as clip +import disco_diffusion_clip_vitb32.resize_right as resize_right +import paddle +from disco_diffusion_clip_vitb32.reverse_diffusion import create + +import paddlehub as hub +from paddlehub.module.module import moduleinfo +from paddlehub.module.module import runnable +from paddlehub.module.module import serving + + +@moduleinfo(name="disco_diffusion_clip_vitb32", + version="1.0.0", + type="image/text_to_image", + summary="", + author="paddlepaddle", + author_email="paddle-dev@baidu.com") +class DiscoDiffusionClip: + + def generate_image(self, + text_prompts, + style: Optional[str] = None, + artist: Optional[str] = None, + init_image: Optional[str] = None, + width_height: Optional[List[int]] = [1280, 768], + skip_steps: Optional[int] = 0, + steps: Optional[int] = 250, + cut_ic_pow: Optional[int] = 1, + init_scale: Optional[int] = 1000, + clip_guidance_scale: Optional[int] = 5000, + tv_scale: Optional[int] = 0, + range_scale: Optional[int] = 0, + sat_scale: Optional[int] = 0, + cutn_batches: Optional[int] = 4, + diffusion_sampling_mode: Optional[str] = 'ddim', + perlin_init: Optional[bool] = False, + perlin_mode: Optional[str] = 'mixed', + seed: Optional[int] = None, + eta: Optional[float] = 0.8, + clamp_grad: Optional[bool] = True, + clamp_max: Optional[float] = 0.05, + randomize_class: Optional[bool] = True, + clip_denoised: Optional[bool] = False, + fuzzy_prompt: Optional[bool] = False, + rand_mag: Optional[float] = 0.05, + cut_overview: Optional[str] = '[12]*400+[4]*600', + cut_innercut: Optional[str] = '[4]*400+[12]*600', + cut_icgray_p: Optional[str] = '[0.2]*400+[0]*600', + display_rate: Optional[int] = 10, + n_batches: Optional[int] = 1, + batch_size: Optional[int] = 1, + batch_name: Optional[str] = '', + use_gpu: Optional[bool] = True, + output_dir: Optional[str] = 'disco_diffusion_clip_vitb32_out'): + """ + Create Disco Diffusion artworks and save the result into a DocumentArray. + + :param text_prompts: Phrase, sentence, or string of words and phrases describing what the image should look like. The words will be analyzed by the AI and will guide the diffusion process toward the image(s) you describe. These can include commas and weights to adjust the relative importance of each element. E.g. "A beautiful painting of a singular lighthouse, shining its light across a tumultuous sea of blood by greg rutkowski and thomas kinkade, Trending on artstation."Notice that this prompt loosely follows a structure: [subject], [prepositional details], [setting], [meta modifiers and artist]; this is a good starting point for your experiments. Developing text prompts takes practice and experience, and is not the subject of this guide. If you are a beginner to writing text prompts, a good place to start is on a simple AI art app like Night Cafe, starry ai or WOMBO prior to using DD, to get a feel for how text gets translated into images by GAN tools. 
These other apps use different technologies, but many of the same principles apply. + :param style: Image style, such as oil paintings, if specified, style will be used to construct prompts. + :param artist: Artist style, if specified, style will be used to construct prompts. + :param init_image: Recall that in the image sequence above, the first image shown is just noise. If an init_image is provided, diffusion will replace the noise with the init_image as its starting state. To use an init_image, upload the image to the Colab instance or your Google Drive, and enter the full image path here. If using an init_image, you may need to increase skip_steps to ~ 50% of total steps to retain the character of the init. See skip_steps above for further discussion. + :param width_height: Desired final image size, in pixels. You can have a square, wide, or tall image, but each edge length should be set to a multiple of 64px, and a minimum of 512px on the default CLIP model setting. If you forget to use multiples of 64px in your dimensions, DD will adjust the dimensions of your image to make it so. + :param skip_steps: Consider the chart shown here. Noise scheduling (denoise strength) starts very high and progressively gets lower and lower as diffusion steps progress. The noise levels in the first few steps are very high, so images change dramatically in early steps.As DD moves along the curve, noise levels (and thus the amount an image changes per step) declines, and image coherence from one step to the next increases.The first few steps of denoising are often so dramatic that some steps (maybe 10-15% of total) can be skipped without affecting the final image. You can experiment with this as a way to cut render times.If you skip too many steps, however, the remaining noise may not be high enough to generate new content, and thus may not have ‘time left’ to finish an image satisfactorily.Also, depending on your other settings, you may need to skip steps to prevent CLIP from overshooting your goal, resulting in ‘blown out’ colors (hyper saturated, solid white, or solid black regions) or otherwise poor image quality. Consider that the denoising process is at its strongest in the early steps, so skipping steps can sometimes mitigate other problems.Lastly, if using an init_image, you will need to skip ~50% of the diffusion steps to retain the shapes in the original init image. However, if you’re using an init_image, you can also adjust skip_steps up or down for creative reasons. With low skip_steps you can get a result "inspired by" the init_image which will retain the colors and rough layout and shapes but look quite different. With high skip_steps you can preserve most of the init_image contents and just do fine tuning of the texture. + :param steps: When creating an image, the denoising curve is subdivided into steps for processing. Each step (or iteration) involves the AI looking at subsets of the image called ‘cuts’ and calculating the ‘direction’ the image should be guided to be more like the prompt. Then it adjusts the image with the help of the diffusion denoiser, and moves to the next step.Increasing steps will provide more opportunities for the AI to adjust the image, and each adjustment will be smaller, and thus will yield a more precise, detailed image. Increasing steps comes at the expense of longer render times. Also, while increasing steps should generally increase image quality, there is a diminishing return on additional steps beyond 250 - 500 steps. 
However, some intricate images can take 1000, 2000, or more steps. It is really up to the user. Just know that the render time is directly related to the number of steps, and many other parameters have a major impact on image quality, without costing additional time. + :param cut_ic_pow: This sets the size of the border used for inner cuts. High cut_ic_pow values have larger borders, and therefore the cuts themselves will be smaller and provide finer details. If you have too many or too-small inner cuts, you may lose overall image coherency and/or it may cause an undesirable ‘mosaic’ effect. Low cut_ic_pow values will allow the inner cuts to be larger, helping image coherency while still helping with some details. + :param init_scale: This controls how strongly CLIP will try to match the init_image provided. This is balanced against the clip_guidance_scale (CGS) above. Too much init scale, and the image won’t change much during diffusion. Too much CGS and the init image will be lost. + :param clip_guidance_scale: CGS is one of the most important parameters you will use. It tells DD how strongly you want CLIP to move toward your prompt each timestep. Higher is generally better, but if CGS is too strong it will overshoot the goal and distort the image. So a happy medium is needed, and it takes experience to learn how to adjust CGS. Note that this parameter generally scales with image dimensions. In other words, if you increase your total dimensions by 50% (e.g. a change from 512 x 512 to 512 x 768), then to maintain the same effect on the image, you’d want to increase clip_guidance_scale from 5000 to 7500. Of the basic settings, clip_guidance_scale, steps and skip_steps are the most important contributors to image quality, so learn them well. + :param tv_scale: Total variance denoising. Optional, set to zero to turn off. Controls ‘smoothness’ of final output. If used, tv_scale will try to smooth out your final image to reduce overall noise. If your image is too ‘crunchy’, increase tv_scale. TV denoising is good at preserving edges while smoothing away noise in flat regions. See https://en.wikipedia.org/wiki/Total_variation_denoising + :param range_scale: Optional, set to zero to turn off. Used for adjustment of color contrast. Lower range_scale will increase contrast. Very low numbers create a reduced color palette, resulting in more vibrant or poster-like images. Higher range_scale will reduce contrast, for more muted images. + :param sat_scale: Saturation scale. Optional, set to zero to turn off. If used, sat_scale will help mitigate oversaturation. If your image is too saturated, increase sat_scale to reduce the saturation. + :param cutn_batches: Each iteration, the AI cuts the image into smaller pieces known as cuts, and compares each cut to the prompt to decide how to guide the next diffusion step. More cuts can generally lead to better images, since DD has more chances to fine-tune the image precision in each timestep. Additional cuts are memory intensive, however, and if DD tries to evaluate too many cuts at once, it can run out of memory. You can use cutn_batches to increase cuts per timestep without increasing memory usage. At the default settings, DD is scheduled to do 16 cuts per timestep. If cutn_batches is set to 1, there will indeed only be 16 cuts total per timestep. However, if cutn_batches is increased to 4, DD will do 64 cuts total in each timestep, divided into 4 sequential batches of 16 cuts each. 
Because the cuts are being evaluated only 16 at a time, DD uses the memory required for only 16 cuts, but gives you the quality benefit of 64 cuts. The tradeoff, of course, is that this will take ~4 times as long to render each image.So, (scheduled cuts) x (cutn_batches) = (total cuts per timestep). Increasing cutn_batches will increase render times, however, as the work is being done sequentially. DD’s default cut schedule is a good place to start, but the cut schedule can be adjusted in the Cutn Scheduling section, explained below. + :param diffusion_sampling_mode: Two alternate diffusion denoising algorithms. ddim has been around longer, and is more established and tested. plms is a newly added alternate method that promises good diffusion results in fewer steps, but has not been as fully tested and may have side effects. This new plms mode is actively being researched in the #settings-and-techniques channel in the DD Discord. + :param perlin_init: Normally, DD will use an image filled with random noise as a starting point for the diffusion curve. If perlin_init is selected, DD will instead use a Perlin noise model as an initial state. Perlin has very interesting characteristics, distinct from random noise, so it’s worth experimenting with this for your projects. Beyond perlin, you can, of course, generate your own noise images (such as with GIMP, etc) and use them as an init_image (without skipping steps). Choosing perlin_init does not affect the actual diffusion process, just the starting point for the diffusion. Please note that selecting a perlin_init will replace and override any init_image you may have specified. Further, because the 2D, 3D and video animation systems all rely on the init_image system, if you enable Perlin while using animation modes, the perlin_init will jump in front of any previous image or video input, and DD will NOT give you the expected sequence of coherent images. All of that said, using Perlin and animation modes together do make a very colorful rainbow effect, which can be used creatively. + :param perlin_mode: sets type of Perlin noise: colored, gray, or a mix of both, giving you additional options for noise types. Experiment to see what these do in your projects. + :param seed: Deep in the diffusion code, there is a random number ‘seed’ which is used as the basis for determining the initial state of the diffusion. By default, this is random, but you can also specify your own seed. This is useful if you like a particular result and would like to run more iterations that will be similar. After each run, the actual seed value used will be reported in the parameters report, and can be reused if desired by entering seed # here. If a specific numerical seed is used repeatedly, the resulting images will be quite similar but not identical. + :param eta: eta (greek letter η) is a diffusion model variable that mixes in a random amount of scaled noise into each timestep. 0 is no noise, 1.0 is more noise. As with most DD parameters, you can go below zero for eta, but it may give you unpredictable results. The steps parameter has a close relationship with the eta parameter. If you set eta to 0, then you can get decent output with only 50-75 steps. Setting eta to 1.0 favors higher step counts, ideally around 250 and up. eta has a subtle, unpredictable effect on image, so you’ll need to experiment to see how this affects your projects. + :param clamp_grad: As I understand it, clamp_grad is an internal limiter that stops DD from producing extreme results. 
Try your images with and without clamp_grad. If the image changes drastically with clamp_grad turned off, it probably means your clip_guidance_scale is too high and should be reduced. + :param clamp_max: Sets the value of the clamp_grad limitation. Default is 0.05, providing for smoother, more muted coloration in images, but setting higher values (0.15-0.3) can provide interesting contrast and vibrancy. + :param fuzzy_prompt: Controls whether to add multiple noisy prompts to the prompt losses. If True, can increase variability of image output. Experiment with this. + :param rand_mag: Affects only the fuzzy_prompt. Controls the magnitude of the random noise added by fuzzy_prompt. + :param cut_overview: The schedule of overview cuts + :param cut_innercut: The schedule of inner cuts + :param cut_icgray_p: This sets the size of the border used for inner cuts. High cut_ic_pow values have larger borders, and therefore the cuts themselves will be smaller and provide finer details. If you have too many or too-small inner cuts, you may lose overall image coherency and/or it may cause an undesirable ‘mosaic’ effect. Low cut_ic_pow values will allow the inner cuts to be larger, helping image coherency while still helping with some details. + :param display_rate: During a diffusion run, you can monitor the progress of each image being created with this variable. If display_rate is set to 50, DD will show you the in-progress image every 50 timesteps. Setting this to a lower value, like 5 or 10, is a good way to get an early peek at where your image is heading. If you don’t like the progression, just interrupt execution, change some settings, and re-run. If you are planning a long, unmonitored batch, it’s better to set display_rate equal to steps, because displaying interim images does slow Colab down slightly. + :param n_batches: This variable sets the number of still images you want DD to create. If you are using an animation mode (see below for details) DD will ignore n_batches and create a single set of animated frames based on the animation settings. + :param batch_name: The name of the batch, the batch id will be named as "discoart-[batch_name]-seed". To avoid your artworks be overridden by other users, please use a unique name. + :param use_gpu: whether to use gpu or not. + :return: a DocumentArray object that has `n_batches` Documents + """ + if use_gpu: + try: + _places = os.environ.get("CUDA_VISIBLE_DEVICES", None) + if _places: + paddle.device.set_device("gpu:{}".format(0)) + except: + raise RuntimeError( + "Environment Variable CUDA_VISIBLE_DEVICES is not set correctly. If you wanna use gpu, please set CUDA_VISIBLE_DEVICES as cuda_device_id." 
+ ) + else: + paddle.device.set_device("cpu") + paddle.disable_static() + if not os.path.exists(output_dir): + os.makedirs(output_dir, exist_ok=True) + + if isinstance(text_prompts, str): + text_prompts = text_prompts.rstrip(',.,。') + if style is not None: + text_prompts += ",{}".format(style) + if artist is not None: + text_prompts += ",{},trending on artstation".format(artist) + elif isinstance(text_prompts, list): + text_prompts[0] = text_prompts[0].rstrip(',.,。') + if style is not None: + text_prompts[0] += ",{}".format(style) + if artist is not None: + text_prompts[0] += ",{},trending on artstation".format(artist) + + return create(text_prompts=text_prompts, + init_image=init_image, + width_height=width_height, + skip_steps=skip_steps, + steps=steps, + cut_ic_pow=cut_ic_pow, + init_scale=init_scale, + clip_guidance_scale=clip_guidance_scale, + tv_scale=tv_scale, + range_scale=range_scale, + sat_scale=sat_scale, + cutn_batches=cutn_batches, + diffusion_sampling_mode=diffusion_sampling_mode, + perlin_init=perlin_init, + perlin_mode=perlin_mode, + seed=seed, + eta=eta, + clamp_grad=clamp_grad, + clamp_max=clamp_max, + randomize_class=randomize_class, + clip_denoised=clip_denoised, + fuzzy_prompt=fuzzy_prompt, + rand_mag=rand_mag, + cut_overview=cut_overview, + cut_innercut=cut_innercut, + cut_icgray_p=cut_icgray_p, + display_rate=display_rate, + n_batches=n_batches, + batch_size=batch_size, + batch_name=batch_name, + clip_models=['VIT32'], + output_dir=output_dir) + + @serving + def serving_method(self, text_prompts, **kwargs): + """ + Run as a service. + """ + results = [] + for text_prompt in text_prompts: + result = self.generate_image(text_prompts=text_prompt, **kwargs)[0].to_base64() + results.append(result) + return results + + @runnable + def run_cmd(self, argvs): + """ + Run as a command. + """ + self.parser = argparse.ArgumentParser(description="Run the {} module.".format(self.name), + prog='hub run {}'.format(self.name), + usage='%(prog)s', + add_help=True) + self.arg_input_group = self.parser.add_argument_group(title="Input options", description="Input data. Required") + self.arg_config_group = self.parser.add_argument_group( + title="Config options", description="Run configuration for controlling module behavior, not required.") + self.add_module_config_arg() + self.add_module_input_arg() + args = self.parser.parse_args(argvs) + results = self.generate_image(text_prompts=args.text_prompts, + style=args.style, + artist=args.artist, + init_image=args.init_image, + width_height=args.width_height, + skip_steps=args.skip_steps, + steps=args.steps, + cut_ic_pow=args.cut_ic_pow, + init_scale=args.init_scale, + clip_guidance_scale=args.clip_guidance_scale, + tv_scale=args.tv_scale, + range_scale=args.range_scale, + sat_scale=args.sat_scale, + cutn_batches=args.cutn_batches, + diffusion_sampling_mode=args.diffusion_sampling_mode, + perlin_init=args.perlin_init, + perlin_mode=args.perlin_mode, + seed=args.seed, + eta=args.eta, + clamp_grad=args.clamp_grad, + clamp_max=args.clamp_max, + randomize_class=args.randomize_class, + clip_denoised=args.clip_denoised, + fuzzy_prompt=args.fuzzy_prompt, + rand_mag=args.rand_mag, + cut_overview=args.cut_overview, + cut_innercut=args.cut_innercut, + cut_icgray_p=args.cut_icgray_p, + display_rate=args.display_rate, + n_batches=args.n_batches, + batch_size=args.batch_size, + batch_name=args.batch_name, + output_dir=args.output_dir) + return results + + def add_module_config_arg(self): + """ + Add the command config options. 
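+        Each option added here mirrors a keyword argument of generate_image; see that
+        method's docstring for the full parameter descriptions.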
+ """ + self.arg_input_group.add_argument( + '--skip_steps', + type=int, + default=0, + help= + 'Consider the chart shown here. Noise scheduling (denoise strength) starts very high and progressively gets lower and lower as diffusion steps progress. The noise levels in the first few steps are very high, so images change dramatically in early steps.As DD moves along the curve, noise levels (and thus the amount an image changes per step) declines, and image coherence from one step to the next increases.The first few steps of denoising are often so dramatic that some steps (maybe 10-15%% of total) can be skipped without affecting the final image. You can experiment with this as a way to cut render times.If you skip too many steps, however, the remaining noise may not be high enough to generate new content, and thus may not have ‘time left’ to finish an image satisfactorily.Also, depending on your other settings, you may need to skip steps to prevent CLIP from overshooting your goal, resulting in ‘blown out’ colors (hyper saturated, solid white, or solid black regions) or otherwise poor image quality. Consider that the denoising process is at its strongest in the early steps, so skipping steps can sometimes mitigate other problems.Lastly, if using an init_image, you will need to skip ~50%% of the diffusion steps to retain the shapes in the original init image. However, if you’re using an init_image, you can also adjust skip_steps up or down for creative reasons. With low skip_steps you can get a result "inspired by" the init_image which will retain the colors and rough layout and shapes but look quite different. With high skip_steps you can preserve most of the init_image contents and just do fine tuning of the texture' + ) + self.arg_input_group.add_argument( + '--steps', + type=int, + default=250, + help= + "When creating an image, the denoising curve is subdivided into steps for processing. Each step (or iteration) involves the AI looking at subsets of the image called ‘cuts’ and calculating the ‘direction’ the image should be guided to be more like the prompt. Then it adjusts the image with the help of the diffusion denoiser, and moves to the next step.Increasing steps will provide more opportunities for the AI to adjust the image, and each adjustment will be smaller, and thus will yield a more precise, detailed image. Increasing steps comes at the expense of longer render times. Also, while increasing steps should generally increase image quality, there is a diminishing return on additional steps beyond 250 - 500 steps. However, some intricate images can take 1000, 2000, or more steps. It is really up to the user. Just know that the render time is directly related to the number of steps, and many other parameters have a major impact on image quality, without costing additional time." + ) + self.arg_input_group.add_argument( + '--cut_ic_pow', + type=int, + default=1, + help= + "This sets the size of the border used for inner cuts. High cut_ic_pow values have larger borders, and therefore the cuts themselves will be smaller and provide finer details. If you have too many or too-small inner cuts, you may lose overall image coherency and/or it may cause an undesirable ‘mosaic’ effect. Low cut_ic_pow values will allow the inner cuts to be larger, helping image coherency while still helping with some details." + ) + self.arg_input_group.add_argument( + '--init_scale', + type=int, + default=1000, + help= + "This controls how strongly CLIP will try to match the init_image provided. 
This is balanced against the clip_guidance_scale (CGS) above. Too much init scale, and the image won’t change much during diffusion. Too much CGS and the init image will be lost." + ) + self.arg_input_group.add_argument( + '--clip_guidance_scale', + type=int, + default=5000, + help= + "CGS is one of the most important parameters you will use. It tells DD how strongly you want CLIP to move toward your prompt each timestep. Higher is generally better, but if CGS is too strong it will overshoot the goal and distort the image. So a happy medium is needed, and it takes experience to learn how to adjust CGS. Note that this parameter generally scales with image dimensions. In other words, if you increase your total dimensions by 50% (e.g. a change from 512 x 512 to 512 x 768), then to maintain the same effect on the image, you’d want to increase clip_guidance_scale from 5000 to 7500. Of the basic settings, clip_guidance_scale, steps and skip_steps are the most important contributors to image quality, so learn them well." + ) + self.arg_input_group.add_argument( + '--tv_scale', + type=int, + default=0, + help= + "Total variance denoising. Optional, set to zero to turn off. Controls ‘smoothness’ of final output. If used, tv_scale will try to smooth out your final image to reduce overall noise. If your image is too ‘crunchy’, increase tv_scale. TV denoising is good at preserving edges while smoothing away noise in flat regions. See https://en.wikipedia.org/wiki/Total_variation_denoising" + ) + self.arg_input_group.add_argument( + '--range_scale', + type=int, + default=0, + help= + "Optional, set to zero to turn off. Used for adjustment of color contrast. Lower range_scale will increase contrast. Very low numbers create a reduced color palette, resulting in more vibrant or poster-like images. Higher range_scale will reduce contrast, for more muted images." + ) + self.arg_input_group.add_argument( + '--sat_scale', + type=int, + default=0, + help= + "Saturation scale. Optional, set to zero to turn off. If used, sat_scale will help mitigate oversaturation. If your image is too saturated, increase sat_scale to reduce the saturation." + ) + self.arg_input_group.add_argument( + '--cutn_batches', + type=int, + default=4, + help= + "Each iteration, the AI cuts the image into smaller pieces known as cuts, and compares each cut to the prompt to decide how to guide the next diffusion step. More cuts can generally lead to better images, since DD has more chances to fine-tune the image precision in each timestep. Additional cuts are memory intensive, however, and if DD tries to evaluate too many cuts at once, it can run out of memory. You can use cutn_batches to increase cuts per timestep without increasing memory usage. At the default settings, DD is scheduled to do 16 cuts per timestep. If cutn_batches is set to 1, there will indeed only be 16 cuts total per timestep. However, if cutn_batches is increased to 4, DD will do 64 cuts total in each timestep, divided into 4 sequential batches of 16 cuts each. Because the cuts are being evaluated only 16 at a time, DD uses the memory required for only 16 cuts, but gives you the quality benefit of 64 cuts. The tradeoff, of course, is that this will take ~4 times as long to render each image.So, (scheduled cuts) x (cutn_batches) = (total cuts per timestep). Increasing cutn_batches will increase render times, however, as the work is being done sequentially. 
DD’s default cut schedule is a good place to start, but the cut schedule can be adjusted in the Cutn Scheduling section, explained below." + ) + self.arg_input_group.add_argument( + '--diffusion_sampling_mode', + type=str, + default='ddim', + help= + "Two alternate diffusion denoising algorithms. ddim has been around longer, and is more established and tested. plms is a newly added alternate method that promises good diffusion results in fewer steps, but has not been as fully tested and may have side effects. This new plms mode is actively being researched in the #settings-and-techniques channel in the DD Discord." + ) + self.arg_input_group.add_argument( + '--perlin_init', + type=bool, + default=False, + help= + "Normally, DD will use an image filled with random noise as a starting point for the diffusion curve. If perlin_init is selected, DD will instead use a Perlin noise model as an initial state. Perlin has very interesting characteristics, distinct from random noise, so it’s worth experimenting with this for your projects. Beyond perlin, you can, of course, generate your own noise images (such as with GIMP, etc) and use them as an init_image (without skipping steps). Choosing perlin_init does not affect the actual diffusion process, just the starting point for the diffusion. Please note that selecting a perlin_init will replace and override any init_image you may have specified. Further, because the 2D, 3D and video animation systems all rely on the init_image system, if you enable Perlin while using animation modes, the perlin_init will jump in front of any previous image or video input, and DD will NOT give you the expected sequence of coherent images. All of that said, using Perlin and animation modes together do make a very colorful rainbow effect, which can be used creatively." + ) + self.arg_input_group.add_argument( + '--perlin_mode', + type=str, + default='mixed', + help= + "sets type of Perlin noise: colored, gray, or a mix of both, giving you additional options for noise types. Experiment to see what these do in your projects." + ) + self.arg_input_group.add_argument( + '--seed', + type=int, + default=None, + help= + "Deep in the diffusion code, there is a random number ‘seed’ which is used as the basis for determining the initial state of the diffusion. By default, this is random, but you can also specify your own seed. This is useful if you like a particular result and would like to run more iterations that will be similar. After each run, the actual seed value used will be reported in the parameters report, and can be reused if desired by entering seed # here. If a specific numerical seed is used repeatedly, the resulting images will be quite similar but not identical." + ) + self.arg_input_group.add_argument( + '--eta', + type=float, + default=0.8, + help= + "eta (greek letter η) is a diffusion model variable that mixes in a random amount of scaled noise into each timestep. 0 is no noise, 1.0 is more noise. As with most DD parameters, you can go below zero for eta, but it may give you unpredictable results. The steps parameter has a close relationship with the eta parameter. If you set eta to 0, then you can get decent output with only 50-75 steps. Setting eta to 1.0 favors higher step counts, ideally around 250 and up. eta has a subtle, unpredictable effect on image, so you’ll need to experiment to see how this affects your projects." 
+ ) + self.arg_input_group.add_argument( + '--clamp_grad', + type=bool, + default=True, + help= + "As I understand it, clamp_grad is an internal limiter that stops DD from producing extreme results. Try your images with and without clamp_grad. If the image changes drastically with clamp_grad turned off, it probably means your clip_guidance_scale is too high and should be reduced." + ) + self.arg_input_group.add_argument( + '--clamp_max', + type=float, + default=0.05, + help= + "Sets the value of the clamp_grad limitation. Default is 0.05, providing for smoother, more muted coloration in images, but setting higher values (0.15-0.3) can provide interesting contrast and vibrancy." + ) + self.arg_input_group.add_argument('--randomize_class', type=bool, default=True, help="Random class.") + self.arg_input_group.add_argument('--clip_denoised', type=bool, default=False, help="Clip denoised.") + self.arg_input_group.add_argument( + '--fuzzy_prompt', + type=bool, + default=False, + help= + "Controls whether to add multiple noisy prompts to the prompt losses. If True, can increase variability of image output. Experiment with this." + ) + self.arg_input_group.add_argument( + '--rand_mag', + type=float, + default=0.5, + help="Affects only the fuzzy_prompt. Controls the magnitude of the random noise added by fuzzy_prompt.") + self.arg_input_group.add_argument('--cut_overview', + type=str, + default='[12]*400+[4]*600', + help="The schedule of overview cuts") + self.arg_input_group.add_argument('--cut_innercut', + type=str, + default='[4]*400+[12]*600', + help="The schedule of inner cuts") + self.arg_input_group.add_argument( + '--cut_icgray_p', + type=str, + default='[0.2]*400+[0]*600', + help= + "This sets the size of the border used for inner cuts. High cut_ic_pow values have larger borders, and therefore the cuts themselves will be smaller and provide finer details. If you have too many or too-small inner cuts, you may lose overall image coherency and/or it may cause an undesirable ‘mosaic’ effect. Low cut_ic_pow values will allow the inner cuts to be larger, helping image coherency while still helping with some details." + ) + self.arg_input_group.add_argument( + '--display_rate', + type=int, + default=10, + help= + "During a diffusion run, you can monitor the progress of each image being created with this variable. If display_rate is set to 50, DD will show you the in-progress image every 50 timesteps. Setting this to a lower value, like 5 or 10, is a good way to get an early peek at where your image is heading. If you don’t like the progression, just interrupt execution, change some settings, and re-run. If you are planning a long, unmonitored batch, it’s better to set display_rate equal to steps, because displaying interim images does slow Colab down slightly." + ) + self.arg_config_group.add_argument('--use_gpu', + type=ast.literal_eval, + default=True, + help="whether use GPU or not") + self.arg_config_group.add_argument('--output_dir', + type=str, + default='disco_diffusion_clip_vitb32_out', + help='Output directory.') + + def add_module_input_arg(self): + """ + Add the command input options. + """ + self.arg_input_group.add_argument( + '--text_prompts', + type=str, + help= + 'Phrase, sentence, or string of words and phrases describing what the image should look like. The words will be analyzed by the AI and will guide the diffusion process toward the image(s) you describe. These can include commas and weights to adjust the relative importance of each element. E.g. 
"A beautiful painting of a singular lighthouse, shining its light across a tumultuous sea of blood by greg rutkowski and thomas kinkade, Trending on artstation."Notice that this prompt loosely follows a structure: [subject], [prepositional details], [setting], [meta modifiers and artist]; this is a good starting point for your experiments. Developing text prompts takes practice and experience, and is not the subject of this guide. If you are a beginner to writing text prompts, a good place to start is on a simple AI art app like Night Cafe, starry ai or WOMBO prior to using DD, to get a feel for how text gets translated into images by GAN tools. These other apps use different technologies, but many of the same principles apply.' + ) + self.arg_input_group.add_argument( + '--style', + type=str, + default=None, + help='Image style, such as oil paintings, if specified, style will be used to construct prompts.') + self.arg_input_group.add_argument('--artist', + type=str, + default=None, + help='Artist style, if specified, style will be used to construct prompts.') + self.arg_input_group.add_argument( + '--init_image', + type=str, + default=None, + help= + "Recall that in the image sequence above, the first image shown is just noise. If an init_image is provided, diffusion will replace the noise with the init_image as its starting state. To use an init_image, upload the image to the Colab instance or your Google Drive, and enter the full image path here. If using an init_image, you may need to increase skip_steps to ~ 50% of total steps to retain the character of the init. See skip_steps above for further discussion." + ) + self.arg_input_group.add_argument( + '--width_height', + type=ast.literal_eval, + default=[1280, 768], + help= + "Desired final image size, in pixels. You can have a square, wide, or tall image, but each edge length should be set to a multiple of 64px, and a minimum of 512px on the default CLIP model setting. If you forget to use multiples of 64px in your dimensions, DD will adjust the dimensions of your image to make it so." + ) + self.arg_input_group.add_argument( + '--n_batches', + type=int, + default=1, + help= + "This variable sets the number of still images you want DD to create. If you are using an animation mode (see below for details) DD will ignore n_batches and create a single set of animated frames based on the animation settings." + ) + self.arg_input_group.add_argument('--batch_size', type=int, default=1, help="Batch size.") + self.arg_input_group.add_argument( + '--batch_name', + type=str, + default='', + help= + 'The name of the batch, the batch id will be named as "reverse_diffusion-[batch_name]-seed". To avoid your artworks be overridden by other users, please use a unique name.' 
+ ) diff --git a/modules/image/text_to_image/disco_diffusion_clip_vitb32/requirements.txt b/modules/image/text_to_image/disco_diffusion_clip_vitb32/requirements.txt new file mode 100755 index 00000000..8b4bc0ea --- /dev/null +++ b/modules/image/text_to_image/disco_diffusion_clip_vitb32/requirements.txt @@ -0,0 +1,8 @@ +numpy +paddle_lpips==0.1.2 +ftfy +docarray>=0.13.29 +pyyaml +regex +tqdm +ipywidgets diff --git a/modules/image/text_to_image/disco_diffusion_clip_vitb32/resize_right/README.md b/modules/image/text_to_image/disco_diffusion_clip_vitb32/resize_right/README.md new file mode 100644 index 00000000..1f8d0bb0 --- /dev/null +++ b/modules/image/text_to_image/disco_diffusion_clip_vitb32/resize_right/README.md @@ -0,0 +1,3 @@ +# ResizeRight (Paddle) +Fully differentiable resize function implemented by Paddle. +This module is based on [assafshocher/ResizeRight](https://github.com/assafshocher/ResizeRight). diff --git a/modules/image/text_to_image/disco_diffusion_clip_vitb32/resize_right/__init__.py b/modules/image/text_to_image/disco_diffusion_clip_vitb32/resize_right/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/modules/image/text_to_image/disco_diffusion_clip_vitb32/resize_right/interp_methods.py b/modules/image/text_to_image/disco_diffusion_clip_vitb32/resize_right/interp_methods.py new file mode 100755 index 00000000..276eb055 --- /dev/null +++ b/modules/image/text_to_image/disco_diffusion_clip_vitb32/resize_right/interp_methods.py @@ -0,0 +1,70 @@ +from math import pi + +try: + import paddle +except ImportError: + paddle = None + +try: + import numpy + import numpy as np +except ImportError: + numpy = None + +if numpy is None and paddle is None: + raise ImportError("Must have either Numpy or PyTorch but both not found") + + +def set_framework_dependencies(x): + if type(x) is numpy.ndarray: + to_dtype = lambda a: a + fw = numpy + else: + to_dtype = lambda a: paddle.cast(a, x.dtype) + fw = paddle + # eps = fw.finfo(fw.float32).eps + eps = paddle.to_tensor(np.finfo(np.float32).eps) + return fw, to_dtype, eps + + +def support_sz(sz): + + def wrapper(f): + f.support_sz = sz + return f + + return wrapper + + +@support_sz(4) +def cubic(x): + fw, to_dtype, eps = set_framework_dependencies(x) + absx = fw.abs(x) + absx2 = absx**2 + absx3 = absx**3 + return ((1.5 * absx3 - 2.5 * absx2 + 1.) * to_dtype(absx <= 1.) + + (-0.5 * absx3 + 2.5 * absx2 - 4. * absx + 2.) * to_dtype((1. 
< absx) & (absx <= 2.)))
+
+
+@support_sz(4)
+def lanczos2(x):
+    fw, to_dtype, eps = set_framework_dependencies(x)
+    return (((fw.sin(pi * x) * fw.sin(pi * x / 2) + eps) / ((pi**2 * x**2 / 2) + eps)) * to_dtype(abs(x) < 2))
+
+
+@support_sz(6)
+def lanczos3(x):
+    fw, to_dtype, eps = set_framework_dependencies(x)
+    return (((fw.sin(pi * x) * fw.sin(pi * x / 3) + eps) / ((pi**2 * x**2 / 3) + eps)) * to_dtype(abs(x) < 3))
+
+
+@support_sz(2)
+def linear(x):
+    fw, to_dtype, eps = set_framework_dependencies(x)
+    return ((x + 1) * to_dtype((-1 <= x) & (x < 0)) + (1 - x) * to_dtype((0 <= x) & (x <= 1)))
+
+
+@support_sz(1)
+def box(x):
+    fw, to_dtype, eps = set_framework_dependencies(x)
+    return to_dtype((-1 <= x) & (x < 0)) + to_dtype((0 <= x) & (x <= 1))
diff --git a/modules/image/text_to_image/disco_diffusion_clip_vitb32/resize_right/resize_right.py b/modules/image/text_to_image/disco_diffusion_clip_vitb32/resize_right/resize_right.py
new file mode 100755
index 00000000..77ea9564
--- /dev/null
+++ b/modules/image/text_to_image/disco_diffusion_clip_vitb32/resize_right/resize_right.py
@@ -0,0 +1,403 @@
+import warnings
+from fractions import Fraction
+from math import ceil
+from typing import Tuple
+
+import disco_diffusion_clip_vitb32.resize_right.interp_methods as interp_methods
+
+
+class NoneClass:
+    pass
+
+
+try:
+    import paddle
+    from paddle import nn
+    nnModuleWrapped = nn.Layer
+except ImportError:
+    warnings.warn('No Paddle found, will work only with Numpy')
+    paddle = None
+    nnModuleWrapped = NoneClass
+
+try:
+    import numpy
+    import numpy as np
+except ImportError:
+    warnings.warn('No Numpy found, will work only with Paddle')
+    numpy = None
+
+if numpy is None and paddle is None:
+    raise ImportError("Must have either Numpy or Paddle, but neither was found")
+
+
+def resize(input,
+           scale_factors=None,
+           out_shape=None,
+           interp_method=interp_methods.cubic,
+           support_sz=None,
+           antialiasing=True,
+           by_convs=False,
+           scale_tolerance=None,
+           max_numerator=10,
+           pad_mode='constant'):
+    # get properties of the input tensor
+    in_shape, n_dims = input.shape, input.ndim
+
+    # fw stands for framework that can be either numpy or paddle,
+    # determined by the input type
+    fw = numpy if type(input) is numpy.ndarray else paddle
+    eps = np.finfo(np.float32).eps if fw == numpy else paddle.to_tensor(np.finfo(np.float32).eps)
+    device = input.place if fw is paddle else None
+
+    # set missing scale factors or output shape, one according to the other,
+    # scream if both missing. this is also where all the default policies
+    # take place. also handling the by_convs attribute carefully.
+    scale_factors, out_shape, by_convs = set_scale_and_out_sz(in_shape, out_shape, scale_factors, by_convs,
+                                                              scale_tolerance, max_numerator, eps, fw)
+
+    # sort indices of dimensions according to scale of each dimension.
+    # since we are going dim by dim this is efficient
+    sorted_filtered_dims_and_scales = [(dim, scale_factors[dim], by_convs[dim], in_shape[dim], out_shape[dim])
+                                       for dim in sorted(range(n_dims), key=lambda ind: scale_factors[ind])
+                                       if scale_factors[dim] != 1.]
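+    # (illustrative note) e.g. for a paddle tensor of shape [1, 3, 64, 64] with
+    # scale_factors=0.5, the per-dim scales become [1, 1, 0.5, 0.5], so only the
+    # last two dims are resized; dims with scale 1 are filtered out above and the
+    # remaining dims are processed from the smallest scale to the largest.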
+ # unless support size is specified by the user, it is an attribute + # of the interpolation method + if support_sz is None: + support_sz = interp_method.support_sz + + # output begins identical to input and changes with each iteration + output = input + + # iterate over dims + for (dim, scale_factor, dim_by_convs, in_sz, out_sz) in sorted_filtered_dims_and_scales: + # STEP 1- PROJECTED GRID: The non-integer locations of the projection + # of output pixel locations to the input tensor + projected_grid = get_projected_grid(in_sz, out_sz, scale_factor, fw, dim_by_convs, device) + + # STEP 1.5: ANTIALIASING- If antialiasing is taking place, we modify + # the window size and the interpolation method (see inside function) + cur_interp_method, cur_support_sz = apply_antialiasing_if_needed(interp_method, support_sz, scale_factor, + antialiasing) + + # STEP 2- FIELDS OF VIEW: for each output pixels, map the input pixels + # that influence it. Also calculate needed padding and update grid + # accoedingly + field_of_view = get_field_of_view(projected_grid, cur_support_sz, fw, eps, device) + + # STEP 2.5- CALCULATE PAD AND UPDATE: according to the field of view, + # the input should be padded to handle the boundaries, coordinates + # should be updated. actual padding only occurs when weights are + # aplied (step 4). if using by_convs for this dim, then we need to + # calc right and left boundaries for each filter instead. + pad_sz, projected_grid, field_of_view = calc_pad_sz(in_sz, out_sz, field_of_view, projected_grid, scale_factor, + dim_by_convs, fw, device) + # STEP 3- CALCULATE WEIGHTS: Match a set of weights to the pixels in + # the field of view for each output pixel + weights = get_weights(cur_interp_method, projected_grid, field_of_view) + + # STEP 4- APPLY WEIGHTS: Each output pixel is calculated by multiplying + # its set of weights with the pixel values in its field of view. + # We now multiply the fields of view with their matching weights. + # We do this by tensor multiplication and broadcasting. + # if by_convs is true for this dim, then we do this action by + # convolutions. this is equivalent but faster. + if not dim_by_convs: + output = apply_weights(output, field_of_view, weights, dim, n_dims, pad_sz, pad_mode, fw) + else: + output = apply_convs(output, scale_factor, in_sz, out_sz, weights, dim, pad_sz, pad_mode, fw) + return output + + +def get_projected_grid(in_sz, out_sz, scale_factor, fw, by_convs, device=None): + # we start by having the ouput coordinates which are just integer locations + # in the special case when usin by_convs, we only need two cycles of grid + # points. the first and last. + grid_sz = out_sz if not by_convs else scale_factor.numerator + out_coordinates = fw_arange(grid_sz, fw, device) + + # This is projecting the ouput pixel locations in 1d to the input tensor, + # as non-integer locations. + # the following fomrula is derived in the paper + # "From Discrete to Continuous Convolutions" by Shocher et al. + return (out_coordinates / float(scale_factor) + (in_sz - 1) / 2 - (out_sz - 1) / (2 * float(scale_factor))) + + +def get_field_of_view(projected_grid, cur_support_sz, fw, eps, device): + # for each output pixel, map which input pixels influence it, in 1d. 
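+    # (illustrative note) with the default cubic kernel (support_sz 4) and no
+    # antialiasing each output pixel has 4 input neighbors; downscaling by 2 with
+    # antialiasing widens the window to 8 neighbors.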
+ # we start by calculating the leftmost neighbor, using half of the window + # size (eps is for when boundary is exact int) + left_boundaries = fw_ceil(projected_grid - cur_support_sz / 2 - eps, fw) + + # then we simply take all the pixel centers in the field by counting + # window size pixels from the left boundary + ordinal_numbers = fw_arange(ceil(cur_support_sz - eps), fw, device) + return left_boundaries[:, None] + ordinal_numbers + + +def calc_pad_sz(in_sz, out_sz, field_of_view, projected_grid, scale_factor, dim_by_convs, fw, device): + if not dim_by_convs: + # determine padding according to neighbor coords out of bound. + # this is a generalized notion of padding, when pad<0 it means crop + pad_sz = [-field_of_view[0, 0].item(), field_of_view[-1, -1].item() - in_sz + 1] + + # since input image will be changed by padding, coordinates of both + # field_of_view and projected_grid need to be updated + field_of_view += pad_sz[0] + projected_grid += pad_sz[0] + + else: + # only used for by_convs, to calc the boundaries of each filter the + # number of distinct convolutions is the numerator of the scale factor + num_convs, stride = scale_factor.numerator, scale_factor.denominator + + # calculate left and right boundaries for each conv. left can also be + # negative right can be bigger than in_sz. such cases imply padding if + # needed. however if# both are in-bounds, it means we need to crop, + # practically apply the conv only on part of the image. + left_pads = -field_of_view[:, 0] + + # next calc is tricky, explanation by rows: + # 1) counting output pixels between the first position of each filter + # to the right boundary of the input + # 2) dividing it by number of filters to count how many 'jumps' + # each filter does + # 3) multiplying by the stride gives us the distance over the input + # coords done by all these jumps for each filter + # 4) to this distance we add the right boundary of the filter when + # placed in its leftmost position. so now we get the right boundary + # of that filter in input coord. + # 5) the padding size needed is obtained by subtracting the rightmost + # input coordinate. if the result is positive padding is needed. if + # negative then negative padding means shaving off pixel columns. + right_pads = (((out_sz - fw_arange(num_convs, fw, device) - 1) # (1) + // num_convs) # (2) + * stride # (3) + + field_of_view[:, -1] # (4) + - in_sz + 1) # (5) + + # in the by_convs case pad_sz is a list of left-right pairs. one per + # each filter + + pad_sz = list(zip(left_pads, right_pads)) + + return pad_sz, projected_grid, field_of_view + + +def get_weights(interp_method, projected_grid, field_of_view): + # the set of weights per each output pixels is the result of the chosen + # interpolation method applied to the distances between projected grid + # locations and the pixel-centers in the field of view (distances are + # directed, can be positive or negative) + weights = interp_method(projected_grid[:, None] - field_of_view) + + # we now carefully normalize the weights to sum to 1 per each output pixel + sum_weights = weights.sum(1, keepdim=True) + sum_weights[sum_weights == 0] = 1 + return weights / sum_weights + + +def apply_weights(input, field_of_view, weights, dim, n_dims, pad_sz, pad_mode, fw): + # for this operation we assume the resized dim is the first one. 
+ # so we transpose and will transpose back after multiplying + tmp_input = fw_swapaxes(input, dim, 0, fw) + + # apply padding + tmp_input = fw_pad(tmp_input, fw, pad_sz, pad_mode) + + # field_of_view is a tensor of order 2: for each output (1d location + # along cur dim)- a list of 1d neighbors locations. + # note that this whole operations is applied to each dim separately, + # this is why it is all in 1d. + # neighbors = tmp_input[field_of_view] is a tensor of order image_dims+1: + # for each output pixel (this time indicated in all dims), these are the + # values of the neighbors in the 1d field of view. note that we only + # consider neighbors along the current dim, but such set exists for every + # multi-dim location, hence the final tensor order is image_dims+1. + paddle.device.cuda.empty_cache() + neighbors = tmp_input[field_of_view] + + # weights is an order 2 tensor: for each output location along 1d- a list + # of weights matching the field of view. we augment it with ones, for + # broadcasting, so that when multiplies some tensor the weights affect + # only its first dim. + tmp_weights = fw.reshape(weights, (*weights.shape, *[1] * (n_dims - 1))) + + # now we simply multiply the weights with the neighbors, and then sum + # along the field of view, to get a single value per out pixel + tmp_output = (neighbors * tmp_weights).sum(1) + # we transpose back the resized dim to its original position + return fw_swapaxes(tmp_output, 0, dim, fw) + + +def apply_convs(input, scale_factor, in_sz, out_sz, weights, dim, pad_sz, pad_mode, fw): + # for this operations we assume the resized dim is the last one. + # so we transpose and will transpose back after multiplying + input = fw_swapaxes(input, dim, -1, fw) + + # the stride for all convs is the denominator of the scale factor + stride, num_convs = scale_factor.denominator, scale_factor.numerator + + # prepare an empty tensor for the output + tmp_out_shape = list(input.shape) + tmp_out_shape[-1] = out_sz + tmp_output = fw_empty(tuple(tmp_out_shape), fw, input.device) + + # iterate over the conv operations. we have as many as the numerator + # of the scale-factor. for each we need boundaries and a filter. + for conv_ind, (pad_sz, filt) in enumerate(zip(pad_sz, weights)): + # apply padding (we pad last dim, padding can be negative) + pad_dim = input.ndim - 1 + tmp_input = fw_pad(input, fw, pad_sz, pad_mode, dim=pad_dim) + + # apply convolution over last dim. store in the output tensor with + # positional strides so that when the loop is comlete conv results are + # interwind + tmp_output[..., conv_ind::num_convs] = fw_conv(tmp_input, filt, stride) + + return fw_swapaxes(tmp_output, -1, dim, fw) + + +def set_scale_and_out_sz(in_shape, out_shape, scale_factors, by_convs, scale_tolerance, max_numerator, eps, fw): + # eventually we must have both scale-factors and out-sizes for all in/out + # dims. 
however, we support many possible partial arguments + if scale_factors is None and out_shape is None: + raise ValueError("either scale_factors or out_shape should be " + "provided") + if out_shape is not None: + # if out_shape has less dims than in_shape, we defaultly resize the + # first dims for numpy and last dims for paddle + out_shape = (list(out_shape) + + list(in_shape[len(out_shape):]) if fw is numpy else list(in_shape[:-len(out_shape)]) + + list(out_shape)) + if scale_factors is None: + # if no scale given, we calculate it as the out to in ratio + # (not recomended) + scale_factors = [out_sz / in_sz for out_sz, in_sz in zip(out_shape, in_shape)] + if scale_factors is not None: + # by default, if a single number is given as scale, we assume resizing + # two dims (most common are images with 2 spatial dims) + scale_factors = (scale_factors if isinstance(scale_factors, (list, tuple)) else [scale_factors, scale_factors]) + # if less scale_factors than in_shape dims, we defaultly resize the + # first dims for numpy and last dims for paddle + scale_factors = (list(scale_factors) + [1] * (len(in_shape) - len(scale_factors)) if fw is numpy else [1] * + (len(in_shape) - len(scale_factors)) + list(scale_factors)) + if out_shape is None: + # when no out_shape given, it is calculated by multiplying the + # scale by the in_shape (not recomended) + out_shape = [ceil(scale_factor * in_sz) for scale_factor, in_sz in zip(scale_factors, in_shape)] + # next part intentionally after out_shape determined for stability + # we fix by_convs to be a list of truth values in case it is not + if not isinstance(by_convs, (list, tuple)): + by_convs = [by_convs] * len(out_shape) + + # next loop fixes the scale for each dim to be either frac or float. + # this is determined by by_convs and by tolerance for scale accuracy. + for ind, (sf, dim_by_convs) in enumerate(zip(scale_factors, by_convs)): + # first we fractionaize + if dim_by_convs: + frac = Fraction(1 / sf).limit_denominator(max_numerator) + frac = Fraction(numerator=frac.denominator, denominator=frac.numerator) + + # if accuracy is within tolerance scale will be frac. if not, then + # it will be float and the by_convs attr will be set false for + # this dim + if scale_tolerance is None: + scale_tolerance = eps + if dim_by_convs and abs(frac - sf) < scale_tolerance: + scale_factors[ind] = frac + else: + scale_factors[ind] = float(sf) + by_convs[ind] = False + + return scale_factors, out_shape, by_convs + + +def apply_antialiasing_if_needed(interp_method, support_sz, scale_factor, antialiasing): + # antialiasing is "stretching" the field of view according to the scale + # factor (only for downscaling). this is low-pass filtering. this + # requires modifying both the interpolation (stretching the 1d + # function and multiplying by the scale-factor) and the window size. 
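+    # (illustrative note) e.g. when downscaling by 2 (scale_factor 0.5) with the
+    # cubic kernel, the window grows from support_sz 4 to 8 and the 1d kernel is
+    # evaluated as 0.5 * cubic(0.5 * x), i.e. stretched by 1/scale and scaled by
+    # the scale factor.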
+    scale_factor = float(scale_factor)
+    if scale_factor >= 1.0 or not antialiasing:
+        return interp_method, support_sz
+    cur_interp_method = (lambda arg: scale_factor * interp_method(scale_factor * arg))
+    cur_support_sz = support_sz / scale_factor
+    return cur_interp_method, cur_support_sz
+
+
+def fw_ceil(x, fw):
+    if fw is numpy:
+        return fw.int_(fw.ceil(x))
+    else:
+        return paddle.cast(x.ceil(), dtype='int64')
+
+
+def fw_floor(x, fw):
+    if fw is numpy:
+        return fw.int_(fw.floor(x))
+    else:
+        return paddle.cast(x.floor(), dtype='int64')
+
+
+def fw_cat(x, fw):
+    if fw is numpy:
+        return fw.concatenate(x)
+    else:
+        return fw.concat(x)
+
+
+def fw_swapaxes(x, ax_1, ax_2, fw):
+    if fw is numpy:
+        return fw.swapaxes(x, ax_1, ax_2)
+    else:
+        if ax_1 == -1:
+            ax_1 = len(x.shape) - 1
+        if ax_2 == -1:
+            ax_2 = len(x.shape) - 1
+        perm0 = list(range(len(x.shape)))
+        temp = ax_1
+        perm0[temp] = ax_2
+        perm0[ax_2] = temp
+        return fw.transpose(x, perm0)
+
+
+def fw_pad(x, fw, pad_sz, pad_mode, dim=0):
+    if pad_sz == (0, 0):
+        return x
+    if fw is numpy:
+        pad_vec = [(0, 0)] * x.ndim
+        pad_vec[dim] = pad_sz
+        return fw.pad(x, pad_width=pad_vec, mode=pad_mode)
+    else:
+        if x.ndim < 3:
+            x = x[None, None, ...]
+
+        pad_vec = [0] * ((x.ndim - 2) * 2)
+        pad_vec[0:2] = pad_sz
+        return fw_swapaxes(fw.nn.functional.pad(fw_swapaxes(x, dim, -1, fw), pad=pad_vec, mode=pad_mode), dim, -1, fw)
+
+
+def fw_conv(input, filter, stride):
+    # we want to apply 1d conv to any nd array. the way to do it is to reshape
+    # the input to a 4D tensor. first two dims are singletons, 3rd dim stores
+    # all the spatial dims that we are not convolving along now. then we can
+    # apply conv2d with a 1xK filter. This convolves the same way all the other
+    # dims stored in the 3d dim. like depthwise conv over these.
+    # TODO: numpy support
+    reshaped_input = input.reshape([1, 1, -1, input.shape[-1]])
+    reshaped_output = paddle.nn.functional.conv2d(reshaped_input, filter.reshape([1, 1, 1, -1]), stride=(1, stride))
+    return reshaped_output.reshape(list(input.shape[:-1]) + [-1])
+
+
+def fw_arange(upper_bound, fw, device):
+    if fw is numpy:
+        return fw.arange(upper_bound)
+    else:
+        return fw.arange(upper_bound)
+
+
+def fw_empty(shape, fw, device):
+    if fw is numpy:
+        return fw.empty(shape)
+    else:
+        return fw.empty(shape=shape)
diff --git a/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/README.md b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/README.md
new file mode 100644
index 00000000..711671ba
--- /dev/null
+++ b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/README.md
@@ -0,0 +1,2 @@
+# Diffusion model (Paddle)
+This module implements a diffusion model which accepts a text prompt and outputs images semantically close to the text. The code is rewritten in Paddle and mainly refers to two projects: [jina-ai/discoart](https://github.com/jina-ai/discoart) and [openai/guided-diffusion](https://github.com/openai/guided-diffusion). Thanks for their wonderful work.
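+
+A minimal usage sketch (assuming this package can be imported as `reverse_diffusion` and that, as in upstream discoart, each generated `Document` carries its image in `.uri`):
+
+```python
+from reverse_diffusion import create
+
+# run the CLIP-guided reverse diffusion process; returns a docarray DocumentArray
+# with one Document per batch
+da = create(text_prompts=['A beautiful painting of a lighthouse, trending on artstation.'],
+            width_height=[1280, 768],
+            n_batches=1)
+
+# hypothetical: save the first generated image (requires the Document's .uri to be populated)
+da[0].save_uri_to_file('lighthouse.png')
+```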
diff --git a/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/__init__.py b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/__init__.py new file mode 100755 index 00000000..39fc908d --- /dev/null +++ b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/__init__.py @@ -0,0 +1,156 @@ +''' +https://github.com/jina-ai/discoart/blob/main/discoart/__init__.py +''' +import os +import warnings + +os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE' + +__all__ = ['create'] + +import sys + +__resources_path__ = os.path.join( + os.path.dirname(sys.modules.get(__package__).__file__ if __package__ in sys.modules else __file__), + 'resources', +) + +import gc + +# check if GPU is available +import paddle + +# download and load models, this will take some time on the first load + +from .helper import load_all_models, load_diffusion_model, load_clip_models + +model_config, secondary_model = load_all_models('512x512_diffusion_uncond_finetune_008100', use_secondary_model=True) + +from typing import TYPE_CHECKING, overload, List, Optional + +if TYPE_CHECKING: + from docarray import DocumentArray, Document + +_clip_models_cache = {} + +# begin_create_overload + + +@overload +def create(text_prompts: Optional[List[str]] = [ + 'A beautiful painting of a singular lighthouse, shining its light across a tumultuous sea of blood by greg rutkowski and thomas kinkade, Trending on artstation.', + 'yellow color scheme', +], + init_image: Optional[str] = None, + width_height: Optional[List[int]] = [1280, 768], + skip_steps: Optional[int] = 10, + steps: Optional[int] = 250, + cut_ic_pow: Optional[int] = 1, + init_scale: Optional[int] = 1000, + clip_guidance_scale: Optional[int] = 5000, + tv_scale: Optional[int] = 0, + range_scale: Optional[int] = 150, + sat_scale: Optional[int] = 0, + cutn_batches: Optional[int] = 4, + diffusion_model: Optional[str] = '512x512_diffusion_uncond_finetune_008100', + use_secondary_model: Optional[bool] = True, + diffusion_sampling_mode: Optional[str] = 'ddim', + perlin_init: Optional[bool] = False, + perlin_mode: Optional[str] = 'mixed', + seed: Optional[int] = None, + eta: Optional[float] = 0.8, + clamp_grad: Optional[bool] = True, + clamp_max: Optional[float] = 0.05, + randomize_class: Optional[bool] = True, + clip_denoised: Optional[bool] = False, + fuzzy_prompt: Optional[bool] = False, + rand_mag: Optional[float] = 0.05, + cut_overview: Optional[str] = '[12]*400+[4]*600', + cut_innercut: Optional[str] = '[4]*400+[12]*600', + cut_icgray_p: Optional[str] = '[0.2]*400+[0]*600', + display_rate: Optional[int] = 10, + n_batches: Optional[int] = 4, + batch_size: Optional[int] = 1, + batch_name: Optional[str] = '', + clip_models: Optional[list] = ['ViTB32', 'ViTB16', 'RN50'], + output_dir: Optional[str] = 'discoart_output') -> 'DocumentArray': + """ + Create Disco Diffusion artworks and save the result into a DocumentArray. + + :param text_prompts: Phrase, sentence, or string of words and phrases describing what the image should look like. The words will be analyzed by the AI and will guide the diffusion process toward the image(s) you describe. These can include commas and weights to adjust the relative importance of each element. E.g. 
"A beautiful painting of a singular lighthouse, shining its light across a tumultuous sea of blood by greg rutkowski and thomas kinkade, Trending on artstation."Notice that this prompt loosely follows a structure: [subject], [prepositional details], [setting], [meta modifiers and artist]; this is a good starting point for your experiments. Developing text prompts takes practice and experience, and is not the subject of this guide. If you are a beginner to writing text prompts, a good place to start is on a simple AI art app like Night Cafe, starry ai or WOMBO prior to using DD, to get a feel for how text gets translated into images by GAN tools. These other apps use different technologies, but many of the same principles apply. + :param init_image: Recall that in the image sequence above, the first image shown is just noise. If an init_image is provided, diffusion will replace the noise with the init_image as its starting state. To use an init_image, upload the image to the Colab instance or your Google Drive, and enter the full image path here. If using an init_image, you may need to increase skip_steps to ~ 50% of total steps to retain the character of the init. See skip_steps above for further discussion. + :param width_height: Desired final image size, in pixels. You can have a square, wide, or tall image, but each edge length should be set to a multiple of 64px, and a minimum of 512px on the default CLIP model setting. If you forget to use multiples of 64px in your dimensions, DD will adjust the dimensions of your image to make it so. + :param skip_steps: Consider the chart shown here. Noise scheduling (denoise strength) starts very high and progressively gets lower and lower as diffusion steps progress. The noise levels in the first few steps are very high, so images change dramatically in early steps.As DD moves along the curve, noise levels (and thus the amount an image changes per step) declines, and image coherence from one step to the next increases.The first few steps of denoising are often so dramatic that some steps (maybe 10-15% of total) can be skipped without affecting the final image. You can experiment with this as a way to cut render times.If you skip too many steps, however, the remaining noise may not be high enough to generate new content, and thus may not have ‘time left’ to finish an image satisfactorily.Also, depending on your other settings, you may need to skip steps to prevent CLIP from overshooting your goal, resulting in ‘blown out’ colors (hyper saturated, solid white, or solid black regions) or otherwise poor image quality. Consider that the denoising process is at its strongest in the early steps, so skipping steps can sometimes mitigate other problems.Lastly, if using an init_image, you will need to skip ~50% of the diffusion steps to retain the shapes in the original init image. However, if you’re using an init_image, you can also adjust skip_steps up or down for creative reasons. With low skip_steps you can get a result "inspired by" the init_image which will retain the colors and rough layout and shapes but look quite different. With high skip_steps you can preserve most of the init_image contents and just do fine tuning of the texture. + :param steps: When creating an image, the denoising curve is subdivided into steps for processing. Each step (or iteration) involves the AI looking at subsets of the image called ‘cuts’ and calculating the ‘direction’ the image should be guided to be more like the prompt. 
Then it adjusts the image with the help of the diffusion denoiser, and moves to the next step.Increasing steps will provide more opportunities for the AI to adjust the image, and each adjustment will be smaller, and thus will yield a more precise, detailed image. Increasing steps comes at the expense of longer render times. Also, while increasing steps should generally increase image quality, there is a diminishing return on additional steps beyond 250 - 500 steps. However, some intricate images can take 1000, 2000, or more steps. It is really up to the user. Just know that the render time is directly related to the number of steps, and many other parameters have a major impact on image quality, without costing additional time. + :param cut_ic_pow: This sets the size of the border used for inner cuts. High cut_ic_pow values have larger borders, and therefore the cuts themselves will be smaller and provide finer details. If you have too many or too-small inner cuts, you may lose overall image coherency and/or it may cause an undesirable ‘mosaic’ effect. Low cut_ic_pow values will allow the inner cuts to be larger, helping image coherency while still helping with some details. + :param init_scale: This controls how strongly CLIP will try to match the init_image provided. This is balanced against the clip_guidance_scale (CGS) above. Too much init scale, and the image won’t change much during diffusion. Too much CGS and the init image will be lost. + :param clip_guidance_scale: CGS is one of the most important parameters you will use. It tells DD how strongly you want CLIP to move toward your prompt each timestep. Higher is generally better, but if CGS is too strong it will overshoot the goal and distort the image. So a happy medium is needed, and it takes experience to learn how to adjust CGS. Note that this parameter generally scales with image dimensions. In other words, if you increase your total dimensions by 50% (e.g. a change from 512 x 512 to 512 x 768), then to maintain the same effect on the image, you’d want to increase clip_guidance_scale from 5000 to 7500. Of the basic settings, clip_guidance_scale, steps and skip_steps are the most important contributors to image quality, so learn them well. + :param tv_scale: Total variance denoising. Optional, set to zero to turn off. Controls ‘smoothness’ of final output. If used, tv_scale will try to smooth out your final image to reduce overall noise. If your image is too ‘crunchy’, increase tv_scale. TV denoising is good at preserving edges while smoothing away noise in flat regions. See https://en.wikipedia.org/wiki/Total_variation_denoising + :param range_scale: Optional, set to zero to turn off. Used for adjustment of color contrast. Lower range_scale will increase contrast. Very low numbers create a reduced color palette, resulting in more vibrant or poster-like images. Higher range_scale will reduce contrast, for more muted images. + :param sat_scale: Saturation scale. Optional, set to zero to turn off. If used, sat_scale will help mitigate oversaturation. If your image is too saturated, increase sat_scale to reduce the saturation. + :param cutn_batches: Each iteration, the AI cuts the image into smaller pieces known as cuts, and compares each cut to the prompt to decide how to guide the next diffusion step. More cuts can generally lead to better images, since DD has more chances to fine-tune the image precision in each timestep. 
Additional cuts are memory intensive, however, and if DD tries to evaluate too many cuts at once, it can run out of memory. You can use cutn_batches to increase cuts per timestep without increasing memory usage. At the default settings, DD is scheduled to do 16 cuts per timestep. If cutn_batches is set to 1, there will indeed only be 16 cuts total per timestep. However, if cutn_batches is increased to 4, DD will do 64 cuts total in each timestep, divided into 4 sequential batches of 16 cuts each. Because the cuts are being evaluated only 16 at a time, DD uses the memory required for only 16 cuts, but gives you the quality benefit of 64 cuts. The tradeoff, of course, is that this will take ~4 times as long to render each image.So, (scheduled cuts) x (cutn_batches) = (total cuts per timestep). Increasing cutn_batches will increase render times, however, as the work is being done sequentially. DD’s default cut schedule is a good place to start, but the cut schedule can be adjusted in the Cutn Scheduling section, explained below. + :param diffusion_model: Diffusion_model of choice. + :param use_secondary_model: Option to use a secondary purpose-made diffusion model to clean up interim diffusion images for CLIP evaluation. If this option is turned off, DD will use the regular (large) diffusion model. Using the secondary model is faster - one user reported a 50% improvement in render speed! However, the secondary model is much smaller, and may reduce image quality and detail. I suggest you experiment with this. + :param diffusion_sampling_mode: Two alternate diffusion denoising algorithms. ddim has been around longer, and is more established and tested. plms is a newly added alternate method that promises good diffusion results in fewer steps, but has not been as fully tested and may have side effects. This new plms mode is actively being researched in the #settings-and-techniques channel in the DD Discord. + :param perlin_init: Normally, DD will use an image filled with random noise as a starting point for the diffusion curve. If perlin_init is selected, DD will instead use a Perlin noise model as an initial state. Perlin has very interesting characteristics, distinct from random noise, so it’s worth experimenting with this for your projects. Beyond perlin, you can, of course, generate your own noise images (such as with GIMP, etc) and use them as an init_image (without skipping steps). Choosing perlin_init does not affect the actual diffusion process, just the starting point for the diffusion. Please note that selecting a perlin_init will replace and override any init_image you may have specified. Further, because the 2D, 3D and video animation systems all rely on the init_image system, if you enable Perlin while using animation modes, the perlin_init will jump in front of any previous image or video input, and DD will NOT give you the expected sequence of coherent images. All of that said, using Perlin and animation modes together do make a very colorful rainbow effect, which can be used creatively. + :param perlin_mode: sets type of Perlin noise: colored, gray, or a mix of both, giving you additional options for noise types. Experiment to see what these do in your projects. + :param seed: Deep in the diffusion code, there is a random number ‘seed’ which is used as the basis for determining the initial state of the diffusion. By default, this is random, but you can also specify your own seed. 
This is useful if you like a particular result and would like to run more iterations that will be similar. After each run, the actual seed value used will be reported in the parameters report, and can be reused if desired by entering seed # here. If a specific numerical seed is used repeatedly, the resulting images will be quite similar but not identical. + :param eta: eta (greek letter η) is a diffusion model variable that mixes in a random amount of scaled noise into each timestep. 0 is no noise, 1.0 is more noise. As with most DD parameters, you can go below zero for eta, but it may give you unpredictable results. The steps parameter has a close relationship with the eta parameter. If you set eta to 0, then you can get decent output with only 50-75 steps. Setting eta to 1.0 favors higher step counts, ideally around 250 and up. eta has a subtle, unpredictable effect on image, so you’ll need to experiment to see how this affects your projects. + :param clamp_grad: As I understand it, clamp_grad is an internal limiter that stops DD from producing extreme results. Try your images with and without clamp_grad. If the image changes drastically with clamp_grad turned off, it probably means your clip_guidance_scale is too high and should be reduced. + :param clamp_max: Sets the value of the clamp_grad limitation. Default is 0.05, providing for smoother, more muted coloration in images, but setting higher values (0.15-0.3) can provide interesting contrast and vibrancy. + :param fuzzy_prompt: Controls whether to add multiple noisy prompts to the prompt losses. If True, can increase variability of image output. Experiment with this. + :param rand_mag: Affects only the fuzzy_prompt. Controls the magnitude of the random noise added by fuzzy_prompt. + :param cut_overview: The schedule of overview cuts + :param cut_innercut: The schedule of inner cuts + :param cut_icgray_p: This sets the size of the border used for inner cuts. High cut_ic_pow values have larger borders, and therefore the cuts themselves will be smaller and provide finer details. If you have too many or too-small inner cuts, you may lose overall image coherency and/or it may cause an undesirable ‘mosaic’ effect. Low cut_ic_pow values will allow the inner cuts to be larger, helping image coherency while still helping with some details. + :param display_rate: During a diffusion run, you can monitor the progress of each image being created with this variable. If display_rate is set to 50, DD will show you the in-progress image every 50 timesteps. Setting this to a lower value, like 5 or 10, is a good way to get an early peek at where your image is heading. If you don’t like the progression, just interrupt execution, change some settings, and re-run. If you are planning a long, unmonitored batch, it’s better to set display_rate equal to steps, because displaying interim images does slow Colab down slightly. + :param n_batches: This variable sets the number of still images you want DD to create. If you are using an animation mode (see below for details) DD will ignore n_batches and create a single set of animated frames based on the animation settings. + :param batch_name: The name of the batch, the batch id will be named as "discoart-[batch_name]-seed". To avoid your artworks be overridden by other users, please use a unique name. + :param clip_models: CLIP Model selectors. ViTB32, ViTB16, ViTL14, RN101, RN50, RN50x4, RN50x16, RN50x64.These various CLIP models are available for you to use during image generation. 
Models have different styles or ‘flavors,’ so look around. You can mix in multiple models as well for different results. However, keep in mind that some models are extremely memory-hungry, and turning on additional models will take additional memory and may cause a crash.The rough order of speed/mem usage is (smallest/fastest to largest/slowest):VitB32RN50RN101VitB16RN50x4RN50x16RN50x64ViTL14For RN50x64 & ViTL14 you may need to use fewer cuts, depending on your VRAM. + :return: a DocumentArray object that has `n_batches` Documents + """ + + +# end_create_overload + + +@overload +def create(init_document: 'Document') -> 'DocumentArray': + """ + Create an artwork using a DocArray ``Document`` object as initial state. + :param init_document: its ``.tags`` will be used as parameters, ``.uri`` (if present) will be used as init image. + :return: a DocumentArray object that has `n_batches` Documents + """ + + +def create(**kwargs) -> 'DocumentArray': + from .config import load_config + from .runner import do_run + + if 'init_document' in kwargs: + d = kwargs['init_document'] + _kwargs = d.tags + if not _kwargs: + warnings.warn('init_document has no .tags, fallback to default config') + if d.uri: + _kwargs['init_image'] = kwargs['init_document'].uri + else: + warnings.warn('init_document has no .uri, fallback to no init image') + kwargs.pop('init_document') + if kwargs: + warnings.warn('init_document has .tags and .uri, but kwargs are also present, will override .tags') + _kwargs.update(kwargs) + _args = load_config(user_config=_kwargs) + else: + _args = load_config(user_config=kwargs) + + model, diffusion = load_diffusion_model(model_config, _args.diffusion_model, steps=_args.steps) + + clip_models = load_clip_models(enabled=_args.clip_models, clip_models=_clip_models_cache) + + gc.collect() + paddle.device.cuda.empty_cache() + try: + return do_run(_args, (model, diffusion, clip_models, secondary_model)) + except KeyboardInterrupt: + pass diff --git a/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/config.py b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/config.py new file mode 100755 index 00000000..0cbc71e6 --- /dev/null +++ b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/config.py @@ -0,0 +1,77 @@ +''' +https://github.com/jina-ai/discoart/blob/main/discoart/config.py +''' +import copy +import random +import warnings +from types import SimpleNamespace +from typing import Dict + +import yaml +from yaml import Loader + +from . 
import __resources_path__ + +with open(f'{__resources_path__}/default.yml') as ymlfile: + default_args = yaml.load(ymlfile, Loader=Loader) + + +def load_config(user_config: Dict, ): + cfg = copy.deepcopy(default_args) + + if user_config: + cfg.update(**user_config) + + for k in user_config.keys(): + if k not in cfg: + warnings.warn(f'unknown argument {k}, ignored') + + for k, v in cfg.items(): + if k in ('batch_size', 'display_rate', 'seed', 'skip_steps', 'steps', 'n_batches', + 'cutn_batches') and isinstance(v, float): + cfg[k] = int(v) + if k == 'width_height': + cfg[k] = [int(vv) for vv in v] + + cfg.update(**{ + 'seed': cfg['seed'] or random.randint(0, 2**32), + }) + + if cfg['batch_name']: + da_name = f'{__package__}-{cfg["batch_name"]}-{cfg["seed"]}' + else: + da_name = f'{__package__}-{cfg["seed"]}' + warnings.warn('you did not set `batch_name`, set it to have unique session ID') + + cfg.update(**{'name_docarray': da_name}) + + print_args_table(cfg) + + return SimpleNamespace(**cfg) + + +def print_args_table(cfg): + from rich.table import Table + from rich import box + from rich.console import Console + + console = Console() + + param_str = Table( + title=cfg['name_docarray'], + box=box.ROUNDED, + highlight=True, + title_justify='left', + ) + param_str.add_column('Argument', justify='right') + param_str.add_column('Value', justify='left') + + for k, v in sorted(cfg.items()): + value = str(v) + + if not default_args.get(k, None) == v: + value = f'[b]{value}[/]' + + param_str.add_row(k, value) + + console.print(param_str) diff --git a/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/helper.py b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/helper.py new file mode 100755 index 00000000..6fc4196b --- /dev/null +++ b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/helper.py @@ -0,0 +1,137 @@ +''' +This code is rewritten by Paddle based on Jina-ai/discoart. 
+https://github.com/jina-ai/discoart/blob/main/discoart/helper.py +''' +import hashlib +import logging +import os +import subprocess +import sys +from os.path import expanduser +from pathlib import Path +from typing import Any +from typing import Dict +from typing import List + +import paddle + + +def _get_logger(): + logger = logging.getLogger(__package__) + logger.setLevel("INFO") + ch = logging.StreamHandler() + ch.setLevel("INFO") + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + ch.setFormatter(formatter) + logger.addHandler(ch) + return logger + + +logger = _get_logger() + + +def load_clip_models(enabled: List[str], clip_models: Dict[str, Any] = {}): + + import disco_diffusion_clip_vitb32.clip.clip as clip + from disco_diffusion_clip_vitb32.clip.clip import build_model, tokenize, transform + + # load enabled models + for k in enabled: + if k not in clip_models: + clip_models[k] = build_model(name=k) + clip_models[k].eval() + for parameter in clip_models[k].parameters(): + parameter.stop_gradient = True + + # disable not enabled models to save memory + for k in clip_models: + if k not in enabled: + clip_models.pop(k) + + return list(clip_models.values()) + + +def load_all_models(diffusion_model, use_secondary_model): + from .model.script_util import ( + model_and_diffusion_defaults, ) + + model_config = model_and_diffusion_defaults() + + if diffusion_model == '512x512_diffusion_uncond_finetune_008100': + model_config.update({ + 'attention_resolutions': '32, 16, 8', + 'class_cond': False, + 'diffusion_steps': 1000, # No need to edit this, it is taken care of later. + 'rescale_timesteps': True, + 'timestep_respacing': 250, # No need to edit this, it is taken care of later. + 'image_size': 512, + 'learn_sigma': True, + 'noise_schedule': 'linear', + 'num_channels': 256, + 'num_head_channels': 64, + 'num_res_blocks': 2, + 'resblock_updown': True, + 'use_fp16': False, + 'use_scale_shift_norm': True, + }) + elif diffusion_model == '256x256_diffusion_uncond': + model_config.update({ + 'attention_resolutions': '32, 16, 8', + 'class_cond': False, + 'diffusion_steps': 1000, # No need to edit this, it is taken care of later. + 'rescale_timesteps': True, + 'timestep_respacing': 250, # No need to edit this, it is taken care of later. 
+ 'image_size': 256, + 'learn_sigma': True, + 'noise_schedule': 'linear', + 'num_channels': 256, + 'num_head_channels': 64, + 'num_res_blocks': 2, + 'resblock_updown': True, + 'use_fp16': False, + 'use_scale_shift_norm': True, + }) + + secondary_model = None + if use_secondary_model: + from .model.sec_diff import SecondaryDiffusionImageNet2 + secondary_model = SecondaryDiffusionImageNet2() + model_dict = paddle.load( + os.path.join(os.path.dirname(__file__), 'pre_trained', 'secondary_model_imagenet_2.pdparams')) + secondary_model.set_state_dict(model_dict) + secondary_model.eval() + for parameter in secondary_model.parameters(): + parameter.stop_gradient = True + + return model_config, secondary_model + + +def load_diffusion_model(model_config, diffusion_model, steps): + from .model.script_util import ( + create_model_and_diffusion, ) + + timestep_respacing = f'ddim{steps}' + diffusion_steps = (1000 // steps) * steps if steps < 1000 else steps + model_config.update({ + 'timestep_respacing': timestep_respacing, + 'diffusion_steps': diffusion_steps, + }) + + model, diffusion = create_model_and_diffusion(**model_config) + model.set_state_dict( + paddle.load(os.path.join(os.path.dirname(__file__), 'pre_trained', f'{diffusion_model}.pdparams'))) + model.eval() + for name, param in model.named_parameters(): + param.stop_gradient = True + + return model, diffusion + + +def parse_prompt(prompt): + if prompt.startswith('http://') or prompt.startswith('https://'): + vals = prompt.rsplit(':', 2) + vals = [vals[0] + ':' + vals[1], *vals[2:]] + else: + vals = prompt.rsplit(':', 1) + vals = vals + ['', '1'][len(vals):] + return vals[0], float(vals[1]) diff --git a/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/__init__.py b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/__init__.py new file mode 100755 index 00000000..46680066 --- /dev/null +++ b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/__init__.py @@ -0,0 +1,3 @@ +""" +Codebase for "Improved Denoising Diffusion Probabilistic Models" implemented by Paddle. +""" diff --git a/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/gaussian_diffusion.py b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/gaussian_diffusion.py new file mode 100755 index 00000000..86cd2c65 --- /dev/null +++ b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/gaussian_diffusion.py @@ -0,0 +1,1214 @@ +""" +Diffusion model implemented by Paddle. +This code is rewritten based on Pytorch version of of Ho et al's diffusion models: +https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py +""" +import enum +import math + +import numpy as np +import paddle + +from .losses import discretized_gaussian_log_likelihood +from .losses import normal_kl +from .nn import mean_flat + + +def get_named_beta_schedule(schedule_name, num_diffusion_timesteps): + """ + Get a pre-defined beta schedule for the given name. + + The beta schedule library consists of beta schedules which remain similar + in the limit of num_diffusion_timesteps. + Beta schedules may be added, but should not be removed or changed once + they are committed to maintain backwards compatibility. + """ + if schedule_name == "linear": + # Linear schedule from Ho et al, extended to work for any number of + # diffusion steps. 
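+        # (illustrative note) for the canonical 1000-step schedule this gives
+        # betas from 1e-4 to 0.02, matching Ho et al.; for other step counts the
+        # endpoints are rescaled by 1000 / num_diffusion_timesteps.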
+ scale = 1000 / num_diffusion_timesteps + beta_start = scale * 0.0001 + beta_end = scale * 0.02 + return np.linspace(beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64) + elif schedule_name == "cosine": + return betas_for_alpha_bar( + num_diffusion_timesteps, + lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2)**2, + ) + else: + raise NotImplementedError(f"unknown beta schedule: {schedule_name}") + + +def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, + which defines the cumulative product of (1-beta) over time from t = [0,1]. + + :param num_diffusion_timesteps: the number of betas to produce. + :param alpha_bar: a lambda that takes an argument t from 0 to 1 and + produces the cumulative product of (1-beta) up to that + part of the diffusion process. + :param max_beta: the maximum beta to use; use values lower than 1 to + prevent singularities. + """ + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + return np.array(betas) + + +class ModelMeanType(enum.Enum): + """ + Which type of output the model predicts. + """ + + PREVIOUS_X = enum.auto() # the model predicts x_{t-1} + START_X = enum.auto() # the model predicts x_0 + EPSILON = enum.auto() # the model predicts epsilon + + +class ModelVarType(enum.Enum): + """ + What is used as the model's output variance. + + The LEARNED_RANGE option has been added to allow the model to predict + values between FIXED_SMALL and FIXED_LARGE, making its job easier. + """ + + LEARNED = enum.auto() + FIXED_SMALL = enum.auto() + FIXED_LARGE = enum.auto() + LEARNED_RANGE = enum.auto() + + +class LossType(enum.Enum): + MSE = enum.auto() # use raw MSE loss (and KL when learning variances) + RESCALED_MSE = (enum.auto()) # use raw MSE loss (with RESCALED_KL when learning variances) + KL = enum.auto() # use the variational lower-bound + RESCALED_KL = enum.auto() # like KL, but rescale to estimate the full VLB + + def is_vb(self): + return self == LossType.KL or self == LossType.RESCALED_KL + + +class GaussianDiffusion: + """ + Utilities for training and sampling diffusion models. + + Ported directly from here, and then adapted over time to further experimentation. + https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42 + + :param betas: a 1-D numpy array of betas for each diffusion timestep, + starting at T and going to 1. + :param model_mean_type: a ModelMeanType determining what the model outputs. + :param model_var_type: a ModelVarType determining how variance is output. + :param loss_type: a LossType determining the loss function to use. + :param rescale_timesteps: if True, pass floating point timesteps into the + model so that they are always scaled like in the + original paper (0 to 1000). + """ + + def __init__( + self, + *, + betas, + model_mean_type, + model_var_type, + loss_type, + rescale_timesteps=False, + ): + self.model_mean_type = model_mean_type + self.model_var_type = model_var_type + self.loss_type = loss_type + self.rescale_timesteps = rescale_timesteps + + # Use float64 for accuracy. 
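+ # The buffers computed below are the standard DDPM quantities: alphas_cumprod is
+ # alpha_bar_t = prod_{s<=t}(1 - beta_s); the sqrt_* arrays parameterize
+ # q(x_t | x_0) in q_sample(); posterior_variance and posterior_mean_coef1/2
+ # parameterize the posterior q(x_{t-1} | x_t, x_0) used during sampling.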
+ betas = np.array(betas, dtype=np.float64) + self.betas = betas + assert len(betas.shape) == 1, "betas must be 1-D" + assert (betas > 0).all() and (betas <= 1).all() + + self.num_timesteps = int(betas.shape[0]) + + alphas = 1.0 - betas + self.alphas_cumprod = np.cumprod(alphas, axis=0) + self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1]) + self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0) + assert self.alphas_cumprod_prev.shape == (self.num_timesteps, ) + + # calculations for diffusion q(x_t | x_{t-1}) and others + self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod) + self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod) + self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod) + self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod) + self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1) + + # calculations for posterior q(x_{t-1} | x_t, x_0) + self.posterior_variance = (betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)) + # log calculation clipped because the posterior variance is 0 at the + # beginning of the diffusion chain. + self.posterior_log_variance_clipped = np.log(np.append(self.posterior_variance[1], self.posterior_variance[1:])) + self.posterior_mean_coef1 = (betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)) + self.posterior_mean_coef2 = ((1.0 - self.alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - self.alphas_cumprod)) + + def q_mean_variance(self, x_start, t): + """ + Get the distribution q(x_t | x_0). + + :param x_start: the [N x C x ...] tensor of noiseless inputs. + :param t: the number of diffusion steps (minus 1). Here, 0 means one step. + :return: A tuple (mean, variance, log_variance), all of x_start's shape. + """ + mean = (_extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start) + variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape) + log_variance = _extract_into_tensor(self.log_one_minus_alphas_cumprod, t, x_start.shape) + return mean, variance, log_variance + + def q_sample(self, x_start, t, noise=None): + """ + Diffuse the data for a given number of diffusion steps. + + In other words, sample from q(x_t | x_0). + + :param x_start: the initial data batch. + :param t: the number of diffusion steps (minus 1). Here, 0 means one step. + :param noise: if specified, the split-out normal noise. + :return: A noisy version of x_start. 
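+
+ In closed form this returns
+ sqrt(alphas_cumprod[t]) * x_start + sqrt(1 - alphas_cumprod[t]) * noise,
+ i.e. a sample from N(sqrt(alpha_bar_t) * x_0, (1 - alpha_bar_t) * I).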
+ """ + if noise is None: + # noise = th.randn_like(x_start) + noise = paddle.randn(x_start.shape, x_start.dtype) + assert noise.shape == x_start.shape + return (_extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start + + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise) + + def q_posterior_mean_variance(self, x_start, x_t, t): + """ + Compute the mean and variance of the diffusion posterior: + + q(x_{t-1} | x_t, x_0) + + """ + assert x_start.shape == x_t.shape + posterior_mean = (_extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start + + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t) + posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape) + posterior_log_variance_clipped = _extract_into_tensor(self.posterior_log_variance_clipped, t, x_t.shape) + assert (posterior_mean.shape[0] == posterior_variance.shape[0] == posterior_log_variance_clipped.shape[0] == + x_start.shape[0]) + return posterior_mean, posterior_variance, posterior_log_variance_clipped + + def p_mean_variance(self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None): + """ + Apply the model to get p(x_{t-1} | x_t), as well as a prediction of + the initial x, x_0. + + :param model: the model, which takes a signal and a batch of timesteps + as input. + :param x: the [N x C x ...] tensor at time t. + :param t: a 1-D Tensor of timesteps. + :param clip_denoised: if True, clip the denoised signal into [-1, 1]. + :param denoised_fn: if not None, a function which applies to the + x_start prediction before it is used to sample. Applies before + clip_denoised. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + :return: a dict with the following keys: + - 'mean': the model mean output. + - 'variance': the model variance output. + - 'log_variance': the log of 'variance'. + - 'pred_xstart': the prediction for x_0. + """ + if model_kwargs is None: + model_kwargs = {} + + B, C = x.shape[:2] + assert t.shape == [B] + model_output = model(x, self._scale_timesteps(t), **model_kwargs) + + if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]: + assert model_output.shape == [B, C * 2, *x.shape[2:]] + model_output, model_var_values = paddle.split(model_output, 2, axis=1) + if self.model_var_type == ModelVarType.LEARNED: + model_log_variance = model_var_values + model_variance = paddle.exp(model_log_variance) + else: + min_log = _extract_into_tensor(self.posterior_log_variance_clipped, t, x.shape) + max_log = _extract_into_tensor(np.log(self.betas), t, x.shape) + # The model_var_values is [-1, 1] for [min_var, max_var]. + frac = (model_var_values + 1) / 2 + model_log_variance = frac * max_log + (1 - frac) * min_log + model_variance = paddle.exp(model_log_variance) + else: + model_variance, model_log_variance = { + # for fixedlarge, we set the initial (log-)variance like so + # to get a better decoder log likelihood. 
+ ModelVarType.FIXED_LARGE: ( + np.append(self.posterior_variance[1], self.betas[1:]), + np.log(np.append(self.posterior_variance[1], self.betas[1:])), + ), + ModelVarType.FIXED_SMALL: ( + self.posterior_variance, + self.posterior_log_variance_clipped, + ), + }[self.model_var_type] + model_variance = _extract_into_tensor(model_variance, t, x.shape) + model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape) + + def process_xstart(x): + if denoised_fn is not None: + x = denoised_fn(x) + if clip_denoised: + return x.clamp(-1, 1) + return x + + if self.model_mean_type == ModelMeanType.PREVIOUS_X: + pred_xstart = process_xstart(self._predict_xstart_from_xprev(x_t=x, t=t, xprev=model_output)) + model_mean = model_output + elif self.model_mean_type in [ModelMeanType.START_X, ModelMeanType.EPSILON]: + if self.model_mean_type == ModelMeanType.START_X: + pred_xstart = process_xstart(model_output) + else: + pred_xstart = process_xstart(self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output)) + model_mean, _, _ = self.q_posterior_mean_variance(x_start=pred_xstart, x_t=x, t=t) + else: + raise NotImplementedError(self.model_mean_type) + + assert (model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape) + return { + "mean": model_mean, + "variance": model_variance, + "log_variance": model_log_variance, + "pred_xstart": pred_xstart, + } + + def _predict_xstart_from_eps(self, x_t, t, eps): + assert x_t.shape == eps.shape + return (_extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - + _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps) + + def _predict_xstart_from_xprev(self, x_t, t, xprev): + assert x_t.shape == xprev.shape + return ( # (xprev - coef2*x_t) / coef1 + _extract_into_tensor(1.0 / self.posterior_mean_coef1, t, x_t.shape) * xprev - + _extract_into_tensor(self.posterior_mean_coef2 / self.posterior_mean_coef1, t, x_t.shape) * x_t) + + def _predict_eps_from_xstart(self, x_t, t, pred_xstart): + return (_extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - + pred_xstart) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) + + def _scale_timesteps(self, t): + if self.rescale_timesteps: + return paddle.cast((t), 'float32') * (1000.0 / self.num_timesteps) + return t + + def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None): + """ + Compute the mean for the previous step, given a function cond_fn that + computes the gradient of a conditional log probability with respect to + x. In particular, cond_fn computes grad(log(p(y|x))), and we want to + condition on y. + + This uses the conditioning strategy from Sohl-Dickstein et al. (2015). + """ + gradient = cond_fn(x, self._scale_timesteps(t), **model_kwargs) + new_mean = (paddle.cast((p_mean_var["mean"]), 'float32') + p_mean_var["variance"] * paddle.cast( + (gradient), 'float32')) + return new_mean + + def condition_mean_with_grad(self, cond_fn, p_mean_var, x, t, model_kwargs=None): + """ + Compute the mean for the previous step, given a function cond_fn that + computes the gradient of a conditional log probability with respect to + x. In particular, cond_fn computes grad(log(p(y|x))), and we want to + condition on y. + + This uses the conditioning strategy from Sohl-Dickstein et al. (2015). 
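+
+ Concretely, the returned mean is
+ p_mean_var["mean"] + p_mean_var["variance"] * cond_fn(x, t, p_mean_var),
+ i.e. the unconditional mean shifted along grad_x log p(y | x).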
+ """ + gradient = cond_fn(x, t, p_mean_var, **model_kwargs) + new_mean = (paddle.cast((p_mean_var["mean"]), 'float32') + p_mean_var["variance"] * paddle.cast( + (gradient), 'float32')) + return new_mean + + def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None): + """ + Compute what the p_mean_variance output would have been, should the + model's score function be conditioned by cond_fn. + + See condition_mean() for details on cond_fn. + + Unlike condition_mean(), this instead uses the conditioning strategy + from Song et al (2020). + """ + alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape) + + eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"]) + eps = eps - (1 - alpha_bar).sqrt() * cond_fn(x, self._scale_timesteps(t), **model_kwargs) + + out = p_mean_var.copy() + out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps) + out["mean"], _, _ = self.q_posterior_mean_variance(x_start=out["pred_xstart"], x_t=x, t=t) + return out + + def condition_score_with_grad(self, cond_fn, p_mean_var, x, t, model_kwargs=None): + """ + Compute what the p_mean_variance output would have been, should the + model's score function be conditioned by cond_fn. + + See condition_mean() for details on cond_fn. + + Unlike condition_mean(), this instead uses the conditioning strategy + from Song et al (2020). + """ + alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape) + + eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"]) + eps = eps - (1 - alpha_bar).sqrt() * cond_fn(x, t, p_mean_var, **model_kwargs) + + out = p_mean_var.copy() + out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps) + out["mean"], _, _ = self.q_posterior_mean_variance(x_start=out["pred_xstart"], x_t=x, t=t) + return out + + def p_sample( + self, + model, + x, + t, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + ): + """ + Sample x_{t-1} from the model at the given timestep. + + :param model: the model to sample from. + :param x: the current tensor at x_{t-1}. + :param t: the value of t, starting at 0 for the first diffusion step. + :param clip_denoised: if True, clip the x_start prediction to [-1, 1]. + :param denoised_fn: if not None, a function which applies to the + x_start prediction before it is used to sample. + :param cond_fn: if not None, this is a gradient function that acts + similarly to the model. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + :return: a dict containing the following keys: + - 'sample': a random sample from the model. + - 'pred_xstart': a prediction of x_0. + """ + out = self.p_mean_variance( + model, + x, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + ) + # noise = th.randn_like(x) + noise = paddle.randn(x.shape, x.dtype) + nonzero_mask = (paddle.cast((t != 0), 'float32').reshape([-1, + *([1] * (len(x.shape) - 1))])) # no noise when t == 0 + if cond_fn is not None: + out["mean"] = self.condition_mean(cond_fn, out, x, t, model_kwargs=model_kwargs) + sample = out["mean"] + nonzero_mask * paddle.exp(0.5 * out["log_variance"]) * noise + return {"sample": sample, "pred_xstart": out["pred_xstart"]} + + def p_sample_with_grad( + self, + model, + x, + t, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + ): + """ + Sample x_{t-1} from the model at the given timestep. + + :param model: the model to sample from. 
+ :param x: the current tensor at x_{t-1}. + :param t: the value of t, starting at 0 for the first diffusion step. + :param clip_denoised: if True, clip the x_start prediction to [-1, 1]. + :param denoised_fn: if not None, a function which applies to the + x_start prediction before it is used to sample. + :param cond_fn: if not None, this is a gradient function that acts + similarly to the model. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + :return: a dict containing the following keys: + - 'sample': a random sample from the model. + - 'pred_xstart': a prediction of x_0. + """ + # with th.enable_grad(): + # x = x.detach().requires_grad_() + x = x.detach() + # x.stop_gradient = False + out = self.p_mean_variance( + model, + x, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + ) + # noise = th.randn_like(x) + noise = paddle.randn(x.shape, x.dtype) + nonzero_mask = (paddle.cast((t != 0), 'float32').reshape([-1, + *([1] * (len(x.shape) - 1))])) # no noise when t == 0 + if cond_fn is not None: + out["mean"] = self.condition_mean_with_grad(cond_fn, out, x, t, model_kwargs=model_kwargs) + sample = out["mean"] + nonzero_mask * paddle.exp(0.5 * out["log_variance"]) * noise + return {"sample": sample, "pred_xstart": out["pred_xstart"].detach()} + + def p_sample_loop( + self, + model, + shape, + noise=None, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + device=None, + progress=False, + skip_timesteps=0, + init_image=None, + randomize_class=False, + cond_fn_with_grad=False, + ): + """ + Generate samples from the model. + + :param model: the model module. + :param shape: the shape of the samples, (N, C, H, W). + :param noise: if specified, the noise from the encoder to sample. + Should be of the same shape as `shape`. + :param clip_denoised: if True, clip x_start predictions to [-1, 1]. + :param denoised_fn: if not None, a function which applies to the + x_start prediction before it is used to sample. + :param cond_fn: if not None, this is a gradient function that acts + similarly to the model. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + :param device: if specified, the device to create the samples on. + If not specified, use a model parameter's device. + :param progress: if True, show a tqdm progress bar. + :return: a non-differentiable batch of samples. + """ + final = None + for sample in self.p_sample_loop_progressive( + model, + shape, + noise=noise, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + cond_fn=cond_fn, + model_kwargs=model_kwargs, + device=device, + progress=progress, + skip_timesteps=skip_timesteps, + init_image=init_image, + randomize_class=randomize_class, + cond_fn_with_grad=cond_fn_with_grad, + ): + final = sample + return final["sample"] + + def p_sample_loop_progressive( + self, + model, + shape, + noise=None, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + device=None, + progress=False, + skip_timesteps=0, + init_image=None, + randomize_class=False, + cond_fn_with_grad=False, + ): + """ + Generate samples from the model and yield intermediate samples from + each timestep of diffusion. + + Arguments are the same as p_sample_loop(). + Returns a generator over dicts, where each dict is the return value of + p_sample(). 
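+
+ Illustrative usage (model, shape and kwargs are placeholders):
+
+ for out in diffusion.p_sample_loop_progressive(model, (1, 3, 256, 256)):
+ img = out["sample"]  # latest intermediate sample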
+ """ + if device is None: + device = model.parameters()[0].place + assert isinstance(shape, (tuple, list)) + if noise is not None: + img = noise + else: + img = paddle.randn(shape) + + if skip_timesteps and init_image is None: + init_image = paddle.zeros_like(img) + + indices = list(range(self.num_timesteps - skip_timesteps))[::-1] + + if init_image is not None: + my_t = paddle.ones([shape[0]], dtype='int64') * indices[0] + img = self.q_sample(init_image, my_t, img) + + if progress: + # Lazy import so that we don't depend on tqdm. + from tqdm.auto import tqdm + + indices = tqdm(indices) + + for i in indices: + t = paddle.to_tensor([i] * shape[0], place=device) + if randomize_class and 'y' in model_kwargs: + model_kwargs['y'] = paddle.randint(low=0, high=model.num_classes, shape=model_kwargs['y'].shape) + # with paddle.no_grad(): + sample_fn = self.p_sample_with_grad if cond_fn_with_grad else self.p_sample + out = sample_fn( + model, + img, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + cond_fn=cond_fn, + model_kwargs=model_kwargs, + ) + yield out + img = out["sample"] + + def ddim_sample( + self, + model, + x, + t, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + eta=0.0, + ): + """ + Sample x_{t-1} from the model using DDIM. + + Same usage as p_sample(). + """ + out_orig = self.p_mean_variance( + model, + x, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + ) + if cond_fn is not None: + out = self.condition_score(cond_fn, out_orig, x, t, model_kwargs=model_kwargs) + else: + out = out_orig + + # Usually our model outputs epsilon, but we re-derive it + # in case we used x_start or x_prev prediction. + eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"]) + + alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape) + alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape) + sigma = (eta * paddle.sqrt( + (1 - alpha_bar_prev) / (1 - alpha_bar)) * paddle.sqrt(1 - alpha_bar / alpha_bar_prev)) + # Equation 12. + # noise = th.randn_like(x) + noise = paddle.randn(x.shape, x.dtype) + mean_pred = (out["pred_xstart"] * paddle.sqrt(alpha_bar_prev) + + paddle.sqrt(1 - alpha_bar_prev - sigma**2) * eps) + nonzero_mask = (paddle.cast((t != 0), 'float32').reshape([-1, + *([1] * (len(x.shape) - 1))])) # no noise when t == 0 + sample = mean_pred + nonzero_mask * sigma * noise + return {"sample": sample, "pred_xstart": out_orig["pred_xstart"]} + + def ddim_sample_with_grad( + self, + model, + x, + t, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + eta=0.0, + ): + """ + Sample x_{t-1} from the model using DDIM. + + Same usage as p_sample(). + """ + # with th.enable_grad(): + # x = x.detach().requires_grad_() + x = x.detach() + # x.stop_gradient = False + out_orig = self.p_mean_variance( + model, + x, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + ) + if cond_fn is not None: + out = self.condition_score_with_grad(cond_fn, out_orig, x, t, model_kwargs=model_kwargs) + else: + out = out_orig + + out["pred_xstart"] = out["pred_xstart"].detach() + + # Usually our model outputs epsilon, but we re-derive it + # in case we used x_start or x_prev prediction. 
+ eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"]) + + alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape) + alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape) + sigma = (eta * paddle.sqrt( + (1 - alpha_bar_prev) / (1 - alpha_bar)) * paddle.sqrt(1 - alpha_bar / alpha_bar_prev)) + # Equation 12. + # noise = th.randn_like(x) + noise = paddle.randn(x.shape, x.dtype) + mean_pred = (out["pred_xstart"] * paddle.sqrt(alpha_bar_prev) + + paddle.sqrt(1 - alpha_bar_prev - sigma**2) * eps) + nonzero_mask = (paddle.cast((t != 0), 'float32').reshape([-1, + *([1] * (len(x.shape) - 1))])) # no noise when t == 0 + sample = mean_pred + nonzero_mask * sigma * noise + return {"sample": sample, "pred_xstart": out_orig["pred_xstart"].detach()} + + def ddim_reverse_sample( + self, + model, + x, + t, + clip_denoised=True, + denoised_fn=None, + model_kwargs=None, + eta=0.0, + ): + """ + Sample x_{t+1} from the model using DDIM reverse ODE. + """ + assert eta == 0.0, "Reverse ODE only for deterministic path" + out = self.p_mean_variance( + model, + x, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + ) + # Usually our model outputs epsilon, but we re-derive it + # in case we used x_start or x_prev prediction. + eps = (_extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x - + out["pred_xstart"]) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape) + alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape) + + # Equation 12. reversed + mean_pred = (out["pred_xstart"] * paddle.sqrt(alpha_bar_next) + paddle.sqrt(1 - alpha_bar_next) * eps) + + return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]} + + def ddim_sample_loop( + self, + model, + shape, + noise=None, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + device=None, + progress=False, + eta=0.0, + skip_timesteps=0, + init_image=None, + randomize_class=False, + cond_fn_with_grad=False, + ): + """ + Generate samples from the model using DDIM. + + Same usage as p_sample_loop(). + """ + final = None + for sample in self.ddim_sample_loop_progressive( + model, + shape, + noise=noise, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + cond_fn=cond_fn, + model_kwargs=model_kwargs, + device=device, + progress=progress, + eta=eta, + skip_timesteps=skip_timesteps, + init_image=init_image, + randomize_class=randomize_class, + cond_fn_with_grad=cond_fn_with_grad, + ): + final = sample + return final["sample"] + + def ddim_sample_loop_progressive( + self, + model, + shape, + noise=None, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + device=None, + progress=False, + eta=0.0, + skip_timesteps=0, + init_image=None, + randomize_class=False, + cond_fn_with_grad=False, + ): + """ + Use DDIM to sample from the model and yield intermediate samples from + each timestep of DDIM. + + Same usage as p_sample_loop_progressive(). 
+ """ + # if device is None: + # device = next(model.parameters()).device + if device is None: + device = model.parameters()[0].place + assert isinstance(shape, (tuple, list)) + if noise is not None: + img = noise + else: + img = paddle.randn(shape) + + if skip_timesteps and init_image is None: + init_image = paddle.zeros_like(img) + + indices = list(range(self.num_timesteps - skip_timesteps))[::-1] + + if init_image is not None: + my_t = paddle.ones([shape[0]], dtype='int64') * indices[0] + img = self.q_sample(init_image, my_t, img) + + if progress: + # Lazy import so that we don't depend on tqdm. + from tqdm.auto import tqdm + + indices = tqdm(indices) + + for i in indices: + t = paddle.to_tensor([i] * shape[0]) + if randomize_class and 'y' in model_kwargs: + model_kwargs['y'] = paddle.randint( + low=0, + high=model.num_classes, + shape=model_kwargs['y'].shape, + ) + sample_fn = self.ddim_sample_with_grad if cond_fn_with_grad else self.ddim_sample + out = sample_fn( + model, + img, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + cond_fn=cond_fn, + model_kwargs=model_kwargs, + eta=eta, + ) + yield out + img = out["sample"] + + def plms_sample( + self, + model, + x, + t, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + cond_fn_with_grad=False, + order=2, + old_out=None, + ): + """ + Sample x_{t-1} from the model using Pseudo Linear Multistep. + + Same usage as p_sample(). + """ + if not int(order) or not 1 <= order <= 4: + raise ValueError('order is invalid (should be int from 1-4).') + + def get_model_output(x, t): + with paddle.set_grad_enabled(cond_fn_with_grad and cond_fn is not None): + x = x.detach().requires_grad_() if cond_fn_with_grad else x + out_orig = self.p_mean_variance( + model, + x, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + ) + if cond_fn is not None: + if cond_fn_with_grad: + out = self.condition_score_with_grad(cond_fn, out_orig, x, t, model_kwargs=model_kwargs) + x = x.detach() + else: + out = self.condition_score(cond_fn, out_orig, x, t, model_kwargs=model_kwargs) + else: + out = out_orig + + # Usually our model outputs epsilon, but we re-derive it + # in case we used x_start or x_prev prediction. 
+ eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"]) + return eps, out, out_orig + + alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape) + alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape) + eps, out, out_orig = get_model_output(x, t) + + if order > 1 and old_out is None: + # Pseudo Improved Euler + old_eps = [eps] + mean_pred = out["pred_xstart"] * paddle.sqrt(alpha_bar_prev) + paddle.sqrt(1 - alpha_bar_prev) * eps + eps_2, _, _ = get_model_output(mean_pred, t - 1) + eps_prime = (eps + eps_2) / 2 + pred_prime = self._predict_xstart_from_eps(x, t, eps_prime) + mean_pred = pred_prime * paddle.sqrt(alpha_bar_prev) + paddle.sqrt(1 - alpha_bar_prev) * eps_prime + else: + # Pseudo Linear Multistep (Adams-Bashforth) + old_eps = old_out["old_eps"] + old_eps.append(eps) + cur_order = min(order, len(old_eps)) + if cur_order == 1: + eps_prime = old_eps[-1] + elif cur_order == 2: + eps_prime = (3 * old_eps[-1] - old_eps[-2]) / 2 + elif cur_order == 3: + eps_prime = (23 * old_eps[-1] - 16 * old_eps[-2] + 5 * old_eps[-3]) / 12 + elif cur_order == 4: + eps_prime = (55 * old_eps[-1] - 59 * old_eps[-2] + 37 * old_eps[-3] - 9 * old_eps[-4]) / 24 + else: + raise RuntimeError('cur_order is invalid.') + pred_prime = self._predict_xstart_from_eps(x, t, eps_prime) + mean_pred = pred_prime * paddle.sqrt(alpha_bar_prev) + paddle.sqrt(1 - alpha_bar_prev) * eps_prime + + if len(old_eps) >= order: + old_eps.pop(0) + + nonzero_mask = paddle.cast((t != 0), 'float32').reshape([-1, *([1] * (len(x.shape) - 1))]) + sample = mean_pred * nonzero_mask + out["pred_xstart"] * (1 - nonzero_mask) + + return {"sample": sample, "pred_xstart": out_orig["pred_xstart"], "old_eps": old_eps} + + def plms_sample_loop( + self, + model, + shape, + noise=None, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + device=None, + progress=False, + skip_timesteps=0, + init_image=None, + randomize_class=False, + cond_fn_with_grad=False, + order=2, + ): + """ + Generate samples from the model using Pseudo Linear Multistep. + + Same usage as p_sample_loop(). + """ + final = None + for sample in self.plms_sample_loop_progressive( + model, + shape, + noise=noise, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + cond_fn=cond_fn, + model_kwargs=model_kwargs, + device=device, + progress=progress, + skip_timesteps=skip_timesteps, + init_image=init_image, + randomize_class=randomize_class, + cond_fn_with_grad=cond_fn_with_grad, + order=order, + ): + final = sample + return final["sample"] + + def plms_sample_loop_progressive( + self, + model, + shape, + noise=None, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + device=None, + progress=False, + skip_timesteps=0, + init_image=None, + randomize_class=False, + cond_fn_with_grad=False, + order=2, + ): + """ + Use PLMS to sample from the model and yield intermediate samples from each + timestep of PLMS. + + Same usage as p_sample_loop_progressive(). + """ + if device is None: + device = model.parameters()[0].place + assert isinstance(shape, (tuple, list)) + if noise is not None: + img = noise + else: + img = paddle.randn(shape) + + if skip_timesteps and init_image is None: + init_image = paddle.zeros_like(img) + + indices = list(range(self.num_timesteps - skip_timesteps))[::-1] + + if init_image is not None: + my_t = paddle.ones([shape[0]], dtype='int64') * indices[0] + img = self.q_sample(init_image, my_t, img) + + if progress: + # Lazy import so that we don't depend on tqdm. 
+ from tqdm.auto import tqdm + + indices = tqdm(indices) + + old_out = None + + for i in indices: + t = paddle.to_tensor([i] * shape[0], place=device) + if randomize_class and 'y' in model_kwargs: + model_kwargs['y'] = paddle.randint(low=0, high=model.num_classes, shape=model_kwargs['y'].shape) + # with paddle.no_grad(): + out = self.plms_sample( + model, + img, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + cond_fn=cond_fn, + model_kwargs=model_kwargs, + cond_fn_with_grad=cond_fn_with_grad, + order=order, + old_out=old_out, + ) + yield out + old_out = out + img = out["sample"] + + def _vb_terms_bpd(self, model, x_start, x_t, t, clip_denoised=True, model_kwargs=None): + """ + Get a term for the variational lower-bound. + + The resulting units are bits (rather than nats, as one might expect). + This allows for comparison to other papers. + + :return: a dict with the following keys: + - 'output': a shape [N] tensor of NLLs or KLs. + - 'pred_xstart': the x_0 predictions. + """ + true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance(x_start=x_start, x_t=x_t, t=t) + out = self.p_mean_variance(model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs) + kl = normal_kl(true_mean, true_log_variance_clipped, out["mean"], out["log_variance"]) + kl = mean_flat(kl) / np.log(2.0) + + decoder_nll = -discretized_gaussian_log_likelihood( + x_start, means=out["mean"], log_scales=0.5 * out["log_variance"]) + assert decoder_nll.shape == x_start.shape + decoder_nll = mean_flat(decoder_nll) / np.log(2.0) + + # At the first timestep return the decoder NLL, + # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t)) + output = paddle.where((t == 0), decoder_nll, kl) + return {"output": output, "pred_xstart": out["pred_xstart"]} + + def training_losses(self, model, x_start, t, model_kwargs=None, noise=None): + """ + Compute training losses for a single timestep. + + :param model: the model to evaluate loss on. + :param x_start: the [N x C x ...] tensor of inputs. + :param t: a batch of timestep indices. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + :param noise: if specified, the specific Gaussian noise to try to remove. + :return: a dict with the key "loss" containing a tensor of shape [N]. + Some mean or variance settings may also have other keys. + """ + if model_kwargs is None: + model_kwargs = {} + if noise is None: + # noise = th.randn_like(x_start) + noise = paddle.randn(x_start.shape, x_start.dtype) + x_t = self.q_sample(x_start, t, noise=noise) + + terms = {} + + if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL: + terms["loss"] = self._vb_terms_bpd( + model=model, + x_start=x_start, + x_t=x_t, + t=t, + clip_denoised=False, + model_kwargs=model_kwargs, + )["output"] + if self.loss_type == LossType.RESCALED_KL: + terms["loss"] *= self.num_timesteps + elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE: + model_output = model(x_t, self._scale_timesteps(t), **model_kwargs) + + if self.model_var_type in [ + ModelVarType.LEARNED, + ModelVarType.LEARNED_RANGE, + ]: + B, C = x_t.shape[:2] + assert model_output.shape == (B, C * 2, *x_t.shape[2:]) + model_output, model_var_values = paddle.split(model_output, 2, dim=1) + # Learn the variance using the variational bound, but don't let + # it affect our mean prediction. 
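+ # model_output is detached inside frozen_out, so the VB term below only
+ # trains the learned-variance channels; the mean prediction is trained
+ # solely by the MSE term further down.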
+ frozen_out = paddle.concat([model_output.detach(), model_var_values], axis=1) + terms["vb"] = self._vb_terms_bpd( + model=lambda *args, r=frozen_out: r, + x_start=x_start, + x_t=x_t, + t=t, + clip_denoised=False, + )["output"] + if self.loss_type == LossType.RESCALED_MSE: + # Divide by 1000 for equivalence with initial implementation. + # Without a factor of 1/1000, the VB term hurts the MSE term. + terms["vb"] *= self.num_timesteps / 1000.0 + + target = { + ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance(x_start=x_start, x_t=x_t, t=t)[0], + ModelMeanType.START_X: x_start, + ModelMeanType.EPSILON: noise, + }[self.model_mean_type] + assert model_output.shape == target.shape == x_start.shape + terms["mse"] = mean_flat((target - model_output)**2) + if "vb" in terms: + terms["loss"] = terms["mse"] + terms["vb"] + else: + terms["loss"] = terms["mse"] + else: + raise NotImplementedError(self.loss_type) + + return terms + + def _prior_bpd(self, x_start): + """ + Get the prior KL term for the variational lower-bound, measured in + bits-per-dim. + + This term can't be optimized, as it only depends on the encoder. + + :param x_start: the [N x C x ...] tensor of inputs. + :return: a batch of [N] KL values (in bits), one per batch element. + """ + batch_size = x_start.shape[0] + t = paddle.to_tensor([self.num_timesteps - 1] * batch_size, place=x_start.place) + qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t) + kl_prior = normal_kl(mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0) + return mean_flat(kl_prior) / np.log(2.0) + + def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None): + """ + Compute the entire variational lower-bound, measured in bits-per-dim, + as well as other related quantities. + + :param model: the model to evaluate loss on. + :param x_start: the [N x C x ...] tensor of inputs. + :param clip_denoised: if True, clip denoised samples. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + + :return: a dict containing the following keys: + - total_bpd: the total variational lower-bound, per batch element. + - prior_bpd: the prior term in the lower-bound. + - vb: an [N x T] tensor of terms in the lower-bound. + - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep. + - mse: an [N x T] tensor of epsilon MSEs for each timestep. 
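+
+ Note: total_bpd is prior_bpd plus the sum over timesteps of the vb terms.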
+ """ + device = x_start.place + batch_size = x_start.shape[0] + + vb = [] + xstart_mse = [] + mse = [] + for t in list(range(self.num_timesteps))[::-1]: + t_batch = paddle.to_tensor([t] * batch_size, place=device) + # noise = th.randn_like(x_start) + noise = paddle.randn(x_start.shape, x_start.dtype) + x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise) + # Calculate VLB term at the current timestep + # with paddle.no_grad(): + out = self._vb_terms_bpd( + model, + x_start=x_start, + x_t=x_t, + t=t_batch, + clip_denoised=clip_denoised, + model_kwargs=model_kwargs, + ) + vb.append(out["output"]) + xstart_mse.append(mean_flat((out["pred_xstart"] - x_start)**2)) + eps = self._predict_eps_from_xstart(x_t, t_batch, out["pred_xstart"]) + mse.append(mean_flat((eps - noise)**2)) + + vb = paddle.stack(vb, axis=1) + xstart_mse = paddle.stack(xstart_mse, axis=1) + mse = paddle.stack(mse, axis=1) + + prior_bpd = self._prior_bpd(x_start) + total_bpd = vb.sum(axis=1) + prior_bpd + return { + "total_bpd": total_bpd, + "prior_bpd": prior_bpd, + "vb": vb, + "xstart_mse": xstart_mse, + "mse": mse, + } + + +def _extract_into_tensor(arr, timesteps, broadcast_shape): + """ + Extract values from a 1-D numpy array for a batch of indices. + + :param arr: the 1-D numpy array. + :param timesteps: a tensor of indices into the array to extract. + :param broadcast_shape: a larger shape of K dimensions with the batch + dimension equal to the length of timesteps. + :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims. + """ + res = paddle.to_tensor(arr, place=timesteps.place)[timesteps] + while len(res.shape) < len(broadcast_shape): + res = res[..., None] + return res.expand(broadcast_shape) diff --git a/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/losses.py b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/losses.py new file mode 100755 index 00000000..5c3970de --- /dev/null +++ b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/losses.py @@ -0,0 +1,86 @@ +""" +Helpers for various likelihood-based losses implemented by Paddle. These are ported from the original +Ho et al. diffusion models codebase: +https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/utils.py +""" +import numpy as np +import paddle +import paddle.nn.functional as F + + +def normal_kl(mean1, logvar1, mean2, logvar2): + """ + Compute the KL divergence between two gaussians. + + Shapes are automatically broadcasted, so batches can be compared to + scalars, among other use cases. + """ + tensor = None + for obj in (mean1, logvar1, mean2, logvar2): + if isinstance(obj, paddle.Tensor): + tensor = obj + break + assert tensor is not None, "at least one argument must be a Tensor" + + # Force variances to be Tensors. Broadcasting helps convert scalars to + # Tensors, but it does not work for th.exp(). + logvar1, logvar2 = [x if isinstance(x, paddle.Tensor) else paddle.to_tensor(x) for x in (logvar1, logvar2)] + + return 0.5 * (-1.0 + logvar2 - logvar1 + paddle.exp(logvar1 - logvar2) + + ((mean1 - mean2)**2) * paddle.exp(-logvar2)) + + +def approx_standard_normal_cdf(x): + """ + A fast approximation of the cumulative distribution function of the + standard normal. 
+ """ + return 0.5 * (1.0 + paddle.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * paddle.pow(x, 3)))) + + +def discretized_gaussian_log_likelihood(x, *, means, log_scales): + """ + Compute the log-likelihood of a Gaussian distribution discretizing to a + given image. + + :param x: the target images. It is assumed that this was uint8 values, + rescaled to the range [-1, 1]. + :param means: the Gaussian mean Tensor. + :param log_scales: the Gaussian log stddev Tensor. + :return: a tensor like x of log probabilities (in nats). + """ + assert x.shape == means.shape == log_scales.shape + centered_x = x - means + inv_stdv = paddle.exp(-log_scales) + plus_in = inv_stdv * (centered_x + 1.0 / 255.0) + cdf_plus = approx_standard_normal_cdf(plus_in) + min_in = inv_stdv * (centered_x - 1.0 / 255.0) + cdf_min = approx_standard_normal_cdf(min_in) + log_cdf_plus = paddle.log(cdf_plus.clip(min=1e-12)) + log_one_minus_cdf_min = paddle.log((1.0 - cdf_min).clip(min=1e-12)) + cdf_delta = cdf_plus - cdf_min + log_probs = paddle.where( + x < -0.999, + log_cdf_plus, + paddle.where(x > 0.999, log_one_minus_cdf_min, paddle.log(cdf_delta.clip(min=1e-12))), + ) + assert log_probs.shape == x.shape + return log_probs + + +def spherical_dist_loss(x, y): + x = F.normalize(x, axis=-1) + y = F.normalize(y, axis=-1) + return (x - y).norm(axis=-1).divide(paddle.to_tensor(2.0)).asin().pow(2).multiply(paddle.to_tensor(2.0)) + + +def tv_loss(input): + """L2 total variation loss, as in Mahendran et al.""" + input = F.pad(input, (0, 1, 0, 1), 'replicate') + x_diff = input[..., :-1, 1:] - input[..., :-1, :-1] + y_diff = input[..., 1:, :-1] - input[..., :-1, :-1] + return (x_diff**2 + y_diff**2).mean([1, 2, 3]) + + +def range_loss(input): + return (input - input.clip(-1, 1)).pow(2).mean([1, 2, 3]) diff --git a/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/make_cutouts.py b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/make_cutouts.py new file mode 100755 index 00000000..cd46e4bd --- /dev/null +++ b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/make_cutouts.py @@ -0,0 +1,177 @@ +''' +This code is rewritten by Paddle based on Jina-ai/discoart. +https://github.com/jina-ai/discoart/blob/main/discoart/nn/make_cutouts.py +''' +import math + +import paddle +import paddle.nn as nn +from disco_diffusion_clip_vitb32.resize_right.resize_right import resize +from paddle.nn import functional as F + +from . 
import transforms as T + +skip_augs = False # @param{type: 'boolean'} + + +def sinc(x): + return paddle.where(x != 0, paddle.sin(math.pi * x) / (math.pi * x), x.new_ones([])) + + +def lanczos(x, a): + cond = paddle.logical_and(-a < x, x < a) + out = paddle.where(cond, sinc(x) * sinc(x / a), x.new_zeros([])) + return out / out.sum() + + +def ramp(ratio, width): + n = math.ceil(width / ratio + 1) + out = paddle.empty([n]) + cur = 0 + for i in range(out.shape[0]): + out[i] = cur + cur += ratio + return paddle.concat([-out[1:].flip([0]), out])[1:-1] + + +class MakeCutouts(nn.Layer): + + def __init__(self, cut_size, cutn, skip_augs=False): + super().__init__() + self.cut_size = cut_size + self.cutn = cutn + self.skip_augs = skip_augs + self.augs = nn.Sequential(*[ + T.RandomHorizontalFlip(prob=0.5), + T.Lambda(lambda x: x + paddle.randn(x.shape) * 0.01), + T.RandomAffine(degrees=15, translate=(0.1, 0.1)), + T.Lambda(lambda x: x + paddle.randn(x.shape) * 0.01), + T.RandomPerspective(distortion_scale=0.4, p=0.7), + T.Lambda(lambda x: x + paddle.randn(x.shape) * 0.01), + T.RandomGrayscale(p=0.15), + T.Lambda(lambda x: x + paddle.randn(x.shape) * 0.01), + T.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1), + ]) + + def forward(self, input): + input = T.Pad(input.shape[2] // 4, fill=0)(input) + sideY, sideX = input.shape[2:4] + max_size = min(sideX, sideY) + + cutouts = [] + for ch in range(self.cutn): + if ch > self.cutn - self.cutn // 4: + cutout = input.clone() + else: + size = int(max_size * + paddle.zeros(1, ).normal_(mean=0.8, std=0.3).clip(float(self.cut_size / max_size), 1.0)) + offsetx = paddle.randint(0, abs(sideX - size + 1), ()) + offsety = paddle.randint(0, abs(sideY - size + 1), ()) + cutout = input[:, :, offsety:offsety + size, offsetx:offsetx + size] + + if not self.skip_augs: + cutout = self.augs(cutout) + cutouts.append(resample(cutout, (self.cut_size, self.cut_size))) + del cutout + + cutouts = paddle.concat(cutouts, axis=0) + return cutouts + + +class MakeCutoutsDango(nn.Layer): + + def __init__(self, cut_size, Overview=4, InnerCrop=0, IC_Size_Pow=0.5, IC_Grey_P=0.2): + super().__init__() + self.cut_size = cut_size + self.Overview = Overview + self.InnerCrop = InnerCrop + self.IC_Size_Pow = IC_Size_Pow + self.IC_Grey_P = IC_Grey_P + self.augs = nn.Sequential(*[ + T.RandomHorizontalFlip(prob=0.5), + T.Lambda(lambda x: x + paddle.randn(x.shape) * 0.01), + T.RandomAffine( + degrees=10, + translate=(0.05, 0.05), + interpolation=T.InterpolationMode.BILINEAR, + ), + T.Lambda(lambda x: x + paddle.randn(x.shape) * 0.01), + T.RandomGrayscale(p=0.1), + T.Lambda(lambda x: x + paddle.randn(x.shape) * 0.01), + T.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1), + ]) + + def forward(self, input): + cutouts = [] + gray = T.Grayscale(3) + sideY, sideX = input.shape[2:4] + max_size = min(sideX, sideY) + min_size = min(sideX, sideY, self.cut_size) + output_shape = [1, 3, self.cut_size, self.cut_size] + pad_input = F.pad( + input, + ( + (sideY - max_size) // 2, + (sideY - max_size) // 2, + (sideX - max_size) // 2, + (sideX - max_size) // 2, + ), + **padargs, + ) + cutout = resize(pad_input, out_shape=output_shape) + + if self.Overview > 0: + if self.Overview <= 4: + if self.Overview >= 1: + cutouts.append(cutout) + if self.Overview >= 2: + cutouts.append(gray(cutout)) + if self.Overview >= 3: + cutouts.append(cutout[:, :, :, ::-1]) + if self.Overview == 4: + cutouts.append(gray(cutout[:, :, :, ::-1])) + else: + cutout = resize(pad_input, out_shape=output_shape) + 
for _ in range(self.Overview): + cutouts.append(cutout) + + if self.InnerCrop > 0: + for i in range(self.InnerCrop): + size = int(paddle.rand([1])**self.IC_Size_Pow * (max_size - min_size) + min_size) + offsetx = paddle.randint(0, sideX - size + 1) + offsety = paddle.randint(0, sideY - size + 1) + cutout = input[:, :, offsety:offsety + size, offsetx:offsetx + size] + if i <= int(self.IC_Grey_P * self.InnerCrop): + cutout = gray(cutout) + cutout = resize(cutout, out_shape=output_shape) + cutouts.append(cutout) + + cutouts = paddle.concat(cutouts) + if skip_augs is not True: + cutouts = self.augs(cutouts) + return cutouts + + +def resample(input, size, align_corners=True): + n, c, h, w = input.shape + dh, dw = size + + input = input.reshape([n * c, 1, h, w]) + + if dh < h: + kernel_h = lanczos(ramp(dh / h, 2), 2).to(input.device, input.dtype) + pad_h = (kernel_h.shape[0] - 1) // 2 + input = F.pad(input, (0, 0, pad_h, pad_h), 'reflect') + input = F.conv2d(input, kernel_h[None, None, :, None]) + + if dw < w: + kernel_w = lanczos(ramp(dw / w, 2), 2).to(input.device, input.dtype) + pad_w = (kernel_w.shape[0] - 1) // 2 + input = F.pad(input, (pad_w, pad_w, 0, 0), 'reflect') + input = F.conv2d(input, kernel_w[None, None, None, :]) + + input = input.reshape([n, c, h, w]) + return F.interpolate(input, size, mode='bicubic', align_corners=align_corners) + + +padargs = {} diff --git a/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/nn.py b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/nn.py new file mode 100755 index 00000000..d618183e --- /dev/null +++ b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/nn.py @@ -0,0 +1,127 @@ +""" +Various utilities for neural networks implemented by Paddle. This code is rewritten based on: +https://github.com/openai/guided-diffusion/blob/main/guided_diffusion/nn.py +""" +import math + +import paddle +import paddle.nn as nn + + +class SiLU(nn.Layer): + + def forward(self, x): + return x * nn.functional.sigmoid(x) + + +class GroupNorm32(nn.GroupNorm): + + def forward(self, x): + return super().forward(x) + + +def conv_nd(dims, *args, **kwargs): + """ + Create a 1D, 2D, or 3D convolution module. + """ + if dims == 1: + return nn.Conv1D(*args, **kwargs) + elif dims == 2: + return nn.Conv2D(*args, **kwargs) + elif dims == 3: + return nn.Conv3D(*args, **kwargs) + raise ValueError(f"unsupported dimensions: {dims}") + + +def linear(*args, **kwargs): + """ + Create a linear module. + """ + return nn.Linear(*args, **kwargs) + + +def avg_pool_nd(dims, *args, **kwargs): + """ + Create a 1D, 2D, or 3D average pooling module. + """ + if dims == 1: + return nn.AvgPool1D(*args, **kwargs) + elif dims == 2: + return nn.AvgPool2D(*args, **kwargs) + elif dims == 3: + return nn.AvgPool3D(*args, **kwargs) + raise ValueError(f"unsupported dimensions: {dims}") + + +def update_ema(target_params, source_params, rate=0.99): + """ + Update target parameters to be closer to those of source parameters using + an exponential moving average. + + :param target_params: the target parameter sequence. + :param source_params: the source parameter sequence. + :param rate: the EMA rate (closer to 1 means slower). + """ + for targ, src in zip(target_params, source_params): + targ.detach().mul_(rate).add_(src, alpha=1 - rate) + + +def zero_module(module): + """ + Zero out the parameters of a module and return it. 
+ """ + for p in module.parameters(): + p.detach().zero_() + return module + + +def scale_module(module, scale): + """ + Scale the parameters of a module and return it. + """ + for p in module.parameters(): + p.detach().mul_(scale) + return module + + +def mean_flat(tensor): + """ + Take the mean over all non-batch dimensions. + """ + return tensor.mean(axis=list(range(1, len(tensor.shape)))) + + +def normalization(channels): + """ + Make a standard normalization layer. + + :param channels: number of input channels. + :return: an nn.Module for normalization. + """ + return GroupNorm32(32, channels) + + +def timestep_embedding(timesteps, dim, max_period=10000): + """ + Create sinusoidal timestep embeddings. + + :param timesteps: a 1-D Tensor of N indices, one per batch element. + These may be fractional. + :param dim: the dimension of the output. + :param max_period: controls the minimum frequency of the embeddings. + :return: an [N x dim] Tensor of positional embeddings. + """ + half = dim // 2 + freqs = paddle.exp(-math.log(max_period) * paddle.arange(start=0, end=half, dtype=paddle.float32) / half) + args = paddle.cast(timesteps[:, None], 'float32') * freqs[None] + embedding = paddle.concat([paddle.cos(args), paddle.sin(args)], axis=-1) + if dim % 2: + embedding = paddle.concat([embedding, paddle.zeros_like(embedding[:, :1])], axis=-1) + return embedding + + +def checkpoint(func, inputs, params, flag): + """ + This function is disabled. And now just forward. + """ + return func(*inputs) diff --git a/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/perlin_noises.py b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/perlin_noises.py new file mode 100755 index 00000000..6dacb331 --- /dev/null +++ b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/perlin_noises.py @@ -0,0 +1,78 @@ +''' +Perlin noise implementation by Paddle. 
+This code is rewritten based on: +https://github.com/jina-ai/discoart/blob/main/discoart/nn/perlin_noises.py +''' +import numpy as np +import paddle +import paddle.vision.transforms as TF +from PIL import Image +from PIL import ImageOps + + +def interp(t): + return 3 * t**2 - 2 * t**3 + + +def perlin(width, height, scale=10): + gx, gy = paddle.randn([2, width + 1, height + 1, 1, 1]) + xs = paddle.linspace(0, 1, scale + 1)[:-1, None] + ys = paddle.linspace(0, 1, scale + 1)[None, :-1] + wx = 1 - interp(xs) + wy = 1 - interp(ys) + dots = 0 + dots += wx * wy * (gx[:-1, :-1] * xs + gy[:-1, :-1] * ys) + dots += (1 - wx) * wy * (-gx[1:, :-1] * (1 - xs) + gy[1:, :-1] * ys) + dots += wx * (1 - wy) * (gx[:-1, 1:] * xs - gy[:-1, 1:] * (1 - ys)) + dots += (1 - wx) * (1 - wy) * (-gx[1:, 1:] * (1 - xs) - gy[1:, 1:] * (1 - ys)) + return dots.transpose([0, 2, 1, 3]).reshape([width * scale, height * scale]) + + +def perlin_ms(octaves, width, height, grayscale): + out_array = [0.5] if grayscale else [0.5, 0.5, 0.5] + # out_array = [0.0] if grayscale else [0.0, 0.0, 0.0] + for i in range(1 if grayscale else 3): + scale = 2**len(octaves) + oct_width = width + oct_height = height + for oct in octaves: + p = perlin(oct_width, oct_height, scale) + out_array[i] += p * oct + scale //= 2 + oct_width *= 2 + oct_height *= 2 + return paddle.concat(out_array) + + +def create_perlin_noise(octaves, width, height, grayscale, side_y, side_x): + out = perlin_ms(octaves, width, height, grayscale) + if grayscale: + out = TF.resize(size=(side_y, side_x), img=out.numpy()) + out = np.uint8(out) + out = Image.fromarray(out).convert('RGB') + else: + out = out.reshape([-1, 3, out.shape[0] // 3, out.shape[1]]) + out = out.squeeze().transpose([1, 2, 0]).numpy() + out = TF.resize(size=(side_y, side_x), img=out) + out = out.clip(0, 1) * 255 + out = np.uint8(out) + out = Image.fromarray(out) + + out = ImageOps.autocontrast(out) + return out + + +def regen_perlin(perlin_mode, side_y, side_x, batch_size): + if perlin_mode == 'color': + init = create_perlin_noise([1.5**-i * 0.5 for i in range(12)], 1, 1, False, side_y, side_x) + init2 = create_perlin_noise([1.5**-i * 0.5 for i in range(8)], 4, 4, False, side_y, side_x) + elif perlin_mode == 'gray': + init = create_perlin_noise([1.5**-i * 0.5 for i in range(12)], 1, 1, True, side_y, side_x) + init2 = create_perlin_noise([1.5**-i * 0.5 for i in range(8)], 4, 4, True, side_y, side_x) + else: + init = create_perlin_noise([1.5**-i * 0.5 for i in range(12)], 1, 1, False, side_y, side_x) + init2 = create_perlin_noise([1.5**-i * 0.5 for i in range(8)], 4, 4, True, side_y, side_x) + + init = (TF.to_tensor(init).add(TF.to_tensor(init2)).divide(paddle.to_tensor(2.0)).unsqueeze(0) * 2 - 1) + del init2 + return init.expand([batch_size, -1, -1, -1]) diff --git a/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/respace.py b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/respace.py new file mode 100755 index 00000000..c001c70d --- /dev/null +++ b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/respace.py @@ -0,0 +1,123 @@ +''' +This code is rewritten by Paddle based on +https://github.com/openai/guided-diffusion/blob/main/guided_diffusion/respace.py +''' +import numpy as np +import paddle + +from .gaussian_diffusion import GaussianDiffusion + + +def space_timesteps(num_timesteps, section_counts): + """ + Create a list of timesteps to use from an original diffusion process, + given the number of timesteps we 
want to take from equally-sized portions + of the original process. + + For example, if there's 300 timesteps and the section counts are [10,15,20] + then the first 100 timesteps are strided to be 10 timesteps, the second 100 + are strided to be 15 timesteps, and the final 100 are strided to be 20. + + If the stride is a string starting with "ddim", then the fixed striding + from the DDIM paper is used, and only one section is allowed. + + :param num_timesteps: the number of diffusion steps in the original + process to divide up. + :param section_counts: either a list of numbers, or a string containing + comma-separated numbers, indicating the step count + per section. As a special case, use "ddimN" where N + is a number of steps to use the striding from the + DDIM paper. + :return: a set of diffusion steps from the original process to use. + """ + if isinstance(section_counts, str): + if section_counts.startswith("ddim"): + desired_count = int(section_counts[len("ddim"):]) + for i in range(1, num_timesteps): + if len(range(0, num_timesteps, i)) == desired_count: + return set(range(0, num_timesteps, i)) + raise ValueError(f"cannot create exactly {num_timesteps} steps with an integer stride") + section_counts = [int(x) for x in section_counts.split(",")] + size_per = num_timesteps // len(section_counts) + extra = num_timesteps % len(section_counts) + start_idx = 0 + all_steps = [] + for i, section_count in enumerate(section_counts): + size = size_per + (1 if i < extra else 0) + if size < section_count: + raise ValueError(f"cannot divide section of {size} steps into {section_count}") + if section_count <= 1: + frac_stride = 1 + else: + frac_stride = (size - 1) / (section_count - 1) + cur_idx = 0.0 + taken_steps = [] + for _ in range(section_count): + taken_steps.append(start_idx + round(cur_idx)) + cur_idx += frac_stride + all_steps += taken_steps + start_idx += size + return set(all_steps) + + +class SpacedDiffusion(GaussianDiffusion): + """ + A diffusion process which can skip steps in a base diffusion process. + + :param use_timesteps: a collection (sequence or set) of timesteps from the + original diffusion process to retain. + :param kwargs: the kwargs to create the base diffusion process. 
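+
+ Illustrative construction (remaining kwargs elided):
+
+ use_timesteps = space_timesteps(1000, "ddim50")
+ diffusion = SpacedDiffusion(use_timesteps=use_timesteps, betas=betas, ...)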
+ """ + + def __init__(self, use_timesteps, **kwargs): + self.use_timesteps = set(use_timesteps) + self.timestep_map = [] + self.original_num_steps = len(kwargs["betas"]) + + base_diffusion = GaussianDiffusion(**kwargs) # pylint: disable=missing-kwoa + last_alpha_cumprod = 1.0 + new_betas = [] + for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod): + if i in self.use_timesteps: + new_betas.append(1 - alpha_cumprod / last_alpha_cumprod) + last_alpha_cumprod = alpha_cumprod + self.timestep_map.append(i) + kwargs["betas"] = np.array(new_betas) + super().__init__(**kwargs) + + def p_mean_variance(self, model, *args, **kwargs): # pylint: disable=signature-differs + return super().p_mean_variance(self._wrap_model(model), *args, **kwargs) + + def training_losses(self, model, *args, **kwargs): # pylint: disable=signature-differs + return super().training_losses(self._wrap_model(model), *args, **kwargs) + + def condition_mean(self, cond_fn, *args, **kwargs): + return super().condition_mean(self._wrap_model(cond_fn), *args, **kwargs) + + def condition_score(self, cond_fn, *args, **kwargs): + return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs) + + def _wrap_model(self, model): + if isinstance(model, _WrappedModel): + return model + return _WrappedModel(model, self.timestep_map, self.rescale_timesteps, self.original_num_steps) + + def _scale_timesteps(self, t): + # Scaling is done by the wrapped model. + return t + + +class _WrappedModel: + + def __init__(self, model, timestep_map, rescale_timesteps, original_num_steps): + self.model = model + self.timestep_map = timestep_map + self.rescale_timesteps = rescale_timesteps + self.original_num_steps = original_num_steps + + def __call__(self, x, ts, **kwargs): + map_tensor = paddle.to_tensor(self.timestep_map, place=ts.place, dtype=ts.dtype) + new_ts = map_tensor[ts] + if self.rescale_timesteps: + new_ts = paddle.cast(new_ts, 'float32') * (1000.0 / self.original_num_steps) + return self.model(x, new_ts, **kwargs) diff --git a/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/script_util.py b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/script_util.py new file mode 100755 index 00000000..d728a543 --- /dev/null +++ b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/script_util.py @@ -0,0 +1,201 @@ +''' +This code is based on +https://github.com/openai/guided-diffusion/blob/main/guided_diffusion/script_util.py +''' +import argparse +import inspect + +from . import gaussian_diffusion as gd +from .respace import space_timesteps +from .respace import SpacedDiffusion +from .unet import EncoderUNetModel +from .unet import SuperResModel +from .unet import UNetModel + +NUM_CLASSES = 1000 + + +def diffusion_defaults(): + """ + Defaults for image and classifier training. + """ + return dict( + learn_sigma=False, + diffusion_steps=1000, + noise_schedule="linear", + timestep_respacing="", + use_kl=False, + predict_xstart=False, + rescale_timesteps=False, + rescale_learned_sigmas=False, + ) + + +def model_and_diffusion_defaults(): + """ + Defaults for image training. 
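+
+    Illustrative usage (a minimal sketch of how these defaults are consumed by
+    ``create_model_and_diffusion`` below):
+
+        options = model_and_diffusion_defaults()
+        options.update(image_size=256, timestep_respacing="ddim50")
+        model, diffusion = create_model_and_diffusion(**options)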
+ """ + res = dict( + image_size=64, + num_channels=128, + num_res_blocks=2, + num_heads=4, + num_heads_upsample=-1, + num_head_channels=-1, + attention_resolutions="16,8", + channel_mult="", + dropout=0.0, + class_cond=False, + use_checkpoint=False, + use_scale_shift_norm=True, + resblock_updown=False, + use_fp16=False, + use_new_attention_order=False, + ) + res.update(diffusion_defaults()) + return res + + +def create_model_and_diffusion( + image_size, + class_cond, + learn_sigma, + num_channels, + num_res_blocks, + channel_mult, + num_heads, + num_head_channels, + num_heads_upsample, + attention_resolutions, + dropout, + diffusion_steps, + noise_schedule, + timestep_respacing, + use_kl, + predict_xstart, + rescale_timesteps, + rescale_learned_sigmas, + use_checkpoint, + use_scale_shift_norm, + resblock_updown, + use_fp16, + use_new_attention_order, +): + model = create_model( + image_size, + num_channels, + num_res_blocks, + channel_mult=channel_mult, + learn_sigma=learn_sigma, + class_cond=class_cond, + use_checkpoint=use_checkpoint, + attention_resolutions=attention_resolutions, + num_heads=num_heads, + num_head_channels=num_head_channels, + num_heads_upsample=num_heads_upsample, + use_scale_shift_norm=use_scale_shift_norm, + dropout=dropout, + resblock_updown=resblock_updown, + use_fp16=use_fp16, + use_new_attention_order=use_new_attention_order, + ) + diffusion = create_gaussian_diffusion( + steps=diffusion_steps, + learn_sigma=learn_sigma, + noise_schedule=noise_schedule, + use_kl=use_kl, + predict_xstart=predict_xstart, + rescale_timesteps=rescale_timesteps, + rescale_learned_sigmas=rescale_learned_sigmas, + timestep_respacing=timestep_respacing, + ) + return model, diffusion + + +def create_model( + image_size, + num_channels, + num_res_blocks, + channel_mult="", + learn_sigma=False, + class_cond=False, + use_checkpoint=False, + attention_resolutions="16", + num_heads=1, + num_head_channels=-1, + num_heads_upsample=-1, + use_scale_shift_norm=False, + dropout=0, + resblock_updown=False, + use_fp16=False, + use_new_attention_order=False, +): + if channel_mult == "": + if image_size == 512: + channel_mult = (0.5, 1, 1, 2, 2, 4, 4) + elif image_size == 256: + channel_mult = (1, 1, 2, 2, 4, 4) + elif image_size == 128: + channel_mult = (1, 1, 2, 3, 4) + elif image_size == 64: + channel_mult = (1, 2, 3, 4) + else: + raise ValueError(f"unsupported image size: {image_size}") + else: + channel_mult = tuple(int(ch_mult) for ch_mult in channel_mult.split(",")) + + attention_ds = [] + for res in attention_resolutions.split(","): + attention_ds.append(image_size // int(res)) + + return UNetModel( + image_size=image_size, + in_channels=3, + model_channels=num_channels, + out_channels=(3 if not learn_sigma else 6), + num_res_blocks=num_res_blocks, + attention_resolutions=tuple(attention_ds), + dropout=dropout, + channel_mult=channel_mult, + num_classes=(NUM_CLASSES if class_cond else None), + use_checkpoint=use_checkpoint, + use_fp16=use_fp16, + num_heads=num_heads, + num_head_channels=num_head_channels, + num_heads_upsample=num_heads_upsample, + use_scale_shift_norm=use_scale_shift_norm, + resblock_updown=resblock_updown, + use_new_attention_order=use_new_attention_order, + ) + + +def create_gaussian_diffusion( + *, + steps=1000, + learn_sigma=False, + sigma_small=False, + noise_schedule="linear", + use_kl=False, + predict_xstart=False, + rescale_timesteps=False, + rescale_learned_sigmas=False, + timestep_respacing="", +): + betas = gd.get_named_beta_schedule(noise_schedule, steps) + if 
use_kl: + loss_type = gd.LossType.RESCALED_KL + elif rescale_learned_sigmas: + loss_type = gd.LossType.RESCALED_MSE + else: + loss_type = gd.LossType.MSE + if not timestep_respacing: + timestep_respacing = [steps] + return SpacedDiffusion( + use_timesteps=space_timesteps(steps, timestep_respacing), + betas=betas, + model_mean_type=(gd.ModelMeanType.EPSILON if not predict_xstart else gd.ModelMeanType.START_X), + model_var_type=((gd.ModelVarType.FIXED_LARGE if not sigma_small else gd.ModelVarType.FIXED_SMALL) + if not learn_sigma else gd.ModelVarType.LEARNED_RANGE), + loss_type=loss_type, + rescale_timesteps=rescale_timesteps, + ) diff --git a/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/sec_diff.py b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/sec_diff.py new file mode 100755 index 00000000..1e361f18 --- /dev/null +++ b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/sec_diff.py @@ -0,0 +1,135 @@ +''' +This code is rewritten by Paddle based on +https://github.com/jina-ai/discoart/blob/main/discoart/nn/sec_diff.py +''' +import math +from dataclasses import dataclass +from functools import partial + +import paddle +import paddle.nn as nn + + +@dataclass +class DiffusionOutput: + v: paddle.Tensor + pred: paddle.Tensor + eps: paddle.Tensor + + +class SkipBlock(nn.Layer): + + def __init__(self, main, skip=None): + super().__init__() + self.main = nn.Sequential(*main) + self.skip = skip if skip else nn.Identity() + + def forward(self, input): + return paddle.concat([self.main(input), self.skip(input)], axis=1) + + +def append_dims(x, n): + return x[(Ellipsis, *(None, ) * (n - x.ndim))] + + +def expand_to_planes(x, shape): + return paddle.tile(append_dims(x, len(shape)), [1, 1, *shape[2:]]) + + +def alpha_sigma_to_t(alpha, sigma): + return paddle.atan2(sigma, alpha) * 2 / math.pi + + +def t_to_alpha_sigma(t): + return paddle.cos(t * math.pi / 2), paddle.sin(t * math.pi / 2) + + +class SecondaryDiffusionImageNet2(nn.Layer): + + def __init__(self): + super().__init__() + c = 64 # The base channel count + cs = [c, c * 2, c * 2, c * 4, c * 4, c * 8] + + self.timestep_embed = FourierFeatures(1, 16) + self.down = nn.AvgPool2D(2) + self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False) + + self.net = nn.Sequential( + ConvBlock(3 + 16, cs[0]), + ConvBlock(cs[0], cs[0]), + SkipBlock([ + self.down, + ConvBlock(cs[0], cs[1]), + ConvBlock(cs[1], cs[1]), + SkipBlock([ + self.down, + ConvBlock(cs[1], cs[2]), + ConvBlock(cs[2], cs[2]), + SkipBlock([ + self.down, + ConvBlock(cs[2], cs[3]), + ConvBlock(cs[3], cs[3]), + SkipBlock([ + self.down, + ConvBlock(cs[3], cs[4]), + ConvBlock(cs[4], cs[4]), + SkipBlock([ + self.down, + ConvBlock(cs[4], cs[5]), + ConvBlock(cs[5], cs[5]), + ConvBlock(cs[5], cs[5]), + ConvBlock(cs[5], cs[4]), + self.up, + ]), + ConvBlock(cs[4] * 2, cs[4]), + ConvBlock(cs[4], cs[3]), + self.up, + ]), + ConvBlock(cs[3] * 2, cs[3]), + ConvBlock(cs[3], cs[2]), + self.up, + ]), + ConvBlock(cs[2] * 2, cs[2]), + ConvBlock(cs[2], cs[1]), + self.up, + ]), + ConvBlock(cs[1] * 2, cs[1]), + ConvBlock(cs[1], cs[0]), + self.up, + ]), + ConvBlock(cs[0] * 2, cs[0]), + nn.Conv2D(cs[0], 3, 3, padding=1), + ) + + def forward(self, input, t): + timestep_embed = expand_to_planes(self.timestep_embed(t[:, None]), input.shape) + v = self.net(paddle.concat([input, timestep_embed], axis=1)) + alphas, sigmas = map(partial(append_dims, n=v.ndim), t_to_alpha_sigma(t)) + pred = input * alphas - v * 
sigmas + eps = input * sigmas + v * alphas + return DiffusionOutput(v, pred, eps) + + +class FourierFeatures(nn.Layer): + + def __init__(self, in_features, out_features, std=1.0): + super().__init__() + assert out_features % 2 == 0 + # self.weight = nn.Parameter(paddle.randn([out_features // 2, in_features]) * std) + self.weight = paddle.create_parameter([out_features // 2, in_features], + dtype='float32', + default_initializer=nn.initializer.Normal(mean=0.0, std=std)) + + def forward(self, input): + f = 2 * math.pi * input @ self.weight.T + return paddle.concat([f.cos(), f.sin()], axis=-1) + + +class ConvBlock(nn.Sequential): + + def __init__(self, c_in, c_out): + super().__init__( + nn.Conv2D(c_in, c_out, 3, padding=1), + nn.ReLU(), + ) diff --git a/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/transforms.py b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/transforms.py new file mode 100755 index 00000000..e0b620b0 --- /dev/null +++ b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/transforms.py @@ -0,0 +1,757 @@ +''' +This code is rewritten by Paddle based on +https://github.com/pytorch/vision/blob/main/torchvision/transforms/transforms.py +''' +import math +import numbers +import warnings +from enum import Enum +from typing import Any +from typing import Dict +from typing import List +from typing import Optional +from typing import Sequence +from typing import Tuple +from typing import Union + +import paddle +import paddle.nn as nn +from paddle import Tensor +from paddle.nn import functional as F +from paddle.nn.functional import grid_sample +from paddle.vision import transforms as T + + +class Normalize(nn.Layer): + + def __init__(self, mean, std): + super(Normalize, self).__init__() + self.mean = paddle.to_tensor(mean) + self.std = paddle.to_tensor(std) + + def forward(self, tensor: Tensor): + dtype = tensor.dtype + mean = paddle.to_tensor(self.mean, dtype=dtype) + std = paddle.to_tensor(self.std, dtype=dtype) + mean = mean.reshape([1, -1, 1, 1]) + std = std.reshape([1, -1, 1, 1]) + result = tensor.subtract(mean).divide(std) + return result + + +class InterpolationMode(Enum): + """Interpolation modes + Available interpolation methods are ``nearest``, ``bilinear``, ``bicubic``, ``box``, ``hamming``, and ``lanczos``. 
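+
+    Only ``NEAREST`` and ``BILINEAR`` are used by the tensor-based ``affine`` /
+    ``RandomAffine`` implementations below; the remaining members are kept for
+    PIL compatibility. Integer codes can be converted with
+    ``_interpolation_modes_from_int``, e.g. ``_interpolation_modes_from_int(2)``
+    returns ``InterpolationMode.BILINEAR``.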
+ """ + + NEAREST = "nearest" + BILINEAR = "bilinear" + BICUBIC = "bicubic" + # For PIL compatibility + BOX = "box" + HAMMING = "hamming" + LANCZOS = "lanczos" + + +class Grayscale(nn.Layer): + + def __init__(self, num_output_channels): + super(Grayscale, self).__init__() + self.num_output_channels = num_output_channels + + def forward(self, x): + output = (0.2989 * x[:, 0:1, :, :] + 0.587 * x[:, 1:2, :, :] + 0.114 * x[:, 2:3, :, :]) + if self.num_output_channels == 3: + return output.expand(x.shape) + + return output + + +class Lambda(nn.Layer): + + def __init__(self, func): + super(Lambda, self).__init__() + self.transform = func + + def forward(self, x): + return self.transform(x) + + +class RandomGrayscale(nn.Layer): + + def __init__(self, p): + super(RandomGrayscale, self).__init__() + self.prob = p + self.transform = Grayscale(3) + + def forward(self, x): + if paddle.rand([1]) < self.prob: + return self.transform(x) + else: + return x + + +class RandomHorizontalFlip(nn.Layer): + + def __init__(self, prob): + super(RandomHorizontalFlip, self).__init__() + self.prob = prob + + def forward(self, x): + if paddle.rand([1]) < self.prob: + return x[:, :, :, ::-1] + else: + return x + + +def _blend(img1: Tensor, img2: Tensor, ratio: float) -> Tensor: + ratio = float(ratio) + bound = 1.0 + return (ratio * img1 + (1.0 - ratio) * img2).clip(0, bound) + + +def trunc_div(a, b): + ipt = paddle.divide(a, b) + sign_ipt = paddle.sign(ipt) + abs_ipt = paddle.abs(ipt) + abs_ipt = paddle.floor(abs_ipt) + out = paddle.multiply(sign_ipt, abs_ipt) + return out + + +def fmod(a, b): + return a - trunc_div(a, b) * b + + +def _rgb2hsv(img: Tensor) -> Tensor: + r, g, b = img.unbind(axis=-3) + + # Implementation is based on https://github.com/python-pillow/Pillow/blob/4174d4267616897df3746d315d5a2d0f82c656ee/ + # src/libImaging/Convert.c#L330 + maxc = paddle.max(img, axis=-3) + minc = paddle.min(img, axis=-3) + + # The algorithm erases S and H channel where `maxc = minc`. This avoids NaN + # from happening in the results, because + # + S channel has division by `maxc`, which is zero only if `maxc = minc` + # + H channel has division by `(maxc - minc)`. + # + # Instead of overwriting NaN afterwards, we just prevent it from occuring so + # we don't need to deal with it in case we save the NaN in a buffer in + # backprop, if it is ever supported, but it doesn't hurt to do so. + eqc = maxc == minc + + cr = maxc - minc + # Since `eqc => cr = 0`, replacing denominator with 1 when `eqc` is fine. + ones = paddle.ones_like(maxc) + s = cr / paddle.where(eqc, ones, maxc) + # Note that `eqc => maxc = minc = r = g = b`. So the following calculation + # of `h` would reduce to `bc - gc + 2 + rc - bc + 4 + rc - bc = 6` so it + # would not matter what values `rc`, `gc`, and `bc` have here, and thus + # replacing denominator with 1 when `eqc` is fine. 
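+    # Summary of the steps below: saturation is cr / maxc (with the eqc guard),
+    # hue is assembled piecewise from whichever channel equals maxc and then
+    # wrapped into [0, 1) via fmod(h / 6 + 1, 1), and value is simply maxc.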
+ cr_divisor = paddle.where(eqc, ones, cr) + rc = (maxc - r) / cr_divisor + gc = (maxc - g) / cr_divisor + bc = (maxc - b) / cr_divisor + + hr = (maxc == r).cast('float32') * (bc - gc) + hg = ((maxc == g) & (maxc != r)).cast('float32') * (2.0 + rc - bc) + hb = ((maxc != g) & (maxc != r)).cast('float32') * (4.0 + gc - rc) + h = hr + hg + hb + h = fmod((h / 6.0 + 1.0), paddle.to_tensor(1.0)) + return paddle.stack((h, s, maxc), axis=-3) + + +def _hsv2rgb(img: Tensor) -> Tensor: + h, s, v = img.unbind(axis=-3) + i = paddle.floor(h * 6.0) + f = (h * 6.0) - i + i = i.cast(dtype='int32') + + p = paddle.clip((v * (1.0 - s)), 0.0, 1.0) + q = paddle.clip((v * (1.0 - s * f)), 0.0, 1.0) + t = paddle.clip((v * (1.0 - s * (1.0 - f))), 0.0, 1.0) + i = i % 6 + + mask = i.unsqueeze(axis=-3) == paddle.arange(6).reshape([-1, 1, 1]) + + a1 = paddle.stack((v, q, p, p, t, v), axis=-3) + a2 = paddle.stack((t, v, v, q, p, p), axis=-3) + a3 = paddle.stack((p, p, t, v, v, q), axis=-3) + a4 = paddle.stack((a1, a2, a3), axis=-4) + + return paddle.einsum("...ijk, ...xijk -> ...xjk", mask.cast(dtype=img.dtype), a4) + + +def adjust_brightness(img: Tensor, brightness_factor: float) -> Tensor: + if brightness_factor < 0: + raise ValueError(f"brightness_factor ({brightness_factor}) is not non-negative.") + + return _blend(img, paddle.zeros_like(img), brightness_factor) + + +def adjust_contrast(img: Tensor, contrast_factor: float) -> Tensor: + if contrast_factor < 0: + raise ValueError(f"contrast_factor ({contrast_factor}) is not non-negative.") + + c = img.shape[1] + + if c == 3: + output = (0.2989 * img[:, 0:1, :, :] + 0.587 * img[:, 1:2, :, :] + 0.114 * img[:, 2:3, :, :]) + mean = paddle.mean(output, axis=(-3, -2, -1), keepdim=True) + + else: + mean = paddle.mean(img, axis=(-3, -2, -1), keepdim=True) + + return _blend(img, mean, contrast_factor) + + +def adjust_hue(img: Tensor, hue_factor: float) -> Tensor: + if not (-0.5 <= hue_factor <= 0.5): + raise ValueError(f"hue_factor ({hue_factor}) is not in [-0.5, 0.5].") + + img = _rgb2hsv(img) + h, s, v = img.unbind(axis=-3) + h = fmod(h + hue_factor, paddle.to_tensor(1.0)) + img = paddle.stack((h, s, v), axis=-3) + img_hue_adj = _hsv2rgb(img) + return img_hue_adj + + +def adjust_saturation(img: Tensor, saturation_factor: float) -> Tensor: + if saturation_factor < 0: + raise ValueError(f"saturation_factor ({saturation_factor}) is not non-negative.") + + output = (0.2989 * img[:, 0:1, :, :] + 0.587 * img[:, 1:2, :, :] + 0.114 * img[:, 2:3, :, :]) + + return _blend(img, output, saturation_factor) + + +class ColorJitter(nn.Layer): + + def __init__(self, brightness=0, contrast=0, saturation=0, hue=0): + super(ColorJitter, self).__init__() + self.brightness = self._check_input(brightness, "brightness") + self.contrast = self._check_input(contrast, "contrast") + self.saturation = self._check_input(saturation, "saturation") + self.hue = self._check_input(hue, "hue", center=0, bound=(-0.5, 0.5), clip_first_on_zero=False) + + def _check_input(self, value, name, center=1, bound=(0, float("inf")), clip_first_on_zero=True): + if isinstance(value, numbers.Number): + if value < 0: + raise ValueError(f"If {name} is a single number, it must be non negative.") + value = [center - float(value), center + float(value)] + if clip_first_on_zero: + value[0] = max(value[0], 0.0) + elif isinstance(value, (tuple, list)) and len(value) == 2: + if not bound[0] <= value[0] <= value[1] <= bound[1]: + raise ValueError(f"{name} values should be between {bound}") + else: + raise TypeError(f"{name} should be 
a single number or a list/tuple with length 2.") + + # if value is 0 or (1., 1.) for brightness/contrast/saturation + # or (0., 0.) for hue, do nothing + if value[0] == value[1] == center: + value = None + return value + + @staticmethod + def get_params( + brightness: Optional[List[float]], + contrast: Optional[List[float]], + saturation: Optional[List[float]], + hue: Optional[List[float]], + ) -> Tuple[Tensor, Optional[float], Optional[float], Optional[float], Optional[float]]: + """Get the parameters for the randomized transform to be applied on image. + + Args: + brightness (tuple of float (min, max), optional): The range from which the brightness_factor is chosen + uniformly. Pass None to turn off the transformation. + contrast (tuple of float (min, max), optional): The range from which the contrast_factor is chosen + uniformly. Pass None to turn off the transformation. + saturation (tuple of float (min, max), optional): The range from which the saturation_factor is chosen + uniformly. Pass None to turn off the transformation. + hue (tuple of float (min, max), optional): The range from which the hue_factor is chosen uniformly. + Pass None to turn off the transformation. + + Returns: + tuple: The parameters used to apply the randomized transform + along with their random order. + """ + fn_idx = paddle.randperm(4) + + b = None if brightness is None else paddle.empty([1]).uniform_(brightness[0], brightness[1]) + c = None if contrast is None else paddle.empty([1]).uniform_(contrast[0], contrast[1]) + s = None if saturation is None else paddle.empty([1]).uniform_(saturation[0], saturation[1]) + h = None if hue is None else paddle.empty([1]).uniform_(hue[0], hue[1]) + + return fn_idx, b, c, s, h + + def forward(self, img): + """ + Args: + img (PIL Image or Tensor): Input image. + + Returns: + PIL Image or Tensor: Color jittered image. 
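+
+        Example (illustrative sketch; assumes a float NCHW tensor in [0, 1]):
+
+            jitter = ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1)
+            out = jitter(paddle.rand([1, 3, 64, 64]))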
+ """ + fn_idx, brightness_factor, contrast_factor, saturation_factor, hue_factor = self.get_params( + self.brightness, self.contrast, self.saturation, self.hue) + + for fn_id in fn_idx: + if fn_id == 0 and brightness_factor is not None: + img = adjust_brightness(img, brightness_factor) + elif fn_id == 1 and contrast_factor is not None: + img = adjust_contrast(img, contrast_factor) + elif fn_id == 2 and saturation_factor is not None: + img = adjust_saturation(img, saturation_factor) + elif fn_id == 3 and hue_factor is not None: + img = adjust_hue(img, hue_factor) + + return img + + def __repr__(self) -> str: + s = (f"{self.__class__.__name__}(" + f"brightness={self.brightness}" + f", contrast={self.contrast}" + f", saturation={self.saturation}" + f", hue={self.hue})") + return s + + +def _apply_grid_transform(img: Tensor, grid: Tensor, mode: str, fill: Optional[List[float]]) -> Tensor: + + if img.shape[0] > 1: + # Apply same grid to a batch of images + grid = grid.expand([img.shape[0], grid.shape[1], grid.shape[2], grid.shape[3]]) + + # Append a dummy mask for customized fill colors, should be faster than grid_sample() twice + if fill is not None: + dummy = paddle.ones((img.shape[0], 1, img.shape[2], img.shape[3]), dtype=img.dtype) + img = paddle.concat((img, dummy), axis=1) + + img = grid_sample(img, grid, mode=mode, padding_mode="zeros", align_corners=False) + + # Fill with required color + if fill is not None: + mask = img[:, -1:, :, :] # N * 1 * H * W + img = img[:, :-1, :, :] # N * C * H * W + mask = mask.expand_as(img) + len_fill = len(fill) if isinstance(fill, (tuple, list)) else 1 + fill_img = paddle.to_tensor(fill, dtype=img.dtype).reshape([1, len_fill, 1, 1]).expand_as(img) + if mode == "nearest": + mask = mask < 0.5 + img[mask] = fill_img[mask] + else: # 'bilinear' + img = img * mask + (1.0 - mask) * fill_img + return img + + +def _gen_affine_grid( + theta: Tensor, + w: int, + h: int, + ow: int, + oh: int, +) -> Tensor: + # https://github.com/pytorch/pytorch/blob/74b65c32be68b15dc7c9e8bb62459efbfbde33d8/aten/src/ATen/native/ + # AffineGridGenerator.cpp#L18 + # Difference with AffineGridGenerator is that: + # 1) we normalize grid values after applying theta + # 2) we can normalize by other image size, such that it covers "extend" option like in PIL.Image.rotate + + d = 0.5 + base_grid = paddle.empty([1, oh, ow, 3], dtype=theta.dtype) + x_grid = paddle.linspace(-ow * 0.5 + d, ow * 0.5 + d - 1, num=ow) + base_grid[..., 0] = (x_grid) + y_grid = paddle.linspace(-oh * 0.5 + d, oh * 0.5 + d - 1, num=oh).unsqueeze_(-1) + base_grid[..., 1] = (y_grid) + base_grid[..., 2] = 1.0 + rescaled_theta = theta.transpose([0, 2, 1]) / paddle.to_tensor([0.5 * w, 0.5 * h], dtype=theta.dtype) + output_grid = base_grid.reshape([1, oh * ow, 3]).bmm(rescaled_theta) + return output_grid.reshape([1, oh, ow, 2]) + + +def affine_impl(img: Tensor, + matrix: List[float], + interpolation: str = "nearest", + fill: Optional[List[float]] = None) -> Tensor: + theta = paddle.to_tensor(matrix, dtype=img.dtype).reshape([1, 2, 3]) + shape = img.shape + # grid will be generated on the same device as theta and img + grid = _gen_affine_grid(theta, w=shape[-1], h=shape[-2], ow=shape[-1], oh=shape[-2]) + return _apply_grid_transform(img, grid, interpolation, fill=fill) + + +def _get_inverse_affine_matrix(center: List[float], + angle: float, + translate: List[float], + scale: float, + shear: List[float], + inverted: bool = True) -> List[float]: + # Helper method to compute inverse matrix for affine transformation + + # Pillow 
requires inverse affine transformation matrix: + # Affine matrix is : M = T * C * RotateScaleShear * C^-1 + # + # where T is translation matrix: [1, 0, tx | 0, 1, ty | 0, 0, 1] + # C is translation matrix to keep center: [1, 0, cx | 0, 1, cy | 0, 0, 1] + # RotateScaleShear is rotation with scale and shear matrix + # + # RotateScaleShear(a, s, (sx, sy)) = + # = R(a) * S(s) * SHy(sy) * SHx(sx) + # = [ s*cos(a - sy)/cos(sy), s*(-cos(a - sy)*tan(sx)/cos(sy) - sin(a)), 0 ] + # [ s*sin(a + sy)/cos(sy), s*(-sin(a - sy)*tan(sx)/cos(sy) + cos(a)), 0 ] + # [ 0 , 0 , 1 ] + # where R is a rotation matrix, S is a scaling matrix, and SHx and SHy are the shears: + # SHx(s) = [1, -tan(s)] and SHy(s) = [1 , 0] + # [0, 1 ] [-tan(s), 1] + # + # Thus, the inverse is M^-1 = C * RotateScaleShear^-1 * C^-1 * T^-1 + + rot = math.radians(angle) + sx = math.radians(shear[0]) + sy = math.radians(shear[1]) + + cx, cy = center + tx, ty = translate + + # RSS without scaling + a = math.cos(rot - sy) / math.cos(sy) + b = -math.cos(rot - sy) * math.tan(sx) / math.cos(sy) - math.sin(rot) + c = math.sin(rot - sy) / math.cos(sy) + d = -math.sin(rot - sy) * math.tan(sx) / math.cos(sy) + math.cos(rot) + + if inverted: + # Inverted rotation matrix with scale and shear + # det([[a, b], [c, d]]) == 1, since det(rotation) = 1 and det(shear) = 1 + matrix = [d, -b, 0.0, -c, a, 0.0] + matrix = [x / scale for x in matrix] + # Apply inverse of translation and of center translation: RSS^-1 * C^-1 * T^-1 + matrix[2] += matrix[0] * (-cx - tx) + matrix[1] * (-cy - ty) + matrix[5] += matrix[3] * (-cx - tx) + matrix[4] * (-cy - ty) + # Apply center translation: C * RSS^-1 * C^-1 * T^-1 + matrix[2] += cx + matrix[5] += cy + else: + matrix = [a, b, 0.0, c, d, 0.0] + matrix = [x * scale for x in matrix] + # Apply inverse of center translation: RSS * C^-1 + matrix[2] += matrix[0] * (-cx) + matrix[1] * (-cy) + matrix[5] += matrix[3] * (-cx) + matrix[4] * (-cy) + # Apply translation and center : T * C * RSS * C^-1 + matrix[2] += cx + tx + matrix[5] += cy + ty + + return matrix + + +def affine( + img: Tensor, + angle: float, + translate: List[int], + scale: float, + shear: List[float], + interpolation: InterpolationMode = InterpolationMode.NEAREST, + fill: Optional[List[float]] = None, + resample: Optional[int] = None, + fillcolor: Optional[List[float]] = None, + center: Optional[List[int]] = None, +) -> Tensor: + """Apply affine transformation on the image keeping image center invariant. + If the image is paddle Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions. + + Args: + img (PIL Image or Tensor): image to transform. + angle (number): rotation angle in degrees between -180 and 180, clockwise direction. + translate (sequence of integers): horizontal and vertical translations (post-rotation translation) + scale (float): overall scale + shear (float or sequence): shear angle value in degrees between -180 to 180, clockwise direction. + If a sequence is specified, the first value corresponds to a shear parallel to the x axis, while + the second value corresponds to a shear parallel to the y axis. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + For backward compatibility integer values (e.g. 
``PIL.Image[.Resampling].NEAREST``) are still accepted, + but deprecated since 0.13 and will be removed in 0.15. Please use InterpolationMode enum. + fill (sequence or number, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + + .. note:: + In torchscript mode single int/float value is not supported, please use a sequence + of length 1: ``[value, ]``. + fillcolor (sequence or number, optional): + .. warning:: + This parameter was deprecated in ``0.12`` and will be removed in ``0.14``. Please use ``fill`` instead. + resample (int, optional): + .. warning:: + This parameter was deprecated in ``0.12`` and will be removed in ``0.14``. Please use ``interpolation`` + instead. + center (sequence, optional): Optional center of rotation. Origin is the upper left corner. + Default is the center of the image. + + Returns: + PIL Image or Tensor: Transformed image. + """ + + # Backward compatibility with integer value + if isinstance(interpolation, int): + warnings.warn("Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. " + "Please use InterpolationMode enum.") + interpolation = _interpolation_modes_from_int(interpolation) + + if fillcolor is not None: + warnings.warn("The parameter 'fillcolor' is deprecated since 0.12 and will be removed in 0.14. " + "Please use 'fill' instead.") + fill = fillcolor + + if not isinstance(angle, (int, float)): + raise TypeError("Argument angle should be int or float") + + if not isinstance(translate, (list, tuple)): + raise TypeError("Argument translate should be a sequence") + + if len(translate) != 2: + raise ValueError("Argument translate should be a sequence of length 2") + + if scale <= 0.0: + raise ValueError("Argument scale should be positive") + + if not isinstance(shear, (numbers.Number, (list, tuple))): + raise TypeError("Shear should be either a single value or a sequence of two values") + + if not isinstance(interpolation, InterpolationMode): + raise TypeError("Argument interpolation should be a InterpolationMode") + + if isinstance(angle, int): + angle = float(angle) + + if isinstance(translate, tuple): + translate = list(translate) + + if isinstance(shear, numbers.Number): + shear = [shear, 0.0] + + if isinstance(shear, tuple): + shear = list(shear) + + if len(shear) == 1: + shear = [shear[0], shear[0]] + + if len(shear) != 2: + raise ValueError(f"Shear should be a sequence containing two values. Got {shear}") + + if center is not None and not isinstance(center, (list, tuple)): + raise TypeError("Argument center should be a sequence") + center_f = [0.0, 0.0] + if center is not None: + _, height, width = img.shape[0], img.shape[1], img.shape[2] + # Center values should be in pixel coordinates but translated such that (0, 0) corresponds to image center. 
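+        # For example, with width = height = 512, center = [256, 256] gives
+        # center_f = [0.0, 0.0] and center = [0, 0] gives center_f = [-256.0, -256.0].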
+ center_f = [1.0 * (c - s * 0.5) for c, s in zip(center, [width, height])] + + translate_f = [1.0 * t for t in translate] + matrix = _get_inverse_affine_matrix(center_f, angle, translate_f, scale, shear) + return affine_impl(img, matrix=matrix, interpolation=interpolation.value, fill=fill) + + +def _interpolation_modes_from_int(i: int) -> InterpolationMode: + inverse_modes_mapping = { + 0: InterpolationMode.NEAREST, + 2: InterpolationMode.BILINEAR, + 3: InterpolationMode.BICUBIC, + 4: InterpolationMode.BOX, + 5: InterpolationMode.HAMMING, + 1: InterpolationMode.LANCZOS, + } + return inverse_modes_mapping[i] + + +def _check_sequence_input(x, name, req_sizes): + msg = req_sizes[0] if len(req_sizes) < 2 else " or ".join([str(s) for s in req_sizes]) + if not isinstance(x, Sequence): + raise TypeError(f"{name} should be a sequence of length {msg}.") + if len(x) not in req_sizes: + raise ValueError(f"{name} should be sequence of length {msg}.") + + +def _setup_angle(x, name, req_sizes=(2, )): + if isinstance(x, numbers.Number): + if x < 0: + raise ValueError(f"If {name} is a single number, it must be positive.") + x = [-x, x] + else: + _check_sequence_input(x, name, req_sizes) + + return [float(d) for d in x] + + +class RandomAffine(nn.Layer): + """Random affine transformation of the image keeping center invariant. + If the image is paddle Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions. + + Args: + degrees (sequence or number): Range of degrees to select from. + If degrees is a number instead of sequence like (min, max), the range of degrees + will be (-degrees, +degrees). Set to 0 to deactivate rotations. + translate (tuple, optional): tuple of maximum absolute fraction for horizontal + and vertical translations. For example translate=(a, b), then horizontal shift + is randomly sampled in the range -img_width * a < dx < img_width * a and vertical shift is + randomly sampled in the range -img_height * b < dy < img_height * b. Will not translate by default. + scale (tuple, optional): scaling factor interval, e.g (a, b), then scale is + randomly sampled from the range a <= scale <= b. Will keep original scale by default. + shear (sequence or number, optional): Range of degrees to select from. + If shear is a number, a shear parallel to the x axis in the range (-shear, +shear) + will be applied. Else if shear is a sequence of 2 values a shear parallel to the x axis in the + range (shear[0], shear[1]) will be applied. Else if shear is a sequence of 4 values, + a x-axis shear in (shear[0], shear[1]) and y-axis shear in (shear[2], shear[3]) will be applied. + Will not apply shear by default. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + For backward compatibility integer values (e.g. ``PIL.Image[.Resampling].NEAREST``) are still accepted, + but deprecated since 0.13 and will be removed in 0.15. Please use InterpolationMode enum. + fill (sequence or number): Pixel fill value for the area outside the transformed + image. Default is ``0``. If given a number, the value is used for all bands respectively. + fillcolor (sequence or number, optional): + .. warning:: + This parameter was deprecated in ``0.12`` and will be removed in ``0.14``. Please use ``fill`` instead. + resample (int, optional): + .. 
warning:: + This parameter was deprecated in ``0.12`` and will be removed in ``0.14``. Please use ``interpolation`` + instead. + center (sequence, optional): Optional center of rotation, (x, y). Origin is the upper left corner. + Default is the center of the image. + + .. _filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters + + """ + + def __init__( + self, + degrees, + translate=None, + scale=None, + shear=None, + interpolation=InterpolationMode.NEAREST, + fill=0, + fillcolor=None, + resample=None, + center=None, + ): + super(RandomAffine, self).__init__() + if resample is not None: + warnings.warn("The parameter 'resample' is deprecated since 0.12 and will be removed in 0.14. " + "Please use 'interpolation' instead.") + interpolation = _interpolation_modes_from_int(resample) + + # Backward compatibility with integer value + if isinstance(interpolation, int): + warnings.warn("Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. " + "Please use InterpolationMode enum.") + interpolation = _interpolation_modes_from_int(interpolation) + + if fillcolor is not None: + warnings.warn("The parameter 'fillcolor' is deprecated since 0.12 and will be removed in 0.14. " + "Please use 'fill' instead.") + fill = fillcolor + + self.degrees = _setup_angle(degrees, name="degrees", req_sizes=(2, )) + + if translate is not None: + _check_sequence_input(translate, "translate", req_sizes=(2, )) + for t in translate: + if not (0.0 <= t <= 1.0): + raise ValueError("translation values should be between 0 and 1") + self.translate = translate + + if scale is not None: + _check_sequence_input(scale, "scale", req_sizes=(2, )) + for s in scale: + if s <= 0: + raise ValueError("scale values should be positive") + self.scale = scale + + if shear is not None: + self.shear = _setup_angle(shear, name="shear", req_sizes=(2, 4)) + else: + self.shear = shear + + self.resample = self.interpolation = interpolation + + if fill is None: + fill = 0 + elif not isinstance(fill, (Sequence, numbers.Number)): + raise TypeError("Fill should be either a sequence or a number.") + + self.fillcolor = self.fill = fill + + if center is not None: + _check_sequence_input(center, "center", req_sizes=(2, )) + + self.center = center + + @staticmethod + def get_params( + degrees: List[float], + translate: Optional[List[float]], + scale_ranges: Optional[List[float]], + shears: Optional[List[float]], + img_size: List[int], + ) -> Tuple[float, Tuple[int, int], float, Tuple[float, float]]: + """Get parameters for affine transformation + + Returns: + params to be passed to the affine transformation + """ + angle = float(paddle.empty([1]).uniform_(float(degrees[0]), float(degrees[1]))) + if translate is not None: + max_dx = float(translate[0] * img_size[0]) + max_dy = float(translate[1] * img_size[1]) + tx = int(float(paddle.empty([1]).uniform_(-max_dx, max_dx))) + ty = int(float(paddle.empty([1]).uniform_(-max_dy, max_dy))) + translations = (tx, ty) + else: + translations = (0, 0) + + if scale_ranges is not None: + scale = float(paddle.empty([1]).uniform_(scale_ranges[0], scale_ranges[1])) + else: + scale = 1.0 + + shear_x = shear_y = 0.0 + if shears is not None: + shear_x = float(paddle.empty([1]).uniform_(shears[0], shears[1])) + if len(shears) == 4: + shear_y = float(paddle.empty([1]).uniform_(shears[2], shears[3])) + + shear = (shear_x, shear_y) + + return angle, translations, scale, shear + + def forward(self, img): + fill = self.fill + channels, height, width = img.shape[1], 
img.shape[2], img.shape[3] + if isinstance(fill, (int, float)): + fill = [float(fill)] * channels + else: + fill = [float(f) for f in fill] + + img_size = [width, height] # flip for keeping BC on get_params call + + ret = self.get_params(self.degrees, self.translate, self.scale, self.shear, img_size) + + return affine(img, *ret, interpolation=self.interpolation, fill=fill, center=self.center) + + def __repr__(self) -> str: + s = f"{self.__class__.__name__}(degrees={self.degrees}" + s += f", translate={self.translate}" if self.translate is not None else "" + s += f", scale={self.scale}" if self.scale is not None else "" + s += f", shear={self.shear}" if self.shear is not None else "" + s += f", interpolation={self.interpolation.value}" if self.interpolation != InterpolationMode.NEAREST else "" + s += f", fill={self.fill}" if self.fill != 0 else "" + s += f", center={self.center}" if self.center is not None else "" + s += ")" + + return s diff --git a/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/unet.py b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/unet.py new file mode 100755 index 00000000..56f3ad61 --- /dev/null +++ b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/model/unet.py @@ -0,0 +1,838 @@ +''' +This code is rewritten by Paddle based on +https://github.com/openai/guided-diffusion/blob/main/guided_diffusion/unet.py +''' +import math +from abc import abstractmethod + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from .nn import avg_pool_nd +from .nn import checkpoint +from .nn import conv_nd +from .nn import linear +from .nn import normalization +from .nn import SiLU +from .nn import timestep_embedding +from .nn import zero_module + + +class AttentionPool2d(nn.Layer): + """ + Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py + """ + + def __init__( + self, + spacial_dim: int, + embed_dim: int, + num_heads_channels: int, + output_dim: int = None, + ): + super().__init__() + # self.positional_embedding = nn.Parameter( + # th.randn(embed_dim, spacial_dim ** 2 + 1) / embed_dim ** 0.5 + # ) + positional_embedding = self.create_parameter(paddle.randn(embed_dim, spacial_dim**2 + 1) / embed_dim**0.5) + self.add_parameter("positional_embedding", positional_embedding) + self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1) + self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1) + self.num_heads = embed_dim // num_heads_channels + self.attention = QKVAttention(self.num_heads) + + def forward(self, x): + b, c, *_spatial = x.shape + # x = x.reshape(b, c, -1) # NC(HW) + x = paddle.reshape(x, [b, c, -1]) + x = paddle.concat([x.mean(dim=-1, keepdim=True), x], axis=-1) # NC(HW+1) + x = x + paddle.cast(self.positional_embedding[None, :, :], x.dtype) # NC(HW+1) + x = self.qkv_proj(x) + x = self.attention(x) + x = self.c_proj(x) + return x[:, :, 0] + + +class TimestepBlock(nn.Layer): + """ + Any module where forward() takes timestep embeddings as a second argument. + """ + + @abstractmethod + def forward(self, x, emb): + """ + Apply the module to `x` given `emb` timestep embeddings. + """ + + +class TimestepEmbedSequential(nn.Sequential, TimestepBlock): + """ + A sequential module that passes timestep embeddings to the children that + support it as an extra input. 
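+
+    For example, ``TimestepEmbedSequential(ResBlock(...), AttentionBlock(...))``
+    passes ``emb`` to the ResBlock (a TimestepBlock) but calls the
+    AttentionBlock with ``x`` only.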
+ """ + + def forward(self, x, emb): + for layer in self: + if isinstance(layer, TimestepBlock): + x = layer(x, emb) + else: + x = layer(x) + return x + + +class Upsample(nn.Layer): + """ + An upsampling layer with an optional convolution. + + :param channels: channels in the inputs and outputs. + :param use_conv: a bool determining if a convolution is applied. + :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + upsampling occurs in the inner-two dimensions. + """ + + def __init__(self, channels, use_conv, dims=2, out_channels=None): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.dims = dims + if use_conv: + self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=1) + + def forward(self, x): + assert x.shape[1] == self.channels + if self.dims == 3: + x = F.interpolate(x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode="nearest") + else: + x = F.interpolate(x, scale_factor=2, mode="nearest") + if self.use_conv: + x = self.conv(x) + return x + + +class Downsample(nn.Layer): + """ + A downsampling layer with an optional convolution. + + :param channels: channels in the inputs and outputs. + :param use_conv: a bool determining if a convolution is applied. + :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + downsampling occurs in the inner-two dimensions. + """ + + def __init__(self, channels, use_conv, dims=2, out_channels=None): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.dims = dims + stride = 2 if dims != 3 else (1, 2, 2) + if use_conv: + self.op = conv_nd(dims, self.channels, self.out_channels, 3, stride=stride, padding=1) + else: + assert self.channels == self.out_channels + self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride) + + def forward(self, x): + assert x.shape[1] == self.channels + return self.op(x) + + +class ResBlock(TimestepBlock): + """ + A residual block that can optionally change the number of channels. + + :param channels: the number of input channels. + :param emb_channels: the number of timestep embedding channels. + :param dropout: the rate of dropout. + :param out_channels: if specified, the number of out channels. + :param use_conv: if True and out_channels is specified, use a spatial + convolution instead of a smaller 1x1 convolution to change the + channels in the skip connection. + :param dims: determines if the signal is 1D, 2D, or 3D. + :param use_checkpoint: if True, use gradient checkpointing on this module. + :param up: if True, use this block for upsampling. + :param down: if True, use this block for downsampling. 
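+    :param use_scale_shift_norm: if True, condition on the timestep embedding
+        FiLM-style, as ``out_norm(h) * (1 + scale) + shift``, instead of simply
+        adding the embedding to ``h``.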
+ """ + + def __init__( + self, + channels, + emb_channels, + dropout, + out_channels=None, + use_conv=False, + use_scale_shift_norm=False, + dims=2, + use_checkpoint=False, + up=False, + down=False, + ): + super().__init__() + self.channels = channels + self.emb_channels = emb_channels + self.dropout = dropout + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.use_checkpoint = use_checkpoint + self.use_scale_shift_norm = use_scale_shift_norm + + self.in_layers = nn.Sequential( + normalization(channels), + SiLU(), + conv_nd(dims, channels, self.out_channels, 3, padding=1), + ) + + self.updown = up or down + + if up: + self.h_upd = Upsample(channels, False, dims) + self.x_upd = Upsample(channels, False, dims) + elif down: + self.h_upd = Downsample(channels, False, dims) + self.x_upd = Downsample(channels, False, dims) + else: + self.h_upd = self.x_upd = nn.Identity() + + self.emb_layers = nn.Sequential( + SiLU(), + linear( + emb_channels, + 2 * self.out_channels if use_scale_shift_norm else self.out_channels, + ), + ) + self.out_layers = nn.Sequential( + normalization(self.out_channels), + SiLU(), + nn.Dropout(p=dropout), + zero_module(conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1)), + ) + + if self.out_channels == channels: + self.skip_connection = nn.Identity() + elif use_conv: + self.skip_connection = conv_nd(dims, channels, self.out_channels, 3, padding=1) + else: + self.skip_connection = conv_nd(dims, channels, self.out_channels, 1) + + def forward(self, x, emb): + """ + Apply the block to a Tensor, conditioned on a timestep embedding. + + :param x: an [N x C x ...] Tensor of features. + :param emb: an [N x emb_channels] Tensor of timestep embeddings. + :return: an [N x C x ...] Tensor of outputs. + """ + return checkpoint(self._forward, (x, emb), self.parameters(), self.use_checkpoint) + + def _forward(self, x, emb): + if self.updown: + in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1] + h = in_rest(x) + h = self.h_upd(h) + x = self.x_upd(x) + h = in_conv(h) + else: + h = self.in_layers(x) + emb_out = self.emb_layers(emb) + emb_out = paddle.cast(emb_out, h.dtype) + while len(emb_out.shape) < len(h.shape): + emb_out = emb_out[..., None] + if self.use_scale_shift_norm: + out_norm, out_rest = self.out_layers[0], self.out_layers[1:] + scale, shift = paddle.chunk(emb_out, 2, axis=1) + h = out_norm(h) * (1 + scale) + shift + h = out_rest(h) + else: + h = h + emb_out + h = self.out_layers(h) + return self.skip_connection(x) + h + + +class AttentionBlock(nn.Layer): + """ + An attention block that allows spatial positions to attend to each other. + + Originally ported from here, but adapted to the N-d case. + https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66. 
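+
+    The input [N x C x ...] is flattened to [N x C x (spatial)] for attention
+    and reshaped back afterwards; the output projection is zero-initialized, so
+    the residual branch starts as an identity. For example,
+    ``AttentionBlock(512, num_head_channels=64)`` attends with 8 heads.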
+ """ + + def __init__( + self, + channels, + num_heads=1, + num_head_channels=-1, + use_checkpoint=False, + use_new_attention_order=False, + ): + super().__init__() + self.channels = channels + if num_head_channels == -1: + self.num_heads = num_heads + else: + assert (channels % num_head_channels == 0 + ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}" + self.num_heads = channels // num_head_channels + self.use_checkpoint = use_checkpoint + self.norm = normalization(channels) + self.qkv = conv_nd(1, channels, channels * 3, 1) + if use_new_attention_order: + # split qkv before split heads + self.attention = QKVAttention(self.num_heads) + else: + # split heads before split qkv + self.attention = QKVAttentionLegacy(self.num_heads) + + self.proj_out = zero_module(conv_nd(1, channels, channels, 1)) + + def forward(self, x): + return checkpoint(self._forward, (x, ), self.parameters(), self.use_checkpoint) + + def _forward(self, x): + b, c, *spatial = x.shape + # x = x.reshape(b, c, -1) + x = paddle.reshape(x, [b, c, -1]) + qkv = self.qkv(self.norm(x)) + h = self.attention(qkv) + h = self.proj_out(h) + # return (x + h).reshape(b, c, *spatial) + return paddle.reshape(x + h, [b, c, *spatial]) + + +def count_flops_attn(model, _x, y): + """ + A counter for the `thop` package to count the operations in an + attention operation. + Meant to be used like: + macs, params = thop.profile( + model, + inputs=(inputs, timestamps), + custom_ops={QKVAttention: QKVAttention.count_flops}, + ) + """ + b, c, *spatial = y[0].shape + num_spatial = int(np.prod(spatial)) + # We perform two matmuls with the same number of ops. + # The first computes the weight matrix, the second computes + # the combination of the value vectors. + matmul_ops = 2 * b * (num_spatial**2) * c + model.total_ops += paddle.to_tensor([matmul_ops], dtype='float64') + + +class QKVAttentionLegacy(nn.Layer): + """ + A module which performs QKV attention. Matches legacy QKVAttention + input/ouput heads shaping + """ + + def __init__(self, n_heads): + super().__init__() + self.n_heads = n_heads + + def forward(self, qkv): + """ + Apply QKV attention. + + :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs. + :return: an [N x (H * C) x T] tensor after attention. + """ + bs, width, length = qkv.shape + assert width % (3 * self.n_heads) == 0 + ch = width // (3 * self.n_heads) + # q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1) + q, k, v = paddle.reshape(qkv, [bs * self.n_heads, ch * 3, length]).split(3, axis=1) + scale = 1 / math.sqrt(math.sqrt(ch)) + weight = paddle.einsum("bct,bcs->bts", q * scale, k * scale) # More stable with f16 than dividing afterwards + weight = paddle.cast(nn.functional.softmax(paddle.cast(weight, 'float32'), axis=-1), weight.dtype) + a = paddle.einsum("bts,bcs->bct", weight, v) + # return a.reshape(bs, -1, length) + return paddle.reshape(a, [bs, -1, length]) + + @staticmethod + def count_flops(model, _x, y): + return count_flops_attn(model, _x, y) + + +class QKVAttention(nn.Layer): + """ + A module which performs QKV attention and splits in a different order. + """ + + def __init__(self, n_heads): + super().__init__() + self.n_heads = n_heads + + def forward(self, qkv): + """ + Apply QKV attention. + + :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs. + :return: an [N x (H * C) x T] tensor after attention. 
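+
+        Illustrative shapes: with ``n_heads = H``, an input of shape
+        [N x (3 * H * C) x T] is chunked into q, k, v of shape [N x (H * C) x T],
+        the attention weights have shape [(N * H) x T x T], and the result is
+        reshaped back to [N x (H * C) x T].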
+ """ + bs, width, length = qkv.shape + assert width % (3 * self.n_heads) == 0 + ch = width // (3 * self.n_heads) + q, k, v = qkv.chunk(3, axis=1) + scale = 1 / math.sqrt(math.sqrt(ch)) + weight = paddle.einsum( + "bct,bcs->bts", + (q * scale).view(bs * self.n_heads, ch, length), + (k * scale).view(bs * self.n_heads, ch, length), + ) # More stable with f16 than dividing afterwards + weight = paddle.cast(nn.functional.softmax(paddle.cast(weight, 'float32'), axis=-1), weight.dtype) + a = paddle.einsum("bts,bcs->bct", weight, v.reshape(bs * self.n_heads, ch, length)) + # return a.reshape(bs, -1, length) + return paddle.reshape(a, [bs, -1, length]) + + @staticmethod + def count_flops(model, _x, y): + return count_flops_attn(model, _x, y) + + +class UNetModel(nn.Layer): + """ + The full UNet model with attention and timestep embedding. + + :param in_channels: channels in the input Tensor. + :param model_channels: base channel count for the model. + :param out_channels: channels in the output Tensor. + :param num_res_blocks: number of residual blocks per downsample. + :param attention_resolutions: a collection of downsample rates at which + attention will take place. May be a set, list, or tuple. + For example, if this contains 4, then at 4x downsampling, attention + will be used. + :param dropout: the dropout probability. + :param channel_mult: channel multiplier for each level of the UNet. + :param conv_resample: if True, use learned convolutions for upsampling and + downsampling. + :param dims: determines if the signal is 1D, 2D, or 3D. + :param num_classes: if specified (as an int), then this model will be + class-conditional with `num_classes` classes. + :param use_checkpoint: use gradient checkpointing to reduce memory usage. + :param num_heads: the number of attention heads in each attention layer. + :param num_heads_channels: if specified, ignore num_heads and instead use + a fixed channel width per attention head. + :param num_heads_upsample: works with num_heads to set a different number + of heads for upsampling. Deprecated. + :param use_scale_shift_norm: use a FiLM-like conditioning mechanism. + :param resblock_updown: use residual blocks for up/downsampling. + :param use_new_attention_order: use a different attention pattern for potentially + increased efficiency. 
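+
+    Illustrative construction (a sketch mirroring the values that
+    ``model_and_diffusion_defaults()`` and ``create_model`` in script_util.py
+    would produce for a 64x64 model):
+
+        model = UNetModel(
+            image_size=64,
+            in_channels=3,
+            model_channels=128,
+            out_channels=3,                  # 6 if learn_sigma
+            num_res_blocks=2,
+            attention_resolutions=(4, 8),    # 64 // 16, 64 // 8
+            channel_mult=(1, 2, 3, 4),
+            num_heads=4,
+            use_scale_shift_norm=True,
+        )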
+ """ + + def __init__( + self, + image_size, + in_channels, + model_channels, + out_channels, + num_res_blocks, + attention_resolutions, + dropout=0, + channel_mult=(1, 2, 4, 8), + conv_resample=True, + dims=2, + num_classes=None, + use_checkpoint=False, + use_fp16=False, + num_heads=1, + num_head_channels=-1, + num_heads_upsample=-1, + use_scale_shift_norm=False, + resblock_updown=False, + use_new_attention_order=False, + ): + super().__init__() + + if num_heads_upsample == -1: + num_heads_upsample = num_heads + + self.image_size = image_size + self.in_channels = in_channels + self.model_channels = model_channels + self.out_channels = out_channels + self.num_res_blocks = num_res_blocks + self.attention_resolutions = attention_resolutions + self.dropout = dropout + self.channel_mult = channel_mult + self.conv_resample = conv_resample + self.num_classes = num_classes + self.use_checkpoint = use_checkpoint + self.dtype = paddle.float16 if use_fp16 else paddle.float32 + self.num_heads = num_heads + self.num_head_channels = num_head_channels + self.num_heads_upsample = num_heads_upsample + + time_embed_dim = model_channels * 4 + self.time_embed = nn.Sequential( + linear(model_channels, time_embed_dim), + SiLU(), + linear(time_embed_dim, time_embed_dim), + ) + + if self.num_classes is not None: + self.label_emb = nn.Embedding(num_classes, time_embed_dim) + + ch = input_ch = int(channel_mult[0] * model_channels) + self.input_blocks = nn.LayerList([TimestepEmbedSequential(conv_nd(dims, in_channels, ch, 3, padding=1))]) + self._feature_size = ch + input_block_chans = [ch] + ds = 1 + for level, mult in enumerate(channel_mult): + for _ in range(num_res_blocks): + layers = [ + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=int(mult * model_channels), + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ) + ] + ch = int(mult * model_channels) + if ds in attention_resolutions: + layers.append( + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=num_head_channels, + use_new_attention_order=use_new_attention_order, + )) + self.input_blocks.append(TimestepEmbedSequential(*layers)) + self._feature_size += ch + input_block_chans.append(ch) + if level != len(channel_mult) - 1: + out_ch = ch + self.input_blocks.append( + TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=out_ch, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + down=True, + ) if resblock_updown else Downsample(ch, conv_resample, dims=dims, out_channels=out_ch))) + ch = out_ch + input_block_chans.append(ch) + ds *= 2 + self._feature_size += ch + + self.middle_block = TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=num_head_channels, + use_new_attention_order=use_new_attention_order, + ), + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + ) + self._feature_size += ch + + self.output_blocks = nn.LayerList([]) + for level, mult in list(enumerate(channel_mult))[::-1]: + for i in range(num_res_blocks + 1): + ich = input_block_chans.pop() + layers = [ + ResBlock( + ch + ich, + time_embed_dim, + dropout, + out_channels=int(model_channels * mult), + dims=dims, + 
use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ) + ] + ch = int(model_channels * mult) + if ds in attention_resolutions: + layers.append( + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads_upsample, + num_head_channels=num_head_channels, + use_new_attention_order=use_new_attention_order, + )) + if level and i == num_res_blocks: + out_ch = ch + layers.append( + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=out_ch, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + up=True, + ) if resblock_updown else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch)) + ds //= 2 + self.output_blocks.append(TimestepEmbedSequential(*layers)) + self._feature_size += ch + + self.out = nn.Sequential( + normalization(ch), + SiLU(), + zero_module(conv_nd(dims, input_ch, out_channels, 3, padding=1)), + ) + + def forward(self, x, timesteps, y=None): + """ + Apply the model to an input batch. + + :param x: an [N x C x ...] Tensor of inputs. + :param timesteps: a 1-D batch of timesteps. + :param y: an [N] Tensor of labels, if class-conditional. + :return: an [N x C x ...] Tensor of outputs. + """ + assert (y is not None) == (self.num_classes + is not None), "must specify y if and only if the model is class-conditional" + + hs = [] + emb = self.time_embed(timestep_embedding(timesteps, self.model_channels)) + if self.num_classes is not None: + assert y.shape == (x.shape[0], ) + emb = emb + self.label_emb(y) + + h = paddle.cast(x, self.dtype) + for module in self.input_blocks: + h = module(h, emb) + hs.append(h) + h = self.middle_block(h, emb) + for module in self.output_blocks: + h = paddle.concat([h, hs.pop()], axis=1) + h = module(h, emb) + # h = paddle.cast(h, x.dtype) + return self.out(h) + + +class SuperResModel(UNetModel): + """ + A UNetModel that performs super-resolution. + + Expects an extra kwarg `low_res` to condition on a low-resolution image. + """ + + def __init__(self, image_size, in_channels, *args, **kwargs): + super().__init__(image_size, in_channels * 2, *args, **kwargs) + + def forward(self, x, timesteps, low_res=None, **kwargs): + _, _, new_height, new_width = x.shape + upsampled = F.interpolate(low_res, (new_height, new_width), mode="bilinear") + x = paddle.concat([x, upsampled], axis=1) + return super().forward(x, timesteps, **kwargs) + + +class EncoderUNetModel(nn.Layer): + """ + The half UNet model with attention and timestep embedding. + + For usage, see UNet. 
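+
+    :param pool: how spatial features are reduced to the output vector:
+        "adaptive" (global average pooling + 1x1 conv), "attention"
+        (AttentionPool2d), or "spatial"/"spatial_v2" (per-resolution mean
+        features concatenated and passed through a small MLP head).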
+ """ + + def __init__( + self, + image_size, + in_channels, + model_channels, + out_channels, + num_res_blocks, + attention_resolutions, + dropout=0, + channel_mult=(1, 2, 4, 8), + conv_resample=True, + dims=2, + use_checkpoint=False, + use_fp16=False, + num_heads=1, + num_head_channels=-1, + num_heads_upsample=-1, + use_scale_shift_norm=False, + resblock_updown=False, + use_new_attention_order=False, + pool="adaptive", + ): + super().__init__() + + if num_heads_upsample == -1: + num_heads_upsample = num_heads + + self.in_channels = in_channels + self.model_channels = model_channels + self.out_channels = out_channels + self.num_res_blocks = num_res_blocks + self.attention_resolutions = attention_resolutions + self.dropout = dropout + self.channel_mult = channel_mult + self.conv_resample = conv_resample + self.use_checkpoint = use_checkpoint + self.dtype = paddle.float16 if use_fp16 else paddle.float32 + self.num_heads = num_heads + self.num_head_channels = num_head_channels + self.num_heads_upsample = num_heads_upsample + + time_embed_dim = model_channels * 4 + self.time_embed = nn.Sequential( + linear(model_channels, time_embed_dim), + SiLU(), + linear(time_embed_dim, time_embed_dim), + ) + + ch = int(channel_mult[0] * model_channels) + self.input_blocks = nn.LayerList([TimestepEmbedSequential(conv_nd(dims, in_channels, ch, 3, padding=1))]) + self._feature_size = ch + input_block_chans = [ch] + ds = 1 + for level, mult in enumerate(channel_mult): + for _ in range(num_res_blocks): + layers = [ + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=int(mult * model_channels), + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ) + ] + ch = int(mult * model_channels) + if ds in attention_resolutions: + layers.append( + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=num_head_channels, + use_new_attention_order=use_new_attention_order, + )) + self.input_blocks.append(TimestepEmbedSequential(*layers)) + self._feature_size += ch + input_block_chans.append(ch) + if level != len(channel_mult) - 1: + out_ch = ch + self.input_blocks.append( + TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=out_ch, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + down=True, + ) if resblock_updown else Downsample(ch, conv_resample, dims=dims, out_channels=out_ch))) + ch = out_ch + input_block_chans.append(ch) + ds *= 2 + self._feature_size += ch + + self.middle_block = TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=num_head_channels, + use_new_attention_order=use_new_attention_order, + ), + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + ) + self._feature_size += ch + self.pool = pool + if pool == "adaptive": + self.out = nn.Sequential( + normalization(ch), + SiLU(), + nn.AdaptiveAvgPool2D((1, 1)), + zero_module(conv_nd(dims, ch, out_channels, 1)), + nn.Flatten(), + ) + elif pool == "attention": + assert num_head_channels != -1 + self.out = nn.Sequential( + normalization(ch), + SiLU(), + AttentionPool2d((image_size // ds), ch, num_head_channels, out_channels), + ) + elif pool == "spatial": + self.out = nn.Sequential( + 
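+                # 'spatial' pooling: forward() collects the spatial mean of every
+                # input block and of the middle block, concatenates them
+                # (self._feature_size values in total) and maps them to out_channels.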
nn.Linear(self._feature_size, 2048), + nn.ReLU(), + nn.Linear(2048, self.out_channels), + ) + elif pool == "spatial_v2": + self.out = nn.Sequential( + nn.Linear(self._feature_size, 2048), + normalization(2048), + SiLU(), + nn.Linear(2048, self.out_channels), + ) + else: + raise NotImplementedError(f"Unexpected {pool} pooling") + + def forward(self, x, timesteps): + """ + Apply the model to an input batch. + + :param x: an [N x C x ...] Tensor of inputs. + :param timesteps: a 1-D batch of timesteps. + :return: an [N x K] Tensor of outputs. + """ + emb = self.time_embed(timestep_embedding(timesteps, self.model_channels)) + + results = [] + # h = x.type(self.dtype) + h = paddle.cast(x, self.dtype) + for module in self.input_blocks: + h = module(h, emb) + if self.pool.startswith("spatial"): + # results.append(h.type(x.dtype).mean(axis=(2, 3))) + results.append(paddle.cast(h, x.dtype).mean(axis=(2, 3))) + h = self.middle_block(h, emb) + if self.pool.startswith("spatial"): + results.append(paddle.cast(h, x.dtype).mean(axis=(2, 3))) + h = paddle.concat(results, axis=-1) + return self.out(h) + else: + # h = h.type(x.dtype) + h = paddle.cast(h, x.dtype) + return self.out(h) diff --git a/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/resources/default.yml b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/resources/default.yml new file mode 100755 index 00000000..97c3c1b9 --- /dev/null +++ b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/resources/default.yml @@ -0,0 +1,47 @@ +text_prompts: + - A beautiful painting of a singular lighthouse, shining its light across a tumultuous sea of blood by greg rutkowski and thomas kinkade, Trending on artstation. + +init_image: + +width_height: [ 1280, 768] + +skip_steps: 10 +steps: 250 + +cut_ic_pow: 1 +init_scale: 1000 +clip_guidance_scale: 5000 + +tv_scale: 0 +range_scale: 150 +sat_scale: 0 +cutn_batches: 4 + +diffusion_model: 512x512_diffusion_uncond_finetune_008100 +use_secondary_model: True +diffusion_sampling_mode: ddim + +perlin_init: False +perlin_mode: mixed +seed: 445467575 +eta: 0.8 +clamp_grad: True +clamp_max: 0.05 + +randomize_class: True +clip_denoised: False +fuzzy_prompt: False +rand_mag: 0.05 + +cut_overview: "[12]*400+[4]*600" +cut_innercut: "[4]*400+[12]*600" +cut_icgray_p: "[0.2]*400+[0]*600" + +display_rate: 10 +n_batches: 1 +batch_size: 1 +batch_name: '' +clip_models: + - VIT + - RN50 + - RN101 diff --git a/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/resources/docstrings.yml b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/resources/docstrings.yml new file mode 100755 index 00000000..702015e1 --- /dev/null +++ b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/resources/docstrings.yml @@ -0,0 +1,103 @@ +text_prompts: | + Phrase, sentence, or string of words and phrases describing what the image should look like. The words will be analyzed by the AI and will guide the diffusion process toward the image(s) you describe. These can include commas and weights to adjust the relative importance of each element. E.g. "A beautiful painting of a singular lighthouse, shining its light across a tumultuous sea of blood by greg rutkowski and thomas kinkade, Trending on artstation." + Notice that this prompt loosely follows a structure: [subject], [prepositional details], [setting], [meta modifiers and artist]; this is a good starting point for your experiments. 
+ Developing text prompts takes practice and experience, and is not the subject of this guide. If you are a beginner to writing text prompts, a good place to start is on a simple AI art app like Night Cafe, starry ai or WOMBO prior to using DD, to get a feel for how text gets translated into images by GAN tools. These other apps use different technologies, but many of the same principles apply. +init_image: | + Recall that in the image sequence above, the first image shown is just noise. If an init_image is provided, diffusion will replace the noise with the init_image as its starting state. To use an init_image, upload the image to the Colab instance or your Google Drive, and enter the full image path here. + If using an init_image, you may need to increase skip_steps to ~ 50% of total steps to retain the character of the init. See skip_steps above for further discussion. +width_height: | + Desired final image size, in pixels. You can have a square, wide, or tall image, but each edge length should be set to a multiple of 64px, and a minimum of 512px on the default CLIP model setting. If you forget to use multiples of 64px in your dimensions, DD will adjust the dimensions of your image to make it so. + +skip_steps: | + Consider the chart shown here. Noise scheduling (denoise strength) starts very high and progressively gets lower and lower as diffusion steps progress. The noise levels in the first few steps are very high, so images change dramatically in early steps. + As DD moves along the curve, noise levels (and thus the amount an image changes per step) declines, and image coherence from one step to the next increases. + The first few steps of denoising are often so dramatic that some steps (maybe 10-15% of total) can be skipped without affecting the final image. You can experiment with this as a way to cut render times. + If you skip too many steps, however, the remaining noise may not be high enough to generate new content, and thus may not have ‘time left’ to finish an image satisfactorily. + Also, depending on your other settings, you may need to skip steps to prevent CLIP from overshooting your goal, resulting in ‘blown out’ colors (hyper saturated, solid white, or solid black regions) or otherwise poor image quality. Consider that the denoising process is at its strongest in the early steps, so skipping steps can sometimes mitigate other problems. + Lastly, if using an init_image, you will need to skip ~50% of the diffusion steps to retain the shapes in the original init image. + However, if you’re using an init_image, you can also adjust skip_steps up or down for creative reasons. With low skip_steps you can get a result "inspired by" the init_image which will retain the colors and rough layout and shapes but look quite different. With high skip_steps you can preserve most of the init_image contents and just do fine tuning of the texture. + +steps: | + When creating an image, the denoising curve is subdivided into steps for processing. Each step (or iteration) involves the AI looking at subsets of the image called ‘cuts’ and calculating the ‘direction’ the image should be guided to be more like the prompt. Then it adjusts the image with the help of the diffusion denoiser, and moves to the next step. + Increasing steps will provide more opportunities for the AI to adjust the image, and each adjustment will be smaller, and thus will yield a more precise, detailed image. Increasing steps comes at the expense of longer render times. 
Also, while increasing steps should generally increase image quality, there is a diminishing return on additional steps beyond 250 - 500 steps. However, some intricate images can take 1000, 2000, or more steps. It is really up to the user. + Just know that the render time is directly related to the number of steps, and many other parameters have a major impact on image quality, without costing additional time. + +cut_ic_pow: | + This sets the size of the border used for inner cuts. High cut_ic_pow values have larger borders, and therefore the cuts themselves will be smaller and provide finer details. If you have too many or too-small inner cuts, you may lose overall image coherency and/or it may cause an undesirable ‘mosaic’ effect. Low cut_ic_pow values will allow the inner cuts to be larger, helping image coherency while still helping with some details. + +init_scale: | + This controls how strongly CLIP will try to match the init_image provided. This is balanced against the clip_guidance_scale (CGS) above. Too much init scale, and the image won’t change much during diffusion. Too much CGS and the init image will be lost. +clip_guidance_scale: | + CGS is one of the most important parameters you will use. It tells DD how strongly you want CLIP to move toward your prompt each timestep. Higher is generally better, but if CGS is too strong it will overshoot the goal and distort the image. So a happy medium is needed, and it takes experience to learn how to adjust CGS. + Note that this parameter generally scales with image dimensions. In other words, if you increase your total dimensions by 50% (e.g. a change from 512 x 512 to 512 x 768), then to maintain the same effect on the image, you’d want to increase clip_guidance_scale from 5000 to 7500. + Of the basic settings, clip_guidance_scale, steps and skip_steps are the most important contributors to image quality, so learn them well. +tv_scale: | + Total variance denoising. Optional, set to zero to turn off. Controls ‘smoothness’ of final output. If used, tv_scale will try to smooth out your final image to reduce overall noise. If your image is too ‘crunchy’, increase tv_scale. TV denoising is good at preserving edges while smoothing away noise in flat regions. See https://en.wikipedia.org/wiki/Total_variation_denoising +range_scale: | + Optional, set to zero to turn off. Used for adjustment of color contrast. Lower range_scale will increase contrast. Very low numbers create a reduced color palette, resulting in more vibrant or poster-like images. Higher range_scale will reduce contrast, for more muted images. + +sat_scale: | + Saturation scale. Optional, set to zero to turn off. If used, sat_scale will help mitigate oversaturation. If your image is too saturated, increase sat_scale to reduce the saturation. +cutn_batches: | + Each iteration, the AI cuts the image into smaller pieces known as cuts, and compares each cut to the prompt to decide how to guide the next diffusion step. More cuts can generally lead to better images, since DD has more chances to fine-tune the image precision in each timestep. + Additional cuts are memory intensive, however, and if DD tries to evaluate too many cuts at once, it can run out of memory. You can use cutn_batches to increase cuts per timestep without increasing memory usage. + At the default settings, DD is scheduled to do 16 cuts per timestep. If cutn_batches is set to 1, there will indeed only be 16 cuts total per timestep. 
+ However, if cutn_batches is increased to 4, DD will do 64 cuts total in each timestep, divided into 4 sequential batches of 16 cuts each. Because the cuts are being evaluated only 16 at a time, DD uses the memory required for only 16 cuts, but gives you the quality benefit of 64 cuts. The tradeoff, of course, is that this will take ~4 times as long to render each image. + So, (scheduled cuts) x (cutn_batches) = (total cuts per timestep). Increasing cutn_batches will increase render times, however, as the work is being done sequentially. DD’s default cut schedule is a good place to start, but the cut schedule can be adjusted in the Cutn Scheduling section, explained below. + +diffusion_model: Diffusion_model of choice. + +use_secondary_model: | + Option to use a secondary purpose-made diffusion model to clean up interim diffusion images for CLIP evaluation. If this option is turned off, DD will use the regular (large) diffusion model. Using the secondary model is faster - one user reported a 50% improvement in render speed! However, the secondary model is much smaller, and may reduce image quality and detail. I suggest you experiment with this. + +diffusion_sampling_mode: | + Two alternate diffusion denoising algorithms. ddim has been around longer, and is more established and tested. plms is a newly added alternate method that promises good diffusion results in fewer steps, but has not been as fully tested and may have side effects. This new plms mode is actively being researched in the #settings-and-techniques channel in the DD Discord. + +perlin_init: | + Normally, DD will use an image filled with random noise as a starting point for the diffusion curve. If perlin_init is selected, DD will instead use a Perlin noise model as an initial state. Perlin has very interesting characteristics, distinct from random noise, so it’s worth experimenting with this for your projects. Beyond perlin, you can, of course, generate your own noise images (such as with GIMP, etc) and use them as an init_image (without skipping steps). + Choosing perlin_init does not affect the actual diffusion process, just the starting point for the diffusion. Please note that selecting a perlin_init will replace and override any init_image you may have specified. Further, because the 2D, 3D and video animation systems all rely on the init_image system, if you enable Perlin while using animation modes, the perlin_init will jump in front of any previous image or video input, and DD will NOT give you the expected sequence of coherent images. All of that said, using Perlin and animation modes together do make a very colorful rainbow effect, which can be used creatively. + +perlin_mode: | + sets type of Perlin noise: colored, gray, or a mix of both, giving you additional options for noise types. Experiment to see what these do in your projects. +seed: | + Deep in the diffusion code, there is a random number ‘seed’ which is used as the basis for determining the initial state of the diffusion. By default, this is random, but you can also specify your own seed. This is useful if you like a particular result and would like to run more iterations that will be similar. + After each run, the actual seed value used will be reported in the parameters report, and can be reused if desired by entering seed # here. If a specific numerical seed is used repeatedly, the resulting images will be quite similar but not identical. 
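+# Note: runner.py seeds numpy, Python's random module and paddle together
+# (np.random.seed / random.seed / paddle.seed), so reusing a reported seed
+# restores the same RNG state at the start of a run.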
+eta: |
+  eta (greek letter η) is a diffusion model variable that mixes in a random amount of scaled noise into each timestep. 0 is no noise, 1.0 is more noise. As with most DD parameters, you can go below zero for eta, but it may give you unpredictable results.
+  The steps parameter has a close relationship with the eta parameter. If you set eta to 0, then you can get decent output with only 50-75 steps. Setting eta to 1.0 favors higher step counts, ideally around 250 and up. eta has a subtle, unpredictable effect on the image, so you’ll need to experiment to see how this affects your projects.
+clamp_grad: |
+  clamp_grad is an internal limiter that stops DD from producing extreme results. Try your images with and without clamp_grad. If the image changes drastically with clamp_grad turned off, it probably means your clip_guidance_scale is too high and should be reduced.
+clamp_max: |
+  Sets the value of the clamp_grad limitation. Default is 0.05, providing for smoother, more muted coloration in images, but setting higher values (0.15-0.3) can provide interesting contrast and vibrancy.
+
+randomize_class: |
+  When a class-conditional diffusion model is used, re-samples the ImageNet class at each timestep, which encourages more varied results.
+clip_denoised: |
+  Whether the sampler clips the model's denoised prediction to the valid image range at each step; passed straight through to the diffusion sampling loop.
+fuzzy_prompt: |
+  Controls whether to add multiple noisy prompts to the prompt losses. If True, can increase variability of image output. Experiment with this.
+rand_mag: |
+  Affects only the fuzzy_prompt. Controls the magnitude of the random noise added by fuzzy_prompt.
+
+cut_overview: The schedule of overview cuts
+cut_innercut: The schedule of inner cuts
+cut_icgray_p: The schedule for the portion of inner cuts that are rendered in grayscale. Evaluating some cuts in grayscale can help DD focus on shape and structure rather than color.
+
+display_rate: |
+  During a diffusion run, you can monitor the progress of each image being created with this variable. If display_rate is set to 50, DD will show you the in-progress image every 50 timesteps. Setting this to a lower value, like 5 or 10, is a good way to get an early peek at where your image is heading. If you don’t like the progression, just interrupt execution, change some settings, and re-run. If you are planning a long, unmonitored batch, it’s better to set display_rate equal to steps, because displaying interim images does slow Colab down slightly.
+n_batches: |
+  This variable sets the number of still images you want DD to create. If you are using an animation mode (see below for details) DD will ignore n_batches and create a single set of animated frames based on the animation settings.
+batch_name: |
+  The name of the batch; the batch id will be named as "discoart-[batch_name]-seed". To avoid your artworks being overwritten by other users, please use a unique name.
+clip_models: |
+  CLIP Model selectors. ViT-B/32, ViT-B/16, ViT-L/14, RN101, RN50, RN50x4, RN50x16, RN50x64.
+  These various CLIP models are available for you to use during image generation. Models have different styles or ‘flavors,’ so look around.
+  You can mix in multiple models as well for different results. However, keep in mind that some models are extremely memory-hungry, and turning on additional models will take additional memory and may cause a crash.
+ The rough order of speed/mem usage is (smallest/fastest to largest/slowest): + ViT-B/32 + RN50 + RN101 + ViT-B/16 + RN50x4 + RN50x16 + RN50x64 + ViT-L/14 + For RN50x64 & ViTL14 you may need to use fewer cuts, depending on your VRAM. diff --git a/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/runner.py b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/runner.py new file mode 100755 index 00000000..b1e155b0 --- /dev/null +++ b/modules/image/text_to_image/disco_diffusion_clip_vitb32/reverse_diffusion/runner.py @@ -0,0 +1,285 @@ +''' +This code is rewritten by Paddle based on Jina-ai/discoart. +https://github.com/jina-ai/discoart/blob/main/discoart/runner.py +''' +import gc +import os +import random +from threading import Thread + +import disco_diffusion_clip_vitb32.clip.clip as clip +import numpy as np +import paddle +import paddle.vision.transforms as T +import paddle_lpips as lpips +from docarray import Document +from docarray import DocumentArray +from IPython import display +from ipywidgets import Output +from PIL import Image + +from .helper import logger +from .helper import parse_prompt +from .model.losses import range_loss +from .model.losses import spherical_dist_loss +from .model.losses import tv_loss +from .model.make_cutouts import MakeCutoutsDango +from .model.sec_diff import alpha_sigma_to_t +from .model.sec_diff import SecondaryDiffusionImageNet2 +from .model.transforms import Normalize + + +def do_run(args, models) -> 'DocumentArray': + logger.info('preparing models...') + model, diffusion, clip_models, secondary_model = models + normalize = Normalize( + mean=[0.48145466, 0.4578275, 0.40821073], + std=[0.26862954, 0.26130258, 0.27577711], + ) + lpips_model = lpips.LPIPS(net='vgg') + for parameter in lpips_model.parameters(): + parameter.stop_gradient = True + side_x = (args.width_height[0] // 64) * 64 + side_y = (args.width_height[1] // 64) * 64 + cut_overview = eval(args.cut_overview) + cut_innercut = eval(args.cut_innercut) + cut_icgray_p = eval(args.cut_icgray_p) + + from .model.perlin_noises import create_perlin_noise, regen_perlin + + seed = args.seed + + skip_steps = args.skip_steps + + loss_values = [] + + if seed is not None: + np.random.seed(seed) + random.seed(seed) + paddle.seed(seed) + + model_stats = [] + for clip_model in clip_models: + model_stat = { + 'clip_model': None, + 'target_embeds': [], + 'make_cutouts': None, + 'weights': [], + } + model_stat['clip_model'] = clip_model + + if isinstance(args.text_prompts, str): + args.text_prompts = [args.text_prompts] + + for prompt in args.text_prompts: + txt, weight = parse_prompt(prompt) + txt = clip_model.encode_text(clip.tokenize(prompt)) + if args.fuzzy_prompt: + for i in range(25): + model_stat['target_embeds'].append((txt + paddle.randn(txt.shape) * args.rand_mag).clip(0, 1)) + model_stat['weights'].append(weight) + else: + model_stat['target_embeds'].append(txt) + model_stat['weights'].append(weight) + + model_stat['target_embeds'] = paddle.concat(model_stat['target_embeds']) + model_stat['weights'] = paddle.to_tensor(model_stat['weights']) + if model_stat['weights'].sum().abs() < 1e-3: + raise RuntimeError('The weights must not sum to 0.') + model_stat['weights'] /= model_stat['weights'].sum().abs() + model_stats.append(model_stat) + + init = None + if args.init_image: + d = Document(uri=args.init_image).load_uri_to_image_tensor(side_x, side_y) + init = T.to_tensor(d.tensor).unsqueeze(0) * 2 - 1 + + if args.perlin_init: + if args.perlin_mode == 'color': + 
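+            # Perlin init: each mode builds two noise layers (a coarse and a finer one)
+            # that are averaged below into a single init image; 'color' keeps both in
+            # color, 'gray' uses grayscale for both, and 'mixed' combines one of each.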
init = create_perlin_noise([1.5**-i * 0.5 for i in range(12)], 1, 1, False, side_y, side_x) + init2 = create_perlin_noise([1.5**-i * 0.5 for i in range(8)], 4, 4, False, side_y, side_x) + elif args.perlin_mode == 'gray': + init = create_perlin_noise([1.5**-i * 0.5 for i in range(12)], 1, 1, True, side_y, side_x) + init2 = create_perlin_noise([1.5**-i * 0.5 for i in range(8)], 4, 4, True, side_y, side_x) + else: + init = create_perlin_noise([1.5**-i * 0.5 for i in range(12)], 1, 1, False, side_y, side_x) + init2 = create_perlin_noise([1.5**-i * 0.5 for i in range(8)], 4, 4, True, side_y, side_x) + init = (T.to_tensor(init).add(T.to_tensor(init2)).divide(paddle.to_tensor(2.0)).unsqueeze(0) * 2 - 1) + del init2 + + cur_t = None + + def cond_fn(x, t, y=None): + x_is_NaN = False + n = x.shape[0] + if secondary_model: + alpha = paddle.to_tensor(diffusion.sqrt_alphas_cumprod[cur_t], dtype='float32') + sigma = paddle.to_tensor(diffusion.sqrt_one_minus_alphas_cumprod[cur_t], dtype='float32') + cosine_t = alpha_sigma_to_t(alpha, sigma) + x = paddle.to_tensor(x.detach(), dtype='float32') + x.stop_gradient = False + cosine_t = paddle.tile(paddle.to_tensor(cosine_t.detach().cpu().numpy()), [n]) + cosine_t.stop_gradient = False + out = secondary_model(x, cosine_t).pred + fac = diffusion.sqrt_one_minus_alphas_cumprod[cur_t] + x_in_d = out * fac + x * (1 - fac) + x_in = x_in_d.detach() + x_in.stop_gradient = False + x_in_grad = paddle.zeros_like(x_in, dtype='float32') + else: + t = paddle.ones([n], dtype='int64') * cur_t + out = diffusion.p_mean_variance(model, x, t, clip_denoised=False, model_kwargs={'y': y}) + fac = diffusion.sqrt_one_minus_alphas_cumprod[cur_t] + x_in_d = out['pred_xstart'] * fac + x * (1 - fac) + x_in = x_in_d.detach() + x_in.stop_gradient = False + x_in_grad = paddle.zeros_like(x_in, dtype='float32') + for model_stat in model_stats: + for i in range(args.cutn_batches): + t_int = (int(t.item()) + 1) # errors on last step without +1, need to find source + # when using SLIP Base model the dimensions need to be hard coded to avoid AttributeError: 'VisionTransformer' object has no attribute 'input_resolution' + try: + input_resolution = model_stat['clip_model'].visual.input_resolution + except: + input_resolution = 224 + + cuts = MakeCutoutsDango( + input_resolution, + Overview=cut_overview[1000 - t_int], + InnerCrop=cut_innercut[1000 - t_int], + IC_Size_Pow=args.cut_ic_pow, + IC_Grey_P=cut_icgray_p[1000 - t_int], + ) + clip_in = normalize(cuts(x_in.add(paddle.to_tensor(1.0)).divide(paddle.to_tensor(2.0)))) + image_embeds = (model_stat['clip_model'].encode_image(clip_in)) + + dists = spherical_dist_loss( + image_embeds.unsqueeze(1), + model_stat['target_embeds'].unsqueeze(0), + ) + + dists = dists.reshape([ + cut_overview[1000 - t_int] + cut_innercut[1000 - t_int], + n, + -1, + ]) + losses = dists.multiply(model_stat['weights']).sum(2).mean(0) + loss_values.append(losses.sum().item()) # log loss, probably shouldn't do per cutn_batch + + x_in_grad += (paddle.grad(losses.sum() * args.clip_guidance_scale, x_in)[0] / args.cutn_batches) + tv_losses = tv_loss(x_in) + range_losses = range_loss(x_in) + sat_losses = paddle.abs(x_in - x_in.clip(min=-1, max=1)).mean() + loss = (tv_losses.sum() * args.tv_scale + range_losses.sum() * args.range_scale + + sat_losses.sum() * args.sat_scale) + if init is not None and args.init_scale: + init_losses = lpips_model(x_in, init) + loss = loss + init_losses.sum() * args.init_scale + x_in_grad += paddle.grad(loss, x_in)[0] + if not paddle.isnan(x_in_grad).any(): 
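+            # Pull the accumulated guidance gradient back from x_in to x through the
+            # blended prediction x_in_d (a vector-Jacobian product); the sign is
+            # flipped so the sampler steps toward lower guidance loss.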
+ grad = -paddle.grad(x_in_d, x, x_in_grad)[0] + else: + x_is_NaN = True + grad = paddle.zeros_like(x) + if args.clamp_grad and not x_is_NaN: + magnitude = grad.square().mean().sqrt() + return (grad * magnitude.clip(max=args.clamp_max) / magnitude) + return grad + + if args.diffusion_sampling_mode == 'ddim': + sample_fn = diffusion.ddim_sample_loop_progressive + else: + sample_fn = diffusion.plms_sample_loop_progressive + + logger.info('creating artwork...') + + image_display = Output() + da_batches = DocumentArray() + + for _nb in range(args.n_batches): + display.clear_output(wait=True) + display.display(args.name_docarray, image_display) + gc.collect() + paddle.device.cuda.empty_cache() + + d = Document(tags=vars(args)) + da_batches.append(d) + + cur_t = diffusion.num_timesteps - skip_steps - 1 + + if args.perlin_init: + init = regen_perlin(args.perlin_mode, side_y, side_x, args.batch_size) + + if args.diffusion_sampling_mode == 'ddim': + samples = sample_fn( + model, + (args.batch_size, 3, side_y, side_x), + clip_denoised=args.clip_denoised, + model_kwargs={}, + cond_fn=cond_fn, + progress=True, + skip_timesteps=skip_steps, + init_image=init, + randomize_class=args.randomize_class, + eta=args.eta, + ) + else: + samples = sample_fn( + model, + (args.batch_size, 3, side_y, side_x), + clip_denoised=args.clip_denoised, + model_kwargs={}, + cond_fn=cond_fn, + progress=True, + skip_timesteps=skip_steps, + init_image=init, + randomize_class=args.randomize_class, + order=2, + ) + + threads = [] + for j, sample in enumerate(samples): + cur_t -= 1 + with image_display: + if j % args.display_rate == 0 or cur_t == -1: + for _, image in enumerate(sample['pred_xstart']): + image = (image + 1) / 2 + image = image.clip(0, 1).squeeze().transpose([1, 2, 0]).numpy() * 255 + image = np.uint8(image) + image = Image.fromarray(image) + + image.save(os.path.join(args.output_dir, 'progress-{}.png'.format(_nb))) + c = Document(tags={'cur_t': cur_t}) + c.load_pil_image_to_datauri(image) + d.chunks.append(c) + display.clear_output(wait=True) + display.display(display.Image(os.path.join(args.output_dir, 'progress-{}.png'.format(_nb)))) + d.chunks.plot_image_sprites(os.path.join(args.output_dir, + f'{args.name_docarray}-progress-{_nb}.png'), + show_index=True) + t = Thread( + target=_silent_push, + args=( + da_batches, + args.name_docarray, + ), + ) + threads.append(t) + t.start() + + if cur_t == -1: + d.load_pil_image_to_datauri(image) + + for t in threads: + t.join() + display.clear_output(wait=True) + logger.info(f'done! {args.name_docarray}') + da_batches.plot_image_sprites(skip_empty=True, show_index=True, keep_aspect_ratio=True) + return da_batches + + +def _silent_push(da_batches: DocumentArray, name: str) -> None: + try: + da_batches.push(name) + except Exception as ex: + logger.debug(f'push failed: {ex}') -- GitLab
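Side note on the cut scheduling documented in docstrings.yml above: runner.py evaluates the cut_overview / cut_innercut / cut_icgray_p strings into 1000-entry lists and indexes them by the current timestep (cut_overview[1000 - t_int] in cond_fn), while cutn_batches repeats the scheduled cuts sequentially so the total number of cuts grows without raising peak memory. A minimal, self-contained sketch of that arithmetic (illustrative only, not part of the patch; the schedule strings are the defaults from default.yml, the remaining names are made up for the example):

    # How the default cut schedules are interpreted (sketch, not module code).
    cut_overview = eval("[12]*400+[4]*600")   # 12 overview cuts for the first 400 timesteps, then 4
    cut_innercut = eval("[4]*400+[12]*600")   # 4 inner cuts first, then 12
    assert len(cut_overview) == len(cut_innercut) == 1000

    cutn_batches = 4          # from default.yml
    t = 900                   # timesteps count down from ~1000 during sampling
    scheduled = cut_overview[1000 - t] + cut_innercut[1000 - t]   # 12 + 4 = 16 cuts per batch
    total_per_timestep = scheduled * cutn_batches                 # 64 cuts, evaluated 16 at a time
    print(scheduled, total_per_timestep)                          # -> 16 64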