This post records the problems I ran into while implementing ControlNet with the diffusers library.
1. ControlNet
The ControlNet project itself needs no introduction; here I just explain how to download the weight files from Hugging Face.
HF link: https://huggingface.co/lllyasviel/sd-controlnet-normal
This collection includes models conditioned on depth, normal maps, and other inputs, along with reference code examples. To switch to a different condition model, simply replace the corresponding model name in the repo id; note that its parts are joined with hyphens (-).
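For example, with diffusers the weights can be pulled directly by repo id, and switching the condition only requires changing that suffix (a minimal sketch; the float16 dtype is optional):
from diffusers import ControlNetModel
import torch
# Same naming pattern, different condition: lllyasviel/sd-controlnet-<condition>
controlnet_normal = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-normal", torch_dtype=torch.float16)
controlnet_depth = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-depth", torch_dtype=torch.float16)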
2. diffusers
The diffusers library makes T2I (text-to-image) and I2I (image-to-image) very convenient to use:
pip install diffusers
Note, however, that when using it to implement ControlNet, you may sometimes run into this error:
TypeError: forward() takes 2 positional arguments but 3 were given
I hit it while running a GitHub project that uses the StableDiffusionControlNetPipeline pipe.
Solution:
Install diffusers 0.26.0; different diffusers versions may also require different huggingface_hub versions.
pip install huggingface_hub==0.25.0
pip install diffusers==0.26.0
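For reference, a minimal sketch of how StableDiffusionControlNetPipeline is typically called once the versions above are pinned (the condition-image path and the prompt here are placeholders):
import torch
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
from diffusers.utils import load_image
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-normal", torch_dtype=torch.float16)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
).to("cuda")
control_image = load_image("normal_map.png")  # placeholder path to the condition image (e.g. a normal map)
image = pipe("a man in a hoodie", image=control_image, num_inference_steps=30).images[0]
image.save("controlnet_out.png")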
3. Full ControlNet code
The code below is modified from guidance.py in TeCH, keeping the part that computes the SDS loss.
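The gradient that train_step computes (grad = w * (noise_pred - noise), with w(t) = 1 - self.alphas[t]) is the score distillation sampling (SDS) update from DreamFusion:
\nabla_\theta \mathcal{L}_{\mathrm{SDS}} = \mathbb{E}_{t,\epsilon}\left[ w(t)\,\big(\hat{\epsilon}_\phi(z_t;\, y,\, t) - \epsilon\big)\,\frac{\partial z}{\partial \theta} \right]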
from transformers import CLIPTextModel, CLIPTokenizer, logging,CLIPProcessor,CLIPVisionModel
from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler, DDIMScheduler, ControlNetModel
from diffusers.utils import load_image
# suppress partial model loading warning
logging.set_verbosity_error()
from torchvision import transforms
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as T
import torchvision
import numpy as np
import PIL
from torch.cuda.amp import custom_bwd, custom_fwd
import clip
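# SpecifyGradient: autograd helper for SDS. forward() returns a dummy scalar loss
# and stashes the precomputed gradient; backward() hands that gradient (divided by
# the batch size) straight back to the latents, so loss.backward() applies the SDS update.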
class SpecifyGradient(torch.autograd.Function):
@staticmethod
@custom_fwd
def forward(ctx, input_tensor, gt_grad):
ctx.save_for_backward(gt_grad)
return torch.zeros([1], device=input_tensor.device, dtype=input_tensor.dtype) # dummy loss value
@staticmethod
@custom_bwd
def backward(ctx, grad):
gt_grad, = ctx.saved_tensors
batch_size = len(gt_grad)
return gt_grad / batch_size, None
def seed_everything(seed):
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
#torch.backends.cudnn.deterministic = True
#torch.backends.cudnn.benchmark = True
class StableDiffusion(nn.Module):
def __init__(self, device, sd_version='2.1', hf_key=None, sd_step_range=[0.2, 0.98], controlnet=None, lora=None, cfg=None, head_hf_key=None):
super().__init__()
self.cfg = cfg
self.device = device
self.sd_version = sd_version
print(f'[INFO] loading stable diffusion...')
if hf_key is not None:
print(f'[INFO] using hugging face custom model key: {hf_key}')
model_key = hf_key
elif self.sd_version == '2.1':
model_key = "stabilityai/stable-diffusion-2-1-base"
elif self.sd_version == '2.0':
model_key = "stabilityai/stable-diffusion-2-base"
elif self.sd_version == '1.5':
model_key = "runwayml/stable-diffusion-v1-5"
else:
raise ValueError(f'Stable-diffusion version {self.sd_version} not supported.')
self.clip_model, _ = clip.load("ViT-L/14", device=self.device, jit=False, download_root='clip_ckpts')
self.clip_model = self.clip_model.eval().requires_grad_(False).to(self.device)
self.clip_preprocess = T.Compose([
T.Resize((224, 224)),
T.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
])
# Create model
self.vae = AutoencoderKL.from_pretrained(model_key, subfolder="vae").to(self.device)
self.tokenizer = CLIPTokenizer.from_pretrained(model_key, subfolder="tokenizer")
self.text_encoder = CLIPTextModel.from_pretrained(model_key, subfolder="text_encoder").to(self.device)
self.unet = UNet2DConditionModel.from_pretrained(model_key, subfolder="unet").to(self.device)
self.use_head_model = head_hf_key is not None
if self.use_head_model:
self.tokenizer_head = CLIPTokenizer.from_pretrained(head_hf_key, subfolder="tokenizer")
self.text_encoder_head = CLIPTextModel.from_pretrained(head_hf_key, subfolder="text_encoder").to(self.device)
self.unet_head = UNet2DConditionModel.from_pretrained(head_hf_key, subfolder="unet").to(self.device)
else:
self.tokenizer_head = self.tokenizer
self.text_encoder_head = self.text_encoder
self.unet_head = self.unet
self.scheduler = DDIMScheduler.from_pretrained(model_key, subfolder="scheduler")
self.num_train_timesteps = self.scheduler.config.num_train_timesteps
self.min_step = int(self.num_train_timesteps * sd_step_range[0])
self.max_step = int(self.num_train_timesteps * sd_step_range[1])
self.alphas = self.scheduler.alphas_cumprod.to(self.device) # for convenience
if controlnet is None:
self.controlnet = None
else:
self.controlnet = ControlNetModel.from_pretrained(controlnet).to(self.device)
if lora is not None:
self.unet.load_attn_procs(lora)
self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
self.image_encoder = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32").to(self.device)
print(f'[INFO] loaded stable diffusion!')
def img_clip_loss(self, rgb1, rgb2):
image_z_1 = self.clip_model.encode_image(self.clip_preprocess(rgb1))
image_z_2 = self.clip_model.encode_image(self.clip_preprocess(rgb2))
image_z_1 = image_z_1 / image_z_1.norm(dim=-1, keepdim=True) # normalize features
image_z_2 = image_z_2 / image_z_2.norm(dim=-1, keepdim=True) # normalize features
loss = - (image_z_1 * image_z_2).sum(-1).mean()
return loss
def img_text_clip_loss(self, rgb, prompts):
        image_z_1 = self.clip_model.encode_image(self.clip_preprocess(rgb))
image_z_1 = image_z_1 / image_z_1.norm(dim=-1, keepdim=True) # normalize features
        text = clip.tokenize(prompts).to(self.device)
text_z = self.clip_model.encode_text(text)
text_z = text_z / text_z.norm(dim=-1, keepdim=True)
loss = - (image_z_1 * text_z).sum(-1).mean()
return loss
def get_text_embeds(self, prompt, negative_prompt, is_face=False):
print('text prompt: [positive]', prompt, '[negative]', negative_prompt)
if not is_face:
tokenizer = self.tokenizer
text_encoder = self.text_encoder
else:
tokenizer = self.tokenizer_head
text_encoder = self.text_encoder_head
# prompt, negative_prompt: [str]
# Tokenize text and get embeddings
text_input = tokenizer(prompt, padding='max_length', max_length=tokenizer.model_max_length, truncation=True, return_tensors='pt')
with torch.no_grad():
text_embeddings = text_encoder(text_input.input_ids.to(self.device))[0]
# Do the same for unconditional embeddings
uncond_input = tokenizer(negative_prompt, padding='max_length', max_length=tokenizer.model_max_length, return_tensors='pt')
with torch.no_grad():
uncond_embeddings = text_encoder(uncond_input.input_ids.to(self.device))[0]
# Cat for final embeddings
text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
return text_embeddings
def get_img_embeds(self, images: list):
        # If a single image is passed, wrap it in a list
        if not isinstance(images, list):
            images = [images]
        # Preprocess the images with CLIPProcessor: resize, normalize, etc.
        inputs = self.processor(images=images, return_tensors="pt").to(self.device)
        # Get image embeddings
        with torch.no_grad():
            image_embeddings = self.image_encoder(**inputs).last_hidden_state
        # Unconditional image embeddings: an all-zero tensor of the same shape
        uncond_image_embeddings = torch.zeros_like(image_embeddings)
        # Concatenate unconditional and conditional image embeddings
        final_image_embeddings = torch.cat([uncond_image_embeddings, image_embeddings])
return final_image_embeddings
def train_step(self, text_embeddings, pred_rgb, guidance_scale=100, controlnet_hint=None, controlnet_conditioning_scale=1.0, clip_ref_img=None, is_face=False, **kwargs):
if is_face:
unet = self.unet_head
else:
unet = self.unet
# interp to 512x512 to be fed into vae.
# _t = time.time()
pred_rgb_512 = F.interpolate(pred_rgb, (512, 512), mode='bilinear', align_corners=False)
#pred_rgb_512 = pred_rgb
        if controlnet_hint is not None:
assert self.controlnet is not None
controlnet_hint = self.controlnet_hint_conversion(controlnet_hint, 512, 512)
# torch.cuda.synchronize(); print(f'[TIME] guiding: interp {time.time() - _t:.4f}s')
# timestep ~ U(0.02, 0.98) to avoid very high/low noise level
t = torch.randint(self.min_step, self.max_step + 1, [1], dtype=torch.long, device=self.device)
# encode image into latents with vae, requires grad!
# _t = time.time()
latents = self.encode_imgs(pred_rgb_512)
# torch.cuda.synchronize(); print(f'[TIME] guiding: vae enc {time.time() - _t:.4f}s')
# predict the noise residual with unet, NO grad!
# _t = time.time()
with torch.no_grad():
# add noise
noise = torch.randn_like(latents)
latents_noisy = self.scheduler.add_noise(latents, noise, t)
# pred noise
latent_model_input = torch.cat([latents_noisy] * 2)
if controlnet_hint is not None:
down_block_res_samples, mid_block_res_sample = self.controlnet(
latent_model_input,
t,
encoder_hidden_states=text_embeddings,
controlnet_cond=controlnet_hint,
conditioning_scale=controlnet_conditioning_scale,
return_dict=False
)
noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings,
down_block_additional_residuals=down_block_res_samples,
mid_block_additional_residual=mid_block_res_sample,).sample
else:
noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
# torch.cuda.synchronize(); print(f'[TIME] guiding: unet {time.time() - _t:.4f}s')
# perform guidance (high scale from paper!)
if self.scheduler.config.prediction_type == "v_prediction":
alphas_cumprod = self.scheduler.alphas_cumprod.to(
device=latents_noisy.device, dtype=latents_noisy.dtype
)
alpha_t = alphas_cumprod[t] ** 0.5
sigma_t = (1 - alphas_cumprod[t]) ** 0.5
noise_pred = latent_model_input * torch.cat([sigma_t] * 2, dim=0).view(
-1, 1, 1, 1
) + noise_pred * torch.cat([alpha_t] * 2, dim=0).view(-1, 1, 1, 1)
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
if clip_ref_img is not None and t < self.cfg.clip_step_range * self.num_train_timesteps:
guidance_scale = self.cfg.clip_guidance_scale
noise_pred = noise_pred_text + guidance_scale * (noise_pred_text - noise_pred_uncond)
self.scheduler.set_timesteps(self.num_train_timesteps)
de_latents = self.scheduler.step(noise_pred, t, latents_noisy)['prev_sample']
imgs = self.decode_latents(de_latents)
loss = 0
if self.cfg.lambda_clip_img_loss > 0:
loss = loss + self.img_clip_loss(imgs, clip_ref_img) * self.cfg.lambda_clip_img_loss
if self.cfg.lambda_clip_text_loss > 0:
text = self.cfg.text.replace('sks', '')
loss = loss + self.img_text_clip_loss(imgs, [text]) * self.cfg.lambda_clip_text_loss
else:
noise_pred = noise_pred_text + guidance_scale * (noise_pred_text - noise_pred_uncond)
# w(t), sigma_t^2
w = (1 - self.alphas[t])
# w = self.alphas[t] ** 0.5 * (1 - self.alphas[t])
grad = w * (noise_pred - noise)
# clip grad for stable training?
# grad = grad.clamp(-10, 10)
grad = torch.nan_to_num(grad)
# since we omitted an item in grad, we need to use the custom function to specify the gradient
# _t = time.time()
loss = SpecifyGradient.apply(latents, grad)
# torch.cuda.synchronize(); print(f'[TIME] guiding: backward {time.time() - _t:.4f}s')
return loss
def produce_latents(self, text_embeddings, height=512, width=512, num_inference_steps=50, guidance_scale=7.5, latents=None, controlnet_hint=None):
if latents is None:
latents = torch.randn((text_embeddings.shape[0] // 2, self.unet.in_channels, height // 8, width // 8), device=self.device)
self.scheduler.set_timesteps(num_inference_steps)
with torch.autocast('cuda'):
for i, t in enumerate(self.scheduler.timesteps):
# expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
latent_model_input = torch.cat([latents] * 2)
# predict the noise residual
with torch.no_grad():
if controlnet_hint is not None:
controlnet_hint = self.controlnet_hint_conversion(controlnet_hint, 512, 512)
down_block_res_samples, mid_block_res_sample = self.controlnet(
latent_model_input,
t,
encoder_hidden_states=text_embeddings,
controlnet_cond=controlnet_hint,
conditioning_scale=1,
return_dict=False
)
noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings,
down_block_additional_residuals=down_block_res_samples,
mid_block_additional_residual=mid_block_res_sample,).sample
else:
noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings)['sample']
# perform guidance
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
noise_pred = noise_pred_text + guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
latents = self.scheduler.step(noise_pred, t, latents)['prev_sample']
return latents
def decode_latents(self, latents):
latents = 1 / 0.18215 * latents
with torch.no_grad():
imgs = self.vae.decode(latents).sample
imgs = (imgs / 2 + 0.5).clamp(0, 1)
return imgs
def encode_imgs(self, imgs):
# imgs: [B, 3, H, W]
imgs = 2 * imgs - 1
posterior = self.vae.encode(imgs).latent_dist
latents = posterior.sample() * 0.18215
return latents
def prompt_to_img(self, prompts, negative_prompts='', height=512, width=512, num_inference_steps=50, guidance_scale=7.5, latents=None, controlnet_hint=None,is_text=True):
if isinstance(prompts, str):
prompts = [prompts]
if isinstance(negative_prompts, str):
negative_prompts = [negative_prompts]
if is_text:
# Prompts -> text embeds
text_embeds = self.get_text_embeds(prompts, negative_prompts)
else:
text_embeds = self.get_img_embeds(prompts)
# # Prompts -> text embeds
# text_embeds = self.get_text_embeds(prompts, negative_prompts) # [2, 77, 768]
# Text embeds -> img latents
latents = self.produce_latents(text_embeds, height=height, width=width, latents=latents, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale,controlnet_hint=controlnet_hint) # [1, 4, 64, 64]
# Img latents -> imgs
imgs = self.decode_latents(latents) # [1, 3, 512, 512]
# Img to Numpy
imgs = imgs.detach().cpu().permute(0, 2, 3, 1).numpy()
imgs = (imgs * 255).round().astype('uint8')
return imgs
def controlnet_hint_conversion(self, controlnet_hint, height, width, num_images_per_prompt=1):
channels = 3
if isinstance(controlnet_hint, torch.Tensor):
            # torch.Tensor: acceptable shapes are chw, bchw (b==1) or bchw (b==num_images_per_prompt)
shape_chw = (channels, height, width)
shape_bchw = (1, channels, height, width)
shape_nchw = (num_images_per_prompt, channels, height, width)
if controlnet_hint.shape in [shape_chw, shape_bchw, shape_nchw]:
controlnet_hint = controlnet_hint.to(dtype=self.controlnet.dtype, device=self.controlnet.device)
if controlnet_hint.shape != shape_nchw:
controlnet_hint = controlnet_hint.repeat(num_images_per_prompt, 1, 1, 1)
return controlnet_hint
else:
raise ValueError(
f"Acceptble shape of `controlnet_hint` are any of ({channels}, {height}, {width}),"
+ f" (1, {channels}, {height}, {width}) or ({num_images_per_prompt}, "
+ f"{channels}, {height}, {width}) but is {controlnet_hint.shape}"
)
elif isinstance(controlnet_hint, np.ndarray):
            # np.ndarray: acceptable shape is any of hw, hwc, bhwc (b==1) or bhwc (b==num_images_per_prompt)
# hwc is opencv compatible image format. Color channel must be BGR Format.
if controlnet_hint.shape == (height, width):
controlnet_hint = np.repeat(controlnet_hint[:, :, np.newaxis], channels, axis=2) # hw -> hwc(c==3)
shape_hwc = (height, width, channels)
shape_bhwc = (1, height, width, channels)
shape_nhwc = (num_images_per_prompt, height, width, channels)
if controlnet_hint.shape in [shape_hwc, shape_bhwc, shape_nhwc]:
controlnet_hint = torch.from_numpy(controlnet_hint.copy())
controlnet_hint = controlnet_hint.to(dtype=self.controlnet.dtype, device=self.controlnet.device)
controlnet_hint /= 255.0
if controlnet_hint.shape != shape_nhwc:
controlnet_hint = controlnet_hint.repeat(num_images_per_prompt, 1, 1, 1)
controlnet_hint = controlnet_hint.permute(0, 3, 1, 2) # b h w c -> b c h w
return controlnet_hint
else:
raise ValueError(
f"Acceptble shape of `controlnet_hint` are any of ({width}, {channels}), "
+ f"({height}, {width}, {channels}), "
+ f"(1, {height}, {width}, {channels}) or "
+ f"({num_images_per_prompt}, {channels}, {height}, {width}) but is {controlnet_hint.shape}"
)
elif isinstance(controlnet_hint, PIL.Image.Image):
if controlnet_hint.size == (width, height):
controlnet_hint = controlnet_hint.convert("RGB") # make sure 3 channel RGB format
controlnet_hint = np.array(controlnet_hint) # to numpy
controlnet_hint = controlnet_hint[:, :, ::-1] # RGB -> BGR
return self.controlnet_hint_conversion(controlnet_hint, height, width, num_images_per_prompt)
else:
raise ValueError(
f"Acceptable image size of `controlnet_hint` is ({width}, {height}) but is {controlnet_hint.size}"
)
else:
raise ValueError(
f"Acceptable type of `controlnet_hint` are any of torch.Tensor, np.ndarray, PIL.Image.Image but is {type(controlnet_hint)}"
)
if __name__ == '__main__':
import argparse
import matplotlib.pyplot as plt
# parser = argparse.ArgumentParser()
# parser.add_argument('prompt', type=str)
# parser.add_argument('--negative', default='', type=str)
# parser.add_argument('--sd_version', type=str, default='2.1', choices=['1.5', '2.0', '2.1'], help="stable diffusion version")
# parser.add_argument('--hf_key', type=str, default=None, help="hugging face Stable diffusion model key")
# parser.add_argument('-H', type=int, default=512)
# parser.add_argument('-W', type=int, default=512)
# parser.add_argument('--seed', type=int, default=0)
# parser.add_argument('--steps', type=int, default=50)
# opt = parser.parse_args()
seed_everything(0)
control_img_path = "/home/clothAvatar/normals_06.png"
device = torch.device('cuda')
img = load_image(control_img_path)
transform = transforms.Compose([
        transforms.ToTensor(),          # convert the PIL image to a [C, H, W] tensor
        transforms.Resize((512, 512)),  # resize to 512x512
])
control_image = transform(img).unsqueeze(0)
# controlnet="lllyasviel/sd-controlnet-openpose"
controlnet="lllyasviel/sd-controlnet-normal"
# controlnet = "lllyasviel/sd-controlnet-depth"
# hf_key="/home/TeCH/exp/examples/name/sd_model"
# lora = "/home/add_detail.safetensors"
sd = StableDiffusion(device, '1.5', hf_key=None,controlnet=controlnet)
prompt_ = "a sks man, dark brown short and slightly messy hair, neutral, slightly frowning, sks light grey plain, casual hoodie, sks grey regular fit jeans pants, hands are slightly spread out and relaxed"
prompt_ = prompt_.replace('sks',"")
# prompt_ = "A man with a piece of Brown Sweatshirt."
# prompt_ = PIL.Image.open("/home/color_000001.png").convert("RGB")
# visualize image
imgs = []
for _ in range(4):
img = sd.prompt_to_img(prompt_, "", 512, 512, 50,controlnet_hint=control_image)
# img = sd.prompt_to_img(prompt_, "", 512, 512, 50,is_text=False)
img = img / 255.
imgs.append(torch.from_numpy(img))
print("done one")
imgs = torch.cat(imgs, dim=0)
# save image as a grid
imgs = imgs.permute(0, 3, 1, 2)
img_grid = torchvision.utils.make_grid(imgs, nrow = 5, padding = 10)
torchvision.utils.save_image(img_grid, '/home/img_grid3.png')
    print('Image saved as img_grid3.png')
4. Using a custom diffusion model
Pretrained diffusion models and LoRAs can be downloaded from the internet, and they are usually run through a web UI someone else has written.
Here is a brief note on loading them with diffusers directly; the pipeline class (pipe) must match the one the original model was built for.
import torch
from diffusers import StableDiffusionXLPipeline,AutoencoderKL
from diffusers import EulerDiscreteScheduler
ckpt_path = "/home/ps/dy/downloads/noobaiXLNAIXL_epsilonPred10Version.safetensors"
lora_path = "/home/ps/dy/ahemaru_n.safetensors"
vae = AutoencoderKL.from_pretrained("stabilityai/sdxl-vae").to("cuda")
pipe = StableDiffusionXLPipeline.from_single_file(
ckpt_path,
use_safetensors=True,
torch_dtype=torch.float32,
)
pipe.vae = vae  # bind the VAE to the pipeline
# pipe.unet.load_attn_procs(lora_path,alpha=0.1)
# scheduler_args = {"prediction_type": "epsilon", "rescale_betas_zero_snr": True}
pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
# pipe.enable_xformers_memory_efficient_attention()
pipe = pipe.to("cuda")
prompt = """masterpiece, best quality, chromatic aberration, cute \(theme\), limited palette, high contrast, color contrast, hot colors, while theme, \(medium\)"""
negative_prompt = "nsfw, worst quality, old, early, low quality, lowres, signature, username, logo, bad hands, mutated hands, mammal, anthro, furry, ambiguous form, feral, semi-anthro"
image = pipe(
prompt=prompt,
negative_prompt=negative_prompt,
width=832,
height=1216,
num_inference_steps=28,
guidance_scale=7.5,
generator=torch.Generator().manual_seed(0),
).images[0]
image.save("/home/ps/dy/output.png")
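To also apply the LoRA file defined above (the commented-out load_attn_procs line), newer diffusers releases expose load_lora_weights on the SDXL pipeline; a minimal sketch, to be placed right after building pipe and before calling it (the 0.8 scale is just an example value):
pipe.load_lora_weights(lora_path)
pipe.fuse_lora(lora_scale=0.8)  # optionally bake the LoRA into the weights at the chosen strength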