孙赢利 - Super-Resolution Weekly Report (November 17)

I. Konka PC implementation: 1080p → 4K real-time super-resolution

1. Moved image pre-processing from the CPU to the GPU

2. Moved part of the post-processing from the CPU to the GPU
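
For reference, a minimal sketch (an illustrative helper, not the production path) of doing the BGR → YUV split on the GPU with PyTorch; it mirrors the commented-out experiments in the script below and uses the same coefficients as the CPU-side bgr_to_yuv:

import torch

# BGR -> YUV coefficients, one row per output channel, columns in (B, G, R) order
_BGR2YUV = torch.tensor([
    [0.114,  0.587,  0.299],   # Y
    [0.499, -0.331, -0.169],   # U
    [-0.081, -0.419, 0.499],   # V
], dtype=torch.float32)

def bgr_to_yuv_gpu(frame_bgr, device='cuda'):
    """Hypothetical helper: upload an HxWx3 BGR uint8 frame and split YUV on the GPU."""
    t = torch.from_numpy(frame_bgr).to(device).float()   # (H, W, 3)
    yuv = t @ _BGR2YUV.to(device).T                      # one matmul does the conversion
    yuv[..., 1:] += 128.0                                # offset the U and V planes
    return yuv[..., 0], yuv[..., 1], yuv[..., 2]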

inference_realesrgan_Animal_Video.py
import argparse
import cv2
import glob
import os
from basicsr.archs.rrdbnet_arch import RRDBNet
from basicsr.utils.download_util import load_file_from_url

from realesrgan import RealESRGANer
from realesrgan.archs.srvgg_arch import SRVGGNetCompact
import numpy as np
import time
import torch
# import cupy as cp
import subprocess
import threading

def play_video():
    subprocess.call(["ffplay", "-nodisp", "-i", R"C:\Users\kk\Downloads\Real-ESRGAN-master\inputs\第1集_桃园三结义_1080P.mp4"])
    # subprocess.call(["ffplay", "-i", "/home/tyzc/0416_syl_chaofen/rknn-multi-threaded-Super-Resolution-syl/cartorn/video_640.mp4"])

def bgr_to_yuv(bgr_image):
    """
    将 BGR 图像转换为 YUV 图像,并分离出 Y, U, V 通道
    :param bgr_image: 输入的 BGR 图像,类型为 CuPy 数组
    :return: Y, U, V 通道(都是 CuPy 数组)
    """
    # 将 BGR 图像转换为 YUV
    # BGR 到 YUV 转换的矩阵
    # bgr_image = bgr_image.astype(cp.float32)

    Y = 0.299 * bgr_image[:, :, 2] + 0.587 * bgr_image[:, :, 1] + 0.114 * bgr_image[:, :, 0]
    U = -0.169 * bgr_image[:, :, 2] - 0.331 * bgr_image[:, :, 1] + 0.499 * bgr_image[:, :, 0] + 128
    V = 0.499 * bgr_image[:, :, 2] - 0.419 * bgr_image[:, :, 1] - 0.0813 * bgr_image[:, :, 0] + 128

    return Y, U, V
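
# Usage sketch (hypothetical; requires CuPy): move the frame onto the GPU first,
#     Y, U, V = bgr_to_yuv(cp.asarray(frame))   # frame: H x W x 3 BGR uint8
# then cp.asnumpy(Y) brings a channel back to the CPU when needed.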

def main():
    """Inference demo for Real-ESRGAN.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', type=str, default=R'C:\Users\kk\Downloads\Real-ESRGAN-master\weights\4K_Cartoon_Y', help='Input image or folder')
    parser.add_argument(
        '-n',
        '--model_name',
        type=str,
        default='RealESRGAN_x2plus',
        help=('Model names: RealESRGAN_x4plus | RealESRNet_x4plus | RealESRGAN_x4plus_anime_6B | RealESRGAN_x2plus | '
              'realesr-animevideov3 | realesr-general-x4v3'))
    parser.add_argument('-o', '--output', type=str, default=R'C:\Users\kk\Downloads\Real-ESRGAN-master\weights\4K_Cartoon_Y_2X', help='Output folder')
    parser.add_argument(
        '-dn',
        '--denoise_strength',
        type=float,
        default=0.5,
        help=('Denoise strength. 0 for weak denoise (keep noise), 1 for strong denoise ability. '
              'Only used for the realesr-general-x4v3 model'))
    parser.add_argument('-s', '--outscale', type=float, default=2, help='The final upsampling scale of the image')
    parser.add_argument(
        '--model_path', type=str, default=R"C:\Users\kk\Downloads\Real-ESRGAN-master\inputs\models\train_realesrnet_x2plus_32_1_16_4channel__123conv_1rdb1_net_oneresize_no_conv_hr_pairdata_0929_real_net_g_520000.pth", help='[Option] Model path. Usually, you do not need to specify it')
        # '--model_path', type=str, default="/home/sunyingli/Real-ESRGAN/experiments/train_realesrnet_x2plus_32_1_16_4channel__123conv_1rdb1_net_oneresize_no_conv_hr_pairdata_0806/models/net_g_410000.pth", help='[Option] Model path. Usually, you do not need to specify it')
    parser.add_argument('--suffix', type=str, default='', help='Suffix of the restored image')
    parser.add_argument('-t', '--tile', type=int, default=0, help='Tile size, 0 for no tile during testing')
    parser.add_argument('--tile_pad', type=int, default=10, help='Tile padding')
    parser.add_argument('--pre_pad', type=int, default=0, help='Pre padding size at each border')
    parser.add_argument('--face_enhance', action='store_true', help='Use GFPGAN to enhance face')
    parser.add_argument(
        '--fp32', action='store_true', help='Use fp32 precision during inference. Default: fp16 (half precision).')
    parser.add_argument(
        '--alpha_upsampler',
        type=str,
        default='realesrgan',
        help='The upsampler for the alpha channels. Options: realesrgan | bicubic')
    parser.add_argument(
        '--ext',
        type=str,
        default='auto',
        help='Image extension. Options: auto | jpg | png, auto means using the same extension as inputs')
    parser.add_argument(
        '-g', '--gpu-id', type=int, default=0, help='GPU device to use (can be 0, 1, 2, ... for multi-GPU)')

    args = parser.parse_args()

    # determine models according to model names
    args.model_name = args.model_name.split('.')[0]
    if args.model_name == 'RealESRGAN_x4plus':  # x4 RRDBNet model
        model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=4)
        netscale = 4
        file_url = ['https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth']
    elif args.model_name == 'RealESRNet_x4plus':  # x4 RRDBNet model
        model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=4)
        netscale = 4
        file_url = ['https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.1/RealESRNet_x4plus.pth']
    elif args.model_name == 'RealESRGAN_x4plus_anime_6B':  # x4 RRDBNet model with 6 blocks
        model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=6, num_grow_ch=32, scale=4)
        netscale = 4
        file_url = ['https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth']
    elif args.model_name == 'RealESRGAN_x2plus':  # x2 RRDBNet model
        model = RRDBNet(num_in_ch=1, num_out_ch=1, num_feat=32, num_block=1, num_grow_ch=16, scale=2)
        netscale = 2
        file_url = ['https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.1/RealESRGAN_x2plus.pth']
    elif args.model_name == 'realesr-animevideov3':  # x4 VGG-style model (XS size)
        model = SRVGGNetCompact(num_in_ch=3, num_out_ch=3, num_feat=64, num_conv=16, upscale=4, act_type='prelu')
        netscale = 4
        file_url = ['https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.5.0/realesr-animevideov3.pth']
    elif args.model_name == 'realesr-general-x4v3':  # x4 VGG-style model (S size)
        model = SRVGGNetCompact(num_in_ch=3, num_out_ch=3, num_feat=64, num_conv=32, upscale=4, act_type='prelu')
        netscale = 4
        file_url = [
            'https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.5.0/realesr-general-wdn-x4v3.pth',
            'https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.5.0/realesr-general-x4v3.pth'
        ]

    # determine model paths
    if args.model_path is not None:
        model_path = args.model_path
    else:
        model_path = os.path.join('weights', args.model_name + '.pth')
        if not os.path.isfile(model_path):
            ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
            for url in file_url:
                # model_path will be updated
                model_path = load_file_from_url(
                    url=url, model_dir=os.path.join(ROOT_DIR, 'weights'), progress=True, file_name=None)

    # use dni to control the denoise strength
    dni_weight = None
    if args.model_name == 'realesr-general-x4v3' and args.denoise_strength != 1:
        wdn_model_path = model_path.replace('realesr-general-x4v3', 'realesr-general-wdn-x4v3')
        model_path = [model_path, wdn_model_path]
        dni_weight = [args.denoise_strength, 1 - args.denoise_strength]

    # restorer
    upsampler = RealESRGANer(
        scale=netscale,
        model_path=model_path,
        dni_weight=dni_weight,
        model=model,
        tile=args.tile,
        tile_pad=args.tile_pad,
        pre_pad=args.pre_pad,
        half=not args.fp32,
        gpu_id=args.gpu_id)

    if args.face_enhance:  # Use GFPGAN for face enhancement
        from gfpgan import GFPGANer
        face_enhancer = GFPGANer(
            model_path='https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.3.pth',
            upscale=args.outscale,
            arch='clean',
            channel_multiplier=2,
            bg_upsampler=upsampler)
    os.makedirs(args.output, exist_ok=True)

    if os.path.isfile(args.input):
        paths = [args.input]
    else:
        paths = sorted(glob.glob(os.path.join(args.input, '*')))
    # for idx, path in enumerate(paths):
    cap = cv2.VideoCapture(R'C:\Users\kk\Downloads\Real-ESRGAN-master\inputs\整装待发发兵出征_720P.mp4')
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    idx = 0
    while cap.isOpened():
        idx += 1
        if idx == 1:
            one_all_start_time = time.time()
        print('Testing', idx)
        start_time = time.time()
        ret, frame = cap.read()
        if not ret:
            break
        frame_resize = frame[:, :width // 2]
        img_BGR = frame[:, width // 2:]
        end_time = time.time()
        print("1 " + str(end_time - start_time))
        start_time = time.time()
        img_yuv = cv2.cvtColor(img_BGR, cv2.COLOR_BGR2YUV)
        Y, U, V = cv2.split(img_yuv)
        end_time = time.time()
        print("2 " + str(end_time - start_time))

        # image = img_BGR
        # image_cp = cp.asarray(image)  # move the image into a CuPy array
        # # convert to YUV and split the channels
        # Y, U, V = bgr_to_yuv(image_cp)

        # # bring Y, U, V back to NumPy arrays (if CPU-side work is needed)
        # Y_cpu = cp.asnumpy(Y)
        # U_cpu = cp.asnumpy(U)
        # V_cpu = cp.asnumpy(V)

        # # print the Y, U, V channel shapes, or save them as images (optional)
        # print("Y channel shape:", Y_cpu.shape)
        # print("U channel shape:", U_cpu.shape)
        # print("V channel shape:", V_cpu.shape)
        # end_time = time.time()
        # print("1 " + str(end_time - start_time))


        # start_time = time.time()
        # # convert the image to a tensor and move it onto the GPU
        # img_tensor = torch.from_numpy(img_BGR).float().to(device)
        # end_time = time.time()
        # print("3 " + str(end_time - start_time))
        # convert the frame from BGR to YUV on the GPU
        # BGR -> YUV conversion matrix
        # bgr_to_yuv_matrix = torch.tensor([
        #     [0.114,  0.587,  0.299],
        #     [-0.169, -0.331,  0.499],
        #     [0.499, -0.419, -0.081]
        # ], dtype=torch.float32).cuda()
        # # matrix multiply: BGR -> YUV
        # start_time = time.time()
        # img_yuv_tensor = torch.matmul(img_tensor[..., :3], bgr_to_yuv_matrix.T)
        # # split the Y, U, V channels
        # Y_tensor, U_tensor, V_tensor = img_yuv_tensor.split(1, dim=-1)
        # end_time = time.time()
        # print("4 " + str(end_time - start_time))
        # # move the results back from the GPU to the CPU
        # Y_cpu = Y_tensor.cpu().numpy()
        # U_cpu = U_tensor.cpu().numpy()
        # V_cpu = V_tensor.cpu().numpy()
        # # the channels can also be saved as images
        # cv2.imwrite('Y_channel.png', Y_cpu.astype(np.uint8))

        # convert the image to a PyTorch tensor on the GPU
        # img_yuv = torch.tensor(img_BGR).permute(2, 0, 1).unsqueeze(0).float().to('cuda')  # (1, 3, H, W)
        # img = img_yuv
        # take the three YUV channels
        img_mode = None
        output, _ = upsampler.enhance(Y, outscale=args.outscale)
        if args.ext == 'auto':
            extension = "png"
        else:
            extension = args.ext
        if img_mode == 'RGBA':  # RGBA images should be saved in png format
            extension = 'png'
        if args.suffix == '':
            start_time = time.time()
            save_path = os.path.join(args.output, f'{idx}.{extension}')
            # rebuild a color image from the upscaled Y and the resized chroma
            h, w = output.shape

            # resize the U and V channels to match the upscaled Y
            resized_U = cv2.resize(U, (w, h))
            resized_V = cv2.resize(V, (w, h))
            # merge the YUV channels
            img_YUV_OUT = cv2.merge([output, resized_U, resized_V])

            # convert from YUV back to BGR
            img_BGR_OUT = cv2.cvtColor(img_YUV_OUT, cv2.COLOR_YUV2BGR)


            frame_resize = cv2.resize(frame_resize, (width, height * 2))
            # blank canvas twice the input size: plain resize on the left, SR output on the right
            result_frame = np.zeros((height * 2, width * 2, 3), dtype=np.uint8)

            # place the plain-resized left half on the left
            result_frame[:, :width] = frame_resize
            # place the super-resolved right half on the right
            result_frame[:, width:] = img_BGR_OUT
            result_frame[:, width:width + 1, :] = (255, 255, 255)  # 1-px white divider
            frame = result_frame
            # start the playback thread
            # if idx == 1:
            #     video_thread = threading.Thread(target=play_video)
            #     video_thread.start()
            #     one_all_start_time = time.time()
            # # save the composited image (cv2.imwrite below)
            # show the frame
            cv2.imshow('Image', frame)

            # press any key to continue, 'q' to quit
            if cv2.waitKey(1) & 0xFF == ord('q'):
                # out.release()
                break
            # cv2.imwrite(save_path, img_BGR_OUT)
            end_time = time.time()
            print("8 " + str(end_time - start_time))
            one_all_end_time = time.time()
            print("One Picture All Time: " + str(one_all_end_time - one_all_start_time))
            print("-"*60)
            if one_all_end_time - one_all_start_time < 0.04:
                time.sleep(0.04 - one_all_end_time + one_all_start_time)
                print("time.sleep: " + str(0.04 - one_all_end_time + one_all_start_time))
            # if one_all_end_time - one_all_start_time > 0.04:
            #     delay_time = delay_time + one_all_end_time - one_all_start_time - 0.04
            one_all_start_time = time.time()
    cv2.destroyAllWindows()
if __name__ == '__main__':
    main()

utils.py
import cv2
import math
import numpy as np
import os
import queue
import threading
import torch
from basicsr.utils.download_util import load_file_from_url
from torch.nn import functional as F
import time

ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))


class RealESRGANer():
    """A helper class for upsampling images with RealESRGAN.

    Args:
        scale (int): Upsampling scale factor used in the networks. It is usually 2 or 4.
        model_path (str): The path to the pretrained model. It can also be a URL (the model will be downloaded first).
        model (nn.Module): The defined network. Default: None.
        tile (int): Since overly large images cause GPU out-of-memory errors, this tile option first crops the
            input image into tiles, processes each one, and finally merges them back into one image.
            0 means no tiling. Default: 0.
        tile_pad (int): The pad size for each tile, to remove border artifacts. Default: 10.
        pre_pad (int): Pad the input images to avoid border artifacts. Default: 10.
        half (bool): Whether to use half precision during inference. Default: False.
    """

    def __init__(self,
                 scale,
                 model_path,
                 dni_weight=None,
                 model=None,
                 tile=0,
                 tile_pad=10,
                 pre_pad=10,
                 half=False,
                 device=None,
                 gpu_id=None):
        self.scale = scale
        self.tile_size = tile
        self.tile_pad = tile_pad
        self.pre_pad = pre_pad
        self.mod_scale = None
        self.half = half

        # initialize model
        if gpu_id:
            self.device = torch.device(
                f'cuda:{gpu_id}' if torch.cuda.is_available() else 'cpu') if device is None else device
        else:
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') if device is None else device

        if isinstance(model_path, list):
            # dni
            assert len(model_path) == len(dni_weight), 'model_path and dni_weight should have the same length.'
            loadnet = self.dni(model_path[0], model_path[1], dni_weight)
        else:
            # if the model_path starts with https, it will first download models to the folder: weights
            if model_path.startswith('https://'):
                model_path = load_file_from_url(
                    url=model_path, model_dir=os.path.join(ROOT_DIR, 'weights'), progress=True, file_name=None)
            loadnet = torch.load(model_path, map_location=torch.device('cpu'))

        # prefer to use params_ema
        if 'params_ema' in loadnet:
            keyname = 'params_ema'
        else:
            keyname = 'params'
        model.load_state_dict(loadnet[keyname], strict=True)

        model.eval()
        self.model = model.to(self.device)
        if self.half:
            self.model = self.model.half()

    def dni(self, net_a, net_b, dni_weight, key='params', loc='cpu'):
        """Deep network interpolation.

        ``Paper: Deep Network Interpolation for Continuous Imagery Effect Transition``
        """
        net_a = torch.load(net_a, map_location=torch.device(loc))
        net_b = torch.load(net_b, map_location=torch.device(loc))
        for k, v_a in net_a[key].items():
            net_a[key][k] = dni_weight[0] * v_a + dni_weight[1] * net_b[key][k]
        return net_a

    def pre_process(self, img):
        """Pre-process, such as pre-pad and mod pad, so that the images can be divisible
        """
        self.img = torch.from_numpy(img).unsqueeze(0).unsqueeze(0).to(self.device).float() / 255.0
        # img = torch.from_numpy(img)
        # self.img = img.unsqueeze(0).to(self.device)
        if self.half:
            self.img = self.img.half()

        # # pre_pad
        # if self.pre_pad != 0:
        #     self.img = F.pad(self.img, (0, self.pre_pad, 0, self.pre_pad), 'reflect')
        # mod pad for divisible borders
        if self.scale == 2:
            self.mod_scale = 2
        # elif self.scale == 1:
        #     self.mod_scale = 4
        if self.mod_scale is not None:
            self.mod_pad_h, self.mod_pad_w = 0, 0
            _, _, h, w = self.img.size()
            if (h % self.mod_scale != 0):
                self.mod_pad_h = (self.mod_scale - h % self.mod_scale)
            if (w % self.mod_scale != 0):
                self.mod_pad_w = (self.mod_scale - w % self.mod_scale)
            self.img = F.pad(self.img, (0, self.mod_pad_w, 0, self.mod_pad_h), 'reflect')

    def process(self):
        # model inference
        # self.output = self.model(self.img)

        # cv2.imwrite("/home/sunyingli/y_channel.jpg", ((self.img[:, 2, :, :].cpu().numpy()*255).astype(np.uint8)[0]))
        # self.output = self.model(self.img[:, 2, :, :].unsqueeze(1))
        start_time = time.time()
        print(self.img.shape)
        self.output = self.model(self.img[:, :, :, :])
        end_time = time.time()

        print("Input_size: " + str(self.img[:, :, :, :].shape))
        print("Inference Time: ", str(end_time - start_time))
        # from thop import profile
        # # channel index 2 corresponds to taking only the Y channel
        # f = open(os.devnull, "w")
        # import sys
        # sys.stdout = f
        # flops, params = profile(self.model, inputs=(self.img[:, 2, :, :].unsqueeze(1),))
        # sys.stdout = sys.__stdout__
        # f.close()
        # # convert FLOPs to TOPs (32-bit precision)
        # tops = flops*24 / 10**12
        # print("24 frames per second computing power: ")
        # # print the results
        # print("FLOPs:", flops)
        # print("TOPs:", tops)
        # print("Params:", params)
        # print("-"*60)


    def tile_process(self):
        """It will first crop input images to tiles, and then process each tile.
        Finally, all the processed tiles are merged into one images.

        Modified from: https://github.com/ata4/esrgan-launcher
        """
        batch, channel, height, width = self.img.shape
        output_height = height * self.scale
        output_width = width * self.scale
        output_shape = (batch, channel, output_height, output_width)

        # start with black image
        self.output = self.img.new_zeros(output_shape)
        tiles_x = math.ceil(width / self.tile_size)
        tiles_y = math.ceil(height / self.tile_size)

        # loop over all tiles
        for y in range(tiles_y):
            for x in range(tiles_x):
                # extract tile from input image
                ofs_x = x * self.tile_size
                ofs_y = y * self.tile_size
                # input tile area on total image
                input_start_x = ofs_x
                input_end_x = min(ofs_x + self.tile_size, width)
                input_start_y = ofs_y
                input_end_y = min(ofs_y + self.tile_size, height)

                # input tile area on total image with padding
                input_start_x_pad = max(input_start_x - self.tile_pad, 0)
                input_end_x_pad = min(input_end_x + self.tile_pad, width)
                input_start_y_pad = max(input_start_y - self.tile_pad, 0)
                input_end_y_pad = min(input_end_y + self.tile_pad, height)

                # input tile dimensions
                input_tile_width = input_end_x - input_start_x
                input_tile_height = input_end_y - input_start_y
                tile_idx = y * tiles_x + x + 1
                input_tile = self.img[:, :, input_start_y_pad:input_end_y_pad, input_start_x_pad:input_end_x_pad]

                # upscale tile
                try:
                    with torch.no_grad():
                        output_tile = self.model(input_tile)
                except RuntimeError as error:
                    print('Error', error)
                print(f'\tTile {tile_idx}/{tiles_x * tiles_y}')

                # output tile area on total image
                output_start_x = input_start_x * self.scale
                output_end_x = input_end_x * self.scale
                output_start_y = input_start_y * self.scale
                output_end_y = input_end_y * self.scale

                # output tile area without padding
                output_start_x_tile = (input_start_x - input_start_x_pad) * self.scale
                output_end_x_tile = output_start_x_tile + input_tile_width * self.scale
                output_start_y_tile = (input_start_y - input_start_y_pad) * self.scale
                output_end_y_tile = output_start_y_tile + input_tile_height * self.scale

                # put tile into output image
                self.output[:, :, output_start_y:output_end_y,
                            output_start_x:output_end_x] = output_tile[:, :, output_start_y_tile:output_end_y_tile,
                                                                       output_start_x_tile:output_end_x_tile]

    def post_process(self):
        # remove extra pad
        if self.mod_scale is not None:
            _, _, h, w = self.output.size()
            self.output = self.output[:, :, 0:h - self.mod_pad_h * self.scale, 0:w - self.mod_pad_w * self.scale]
        # remove prepad
        if self.pre_pad != 0:
            _, _, h, w = self.output.size()
            self.output = self.output[:, :, 0:h - self.pre_pad * self.scale, 0:w - self.pre_pad * self.scale]
        return self.output

    @torch.no_grad()
    def enhance(self, img, outscale=None, alpha_upsampler='realesrgan'):
        # start_time = time.time()
        # # img: numpy
        # img = img.astype(np.float16)

        # max_range = 255
        # img = img / max_range

        img_mode = 'RGB'
        # img = np.expand_dims(img, axis=0)
        # end_time = time.time()
        # print("3 " + str(end_time - start_time))
        # ------------------- process image (without the alpha channel) ------------------- #
        start_time = time.time()
        self.pre_process(img)
        end_time = time.time()
        print("4 " + str(end_time - start_time))
        if self.tile_size > 0:
            self.tile_process()
        else:
            start_time = time.time()
            self.process()
            end_time = time.time()
            print("4 " + str(end_time - start_time))
        start_time = time.time()
        output_img = self.post_process()
        end_time = time.time()
        print("5 " + str(end_time - start_time))
        start_time = time.time()
        # output_img = output_img.data.squeeze().float().cpu().clamp_(0, 1).numpy()
        output_img = output_img.data[0].clamp_(0, 1)*255
        output = output_img.round().to(torch.uint8).cpu().numpy()[0]
        end_time = time.time()
        print("6 " + str(end_time - start_time))

        # ------------------------------ return ------------------------------ #

        # start_time = time.time()
        # output = (output_img * 255.0).round().astype(np.uint8)
        # end_time = time.time()
        # print("7 " + str(end_time - start_time))

        return output, img_mode


class PrefetchReader(threading.Thread):
    """Prefetch images.

    Args:
        img_list (list[str]): A list of image paths to be read.
        num_prefetch_queue (int): Number of prefetch queue.
    """

    def __init__(self, img_list, num_prefetch_queue):
        super().__init__()
        self.que = queue.Queue(num_prefetch_queue)
        self.img_list = img_list

    def run(self):
        for img_path in self.img_list:
            img = cv2.imread(img_path, cv2.IMREAD_UNCHANGED)
            self.que.put(img)

        self.que.put(None)

    def __next__(self):
        next_item = self.que.get()
        if next_item is None:
            raise StopIteration
        return next_item

    def __iter__(self):
        return self


class IOConsumer(threading.Thread):

    def __init__(self, opt, que, qid):
        super().__init__()
        self._queue = que
        self.qid = qid
        self.opt = opt

    def run(self):
        while True:
            msg = self._queue.get()
            if isinstance(msg, str) and msg == 'quit':
                break

            output = msg['output']
            save_path = msg['save_path']
            cv2.imwrite(save_path, output)
        print(f'IO worker {self.qid} is done.')

3. Multithreaded acceleration on the PC (I/O-bound tasks)

PC side: multithreading for I/O-bound tasks, multiprocessing for CPU-bound tasks.

Python multithreading

1. Well suited to I/O-bound tasks

For I/O-bound tasks (file reads/writes, network requests, database access, etc.), multithreading can significantly improve performance.

While one thread is waiting on I/O, other threads can keep doing useful work.

Even though the GIL (Global Interpreter Lock) limits concurrency for CPU-bound work, in I/O-bound tasks other threads can keep running while one is blocked on I/O.

2. Shared memory, shared variables

All threads share the memory space of a single process, so memory overhead is low.

Threads can exchange and share data directly through shared variables.

3. Limited by the GIL (Global Interpreter Lock), so not suitable for CPU-bound tasks (a small demo follows)
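
A minimal demonstration (illustrative only; timings are machine-dependent): two CPU-bound threads take roughly as long as running the same work twice sequentially, because the GIL lets only one thread execute Python bytecode at a time.

import threading
import time

def cpu_task(n=10_000_000):
    s = 0
    for i in range(n):
        s += i

start = time.time()
threads = [threading.Thread(target=cpu_task) for _ in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()
# expect roughly 2x the single-task time, despite the two threads
print("2 CPU-bound threads:", time.time() - start)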

The Python implementation is below. For the frequent cv2.imshow calls, it does noticeably raise the overall frame rate.

import argparse
import os
# os.add_dll_directory(r'D:\CUDA\CUDA02\bin')
# os.add_dll_directory expects a directory, not a DLL file path; adding the bin folder is enough
os.add_dll_directory(R'C:\Users\kk\Downloads\opencv_contrib_cuda_4.6.0.20221106_win_amd64\install\x64\vc17\bin')

import cv2
import time
import threading
import queue
from basicsr.archs.rrdbnet_arch import RRDBNet
from realesrgan import RealESRGANer

def process_image(q, args):
    args.model_name = args.model_name.split('.')[0]
    if args.model_name == 'RealESRGAN_x2plus':  # x2 RRDBNet model
        model = RRDBNet(num_in_ch=1, num_out_ch=1, num_feat=32, num_block=1, num_grow_ch=16, scale=2)
        netscale = 2
    if args.model_path is not None:
        model_path = args.model_path
    else:
        model_path = os.path.join('weights', args.model_name + '.pth')
    dni_weight = None
    upsampler = RealESRGANer(
        scale=netscale,
        model_path=model_path,
        dni_weight=dni_weight,
        model=model,
        tile=args.tile,
        tile_pad=args.tile_pad,
        pre_pad=args.pre_pad,
        half=not args.fp32,
        gpu_id=args.gpu_id
    )

    os.makedirs(args.output, exist_ok=True)
    cap = cv2.VideoCapture(args.input)

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        img_yuv = cv2.cvtColor(frame, cv2.COLOR_BGR2YUV)
        Y, U, V = cv2.split(img_yuv)

        output, _ = upsampler.enhance(Y, outscale=args.outscale)

        h, w = output.shape
        resized_U = cv2.resize(U, (w, h))
        resized_V = cv2.resize(V, (w, h))
        img_YUV_OUT = cv2.merge([output, resized_U, resized_V])
        img_BGR_OUT = cv2.cvtColor(img_YUV_OUT, cv2.COLOR_YUV2BGR)

        q.put(img_BGR_OUT)
        print("Processed frame and added to queue")

def display_image(q):
    idx = 0
    all_start_time = None
    while True:
        if not q.empty():
            start_time = time.time()
            img_BGR_OUT = q.get()
            cv2.imshow('Image', img_BGR_OUT)
            idx += 1
            if idx == 1:
                all_start_time = time.time()
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
            end_time = time.time()
            print("Display time for this frame: " + str(end_time - start_time))
            print("------------------------------------ Frame rate: " + str(idx / (time.time() - all_start_time)))
        else:
            time.sleep(0.001)

    cv2.destroyAllWindows()

def main():
    """Inference demo for Real-ESRGAN.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', type=str, default=R'C:\Users\kk\Downloads\Real-ESRGAN-master\inputs\real\穆桂英挂帅_李胜素_整装待发_发兵出征_1080P.mp4', help='Input Video')
    parser.add_argument(
        '-n',
        '--model_name',
        type=str,
        default='RealESRGAN_x2plus')
    parser.add_argument('-o', '--output', type=str, default=R'C:\Users\kk\Downloads\Real-ESRGAN-master\weights\4K_Cartoon_Y_2X', help='Output folder')
    parser.add_argument(
        '-dn',
        '--denoise_strength',
        type=float,
        default=0.5)
    parser.add_argument('-s', '--outscale', type=float, default=2, help='The final upsampling scale of the image')
    parser.add_argument(
        '--model_path', type=str, default=R"C:\Users\kk\Downloads\Real-ESRGAN-master\inputs\weights\32_1_16_pairdata_0929_animal_net_g_345000_39.2444.pth")
    parser.add_argument('--suffix', type=str, default='', help='Suffix of the restored image')
    parser.add_argument('-t', '--tile', type=int, default=0, help='Tile size, 0 for no tile during testing')
    parser.add_argument('--tile_pad', type=int, default=10, help='Tile padding')
    parser.add_argument('--pre_pad', type=int, default=0, help='Pre padding size at each border')
    parser.add_argument('--face_enhance', action='store_true', help='Use GFPGAN to enhance face')
    parser.add_argument(
        '--fp32', action='store_true')
    parser.add_argument(
        '--alpha_upsampler',
        type=str,
        default='realesrgan')
    parser.add_argument(
        '--ext',
        type=str,
        default='auto',)
    parser.add_argument(
        '-g', '--gpu-id', type=int, default=0)
    args = parser.parse_args()

    q = queue.Queue()

    thread1 = threading.Thread(target=process_image, args=(q, args))
    thread2 = threading.Thread(target=display_image, args=(q,))

    thread1.start()
    thread2.start()

    thread1.join()
    thread2.join()

if __name__ == '__main__':
    main()


Python multiprocessing

1. Efficient for CPU-bound tasks

Multiprocessing suits CPU-bound work such as heavy numerical computation, data analysis, and image processing.

Multiprocessing sidesteps Python's GIL, because each process has its own memory space and its own Python interpreter instance.
It can therefore fully exploit multi-core CPUs and run CPU-bound tasks in parallel.

2. Inter-process communication is more complex

Processes cannot directly share memory; they must exchange data through IPC (inter-process communication) mechanisms such as pipes (Pipe), queues (Queue), or shared memory (Shared Memory). A minimal Pipe sketch follows this list.

3. High memory consumption, plus high startup and management overhead
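
A minimal Pipe sketch (illustrative only): any object crossing the process boundary is pickled and copied, which is where much of the IPC cost comes from.

import multiprocessing as mp

def worker(conn):
    conn.send('pong')   # the object is pickled and copied into the other process
    conn.close()

if __name__ == '__main__':
    parent_conn, child_conn = mp.Pipe()
    p = mp.Process(target=worker, args=(child_conn,))
    p.start()
    print(parent_conn.recv())   # 'pong'
    p.join()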

Our multiprocess implementation turned out to be a poor fit for this I/O-bound task: inter-process communication is expensive, and the speed did not meet the requirement.

import argparse
import os
# os.add_dll_directory(r'D:\CUDA\CUDA02\bin')
# os.add_dll_directory expects a directory, not a DLL file path; adding the bin folder is enough
os.add_dll_directory(R'C:\Users\kk\Downloads\opencv_contrib_cuda_4.6.0.20221106_win_amd64\install\x64\vc17\bin')

import cv2
print(f'OpenCV: {cv2.__version__} for python installed and working')
import glob
from basicsr.archs.rrdbnet_arch import RRDBNet
from basicsr.utils.download_util import load_file_from_url

from realesrgan import RealESRGANer
from realesrgan.archs.srvgg_arch import SRVGGNetCompact
import numpy as np
import time
import multiprocessing as mp

def process_image(queue, args):
    # determine models according to model names
    args.model_name = args.model_name.split('.')[0]
    if args.model_name == 'RealESRGAN_x2plus':  # x2 RRDBNet model
        model = RRDBNet(num_in_ch=1, num_out_ch=1, num_feat=32, num_block=1, num_grow_ch=16, scale=2)
        netscale = 2
    # determine model paths
    if args.model_path is not None:
        model_path = args.model_path
    else:
        model_path = os.path.join('weights', args.model_name + '.pth')
    # use dni to control the denoise strength
    dni_weight = None
    # restorer
    upsampler = RealESRGANer(
        scale=netscale,
        model_path=model_path,
        dni_weight=dni_weight,
        model=model,
        tile=args.tile,
        tile_pad=args.tile_pad,
        pre_pad=args.pre_pad,
        half=not args.fp32,
        gpu_id=args.gpu_id)

    os.makedirs(args.output, exist_ok=True)

    if os.path.isfile(args.input):
        paths = [args.input]
    else:
        paths = sorted(glob.glob(os.path.join(args.input, '*')))

    cap = cv2.VideoCapture(args.input)

    while cap.isOpened():
        start_time = time.time()
        ret, frame = cap.read()
        if not ret:
            break
        # image processing: BGR -> YUV conversion
        img_yuv = cv2.cvtColor(frame, cv2.COLOR_BGR2YUV)
        Y, U, V = cv2.split(img_yuv)

        # super-resolve the Y channel
        output, _ = upsampler.enhance(Y, outscale=args.outscale)

        # resize the chroma channels and rebuild the color image
        h, w = output.shape
        resized_U = cv2.resize(U, (w, h))
        resized_V = cv2.resize(V, (w, h))
        img_YUV_OUT = cv2.merge([output, resized_U, resized_V])
        img_BGR_OUT = cv2.cvtColor(img_YUV_OUT, cv2.COLOR_YUV2BGR)
        queue.put(img_BGR_OUT)
        # queue.put((output, resized_U, resized_V))
        # end_time = time.time()
        # if end_time - start_time < 0.04:
        #     time.sleep(0.04 - end_time + start_time)
        end_time = time.time()
        print("----------- process_image ------------- " + str(end_time - start_time))

def display_image(queue):
    idx = 0
    while True:
        if not queue.empty():
            start_time = time.time()
            img_BGR_OUT = queue.get()
            # show the frame
            cv2.imshow('Image', img_BGR_OUT)
            idx += 1
            if idx == 1:
                all_start_time = time.time()
            # quit on 'q'
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
            end_time = time.time()
            print("----------- display_image ------------- " + str(end_time - start_time))
            print("---------------------- 帧率 ---------------------- " + str(idx / (time.time() - all_start_time)))
        else:
            # queue is empty; wait briefly to avoid busy-spinning the CPU
            time.sleep(0.001)

def main():
    """Inference demo for Real-ESRGAN.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', type=str, default=R'C:\Users\kk\Downloads\Real-ESRGAN-master\inputs\real\穆桂英挂帅_李胜素_整装待发_发兵出征_1080P.mp4', help='Input Video')
    parser.add_argument(
        '-n',
        '--model_name',
        type=str,
        default='RealESRGAN_x2plus')
    parser.add_argument('-o', '--output', type=str, default=R'C:\Users\kk\Downloads\Real-ESRGAN-master\weights\4K_Cartoon_Y_2X', help='Output folder')
    parser.add_argument(
        '-dn',
        '--denoise_strength',
        type=float,
        default=0.5)
    parser.add_argument('-s', '--outscale', type=float, default=2, help='The final upsampling scale of the image')
    parser.add_argument(
        '--model_path', type=str, default=R"C:\Users\kk\Downloads\Real-ESRGAN-master\inputs\weights\32_1_16_pairdata_0929_animal_net_g_345000_39.2444.pth")
    parser.add_argument('--suffix', type=str, default='', help='Suffix of the restored image')
    parser.add_argument('-t', '--tile', type=int, default=0, help='Tile size, 0 for no tile during testing')
    parser.add_argument('--tile_pad', type=int, default=10, help='Tile padding')
    parser.add_argument('--pre_pad', type=int, default=0, help='Pre padding size at each border')
    parser.add_argument('--face_enhance', action='store_true', help='Use GFPGAN to enhance face')
    parser.add_argument(
        '--fp32', action='store_true')
    parser.add_argument(
        '--alpha_upsampler',
        type=str,
        default='realesrgan')
    parser.add_argument(
        '--ext',
        type=str,
        default='auto',)
    parser.add_argument(
        '-g', '--gpu-id', type=int, default=0)
    args = parser.parse_args()

    # pass data between processes through a Queue
    queue = mp.Queue()

    # create the worker processes
    process1 = mp.Process(target=process_image, args=(queue, args))
    process2 = mp.Process(target=display_image, args=(queue,))

    # start the processes
    process1.start()
    process2.start()

    # wait for the processes to finish
    process1.join()
    process2.join()

    cv2.destroyAllWindows()

if __name__ == '__main__':
    main()
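
One way to cut the IPC cost measured above (not tried here) would be multiprocessing.shared_memory, which passes only a block name between processes instead of pickling every frame. A rough sketch, with the 4K frame shape as an assumption:

import numpy as np
from multiprocessing import shared_memory

# producer side: copy the frame once into a named shared-memory block
frame = np.zeros((2160, 3840, 3), dtype=np.uint8)   # assumed 4K BGR frame
shm = shared_memory.SharedMemory(create=True, size=frame.nbytes)
buf = np.ndarray(frame.shape, dtype=frame.dtype, buffer=shm.buf)
buf[:] = frame[:]   # no pickling, just one memcpy

# consumer side: attach by name (shm.name can be sent over a small queue)
shm2 = shared_memory.SharedMemory(name=shm.name)
view = np.ndarray(frame.shape, dtype=np.uint8, buffer=shm2.buf)
# ... use `view` directly ...
shm2.close()
shm.close()
shm.unlink()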


Multithreaded version, optimized

Throttle the producer so that the speed gap between the two threads cannot let the queue grow without bound and use too much memory:

        if q.qsize() > 30:
            time.sleep(0.025)
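
A simpler equivalent (not what was used here) is a bounded queue, whose put() blocks automatically once the buffer is full:

        q = queue.Queue(maxsize=30)   # q.put(...) then blocks until the consumer catches up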

The inference code follows:

import argparse
import os
# os.add_dll_directory(r'D:\CUDA\CUDA02\bin')
# os.add_dll_directory expects a directory, not a DLL file path; adding the bin folder is enough
os.add_dll_directory(R'C:\Users\kk\Downloads\opencv_contrib_cuda_4.6.0.20221106_win_amd64\install\x64\vc17\bin')

import cv2
import time
import threading
import queue
from basicsr.archs.rrdbnet_arch import RRDBNet
from realesrgan import RealESRGANer

def process_image(q, args):
    args.model_name = args.model_name.split('.')[0]
    if args.model_name == 'RealESRGAN_x2plus':  # x2 RRDBNet model
        model = RRDBNet(num_in_ch=1, num_out_ch=1, num_feat=32, num_block=1, num_grow_ch=16, scale=2)
        netscale = 2
    if args.model_path is not None:
        model_path = args.model_path
    else:
        model_path = os.path.join('weights', args.model_name + '.pth')
    dni_weight = None
    upsampler = RealESRGANer(
        scale=netscale,
        model_path=model_path,
        dni_weight=dni_weight,
        model=model,
        tile=args.tile,
        tile_pad=args.tile_pad,
        pre_pad=args.pre_pad,
        half=not args.fp32,
        gpu_id=args.gpu_id
    )

    os.makedirs(args.output, exist_ok=True)
    cap = cv2.VideoCapture(args.input)

    while cap.isOpened():
        start_time = time.time()
        ret, frame = cap.read()
        if not ret:
            break
        img_yuv = cv2.cvtColor(frame, cv2.COLOR_BGR2YUV)
        Y, U, V = cv2.split(img_yuv)

        output, _ = upsampler.enhance(Y, outscale=args.outscale)
        if q.qsize() > 30:
            time.sleep(0.025)
        q.put((output, U, V))
        print("Processed frame and added to queue")
        end_time = time.time()
        print("------Processed time for this frame: " + str(end_time - start_time))

def display_image(q):
    idx = 0
    all_start_time = None
    while True:
        if not q.empty():
            start_time = time.time()
            # fetch the processed channels and rebuild the color frame
            output, U, V = q.get()
            h, w = output.shape
            resized_U = cv2.resize(U, (w, h), interpolation=cv2.INTER_CUBIC)
            resized_V = cv2.resize(V, (w, h), interpolation=cv2.INTER_CUBIC)
            img_YUV_OUT = cv2.merge([output, resized_U, resized_V])
            img_BGR_OUT = cv2.cvtColor(img_YUV_OUT, cv2.COLOR_YUV2BGR)

            cv2.imshow('Image', img_BGR_OUT)
            idx += 1
            if idx == 1:
                all_start_time = time.time()
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
            end_time = time.time()
            print("------Display time for this frame: " + str(end_time - start_time))
            print("------------------------------------------------------------------------ Frame rate:  " + str(idx / (time.time() - all_start_time)))
        else:
            time.sleep(0.001)

    cv2.destroyAllWindows()

def main():
    """Inference demo for Real-ESRGAN.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', type=str, default=R'C:\Users\kk\Downloads\Real-ESRGAN-master\inputs\real\穆桂英挂帅_李胜素_整装待发_发兵出征_1080P.mp4', help='Input Video')
    parser.add_argument(
        '-n',
        '--model_name',
        type=str,
        default='RealESRGAN_x2plus')
    parser.add_argument('-o', '--output', type=str, default=R'C:\Users\kk\Downloads\Real-ESRGAN-master\weights\4K_Cartoon_Y_2X', help='Output folder')
    parser.add_argument(
        '-dn',
        '--denoise_strength',
        type=float,
        default=0.5)
    parser.add_argument('-s', '--outscale', type=float, default=2, help='The final upsampling scale of the image')
    parser.add_argument(
        '--model_path', type=str, default=R"C:\Users\kk\Downloads\Real-ESRGAN-master\inputs\weights\32_1_16_pairdata_0929_animal_net_g_345000_39.2444.pth")
    parser.add_argument('--suffix', type=str, default='', help='Suffix of the restored image')
    parser.add_argument('-t', '--tile', type=int, default=0, help='Tile size, 0 for no tile during testing')
    parser.add_argument('--tile_pad', type=int, default=10, help='Tile padding')
    parser.add_argument('--pre_pad', type=int, default=0, help='Pre padding size at each border')
    parser.add_argument('--face_enhance', action='store_true', help='Use GFPGAN to enhance face')
    parser.add_argument(
        '--fp32', action='store_true')
    parser.add_argument(
        '--alpha_upsampler',
        type=str,
        default='realesrgan')
    parser.add_argument(
        '--ext',
        type=str,
        default='auto',)
    parser.add_argument(
        '-g', '--gpu-id', type=int, default=0)
    args = parser.parse_args()

    q = queue.Queue()

    thread1 = threading.Thread(target=process_image, args=(q, args))
    thread2 = threading.Thread(target=display_image, args=(q,))

    thread1.start()
    thread2.start()

    thread1.join()
    thread2.join()

if __name__ == '__main__':
    main()

4. Final result: real-time super-resolution of 1080p video achieved on Konka's 4090 machine.


II. Slimming the model:

A large number of experiments were run over the past week:

1. Pixel-unshuffle to 16 channels.

Found that at the final 16 → 1 channel layer, MAC utilization falls below 1% and the time cost explodes (a rough illustration follows).
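
Roughly what that head looks like (an assumed illustration of the shapes involved, not the exact trained network):

import torch.nn as nn

unshuffle = nn.PixelUnshuffle(4)                    # (1, H, W) -> (16, H/4, W/4)
last = nn.Conv2d(16, 1, kernel_size=3, padding=1)   # the 16 -> 1 layer with <1% MAC utilization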

2. No pixel unshuffle.

The input conv layer then has only 1 channel, so H*W stays very large, while RK3588 channels are 32-aligned.

If C is padded up to the 32-channel alignment, the compute cost becomes very large;
if C stays small, MAC utilization is very low (a 1-channel conv uses at most 1/32 of the MAC lanes).


3. Replaced PixelUnshuffle and PixelShuffle with a convolution and a transposed convolution; confirmed working after retraining.

        # pixel_unshuffle (a reshape, transpose, reshape) would run on the CPU;
        # replace it with a strided convolution so the op runs on the NPU
        self.pixel_unshuffle = nn.Conv2d(
            num_in_ch // (self.scale ** 2),
            num_in_ch,
            kernel_size=self.scale,
            stride=self.scale,
            padding=0, bias=False)

        # use a transposed convolution to reproduce the effect of pixel_shuffle
        self.pixel_shuffle = nn.ConvTranspose2d(
            num_in_ch,
            num_in_ch // (self.scale ** 2),
            kernel_size=self.scale,
            stride=self.scale,
            padding=0, bias=False
        )
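
As a sanity check (a minimal sketch, independent of the trained weights above): a stride-s convolution with one-hot weights reproduces PixelUnshuffle exactly, which is what makes this replacement safe to retrain:

import torch
import torch.nn as nn

s, C = 2, 1   # scale and input channels, chosen for illustration
conv = nn.Conv2d(C, C * s * s, kernel_size=s, stride=s, padding=0, bias=False)
with torch.no_grad():
    w = torch.zeros(C * s * s, C, s, s)
    for c in range(C):
        for i in range(s):
            for j in range(s):
                # output channel c*s*s + i*s + j picks pixel (i, j) of each s x s block
                w[c * s * s + i * s + j, c, i, j] = 1.0
    conv.weight.copy_(w)

x = torch.randn(1, C, 8, 8)
assert torch.allclose(conv(x), nn.PixelUnshuffle(s)(x), atol=1e-6)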

4. Removed unnecessary Add and Max ops; confirmed working after retraining.

5. Reduced the number of concat ops; confirmed working after retraining.

6. Tried executing the final layer on the CPU.

Reason: on RK3588, the larger the output channel count, the higher the MAC utilization, but the super-resolved output can have at most 3 channels. So we tried moving the last layer to the CPU (a hypothetical sketch of the split follows).
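
A hypothetical PyTorch-level sketch of the split (the actual attempt used pybind11 and C++, described below): keep the wide trunk on the NPU and run only the final low-channel conv on the CPU.

import torch
import torch.nn as nn

# `trunk` stands in for the part exported to the RK3588 NPU;
# `last` is the final low-channel conv kept on the CPU
trunk = nn.Sequential(nn.Conv2d(1, 32, 3, padding=1), nn.PReLU())
last = nn.Conv2d(32, 1, 3, padding=1)

y = torch.randn(1, 1, 1080, 1920)   # Y channel of a 1080p frame
feat = trunk(y)                     # on device: NPU inference, then fetch the feature map
out = last(feat)                    # CPU-side final projection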

Actual result:

Used pybind11 to call the accelerated C++ implementation from Python. In the end, CPU memory usage was high and the time cost was large, so this did not meet the requirement.

7. Final outcome: two lighter super-resolution models, both currently in training.

Other:

  1. Downloaded 4K videos in preparation for third-party evaluation.
  2. Installed the CUDA build of OpenCV (cv2), but the data then has to shuttle between CPU and GPU constantly; limited by transfer speed, the overall pipeline actually got slower. (This took considerable time.)
  3. Looked into RK3588 zero-copy memory, but the investigation showed little speed difference.