I. Konka PC implementation: 1080P → 4K real-time super-resolution
1. Move the image pre-processing from the CPU to the GPU
2. Move part of the post-processing from the CPU to the GPU
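For step 1, the BGR → YUV conversion can run on the GPU as a single matrix multiply instead of cv2.cvtColor on the CPU. A minimal sketch with PyTorch (the coefficients come from the bgr_to_yuv helper in the script below; the function name and the omission of the +128 U/V offsets are illustrative choices, not part of the original code):

import torch

# BT.601-style BGR -> YUV coefficients, rows ordered Y, U, V, columns ordered B, G, R
bgr_to_yuv_matrix = torch.tensor([
    [0.114, 0.587, 0.299],
    [0.499, -0.331, -0.169],
    [-0.081, -0.419, 0.499],
], dtype=torch.float32).cuda()

def bgr_to_y_gpu(frame_bgr_u8):
    """frame_bgr_u8: (H, W, 3) uint8 BGR frame. Returns the Y channel as a GPU tensor."""
    img = torch.from_numpy(frame_bgr_u8).cuda().float()  # single host-to-device copy
    yuv = img @ bgr_to_yuv_matrix.T                      # (H, W, 3); +128 U/V offsets omitted
    return yuv[..., 0]                                   # keep Y on the GPU for the model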
inference_realesrgan_Animal_Video.py
import argparse
import cv2
import glob
import os
from basicsr.archs.rrdbnet_arch import RRDBNet
from basicsr.utils.download_util import load_file_from_url
from realesrgan import RealESRGANer
from realesrgan.archs.srvgg_arch import SRVGGNetCompact
import numpy as np
import time
import torch
# import cupy as cp
import subprocess
import threading
def play_video():
subprocess.call(["ffplay", "-nodisp", "-i", R"C:\Users\kk\Downloads\Real-ESRGAN-master\inputs\第1集_桃园三结义_1080P.mp4"])
# subprocess.call(["ffplay", "-i", "/home/tyzc/0416_syl_chaofen/rknn-multi-threaded-Super-Resolution-syl/cartorn/video_640.mp4"])
def bgr_to_yuv(bgr_image):
"""
将 BGR 图像转换为 YUV 图像,并分离出 Y, U, V 通道
:param bgr_image: 输入的 BGR 图像,类型为 CuPy 数组
:return: Y, U, V 通道(都是 CuPy 数组)
"""
# 将 BGR 图像转换为 YUV
# BGR 到 YUV 转换的矩阵
# bgr_image = bgr_image.astype(cp.float32)
Y = 0.299 * bgr_image[:, :, 2] + 0.587 * bgr_image[:, :, 1] + 0.114 * bgr_image[:, :, 0]
U = -0.169 * bgr_image[:, :, 2] - 0.331 * bgr_image[:, :, 1] + 0.499 * bgr_image[:, :, 0] + 128
V = 0.499 * bgr_image[:, :, 2] - 0.419 * bgr_image[:, :, 1] - 0.0813 * bgr_image[:, :, 0] + 128
return Y, U, V
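# Quick sanity check of the Y channel against OpenCV (my own illustration; assumes cupy is
# importable as cp; cv2's U/V use slightly different constants, so only Y is compared here):
#   img = cv2.imread('test.png')
#   Y, U, V = bgr_to_yuv(cp.asarray(img).astype(cp.float32))
#   ref_Y = cv2.cvtColor(img, cv2.COLOR_BGR2YUV)[..., 0]
#   print(float(cp.abs(Y - cp.asarray(ref_Y, dtype=cp.float32)).max()))  # < 1, uint8 rounding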
def main():
"""Inference demo for Real-ESRGAN.
"""
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input', type=str, default=R'C:\Users\kk\Downloads\Real-ESRGAN-master\weights\4K_Cartoon_Y', help='Input image or folder')
parser.add_argument(
'-n',
'--model_name',
type=str,
default='RealESRGAN_x2plus',
help=('Model names: RealESRGAN_x4plus | RealESRNet_x4plus | RealESRGAN_x4plus_anime_6B | RealESRGAN_x2plus | '
'realesr-animevideov3 | realesr-general-x4v3'))
parser.add_argument('-o', '--output', type=str, default=R'C:\Users\kk\Downloads\Real-ESRGAN-master\weights\4K_Cartoon_Y_2X', help='Output folder')
parser.add_argument(
'-dn',
'--denoise_strength',
type=float,
default=0.5,
help=('Denoise strength. 0 for weak denoise (keep noise), 1 for strong denoise ability. '
'Only used for the realesr-general-x4v3 model'))
parser.add_argument('-s', '--outscale', type=float, default=2, help='The final upsampling scale of the image')
parser.add_argument(
'--model_path', type=str, default=R"C:\Users\kk\Downloads\Real-ESRGAN-master\inputs\models\train_realesrnet_x2plus_32_1_16_4channel__123conv_1rdb1_net_oneresize_no_conv_hr_pairdata_0929_real_net_g_520000.pth", help='[Option] Model path. Usually, you do not need to specify it')
# '--model_path', type=str, default="/home/sunyingli/Real-ESRGAN/experiments/train_realesrnet_x2plus_32_1_16_4channel__123conv_1rdb1_net_oneresize_no_conv_hr_pairdata_0806/models/net_g_410000.pth", help='[Option] Model path. Usually, you do not need to specify it')
parser.add_argument('--suffix', type=str, default='', help='Suffix of the restored image')
parser.add_argument('-t', '--tile', type=int, default=0, help='Tile size, 0 for no tile during testing')
parser.add_argument('--tile_pad', type=int, default=10, help='Tile padding')
parser.add_argument('--pre_pad', type=int, default=0, help='Pre padding size at each border')
parser.add_argument('--face_enhance', action='store_true', help='Use GFPGAN to enhance face')
parser.add_argument(
'--fp32', action='store_true', help='Use fp32 precision during inference. Default: fp16 (half precision).')
parser.add_argument(
'--alpha_upsampler',
type=str,
default='realesrgan',
help='The upsampler for the alpha channels. Options: realesrgan | bicubic')
parser.add_argument(
'--ext',
type=str,
default='auto',
help='Image extension. Options: auto | jpg | png, auto means using the same extension as inputs')
parser.add_argument(
        '-g', '--gpu-id', type=int, default=0, help='GPU device to use, e.g. 0, 1, 2 for multi-GPU (default: 0)')
args = parser.parse_args()
# determine models according to model names
args.model_name = args.model_name.split('.')[0]
if args.model_name == 'RealESRGAN_x4plus': # x4 RRDBNet model
model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=4)
netscale = 4
file_url = ['https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth']
elif args.model_name == 'RealESRNet_x4plus': # x4 RRDBNet model
model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=4)
netscale = 4
file_url = ['https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.1/RealESRNet_x4plus.pth']
elif args.model_name == 'RealESRGAN_x4plus_anime_6B': # x4 RRDBNet model with 6 blocks
model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=6, num_grow_ch=32, scale=4)
netscale = 4
file_url = ['https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth']
elif args.model_name == 'RealESRGAN_x2plus': # x2 RRDBNet model
model = RRDBNet(num_in_ch=1, num_out_ch=1, num_feat=32, num_block=1, num_grow_ch=16, scale=2)
netscale = 2
file_url = ['https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.1/RealESRGAN_x2plus.pth']
elif args.model_name == 'realesr-animevideov3': # x4 VGG-style model (XS size)
model = SRVGGNetCompact(num_in_ch=3, num_out_ch=3, num_feat=64, num_conv=16, upscale=4, act_type='prelu')
netscale = 4
file_url = ['https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.5.0/realesr-animevideov3.pth']
elif args.model_name == 'realesr-general-x4v3': # x4 VGG-style model (S size)
model = SRVGGNetCompact(num_in_ch=3, num_out_ch=3, num_feat=64, num_conv=32, upscale=4, act_type='prelu')
netscale = 4
file_url = [
'https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.5.0/realesr-general-wdn-x4v3.pth',
'https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.5.0/realesr-general-x4v3.pth'
]
# determine model paths
if args.model_path is not None:
model_path = args.model_path
else:
model_path = os.path.join('weights', args.model_name + '.pth')
if not os.path.isfile(model_path):
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
for url in file_url:
# model_path will be updated
model_path = load_file_from_url(
url=url, model_dir=os.path.join(ROOT_DIR, 'weights'), progress=True, file_name=None)
# use dni to control the denoise strength
dni_weight = None
if args.model_name == 'realesr-general-x4v3' and args.denoise_strength != 1:
wdn_model_path = model_path.replace('realesr-general-x4v3', 'realesr-general-wdn-x4v3')
model_path = [model_path, wdn_model_path]
dni_weight = [args.denoise_strength, 1 - args.denoise_strength]
# restorer
upsampler = RealESRGANer(
scale=netscale,
model_path=model_path,
dni_weight=dni_weight,
model=model,
tile=args.tile,
tile_pad=args.tile_pad,
pre_pad=args.pre_pad,
half=not args.fp32,
gpu_id=args.gpu_id)
if args.face_enhance: # Use GFPGAN for face enhancement
from gfpgan import GFPGANer
face_enhancer = GFPGANer(
model_path='https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.3.pth',
upscale=args.outscale,
arch='clean',
channel_multiplier=2,
bg_upsampler=upsampler)
os.makedirs(args.output, exist_ok=True)
if os.path.isfile(args.input):
paths = [args.input]
else:
paths = sorted(glob.glob(os.path.join(args.input, '*')))
# for idx, path in enumerate(paths):
cap = cv2.VideoCapture(R'C:\Users\kk\Downloads\Real-ESRGAN-master\inputs\整装待发发兵出征_720P.mp4')
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
idx = 0
while (cap.isOpened()):
idx += 1
if idx == 1:
one_all_start_time = time.time()
print('Testing', idx)
start_time = time.time()
        ret, frame = cap.read()
        if not ret:  # end of the video stream
            break
        # side-by-side input: left half for comparison, right half goes through the model
        frame_resize = frame[:, :width // 2]
        img_BGR = frame[:, width // 2:]
end_time = time.time()
print("1 " + str(end_time - start_time))
start_time = time.time()
img_yuv = cv2.cvtColor(img_BGR, cv2.COLOR_BGR2YUV)
Y, U, V = cv2.split(img_yuv)
end_time = time.time()
print("2 " + str(end_time - start_time))
        # image = img_BGR
        # image_cp = cp.asarray(image)  # move the image to the GPU as a CuPy array
        # # convert to YUV and split the channels
        # Y, U, V = bgr_to_yuv(image_cp)
        # # move Y, U, V back to NumPy when they need to be inspected on the CPU
        # Y_cpu = cp.asnumpy(Y)
        # U_cpu = cp.asnumpy(U)
        # V_cpu = cp.asnumpy(V)
        # # print the channel shapes, or optionally save them as images
        # print("Y channel shape:", Y_cpu.shape)
        # print("U channel shape:", U_cpu.shape)
        # print("V channel shape:", V_cpu.shape)
        # end_time = time.time()
        # print("1 " + str(end_time - start_time))
        # start_time = time.time()
        # # convert the image to a tensor and move it to the GPU
        # img_tensor = torch.from_numpy(img_BGR).float().to(device)
        # end_time = time.time()
        # print("3 " + str(end_time - start_time))
        # convert the image data from BGR to YUV
        # BGR -> YUV conversion matrix
        # bgr_to_yuv_matrix = torch.tensor([
        #     [0.114, 0.587, 0.299],
        #     [-0.169, -0.331, 0.499],
        #     [0.499, -0.419, -0.081]
        # ], dtype=torch.float32).cuda()
        # # matrix multiply, BGR -> YUV
        # start_time = time.time()
        # img_yuv_tensor = torch.matmul(img_tensor[..., :3], bgr_to_yuv_matrix.T)
        # # split the Y, U, V channels
        # Y_tensor, U_tensor, V_tensor = img_yuv_tensor.split(1, dim=-1)
        # end_time = time.time()
        # print("4 " + str(end_time - start_time))
        # # move the results from the GPU back to the CPU
        # Y_cpu = Y_tensor.cpu().numpy()
        # U_cpu = U_tensor.cpu().numpy()
        # V_cpu = V_tensor.cpu().numpy()
        # # the channels can also be saved as images
        # cv2.imwrite('Y_channel.png', Y_cpu.astype(np.uint8))
        # convert the image to a PyTorch tensor and move it to the GPU
        # img_yuv = torch.tensor(img_BGR).permute(2, 0, 1).unsqueeze(0).float().to('cuda')  # (1, 3, H, W)
        # img = img_yuv
        # (the Y, U, V channels were obtained above)
img_mode = None
output, _ = upsampler.enhance(Y, outscale=args.outscale)
if args.ext == 'auto':
extension = "png"
else:
extension = args.ext
if img_mode == 'RGBA': # RGBA images should be saved in png format
extension = 'png'
if args.suffix == '':
start_time = time.time()
save_path = os.path.join(args.output, f'{idx}.{extension}')
            # assemble a color image from the upscaled Y and the original U/V
            h, w = output.shape
            # resize the U and V channels to the output size
            resized_U = cv2.resize(U, (w, h))
            resized_V = cv2.resize(V, (w, h))
            # merge the resized YUV channels
            img_YUV_OUT = cv2.merge([output, resized_U, resized_V])
            # convert the image back from YUV to BGR
            img_BGR_OUT = cv2.cvtColor(img_YUV_OUT, cv2.COLOR_YUV2BGR)
            frame_resize = cv2.resize(frame_resize, (width, height * 2))
            # create an empty frame twice the size of the input video
            result_frame = np.zeros((height * 2, width * 2, 3), dtype=np.uint8)
            # place the plainly resized left half on the left
            result_frame[:, :width] = frame_resize
            # place the super-resolved right half on the right
            result_frame[:, width:] = img_BGR_OUT
            # draw a one-pixel white separator between the halves
            result_frame[:, width:width + 1, :] = (255, 255, 255)
frame = result_frame
            # start the video-playback thread (optional side-by-side reference)
            # if idx == 1:
            #     video_thread = threading.Thread(target=play_video)
            #     video_thread.start()
            #     one_all_start_time = time.time()
            # # save the composited image
            # display the image
cv2.imshow('Image', frame)
            # wait 1 ms for a key press; 'q' exits
if cv2.waitKey(1) & 0xFF == ord('q'):
# out.release()
break
# cv2.imwrite(save_path, img_BGR_OUT)
end_time = time.time()
print("8 " + str(end_time - start_time))
one_all_end_time = time.time()
print("One Picture All Time: " + str(one_all_end_time - one_all_start_time))
print("-"*60)
if one_all_end_time - one_all_start_time < 0.04:
time.sleep(0.04 - one_all_end_time + one_all_start_time)
print("time.sleep: " + str(0.04 - one_all_end_time + one_all_start_time))
# if one_all_end_time - one_all_start_time > 0.04:
# delay_time = delay_time + one_all_end_time - one_all_start_time - 0.04
one_all_start_time = time.time()
cv2.destroyAllWindows()
if __name__ == '__main__':
main()
utils.py
import cv2
import math
import numpy as np
import os
import queue
import threading
import torch
from basicsr.utils.download_util import load_file_from_url
from torch.nn import functional as F
import time
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
class RealESRGANer():
"""A helper class for upsampling images with RealESRGAN.
Args:
scale (int): Upsampling scale factor used in the networks. It is usually 2 or 4.
        model_path (str): The path to the pretrained model. It can be a URL (the model will first be downloaded automatically).
        model (nn.Module): The defined network. Default: None.
        tile (int): Because overly large inputs can run out of GPU memory, this tile option first crops the
            input image into tiles, processes each of them, and finally merges them back into one image.
            0 denotes no tiling. Default: 0.
        tile_pad (int): The pad size for each tile, used to remove border artifacts. Default: 10.
        pre_pad (int): Pad the input images to avoid border artifacts. Default: 10.
        half (bool): Whether to use half precision during inference. Default: False.
"""
def __init__(self,
scale,
model_path,
dni_weight=None,
model=None,
tile=0,
tile_pad=10,
pre_pad=10,
half=False,
device=None,
gpu_id=None):
self.scale = scale
self.tile_size = tile
self.tile_pad = tile_pad
self.pre_pad = pre_pad
self.mod_scale = None
self.half = half
# initialize model
if gpu_id:
self.device = torch.device(
f'cuda:{gpu_id}' if torch.cuda.is_available() else 'cpu') if device is None else device
else:
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') if device is None else device
if isinstance(model_path, list):
# dni
            assert len(model_path) == len(dni_weight), 'model_path and dni_weight should have the same length.'
loadnet = self.dni(model_path[0], model_path[1], dni_weight)
else:
# if the model_path starts with https, it will first download models to the folder: weights
if model_path.startswith('https://'):
model_path = load_file_from_url(
url=model_path, model_dir=os.path.join(ROOT_DIR, 'weights'), progress=True, file_name=None)
loadnet = torch.load(model_path, map_location=torch.device('cpu'))
# prefer to use params_ema
if 'params_ema' in loadnet:
keyname = 'params_ema'
else:
keyname = 'params'
model.load_state_dict(loadnet[keyname], strict=True)
model.eval()
self.model = model.to(self.device)
if self.half:
self.model = self.model.half()
def dni(self, net_a, net_b, dni_weight, key='params', loc='cpu'):
"""Deep network interpolation.
``Paper: Deep Network Interpolation for Continuous Imagery Effect Transition``
"""
net_a = torch.load(net_a, map_location=torch.device(loc))
net_b = torch.load(net_b, map_location=torch.device(loc))
for k, v_a in net_a[key].items():
net_a[key][k] = dni_weight[0] * v_a + dni_weight[1] * net_b[key][k]
return net_a
    def pre_process(self, img):
        """Pre-process (pre-pad and mod-pad) so that the image size is divisible by the scale factor."""
        # (H, W) uint8 Y channel -> (1, 1, H, W) float tensor in [0, 1] on the device
        self.img = torch.from_numpy(img).unsqueeze(0).unsqueeze(0).to(self.device).float() / 255.0
        # img = torch.from_numpy(img)
        # self.img = img.unsqueeze(0).to(self.device)
if self.half:
self.img = self.img.half()
# # pre_pad
# if self.pre_pad != 0:
# self.img = F.pad(self.img, (0, self.pre_pad, 0, self.pre_pad), 'reflect')
# mod pad for divisible borders
if self.scale == 2:
self.mod_scale = 2
# elif self.scale == 1:
# self.mod_scale = 4
if self.mod_scale is not None:
self.mod_pad_h, self.mod_pad_w = 0, 0
_, _, h, w = self.img.size()
if (h % self.mod_scale != 0):
self.mod_pad_h = (self.mod_scale - h % self.mod_scale)
if (w % self.mod_scale != 0):
self.mod_pad_w = (self.mod_scale - w % self.mod_scale)
self.img = F.pad(self.img, (0, self.mod_pad_w, 0, self.mod_pad_h), 'reflect')
def process(self):
# model inference
# self.output = self.model(self.img)
# cv2.imwrite("/home/sunyingli/y_channel.jpg", ((self.img[:, 2, :, :].cpu().numpy()*255).astype(np.uint8)[0]))
# self.output = self.model(self.img[:, 2, :, :].unsqueeze(1))
start_time = time.time()
print(self.img.shape)
self.output = self.model(self.img[:, :, :, :])
end_time = time.time()
print("Input_size: " + str(self.img[:, :, :, :].shape))
print("Inference Time: ", str(end_time - start_time))
        # from thop import profile
        # # channel index 2 corresponds to taking only the Y channel
        # f = open(os.devnull, "w")
        # import sys
        # sys.stdout = f
        # flops, params = profile(self.model, inputs=(self.img[:, 2, :, :].unsqueeze(1),))
        # sys.stdout = sys.__stdout__
        # f.close()
        # # convert FLOPs to TOPs, assuming 24 frames per second at 32-bit precision
        # tops = flops * 24 / 10 ** 12
        # print("24 frames per second computing power: ")
        # # print the results
        # print("FLOPs:", flops)
        # print("TOPs:", tops)
        # print("Params:", params)
        # print("-" * 60)
def tile_process(self):
"""It will first crop input images to tiles, and then process each tile.
        Finally, all the processed tiles are merged into one image.
Modified from: https://github.com/ata4/esrgan-launcher
"""
batch, channel, height, width = self.img.shape
output_height = height * self.scale
output_width = width * self.scale
output_shape = (batch, channel, output_height, output_width)
# start with black image
self.output = self.img.new_zeros(output_shape)
tiles_x = math.ceil(width / self.tile_size)
tiles_y = math.ceil(height / self.tile_size)
# loop over all tiles
for y in range(tiles_y):
for x in range(tiles_x):
# extract tile from input image
ofs_x = x * self.tile_size
ofs_y = y * self.tile_size
# input tile area on total image
input_start_x = ofs_x
input_end_x = min(ofs_x + self.tile_size, width)
input_start_y = ofs_y
input_end_y = min(ofs_y + self.tile_size, height)
# input tile area on total image with padding
input_start_x_pad = max(input_start_x - self.tile_pad, 0)
input_end_x_pad = min(input_end_x + self.tile_pad, width)
input_start_y_pad = max(input_start_y - self.tile_pad, 0)
input_end_y_pad = min(input_end_y + self.tile_pad, height)
# input tile dimensions
input_tile_width = input_end_x - input_start_x
input_tile_height = input_end_y - input_start_y
tile_idx = y * tiles_x + x + 1
input_tile = self.img[:, :, input_start_y_pad:input_end_y_pad, input_start_x_pad:input_end_x_pad]
# upscale tile
try:
with torch.no_grad():
output_tile = self.model(input_tile)
except RuntimeError as error:
print('Error', error)
print(f'\tTile {tile_idx}/{tiles_x * tiles_y}')
# output tile area on total image
output_start_x = input_start_x * self.scale
output_end_x = input_end_x * self.scale
output_start_y = input_start_y * self.scale
output_end_y = input_end_y * self.scale
# output tile area without padding
output_start_x_tile = (input_start_x - input_start_x_pad) * self.scale
output_end_x_tile = output_start_x_tile + input_tile_width * self.scale
output_start_y_tile = (input_start_y - input_start_y_pad) * self.scale
output_end_y_tile = output_start_y_tile + input_tile_height * self.scale
# put tile into output image
self.output[:, :, output_start_y:output_end_y,
output_start_x:output_end_x] = output_tile[:, :, output_start_y_tile:output_end_y_tile,
output_start_x_tile:output_end_x_tile]
def post_process(self):
# remove extra pad
if self.mod_scale is not None:
_, _, h, w = self.output.size()
self.output = self.output[:, :, 0:h - self.mod_pad_h * self.scale, 0:w - self.mod_pad_w * self.scale]
# remove prepad
if self.pre_pad != 0:
_, _, h, w = self.output.size()
self.output = self.output[:, :, 0:h - self.pre_pad * self.scale, 0:w - self.pre_pad * self.scale]
return self.output
@torch.no_grad()
def enhance(self, img, outscale=None, alpha_upsampler='realesrgan'):
# start_time = time.time()
# # img: numpy
# img = img.astype(np.float16)
# max_range = 255
# img = img / max_range
img_mode = 'RGB'
# img = np.expand_dims(img, axis=0)
# end_time = time.time()
# print("3 " + str(end_time - start_time))
# ------------------- process image (without the alpha channel) ------------------- #
start_time = time.time()
self.pre_process(img)
end_time = time.time()
print("4 " + str(end_time - start_time))
if self.tile_size > 0:
self.tile_process()
else:
start_time = time.time()
self.process()
end_time = time.time()
print("4 " + str(end_time - start_time))
start_time = time.time()
output_img = self.post_process()
end_time = time.time()
print("5 " + str(end_time - start_time))
start_time = time.time()
# output_img = output_img.data.squeeze().float().cpu().clamp_(0, 1).numpy()
output_img = output_img.data[0].clamp_(0, 1)*255
output = output_img.round().to(torch.uint8).cpu().numpy()[0]
end_time = time.time()
print("6 " + str(end_time - start_time))
# ------------------------------ return ------------------------------ #
# start_time = time.time()
# output = (output_img * 255.0).round().astype(np.uint8)
# end_time = time.time()
# print("7 " + str(end_time - start_time))
return output, img_mode
class PrefetchReader(threading.Thread):
"""Prefetch images.
Args:
        img_list (list[str]): A list of image paths to be read.
num_prefetch_queue (int): Number of prefetch queue.
"""
def __init__(self, img_list, num_prefetch_queue):
super().__init__()
self.que = queue.Queue(num_prefetch_queue)
self.img_list = img_list
def run(self):
for img_path in self.img_list:
img = cv2.imread(img_path, cv2.IMREAD_UNCHANGED)
self.que.put(img)
self.que.put(None)
def __next__(self):
next_item = self.que.get()
if next_item is None:
raise StopIteration
return next_item
def __iter__(self):
return self
class IOConsumer(threading.Thread):
def __init__(self, opt, que, qid):
super().__init__()
self._queue = que
self.qid = qid
self.opt = opt
def run(self):
while True:
msg = self._queue.get()
if isinstance(msg, str) and msg == 'quit':
break
output = msg['output']
save_path = msg['save_path']
cv2.imwrite(save_path, output)
print(f'IO worker {self.qid} is done.')
3. PC-side acceleration: multithreading (I/O-bound tasks) vs. multiprocessing (CPU-bound tasks)
Python multithreading
1. Well suited to I/O-bound tasks
Multithreading can significantly improve performance on I/O-bound tasks such as file I/O, network requests, and database access.
While one thread waits on I/O, the others can keep doing useful work.
Even though the GIL (Global Interpreter Lock) limits concurrency for CPU-bound code, threads that are blocked on I/O release the GIL and let other threads continue running.
2. Shared memory, shared variables
All threads share the memory space of a single process, so memory overhead is low.
Threads can exchange and share data directly through shared variables.
3. Limited by the GIL, so not suitable for CPU-bound tasks
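The implementation below boils down to a producer/consumer pair decoupled by a queue: one thread reads frames and runs inference, the other displays. A minimal sketch of just that pattern (names are illustrative, not taken from the script):

import queue
import threading

q = queue.Queue()

def producer():
    for item in range(100):   # stands in for "read a frame and run inference"
        q.put(item)
    q.put(None)               # sentinel: no more frames

def consumer():
    while True:
        item = q.get()        # blocks while the queue is empty (an I/O-style wait)
        if item is None:
            break             # stands in for "cv2.imshow + cv2.waitKey"

threading.Thread(target=producer).start()
consumer()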
The Python implementation follows; moving the frequent cv2.imshow calls into their own thread does measurably improve the overall frame rate.
import argparse
import os
# os.add_dll_directory(r'D:\CUDA\CUDA02\bin')
# note: os.add_dll_directory expects a directory path, not an individual .dll file
os.add_dll_directory(R'C:\Users\kk\Downloads\opencv_contrib_cuda_4.6.0.20221106_win_amd64\install\x64\vc17\bin')
import cv2
import time
import threading
import queue
from basicsr.archs.rrdbnet_arch import RRDBNet
from realesrgan import RealESRGANer
def process_image(q, args):
args.model_name = args.model_name.split('.')[0]
if args.model_name == 'RealESRGAN_x2plus': # x2 RRDBNet model
model = RRDBNet(num_in_ch=1, num_out_ch=1, num_feat=32, num_block=1, num_grow_ch=16, scale=2)
netscale = 2
if args.model_path is not None:
model_path = args.model_path
else:
model_path = os.path.join('weights', args.model_name + '.pth')
dni_weight = None
upsampler = RealESRGANer(
scale=netscale,
model_path=model_path,
dni_weight=dni_weight,
model=model,
tile=args.tile,
tile_pad=args.tile_pad,
pre_pad=args.pre_pad,
half=not args.fp32,
gpu_id=args.gpu_id
)
os.makedirs(args.output, exist_ok=True)
cap = cv2.VideoCapture(args.input)
while cap.isOpened():
ret, frame = cap.read()
if not ret:
break
img_yuv = cv2.cvtColor(frame, cv2.COLOR_BGR2YUV)
Y, U, V = cv2.split(img_yuv)
output, _ = upsampler.enhance(Y, outscale=args.outscale)
h, w = output.shape
resized_U = cv2.resize(U, (w, h))
resized_V = cv2.resize(V, (w, h))
img_YUV_OUT = cv2.merge([output, resized_U, resized_V])
img_BGR_OUT = cv2.cvtColor(img_YUV_OUT, cv2.COLOR_YUV2BGR)
q.put(img_BGR_OUT)
print("Processed frame and added to queue")
def display_image(q):
idx = 0
all_start_time = None
while True:
if not q.empty():
start_time = time.time()
img_BGR_OUT = q.get()
cv2.imshow('Image', img_BGR_OUT)
idx += 1
if idx == 1:
all_start_time = time.time()
if cv2.waitKey(1) & 0xFF == ord('q'):
break
end_time = time.time()
print("Display time for this frame: " + str(end_time - start_time))
print("------------------------------------ Frame rate: " + str(idx / (time.time() - all_start_time)))
else:
time.sleep(0.001)
cv2.destroyAllWindows()
def main():
"""Inference demo for Real-ESRGAN.
"""
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input', type=str, default=R'C:\Users\kk\Downloads\Real-ESRGAN-master\inputs\real\穆桂英挂帅_李胜素_整装待发_发兵出征_1080P.mp4', help='Input Video')
parser.add_argument(
'-n',
'--model_name',
type=str,
default='RealESRGAN_x2plus')
parser.add_argument('-o', '--output', type=str, default=R'C:\Users\kk\Downloads\Real-ESRGAN-master\weights\4K_Cartoon_Y_2X', help='Output folder')
parser.add_argument(
'-dn',
'--denoise_strength',
type=float,
default=0.5)
parser.add_argument('-s', '--outscale', type=float, default=2, help='The final upsampling scale of the image')
parser.add_argument(
'--model_path', type=str, default=R"C:\Users\kk\Downloads\Real-ESRGAN-master\inputs\weights\32_1_16_pairdata_0929_animal_net_g_345000_39.2444.pth")
parser.add_argument('--suffix', type=str, default='', help='Suffix of the restored image')
parser.add_argument('-t', '--tile', type=int, default=0, help='Tile size, 0 for no tile during testing')
parser.add_argument('--tile_pad', type=int, default=10, help='Tile padding')
parser.add_argument('--pre_pad', type=int, default=0, help='Pre padding size at each border')
parser.add_argument('--face_enhance', action='store_true', help='Use GFPGAN to enhance face')
parser.add_argument(
'--fp32', action='store_true')
parser.add_argument(
'--alpha_upsampler',
type=str,
default='realesrgan')
parser.add_argument(
'--ext',
type=str,
default='auto',)
parser.add_argument(
'-g', '--gpu-id', type=int, default=0)
args = parser.parse_args()
q = queue.Queue()
thread1 = threading.Thread(target=process_image, args=(q, args))
thread2 = threading.Thread(target=display_image, args=(q,))
thread1.start()
thread2.start()
thread1.join()
thread2.join()
if __name__ == '__main__':
main()
Python multiprocessing
1. Efficient for CPU-bound tasks
Multiprocessing suits CPU-bound work such as heavy numerical computation, data analysis, and image processing.
It bypasses Python's GIL, because every process owns an independent memory space and interpreter instance.
It can therefore exploit all CPU cores and run CPU-bound work truly in parallel.
2. Inter-process communication is complex
Processes cannot share memory directly; data must be exchanged through IPC mechanisms such as pipes (Pipe), queues (Queue), or shared memory (Shared Memory).
3. High memory consumption and high startup/management overhead
The multiprocessing implementation below is a poor fit for this I/O-bound pipeline: the IPC cost of shipping frames between processes is high, and the resulting speed did not meet the requirement (the sketch right after this list illustrates the transfer cost).
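To make the IPC cost concrete, here is a small timing sketch (my own illustration, not from the original experiments): an mp.Queue pickles and copies every frame, so pushing one second of 4K video already moves hundreds of megabytes between processes.

import multiprocessing as mp
import numpy as np
import time

def consumer(q):
    while True:
        if q.get() is None:   # drain frames until the sentinel arrives
            break

if __name__ == '__main__':
    q = mp.Queue()
    p = mp.Process(target=consumer, args=(q,))
    p.start()
    frame = np.zeros((2160, 3840, 3), dtype=np.uint8)  # one 4K BGR frame, ~24 MB
    t0 = time.time()
    for _ in range(24):       # one second of video at 24 fps
        q.put(frame)          # each put pickles and copies the whole array
    q.put(None)
    p.join()
    print(f'24 frames through mp.Queue: {time.time() - t0:.3f} s')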
import argparse
import os
# os.add_dll_directory(r'D:\CUDA\CUDA02\bin')
# note: os.add_dll_directory expects a directory path, not an individual .dll file
os.add_dll_directory(R'C:\Users\kk\Downloads\opencv_contrib_cuda_4.6.0.20221106_win_amd64\install\x64\vc17\bin')
import cv2
print(f'OpenCV: {cv2.__version__} for python installed and working')
import glob
from basicsr.archs.rrdbnet_arch import RRDBNet
from basicsr.utils.download_util import load_file_from_url
from realesrgan import RealESRGANer
from realesrgan.archs.srvgg_arch import SRVGGNetCompact
import numpy as np
import time
import multiprocessing as mp
def process_image(queue, args):
# determine models according to model names
args.model_name = args.model_name.split('.')[0]
if args.model_name == 'RealESRGAN_x2plus': # x2 RRDBNet model
model = RRDBNet(num_in_ch=1, num_out_ch=1, num_feat=32, num_block=1, num_grow_ch=16, scale=2)
netscale = 2
# determine model paths
if args.model_path is not None:
model_path = args.model_path
else:
model_path = os.path.join('weights', args.model_name + '.pth')
# use dni to control the denoise strength
dni_weight = None
# restorer
upsampler = RealESRGANer(
scale=netscale,
model_path=model_path,
dni_weight=dni_weight,
model=model,
tile=args.tile,
tile_pad=args.tile_pad,
pre_pad=args.pre_pad,
half=not args.fp32,
gpu_id=args.gpu_id)
os.makedirs(args.output, exist_ok=True)
if os.path.isfile(args.input):
paths = [args.input]
else:
paths = sorted(glob.glob(os.path.join(args.input, '*')))
cap = cv2.VideoCapture(args.input)
while cap.isOpened():
start_time = time.time()
ret, frame = cap.read()
if not ret:
break
        # image processing: BGR -> YUV conversion
        img_yuv = cv2.cvtColor(frame, cv2.COLOR_BGR2YUV)
        Y, U, V = cv2.split(img_yuv)
        # super-resolve the Y channel
        output, _ = upsampler.enhance(Y, outscale=args.outscale)
        # resample U and V to the output size
h, w = output.shape
resized_U = cv2.resize(U, (w, h))
resized_V = cv2.resize(V, (w, h))
img_YUV_OUT = cv2.merge([output, resized_U, resized_V])
img_BGR_OUT = cv2.cvtColor(img_YUV_OUT, cv2.COLOR_YUV2BGR)
queue.put(img_BGR_OUT)
# queue.put((output, resized_U, resized_V))
# end_time = time.time()
# if end_time - start_time < 0.04:
# time.sleep(0.04 - end_time + start_time)
end_time = time.time()
print("----------- process_image ------------- " + str(end_time - start_time))
def display_image(queue):
idx = 0
    while True:
        if not queue.empty():
            start_time = time.time()
            img_BGR_OUT = queue.get()
            # (the channels were already merged and converted in the producer process)
            # display the image
            cv2.imshow('Image', img_BGR_OUT)
            idx += 1
            if idx == 1:
                all_start_time = time.time()
            # exit when 'q' is pressed
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
            end_time = time.time()
            print("----------- display_image ------------- " + str(end_time - start_time))
            print("---------------------- frame rate ---------------------- " + str(idx / (time.time() - all_start_time)))
        else:
            # the queue is empty; wait briefly so the CPU does not spin
            time.sleep(0.001)
def main():
"""Inference demo for Real-ESRGAN.
"""
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input', type=str, default=R'C:\Users\kk\Downloads\Real-ESRGAN-master\inputs\real\穆桂英挂帅_李胜素_整装待发_发兵出征_1080P.mp4', help='Input Video')
parser.add_argument(
'-n',
'--model_name',
type=str,
default='RealESRGAN_x2plus')
parser.add_argument('-o', '--output', type=str, default=R'C:\Users\kk\Downloads\Real-ESRGAN-master\weights\4K_Cartoon_Y_2X', help='Output folder')
parser.add_argument(
'-dn',
'--denoise_strength',
type=float,
default=0.5)
parser.add_argument('-s', '--outscale', type=float, default=2, help='The final upsampling scale of the image')
parser.add_argument(
'--model_path', type=str, default=R"C:\Users\kk\Downloads\Real-ESRGAN-master\inputs\weights\32_1_16_pairdata_0929_animal_net_g_345000_39.2444.pth")
parser.add_argument('--suffix', type=str, default='', help='Suffix of the restored image')
parser.add_argument('-t', '--tile', type=int, default=0, help='Tile size, 0 for no tile during testing')
parser.add_argument('--tile_pad', type=int, default=10, help='Tile padding')
parser.add_argument('--pre_pad', type=int, default=0, help='Pre padding size at each border')
parser.add_argument('--face_enhance', action='store_true', help='Use GFPGAN to enhance face')
parser.add_argument(
'--fp32', action='store_true')
parser.add_argument(
'--alpha_upsampler',
type=str,
default='realesrgan')
parser.add_argument(
'--ext',
type=str,
default='auto',)
parser.add_argument(
'-g', '--gpu-id', type=int, default=0)
args = parser.parse_args()
    # pass data between processes through a Queue
    queue = mp.Queue()
    # create the worker processes
    process1 = mp.Process(target=process_image, args=(queue, args))
    process2 = mp.Process(target=display_image, args=(queue,))
    # start the processes
    process1.start()
    process2.start()
    # wait for the processes to finish
    process1.join()
    process2.join()
cv2.destroyAllWindows()
if __name__ == '__main__':
main()
Optimized multithreaded version
Throttle the producer when the display thread falls behind, so that the speed mismatch does not let frames pile up in the queue and exhaust memory:
if q.qsize() > 30:
    time.sleep(0.025)
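An equivalent way to get this backpressure, as an alternative sketch rather than what the code below does, is a bounded queue whose put() blocks on its own:

q = queue.Queue(maxsize=30)  # q.put(frame) now blocks whenever 30 frames are already buffered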
The inference code follows:
import argparse
import os
# os.add_dll_directory(r'D:\CUDA\CUDA02\bin')
# note: os.add_dll_directory expects a directory path, not an individual .dll file
os.add_dll_directory(R'C:\Users\kk\Downloads\opencv_contrib_cuda_4.6.0.20221106_win_amd64\install\x64\vc17\bin')
import cv2
import time
import threading
import queue
from basicsr.archs.rrdbnet_arch import RRDBNet
from realesrgan import RealESRGANer
def process_image(q, args):
args.model_name = args.model_name.split('.')[0]
if args.model_name == 'RealESRGAN_x2plus': # x2 RRDBNet model
model = RRDBNet(num_in_ch=1, num_out_ch=1, num_feat=32, num_block=1, num_grow_ch=16, scale=2)
netscale = 2
if args.model_path is not None:
model_path = args.model_path
else:
model_path = os.path.join('weights', args.model_name + '.pth')
dni_weight = None
upsampler = RealESRGANer(
scale=netscale,
model_path=model_path,
dni_weight=dni_weight,
model=model,
tile=args.tile,
tile_pad=args.tile_pad,
pre_pad=args.pre_pad,
half=not args.fp32,
gpu_id=args.gpu_id
)
os.makedirs(args.output, exist_ok=True)
cap = cv2.VideoCapture(args.input)
while cap.isOpened():
start_time = time.time()
ret, frame = cap.read()
if not ret:
break
img_yuv = cv2.cvtColor(frame, cv2.COLOR_BGR2YUV)
Y, U, V = cv2.split(img_yuv)
output, _ = upsampler.enhance(Y, outscale=args.outscale)
        if q.qsize() > 30:  # backpressure: give the display thread time to catch up
            time.sleep(0.025)
        q.put((output, U, V))  # defer the U/V resize and color conversion to the display thread
print("Processed frame and added to queue")
end_time = time.time()
print("------Processed time for this frame: " + str(end_time - start_time))
def display_image(q):
idx = 0
all_start_time = None
while True:
if not q.empty():
start_time = time.time()
            output, U, V = q.get()
            # get the output image size
            h, w = output.shape
resized_U = cv2.resize(U, (w, h), interpolation=cv2.INTER_CUBIC)
resized_V = cv2.resize(V, (w, h), interpolation=cv2.INTER_CUBIC)
img_YUV_OUT = cv2.merge([output, resized_U, resized_V])
img_BGR_OUT = cv2.cvtColor(img_YUV_OUT, cv2.COLOR_YUV2BGR)
cv2.imshow('Image', img_BGR_OUT)
idx += 1
if idx == 1:
all_start_time = time.time()
if cv2.waitKey(1) & 0xFF == ord('q'):
break
end_time = time.time()
print("------Display time for this frame: " + str(end_time - start_time))
print("------------------------------------------------------------------------ Frame rate: " + str(idx / (time.time() - all_start_time)))
else:
time.sleep(0.001)
cv2.destroyAllWindows()
def main():
"""Inference demo for Real-ESRGAN.
"""
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input', type=str, default=R'C:\Users\kk\Downloads\Real-ESRGAN-master\inputs\real\穆桂英挂帅_李胜素_整装待发_发兵出征_1080P.mp4', help='Input Video')
parser.add_argument(
'-n',
'--model_name',
type=str,
default='RealESRGAN_x2plus')
parser.add_argument('-o', '--output', type=str, default=R'C:\Users\kk\Downloads\Real-ESRGAN-master\weights\4K_Cartoon_Y_2X', help='Output folder')
parser.add_argument(
'-dn',
'--denoise_strength',
type=float,
default=0.5)
parser.add_argument('-s', '--outscale', type=float, default=2, help='The final upsampling scale of the image')
parser.add_argument(
'--model_path', type=str, default=R"C:\Users\kk\Downloads\Real-ESRGAN-master\inputs\weights\32_1_16_pairdata_0929_animal_net_g_345000_39.2444.pth")
parser.add_argument('--suffix', type=str, default='', help='Suffix of the restored image')
parser.add_argument('-t', '--tile', type=int, default=0, help='Tile size, 0 for no tile during testing')
parser.add_argument('--tile_pad', type=int, default=10, help='Tile padding')
parser.add_argument('--pre_pad', type=int, default=0, help='Pre padding size at each border')
parser.add_argument('--face_enhance', action='store_true', help='Use GFPGAN to enhance face')
parser.add_argument(
'--fp32', action='store_true')
parser.add_argument(
'--alpha_upsampler',
type=str,
default='realesrgan')
parser.add_argument(
'--ext',
type=str,
default='auto',)
parser.add_argument(
'-g', '--gpu-id', type=int, default=0)
args = parser.parse_args()
q = queue.Queue()
thread1 = threading.Thread(target=process_image, args=(q, args))
thread2 = threading.Thread(target=display_image, args=(q,))
thread1.start()
thread2.start()
thread1.join()
thread2.join()
if __name__ == '__main__':
main()
4. Final result: real-time 1080P super-resolution achieved on Konka's 4090 machine.
II. Slimming down the model:
A large number of experiments were made over the past week:
1. Pixel-unshuffle the input to 16 channels.
With this layout the final 16 → 1 channel layer has a MAC utilization below 1%, and its runtime explodes.
2. Skip pixel-unshuffle entirely.
The input convolution then has a single channel, so H*W is very large, while the RK3588 aligns channels to 32:
if the channel count is padded up to the 32-channel alignment, the compute cost becomes enormous;
if the channel count stays small, MAC utilization is far too low.
3. Replace PixelUnshuffle and PixelShuffle with a strided convolution and a transposed convolution; confirmed to work after retraining. (An equivalence sketch follows the snippet below.)
# pixel_unshuffle (reshape, transpose, reshape) would otherwise fall back to the CPU;
# a strided convolution moves this step onto the NPU
self.pixel_unshuffle = nn.Conv2d(
    num_in_ch // (self.scale ** 2),
    num_in_ch,
    kernel_size=self.scale,
    stride=self.scale,
    padding=0, bias=False)
# a transposed convolution reproduces the effect of pixel_shuffle
self.pixel_shuffle = nn.ConvTranspose2d(
    num_in_ch,
    num_in_ch // (self.scale ** 2),
    kernel_size=self.scale,
    stride=self.scale,
    padding=0, bias=False)
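A quick check (my own sketch, not from the report) that a stride-r convolution with fixed 0/1 weights reproduces F.pixel_unshuffle exactly, which is why a learnable conv like the one above can take over its role:

import torch
import torch.nn.functional as F

r, c = 2, 1                                   # scale factor and input channels (assumed values)
w = torch.zeros(c * r * r, c, r, r)           # conv weight: (C*r^2, C, r, r)
for ic in range(c):
    for i in range(r):
        for j in range(r):
            # output channel ic*r^2 + i*r + j picks the pixel at block offset (i, j)
            w[ic * r * r + i * r + j, ic, i, j] = 1.0

x = torch.randn(1, c, 8, 8)
ref = F.pixel_unshuffle(x, r)                 # (1, c*r^2, 4, 4)
out = F.conv2d(x, w, stride=r)
print(torch.allclose(ref, out))               # True: the conv is an exact replacement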
4. Remove unnecessary Add and Max ops; confirmed to work after retraining.
5. Reduce the number of concat ops; confirmed to work after retraining.
6. Try executing the final layer on the CPU.
Rationale: on the RK3588, MAC utilization grows with the output channel count, but the super-resolved output must have at most 3 channels, hence the attempt to run the last layer on the CPU instead.
Actual outcome:
Python called C++ through pybind11 to accelerate the operation; the end result was high CPU memory usage and too much time spent, so it did not meet the requirement.
7. Final outcome: two lighter super-resolution models, both currently in training.
Other work:
- Downloaded 4K videos in preparation for third-party evaluation.
- Installed the CUDA build of cv2, but the data has to shuttle between the CPU and GPU so often that, limited by transfer bandwidth, the overall speed actually dropped (this took quite some time); see the sketch after this list.
- Investigated RK3588 zero-copy memory; the survey suggests the speed difference is small.
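A hedged sketch of the cv2 CUDA pattern that was tried (assuming an OpenCV build compiled with CUDA support; the cv2.cuda calls below are the standard API of such builds): every frame pays an upload and a download, and for cheap operations those two copies can outweigh the GPU-side gain.

import cv2
import numpy as np
import time

frame = np.random.randint(0, 256, (1080, 1920, 3), dtype=np.uint8)

t0 = time.time()
gpu = cv2.cuda_GpuMat()
gpu.upload(frame)                              # CPU -> GPU transfer
gpu_big = cv2.cuda.resize(gpu, (3840, 2160))   # the actual work, fast on the GPU
out = gpu_big.download()                       # GPU -> CPU transfer
print('upload + resize + download:', time.time() - t0)

t0 = time.time()
out = cv2.resize(frame, (3840, 2160))          # plain CPU baseline
print('cpu resize:', time.time() - t0)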