Fixing the assert (boxes[:, 2] >= boxes[:, 0]).all() error when training Faster R-CNN

This post addresses an error encountered while training Faster R-CNN, caused by bounding boxes whose top-left coordinate is 0 or whose annotated region extends beyond the image, and gives the concrete source-code changes that fix it.

I hit this error while training on my own data. The fix below, found online, solved the problem cleanly, so I am sharing it here.

1. The problem: training Faster R-CNN aborts with the following error:

File "/py-faster-rcnn/tools/../lib/datasets/imdb.py", line 108, in append_flipped_images
    assert (boxes[:, 2] >= boxes[:, 0]).all()
AssertionError
2. Analysis:
Inspecting the data shows that the top-left coordinate (x, y) of a box can be 0, or the annotated region can extend beyond the image boundary; a quick check for both cases is sketched below.
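Before patching the framework, it is worth confirming this directly in the annotations. The following is a minimal sketch, assuming the standard Pascal VOC layout (an Annotations directory of XML files with 1-based coordinates); the path is a placeholder to adapt:

    import glob
    import xml.etree.ElementTree as ET

    # Placeholder path -- point this at your own Annotations directory.
    for xml_file in glob.glob('VOCdevkit/VOC2007/Annotations/*.xml'):
        root = ET.parse(xml_file).getroot()
        size = root.find('size')
        w, h = int(size.find('width').text), int(size.find('height').text)
        for obj in root.iter('object'):
            b = obj.find('bndbox')
            x1, y1 = float(b.find('xmin').text), float(b.find('ymin').text)
            x2, y2 = float(b.find('xmax').text), float(b.find('ymax').text)
            # Flag boxes whose top-left is 0 (VOC coordinates are 1-based) or that spill outside the image.
            if x1 < 1 or y1 < 1 or x2 > w or y2 > h or x2 <= x1 or y2 <= y1:
                print(xml_file, x1, y1, x2, y2)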


Faster R-CNN subtracts 1 from Xmin, Ymin, Xmax and Ymax when it loads the annotations (to turn the 1-based VOC coordinates into 0-based pixel indexes).

Because the box array is stored as an unsigned 16-bit integer, an Xmin of 0 underflows after the subtraction and becomes 65535, which later trips the assertion.
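A minimal demonstration of the wrap-around, assuming the box array uses NumPy's uint16 as in the stock pascal_voc.py:

    import numpy as np

    boxes = np.zeros((1, 4), dtype=np.uint16)  # same dtype as the stock annotation loader
    print(boxes[:, 0] - 1)                     # [65535] -- unsigned arithmetic wraps around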
The fix:
1. Edit the append_flipped_images() function in lib/datasets/imdb.py. Directly below the line boxes[:, 2] = widths[i] - oldx1 - 1, add the following (the surrounding function is sketched after the snippet for context):
    for b in range(len(boxes)):
        if boxes[b][2] < boxes[b][0]:
            boxes[b][0] = 0
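For orientation, the relevant part of the patched function then looks roughly like this; it is a sketch based on the stock py-faster-rcnn source (Python 2, hence xrange), not a verbatim copy:

    def append_flipped_images(self):
        num_images = self.num_images
        widths = self._get_widths()
        for i in xrange(num_images):
            boxes = self.roidb[i]['boxes'].copy()
            oldx1 = boxes[:, 0].copy()
            oldx2 = boxes[:, 2].copy()
            boxes[:, 0] = widths[i] - oldx2 - 1
            boxes[:, 2] = widths[i] - oldx1 - 1
            # Added: if the flip produced x2 < x1 (the symptom of a wrapped coordinate),
            # reset x1 to 0 so the assertion below passes.
            for b in range(len(boxes)):
                if boxes[b][2] < boxes[b][0]:
                    boxes[b][0] = 0
            assert (boxes[:, 2] >= boxes[:, 0]).all()
            # ... rest of the function unchanged ...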
2. Edit the _load_pascal_annotation() function in lib/datasets/pascal_voc.py: remove the minus-one applied to Xmin, Ymin, Xmax and Ymax, so the coordinate parsing no longer subtracts 1:
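(The block below is a sketch based on the stock py-faster-rcnn source; in the unmodified repository each of the four coordinate lines ends with an extra "- 1".)

    for ix, obj in enumerate(objs):
        bbox = obj.find('bndbox')
        # "- 1" removed so an xmin or ymin of 0 can no longer underflow the uint16 box array
        x1 = float(bbox.find('xmin').text)
        y1 = float(bbox.find('ymin').text)
        x2 = float(bbox.find('xmax').text)
        y2 = float(bbox.find('ymax').text)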

3. (Optional; only needed if steps 1 and 2 do not already fix the problem.) Edit lib/fast_rcnn/config.py so that horizontally flipped images are not used during training, changing the setting to:
# Use horizontally-flipped images during training?
__C.TRAIN.USE_FLIPPED = False
Problem solved.
Reposted from: http://blog.youkuaiyun.com/xzzppp/article/details/52036794
