持续性的调用GPU:os.environ["CUDA_VISIBLE_DEVICES"] = "1,0"(注意使用英文直引号,中文引号会导致 SyntaxError)

本文介绍两种GPU调用方法,一种适用于持续性任务,通过设置环境变量实现多GPU切换;另一种适用于一次性任务,直接通过系统调用指定GPU。掌握这些技巧有助于优化深度学习模型训练过程。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

持续性的调用GPU:

os.environ['CUDA_VISIBLE_DEVICES'] = '1,0'

如果调用一次GPU:

在启动命令前指定:CUDA_VISIBLE_DEVICES=0 python train.py(注意:原文的 os.system('CUDA_VISIBLE__DEVICES = 0') 有两处问题——变量名多了一个下划线,且 os.system 在子 shell 中设置环境变量,不会影响当前 Python 进程)
import cv2
import mediapipe as mp
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
from datetime import datetime
import math
from scipy.spatial import distance as dist
import librosa
import noisereduce as nr
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from pydub import AudioSegment
import wave
import contextlib
import joblib

# Initialize the MediaPipe pose model.
mp_pose = mp.solutions.pose
mp_drawing = mp.solutions.drawing_utils
pose = mp_pose.Pose(
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5,
    model_complexity=2
)

# Recognized posture categories. Order matters: the list index is used as the
# class label for the learned classifier.
POSTURE_TYPES = [
    "双臂放下", "单手指点", "双手指点", "双手合掌",
    "左侧身", "右侧身", "背身", "上举手臂", "无人"
]


# ====================== Posture analysis module (with auto-learning) ======================
class PostureAnalyzer:
    """Tracks a presenter's posture across video frames.

    Maintains per-posture frame counters, transition counts and stability
    statistics. When ``auto_learn`` is True it also collects feature samples
    and periodically trains a clustering-based classifier, persisted to
    ``posture_model.joblib``.
    """

    def __init__(self, auto_learn=True):
        self.posture_history = []
        self.current_posture = "未知"
        # Frames spent in each posture (video is sampled, see main()).
        self.posture_timers = {p: 0 for p in POSTURE_TYPES}
        self.last_posture_time = 0
        self.transition_count = 0
        self.shoulder_instability = 0  # frames with visibly uneven shoulders
        self.leg_instability = 0       # frames with unstable hips/knees
        self.auto_learn = auto_learn

        # Rule-based thresholds (normalized image coordinates / degrees).
        self.POSTURE_THRESHOLDS = {
            "双臂放下": {"shoulder_angle": (160, 200), "elbow_angle": (160, 200)},
            "单手指点": {"elbow_angle": (60, 120), "wrist_height": 0.7},
            "双手指点": {"elbow_angle": (60, 120), "wrist_distance": 0.2},
            "双手合掌": {"wrist_distance": (0.05, 0.15), "hand_height": (0.3, 0.7)},
            "左侧身": {"shoulder_hip_angle": (70, 110), "hip_knee_angle": (160, 200)},
            "右侧身": {"shoulder_hip_angle": (70, 110), "hip_knee_angle": (160, 200)},
            "背身": {"visibility": (0.5, 1.0)},
            "上举手臂": {"elbow_angle": (30, 90), "wrist_height": 0.8}
        }

        # Auto-learning state.
        self.posture_features = {p: [] for p in POSTURE_TYPES}
        self.posture_classifier = None
        self.scaler = StandardScaler()
        # BUGFIX: KMeans cluster ids are arbitrary and do NOT correspond to
        # POSTURE_TYPES indices; this map translates cluster id -> label index.
        self.cluster_to_label = {}

        # Load a previously learned model if one exists.
        if os.path.exists("posture_model.joblib"):
            self.load_learning_model()

    def save_learning_model(self):
        """Persist the classifier, scaler, thresholds and cluster mapping."""
        model_data = {
            'classifier': self.posture_classifier,
            'scaler': self.scaler,
            'thresholds': self.POSTURE_THRESHOLDS,
            'cluster_map': self.cluster_to_label
        }
        joblib.dump(model_data, "posture_model.joblib")
        print("姿态模型已保存")

    def load_learning_model(self):
        """Load a saved model; tolerates older files without 'cluster_map'."""
        model_data = joblib.load("posture_model.joblib")
        self.posture_classifier = model_data['classifier']
        self.scaler = model_data['scaler']
        self.POSTURE_THRESHOLDS = model_data['thresholds']
        self.cluster_to_label = model_data.get('cluster_map', {})
        print("姿态模型已加载")

    def train_classifier(self):
        """Train the posture classifier from the collected feature samples."""
        X, y = [], []
        for posture, features_list in self.posture_features.items():
            if len(features_list) > 10:  # require enough samples per class
                for features in features_list:
                    X.append(features)
                    y.append(POSTURE_TYPES.index(posture))

        if len(X) < 100:  # not enough data overall
            print("样本不足,无法训练分类器")
            return

        X = self.scaler.fit_transform(X)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        self.posture_classifier = KMeans(n_clusters=len(POSTURE_TYPES), random_state=42)
        self.posture_classifier.fit(X_train)

        # BUGFIX: KMeans is unsupervised, so cluster index i has no relation to
        # POSTURE_TYPES[i]. Map each cluster to the majority true label of its
        # training members so predictions can be decoded.
        assignments = self.posture_classifier.predict(X_train)
        self.cluster_to_label = {}
        for cluster in range(len(POSTURE_TYPES)):
            members = [label for label, c in zip(y_train, assignments) if c == cluster]
            self.cluster_to_label[cluster] = (
                max(set(members), key=members.count) if members else cluster
            )

        # NOTE: KMeans.score returns negative inertia (higher = tighter
        # clusters), not classification accuracy.
        train_score = self.posture_classifier.score(X_train)
        test_score = self.posture_classifier.score(X_test)
        print(f"姿态分类器训练完成 - 训练得分: {train_score:.2f}, 测试得分: {test_score:.2f}")

        self.save_learning_model()

    def extract_features(self, keypoints):
        """Build the feature vector for one frame from MediaPipe keypoints.

        ``keypoints`` maps landmark index -> (x, y, visibility) in normalized
        image coordinates.
        """
        left_shoulder = keypoints[mp_pose.PoseLandmark.LEFT_SHOULDER.value]
        right_shoulder = keypoints[mp_pose.PoseLandmark.RIGHT_SHOULDER.value]
        left_elbow = keypoints[mp_pose.PoseLandmark.LEFT_ELBOW.value]
        right_elbow = keypoints[mp_pose.PoseLandmark.RIGHT_ELBOW.value]
        left_wrist = keypoints[mp_pose.PoseLandmark.LEFT_WRIST.value]
        right_wrist = keypoints[mp_pose.PoseLandmark.RIGHT_WRIST.value]
        left_hip = keypoints[mp_pose.PoseLandmark.LEFT_HIP.value]
        right_hip = keypoints[mp_pose.PoseLandmark.RIGHT_HIP.value]
        left_knee = keypoints[mp_pose.PoseLandmark.LEFT_KNEE.value]
        right_knee = keypoints[mp_pose.PoseLandmark.RIGHT_KNEE.value]

        features = [
            # Shoulder features
            abs(left_shoulder[1] - right_shoulder[1]),  # shoulder unevenness
            dist.euclidean((left_shoulder[0], left_shoulder[1]),
                           (right_shoulder[0], right_shoulder[1])),  # shoulder width
            # Arm features
            self.calculate_angle(left_shoulder, left_elbow, left_wrist),   # left elbow angle
            self.calculate_angle(right_shoulder, right_elbow, right_wrist),  # right elbow angle
            left_wrist[1],   # left wrist height
            right_wrist[1],  # right wrist height
            dist.euclidean((left_wrist[0], left_wrist[1]),
                           (right_wrist[0], right_wrist[1])),  # wrist-to-wrist distance
            # Body features
            self.calculate_angle(left_shoulder, left_hip, left_knee),   # left hip angle
            self.calculate_angle(right_shoulder, right_hip, right_knee),  # right hip angle
            abs(left_hip[1] - right_hip[1]),    # hip unevenness
            abs(left_knee[1] - right_knee[1]),  # knee unevenness
        ]
        return features

    def calculate_angle(self, a, b, c):
        """Return the angle (degrees, 0-180) at vertex ``b`` formed by a-b-c."""
        radians = np.arctan2(c[1] - b[1], c[0] - b[0]) - np.arctan2(a[1] - b[1], a[0] - b[0])
        angle = np.abs(radians * 180.0 / np.pi)
        return angle if angle < 180 else 360 - angle

    def analyze_frame(self, frame, timestamp):
        """Analyze one BGR video frame; returns MediaPipe results or None."""
        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = pose.process(image)

        if not results.pose_landmarks:
            self.current_posture = "无人"
            return None

        landmarks = results.pose_landmarks.landmark
        keypoints = {idx: (lm.x, lm.y, lm.visibility) for idx, lm in enumerate(landmarks)}

        if self.posture_classifier and self.auto_learn:
            # Learned classifier path.
            features = self.extract_features(keypoints)
            scaled_features = self.scaler.transform([features])
            cluster = int(self.posture_classifier.predict(scaled_features)[0])
            # BUGFIX: decode the cluster id through the learned mapping.
            posture = POSTURE_TYPES[self.cluster_to_label.get(cluster, cluster)]
        else:
            # Rule-based path.
            posture = self.detect_posture(keypoints)

        # Collect samples for online learning; retrain every 50 samples/class.
        if self.auto_learn:
            features = self.extract_features(keypoints)
            self.posture_features[posture].append(features)
            if len(self.posture_features[posture]) % 50 == 0:
                self.train_classifier()

        # Posture-change detection.
        if posture != self.current_posture:
            self.transition_count += 1
            self.current_posture = posture
            self.last_posture_time = timestamp

        self.posture_timers[posture] += 1
        self.analyze_stability(keypoints, timestamp)
        return results

    def is_pointing_gesture(self, elbow, wrist):
        """Heuristic pointing check: wrist raised and above the elbow.

        NOTE(review): this method was called but never defined in the original
        source (guaranteed AttributeError). The threshold mirrors
        POSTURE_THRESHOLDS['单手指点']['wrist_height'] — confirm on real footage.
        """
        return wrist[1] < 0.7 and wrist[1] < elbow[1]

    def detect_body_orientation(self, keypoints):
        """Infer body orientation from shoulder landmark visibility.

        NOTE(review): this method was called but never defined in the original
        source. Low visibility of both shoulders is read as back-turned, using
        the '背身' visibility threshold; side assignment is a heuristic —
        confirm on real footage.
        """
        left_shoulder = keypoints[mp_pose.PoseLandmark.LEFT_SHOULDER.value]
        right_shoulder = keypoints[mp_pose.PoseLandmark.RIGHT_SHOULDER.value]
        if left_shoulder[2] < 0.5 and right_shoulder[2] < 0.5:
            return "背身"
        if right_shoulder[2] < 0.5:
            return "左侧身"
        if left_shoulder[2] < 0.5:
            return "右侧身"
        return "正面"

    def detect_posture(self, keypoints):
        """Rule-based posture classification from keypoints."""
        left_elbow = keypoints[mp_pose.PoseLandmark.LEFT_ELBOW.value]
        right_elbow = keypoints[mp_pose.PoseLandmark.RIGHT_ELBOW.value]
        left_wrist = keypoints[mp_pose.PoseLandmark.LEFT_WRIST.value]
        right_wrist = keypoints[mp_pose.PoseLandmark.RIGHT_WRIST.value]

        # 1. Both wrists high in the frame (small y = higher) -> arms raised.
        if left_wrist[1] < 0.3 and right_wrist[1] < 0.3:
            return "上举手臂"

        # 2. Pointing gestures. BUGFIX: the original only produced a result when
        # the LEFT hand pointed; a right-hand-only point was never detected.
        left_pointing = self.is_pointing_gesture(left_elbow, left_wrist)
        right_pointing = self.is_pointing_gesture(right_elbow, right_wrist)
        if left_pointing and right_pointing:
            return "双手指点"
        if left_pointing or right_pointing:
            return "单手指点"

        # 3. Wrists close together -> hands clasped.
        if dist.euclidean((left_wrist[0], left_wrist[1]),
                          (right_wrist[0], right_wrist[1])) < 0.1:
            return "双手合掌"

        # 4. Body orientation (side-on / back-turned).
        body_orientation = self.detect_body_orientation(keypoints)
        if body_orientation != "正面":
            return body_orientation

        # Default posture.
        return "双臂放下"

    def analyze_stability(self, keypoints, timestamp):
        """Accumulate shoulder/leg instability counters for this frame."""
        # Shoulder unevenness.
        left_shoulder = keypoints[mp_pose.PoseLandmark.LEFT_SHOULDER.value]
        right_shoulder = keypoints[mp_pose.PoseLandmark.RIGHT_SHOULDER.value]
        shoulder_diff = abs(left_shoulder[1] - right_shoulder[1])
        if shoulder_diff > 0.08:  # normalized-coordinate threshold
            self.shoulder_instability += 1

        # Leg sway detection.
        left_hip = keypoints[mp_pose.PoseLandmark.LEFT_HIP.value]
        right_hip = keypoints[mp_pose.PoseLandmark.RIGHT_HIP.value]
        left_knee = keypoints[mp_pose.PoseLandmark.LEFT_KNEE.value]
        right_knee = keypoints[mp_pose.PoseLandmark.RIGHT_KNEE.value]
        hip_diff = abs(left_hip[1] - right_hip[1])
        knee_diff = abs(left_knee[1] - right_knee[1])
        if hip_diff > 0.1 or knee_diff > 0.15:
            self.leg_instability += 1


# ====================== Speech analysis module (local processing) ======================
class SpeechAnalyzer:
    """Scores per-character pronunciation of an audio recording against a
    reference transcript, optionally adapting a small Keras model online."""

    def __init__(self, reference_text, auto_adapt=True):
        self.reference_text = reference_text
        self.phoneme_accuracy = []  # list of (status, char) tuples
        self.pronunciation_model = None
        self.auto_adapt = auto_adapt
        self.adaptive_model_path = "pronunciation_model.h5"

        # Load a previously adapted model if one exists.
        if os.path.exists(self.adaptive_model_path):
            self.pronunciation_model = tf.keras.models.load_model(self.adaptive_model_path)
            print("已加载自适应发音模型")

    def save_adaptive_model(self):
        """Persist the adaptive pronunciation model to disk."""
        if self.pronunciation_model:
            self.pronunciation_model.save(self.adaptive_model_path)
            print("自适应发音模型已保存")

    def extract_mfcc(self, audio_path, n_mfcc=13):
        """Return mean+std MFCC features for one audio file, or None on error."""
        try:
            y, sr = librosa.load(audio_path, sr=None)
            y = nr.reduce_noise(y=y, sr=sr)  # denoise before feature extraction
            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
            mfcc_mean = np.mean(mfcc, axis=1)
            mfcc_std = np.std(mfcc, axis=1)
            return np.concatenate([mfcc_mean, mfcc_std])
        except Exception as e:
            # Best-effort: a bad segment is skipped by the caller.
            print(f"音频处理错误: {str(e)}")
            return None

    def build_pronunciation_model(self, input_dim):
        """Build the small binary pronunciation-quality model."""
        model = models.Sequential([
            layers.Dense(64, activation='relu', input_shape=(input_dim,)),
            layers.Dropout(0.3),
            layers.Dense(32, activation='relu'),
            layers.Dense(1, activation='sigmoid')
        ])
        model.compile(optimizer='adam',
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
        return model

    def train_adaptive_model(self, new_features, new_labels):
        """Fine-tune (or create) the adaptive model on new labeled samples."""
        if not self.pronunciation_model:
            input_dim = len(new_features[0])
            self.pronunciation_model = self.build_pronunciation_model(input_dim)

        new_features = np.array(new_features)
        new_labels = np.array(new_labels)
        self.pronunciation_model.fit(
            new_features, new_labels,
            epochs=10, batch_size=16,
            validation_split=0.2, verbose=0
        )
        self.save_adaptive_model()

    def analyze_audio(self, audio_path):
        """Score every character segment of the recording; returns the reference text."""
        # Split audio into per-character segments (simplified: equal time slices).
        char_audio_segments = self.split_audio_by_chars(audio_path, self.reference_text)

        for i, (segment_path, char) in enumerate(char_audio_segments):
            features = self.extract_mfcc(segment_path)
            if features is None:
                continue

            if self.pronunciation_model and self.auto_adapt:
                # Adaptive model path.
                prediction = self.pronunciation_model.predict(np.array([features]))[0][0]
                is_correct = 1 if prediction > 0.7 else 0
            else:
                # Rule-based fallback (placeholder: random).
                is_correct = 1 if np.random.random() > 0.3 else 0

            status = "正确" if is_correct == 1 else "错误"
            self.phoneme_accuracy.append((status, char))

            # Collect samples for adaptation. NOTE(review): real use needs
            # expert labels; here the first 20% is assumed correct.
            if self.auto_adapt:
                label = 1 if i < len(self.reference_text) * 0.2 else 0
                if i % 10 == 0:
                    self.train_adaptive_model([features], [label])

        return self.reference_text

    def split_audio_by_chars(self, audio_path, text):
        """Split the WAV into one equal-length segment per character.

        Simplified stand-in for real forced alignment; writes ``char_<i>.wav``
        temp files and returns [(segment_path, char), ...].
        """
        # Total duration from the WAV header.
        with contextlib.closing(wave.open(audio_path, 'r')) as f:
            frames = f.getnframes()
            rate = f.getframerate()
            duration = frames / float(rate)

        char_duration = duration / len(text)

        segments = []
        audio = AudioSegment.from_wav(audio_path)
        for i, char in enumerate(text):
            start = i * char_duration * 1000  # milliseconds
            end = (i + 1) * char_duration * 1000
            segment = audio[start:end]
            segment_path = f"char_{i}.wav"
            segment.export(segment_path, format="wav")
            segments.append((segment_path, char))
        return segments


# ====================== Report generation module ======================
class ReportGenerator:
    """Combines posture and speech analysis results into a text report and charts.

    Time conversions below divide frame counts by 30 (assumed 30 fps source);
    NOTE(review): pass the real fps through if it differs.
    """

    def __init__(self, posture_analyzer, speech_analyzer, video_duration):
        self.posture = posture_analyzer
        self.speech = speech_analyzer
        self.video_duration = video_duration

    def generate_report(self):
        """Assemble the full report dict and write the chart images."""
        report = {
            "基本统计": self.basic_statistics(),
            "姿态分析": self.posture_analysis(),
            "语音分析": self.speech_analysis(),
            "教学行为分析": self.teaching_behavior_analysis(),
            "改进建议": self.suggestions()
        }
        self.generate_visualizations()
        return report

    def basic_statistics(self):
        """Top-level counters and rates."""
        minutes = self.video_duration / 60
        return {
            "分析日期": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "视频时长(秒)": self.video_duration,
            "姿态变化总次数": self.posture.transition_count,
            # BUGFIX: guard zero-length video against ZeroDivisionError.
            "平均姿态变化频率(次/分钟)": self.posture.transition_count / minutes if minutes > 0 else 0,
            "肩部不平总时长(秒)": self.posture.shoulder_instability / 30,  # assumes 30 fps
            "腿部不稳总时长(秒)": self.posture.leg_instability / 30
        }

    def posture_analysis(self):
        """Percentage of analyzed frames spent in each posture."""
        total_frames = sum(self.posture.posture_timers.values())
        # BUGFIX: avoid ZeroDivisionError when no frames were analyzed.
        if total_frames == 0:
            return {p: "0.00%" for p in self.posture.posture_timers}
        return {
            posture: f"{(count / total_frames) * 100:.2f}%"
            for posture, count in self.posture.posture_timers.items()
        }

    def speech_analysis(self):
        """Pronunciation accuracy summary."""
        total_chars = len(self.speech.phoneme_accuracy)
        correct_chars = sum(1 for status, _ in self.speech.phoneme_accuracy if status == "正确")
        accuracy = (correct_chars / total_chars) * 100 if total_chars > 0 else 0
        return {
            "总字数": total_chars,
            "正确字数": correct_chars,
            "普通话准确率": f"{accuracy:.2f}%",
            "详细发音分析": self.speech.phoneme_accuracy[:50]  # first 50 chars only
        }

    def teaching_behavior_analysis(self):
        """Derive teaching-behavior estimates from posture statistics."""
        writing_estimate = self.posture.posture_timers["背身"] / 30 * 0.7  # assume 70% of back-turned time is board writing
        return {
            "板书行为(秒)": writing_estimate,
            "讲授行为(秒)": (self.posture.posture_timers["双臂放下"] +
                             self.posture.posture_timers["单手指点"]) / 30,
            "问答行为(次)": self.posture.posture_timers["双手合掌"] / 30 * 2,  # rough estimate
            "教学激励行为": "高" if self.posture.posture_timers["上举手臂"] > 100 else "中",
            "课堂组织效率": self.calculate_classroom_organization(),
            "教学活动多样性": self.calculate_activity_diversity(),
            "教学技能评分": self.calculate_teaching_skill(),
            "时间分配合理性": self.calculate_time_distribution(),
            "安全教学指数": self.calculate_safety_index()
        }

    def calculate_classroom_organization(self):
        """Classroom organization score from posture-change / speech ratio."""
        posture_changes = self.posture.transition_count
        speech_char_count = len(self.speech.phoneme_accuracy)
        if speech_char_count == 0:
            return 0
        efficiency = min(100, 80 + (posture_changes / speech_char_count) * 20)
        return f"{efficiency:.1f}/100"

    def calculate_activity_diversity(self):
        """Fraction of posture categories actually observed.

        NOTE(review): this method (and the three below) were called by
        teaching_behavior_analysis but never defined in the original source,
        crashing report generation; these are simple placeholder metrics.
        """
        observed = sum(1 for count in self.posture.posture_timers.values() if count > 0)
        return f"{observed}/{len(POSTURE_TYPES)}"

    def calculate_teaching_skill(self):
        """Placeholder skill score from gesture activity (see NOTE above)."""
        gesture_frames = (self.posture.posture_timers["单手指点"] +
                          self.posture.posture_timers["双手指点"] +
                          self.posture.posture_timers["上举手臂"])
        score = min(100, 60 + gesture_frames / 30)
        return f"{score:.1f}/100"

    def calculate_time_distribution(self):
        """Placeholder check on board-writing share of total time (see NOTE above)."""
        back_seconds = self.posture.posture_timers["背身"] / 30
        ratio = back_seconds / self.video_duration if self.video_duration > 0 else 0
        return "合理" if ratio < 0.3 else "板书占比偏高"

    def calculate_safety_index(self):
        """Placeholder stability-based index (see NOTE above)."""
        unstable_seconds = (self.posture.shoulder_instability +
                            self.posture.leg_instability) / 30
        index = max(0, 100 - unstable_seconds)
        return f"{index:.1f}/100"

    def generate_visualizations(self):
        """Write the three summary charts as PNG files."""
        # Posture distribution pie chart.
        plt.figure(figsize=(10, 7))
        postures = list(self.posture.posture_timers.keys())
        counts = [self.posture.posture_timers[p] for p in postures]
        plt.pie(counts, labels=postures, autopct='%1.1f%%')
        plt.title('教师姿态分布')
        plt.savefig('posture_distribution.png')

        # Pronunciation accuracy bar chart.
        plt.figure(figsize=(12, 6))
        status_counts = {
            "正确": sum(1 for s, _ in self.speech.phoneme_accuracy if s == "正确"),
            "错误": sum(1 for s, _ in self.speech.phoneme_accuracy if s == "错误")
        }
        plt.bar(status_counts.keys(), status_counts.values())
        plt.title('发音准确率分析')
        plt.savefig('pronunciation_accuracy.png')

        # Teaching-behavior time allocation chart.
        plt.figure(figsize=(12, 6))
        behaviors = {
            "板书": self.posture.posture_timers["背身"] / 30 * 0.7,
            "讲授": (self.posture.posture_timers["双臂放下"] +
                     self.posture.posture_timers["单手指点"]) / 30,
            "互动": self.posture.posture_timers["双手合掌"] / 30 * 2,
            "激励": self.posture.posture_timers["上举手臂"] / 30
        }
        plt.bar(behaviors.keys(), behaviors.values())
        plt.title('教学行为时间分配')
        plt.ylabel('时间(秒)')
        plt.savefig('teaching_behaviors.png')

    def suggestions(self):
        """Generate improvement suggestions from the collected statistics."""
        suggestions = []

        # Posture-related suggestions.
        if self.posture.shoulder_instability / 30 > 60:  # over 60 seconds
            suggestions.append("肩部不平时间较长,建议注意保持肩部平衡")
        if self.posture.posture_timers["背身"] / 30 > 120:  # over 2 minutes
            suggestions.append("背向学生时间过长,建议增加面向学生的时间")
        if self.posture.posture_timers["上举手臂"] / 30 < 30:  # under 30 seconds
            suggestions.append("教学激励行为不足,建议增加手势互动")

        # Speech-related suggestions.
        correct_count = sum(1 for s, _ in self.speech.phoneme_accuracy if s == "正确")
        accuracy = (correct_count / len(self.speech.phoneme_accuracy) * 100
                    if len(self.speech.phoneme_accuracy) > 0 else 0)
        if accuracy < 90:
            suggestions.append(f"普通话准确率({accuracy:.1f}%)有待提高,建议加强发音练习")

        # Behavior-balance suggestion.
        writing_time = self.posture.posture_timers["背身"] / 30 * 0.7
        teaching_time = (self.posture.posture_timers["双臂放下"] +
                         self.posture.posture_timers["单手指点"]) / 30
        if writing_time > teaching_time * 0.5:
            suggestions.append("板书时间过长,建议平衡板书与讲解的比例")

        return suggestions


# ====================== Main execution flow ======================
def main(video_path, audio_path, reference_text):
    """Run the full pipeline: video posture analysis, audio analysis, report."""
    posture_analyzer = PostureAnalyzer(auto_learn=True)
    speech_analyzer = SpeechAnalyzer(reference_text, auto_adapt=True)

    print("开始视频分析...")
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("无法打开视频文件")
        return

    # BUGFIX: some containers report 0 fps; fall back to 30 to avoid
    # ZeroDivisionError in timestamp/duration math.
    fps = cap.get(cv2.CAP_PROP_FPS) or 30
    frame_count = 0
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    while cap.isOpened():
        success, frame = cap.read()
        if not success:
            break

        # Progress display (guard total_frames == 0 for streams without metadata).
        if frame_count % 100 == 0 and total_frames > 0:
            print(f"视频分析进度: {frame_count}/{total_frames} ({frame_count / total_frames * 100:.1f}%)")

        # Analyze every 5th frame for performance.
        if frame_count % 5 == 0:
            timestamp = frame_count / fps
            posture_analyzer.analyze_frame(frame, timestamp)

        frame_count += 1

    video_duration = frame_count / fps
    cap.release()
    print("视频分析完成")

    print("开始音频分析...")
    recognized_text = speech_analyzer.analyze_audio(audio_path)
    print("音频分析完成")

    # Build and save the report.
    report_generator = ReportGenerator(posture_analyzer, speech_analyzer, video_duration)
    report = report_generator.generate_report()

    with open("teaching_analysis_report.txt", "w", encoding="utf-8") as f:
        f.write("=============== 教师授课分析报告 ===============\n\n")
        for section, content in report.items():
            f.write(f"=== {section} ===\n")
            if isinstance(content, dict):
                for k, v in content.items():
                    f.write(f"{k}: {v}\n")
            elif isinstance(content, list):
                for item in content:
                    f.write(f"- {item}\n")
            else:
                f.write(f"{content}\n")
            f.write("\n")

    print("分析报告已生成: teaching_analysis_report.txt")
    print("可视化图表已保存: posture_distribution.png, pronunciation_accuracy.png, teaching_behaviors.png")


if __name__ == "__main__":
    # Configuration.
    VIDEO_PATH = "D:/java/桌面资源/666/11.mp4"   # video
    AUDIO_PATH = "D:/java/桌面资源/666/11.wav"   # audio
    REFERENCE_TEXT = "这里是教师的标准讲稿文本,用于发音对比..."  # reference script

    main(VIDEO_PATH, AUDIO_PATH, REFERENCE_TEXT)
最新发布
07-16
RuntimeError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1269, internal error, NCCL version 2.14.3 ncclInternalError: Internal check failed. Last error: Duplicate GPU detected : rank 7 and rank 0 both on CUDA device 21000 work = default_pg.broadcast([tensor], opts) RuntimeError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1269, internal error, NCCL version 2.14.3 ncclInternalError: Internal check failed. Last error: Duplicate GPU detected : rank 6 and rank 0 both on CUDA device 21000 work = default_pg.broadcast([tensor], opts) RuntimeError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1269, internal error, NCCL version 2.14.3 ncclInternalError: Internal check failed. Last error: Duplicate GPU detected : rank 5 and rank 0 both on CUDA device 21000 work = default_pg.broadcast([tensor], opts) RuntimeError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1269, internal error, NCCL version 2.14.3 ncclInternalError: Internal check failed. Last error: Duplicate GPU detected : rank 4 and rank 0 both on CUDA device 21000 work = default_pg.broadcast([tensor], opts) RuntimeError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1269, internal error, NCCL version 2.14.3 ncclInternalError: Internal check failed. Last error: Duplicate GPU detected : rank 1 and rank 0 both on CUDA device 21000 work = default_pg.broadcast([tensor], opts) RuntimeError : work = default_pg.broadcast([tensor], opts)NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1269, internal error, NCCL version 2.14.3 ncclInternalError: Internal check failed. Last error: Duplicate GPU detected : rank 2 and rank 0 both on CUDA device 21000 RuntimeError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1269, internal error, NCCL version 2.14.3 ncclInternalError: Internal check failed. 
Last error: Duplicate GPU detected : rank 0 and rank 1 both on CUDA device 21000work = default_pg.broadcast([tensor], opts) RuntimeError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1269, internal error, NCCL version 2.14.3 ncclInternalError: Internal check failed. Last error: Duplicate GPU detected : rank 3 and rank 0 both on CUDA device 21000 ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 779602) of binary: /home/wangbaihui/anaconda3/envs/sparsed/bin/python3 Traceback (most recent call last): File "/home/wangbaihui/anaconda3/envs/sparsed/lib/python3.8/runpy.py", line 194, in _run_module_as_main return _run_code(code, main_globals, None, File "/home/wangbaihui/anaconda3/envs/sparsed/lib/python3.8/runpy.py", line 87, in _run_code exec(code, run_globals) File "/home/wangbaihui/anaconda3/envs/sparsed/lib/python3.8/site-packages/torch/distributed/launch.py", line 195, in <module> main() File "/home/wangbaihui/anaconda3/envs/sparsed/lib/python3.8/site-packages/torch/distributed/launch.py", line 191, in main launch(args) File "/home/wangbaihui/anaconda3/envs/sparsed/lib/python3.8/site-packages/torch/distributed/launch.py", line 176, in launch run(args) File "/home/wangbaihui/anaconda3/envs/sparsed/lib/python3.8/site-packages/torch/distributed/run.py", line 753, in run elastic_launch( File "/home/wangbaihui/anaconda3/envs/sparsed/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ return launch_agent(self._config, self._entrypoint, list(args)) File "/home/wangbaihui/anaconda3/envs/sparsed/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 246, in launch_agent raise ChildFailedError( torch.distributed.elastic.multiprocessing.errors.ChildFailedError: ============================================================ ./tools/train.py FAILED
06-17
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值