import cv2
from deepface import DeepFace
from vosk import Model, KaldiRecognizer
import json
import os
import wave
import librosa
import torch
from collections import Counter
from pydub import AudioSegment
from transformers import pipeline, Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
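# Dependency sketch (an assumption about the environment; nothing below checks it):
#   pip install opencv-python deepface vosk transformers torch librosa pydub
# pydub additionally needs the ffmpeg binary on PATH to decode MP4 audio tracks.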
# -----------------------------
# Step 1: frame extraction + image emotion analysis
# -----------------------------
def extract_and_analyze_image_emotion(video_path, interval=1):
    print("🖼️ Extracting frames and analyzing image emotion...")
    frame_dir = "image_frames"
    os.makedirs(frame_dir, exist_ok=True)
    cap = cv2.VideoCapture(video_path)
    fps = int(cap.get(cv2.CAP_PROP_FPS)) or 25  # guard against containers that report 0 fps
    image_emotions = {}
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        current_frame = int(cap.get(cv2.CAP_PROP_POS_FRAMES))
        if current_frame % (fps * interval) == 0:
            sec = current_frame // fps
            frame_path = os.path.join(frame_dir, f"frame_{sec:04d}.jpg")
            cv2.imwrite(frame_path, frame)
            try:
                result = DeepFace.analyze(img_path=frame_path, actions=["emotion"], enforce_detection=False)
                # Newer DeepFace versions return a list of dicts; older ones return a single dict
                emotion = result[0]["dominant_emotion"] if isinstance(result, list) else result["dominant_emotion"]
            except Exception:
                emotion = "unrecognized"
            image_emotions[sec] = emotion
            print(f"📸 Frame {current_frame} -> emotion: {emotion}")
    cap.release()
    return image_emotions
# -----------------------------
# Load the wav2vec2 speech emotion model
# -----------------------------
def load_wav2vec2_emotion_model():
    model_path = "models/wav2vec2-large-xlsr-53-chinese-zh-cn"
    print(f"🧠 Loading local speech emotion recognition model: {model_path}")
    # Load the preprocessor and model from the local path.
    # Caution: this checkpoint name suggests an ASR model; unless the local copy
    # was fine-tuned for 4-class emotion classification, the head created by
    # num_labels=4 is randomly initialized and its predictions are meaningless.
    processor = Wav2Vec2Processor.from_pretrained(model_path)
    model = Wav2Vec2ForSequenceClassification.from_pretrained(model_path, num_labels=4)
    model.eval()
    return processor, model
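# A minimal alternative sketch (assumption: you have a checkpoint that was
# actually fine-tuned for audio emotion, e.g. a hypothetical
# models/wav2vec2-emotion directory). The generic audio-classification pipeline
# reads the label names from the model config, so no hand-maintained label list
# is needed:
#
#     from transformers import pipeline
#     audio_clf = pipeline("audio-classification", model="models/wav2vec2-emotion")
#     print(audio_clf("audio_segments/seg_0000.wav")[0])  # {'label': ..., 'score': ...}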
# -----------------------------
# Step 2: extract audio and analyze audio emotion
# -----------------------------
def extract_and_analyze_audio_emotion(video_path, interval=1):
    print("🔊 Extracting audio and analyzing emotion...")
    # Step 1: extract the audio track with pydub (no moviepy needed); resample to
    # 16 kHz mono up front, since both wav2vec2 and Vosk expect that format
    audio = AudioSegment.from_file(video_path)
    audio = audio.set_frame_rate(16000).set_channels(1)
    audio_path = "extracted_audio.wav"
    audio.export(audio_path, format="wav")
    print(f"✅ Audio saved to {audio_path}")
    # Step 2: slice into one segment per `interval` seconds
    segment_dir = "audio_segments"
    os.makedirs(segment_dir, exist_ok=True)
    segment_length_ms = interval * 1000
    segments = []
    for i in range(0, len(audio), segment_length_ms):
        seg = audio[i:i + segment_length_ms]
        seg_path = os.path.join(segment_dir, f"seg_{i // segment_length_ms:04d}.wav")
        seg.export(seg_path, format="wav")
        segments.append(seg_path)
    # Step 3: load the local HuggingFace model
    try:
        processor, model = load_wav2vec2_emotion_model()
    except Exception as e:
        print(f"❌ Failed to load the local model; check the path and file integrity: {e}")
        return {}
    emotion_labels = ['angry', 'happy', 'neutral', 'sad']  # example classes; must match the model's training label order
    audio_emotions = {}
    for idx, path in enumerate(segments):
        try:
            speech, sr = librosa.load(path, sr=16000)
            input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values
            with torch.no_grad():
                logits = model(input_values).logits
            predicted_ids = torch.argmax(logits, dim=-1)
            emotion = emotion_labels[predicted_ids.item()]
            audio_emotions[idx] = emotion
            print(f"🕒 Second {idx} -> audio emotion: {emotion}")
        except Exception as e:
            print(f"❌ Audio analysis failed at second {idx}: {e}")
            audio_emotions[idx] = "unrecognized"
    return audio_emotions
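# Optional speed-up sketch (assumption: all segments fit in memory at once):
# Wav2Vec2Processor can pad a whole batch in a single call instead of looping
# one second at a time:
#
#     waves = [librosa.load(p, sr=16000)[0] for p in segments]
#     batch = processor(waves, sampling_rate=16000, return_tensors="pt", padding=True)
#     with torch.no_grad():
#         ids = model(batch.input_values).logits.argmax(dim=-1).tolist()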
# -----------------------------
# Step 3: speech-to-text + text emotion analysis
# -----------------------------
_vosk_model = None
_text_classifier = None

def analyze_text_emotion_from_audio(seg_path):
    global _vosk_model, _text_classifier
    vosk_model_path = "models/vosk-model-en-us-0.22-lgraph"  # make sure this is the correct Vosk model path
    if not os.path.exists(vosk_model_path):
        print("❌ Speech recognition model not found; download it into models/vosk-model-en-us-0.22-lgraph/ first")
        return "unrecognized"
    # Step 1: speech recognition with Vosk (the model is cached so it loads only once)
    if _vosk_model is None:
        _vosk_model = Model(vosk_model_path)
    wf = wave.open(seg_path, "rb")  # parses the WAV header instead of skipping a fixed 44 bytes
    rec = KaldiRecognizer(_vosk_model, wf.getframerate())
    while True:
        wav_data = wf.readframes(4000)
        if len(wav_data) == 0:
            break
        rec.AcceptWaveform(wav_data)
    wf.close()
    result = json.loads(rec.FinalResult())  # a single FinalResult is enough for one-second segments
    text = result.get("text", "")
    if not text.strip():
        return "no speech"
    # Step 2: sentiment analysis with a local BERT model (cached across calls).
    # Caution: the Vosk model above is English while bert-base-chinese is Chinese;
    # pick recognition and sentiment models for the same language.
    bert_model_path = "models/bert-base-chinese"  # change to your local BERT model directory
    try:
        if _text_classifier is None:
            _text_classifier = pipeline("sentiment-analysis", model=bert_model_path)
    except Exception as e:
        print(f"❌ Failed to load the BERT sentiment model: {e}")
        return "unrecognized"
    label = _text_classifier(text)[0]["label"].lower()
    return label if label in ["positive", "negative", "neutral"] else "unknown"
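# Note (assumption about the local checkpoint): a stock bert-base-chinese is a
# masked language model, so a sentiment-analysis pipeline built on it gets a
# randomly initialized head and generic "LABEL_0"/"LABEL_1" names; the whitelist
# above only works if the local copy was fine-tuned with an id2label mapping
# that uses positive/negative/neutral.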
# -----------------------------
# Step 4: merge everything and write a TXT report
# -----------------------------
def save_combined_report(image_emotions, audio_emotions, text_emotions, output_file="emotion_report.txt"):
    print("📄 Generating the combined emotion report...")
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("time(s)\timage_emotion\taudio_emotion\ttext_emotion\tcombined_emotion\n")
        all_times = sorted(set(image_emotions) | set(audio_emotions) | set(text_emotions))
        for sec in all_times:
            img_e = image_emotions.get(sec, "n/a")
            aud_e = audio_emotions.get(sec, "n/a")
            txt_e = text_emotions.get(sec, "n/a")
            # Simple majority vote across the three modalities
            votes = [img_e, aud_e, txt_e]
            final = Counter(votes).most_common(1)[0][0]
            f.write(f"{sec}\t{img_e}\t{aud_e}\t{txt_e}\t{final}\n")
            print(f"🕒 Second {sec} | image: {img_e} | audio: {aud_e} | text: {txt_e} | final: {final}")
# -----------------------------
# Main entry point
# -----------------------------
if __name__ == "__main__":
    # Automatically pick the first MP4 file in the current directory
    current_dir = os.getcwd()
    video_files = [f for f in os.listdir(current_dir) if f.lower().endswith(".mp4")]
    if not video_files:
        print("❌ No MP4 video file found in the current directory")
    else:
        video_path = os.path.join(current_dir, video_files[0])
        print(f"✅ Found video file: {video_path}")
        # Step 1: image emotion analysis
        image_emotions = extract_and_analyze_image_emotion(video_path)
        # Step 2: audio emotion analysis; this also extracts the audio track and
        # slices it into per-second files under audio_segments/
        audio_emotions = extract_and_analyze_audio_emotion(video_path)
        # Step 3: text emotion analysis, one pass over the audio segments
        text_emotions = {}
        segment_dir = "audio_segments"
        for seg_file in sorted(os.listdir(segment_dir), key=lambda x: int(x.split('_')[1].split('.')[0])):
            sec = int(seg_file.split('_')[1].split('.')[0])  # second index encoded in the filename
            seg_path = os.path.join(segment_dir, seg_file)
            try:
                text_emotions[sec] = analyze_text_emotion_from_audio(seg_path)
            except Exception as e:
                print(f"❌ Text analysis failed at second {sec}: {e}")
                text_emotions[sec] = "unrecognized"
        # Step 4: write the combined report (image, audio, and text emotions)
        save_combined_report(image_emotions, audio_emotions, text_emotions)
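# Usage sketch (hypothetical filename): run `python emotion_analysis.py` from a
# directory containing the target MP4; the script writes emotion_report.txt and
# leaves the image_frames/ and audio_segments/ working directories behind.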