Phase 7 Task 7: 插件与集成系统

- 创建 plugin_manager.py 模块
  - PluginManager: 插件管理主类
  - ChromeExtensionHandler: Chrome 插件处理
  - BotHandler: 飞书/钉钉/Slack 机器人处理
  - WebhookIntegration: Zapier/Make Webhook 集成
  - WebDAVSync: WebDAV 同步管理
- 创建完整的 Chrome 扩展代码
  - manifest.json, background.js, content.js, content.css
  - popup.html/js: 弹出窗口界面
  - options.html/js: 设置页面
  - 支持网页剪藏、选中文本保存、项目选择
- 更新 schema.sql 添加插件相关数据库表
  - plugins: 插件配置表
  - bot_sessions: 机器人会话表
  - webhook_endpoints: Webhook 端点表
  - webdav_syncs: WebDAV 同步配置表
  - plugin_activity_logs: 插件活动日志表
- 更新 main.py 添加插件相关 API 端点
  - GET/POST /api/v1/plugins - 插件管理
  - POST /api/v1/plugins/chrome/clip - Chrome 插件保存网页
  - POST /api/v1/bots/webhook/{platform} - 接收机器人消息
  - GET /api/v1/bots/sessions - 机器人会话列表
  - POST /api/v1/webhook-endpoints - 创建 Webhook 端点
  - POST /webhook/{type}/{token} - 接收外部 Webhook
  - POST /api/v1/webdav-syncs - WebDAV 同步配置
  - POST /api/v1/webdav-syncs/{id}/test - 测试 WebDAV 连接
  - POST /api/v1/webdav-syncs/{id}/sync - 触发 WebDAV 同步
- 更新 requirements.txt 添加插件依赖
  - beautifulsoup4: HTML 解析
  - webdavclient3: WebDAV 客户端
- 更新 STATUS.md 和 README.md 开发进度
2026-02-23 12:09:15 +08:00
parent 08535e54ba
commit 797ca58e8e
27 changed files with 7350 additions and 11 deletions
--- a/backend/multimodal_processor.py
+++ b/backend/multimodal_processor.py
@@ -0,0 +1,434 @@
+#!/usr/bin/env python3
+"""
+InsightFlow Multimodal Processor - Phase 7
+视频处理模块：提取音频、关键帧、OCR识别
+"""
+
+import os
+import json
+import uuid
+import tempfile
+import subprocess
+from typing import List, Dict, Optional, Tuple
+from dataclasses import dataclass
+from pathlib import Path
+
+# 尝试导入OCR库
+try:
+    import pytesseract
+    from PIL import Image
+    PYTESSERACT_AVAILABLE = True
+except ImportError:
+    PYTESSERACT_AVAILABLE = False
+
+try:
+    import cv2
+    CV2_AVAILABLE = True
+except ImportError:
+    CV2_AVAILABLE = False
+
+try:
+    import ffmpeg
+    FFMPEG_AVAILABLE = True
+except ImportError:
+    FFMPEG_AVAILABLE = False
+
+
+@dataclass
+class VideoFrame:
+    """One key frame sampled from a video, together with its OCR output.
+
+    Attributes:
+        id: Identifier of this frame record.
+        video_id: Identifier of the video the frame was taken from.
+        frame_number: Frame number within the video.
+        timestamp: Position of the frame in the video, in seconds.
+        frame_path: Path of the image file holding the frame.
+        ocr_text: Text recognized in the frame; empty when nothing was found.
+        ocr_confidence: OCR confidence for the frame.
+        entities_detected: Entities found in the frame. ``None`` is replaced
+            by a fresh empty list right after construction.
+    """
+    id: str
+    video_id: str
+    frame_number: int
+    timestamp: float
+    frame_path: str
+    ocr_text: str = ""
+    ocr_confidence: float = 0.0
+    entities_detected: List[Dict] = None
+
+    def __post_init__(self):
+        # The default is None rather than [] so that instances never share
+        # one list object; the real list is created here, per instance.
+        self.entities_detected = (
+            [] if self.entities_detected is None else self.entities_detected
+        )
+
+
+@dataclass
+class VideoInfo:
+    """Record describing a video and the state of its processing.
+
+    NOTE(review): nothing in this module populates VideoInfo, so the field
+    descriptions below are inferred from the field names and defaults;
+    confirm them against the code that stores these records.
+
+    Attributes:
+        id: Identifier of the video.
+        project_id: Identifier of the owning project.
+        filename: Name of the video file.
+        file_path: Location of the video file.
+        duration: Video duration (defaults to 0.0).
+        width: Frame width (defaults to 0).
+        height: Frame height (defaults to 0).
+        fps: Frame rate (defaults to 0.0).
+        audio_extracted: Flag for audio extraction (defaults to False).
+        audio_path: Location of the extracted audio, empty by default.
+        transcript_id: Identifier of a related transcript, empty by default.
+        status: Processing status; starts as "pending".
+        error_message: Error text, empty by default.
+        metadata: Free-form extra data. ``None`` is replaced by a fresh
+            empty dict right after construction.
+    """
+    id: str
+    project_id: str
+    filename: str
+    file_path: str
+    duration: float = 0.0
+    width: int = 0
+    height: int = 0
+    fps: float = 0.0
+    audio_extracted: bool = False
+    audio_path: str = ""
+    transcript_id: str = ""
+    status: str = "pending"
+    error_message: str = ""
+    metadata: Dict = None
+
+    def __post_init__(self):
+        # The default is None rather than {} so that instances never share
+        # one dict object; the real dict is created here, per instance.
+        self.metadata = {} if self.metadata is None else self.metadata
+
+
+@dataclass
+class VideoProcessingResult:
+    """Outcome of processing one video (returned by MultimodalProcessor.process_video)."""
+    # Identifier of the processed video.
+    video_id: str
+    # Path of the extracted audio file; empty when no audio was extracted.
+    audio_path: str
+    # One VideoFrame per extracted key frame.
+    frames: List[VideoFrame]
+    # One dict per frame that produced text, with the keys
+    # 'frame_number', 'timestamp', 'text' and 'confidence'.
+    ocr_results: List[Dict]
+    # Combined text (intended as audio transcript + OCR text). process_video
+    # in this module fills it with the OCR text only.
+    full_text: str
+    # True when processing finished without raising.
+    success: bool
+    # Text of the exception when success is False; empty otherwise.
+    error_message: str = ""
+
+
+class MultimodalProcessor:
+    """多模态处理器 - 处理视频文件"""
+    
+    def __init__(self, temp_dir: str = None, frame_interval: int = 5):
+        """
+        Set up the working directories used while processing videos.
+
+        Args:
+            temp_dir: Root directory for temporary files; the system
+                temporary directory is used when this is empty or None.
+            frame_interval: Interval between extracted key frames, in seconds.
+        """
+        self.temp_dir = temp_dir if temp_dir else tempfile.gettempdir()
+        self.frame_interval = frame_interval
+
+        # One sub-directory per artifact type, all below the chosen root.
+        join = os.path.join
+        self.video_dir = join(self.temp_dir, "videos")
+        self.frames_dir = join(self.temp_dir, "frames")
+        self.audio_dir = join(self.temp_dir, "audio")
+
+        # Create whatever is missing; existing directories are left alone.
+        for directory in (self.video_dir, self.frames_dir, self.audio_dir):
+            os.makedirs(directory, exist_ok=True)
+    
+    @staticmethod
+    def _parse_frame_rate(rate) -> float:
+        """Convert an ffprobe rate such as '30000/1001' or '25' to a float (0.0 if unusable)."""
+        numerator, _, denominator = str(rate).partition('/')
+        try:
+            num = float(numerator)
+            den = float(denominator) if denominator else 1.0
+        except ValueError:
+            return 0.0
+        # ffprobe reports '0/0' for streams without a known rate.
+        return num / den if den else 0.0
+
+    def extract_video_info(self, video_path: str) -> Dict:
+        """
+        Read basic properties of a video file.
+
+        The ffmpeg Python binding is used when it was imported successfully;
+        otherwise the ``ffprobe`` command line tool is invoked. Both paths are
+        interpreted the same way: streams are selected by their codec type and
+        the frame rate is parsed from ``r_frame_rate`` without evaluating it
+        as Python code.
+
+        Args:
+            video_path: Path of the video file.
+
+        Returns:
+            Dict with the keys 'duration', 'width', 'height', 'fps',
+            'has_audio' and 'bitrate'. When probing fails, or the file has no
+            video stream, every numeric value is 0 and 'has_audio' is False.
+            This method does not raise; errors are printed.
+        """
+        try:
+            if FFMPEG_AVAILABLE:
+                probe = ffmpeg.probe(video_path)
+            else:
+                # Fall back to the ffprobe command line tool. codec_type is
+                # requested so streams can be told apart below.
+                cmd = [
+                    'ffprobe', '-v', 'error', '-show_entries',
+                    'format=duration,bit_rate:stream=codec_type,width,height,r_frame_rate',
+                    '-of', 'json',
+                    video_path
+                ]
+                result = subprocess.run(cmd, capture_output=True, text=True)
+                probe = json.loads(result.stdout) if result.returncode == 0 else None
+
+            if probe:
+                streams = probe.get('streams') or []
+                container = probe.get('format') or {}
+                video_stream = next((s for s in streams if s.get('codec_type') == 'video'), None)
+                audio_stream = next((s for s in streams if s.get('codec_type') == 'audio'), None)
+
+                if video_stream:
+                    return {
+                        'duration': float(container.get('duration', 0)),
+                        'width': int(video_stream.get('width', 0)),
+                        'height': int(video_stream.get('height', 0)),
+                        'fps': self._parse_frame_rate(video_stream.get('r_frame_rate', '0/1')),
+                        'has_audio': audio_stream is not None,
+                        'bitrate': int(container.get('bit_rate', 0))
+                    }
+        except Exception as e:
+            # Best effort: a probing failure yields the empty description below.
+            print(f"Error extracting video info: {e}")
+
+        return {
+            'duration': 0,
+            'width': 0,
+            'height': 0,
+            'fps': 0,
+            'has_audio': False,
+            'bitrate': 0
+        }
+    
+    def extract_audio(self, video_path: str, output_path: str = None) -> str:
+        """
+        Extract the audio track of a video as a mono, 16 kHz audio file.
+
+        Args:
+            video_path: Path of the video file.
+            output_path: Where to write the audio. When omitted, the file is
+                placed in the audio directory and named after the video
+                (``<video stem>.wav``).
+
+        Returns:
+            Path of the audio file that was written.
+
+        Raises:
+            Exception: Whatever the ffmpeg binding or the ``ffmpeg`` command
+                raised; the error is printed and then re-raised.
+        """
+        target = output_path
+        if target is None:
+            target = os.path.join(self.audio_dir, f"{Path(video_path).stem}.wav")
+
+        try:
+            if not FFMPEG_AVAILABLE:
+                # No Python binding: drive the ffmpeg executable directly.
+                subprocess.run(
+                    [
+                        'ffmpeg', '-i', video_path,
+                        '-vn', '-acodec', 'pcm_s16le',
+                        '-ac', '1', '-ar', '16000',
+                        '-y', target
+                    ],
+                    check=True,
+                    capture_output=True,
+                )
+            else:
+                # vn=None is rendered by the binding as the bare "-vn" flag.
+                job = ffmpeg.input(video_path)
+                job = job.output(target, ac=1, ar=16000, vn=None)
+                job = job.overwrite_output()
+                job.run(quiet=True)
+        except Exception as e:
+            print(f"Error extracting audio: {e}")
+            raise
+
+        return target
+    
+    def extract_keyframes(self, video_path: str, video_id: str,
+                         interval: int = None) -> List[str]:
+        """
+        Extract key frames from a video at a fixed time interval.
+
+        Frames are written to ``<frames_dir>/<video_id>/`` and named
+        ``frame_<number:06d>_<seconds:.2f>.jpg`` by both backends. With
+        OpenCV ``<number>`` is the source frame number; with the ffmpeg
+        command line fallback it is the 0-based index of the sampled frame
+        and ``<seconds>`` is ``index * interval`` (approximate).
+
+        Args:
+            video_path: Path of the video file.
+            video_id: Video identifier, used as the frame sub-directory name.
+            interval: Sampling interval in seconds; defaults to the interval
+                given at construction time.
+
+        Returns:
+            Paths of the extracted frame files. Extraction is best effort: on
+            error the message is printed and the frames collected so far
+            (possibly none) are returned.
+        """
+        interval = interval or self.frame_interval
+        frame_paths = []
+
+        # Per-video directory holding the frames.
+        video_frames_dir = os.path.join(self.frames_dir, video_id)
+        os.makedirs(video_frames_dir, exist_ok=True)
+
+        try:
+            if CV2_AVAILABLE:
+                cap = cv2.VideoCapture(video_path)
+                try:
+                    fps = cap.get(cv2.CAP_PROP_FPS)
+                    # fps is 0 when the file cannot be opened or has no rate
+                    # information; nothing can be timed in that case.
+                    if fps and fps > 0:
+                        # At least 1, so a sub-frame interval never yields a
+                        # zero step (modulo by zero).
+                        step = max(1, int(fps * interval))
+                        frame_number = 0
+
+                        while True:
+                            ret, frame = cap.read()
+                            if not ret:
+                                break
+
+                            if frame_number % step == 0:
+                                timestamp = frame_number / fps
+                                frame_path = os.path.join(
+                                    video_frames_dir,
+                                    f"frame_{frame_number:06d}_{timestamp:.2f}.jpg"
+                                )
+                                # Only report frames that were actually written.
+                                if cv2.imwrite(frame_path, frame):
+                                    frame_paths.append(frame_path)
+
+                            frame_number += 1
+                finally:
+                    # Release the capture even when reading or writing fails.
+                    cap.release()
+            else:
+                # ffmpeg's image2 muxer only understands %d-style sequence
+                # patterns, so frames are first written with a plain counter
+                # and then renamed to the common naming scheme.
+                raw_prefix = 'raw_'
+                raw_pattern = os.path.join(video_frames_dir, f"{raw_prefix}%06d.jpg")
+
+                cmd = [
+                    'ffmpeg', '-i', video_path,
+                    '-vf', f'fps=1/{interval}',
+                    '-start_number', '0',
+                    '-y', raw_pattern
+                ]
+                subprocess.run(cmd, check=True, capture_output=True)
+
+                raw_names = sorted(
+                    name for name in os.listdir(video_frames_dir)
+                    if name.startswith(raw_prefix) and name.endswith('.jpg')
+                )
+                for name in raw_names:
+                    index = int(name[len(raw_prefix):-len('.jpg')])
+                    timestamp = index * interval
+                    frame_path = os.path.join(
+                        video_frames_dir,
+                        f"frame_{index:06d}_{timestamp:.2f}.jpg"
+                    )
+                    os.replace(os.path.join(video_frames_dir, name), frame_path)
+                    frame_paths.append(frame_path)
+        except Exception as e:
+            print(f"Error extracting keyframes: {e}")
+
+        return frame_paths
+    
+    def perform_ocr(self, image_path: str) -> Tuple[str, float]:
+        """
+        Run OCR on an image file.
+
+        Args:
+            image_path: Path of the image file.
+
+        Returns:
+            ``(text, confidence)``: the recognized text with surrounding
+            whitespace stripped, and the mean confidence of the recognized
+            elements scaled to 0.0-1.0. ``("", 0.0)`` is returned when
+            pytesseract is unavailable or recognition fails (the error is
+            printed, not raised).
+        """
+        if not PYTESSERACT_AVAILABLE:
+            return "", 0.0
+
+        # Same languages for the text and for the confidence data, so the
+        # reported confidence belongs to the returned text.
+        ocr_lang = 'chi_sim+eng'
+
+        try:
+            # The context manager closes the underlying file handle.
+            with Image.open(image_path) as source:
+                # Pre-processing: OCR is run on a grayscale image.
+                image = source if source.mode == 'L' else source.convert('L')
+
+                text = pytesseract.image_to_string(image, lang=ocr_lang)
+                data = pytesseract.image_to_data(
+                    image,
+                    lang=ocr_lang,
+                    output_type=pytesseract.Output.DICT
+                )
+
+            # Confidence values may arrive as ints, floats or numeric strings
+            # (including fractional ones); non-positive entries mark elements
+            # without recognized text and are skipped.
+            confidences = []
+            for raw in data['conf']:
+                try:
+                    value = float(raw)
+                except (TypeError, ValueError):
+                    continue
+                if value > 0:
+                    confidences.append(value)
+            avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
+
+            return text.strip(), avg_confidence / 100.0
+        except Exception as e:
+            print(f"OCR error for {image_path}: {e}")
+            return "", 0.0
+    
+    @staticmethod
+    def _parse_frame_name(frame_path: str, index: int, interval) -> Tuple[int, float]:
+        """Recover (frame_number, timestamp) from 'frame_<number>_<seconds>.jpg', falling back to index-based values."""
+        stem = os.path.splitext(os.path.basename(frame_path))[0]
+        parts = stem.split('_')
+        try:
+            frame_number = int(parts[1])
+        except (IndexError, ValueError):
+            frame_number = index
+        try:
+            timestamp = float(parts[2])
+        except (IndexError, ValueError):
+            timestamp = index * interval
+        return frame_number, timestamp
+
+    def process_video(self, video_data: bytes, filename: str,
+                     project_id: str, video_id: str = None) -> VideoProcessingResult:
+        """
+        Process a video: save it, extract audio and key frames, OCR the frames.
+
+        Args:
+            video_data: Raw bytes of the video file.
+            filename: Name of the video file. Only its last path component is
+                used, so a name containing directory separators cannot change
+                where the file is written.
+            project_id: Project identifier (not used by this method).
+            video_id: Video identifier; a short random one is generated when
+                omitted.
+
+        Returns:
+            A VideoProcessingResult. ``success`` is False and
+            ``error_message`` holds the exception text when any step raised;
+            ``full_text`` contains the OCR text of all frames joined by blank
+            lines.
+        """
+        video_id = video_id or str(uuid.uuid4())[:8]
+
+        try:
+            # Keep only the final path component of the supplied name
+            # (both separator styles), with a fallback for an empty result.
+            safe_name = os.path.basename(filename.replace('\\', '/')) or "video"
+
+            # Save the video file.
+            video_path = os.path.join(self.video_dir, f"{video_id}_{safe_name}")
+            with open(video_path, 'wb') as f:
+                f.write(video_data)
+
+            video_info = self.extract_video_info(video_path)
+
+            # Extract audio only when the probe found an audio stream.
+            audio_path = ""
+            if video_info['has_audio']:
+                audio_path = self.extract_audio(video_path)
+
+            frame_paths = self.extract_keyframes(video_path, video_id)
+
+            # OCR every key frame.
+            frames = []
+            ocr_results = []
+            all_ocr_text = []
+
+            for i, frame_path in enumerate(frame_paths):
+                # An unexpected file name falls back to index-based values
+                # instead of aborting the whole video.
+                frame_number, timestamp = self._parse_frame_name(
+                    frame_path, i, self.frame_interval
+                )
+
+                ocr_text, confidence = self.perform_ocr(frame_path)
+
+                frame = VideoFrame(
+                    id=str(uuid.uuid4())[:8],
+                    video_id=video_id,
+                    frame_number=frame_number,
+                    timestamp=timestamp,
+                    frame_path=frame_path,
+                    ocr_text=ocr_text,
+                    ocr_confidence=confidence
+                )
+                frames.append(frame)
+
+                # Frames without text are kept in `frames` but not in the
+                # OCR result list.
+                if ocr_text:
+                    ocr_results.append({
+                        'frame_number': frame_number,
+                        'timestamp': timestamp,
+                        'text': ocr_text,
+                        'confidence': confidence
+                    })
+                    all_ocr_text.append(ocr_text)
+
+            # Combine the OCR text of all frames.
+            full_ocr_text = "\n\n".join(all_ocr_text)
+
+            return VideoProcessingResult(
+                video_id=video_id,
+                audio_path=audio_path,
+                frames=frames,
+                ocr_results=ocr_results,
+                full_text=full_ocr_text,
+                success=True
+            )
+
+        except Exception as e:
+            return VideoProcessingResult(
+                video_id=video_id,
+                audio_path="",
+                frames=[],
+                ocr_results=[],
+                full_text="",
+                success=False,
+                error_message=str(e)
+            )
+    
+    def cleanup(self, video_id: str = None):
+        """
+        Remove temporary files.
+
+        Args:
+            video_id: When given, only that video's artifacts are removed:
+                its frame sub-directory (with every frame in it) and the
+                files in the video and audio directories whose name contains
+                the id. When omitted, the video, frames and audio directories
+                are emptied and re-created.
+        """
+        import shutil
+
+        if video_id:
+            # Frame files are named "frame_<number>_<seconds>.jpg" and do not
+            # contain the video id, so the whole per-video directory is
+            # removed instead of matching file names.
+            frames_root = os.path.abspath(self.frames_dir)
+            frames_subdir = os.path.abspath(os.path.join(self.frames_dir, video_id))
+            # Only ever delete a direct child of the frames directory, so an
+            # id such as ".." cannot widen what gets removed.
+            if os.path.dirname(frames_subdir) == frames_root and os.path.isdir(frames_subdir):
+                shutil.rmtree(frames_subdir)
+
+            for dir_path in (self.video_dir, self.audio_dir):
+                if not os.path.isdir(dir_path):
+                    continue
+                for name in os.listdir(dir_path):
+                    path = os.path.join(dir_path, name)
+                    if video_id in name and os.path.isfile(path):
+                        os.remove(path)
+        else:
+            # Remove everything and leave empty directories behind.
+            for dir_path in (self.video_dir, self.frames_dir, self.audio_dir):
+                if os.path.exists(dir_path):
+                    shutil.rmtree(dir_path)
+                    os.makedirs(dir_path, exist_ok=True)
+
+
+# Module-wide processor instance, created on first request.
+_multimodal_processor = None
+
+def get_multimodal_processor(temp_dir: str = None, frame_interval: int = 5) -> MultimodalProcessor:
+    """Return the shared MultimodalProcessor, creating it on first use.
+
+    The arguments are only used by the call that creates the instance; later
+    calls return the existing processor and ignore them.
+    """
+    global _multimodal_processor
+    processor = _multimodal_processor
+    if processor is None:
+        processor = MultimodalProcessor(temp_dir, frame_interval)
+        _multimodal_processor = processor
+    return processor