feat: replace Whisper with Deepgram ASR + speaker diarization
This commit is contained in:
20
Dockerfile
Normal file
20
Dockerfile
Normal file
@@ -0,0 +1,20 @@
|
||||
FROM python:3.11-slim

# Unbuffered stdout/stderr so logs reach `docker logs` immediately;
# skip .pyc files to keep the image slim.
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1

WORKDIR /app

# Install system deps (ffmpeg for audio decoding).
# --no-install-recommends keeps the layer small; the apt cache is removed
# in the same RUN so it never lands in the final image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    && rm -rf /var/lib/apt/lists/*

# Install Python deps first so this layer stays cached while code churns.
COPY backend/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY backend/ ./backend/
COPY frontend/ ./frontend/

EXPOSE 8000

CMD ["python", "backend/main.py"]
|
||||
28
README.md
28
README.md
@@ -1,3 +1,27 @@
|
||||
# insightflow
|
||||
# InsightFlow
|
||||
|
||||
音频与文档的领域知识构建平台 - 将会议录音转化为结构化知识图谱
|
||||
音频与文档的领域知识构建平台
|
||||
|
||||
## 产品定位
|
||||
将会议录音和文档转化为结构化的知识图谱,通过人机回圈(Human-in-the-Loop)实现知识持续生长。
|
||||
|
||||
## 核心特性
|
||||
- 🎙️ ASR 语音识别 + 热词注入
|
||||
- 🧠 LLM 实体抽取与解释
|
||||
- 🔗 双视图联动(文档视图 + 图谱视图)
|
||||
- 📈 知识生长(多文件实体对齐)
|
||||
|
||||
## 技术栈
|
||||
- 前端: Next.js + Tailwind
|
||||
- 后端: Python (FastAPI)
|
||||
- 数据库: MySQL + Neo4j
|
||||
- ASR: Deepgram (Nova-3)
|
||||
- LLM: OpenAI / Kimi
|
||||
|
||||
## 开发阶段
|
||||
- [ ] Phase 1: 骨架与单体分析 (MVP)
|
||||
- [ ] Phase 2: 交互与纠错工作台
|
||||
- [ ] Phase 3: 记忆与生长
|
||||
|
||||
## 文档
|
||||
- [PRD v2.0](docs/PRD-v2.0.md)
|
||||
|
||||
205
backend/main.py
Normal file
205
backend/main.py
Normal file
@@ -0,0 +1,205 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
InsightFlow Backend - Phase 1 MVP with Deepgram
|
||||
ASR: Deepgram (Nova-3)
|
||||
Speaker Diarization: Deepgram
|
||||
LLM: Kimi API for entity extraction
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import httpx
|
||||
from fastapi import FastAPI, File, UploadFile, HTTPException, Form
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from pydantic import BaseModel
|
||||
from typing import List, Optional
|
||||
from datetime import datetime
|
||||
from deepgram import DeepgramClient, PrerecordedOptions, FileSource
|
||||
|
||||
app = FastAPI(title="InsightFlow", version="0.1.0")

# NOTE(review): wildcard origins combined with allow_credentials=True is the
# fully permissive dev setup — effectively any site may make credentialed
# requests. Tighten allow_origins before exposing this publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
||||
|
||||
# Models
|
||||
class Entity(BaseModel):
    """A named entity extracted from the meeting transcript by the LLM."""
    id: str  # synthetic id assigned at extraction time (e.g. "ent_1")
    name: str  # surface form of the entity
    type: str  # PROJECT/TECH/PERSON/ORG/OTHER per the extraction prompt
    start: int  # character offset of the entity in the source text (LLM-reported)
    end: int  # end character offset (LLM-reported; may be approximate)
    definition: Optional[str] = None  # one-sentence explanation from the LLM
|
||||
|
||||
class TranscriptSegment(BaseModel):
    """One diarized utterance from the ASR result."""
    start: float  # utterance start time in seconds
    end: float  # utterance end time in seconds
    text: str  # transcript text of the utterance
    speaker: Optional[str] = "Speaker A"  # diarization label; default used when no utterances exist
|
||||
|
||||
class AnalysisResult(BaseModel):
    """Complete analysis of one uploaded audio file (transcript + entities)."""
    transcript_id: str  # random hex id; also the key into the in-memory store
    segments: List[TranscriptSegment]  # diarized utterances in time order
    entities: List[Entity]  # entities extracted from full_text
    full_text: str  # whole transcript as a single string
    created_at: str  # ISO-8601 timestamp (naive local time)
|
||||
|
||||
# In-memory store: transcript_id -> AnalysisResult. Not persisted — all
# results are lost on process restart (acceptable for the Phase 1 MVP).
storage = {}

# API keys read from the environment; empty string means "not configured".
DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY", "")
KIMI_API_KEY = os.getenv("KIMI_API_KEY", "")
# NOTE(review): confirm this base URL — "/v1/chat/completions" is appended to
# it below, so it must be the root of a chat-completions-compatible API.
KIMI_BASE_URL = "https://api.kimi.com/coding"
|
||||
|
||||
def transcribe_with_deepgram(audio_data: bytes, filename: str) -> dict:
    """Transcribe audio with Deepgram and split it into speaker-labelled segments.

    Args:
        audio_data: Raw audio bytes as uploaded.
        filename: Original filename; only its extension is used to pick a MIME type.
            May be None/empty (FastAPI's UploadFile.filename is optional).

    Returns:
        dict with "full_text" (whole transcript, empty string when Deepgram
        returned no channels) and "segments" (list of dicts with start/end
        seconds, text, and a "Speaker N" label; empty when no utterances).

    Raises:
        HTTPException: 500 when DEEPGRAM_API_KEY is not configured.
    """
    if not DEEPGRAM_API_KEY:
        raise HTTPException(status_code=500, detail="DEEPGRAM_API_KEY not configured")

    deepgram = DeepgramClient(DEEPGRAM_API_KEY)

    # Pick the MIME type from the extension, case-insensitively. Guard against
    # a missing filename (the original crashed on None). Non-WAV files are
    # labelled audio/mpeg — the registered MIME type for MP3.
    lower_name = (filename or "").lower()
    payload: FileSource = {
        "buffer": audio_data,
        "mimetype": "audio/wav" if lower_name.endswith(".wav") else "audio/mpeg"
    }

    options = PrerecordedOptions(
        model="nova-3",
        language="zh",
        smart_format=True,
        diarize=True,      # speaker diarization
        utterances=True,   # required for the per-utterance segments below
        punctuate=True,
        paragraphs=True
    )

    response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)

    # The full transcript lives on the first channel's best alternative.
    result = response.results
    full_text = result.channels[0].alternatives[0].transcript if result.channels else ""

    # One segment per utterance, tagged with its diarized speaker index.
    segments = []
    if result.utterances:
        for u in result.utterances:
            segments.append({
                "start": u.start,
                "end": u.end,
                "text": u.transcript,
                "speaker": f"Speaker {u.speaker}"
            })

    return {
        "full_text": full_text,
        "segments": segments
    }
|
||||
|
||||
def extract_entities_with_llm(text: str) -> List[Entity]:
    """Extract key entities from meeting text via the Kimi chat-completions API.

    Best-effort: returns an empty list when the API key is missing, the text is
    empty, the reply contains no JSON array, or any request/parsing step fails —
    entity extraction must never break the upload flow.

    Args:
        text: Full transcript text; only the first 3000 characters are sent.

    Returns:
        List of Entity models with sequential ids ("ent_1", "ent_2", ...).
    """
    import re

    if not KIMI_API_KEY or not text:
        return []

    # Cap prompt size to stay within the model's context limit. (Previously
    # this note was accidentally embedded inside the prompt string itself and
    # sent to the LLM.)
    snippet = text[:3000]

    prompt = f"""请从以下会议文本中提取关键实体(专有名词、项目名、技术术语、人名等),并以 JSON 格式返回:

文本:{snippet}

要求:
1. 每个实体包含:name(名称), type(类型: PROJECT/TECH/PERSON/ORG/OTHER), start(起始字符位置), end(结束字符位置), definition(一句话定义)
2. 只返回 JSON 数组,不要其他内容
3. 确保 start/end 是字符在文本中的位置

示例输出:
[
  {{"name": "Project Alpha", "type": "PROJECT", "start": 23, "end": 35, "definition": "Q3季度的核心项目"}},
  {{"name": "K8s", "type": "TECH", "start": 37, "end": 40, "definition": "Kubernetes的缩写"}}
]
"""

    try:
        response = httpx.post(
            f"{KIMI_BASE_URL}/v1/chat/completions",
            headers={"Authorization": f"Bearer {KIMI_API_KEY}", "Content-Type": "application/json"},
            json={
                "model": "k2p5",
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0.1
            },
            timeout=60.0
        )
        response.raise_for_status()
        result = response.json()
        content = result["choices"][0]["message"]["content"]

        # Pull the outermost JSON array out of the reply (models often wrap it
        # in prose or code fences). Greedy match so a ']' inside a definition
        # string no longer truncates the array at the first closing bracket.
        json_match = re.search(r'\[.*\]', content, re.DOTALL)
        if json_match:
            entities_data = json.loads(json_match.group())
            entities = []
            for i, e in enumerate(entities_data):
                entities.append(Entity(
                    id=f"ent_{i+1}",
                    name=e["name"],
                    type=e.get("type", "OTHER"),
                    # Coerce to int: models sometimes emit offsets as strings.
                    start=int(e["start"]),
                    end=int(e["end"]),
                    definition=e.get("definition", "")
                ))
            return entities
    except Exception as e:
        # Best-effort: log and fall through to the empty result.
        print(f"LLM extraction failed: {e}")

    return []
|
||||
|
||||
@app.post("/api/v1/upload", response_model=AnalysisResult)
async def upload_audio(file: UploadFile = File(...)):
    """Accept an audio upload, transcribe it, extract entities, store and return the analysis."""
    audio_bytes = await file.read()

    # Speech-to-text with speaker diarization.
    print(f"Transcribing with Deepgram: {file.filename}")
    dg_result = transcribe_with_deepgram(audio_bytes, file.filename)

    # Build the diarized segments; when Deepgram returned no utterances, fall
    # back to a single segment carrying the whole transcript.
    segments = [TranscriptSegment(**seg) for seg in dg_result["segments"]]
    if not segments:
        segments = [TranscriptSegment(start=0, end=0, text=dg_result["full_text"], speaker="Speaker A")]

    # Entity extraction over the full transcript (best-effort; may be empty).
    print("Extracting entities with LLM...")
    entities = extract_entities_with_llm(dg_result["full_text"])

    analysis = AnalysisResult(
        transcript_id=os.urandom(8).hex(),
        segments=segments,
        entities=entities,
        full_text=dg_result["full_text"],
        created_at=datetime.now().isoformat(),
    )

    storage[analysis.transcript_id] = analysis
    print(f"Analysis complete: {analysis.transcript_id}, {len(entities)} entities found")
    return analysis
|
||||
|
||||
@app.get("/api/v1/transcripts/{transcript_id}", response_model=AnalysisResult)
async def get_transcript(transcript_id: str):
    """Return a previously computed analysis, or 404 when the id is unknown."""
    try:
        return storage[transcript_id]
    except KeyError:
        raise HTTPException(status_code=404, detail="Transcript not found")
|
||||
|
||||
@app.get("/api/v1/transcripts")
async def list_transcripts():
    """Return every stored analysis, in insertion order."""
    return [analysis for analysis in storage.values()]
|
||||
|
||||
# Serve frontend: mounted after the API routes are registered so "/api/v1/*"
# takes precedence; html=True serves index.html for "/".
app.mount("/", StaticFiles(directory="frontend", html=True), name="frontend")

if __name__ == "__main__":
    import uvicorn
    # Bind on all interfaces, matching the Dockerfile's EXPOSE 8000.
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
||||
7
backend/requirements.txt
Normal file
7
backend/requirements.txt
Normal file
@@ -0,0 +1,7 @@
|
||||
# Web framework + ASGI server
fastapi==0.115.0
uvicorn[standard]==0.32.0
# multipart/form-data parsing for file uploads
python-multipart==0.0.17
# ASR: transcription + speaker diarization
deepgram-sdk==3.7.0
# HTTP client used for the Kimi LLM API
httpx==0.27.2
pydantic==2.9.2
python-dotenv==1.0.1
|
||||
13
docker-compose.yml
Normal file
13
docker-compose.yml
Normal file
@@ -0,0 +1,13 @@
|
||||
# NOTE(review): the top-level `version` key is obsolete in Compose V2 and is
# ignored (with a warning) by current `docker compose` — safe to drop.
version: '3.8'

services:
  insightflow:
    build: .
    ports:
      - "18000:8000"  # host 18000 -> container 8000 (uvicorn)
    environment:
      # Forwarded from the host environment / .env file
      - DEEPGRAM_API_KEY=${DEEPGRAM_API_KEY}
      - KIMI_API_KEY=${KIMI_API_KEY}
    volumes:
      - ./data:/app/data  # NOTE(review): backend currently stores results in memory only — confirm this mount is used
    restart: unless-stopped
|
||||
0
frontend/EOF
Normal file
0
frontend/EOF
Normal file
10
frontend/app.js
Normal file
10
frontend/app.js
Normal file
@@ -0,0 +1,10 @@
|
||||
const API_BASE = '/api/v1';

/**
 * Upload an audio file to the backend for transcription + entity extraction.
 * @param {File} file - audio file picked/dropped by the user
 * @returns {Promise<Object>} the AnalysisResult JSON from the server
 * @throws {Error} when the server responds with a non-2xx status
 */
async function upload(file) {
    const formData = new FormData();
    formData.append('file', file);
    const res = await fetch(API_BASE + '/upload', {
        method: 'POST',
        body: formData
    });
    // Surface HTTP errors instead of silently parsing the error payload
    // as if it were a successful analysis result.
    if (!res.ok) {
        throw new Error(`Upload failed: ${res.status} ${res.statusText}`);
    }
    return await res.json();
}
|
||||
18
frontend/index.html
Normal file
18
frontend/index.html
Normal file
@@ -0,0 +1,18 @@
|
||||
<!DOCTYPE html>
<!-- InsightFlow Phase 1 MVP shell, served statically by the FastAPI backend.
     NOTE(review): app.js (which defines upload()) is not referenced by any
     <script> tag here, and the drop zone below has no event wiring — confirm
     whether the script include is still pending. -->
<html>
<head>
    <meta charset="UTF-8">
    <title>InsightFlow MVP</title>
    <style>
        body { font-family: sans-serif; background: #0a0a0a; color: #e0e0e0; padding: 40px; }
        h1 { color: #00d4ff; }
        .upload { border: 2px dashed #333; padding: 40px; text-align: center; border-radius: 8px; }
        /* highlight style for extracted entities rendered into the transcript */
        .entity { background: rgba(123,44,191,0.3); padding: 2px 6px; border-radius: 4px; }
    </style>
</head>
<body>
    <h1>InsightFlow</h1>
    <p>Phase 1 MVP - 音频转录与实体提取</p>
    <div class="upload">拖拽音频文件上传</div>
</body>
</html>
|
||||
Reference in New Issue
Block a user