feat: replace Whisper with Deepgram ASR + speaker diarization

This commit is contained in:
OpenClaw Bot
2026-02-17 12:14:25 +08:00
parent ff72dd0c56
commit e85038a1fb
8 changed files with 299 additions and 2 deletions

20
Dockerfile Normal file
View File

@@ -0,0 +1,20 @@
# InsightFlow image: Python FastAPI backend plus the static frontend in one container.
FROM python:3.11-slim
WORKDIR /app
# Install system deps
# ffmpeg: audio decoding/conversion support for uploaded recordings.
RUN apt-get update && apt-get install -y \
    ffmpeg \
    && rm -rf /var/lib/apt/lists/*
# Install Python deps
COPY backend/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy code
COPY backend/ ./backend/
COPY frontend/ ./frontend/
# FastAPI/uvicorn listens on 8000 (see backend/main.py __main__ block).
EXPOSE 8000
CMD ["python", "backend/main.py"]

View File

@@ -1,3 +1,27 @@
# InsightFlow
音频与文档的领域知识构建平台
## 产品定位
将会议录音和文档转化为结构化的知识图谱,通过人机回圈(Human-in-the-Loop)实现知识持续生长。
## 核心特性
- 🎙️ ASR 语音识别 + 热词注入
- 🧠 LLM 实体抽取与解释
- 🔗 双视图联动(文档视图 + 图谱视图)
- 📈 知识生长(多文件实体对齐)
## 技术栈
- 前端: Next.js + Tailwind
- 后端: Node.js / Python
- 数据库: MySQL + Neo4j
- ASR: Deepgram (Nova-3)
- LLM: OpenAI / Kimi
## 开发阶段
- [ ] Phase 1: 骨架与单体分析 (MVP)
- [ ] Phase 2: 交互与纠错工作台
- [ ] Phase 3: 记忆与生长
## 文档
- [PRD v2.0](docs/PRD-v2.0.md)

205
backend/main.py Normal file
View File

@@ -0,0 +1,205 @@
#!/usr/bin/env python3
"""
InsightFlow Backend - Phase 1 MVP with Deepgram
ASR: Deepgram (Nova-3)
Speaker Diarization: Deepgram
LLM: Kimi API for entity extraction
"""
import os
import json
import httpx
from fastapi import FastAPI, File, UploadFile, HTTPException, Form
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel
from typing import List, Optional
from datetime import datetime
from deepgram import DeepgramClient, PrerecordedOptions, FileSource
# FastAPI application; also serves the static frontend (mounted at the bottom of this file).
app = FastAPI(title="InsightFlow", version="0.1.0")
# CORS is wide open ("*") — acceptable for a local MVP; tighten before production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Models
class Entity(BaseModel):
    """An entity extracted from the transcript by the LLM."""
    id: str                            # synthetic id, e.g. "ent_1"
    name: str                          # surface form of the entity
    type: str                          # PROJECT/TECH/PERSON/ORG/OTHER (per the extraction prompt)
    start: int                         # char offset, LLM-reported — may be unreliable; see extract_entities_with_llm
    end: int                           # char offset (exclusive/inclusive per LLM output — not validated here)
    definition: Optional[str] = None   # one-sentence definition supplied by the LLM
class TranscriptSegment(BaseModel):
    """One diarized utterance of the transcript."""
    start: float                         # seconds from start of audio
    end: float                           # seconds from start of audio
    text: str                            # utterance text
    speaker: Optional[str] = "Speaker A" # "Speaker <n>" label from Deepgram diarization
class AnalysisResult(BaseModel):
    """Complete analysis of one uploaded audio file."""
    transcript_id: str                 # random hex id, key into the in-memory store
    segments: List[TranscriptSegment]  # diarized utterances (or a single whole-text fallback)
    entities: List[Entity]             # LLM-extracted entities (empty on extraction failure)
    full_text: str                     # full transcript text from Deepgram
    created_at: str                    # ISO-8601 local timestamp
# In-memory store of AnalysisResult keyed by transcript_id — lost on restart.
storage = {}
# API Keys
DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY", "")
KIMI_API_KEY = os.getenv("KIMI_API_KEY", "")
# NOTE(review): unusual base path for an OpenAI-style /v1/chat/completions API — confirm endpoint.
KIMI_BASE_URL = "https://api.kimi.com/coding"
def transcribe_with_deepgram(audio_data: bytes, filename: str) -> dict:
    """Transcribe audio with Deepgram (nova-3) and diarize speakers.

    Args:
        audio_data: raw uploaded audio bytes.
        filename: original filename, used only to guess the MIME type.

    Returns:
        dict with "full_text" (str) and "segments" (list of dicts with
        start/end/text/speaker keys, suitable for TranscriptSegment(**seg)).

    Raises:
        HTTPException: 500 when DEEPGRAM_API_KEY is not configured.
    """
    import mimetypes

    if not DEEPGRAM_API_KEY:
        raise HTTPException(status_code=500, detail="DEEPGRAM_API_KEY not configured")
    deepgram = DeepgramClient(DEEPGRAM_API_KEY)
    # Guess the MIME type from the extension (case-insensitive), instead of the
    # old wav-or-mp3-only check; keep audio/mp3 as the fallback for compatibility.
    if filename.lower().endswith(".wav"):
        mimetype = "audio/wav"
    else:
        guessed, _ = mimetypes.guess_type(filename)
        mimetype = guessed if guessed and guessed.startswith("audio/") else "audio/mp3"
    payload: FileSource = {
        "buffer": audio_data,
        "mimetype": mimetype,
    }
    options = PrerecordedOptions(
        model="nova-3",   # NOTE(review): confirm nova-3 supports language="zh"; nova-2 may be required for Chinese
        language="zh",
        smart_format=True,
        diarize=True,     # speaker diarization
        utterances=True,  # required for per-speaker segments below
        punctuate=True,
        paragraphs=True,
    )
    response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)
    # Parse the response: overall transcript plus speaker-labelled utterances.
    result = response.results
    full_text = result.channels[0].alternatives[0].transcript if result.channels else ""
    segments = []
    if result.utterances:
        for u in result.utterances:
            segments.append({
                "start": u.start,
                "end": u.end,
                "text": u.transcript,
                "speaker": f"Speaker {u.speaker}",
            })
    return {
        "full_text": full_text,
        "segments": segments,
    }
def extract_entities_with_llm(text: str) -> List[Entity]:
    """Extract key entities from meeting text via the Kimi chat-completions API.

    Args:
        text: full transcript text; only the first 3000 chars are sent, so the
            LLM-reported start/end offsets are relative to the truncated text.

    Returns:
        List of Entity models; empty list when the key/text is missing, the
        response contains no JSON array, or on any API/parsing failure
        (entity extraction is best-effort by design).
    """
    import re

    if not KIMI_API_KEY or not text:
        return []
    # Bug fix: the old f-string embedded an internal note ("# 限制长度避免超限")
    # directly after the text, leaking it into the prompt sent to the LLM.
    prompt = f"""请从以下会议文本中提取关键实体(专有名词、项目名、技术术语、人名等),并以 JSON 格式返回:
文本:{text[:3000]}
要求:
1. 每个实体包含name(名称), type(类型: PROJECT/TECH/PERSON/ORG/OTHER), start(起始字符位置), end(结束字符位置), definition(一句话定义)
2. 只返回 JSON 数组,不要其他内容
3. 确保 start/end 是字符在文本中的位置
示例输出:
[
  {{"name": "Project Alpha", "type": "PROJECT", "start": 23, "end": 35, "definition": "Q3季度的核心项目"}},
  {{"name": "K8s", "type": "TECH", "start": 37, "end": 40, "definition": "Kubernetes的缩写"}}
]
"""
    try:
        response = httpx.post(
            f"{KIMI_BASE_URL}/v1/chat/completions",
            headers={"Authorization": f"Bearer {KIMI_API_KEY}", "Content-Type": "application/json"},
            json={
                "model": "k2p5",
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0.1,
            },
            timeout=60.0,
        )
        response.raise_for_status()
        result = response.json()
        content = result["choices"][0]["message"]["content"]
        # Greedy match so a ']' inside an entity definition does not truncate
        # the array (the old non-greedy \[.*?\] stopped at the first ']').
        json_match = re.search(r'\[.*\]', content, re.DOTALL)
        if json_match:
            entities_data = json.loads(json_match.group())
            entities = []
            for i, e in enumerate(entities_data):
                entities.append(Entity(
                    id=f"ent_{i+1}",
                    name=e["name"],
                    type=e.get("type", "OTHER"),
                    start=e["start"],
                    end=e["end"],
                    definition=e.get("definition", ""),
                ))
            return entities
    except Exception as e:
        # Best-effort: extraction failure must not fail the upload pipeline.
        print(f"LLM extraction failed: {e}")
    # Explicit fallback (the old code relied on falling off the try block).
    return []
@app.post("/api/v1/upload", response_model=AnalysisResult)
async def upload_audio(file: UploadFile = File(...)):
    """Accept an audio upload, transcribe it, and run entity extraction."""
    audio_bytes = await file.read()

    # Step 1: transcription + speaker diarization via Deepgram.
    print(f"Transcribing with Deepgram: {file.filename}")
    transcription = transcribe_with_deepgram(audio_bytes, file.filename)

    # Step 2: wrap utterances as segments; fall back to one whole-text segment
    # when Deepgram returned no utterances.
    segments = [TranscriptSegment(**seg) for seg in transcription["segments"]]
    if not segments:
        segments = [TranscriptSegment(start=0, end=0, text=transcription["full_text"], speaker="Speaker A")]

    # Step 3: best-effort entity extraction with the LLM.
    print("Extracting entities with LLM...")
    extracted = extract_entities_with_llm(transcription["full_text"])

    analysis = AnalysisResult(
        transcript_id=os.urandom(8).hex(),
        segments=segments,
        entities=extracted,
        full_text=transcription["full_text"],
        created_at=datetime.now().isoformat(),
    )
    storage[analysis.transcript_id] = analysis
    print(f"Analysis complete: {analysis.transcript_id}, {len(extracted)} entities found")
    return analysis
@app.get("/api/v1/transcripts/{transcript_id}", response_model=AnalysisResult)
async def get_transcript(transcript_id: str):
    """Return a previously stored analysis, or 404 when the id is unknown."""
    try:
        return storage[transcript_id]
    except KeyError:
        raise HTTPException(status_code=404, detail="Transcript not found")
@app.get("/api/v1/transcripts")
async def list_transcripts():
    """Return every stored analysis (unordered in-memory dump)."""
    return [analysis for analysis in storage.values()]
# Serve frontend
# Mounted last so the /api routes declared above take precedence over static files.
app.mount("/", StaticFiles(directory="frontend", html=True), name="frontend")

if __name__ == "__main__":
    import uvicorn
    # Bind to all interfaces on 8000 (matches the Dockerfile EXPOSE).
    uvicorn.run(app, host="0.0.0.0", port=8000)

7
backend/requirements.txt Normal file
View File

@@ -0,0 +1,7 @@
fastapi==0.115.0
uvicorn[standard]==0.32.0
python-multipart==0.0.17
deepgram-sdk==3.7.0
httpx==0.27.2
pydantic==2.9.2
python-dotenv==1.0.1

13
docker-compose.yml Normal file
View File

@@ -0,0 +1,13 @@
version: '3.8'
services:
  insightflow:
    build: .
    ports:
      # Host 18000 -> container 8000 (the FastAPI/uvicorn port).
      - "18000:8000"
    environment:
      # Both keys are read by backend/main.py via os.getenv; pass them
      # through from the host environment or a .env file.
      - DEEPGRAM_API_KEY=${DEEPGRAM_API_KEY}
      - KIMI_API_KEY=${KIMI_API_KEY}
    volumes:
      - ./data:/app/data
    restart: unless-stopped

0
frontend/EOF Normal file
View File

10
frontend/app.js Normal file
View File

@@ -0,0 +1,10 @@
const API_BASE = '/api/v1';

/**
 * Upload an audio file to the backend for transcription and entity extraction.
 * @param {File} file - audio file selected/dropped by the user
 * @returns {Promise<Object>} the AnalysisResult JSON from the server
 * @throws {Error} when the server responds with a non-2xx status
 */
async function upload(file) {
  const formData = new FormData();
  formData.append('file', file);
  const res = await fetch(API_BASE + '/upload', {
    method: 'POST',
    body: formData
  });
  // Bug fix: surface server-side failures instead of silently parsing an
  // error body as if it were a successful AnalysisResult.
  if (!res.ok) {
    throw new Error(`Upload failed: ${res.status} ${res.statusText}`);
  }
  return await res.json();
}

18
frontend/index.html Normal file
View File

@@ -0,0 +1,18 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>InsightFlow MVP</title>
<!-- Minimal dark-theme styling for the Phase 1 MVP shell page. -->
<style>
body { font-family: sans-serif; background: #0a0a0a; color: #e0e0e0; padding: 40px; }
h1 { color: #00d4ff; }
.upload { border: 2px dashed #333; padding: 40px; text-align: center; border-radius: 8px; }
.entity { background: rgba(123,44,191,0.3); padding: 2px 6px; border-radius: 4px; }
</style>
</head>
<body>
<h1>InsightFlow</h1>
<p>Phase 1 MVP - 音频转录与实体提取</p>
<!-- Drop target; upload logic is defined in app.js (upload()). -->
<div class="upload">拖拽音频文件上传</div>
</body>
</html>