feat: replace Whisper with Deepgram ASR + speaker diarization
This commit is contained in:
205
backend/main.py
Normal file
205
backend/main.py
Normal file
@@ -0,0 +1,205 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
InsightFlow Backend - Phase 1 MVP with Deepgram
|
||||
ASR: Deepgram (Nova-3)
|
||||
Speaker Diarization: Deepgram
|
||||
LLM: Kimi API for entity extraction
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import httpx
|
||||
from fastapi import FastAPI, File, UploadFile, HTTPException, Form
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from pydantic import BaseModel
|
||||
from typing import List, Optional
|
||||
from datetime import datetime
|
||||
from deepgram import DeepgramClient, PrerecordedOptions, FileSource
|
||||
|
||||
# FastAPI application instance for the Phase 1 MVP.
app = FastAPI(title="InsightFlow", version="0.1.0")

# CORS: wide open for the MVP.
# NOTE(review): allow_origins=["*"] together with allow_credentials=True is
# rejected by browsers for credentialed requests — pin concrete origins
# before production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Models
class Entity(BaseModel):
    """A named entity extracted from the transcript by the LLM."""

    id: str    # synthetic id assigned locally, e.g. "ent_1"
    name: str  # surface form of the entity as it appears in the text
    type: str  # PROJECT/TECH/PERSON/ORG/OTHER — assigned by the LLM
    start: int  # character offset into full_text (LLM-reported; not verified here)
    end: int    # end character offset (LLM-reported; not verified here)
    definition: Optional[str] = None  # one-sentence definition from the LLM
|
||||
class TranscriptSegment(BaseModel):
    """One diarized utterance of the transcript."""

    start: float  # utterance start time, seconds
    end: float    # utterance end time, seconds
    text: str     # transcript text for this utterance
    speaker: Optional[str] = "Speaker A"  # "Speaker <n>" label from Deepgram diarization
||||
class AnalysisResult(BaseModel):
    """Complete analysis of one uploaded audio file."""

    transcript_id: str                 # random hex id; also the key in `storage`
    segments: List[TranscriptSegment]  # diarized utterances in order
    entities: List[Entity]             # LLM-extracted entities
    full_text: str                     # whole transcript of the first channel
    created_at: str                    # ISO-8601 creation timestamp
||||
# In-memory store: transcript_id -> AnalysisResult. Lost on process restart.
storage = {}

# API Keys — read once from the environment at import time; empty string when
# unset (checked before use in the functions below).
DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY", "")
KIMI_API_KEY = os.getenv("KIMI_API_KEY", "")
KIMI_BASE_URL = "https://api.kimi.com/coding"
|
||||
def transcribe_with_deepgram(audio_data: bytes, filename: str) -> dict:
    """Transcribe audio with Deepgram (Nova-3) and diarize speakers.

    Args:
        audio_data: raw bytes of the uploaded audio file.
        filename: original filename; used only to guess the MIME type.

    Returns:
        dict with "full_text" (str) and "segments" (list of dicts shaped
        like TranscriptSegment).

    Raises:
        HTTPException: 500 when DEEPGRAM_API_KEY is not configured.
    """
    if not DEEPGRAM_API_KEY:
        raise HTTPException(status_code=500, detail="DEEPGRAM_API_KEY not configured")

    deepgram = DeepgramClient(DEEPGRAM_API_KEY)

    # Guess MIME type from the extension, case-insensitively (".WAV" uploads
    # previously fell through to the MP3 branch). "audio/mpeg" is the
    # registered MIME type for MP3 (RFC 3003); "audio/mp3" is non-standard.
    mimetype = "audio/wav" if filename.lower().endswith(".wav") else "audio/mpeg"
    payload: FileSource = {
        "buffer": audio_data,
        "mimetype": mimetype,
    }

    options = PrerecordedOptions(
        model="nova-3",
        language="zh",
        smart_format=True,
        diarize=True,     # speaker diarization
        utterances=True,  # required for the per-speaker segments built below
        punctuate=True,
        paragraphs=True,
    )

    response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)

    # Parse the result: full transcript of the first channel, guarding against
    # an empty channels/alternatives list (previously alternatives[0] could
    # raise IndexError on a channel with no alternatives).
    result = response.results
    full_text = ""
    if result.channels and result.channels[0].alternatives:
        full_text = result.channels[0].alternatives[0].transcript

    # Collect diarized utterances as TranscriptSegment-shaped dicts.
    segments = []
    if result.utterances:
        for u in result.utterances:
            segments.append({
                "start": u.start,
                "end": u.end,
                "text": u.transcript,
                "speaker": f"Speaker {u.speaker}",
            })

    return {
        "full_text": full_text,
        "segments": segments,
    }
|
||||
def extract_entities_with_llm(text: str) -> List[Entity]:
    """Extract key entities (terms, projects, people, ...) via the Kimi chat API.

    Best-effort by design: returns [] when the API key is missing, the input
    is empty, or the API call / response parsing fails.
    """
    if not KIMI_API_KEY or not text:
        return []

    # Truncate to 3000 chars to stay within the model's context limit.
    # (This note used to live *inside* the f-string as "# 限制长度避免超限",
    # so it was sent to the LLM as part of the prompt — now removed.)
    prompt = f"""请从以下会议文本中提取关键实体(专有名词、项目名、技术术语、人名等),并以 JSON 格式返回:

文本:{text[:3000]}

要求:
1. 每个实体包含:name(名称), type(类型: PROJECT/TECH/PERSON/ORG/OTHER), start(起始字符位置), end(结束字符位置), definition(一句话定义)
2. 只返回 JSON 数组,不要其他内容
3. 确保 start/end 是字符在文本中的位置

示例输出:
[
{{"name": "Project Alpha", "type": "PROJECT", "start": 23, "end": 35, "definition": "Q3季度的核心项目"}},
{{"name": "K8s", "type": "TECH", "start": 37, "end": 40, "definition": "Kubernetes的缩写"}}
]
"""

    try:
        response = httpx.post(
            f"{KIMI_BASE_URL}/v1/chat/completions",
            headers={"Authorization": f"Bearer {KIMI_API_KEY}", "Content-Type": "application/json"},
            json={
                "model": "k2p5",
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0.1  # near-deterministic extraction
            },
            timeout=60.0
        )
        response.raise_for_status()
        result = response.json()
        content = result["choices"][0]["message"]["content"]

        # Pull the JSON array out of the reply (models often wrap it in prose
        # or code fences). Greedy match from first '[' to last ']' so a ']'
        # inside a definition string no longer truncates the array (the old
        # non-greedy r'\[.*?\]' stopped at the first ']').
        import re
        json_match = re.search(r'\[.*\]', content, re.DOTALL)
        if json_match:
            entities_data = json.loads(json_match.group())
            entities = []
            for i, e in enumerate(entities_data):
                entities.append(Entity(
                    id=f"ent_{i+1}",
                    name=e["name"],
                    type=e.get("type", "OTHER"),
                    # int() coercion: LLMs occasionally return offsets as strings.
                    start=int(e["start"]),
                    end=int(e["end"]),
                    definition=e.get("definition", "")
                ))
            return entities
    except Exception as e:
        # Deliberate best-effort: extraction failure must not fail the upload.
        print(f"LLM extraction failed: {e}")

    return []
|
||||
@app.post("/api/v1/upload", response_model=AnalysisResult)
async def upload_audio(file: UploadFile = File(...)):
    """Upload an audio file, transcribe + diarize it, and extract entities.

    Stores the AnalysisResult in memory; it can be re-fetched later via
    GET /api/v1/transcripts/{transcript_id}.

    Raises:
        HTTPException: 400 for an empty upload; 500 when Deepgram is not
        configured (propagated from transcribe_with_deepgram).
    """
    content = await file.read()
    # Reject empty uploads early instead of sending them to Deepgram.
    if not content:
        raise HTTPException(status_code=400, detail="Uploaded file is empty")

    # Deepgram transcription + diarization
    print(f"Transcribing with Deepgram: {file.filename}")
    dg_result = transcribe_with_deepgram(content, file.filename)

    # Build segments; fall back to a single whole-text segment when
    # diarization produced no utterances.
    segments = [
        TranscriptSegment(**seg) for seg in dg_result["segments"]
    ] or [TranscriptSegment(start=0, end=0, text=dg_result["full_text"], speaker="Speaker A")]

    # LLM entity extraction (best-effort; returns [] on failure)
    print("Extracting entities with LLM...")
    entities = extract_entities_with_llm(dg_result["full_text"])

    analysis = AnalysisResult(
        transcript_id=os.urandom(8).hex(),  # random 16-hex-char id
        segments=segments,
        entities=entities,
        full_text=dg_result["full_text"],
        created_at=datetime.now().isoformat()
    )

    storage[analysis.transcript_id] = analysis
    print(f"Analysis complete: {analysis.transcript_id}, {len(entities)} entities found")
    return analysis
|
||||
@app.get("/api/v1/transcripts/{transcript_id}", response_model=AnalysisResult)
async def get_transcript(transcript_id: str):
    """Return a previously stored analysis by id, or 404 when unknown."""
    try:
        return storage[transcript_id]
    except KeyError:
        raise HTTPException(status_code=404, detail="Transcript not found")
|
||||
@app.get("/api/v1/transcripts")
async def list_transcripts():
    """Return every stored analysis, in insertion order."""
    return [analysis for analysis in storage.values()]
|
||||
# Serve frontend
# Mounted at "/" last, so the API routes registered above take precedence
# over static-file lookups.
app.mount("/", StaticFiles(directory="frontend", html=True), name="frontend")

if __name__ == "__main__":
    # Dev entry point: run the ASGI app directly on all interfaces.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
||||
Reference in New Issue
Block a user