feat: replace Whisper with Deepgram ASR + speaker diarization
This commit is contained in:
205
backend/main.py
Normal file
205
backend/main.py
Normal file
@@ -0,0 +1,205 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
InsightFlow Backend - Phase 1 MVP with Deepgram
|
||||
ASR: Deepgram (Nova-3)
|
||||
Speaker Diarization: Deepgram
|
||||
LLM: Kimi API for entity extraction
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import httpx
|
||||
from fastapi import FastAPI, File, UploadFile, HTTPException, Form
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from pydantic import BaseModel
|
||||
from typing import List, Optional
|
||||
from datetime import datetime
|
||||
from deepgram import DeepgramClient, PrerecordedOptions, FileSource
|
||||
|
||||
# FastAPI application instance for the Phase 1 MVP.
app = FastAPI(title="InsightFlow", version="0.1.0")

# CORS: wide open for the MVP.
# NOTE(review): allow_origins=["*"] together with allow_credentials=True is
# rejected by browsers for credentialed requests — pin concrete origins
# before production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Models
class Entity(BaseModel):
    """A named entity extracted from the transcript by the LLM."""

    id: str    # synthetic id assigned locally, e.g. "ent_1"
    name: str  # surface form of the entity as it appears in the text
    type: str  # PROJECT/TECH/PERSON/ORG/OTHER — assigned by the LLM
    start: int  # character offset into full_text (LLM-reported; not verified here)
    end: int    # end character offset (LLM-reported; not verified here)
    definition: Optional[str] = None  # one-sentence definition from the LLM
|
||||
class TranscriptSegment(BaseModel):
    """One diarized utterance of the transcript."""

    start: float  # utterance start time, seconds
    end: float    # utterance end time, seconds
    text: str     # transcript text for this utterance
    speaker: Optional[str] = "Speaker A"  # "Speaker <n>" label from Deepgram diarization
||||
class AnalysisResult(BaseModel):
    """Complete analysis of one uploaded audio file."""

    transcript_id: str                 # random hex id; also the key in `storage`
    segments: List[TranscriptSegment]  # diarized utterances in order
    entities: List[Entity]             # LLM-extracted entities
    full_text: str                     # whole transcript of the first channel
    created_at: str                    # ISO-8601 creation timestamp
||||
# In-memory store: transcript_id -> AnalysisResult. Lost on process restart.
storage = {}

# API Keys — read once from the environment at import time; empty string when
# unset (checked before use in the functions below).
DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY", "")
KIMI_API_KEY = os.getenv("KIMI_API_KEY", "")
KIMI_BASE_URL = "https://api.kimi.com/coding"
|
||||
def transcribe_with_deepgram(audio_data: bytes, filename: str) -> dict:
    """Transcribe audio with Deepgram (Nova-3) and diarize speakers.

    Args:
        audio_data: raw bytes of the uploaded audio file.
        filename: original filename; used only to guess the MIME type.

    Returns:
        dict with "full_text" (str) and "segments" (list of dicts shaped
        like TranscriptSegment).

    Raises:
        HTTPException: 500 when DEEPGRAM_API_KEY is not configured.
    """
    if not DEEPGRAM_API_KEY:
        raise HTTPException(status_code=500, detail="DEEPGRAM_API_KEY not configured")

    deepgram = DeepgramClient(DEEPGRAM_API_KEY)

    # Guess MIME type from the extension, case-insensitively (".WAV" uploads
    # previously fell through to the MP3 branch). "audio/mpeg" is the
    # registered MIME type for MP3 (RFC 3003); "audio/mp3" is non-standard.
    mimetype = "audio/wav" if filename.lower().endswith(".wav") else "audio/mpeg"
    payload: FileSource = {
        "buffer": audio_data,
        "mimetype": mimetype,
    }

    options = PrerecordedOptions(
        model="nova-3",
        language="zh",
        smart_format=True,
        diarize=True,     # speaker diarization
        utterances=True,  # required for the per-speaker segments built below
        punctuate=True,
        paragraphs=True,
    )

    response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)

    # Parse the result: full transcript of the first channel, guarding against
    # an empty channels/alternatives list (previously alternatives[0] could
    # raise IndexError on a channel with no alternatives).
    result = response.results
    full_text = ""
    if result.channels and result.channels[0].alternatives:
        full_text = result.channels[0].alternatives[0].transcript

    # Collect diarized utterances as TranscriptSegment-shaped dicts.
    segments = []
    if result.utterances:
        for u in result.utterances:
            segments.append({
                "start": u.start,
                "end": u.end,
                "text": u.transcript,
                "speaker": f"Speaker {u.speaker}",
            })

    return {
        "full_text": full_text,
        "segments": segments,
    }
|
||||
def extract_entities_with_llm(text: str) -> List[Entity]:
    """Extract key entities (terms, projects, people, ...) via the Kimi chat API.

    Best-effort by design: returns [] when the API key is missing, the input
    is empty, or the API call / response parsing fails.
    """
    if not KIMI_API_KEY or not text:
        return []

    # Truncate to 3000 chars to stay within the model's context limit.
    # (This note used to live *inside* the f-string as "# 限制长度避免超限",
    # so it was sent to the LLM as part of the prompt — now removed.)
    prompt = f"""请从以下会议文本中提取关键实体(专有名词、项目名、技术术语、人名等),并以 JSON 格式返回:

文本:{text[:3000]}

要求:
1. 每个实体包含:name(名称), type(类型: PROJECT/TECH/PERSON/ORG/OTHER), start(起始字符位置), end(结束字符位置), definition(一句话定义)
2. 只返回 JSON 数组,不要其他内容
3. 确保 start/end 是字符在文本中的位置

示例输出:
[
{{"name": "Project Alpha", "type": "PROJECT", "start": 23, "end": 35, "definition": "Q3季度的核心项目"}},
{{"name": "K8s", "type": "TECH", "start": 37, "end": 40, "definition": "Kubernetes的缩写"}}
]
"""

    try:
        response = httpx.post(
            f"{KIMI_BASE_URL}/v1/chat/completions",
            headers={"Authorization": f"Bearer {KIMI_API_KEY}", "Content-Type": "application/json"},
            json={
                "model": "k2p5",
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0.1  # near-deterministic extraction
            },
            timeout=60.0
        )
        response.raise_for_status()
        result = response.json()
        content = result["choices"][0]["message"]["content"]

        # Pull the JSON array out of the reply (models often wrap it in prose
        # or code fences). Greedy match from first '[' to last ']' so a ']'
        # inside a definition string no longer truncates the array (the old
        # non-greedy r'\[.*?\]' stopped at the first ']').
        import re
        json_match = re.search(r'\[.*\]', content, re.DOTALL)
        if json_match:
            entities_data = json.loads(json_match.group())
            entities = []
            for i, e in enumerate(entities_data):
                entities.append(Entity(
                    id=f"ent_{i+1}",
                    name=e["name"],
                    type=e.get("type", "OTHER"),
                    # int() coercion: LLMs occasionally return offsets as strings.
                    start=int(e["start"]),
                    end=int(e["end"]),
                    definition=e.get("definition", "")
                ))
            return entities
    except Exception as e:
        # Deliberate best-effort: extraction failure must not fail the upload.
        print(f"LLM extraction failed: {e}")

    return []
|
||||
@app.post("/api/v1/upload", response_model=AnalysisResult)
async def upload_audio(file: UploadFile = File(...)):
    """Upload an audio file, transcribe + diarize it, and extract entities.

    Stores the AnalysisResult in memory; it can be re-fetched later via
    GET /api/v1/transcripts/{transcript_id}.

    Raises:
        HTTPException: 400 for an empty upload; 500 when Deepgram is not
        configured (propagated from transcribe_with_deepgram).
    """
    content = await file.read()
    # Reject empty uploads early instead of sending them to Deepgram.
    if not content:
        raise HTTPException(status_code=400, detail="Uploaded file is empty")

    # Deepgram transcription + diarization
    print(f"Transcribing with Deepgram: {file.filename}")
    dg_result = transcribe_with_deepgram(content, file.filename)

    # Build segments; fall back to a single whole-text segment when
    # diarization produced no utterances.
    segments = [
        TranscriptSegment(**seg) for seg in dg_result["segments"]
    ] or [TranscriptSegment(start=0, end=0, text=dg_result["full_text"], speaker="Speaker A")]

    # LLM entity extraction (best-effort; returns [] on failure)
    print("Extracting entities with LLM...")
    entities = extract_entities_with_llm(dg_result["full_text"])

    analysis = AnalysisResult(
        transcript_id=os.urandom(8).hex(),  # random 16-hex-char id
        segments=segments,
        entities=entities,
        full_text=dg_result["full_text"],
        created_at=datetime.now().isoformat()
    )

    storage[analysis.transcript_id] = analysis
    print(f"Analysis complete: {analysis.transcript_id}, {len(entities)} entities found")
    return analysis
|
||||
@app.get("/api/v1/transcripts/{transcript_id}", response_model=AnalysisResult)
async def get_transcript(transcript_id: str):
    """Return a previously stored analysis by id, or 404 when unknown."""
    try:
        return storage[transcript_id]
    except KeyError:
        raise HTTPException(status_code=404, detail="Transcript not found")
|
||||
@app.get("/api/v1/transcripts")
async def list_transcripts():
    """Return every stored analysis, in insertion order."""
    return [analysis for analysis in storage.values()]
|
||||
# Serve frontend
# Mounted at "/" last, so the API routes registered above take precedence
# over static-file lookups.
app.mount("/", StaticFiles(directory="frontend", html=True), name="frontend")

if __name__ == "__main__":
    # Dev entry point: run the ASGI app directly on all interfaces.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
||||
Reference in New Issue
Block a user