#!/usr/bin/env python3
"""
InsightFlow Backend - Phase 1 MVP with Deepgram

ASR: Deepgram (Nova-3)
Speaker Diarization: Deepgram
LLM: Kimi API for entity extraction
"""

import json
import os
import re
from datetime import datetime
from typing import List, Optional

import httpx
from deepgram import DeepgramClient, FileSource, PrerecordedOptions
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel

app = FastAPI(title="InsightFlow", version="0.1.0")
|
||
|
||
app.add_middleware(
|
||
CORSMiddleware,
|
||
allow_origins=["*"],
|
||
allow_credentials=True,
|
||
allow_methods=["*"],
|
||
allow_headers=["*"],
|
||
)
|
||
|
||
# Models
|
||
class Entity(BaseModel):
|
||
id: str
|
||
name: str
|
||
type: str
|
||
start: int
|
||
end: int
|
||
definition: Optional[str] = None
|
||
|
||
class TranscriptSegment(BaseModel):
|
||
start: float
|
||
end: float
|
||
text: str
|
||
speaker: Optional[str] = "Speaker A"
|
||
|
||
class AnalysisResult(BaseModel):
|
||
transcript_id: str
|
||
segments: List[TranscriptSegment]
|
||
entities: List[Entity]
|
||
full_text: str
|
||
created_at: str
|
||
|
||
storage = {}
|
||
|
||
# API Keys
|
||
DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY", "")
|
||
KIMI_API_KEY = os.getenv("KIMI_API_KEY", "")
|
||
KIMI_BASE_URL = "https://api.kimi.com/coding"
|
||
|
||
def transcribe_with_deepgram(audio_data: bytes, filename: str) -> dict:
|
||
"""使用 Deepgram 进行转录和说话人分离"""
|
||
if not DEEPGRAM_API_KEY:
|
||
raise HTTPException(status_code=500, detail="DEEPGRAM_API_KEY not configured")
|
||
|
||
deepgram = DeepgramClient(DEEPGRAM_API_KEY)
|
||
|
||
payload: FileSource = {
|
||
"buffer": audio_data,
|
||
"mimetype": "audio/wav" if filename.endswith(".wav") else "audio/mp3"
|
||
}
|
||
|
||
options = PrerecordedOptions(
|
||
model="nova-3",
|
||
language="zh",
|
||
smart_format=True,
|
||
diarize=True, # 说话人分离
|
||
utterances=True,
|
||
punctuate=True,
|
||
paragraphs=True
|
||
)
|
||
|
||
response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)
|
||
|
||
# 解析结果
|
||
result = response.results
|
||
full_text = result.channels[0].alternatives[0].transcript if result.channels else ""
|
||
|
||
# 提取带说话人的片段
|
||
segments = []
|
||
if result.utterances:
|
||
for u in result.utterances:
|
||
segments.append({
|
||
"start": u.start,
|
||
"end": u.end,
|
||
"text": u.transcript,
|
||
"speaker": f"Speaker {u.speaker}"
|
||
})
|
||
|
||
return {
|
||
"full_text": full_text,
|
||
"segments": segments
|
||
}
|
||
|
||
def extract_entities_with_llm(text: str) -> List[Entity]:
|
||
"""使用 Kimi API 提取实体"""
|
||
if not KIMI_API_KEY or not text:
|
||
return []
|
||
|
||
prompt = f"""请从以下会议文本中提取关键实体(专有名词、项目名、技术术语、人名等),并以 JSON 格式返回:
|
||
|
||
文本:{text[:3000]} # 限制长度避免超限
|
||
|
||
要求:
|
||
1. 每个实体包含:name(名称), type(类型: PROJECT/TECH/PERSON/ORG/OTHER), start(起始字符位置), end(结束字符位置), definition(一句话定义)
|
||
2. 只返回 JSON 数组,不要其他内容
|
||
3. 确保 start/end 是字符在文本中的位置
|
||
|
||
示例输出:
|
||
[
|
||
{{"name": "Project Alpha", "type": "PROJECT", "start": 23, "end": 35, "definition": "Q3季度的核心项目"}},
|
||
{{"name": "K8s", "type": "TECH", "start": 37, "end": 40, "definition": "Kubernetes的缩写"}}
|
||
]
|
||
"""
|
||
|
||
try:
|
||
response = httpx.post(
|
||
f"{KIMI_BASE_URL}/v1/chat/completions",
|
||
headers={"Authorization": f"Bearer {KIMI_API_KEY}", "Content-Type": "application/json"},
|
||
json={
|
||
"model": "k2p5",
|
||
"messages": [{"role": "user", "content": prompt}],
|
||
"temperature": 0.1
|
||
},
|
||
timeout=60.0
|
||
)
|
||
response.raise_for_status()
|
||
result = response.json()
|
||
content = result["choices"][0]["message"]["content"]
|
||
|
||
# 解析 JSON
|
||
import re
|
||
json_match = re.search(r'\[.*?\]', content, re.DOTALL)
|
||
if json_match:
|
||
entities_data = json.loads(json_match.group())
|
||
entities = []
|
||
for i, e in enumerate(entities_data):
|
||
entities.append(Entity(
|
||
id=f"ent_{i+1}",
|
||
name=e["name"],
|
||
type=e.get("type", "OTHER"),
|
||
start=e["start"],
|
||
end=e["end"],
|
||
definition=e.get("definition", "")
|
||
))
|
||
return entities
|
||
except Exception as e:
|
||
print(f"LLM extraction failed: {e}")
|
||
|
||
return []
|
||
|
||
@app.post("/api/v1/upload", response_model=AnalysisResult)
|
||
async def upload_audio(file: UploadFile = File(...)):
|
||
"""上传音频并分析"""
|
||
content = await file.read()
|
||
|
||
# Deepgram 转录
|
||
print(f"Transcribing with Deepgram: {file.filename}")
|
||
dg_result = transcribe_with_deepgram(content, file.filename)
|
||
|
||
# 构建片段
|
||
segments = [
|
||
TranscriptSegment(**seg) for seg in dg_result["segments"]
|
||
] or [TranscriptSegment(start=0, end=0, text=dg_result["full_text"], speaker="Speaker A")]
|
||
|
||
# LLM 实体提取
|
||
print("Extracting entities with LLM...")
|
||
entities = extract_entities_with_llm(dg_result["full_text"])
|
||
|
||
analysis = AnalysisResult(
|
||
transcript_id=os.urandom(8).hex(),
|
||
segments=segments,
|
||
entities=entities,
|
||
full_text=dg_result["full_text"],
|
||
created_at=datetime.now().isoformat()
|
||
)
|
||
|
||
storage[analysis.transcript_id] = analysis
|
||
print(f"Analysis complete: {analysis.transcript_id}, {len(entities)} entities found")
|
||
return analysis
|
||
|
||
@app.get("/api/v1/transcripts/{transcript_id}", response_model=AnalysisResult)
|
||
async def get_transcript(transcript_id: str):
|
||
if transcript_id not in storage:
|
||
raise HTTPException(status_code=404, detail="Transcript not found")
|
||
return storage[transcript_id]
|
||
|
||
@app.get("/api/v1/transcripts")
|
||
async def list_transcripts():
|
||
return list(storage.values())
|
||
|
||
# Serve frontend
|
||
app.mount("/", StaticFiles(directory="frontend", html=True), name="frontend")
|
||
|
||
if __name__ == "__main__":
|
||
import uvicorn
|
||
uvicorn.run(app, host="0.0.0.0", port=8000)
|