Files
insightflow/backend/main.py

206 lines
6.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
InsightFlow Backend - Phase 1 MVP with Deepgram
ASR: Deepgram (Nova-3)
Speaker Diarization: Deepgram
LLM: Kimi API for entity extraction
"""
import os
import json
import httpx
from fastapi import FastAPI, File, UploadFile, HTTPException, Form
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel
from typing import List, Optional
from datetime import datetime
from deepgram import DeepgramClient, PrerecordedOptions, FileSource
app = FastAPI(title="InsightFlow", version="0.1.0")

# CORS is wide open for the Phase-1 MVP so any local frontend can call the API.
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is not
# honored by browsers (the CORS spec forbids wildcard origins with credentials)
# — restrict origins before production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Models
class Entity(BaseModel):
    """An entity extracted from the meeting transcript by the LLM.

    ``start``/``end`` are character offsets into the full transcript text as
    reported by the model (not independently verified against the text).
    """
    id: str                           # sequential identifier, e.g. "ent_1"
    name: str                         # entity surface form
    type: str                         # PROJECT/TECH/PERSON/ORG/OTHER (per the LLM prompt)
    start: int                        # start character offset in the full text
    end: int                          # end character offset in the full text
    definition: Optional[str] = None  # one-sentence definition supplied by the LLM
class TranscriptSegment(BaseModel):
    """One diarized utterance of the transcript."""
    start: float                          # utterance start time, seconds
    end: float                            # utterance end time, seconds
    text: str                             # transcribed text of this utterance
    speaker: Optional[str] = "Speaker A"  # diarization label, e.g. "Speaker 0"
class AnalysisResult(BaseModel):
    """Complete analysis of one uploaded audio file."""
    transcript_id: str                 # random hex id; also the key in `storage`
    segments: List[TranscriptSegment]  # diarized utterances (or one whole-file fallback)
    entities: List[Entity]             # entities extracted by the LLM
    full_text: str                     # entire transcript as a single string
    created_at: str                    # ISO-8601 creation timestamp (local time)
# In-memory store: transcript_id -> AnalysisResult.
# Not persisted — contents are lost on process restart.
storage = {}

# API Keys (read from the environment; empty string when unset, which the
# handlers below treat as "not configured")
DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY", "")
KIMI_API_KEY = os.getenv("KIMI_API_KEY", "")
# NOTE(review): base URL for the Kimi chat-completions API — confirm this
# endpoint path is correct for the deployed Kimi service.
KIMI_BASE_URL = "https://api.kimi.com/coding"
def transcribe_with_deepgram(audio_data: bytes, filename: str) -> dict:
    """Transcribe audio with Deepgram (Nova-3) and attach speaker labels.

    Args:
        audio_data: Raw audio bytes as uploaded by the client.
        filename: Original file name; only its extension is inspected to pick
            a MIME-type hint for Deepgram. May be None/empty (FastAPI's
            UploadFile.filename is optional) — we then fall back to the mp3 hint.

    Returns:
        dict with:
            "full_text": the full transcript (empty string when Deepgram
                returned no channels),
            "segments": list of dicts with start/end (seconds), text, and a
                "Speaker N" label from Deepgram's diarization.

    Raises:
        HTTPException: 500 when DEEPGRAM_API_KEY is not configured.
    """
    if not DEEPGRAM_API_KEY:
        raise HTTPException(status_code=500, detail="DEEPGRAM_API_KEY not configured")

    deepgram = DeepgramClient(DEEPGRAM_API_KEY)

    # Guard against a missing filename (would previously raise AttributeError)
    # and compare case-insensitively so ".WAV" uploads get the correct hint.
    is_wav = (filename or "").lower().endswith(".wav")
    payload: FileSource = {
        "buffer": audio_data,
        "mimetype": "audio/wav" if is_wav else "audio/mp3",
    }

    options = PrerecordedOptions(
        model="nova-3",
        language="zh",
        smart_format=True,
        diarize=True,        # speaker diarization
        utterances=True,     # needed for per-speaker timed segments below
        punctuate=True,
        paragraphs=True,
    )

    response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)

    # The full transcript lives on the first channel's top alternative.
    result = response.results
    full_text = result.channels[0].alternatives[0].transcript if result.channels else ""

    # Map Deepgram utterances (per-speaker, timed) into plain dicts.
    segments = []
    if result.utterances:
        for u in result.utterances:
            segments.append({
                "start": u.start,
                "end": u.end,
                "text": u.transcript,
                "speaker": f"Speaker {u.speaker}",
            })

    return {
        "full_text": full_text,
        "segments": segments,
    }
def extract_entities_with_llm(text: str) -> List[Entity]:
    """Extract key entities from meeting text via the Kimi chat API.

    Args:
        text: Transcript text; only the first 3000 characters are sent to
            stay within the model's input limit.

    Returns:
        A list of Entity objects, or [] when the API key is missing, the
        text is empty, the request fails, or the model reply contains no
        parseable JSON array. Never returns None.
    """
    if not KIMI_API_KEY or not text:
        return []

    # The truncation comment is kept OUT of the f-string: the original
    # embedded "# 限制长度避免超限" inside the prompt and leaked it to the LLM.
    prompt = f"""请从以下会议文本中提取关键实体(专有名词、项目名、技术术语、人名等),并以 JSON 格式返回:
文本:{text[:3000]}
要求:
1. 每个实体包含name(名称), type(类型: PROJECT/TECH/PERSON/ORG/OTHER), start(起始字符位置), end(结束字符位置), definition(一句话定义)
2. 只返回 JSON 数组,不要其他内容
3. 确保 start/end 是字符在文本中的位置
示例输出:
[
  {{"name": "Project Alpha", "type": "PROJECT", "start": 23, "end": 35, "definition": "Q3季度的核心项目"}},
  {{"name": "K8s", "type": "TECH", "start": 37, "end": 40, "definition": "Kubernetes的缩写"}}
]
"""
    try:
        response = httpx.post(
            f"{KIMI_BASE_URL}/v1/chat/completions",
            headers={"Authorization": f"Bearer {KIMI_API_KEY}", "Content-Type": "application/json"},
            json={
                "model": "k2p5",
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0.1,
            },
            timeout=60.0,
        )
        response.raise_for_status()
        result = response.json()
        content = result["choices"][0]["message"]["content"]

        # Pull the JSON array out of the reply. Greedy match (first '[' to
        # last ']'): the original non-greedy r'\[.*?\]' stopped at the first
        # ']' and would truncate any array containing nested brackets.
        import re
        json_match = re.search(r'\[.*\]', content, re.DOTALL)
        if not json_match:
            # BUG FIX: previously this path fell off the end of the try
            # block and the function returned None instead of [].
            return []

        entities_data = json.loads(json_match.group())
        return [
            Entity(
                id=f"ent_{i+1}",
                name=e["name"],
                type=e.get("type", "OTHER"),
                start=e["start"],
                end=e["end"],
                definition=e.get("definition", ""),
            )
            for i, e in enumerate(entities_data)
        ]
    except Exception as e:
        # Best-effort: entity extraction failures must not break the upload flow.
        print(f"LLM extraction failed: {e}")
        return []
@app.post("/api/v1/upload", response_model=AnalysisResult)
async def upload_audio(file: UploadFile = File(...)):
    """Upload an audio file, transcribe it, and extract entities.

    Stores the AnalysisResult in the in-memory `storage` dict keyed by
    transcript_id and returns it.

    Raises:
        HTTPException: 400 when the uploaded file is empty; 500 when the
            Deepgram key is missing (propagated from transcribe_with_deepgram).
    """
    content = await file.read()
    if not content:
        # Reject empty uploads early instead of sending them to Deepgram.
        raise HTTPException(status_code=400, detail="Uploaded file is empty")

    # Transcription + speaker diarization via Deepgram.
    print(f"Transcribing with Deepgram: {file.filename}")
    dg_result = transcribe_with_deepgram(content, file.filename)

    # Build segments; when Deepgram returned no utterances, fall back to a
    # single whole-file segment so the response is never segment-less.
    segments = [TranscriptSegment(**seg) for seg in dg_result["segments"]]
    if not segments:
        segments = [TranscriptSegment(start=0, end=0, text=dg_result["full_text"], speaker="Speaker A")]

    # Entity extraction via the Kimi LLM (best-effort; may return []).
    print("Extracting entities with LLM...")
    entities = extract_entities_with_llm(dg_result["full_text"])

    analysis = AnalysisResult(
        transcript_id=os.urandom(8).hex(),
        segments=segments,
        entities=entities,
        full_text=dg_result["full_text"],
        created_at=datetime.now().isoformat(),
    )
    storage[analysis.transcript_id] = analysis
    print(f"Analysis complete: {analysis.transcript_id}, {len(entities)} entities found")
    return analysis
@app.get("/api/v1/transcripts/{transcript_id}", response_model=AnalysisResult)
async def get_transcript(transcript_id: str):
    """Return a previously stored analysis, or 404 when the id is unknown."""
    try:
        return storage[transcript_id]
    except KeyError:
        raise HTTPException(status_code=404, detail="Transcript not found")
@app.get("/api/v1/transcripts")
async def list_transcripts():
    """Return every stored analysis result."""
    return [analysis for analysis in storage.values()]
# Serve frontend
# Mounted at "/" LAST so the API routes registered above take precedence
# over the static file handler.
app.mount("/", StaticFiles(directory="frontend", html=True), name="frontend")

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)