feat: replace Whisper with Deepgram ASR + speaker diarization
This commit is contained in:
20
Dockerfile
Normal file
20
Dockerfile
Normal file
@@ -0,0 +1,20 @@
|
||||
FROM python:3.11-slim

# Unbuffered stdout/stderr so logs reach `docker logs` immediately;
# skip .pyc files to keep the image slim.
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1

WORKDIR /app

# Install system deps (ffmpeg for audio decoding).
# --no-install-recommends keeps the layer small; the apt cache is removed
# in the same RUN so it never lands in the final image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    && rm -rf /var/lib/apt/lists/*

# Install Python deps first so this layer stays cached while code churns.
COPY backend/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY backend/ ./backend/
COPY frontend/ ./frontend/

EXPOSE 8000

CMD ["python", "backend/main.py"]
|
||||
28
README.md
28
README.md
@@ -1,3 +1,27 @@
|
||||
# insightflow
|
||||
# InsightFlow
|
||||
|
||||
音频与文档的领域知识构建平台 - 将会议录音转化为结构化知识图谱
|
||||
音频与文档的领域知识构建平台
|
||||
|
||||
## 产品定位
|
||||
将会议录音和文档转化为结构化的知识图谱,通过人机回圈(Human-in-the-Loop)实现知识持续生长。
|
||||
|
||||
## 核心特性
|
||||
- 🎙️ ASR 语音识别 + 热词注入
|
||||
- 🧠 LLM 实体抽取与解释
|
||||
- 🔗 双视图联动(文档视图 + 图谱视图)
|
||||
- 📈 知识生长(多文件实体对齐)
|
||||
|
||||
## 技术栈
|
||||
- 前端: Next.js + Tailwind
|
||||
- 后端: Python (FastAPI)
|
||||
- 数据库: MySQL + Neo4j
|
||||
- ASR: Deepgram (Nova-3)
|
||||
- LLM: OpenAI / Kimi
|
||||
|
||||
## 开发阶段
|
||||
- [ ] Phase 1: 骨架与单体分析 (MVP)
|
||||
- [ ] Phase 2: 交互与纠错工作台
|
||||
- [ ] Phase 3: 记忆与生长
|
||||
|
||||
## 文档
|
||||
- [PRD v2.0](docs/PRD-v2.0.md)
|
||||
|
||||
205
backend/main.py
Normal file
205
backend/main.py
Normal file
@@ -0,0 +1,205 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
InsightFlow Backend - Phase 1 MVP with Deepgram
|
||||
ASR: Deepgram (Nova-3)
|
||||
Speaker Diarization: Deepgram
|
||||
LLM: Kimi API for entity extraction
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import httpx
|
||||
from fastapi import FastAPI, File, UploadFile, HTTPException, Form
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from pydantic import BaseModel
|
||||
from typing import List, Optional
|
||||
from datetime import datetime
|
||||
from deepgram import DeepgramClient, PrerecordedOptions, FileSource
|
||||
|
||||
app = FastAPI(title="InsightFlow", version="0.1.0")

# NOTE(review): wildcard origins combined with allow_credentials=True is the
# fully permissive dev setup — effectively any site may make credentialed
# requests. Tighten allow_origins before exposing this publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
||||
|
||||
# Models
|
||||
class Entity(BaseModel):
    """A named entity extracted from the meeting transcript by the LLM."""
    id: str  # synthetic id assigned at extraction time (e.g. "ent_1")
    name: str  # surface form of the entity
    type: str  # PROJECT/TECH/PERSON/ORG/OTHER per the extraction prompt
    start: int  # character offset of the entity in the source text (LLM-reported)
    end: int  # end character offset (LLM-reported; may be approximate)
    definition: Optional[str] = None  # one-sentence explanation from the LLM
|
||||
|
||||
class TranscriptSegment(BaseModel):
    """One diarized utterance from the ASR result."""
    start: float  # utterance start time in seconds
    end: float  # utterance end time in seconds
    text: str  # transcript text of the utterance
    speaker: Optional[str] = "Speaker A"  # diarization label; default used when no utterances exist
|
||||
|
||||
class AnalysisResult(BaseModel):
    """Complete analysis of one uploaded audio file (transcript + entities)."""
    transcript_id: str  # random hex id; also the key into the in-memory store
    segments: List[TranscriptSegment]  # diarized utterances in time order
    entities: List[Entity]  # entities extracted from full_text
    full_text: str  # whole transcript as a single string
    created_at: str  # ISO-8601 timestamp (naive local time)
|
||||
|
||||
# In-memory store: transcript_id -> AnalysisResult. Not persisted — all
# results are lost on process restart (acceptable for the Phase 1 MVP).
storage = {}

# API keys read from the environment; empty string means "not configured".
DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY", "")
KIMI_API_KEY = os.getenv("KIMI_API_KEY", "")
# NOTE(review): confirm this base URL — "/v1/chat/completions" is appended to
# it below, so it must be the root of a chat-completions-compatible API.
KIMI_BASE_URL = "https://api.kimi.com/coding"
|
||||
|
||||
def transcribe_with_deepgram(audio_data: bytes, filename: str) -> dict:
    """Transcribe audio with Deepgram and split it into speaker-labelled segments.

    Args:
        audio_data: Raw audio bytes as uploaded.
        filename: Original filename; only its extension is used to pick a MIME type.
            May be None/empty (FastAPI's UploadFile.filename is optional).

    Returns:
        dict with "full_text" (whole transcript, empty string when Deepgram
        returned no channels) and "segments" (list of dicts with start/end
        seconds, text, and a "Speaker N" label; empty when no utterances).

    Raises:
        HTTPException: 500 when DEEPGRAM_API_KEY is not configured.
    """
    if not DEEPGRAM_API_KEY:
        raise HTTPException(status_code=500, detail="DEEPGRAM_API_KEY not configured")

    deepgram = DeepgramClient(DEEPGRAM_API_KEY)

    # Pick the MIME type from the extension, case-insensitively. Guard against
    # a missing filename (the original crashed on None). Non-WAV files are
    # labelled audio/mpeg — the registered MIME type for MP3.
    lower_name = (filename or "").lower()
    payload: FileSource = {
        "buffer": audio_data,
        "mimetype": "audio/wav" if lower_name.endswith(".wav") else "audio/mpeg"
    }

    options = PrerecordedOptions(
        model="nova-3",
        language="zh",
        smart_format=True,
        diarize=True,      # speaker diarization
        utterances=True,   # required for the per-utterance segments below
        punctuate=True,
        paragraphs=True
    )

    response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)

    # The full transcript lives on the first channel's best alternative.
    result = response.results
    full_text = result.channels[0].alternatives[0].transcript if result.channels else ""

    # One segment per utterance, tagged with its diarized speaker index.
    segments = []
    if result.utterances:
        for u in result.utterances:
            segments.append({
                "start": u.start,
                "end": u.end,
                "text": u.transcript,
                "speaker": f"Speaker {u.speaker}"
            })

    return {
        "full_text": full_text,
        "segments": segments
    }
|
||||
|
||||
def extract_entities_with_llm(text: str) -> List[Entity]:
    """Extract key entities from meeting text via the Kimi chat-completions API.

    Best-effort: returns an empty list when the API key is missing, the text is
    empty, the reply contains no JSON array, or any request/parsing step fails —
    entity extraction must never break the upload flow.

    Args:
        text: Full transcript text; only the first 3000 characters are sent.

    Returns:
        List of Entity models with sequential ids ("ent_1", "ent_2", ...).
    """
    import re

    if not KIMI_API_KEY or not text:
        return []

    # Cap prompt size to stay within the model's context limit. (Previously
    # this note was accidentally embedded inside the prompt string itself and
    # sent to the LLM.)
    snippet = text[:3000]

    prompt = f"""请从以下会议文本中提取关键实体(专有名词、项目名、技术术语、人名等),并以 JSON 格式返回:

文本:{snippet}

要求:
1. 每个实体包含:name(名称), type(类型: PROJECT/TECH/PERSON/ORG/OTHER), start(起始字符位置), end(结束字符位置), definition(一句话定义)
2. 只返回 JSON 数组,不要其他内容
3. 确保 start/end 是字符在文本中的位置

示例输出:
[
  {{"name": "Project Alpha", "type": "PROJECT", "start": 23, "end": 35, "definition": "Q3季度的核心项目"}},
  {{"name": "K8s", "type": "TECH", "start": 37, "end": 40, "definition": "Kubernetes的缩写"}}
]
"""

    try:
        response = httpx.post(
            f"{KIMI_BASE_URL}/v1/chat/completions",
            headers={"Authorization": f"Bearer {KIMI_API_KEY}", "Content-Type": "application/json"},
            json={
                "model": "k2p5",
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0.1
            },
            timeout=60.0
        )
        response.raise_for_status()
        result = response.json()
        content = result["choices"][0]["message"]["content"]

        # Pull the outermost JSON array out of the reply (models often wrap it
        # in prose or code fences). Greedy match so a ']' inside a definition
        # string no longer truncates the array at the first closing bracket.
        json_match = re.search(r'\[.*\]', content, re.DOTALL)
        if json_match:
            entities_data = json.loads(json_match.group())
            entities = []
            for i, e in enumerate(entities_data):
                entities.append(Entity(
                    id=f"ent_{i+1}",
                    name=e["name"],
                    type=e.get("type", "OTHER"),
                    # Coerce to int: models sometimes emit offsets as strings.
                    start=int(e["start"]),
                    end=int(e["end"]),
                    definition=e.get("definition", "")
                ))
            return entities
    except Exception as e:
        # Best-effort: log and fall through to the empty result.
        print(f"LLM extraction failed: {e}")

    return []
|
||||
|
||||
@app.post("/api/v1/upload", response_model=AnalysisResult)
async def upload_audio(file: UploadFile = File(...)):
    """Accept an audio upload, transcribe it, extract entities, store and return the analysis."""
    audio_bytes = await file.read()

    # Speech-to-text with speaker diarization.
    print(f"Transcribing with Deepgram: {file.filename}")
    dg_result = transcribe_with_deepgram(audio_bytes, file.filename)

    # Build the diarized segments; when Deepgram returned no utterances, fall
    # back to a single segment carrying the whole transcript.
    segments = [TranscriptSegment(**seg) for seg in dg_result["segments"]]
    if not segments:
        segments = [TranscriptSegment(start=0, end=0, text=dg_result["full_text"], speaker="Speaker A")]

    # Entity extraction over the full transcript (best-effort; may be empty).
    print("Extracting entities with LLM...")
    entities = extract_entities_with_llm(dg_result["full_text"])

    analysis = AnalysisResult(
        transcript_id=os.urandom(8).hex(),
        segments=segments,
        entities=entities,
        full_text=dg_result["full_text"],
        created_at=datetime.now().isoformat(),
    )

    storage[analysis.transcript_id] = analysis
    print(f"Analysis complete: {analysis.transcript_id}, {len(entities)} entities found")
    return analysis
|
||||
|
||||
@app.get("/api/v1/transcripts/{transcript_id}", response_model=AnalysisResult)
async def get_transcript(transcript_id: str):
    """Return a previously computed analysis, or 404 when the id is unknown."""
    try:
        return storage[transcript_id]
    except KeyError:
        raise HTTPException(status_code=404, detail="Transcript not found")
|
||||
|
||||
@app.get("/api/v1/transcripts")
async def list_transcripts():
    """Return every stored analysis, in insertion order."""
    return [analysis for analysis in storage.values()]
|
||||
|
||||
# Serve frontend: mounted after the API routes are registered so "/api/v1/*"
# takes precedence; html=True serves index.html for "/".
app.mount("/", StaticFiles(directory="frontend", html=True), name="frontend")

if __name__ == "__main__":
    import uvicorn
    # Bind on all interfaces, matching the Dockerfile's EXPOSE 8000.
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
||||
7
backend/requirements.txt
Normal file
7
backend/requirements.txt
Normal file
@@ -0,0 +1,7 @@
|
||||
# Web framework + ASGI server
fastapi==0.115.0
uvicorn[standard]==0.32.0
# multipart/form-data parsing for file uploads
python-multipart==0.0.17
# ASR: transcription + speaker diarization
deepgram-sdk==3.7.0
# HTTP client used for the Kimi LLM API
httpx==0.27.2
pydantic==2.9.2
python-dotenv==1.0.1
|
||||
13
docker-compose.yml
Normal file
13
docker-compose.yml
Normal file
@@ -0,0 +1,13 @@
|
||||
# NOTE(review): the top-level `version` key is obsolete in Compose V2 and is
# ignored (with a warning) by current `docker compose` — safe to drop.
version: '3.8'

services:
  insightflow:
    build: .
    ports:
      - "18000:8000"  # host 18000 -> container 8000 (uvicorn)
    environment:
      # Forwarded from the host environment / .env file
      - DEEPGRAM_API_KEY=${DEEPGRAM_API_KEY}
      - KIMI_API_KEY=${KIMI_API_KEY}
    volumes:
      - ./data:/app/data  # NOTE(review): backend currently stores results in memory only — confirm this mount is used
    restart: unless-stopped
|
||||
0
frontend/EOF
Normal file
0
frontend/EOF
Normal file
10
frontend/app.js
Normal file
10
frontend/app.js
Normal file
@@ -0,0 +1,10 @@
|
||||
const API_BASE = '/api/v1';

/**
 * Upload an audio file to the backend for transcription + entity extraction.
 * @param {File} file - audio file picked/dropped by the user
 * @returns {Promise<Object>} the AnalysisResult JSON from the server
 * @throws {Error} when the server responds with a non-2xx status
 */
async function upload(file) {
    const formData = new FormData();
    formData.append('file', file);
    const res = await fetch(API_BASE + '/upload', {
        method: 'POST',
        body: formData
    });
    // Surface HTTP errors instead of silently parsing the error payload
    // as if it were a successful analysis result.
    if (!res.ok) {
        throw new Error(`Upload failed: ${res.status} ${res.statusText}`);
    }
    return await res.json();
}
|
||||
18
frontend/index.html
Normal file
18
frontend/index.html
Normal file
@@ -0,0 +1,18 @@
|
||||
<!DOCTYPE html>
<!-- InsightFlow Phase 1 MVP shell, served statically by the FastAPI backend.
     NOTE(review): app.js (which defines upload()) is not referenced by any
     <script> tag here, and the drop zone below has no event wiring — confirm
     whether the script include is still pending. -->
<html>
<head>
    <meta charset="UTF-8">
    <title>InsightFlow MVP</title>
    <style>
        body { font-family: sans-serif; background: #0a0a0a; color: #e0e0e0; padding: 40px; }
        h1 { color: #00d4ff; }
        .upload { border: 2px dashed #333; padding: 40px; text-align: center; border-radius: 8px; }
        /* highlight style for extracted entities rendered into the transcript */
        .entity { background: rgba(123,44,191,0.3); padding: 2px 6px; border-radius: 4px; }
    </style>
</head>
<body>
    <h1>InsightFlow</h1>
    <p>Phase 1 MVP - 音频转录与实体提取</p>
    <div class="upload">拖拽音频文件上传</div>
</body>
</html>
|
||||
Reference in New Issue
Block a user