feat: replace Whisper with Deepgram ASR + speaker diarization
This commit is contained in:
20
Dockerfile
Normal file
20
Dockerfile
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
# InsightFlow image: Python backend plus static frontend, run by uvicorn via backend/main.py.
FROM python:3.11-slim

WORKDIR /app

# Install system deps
# ffmpeg is needed for audio handling; apt lists are removed to keep the image small.
RUN apt-get update && apt-get install -y \
    ffmpeg \
    && rm -rf /var/lib/apt/lists/*

# Install Python deps
COPY backend/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy code
COPY backend/ ./backend/
COPY frontend/ ./frontend/

# Port the app listens on (matches uvicorn.run(..., port=8000) in backend/main.py).
EXPOSE 8000

CMD ["python", "backend/main.py"]
|
||||||
28
README.md
28
README.md
@@ -1,3 +1,27 @@
|
|||||||
# insightflow
|
# InsightFlow
|
||||||
|
|
||||||
音频与文档的领域知识构建平台 - 将会议录音转化为结构化知识图谱
|
音频与文档的领域知识构建平台
|
||||||
|
|
||||||
|
## 产品定位
|
||||||
|
将会议录音和文档转化为结构化的知识图谱,通过人机回圈(Human-in-the-Loop)实现知识持续生长。
|
||||||
|
|
||||||
|
## 核心特性
|
||||||
|
- 🎙️ ASR 语音识别 + 热词注入
|
||||||
|
- 🧠 LLM 实体抽取与解释
|
||||||
|
- 🔗 双视图联动(文档视图 + 图谱视图)
|
||||||
|
- 📈 知识生长(多文件实体对齐)
|
||||||
|
|
||||||
|
## 技术栈
|
||||||
|
- 前端: Next.js + Tailwind
|
||||||
|
- 后端: Node.js / Python
|
||||||
|
- 数据库: MySQL + Neo4j
|
||||||
|
- ASR: Deepgram (Nova-3)
|
||||||
|
- LLM: OpenAI / Kimi
|
||||||
|
|
||||||
|
## 开发阶段
|
||||||
|
- [ ] Phase 1: 骨架与单体分析 (MVP)
|
||||||
|
- [ ] Phase 2: 交互与纠错工作台
|
||||||
|
- [ ] Phase 3: 记忆与生长
|
||||||
|
|
||||||
|
## 文档
|
||||||
|
- [PRD v2.0](docs/PRD-v2.0.md)
|
||||||
|
|||||||
205
backend/main.py
Normal file
205
backend/main.py
Normal file
@@ -0,0 +1,205 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
InsightFlow Backend - Phase 1 MVP with Deepgram
|
||||||
|
ASR: Deepgram (Nova-3)
|
||||||
|
Speaker Diarization: Deepgram
|
||||||
|
LLM: Kimi API for entity extraction
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import httpx
|
||||||
|
from fastapi import FastAPI, File, UploadFile, HTTPException, Form
|
||||||
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
|
from fastapi.staticfiles import StaticFiles
|
||||||
|
from pydantic import BaseModel
|
||||||
|
from typing import List, Optional
|
||||||
|
from datetime import datetime
|
||||||
|
from deepgram import DeepgramClient, PrerecordedOptions, FileSource
|
||||||
|
|
||||||
|
# FastAPI application instance; API routes live under /api/v1 and the static
# frontend is mounted at "/" later in this file.
app = FastAPI(title="InsightFlow", version="0.1.0")

# Wide-open CORS for local development.
# NOTE(review): browsers reject the combination of allow_origins=["*"] with
# allow_credentials=True; tighten origins (or drop credentials) before production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
||||||
|
|
||||||
|
# Models
|
||||||
|
# Models
class Entity(BaseModel):
    """An entity extracted from the transcript by the LLM."""
    # Identifier assigned at extraction time (e.g. "ent_1").
    id: str
    # Surface form of the entity as it appears in the text.
    name: str
    # Category requested from the LLM: PROJECT/TECH/PERSON/ORG/OTHER.
    type: str
    # Character offsets into the full transcript text.
    # NOTE(review): offsets come straight from the LLM and may be inaccurate —
    # verify before using them for highlighting.
    start: int
    end: int
    # One-sentence definition supplied by the LLM; may be None.
    definition: Optional[str] = None
|
||||||
|
|
||||||
|
class TranscriptSegment(BaseModel):
    """One diarized utterance from the ASR result."""
    # Start/end times of the utterance, in seconds.
    start: float
    end: float
    # Transcribed text for this utterance.
    text: str
    # Diarized speaker label; default is used when no diarization info exists.
    speaker: Optional[str] = "Speaker A"
|
||||||
|
|
||||||
|
class AnalysisResult(BaseModel):
    """Complete analysis of one uploaded audio file."""
    # Random hex id; also the key under which the result is cached in `storage`.
    transcript_id: str
    segments: List[TranscriptSegment]
    entities: List[Entity]
    # Concatenated transcript of the whole audio.
    full_text: str
    # ISO-8601 timestamp of when the analysis finished (naive local time).
    created_at: str
|
||||||
|
|
||||||
|
# In-memory store: transcript_id -> AnalysisResult. Lost on restart — fine for
# the Phase 1 MVP; replace with a real database later.
storage = {}

# API Keys
# Empty string means "not configured": ASR then raises HTTP 500, and LLM
# extraction silently returns an empty entity list.
DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY", "")
KIMI_API_KEY = os.getenv("KIMI_API_KEY", "")
# NOTE(review): this base URL looks unusual for Kimi's OpenAI-compatible
# chat-completions API — confirm the correct endpoint before deploying.
KIMI_BASE_URL = "https://api.kimi.com/coding"
|
||||||
|
|
||||||
|
def transcribe_with_deepgram(audio_data: bytes, filename: str) -> dict:
    """Transcribe audio with Deepgram (Nova-3) and split it by speaker.

    Args:
        audio_data: Raw bytes of the uploaded audio file.
        filename: Original file name; only its extension is used to guess the mimetype.

    Returns:
        dict with "full_text" (str) and "segments" (list of dicts carrying
        start/end/text/speaker for each diarized utterance).

    Raises:
        HTTPException: 500 when DEEPGRAM_API_KEY is not configured.
    """
    if not DEEPGRAM_API_KEY:
        raise HTTPException(status_code=500, detail="DEEPGRAM_API_KEY not configured")

    client = DeepgramClient(DEEPGRAM_API_KEY)

    # Crude mimetype guess: anything that is not .wav is treated as mp3.
    mimetype = "audio/wav" if filename.endswith(".wav") else "audio/mp3"
    source: FileSource = {"buffer": audio_data, "mimetype": mimetype}

    options = PrerecordedOptions(
        model="nova-3",
        language="zh",
        smart_format=True,
        diarize=True,       # speaker diarization
        utterances=True,    # required for the per-speaker segments built below
        punctuate=True,
        paragraphs=True
    )

    response = client.listen.prerecorded.v("1").transcribe_file(source, options)
    results = response.results

    # Whole-audio transcript from the first channel; empty when Deepgram
    # returned no channels.
    full_text = results.channels[0].alternatives[0].transcript if results.channels else ""

    # One segment per diarized utterance.
    segments = []
    for utt in (results.utterances or []):
        segments.append({
            "start": utt.start,
            "end": utt.end,
            "text": utt.transcript,
            "speaker": f"Speaker {utt.speaker}"
        })

    return {
        "full_text": full_text,
        "segments": segments
    }
|
||||||
|
|
||||||
|
def extract_entities_with_llm(text: str) -> List[Entity]:
    """Extract key entities (proper nouns, project names, tech terms, people)
    from meeting text via the Kimi chat-completions API.

    Returns an empty list when the API key is missing, the input is empty, or
    the request/parsing fails — entity extraction is best-effort and must not
    fail the whole upload request.
    """
    if not KIMI_API_KEY or not text:
        return []

    # Truncate to 3000 chars to stay under request limits.
    # BUG FIX: the previous version embedded the Python-style comment
    # "# 限制长度避免超限" INSIDE the f-string, so it was sent to the LLM as
    # literal prompt text.
    prompt = f"""请从以下会议文本中提取关键实体(专有名词、项目名、技术术语、人名等),并以 JSON 格式返回:

文本:{text[:3000]}

要求:
1. 每个实体包含:name(名称), type(类型: PROJECT/TECH/PERSON/ORG/OTHER), start(起始字符位置), end(结束字符位置), definition(一句话定义)
2. 只返回 JSON 数组,不要其他内容
3. 确保 start/end 是字符在文本中的位置

示例输出:
[
{{"name": "Project Alpha", "type": "PROJECT", "start": 23, "end": 35, "definition": "Q3季度的核心项目"}},
{{"name": "K8s", "type": "TECH", "start": 37, "end": 40, "definition": "Kubernetes的缩写"}}
]
"""

    try:
        response = httpx.post(
            f"{KIMI_BASE_URL}/v1/chat/completions",
            headers={"Authorization": f"Bearer {KIMI_API_KEY}", "Content-Type": "application/json"},
            json={
                # NOTE(review): "k2p5" — confirm this is a valid model id for
                # the configured endpoint.
                "model": "k2p5",
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0.1
            },
            timeout=60.0
        )
        response.raise_for_status()
        result = response.json()
        content = result["choices"][0]["message"]["content"]

        # Pull the JSON array out of the reply (models often wrap it in prose
        # or code fences). BUG FIX: use a greedy match — the previous
        # non-greedy r'\[.*?\]' stopped at the FIRST ']', truncating the array
        # whenever a definition string contained a bracket.
        import re
        json_match = re.search(r'\[.*\]', content, re.DOTALL)
        if json_match:
            entities_data = json.loads(json_match.group())
            entities = []
            for i, e in enumerate(entities_data):
                entities.append(Entity(
                    id=f"ent_{i+1}",
                    name=e["name"],
                    type=e.get("type", "OTHER"),
                    start=e["start"],
                    end=e["end"],
                    definition=e.get("definition", "")
                ))
            return entities
    except Exception as e:
        # Best effort: log and fall through to an empty result rather than
        # failing the caller's request.
        print(f"LLM extraction failed: {e}")

    return []
|
||||||
|
|
||||||
|
@app.post("/api/v1/upload", response_model=AnalysisResult)
async def upload_audio(file: UploadFile = File(...)):
    """Accept an audio upload, transcribe and diarize it, extract entities,
    then cache and return the complete analysis."""
    audio_bytes = await file.read()

    # Step 1: ASR + speaker diarization via Deepgram.
    print(f"Transcribing with Deepgram: {file.filename}")
    transcription = transcribe_with_deepgram(audio_bytes, file.filename)

    # Step 2: build transcript segments; fall back to a single whole-text
    # segment when Deepgram produced no utterances.
    segments = [TranscriptSegment(**seg) for seg in transcription["segments"]]
    if not segments:
        segments = [TranscriptSegment(start=0, end=0, text=transcription["full_text"], speaker="Speaker A")]

    # Step 3: best-effort entity extraction (may be empty).
    print("Extracting entities with LLM...")
    entities = extract_entities_with_llm(transcription["full_text"])

    analysis = AnalysisResult(
        transcript_id=os.urandom(8).hex(),
        segments=segments,
        entities=entities,
        full_text=transcription["full_text"],
        created_at=datetime.now().isoformat()
    )

    # Cache in memory so GET /api/v1/transcripts/{id} can serve it later.
    storage[analysis.transcript_id] = analysis
    print(f"Analysis complete: {analysis.transcript_id}, {len(entities)} entities found")
    return analysis
|
||||||
|
|
||||||
|
@app.get("/api/v1/transcripts/{transcript_id}", response_model=AnalysisResult)
async def get_transcript(transcript_id: str):
    """Return a previously cached analysis, or 404 when the id is unknown."""
    try:
        return storage[transcript_id]
    except KeyError:
        raise HTTPException(status_code=404, detail="Transcript not found")
|
||||||
|
|
||||||
|
@app.get("/api/v1/transcripts")
async def list_transcripts():
    """Return every cached analysis (dict insertion order)."""
    return [analysis for analysis in storage.values()]
|
||||||
|
|
||||||
|
# Serve frontend
|
||||||
|
# Serve frontend
# Mounted last so the /api/v1 routes registered above take precedence.
app.mount("/", StaticFiles(directory="frontend", html=True), name="frontend")

if __name__ == "__main__":
    import uvicorn
    # Listen on all interfaces; port 8000 matches the Dockerfile's EXPOSE.
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
||||||
7
backend/requirements.txt
Normal file
7
backend/requirements.txt
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
# InsightFlow backend dependencies (pinned versions).
fastapi==0.115.0
uvicorn[standard]==0.32.0
python-multipart==0.0.17
deepgram-sdk==3.7.0
httpx==0.27.2
pydantic==2.9.2
python-dotenv==1.0.1
|
||||||
13
docker-compose.yml
Normal file
13
docker-compose.yml
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
# Single-service compose file for InsightFlow.
# NOTE(review): the top-level `version` key is obsolete in Compose v2 and ignored.
version: '3.8'

services:
  insightflow:
    build: .
    # Host port 18000 -> container port 8000 (the port uvicorn listens on).
    ports:
      - "18000:8000"
    # API keys are passed through from the host environment / .env file.
    environment:
      - DEEPGRAM_API_KEY=${DEEPGRAM_API_KEY}
      - KIMI_API_KEY=${KIMI_API_KEY}
    # NOTE(review): backend/main.py keeps results in memory only — ./data
    # appears unused by the current code; confirm before relying on it.
    volumes:
      - ./data:/app/data
    restart: unless-stopped
||||||
0
frontend/EOF
Normal file
0
frontend/EOF
Normal file
10
frontend/app.js
Normal file
10
frontend/app.js
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
const API_BASE = '/api/v1';

/**
 * POST an audio file to the backend analysis endpoint.
 * @param {File} file - audio file chosen/dropped by the user
 * @returns {Promise<Object>} parsed AnalysisResult JSON from the server
 */
async function upload(file) {
    const body = new FormData();
    body.append('file', file);
    const response = await fetch(`${API_BASE}/upload`, {
        method: 'POST',
        body
    });
    return response.json();
}
|
||||||
18
frontend/index.html
Normal file
18
frontend/index.html
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
<!DOCTYPE html>
<!-- Phase 1 MVP placeholder page; upload logic lives in app.js. -->
<html>
<head>
<meta charset="UTF-8">
<title>InsightFlow MVP</title>
<style>
body { font-family: sans-serif; background: #0a0a0a; color: #e0e0e0; padding: 40px; }
h1 { color: #00d4ff; }
/* Upload drop zone. NOTE(review): no drag/drop handler is visible in app.js —
   confirm the wiring is implemented elsewhere. */
.upload { border: 2px dashed #333; padding: 40px; text-align: center; border-radius: 8px; }
/* Inline highlight style for extracted entities. */
.entity { background: rgba(123,44,191,0.3); padding: 2px 6px; border-radius: 4px; }
</style>
</head>
<body>
<h1>InsightFlow</h1>
<p>Phase 1 MVP - 音频转录与实体提取</p>
<div class="upload">拖拽音频文件上传</div>
</body>
</html>
|
||||||
Reference in New Issue
Block a user