feat: Phase 3 knowledge growth - multi-file fusion + entity alignment

This commit is contained in:
OpenClaw Bot
2026-02-17 13:47:06 +08:00
parent b28af9a611
commit 460bc5b052
3 changed files with 447 additions and 85 deletions

View File

@@ -1,15 +1,14 @@
#!/usr/bin/env python3
"""
InsightFlow Backend - Phase 1 MVP (Complete)
ASR: 阿里云听悟 (TingWu) + OSS
Speaker Diarization: 听悟内置
LLM: Kimi API for entity extraction
InsightFlow Backend - Phase 3 (Complete)
Knowledge Growth: Multi-file fusion + Entity Alignment
"""
import os
import json
import httpx
from fastapi import FastAPI, File, UploadFile, HTTPException
import uuid
from fastapi import FastAPI, File, UploadFile, HTTPException, Form
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel
@@ -29,7 +28,13 @@ try:
except ImportError:
TINGWU_AVAILABLE = False
app = FastAPI(title="InsightFlow", version="0.1.0")
try:
from db_manager import get_db_manager, Project, Entity, EntityMention
DB_AVAILABLE = True
except ImportError:
DB_AVAILABLE = False
app = FastAPI(title="InsightFlow", version="0.3.0")
app.add_middleware(
CORSMiddleware,
@@ -40,13 +45,12 @@ app.add_middleware(
)
# Models
class Entity(BaseModel):
class EntityModel(BaseModel):
id: str
name: str
type: str
start: int
end: int
definition: Optional[str] = None
definition: Optional[str] = ""
aliases: List[str] = []
class TranscriptSegment(BaseModel):
start: float
@@ -56,12 +60,15 @@ class TranscriptSegment(BaseModel):
class AnalysisResult(BaseModel):
transcript_id: str
project_id: str
segments: List[TranscriptSegment]
entities: List[Entity]
entities: List[EntityModel]
full_text: str
created_at: str
storage = {}
class ProjectCreate(BaseModel):
name: str
description: str = ""
# API Keys
KIMI_API_KEY = os.getenv("KIMI_API_KEY", "")
@@ -69,73 +76,50 @@ KIMI_BASE_URL = "https://api.kimi.com/coding"
def transcribe_audio(audio_data: bytes, filename: str) -> dict:
"""转录音频OSS上传 + 听悟转录"""
# 1. 上传 OSS
if not OSS_AVAILABLE:
print("OSS not available, using mock")
if not OSS_AVAILABLE or not TINGWU_AVAILABLE:
return mock_transcribe()
try:
uploader = get_oss_uploader()
audio_url, object_name = uploader.upload_audio(audio_data, filename)
print(f"Uploaded to OSS: {object_name}")
except Exception as e:
print(f"OSS upload failed: {e}")
return mock_transcribe()
# 2. 听悟转录
if not TINGWU_AVAILABLE:
print("Tingwu not available, using mock")
return mock_transcribe()
try:
client = TingwuClient()
result = client.transcribe(audio_url)
print(f"Transcription complete: {len(result['segments'])} segments")
return result
except Exception as e:
print(f"Tingwu failed: {e}")
print(f"Transcription failed: {e}")
return mock_transcribe()
def mock_transcribe() -> dict:
"""Mock 转录结果用于测试"""
"""Mock 转录结果"""
return {
"full_text": "这是一个示例转录文本,包含 Project Alpha 和 K8s 等术语",
"full_text": "我们今天讨论 Project Alpha 的进度K8s 集群已经部署完成",
"segments": [
{"start": 0.0, "end": 5.0, "text": "这是一个示例转录文本,包含 Project Alpha 和 K8s 等术语", "speaker": "Speaker A"}
{"start": 0.0, "end": 5.0, "text": "我们今天讨论 Project Alpha 的进度K8s 集群已经部署完成", "speaker": "Speaker A"}
]
}
def extract_entities_with_llm(text: str) -> List[Entity]:
def extract_entities_with_llm(text: str) -> List[dict]:
"""使用 Kimi API 提取实体"""
if not KIMI_API_KEY or not text:
return []
prompt = f"""从以下会议文本中提取关键实体(专有名词、项目名、技术术语、人名等),并以 JSON 格式返回:
prompt = f"""从以下会议文本中提取关键实体以 JSON 格式返回:
文本:{text[:3000]}
要求:
1. 每个实体包含name(名称), type(类型: PROJECT/TECH/PERSON/ORG/OTHER), start(起始字符位置), end(结束字符位置), definition(一句话定义)
2. 只返回 JSON 数组,不要其他内容
3. 确保 start/end 是字符在文本中的位置
1. 每个实体包含name(名称), type(类型: PROJECT/TECH/PERSON/ORG/OTHER), definition(一句话定义)
2. 只返回 JSON 数组
示例输出
[
{{"name": "Project Alpha", "type": "PROJECT", "start": 23, "end": 35, "definition": "Q3季度的核心项目"}},
{{"name": "K8s", "type": "TECH", "start": 37, "end": 40, "definition": "Kubernetes的缩写"}}
]
示例:[{{"name": "Project Alpha", "type": "PROJECT", "definition": "核心项目"}}]
"""
try:
response = httpx.post(
f"{KIMI_BASE_URL}/v1/chat/completions",
headers={"Authorization": f"Bearer {KIMI_API_KEY}", "Content-Type": "application/json"},
json={
"model": "k2p5",
"messages": [{"role": "user", "content": prompt}],
"temperature": 0.1
},
json={"model": "k2p5", "messages": [{"role": "user", "content": prompt}], "temperature": 0.1},
timeout=60.0
)
response.raise_for_status()
@@ -145,62 +129,136 @@ def extract_entities_with_llm(text: str) -> List[Entity]:
import re
json_match = re.search(r'\[.*?\]', content, re.DOTALL)
if json_match:
entities_data = json.loads(json_match.group())
entities = []
for i, e in enumerate(entities_data):
entities.append(Entity(
id=f"ent_{i+1}",
name=e["name"],
type=e.get("type", "OTHER"),
start=e["start"],
end=e["end"],
definition=e.get("definition", "")
))
return entities
return json.loads(json_match.group())
except Exception as e:
print(f"LLM extraction failed: {e}")
return []
@app.post("/api/v1/upload", response_model=AnalysisResult)
async def upload_audio(file: UploadFile = File(...)):
"""上传音频并分析"""
def align_entity(project_id: str, name: str, db) -> Optional[Entity]:
"""实体对齐:查找或创建实体"""
# 1. 尝试精确匹配
existing = db.get_entity_by_name(project_id, name)
if existing:
return existing
# 2. 尝试相似匹配(简单版)
similar = db.find_similar_entities(project_id, name)
if similar:
# 返回最相似的(第一个)
return similar[0]
return None
# API Endpoints
@app.post("/api/v1/projects", response_model=dict)
async def create_project(project: ProjectCreate):
"""创建新项目"""
if not DB_AVAILABLE:
raise HTTPException(status_code=500, detail="Database not available")
db = get_db_manager()
project_id = str(uuid.uuid4())[:8]
p = db.create_project(project_id, project.name, project.description)
return {"id": p.id, "name": p.name, "description": p.description}
@app.get("/api/v1/projects")
async def list_projects():
"""列出所有项目"""
if not DB_AVAILABLE:
return []
db = get_db_manager()
projects = db.list_projects()
return [{"id": p.id, "name": p.name, "description": p.description} for p in projects]
@app.post("/api/v1/projects/{project_id}/upload", response_model=AnalysisResult)
async def upload_audio(project_id: str, file: UploadFile = File(...)):
"""上传音频到指定项目"""
if not DB_AVAILABLE:
raise HTTPException(status_code=500, detail="Database not available")
db = get_db_manager()
project = db.get_project(project_id)
if not project:
raise HTTPException(status_code=404, detail="Project not found")
content = await file.read()
# 转录
print(f"Processing: {file.filename} ({len(content)} bytes)")
print(f"Processing: {file.filename}")
tw_result = transcribe_audio(content, file.filename)
# 构建片段
segments = [
TranscriptSegment(**seg) for seg in tw_result["segments"]
] or [TranscriptSegment(start=0, end=0, text=tw_result["full_text"], speaker="Speaker A")]
# LLM 实体提取
# 提取实体
print("Extracting entities...")
entities = extract_entities_with_llm(tw_result["full_text"])
raw_entities = extract_entities_with_llm(tw_result["full_text"])
analysis = AnalysisResult(
transcript_id=os.urandom(8).hex(),
# 实体对齐
aligned_entities = []
for raw_ent in raw_entities:
existing = align_entity(project_id, raw_ent["name"], db)
if existing:
# 复用已有实体
ent_model = EntityModel(
id=existing.id,
name=existing.name,
type=existing.type,
definition=existing.definition,
aliases=existing.aliases
)
else:
# 创建新实体
new_ent = db.create_entity(Entity(
id=str(uuid.uuid4())[:8],
project_id=project_id,
name=raw_ent["name"],
type=raw_ent.get("type", "OTHER"),
definition=raw_ent.get("definition", "")
))
ent_model = EntityModel(
id=new_ent.id,
name=new_ent.name,
type=new_ent.type,
definition=new_ent.definition
)
aligned_entities.append(ent_model)
# 构建片段
segments = [TranscriptSegment(**seg) for seg in tw_result["segments"]]
transcript_id = str(uuid.uuid4())[:8]
return AnalysisResult(
transcript_id=transcript_id,
project_id=project_id,
segments=segments,
entities=entities,
entities=aligned_entities,
full_text=tw_result["full_text"],
created_at=datetime.now().isoformat()
)
@app.get("/api/v1/projects/{project_id}/entities")
async def get_project_entities(project_id: str):
"""获取项目的全局实体列表"""
if not DB_AVAILABLE:
return []
storage[analysis.transcript_id] = analysis
print(f"Complete: {analysis.transcript_id}, {len(entities)} entities")
return analysis
db = get_db_manager()
entities = db.list_project_entities(project_id)
return [{"id": e.id, "name": e.name, "type": e.type, "definition": e.definition} for e in entities]
@app.get("/api/v1/transcripts/{transcript_id}", response_model=AnalysisResult)
async def get_transcript(transcript_id: str):
if transcript_id not in storage:
raise HTTPException(status_code=404, detail="Transcript not found")
return storage[transcript_id]
@app.get("/api/v1/transcripts")
async def list_transcripts():
return list(storage.values())
@app.post("/api/v1/entities/{entity_id}/merge")
async def merge_entities(entity_id: str, target_entity_id: str):
"""合并两个实体"""
if not DB_AVAILABLE:
raise HTTPException(status_code=500, detail="Database not available")
db = get_db_manager()
result = db.merge_entities(target_entity_id, entity_id)
return {"success": True, "merged_entity": {"id": result.id, "name": result.name}}
# Serve frontend
app.mount("/", StaticFiles(directory="frontend", html=True), name="frontend")