Phase 3: Memory & Growth - Multi-file fusion, Entity alignment with embedding, Document import, Knowledge base panel

This commit is contained in:
OpenClaw Bot
2026-02-18 12:12:39 +08:00
parent 643fe46780
commit da8a4db985
11 changed files with 1842 additions and 167 deletions

View File

@@ -1,7 +1,7 @@
#!/usr/bin/env python3
"""
InsightFlow Backend - Phase 3 (Production Ready)
Knowledge Growth: Multi-file fusion + Entity Alignment
InsightFlow Backend - Phase 3 (Memory & Growth)
Knowledge Growth: Multi-file fusion + Entity Alignment + Document Import
ASR: 阿里云听悟 + OSS
"""
@@ -9,6 +9,7 @@ import os
import json
import httpx
import uuid
import re
from fastapi import FastAPI, File, UploadFile, HTTPException, Form
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
@@ -35,6 +36,18 @@ try:
except ImportError:
DB_AVAILABLE = False
try:
from document_processor import DocumentProcessor
DOC_PROCESSOR_AVAILABLE = True
except ImportError:
DOC_PROCESSOR_AVAILABLE = False
try:
from entity_aligner import EntityAligner
ALIGNER_AVAILABLE = True
except ImportError:
ALIGNER_AVAILABLE = False
app = FastAPI(title="InsightFlow", version="0.3.0")
app.add_middleware(
@@ -90,9 +103,29 @@ class EntityMergeRequest(BaseModel):
source_entity_id: str
target_entity_id: str
class GlossaryTermCreate(BaseModel):
term: str
pronunciation: Optional[str] = ""
# API Keys
KIMI_API_KEY = os.getenv("KIMI_API_KEY", "")
KIMI_BASE_URL = "https://api.kimi.com/coding"
KIMI_BASE_URL = os.getenv("KIMI_BASE_URL", "https://api.kimi.com/coding")
# Phase 3: Entity Aligner singleton
_aligner = None
def get_aligner():
global _aligner
if _aligner is None and ALIGNER_AVAILABLE:
_aligner = EntityAligner()
return _aligner
# Phase 3: Document Processor singleton
_doc_processor = None
def get_doc_processor():
global _doc_processor
if _doc_processor is None and DOC_PROCESSOR_AVAILABLE:
_doc_processor = DocumentProcessor()
return _doc_processor
# Phase 2: Entity Edit API
@app.put("/api/v1/entities/{entity_id}")
@@ -406,12 +439,21 @@ def extract_entities_with_llm(text: str) -> tuple[List[dict], List[dict]]:
return [], []
def align_entity(project_id: str, name: str, db) -> Optional[Entity]:
"""实体对齐"""
def align_entity(project_id: str, name: str, db, definition: str = "") -> Optional[Entity]:
"""实体对齐 - Phase 3: 使用 embedding 对齐"""
# 1. 首先尝试精确匹配
existing = db.get_entity_by_name(project_id, name)
if existing:
return existing
# 2. 使用 embedding 对齐(如果可用)
aligner = get_aligner()
if aligner:
similar = aligner.find_similar_entity(project_id, name, definition)
if similar:
return similar
# 3. 回退到简单相似度匹配
similar = db.find_similar_entities(project_id, name)
if similar:
return similar[0]
@@ -443,7 +485,7 @@ async def list_projects():
@app.post("/api/v1/projects/{project_id}/upload", response_model=AnalysisResult)
async def upload_audio(project_id: str, file: UploadFile = File(...)):
"""上传音频到指定项目"""
"""上传音频到指定项目 - Phase 3: 支持多文件融合"""
if not DB_AVAILABLE:
raise HTTPException(status_code=500, detail="Database not available")
@@ -471,12 +513,12 @@ async def upload_audio(project_id: str, file: UploadFile = File(...)):
full_text=tw_result["full_text"]
)
# 实体对齐并保存
# 实体对齐并保存 - Phase 3: 使用增强对齐
aligned_entities = []
entity_name_to_id = {} # 用于关系映射
for raw_ent in raw_entities:
existing = align_entity(project_id, raw_ent["name"], db)
existing = align_entity(project_id, raw_ent["name"], db, raw_ent.get("definition", ""))
if existing:
ent_model = EntityModel(
@@ -551,6 +593,302 @@ async def upload_audio(project_id: str, file: UploadFile = File(...)):
created_at=datetime.now().isoformat()
)
# Phase 3: Document Upload API
@app.post("/api/v1/projects/{project_id}/upload-document")
async def upload_document(project_id: str, file: UploadFile = File(...)):
"""上传 PDF/DOCX 文档到指定项目"""
if not DB_AVAILABLE:
raise HTTPException(status_code=500, detail="Database not available")
if not DOC_PROCESSOR_AVAILABLE:
raise HTTPException(status_code=500, detail="Document processor not available")
db = get_db_manager()
project = db.get_project(project_id)
if not project:
raise HTTPException(status_code=404, detail="Project not found")
content = await file.read()
# 处理文档
processor = get_doc_processor()
try:
result = processor.process(content, file.filename)
except Exception as e:
raise HTTPException(status_code=400, detail=f"Document processing failed: {str(e)}")
# 保存文档转录记录
transcript_id = str(uuid.uuid4())[:8]
db.save_transcript(
transcript_id=transcript_id,
project_id=project_id,
filename=file.filename,
full_text=result["text"],
transcript_type="document"
)
# 提取实体和关系
raw_entities, raw_relations = extract_entities_with_llm(result["text"])
# 实体对齐并保存
aligned_entities = []
entity_name_to_id = {}
for raw_ent in raw_entities:
existing = align_entity(project_id, raw_ent["name"], db, raw_ent.get("definition", ""))
if existing:
entity_name_to_id[raw_ent["name"]] = existing.id
aligned_entities.append(EntityModel(
id=existing.id,
name=existing.name,
type=existing.type,
definition=existing.definition,
aliases=existing.aliases
))
else:
new_ent = db.create_entity(Entity(
id=str(uuid.uuid4())[:8],
project_id=project_id,
name=raw_ent["name"],
type=raw_ent.get("type", "OTHER"),
definition=raw_ent.get("definition", "")
))
entity_name_to_id[raw_ent["name"]] = new_ent.id
aligned_entities.append(EntityModel(
id=new_ent.id,
name=new_ent.name,
type=new_ent.type,
definition=new_ent.definition
))
# 保存实体提及位置
full_text = result["text"]
name = raw_ent["name"]
start_pos = 0
while True:
pos = full_text.find(name, start_pos)
if pos == -1:
break
mention = EntityMention(
id=str(uuid.uuid4())[:8],
entity_id=entity_name_to_id[name],
transcript_id=transcript_id,
start_pos=pos,
end_pos=pos + len(name),
text_snippet=full_text[max(0, pos-20):min(len(full_text), pos+len(name)+20)],
confidence=1.0
)
db.add_mention(mention)
start_pos = pos + 1
# 保存关系
for rel in raw_relations:
source_id = entity_name_to_id.get(rel.get("source", ""))
target_id = entity_name_to_id.get(rel.get("target", ""))
if source_id and target_id:
db.create_relation(
project_id=project_id,
source_entity_id=source_id,
target_entity_id=target_id,
relation_type=rel.get("type", "related"),
evidence=result["text"][:200],
transcript_id=transcript_id
)
return {
"transcript_id": transcript_id,
"project_id": project_id,
"filename": file.filename,
"text_length": len(result["text"]),
"entities": [e.dict() for e in aligned_entities],
"created_at": datetime.now().isoformat()
}
# Phase 3: Knowledge Base API
@app.get("/api/v1/projects/{project_id}/knowledge-base")
async def get_knowledge_base(project_id: str):
"""获取项目知识库 - 包含所有实体、关系、术语表"""
if not DB_AVAILABLE:
raise HTTPException(status_code=500, detail="Database not available")
db = get_db_manager()
project = db.get_project(project_id)
if not project:
raise HTTPException(status_code=404, detail="Project not found")
# 获取所有实体
entities = db.list_project_entities(project_id)
# 获取所有关系
relations = db.list_project_relations(project_id)
# 获取所有转录
transcripts = db.list_project_transcripts(project_id)
# 获取术语表
glossary = db.list_glossary(project_id)
# 构建实体统计
entity_stats = {}
for ent in entities:
mentions = db.get_entity_mentions(ent.id)
entity_stats[ent.id] = {
"mention_count": len(mentions),
"transcript_ids": list(set([m.transcript_id for m in mentions]))
}
# 构建实体名称映射
entity_map = {e.id: e.name for e in entities}
return {
"project": {
"id": project.id,
"name": project.name,
"description": project.description
},
"stats": {
"entity_count": len(entities),
"relation_count": len(relations),
"transcript_count": len(transcripts),
"glossary_count": len(glossary)
},
"entities": [
{
"id": e.id,
"name": e.name,
"type": e.type,
"definition": e.definition,
"aliases": e.aliases,
"mention_count": entity_stats.get(e.id, {}).get("mention_count", 0),
"appears_in": entity_stats.get(e.id, {}).get("transcript_ids", [])
}
for e in entities
],
"relations": [
{
"id": r["id"],
"source_id": r["source_entity_id"],
"source_name": entity_map.get(r["source_entity_id"], "Unknown"),
"target_id": r["target_entity_id"],
"target_name": entity_map.get(r["target_entity_id"], "Unknown"),
"type": r["relation_type"],
"evidence": r["evidence"]
}
for r in relations
],
"glossary": [
{
"id": g["id"],
"term": g["term"],
"pronunciation": g["pronunciation"],
"frequency": g["frequency"]
}
for g in glossary
],
"transcripts": [
{
"id": t["id"],
"filename": t["filename"],
"type": t.get("type", "audio"),
"created_at": t["created_at"]
}
for t in transcripts
]
}
# Phase 3: Glossary API
@app.post("/api/v1/projects/{project_id}/glossary")
async def add_glossary_term(project_id: str, term: GlossaryTermCreate):
"""添加术语到项目术语表"""
if not DB_AVAILABLE:
raise HTTPException(status_code=500, detail="Database not available")
db = get_db_manager()
project = db.get_project(project_id)
if not project:
raise HTTPException(status_code=404, detail="Project not found")
term_id = db.add_glossary_term(
project_id=project_id,
term=term.term,
pronunciation=term.pronunciation
)
return {
"id": term_id,
"term": term.term,
"pronunciation": term.pronunciation,
"success": True
}
@app.get("/api/v1/projects/{project_id}/glossary")
async def get_glossary(project_id: str):
"""获取项目术语表"""
if not DB_AVAILABLE:
raise HTTPException(status_code=500, detail="Database not available")
db = get_db_manager()
glossary = db.list_glossary(project_id)
return glossary
@app.delete("/api/v1/glossary/{term_id}")
async def delete_glossary_term(term_id: str):
"""删除术语"""
if not DB_AVAILABLE:
raise HTTPException(status_code=500, detail="Database not available")
db = get_db_manager()
db.delete_glossary_term(term_id)
return {"success": True}
# Phase 3: Entity Alignment API
@app.post("/api/v1/projects/{project_id}/align-entities")
async def align_project_entities(project_id: str, threshold: float = 0.85):
"""运行实体对齐算法,合并相似实体"""
if not DB_AVAILABLE:
raise HTTPException(status_code=500, detail="Database not available")
aligner = get_aligner()
if not aligner:
raise HTTPException(status_code=500, detail="Entity aligner not available")
db = get_db_manager()
entities = db.list_project_entities(project_id)
merged_count = 0
merged_pairs = []
# 使用 embedding 对齐
for i, entity in enumerate(entities):
# 跳过已合并的实体
existing = db.get_entity(entity.id)
if not existing:
continue
similar = aligner.find_similar_entity(
project_id,
entity.name,
entity.definition,
exclude_id=entity.id,
threshold=threshold
)
if similar:
# 合并实体
db.merge_entities(similar.id, entity.id)
merged_count += 1
merged_pairs.append({
"source": entity.name,
"target": similar.name
})
return {
"success": True,
"merged_count": merged_count,
"merged_pairs": merged_pairs
}
@app.get("/api/v1/projects/{project_id}/entities")
async def get_project_entities(project_id: str):
"""获取项目的全局实体列表"""
@@ -559,7 +897,7 @@ async def get_project_entities(project_id: str):
db = get_db_manager()
entities = db.list_project_entities(project_id)
return [{"id": e.id, "name": e.name, "type": e.type, "definition": e.definition} for e in entities]
return [{"id": e.id, "name": e.name, "type": e.type, "definition": e.definition, "aliases": e.aliases} for e in entities]
@app.get("/api/v1/projects/{project_id}/relations")
@@ -597,6 +935,7 @@ async def get_project_transcripts(project_id: str):
return [{
"id": t["id"],
"filename": t["filename"],
"type": t.get("type", "audio"),
"created_at": t["created_at"],
"preview": t["full_text"][:100] + "..." if len(t["full_text"]) > 100 else t["full_text"]
} for t in transcripts]
@@ -619,42 +958,18 @@ async def get_entity_mentions(entity_id: str):
"confidence": m.confidence
} for m in mentions]
@app.post("/api/v1/entities/{entity_id}/merge")
async def merge_entities_endpoint(entity_id: str, merge_req: EntityMergeRequest):
"""合并两个实体"""
if not DB_AVAILABLE:
raise HTTPException(status_code=500, detail="Database not available")
db = get_db_manager()
# 验证两个实体都存在
source = db.get_entity(merge_req.source_entity_id)
target = db.get_entity(merge_req.target_entity_id)
if not source or not target:
raise HTTPException(status_code=404, detail="Entity not found")
result = db.merge_entities(merge_req.target_entity_id, merge_req.source_entity_id)
return {
"success": True,
"merged_entity": {
"id": result.id,
"name": result.name,
"type": result.type,
"definition": result.definition,
"aliases": result.aliases
}
}
# Health check
@app.get("/health")
async def health_check():
return {
"status": "ok",
"version": "0.3.0",
"phase": "Phase 3 - Memory & Growth",
"oss_available": OSS_AVAILABLE,
"tingwu_available": TINGWU_AVAILABLE,
"db_available": DB_AVAILABLE
"db_available": DB_AVAILABLE,
"doc_processor_available": DOC_PROCESSOR_AVAILABLE,
"aligner_available": ALIGNER_AVAILABLE
}
# Serve frontend