Phase 3: Memory & Growth - Multi-file fusion, Entity alignment with embedding, Document import, Knowledge base panel

2026-02-18 12:12:39 +08:00
parent 643fe46780
commit da8a4db985
11 changed files with 1842 additions and 167 deletions
--- a/backend/main.py
+++ b/backend/main.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 """
-InsightFlow Backend - Phase 3 (Production Ready)
-Knowledge Growth: Multi-file fusion + Entity Alignment
+InsightFlow Backend - Phase 3 (Memory & Growth)
+Knowledge Growth: Multi-file fusion + Entity Alignment + Document Import
 ASR: 阿里云听悟 + OSS
 """

@@ -9,6 +9,7 @@ import os
 import json
 import httpx
 import uuid
+import re
 from fastapi import FastAPI, File, UploadFile, HTTPException, Form
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.staticfiles import StaticFiles
@@ -35,6 +36,18 @@ try:
 except ImportError:
    DB_AVAILABLE = False

+try:
+    from document_processor import DocumentProcessor
+    DOC_PROCESSOR_AVAILABLE = True
+except ImportError:
+    DOC_PROCESSOR_AVAILABLE = False
+
+try:
+    from entity_aligner import EntityAligner
+    ALIGNER_AVAILABLE = True
+except ImportError:
+    ALIGNER_AVAILABLE = False
+
 app = FastAPI(title="InsightFlow", version="0.3.0")

 app.add_middleware(
@@ -90,9 +103,29 @@ class EntityMergeRequest(BaseModel):
    source_entity_id: str
    target_entity_id: str

+class GlossaryTermCreate(BaseModel):  # request body accepted by add_glossary_term
+    term: str  # glossary term text (required)
+    pronunciation: Optional[str] = ""  # optional pronunciation; defaults to an empty string
+
 # API Keys
 KIMI_API_KEY = os.getenv("KIMI_API_KEY", "")
-KIMI_BASE_URL = "https://api.kimi.com/coding"
+KIMI_BASE_URL = os.getenv("KIMI_BASE_URL", "https://api.kimi.com/coding")
+
+# Phase 3: module-wide EntityAligner, created lazily
+_aligner = None
+def get_aligner():
+    """Return the shared EntityAligner, built on first use; stays None when ALIGNER_AVAILABLE is false."""
+    global _aligner
+    _aligner = EntityAligner() if (ALIGNER_AVAILABLE and _aligner is None) else _aligner
+    return _aligner
+
+# Phase 3: module-wide DocumentProcessor, created lazily
+_doc_processor = None
+def get_doc_processor():
+    """Return the shared DocumentProcessor, built on first use; stays None when DOC_PROCESSOR_AVAILABLE is false."""
+    global _doc_processor
+    _doc_processor = DocumentProcessor() if (DOC_PROCESSOR_AVAILABLE and _doc_processor is None) else _doc_processor
+    return _doc_processor

 # Phase 2: Entity Edit API
@app.put("/api/v1/entities/{entity_id}")
@@ -406,12 +439,21 @@ def extract_entities_with_llm(text: str) -> tuple[List[dict], List[dict]]:
    
    return [], []

-def align_entity(project_id: str, name: str, db) -> Optional[Entity]:
-    """实体对齐"""
+def align_entity(project_id: str, name: str, db, definition: str = "") -> Optional[Entity]:
+    """实体对齐 - Phase 3: 使用 embedding 对齐"""
+    # 1. 首先尝试精确匹配
    existing = db.get_entity_by_name(project_id, name)
    if existing:
        return existing
    
+    # 2. 使用 embedding 对齐（如果可用）
+    aligner = get_aligner()
+    if aligner:
+        similar = aligner.find_similar_entity(project_id, name, definition)
+        if similar:
+            return similar
+    
+    # 3. 回退到简单相似度匹配
    similar = db.find_similar_entities(project_id, name)
    if similar:
        return similar[0]
@@ -443,7 +485,7 @@ async def list_projects():

@app.post("/api/v1/projects/{project_id}/upload", response_model=AnalysisResult)
 async def upload_audio(project_id: str, file: UploadFile = File(...)):
-    """上传音频到指定项目"""
+    """上传音频到指定项目 - Phase 3: 支持多文件融合"""
    if not DB_AVAILABLE:
        raise HTTPException(status_code=500, detail="Database not available")
    
@@ -471,12 +513,12 @@ async def upload_audio(project_id: str, file: UploadFile = File(...)):
        full_text=tw_result["full_text"]
    )
    
-    # 实体对齐并保存
+    # 实体对齐并保存 - Phase 3: 使用增强对齐
    aligned_entities = []
    entity_name_to_id = {}  # 用于关系映射
    
    for raw_ent in raw_entities:
-        existing = align_entity(project_id, raw_ent["name"], db)
+        existing = align_entity(project_id, raw_ent["name"], db, raw_ent.get("definition", ""))
        
        if existing:
            ent_model = EntityModel(
@@ -551,6 +593,302 @@ async def upload_audio(project_id: str, file: UploadFile = File(...)):
        created_at=datetime.now().isoformat()
    )

+# Phase 3: Document Upload API
+@app.post("/api/v1/projects/{project_id}/upload-document")
+async def upload_document(project_id: str, file: UploadFile = File(...)):
+    """Upload a PDF/DOCX document into a project and extract entities/relations from it.
+
+    Raises HTTPException 500 if the database or document processor is
+    unavailable, 404 for an unknown project, 400 if processing fails.
+    """
+    if not DB_AVAILABLE:
+        raise HTTPException(status_code=500, detail="Database not available")
+    if not DOC_PROCESSOR_AVAILABLE:
+        raise HTTPException(status_code=500, detail="Document processor not available")
+
+    db = get_db_manager()
+    if not db.get_project(project_id):
+        raise HTTPException(status_code=404, detail="Project not found")
+
+    content = await file.read()
+
+    # Parse the document
+    processor = get_doc_processor()
+    try:
+        result = processor.process(content, file.filename)
+    except Exception as e:
+        raise HTTPException(status_code=400, detail=f"Document processing failed: {str(e)}") from e
+    full_text = result["text"]
+
+    # Store the extracted text as a transcript record
+    transcript_id = str(uuid.uuid4())[:8]
+    db.save_transcript(
+        transcript_id=transcript_id,
+        project_id=project_id,
+        filename=file.filename,
+        full_text=full_text,
+        transcript_type="document"
+    )
+
+    # Extract entities and relations
+    raw_entities, raw_relations = extract_entities_with_llm(full_text)
+
+    # Align entities against the project and persist them
+    aligned_entities = []
+    entity_name_to_id = {}
+
+    for raw_ent in raw_entities:
+        name = raw_ent.get("name")
+        # Skip unnamed entities (an empty name matches at every offset and would
+        # flood the DB with empty mentions) and names already handled above.
+        if not name or name in entity_name_to_id:
+            continue
+
+        existing = align_entity(project_id, name, db, raw_ent.get("definition", ""))
+        if existing:
+            entity_name_to_id[name] = existing.id
+            aligned_entities.append(EntityModel(
+                id=existing.id,
+                name=existing.name,
+                type=existing.type,
+                definition=existing.definition,
+                aliases=existing.aliases
+            ))
+        else:
+            new_ent = db.create_entity(Entity(
+                id=str(uuid.uuid4())[:8],
+                project_id=project_id,
+                name=name,
+                type=raw_ent.get("type", "OTHER"),
+                definition=raw_ent.get("definition", "")
+            ))
+            entity_name_to_id[name] = new_ent.id
+            aligned_entities.append(EntityModel(
+                id=new_ent.id,
+                name=new_ent.name,
+                type=new_ent.type,
+                definition=new_ent.definition
+            ))
+
+        # Record each non-overlapping literal occurrence of the name as a mention
+        for m in re.finditer(re.escape(name), full_text):
+            db.add_mention(EntityMention(
+                id=str(uuid.uuid4())[:8],
+                entity_id=entity_name_to_id[name],
+                transcript_id=transcript_id,
+                start_pos=m.start(),
+                end_pos=m.end(),
+                text_snippet=full_text[max(0, m.start() - 20):m.end() + 20],
+                confidence=1.0
+            ))
+
+    # Persist relations whose two endpoints were both resolved above
+    for rel in raw_relations:
+        source_id = entity_name_to_id.get(rel.get("source", ""))
+        target_id = entity_name_to_id.get(rel.get("target", ""))
+        if source_id and target_id:
+            db.create_relation(
+                project_id=project_id,
+                source_entity_id=source_id,
+                target_entity_id=target_id,
+                relation_type=rel.get("type", "related"),
+                evidence=full_text[:200],
+                transcript_id=transcript_id
+            )
+
+    return {
+        "transcript_id": transcript_id,
+        "project_id": project_id,
+        "filename": file.filename,
+        "text_length": len(full_text),
+        "entities": [e.dict() for e in aligned_entities],
+        "created_at": datetime.now().isoformat()
+    }
+
+# Phase 3: Knowledge Base API
+@app.get("/api/v1/projects/{project_id}/knowledge-base")
+async def get_knowledge_base(project_id: str):
+    """Return a project's knowledge base: entities, relations, glossary and transcripts.
+
+    Raises HTTPException 500 if the database is unavailable, 404 if the
+    project does not exist.
+    """
+    if not DB_AVAILABLE:
+        raise HTTPException(status_code=500, detail="Database not available")
+
+    db = get_db_manager()
+    project = db.get_project(project_id)
+    if not project:
+        raise HTTPException(status_code=404, detail="Project not found")
+
+    # Load every collection that goes into the response
+    entities = db.list_project_entities(project_id)
+    relations = db.list_project_relations(project_id)
+    transcripts = db.list_project_transcripts(project_id)
+    glossary = db.list_glossary(project_id)
+
+    # Per-entity mention statistics (one mentions lookup per entity)
+    entity_stats = {}
+    for ent in entities:
+        mentions = db.get_entity_mentions(ent.id)
+        entity_stats[ent.id] = {
+            "mention_count": len(mentions),
+            # dict.fromkeys de-duplicates in first-seen order, so the list is
+            # stable across processes (iteration order of a set of str is not).
+            "transcript_ids": list(dict.fromkeys(m.transcript_id for m in mentions))
+        }
+
+    # Map entity id -> name, used to label relation endpoints
+    entity_map = {e.id: e.name for e in entities}
+
+    return {
+        "project": {
+            "id": project.id,
+            "name": project.name,
+            "description": project.description
+        },
+        "stats": {
+            "entity_count": len(entities),
+            "relation_count": len(relations),
+            "transcript_count": len(transcripts),
+            "glossary_count": len(glossary)
+        },
+        "entities": [
+            {
+                "id": e.id,
+                "name": e.name,
+                "type": e.type,
+                "definition": e.definition,
+                "aliases": e.aliases,
+                "mention_count": entity_stats[e.id]["mention_count"],
+                "appears_in": entity_stats[e.id]["transcript_ids"]
+            }
+            for e in entities
+        ],
+        "relations": [
+            {
+                "id": r["id"],
+                "source_id": r["source_entity_id"],
+                "source_name": entity_map.get(r["source_entity_id"], "Unknown"),
+                "target_id": r["target_entity_id"],
+                "target_name": entity_map.get(r["target_entity_id"], "Unknown"),
+                "type": r["relation_type"],
+                "evidence": r["evidence"]
+            }
+            for r in relations
+        ],
+        "glossary": [
+            {
+                "id": g["id"],
+                "term": g["term"],
+                "pronunciation": g["pronunciation"],
+                "frequency": g["frequency"]
+            }
+            for g in glossary
+        ],
+        "transcripts": [
+            {
+                "id": t["id"],
+                "filename": t["filename"],
+                "type": t.get("type", "audio"),
+                "created_at": t["created_at"]
+            }
+            for t in transcripts
+        ]
+    }
+
+# Phase 3: Glossary API
+@app.post("/api/v1/projects/{project_id}/glossary")
+async def add_glossary_term(project_id: str, term: GlossaryTermCreate):
+    """Add a term to a project's glossary and echo it back with its new id.
+
+    Raises HTTPException 500 if the database is unavailable and 404 if the
+    project does not exist.
+    """
+    if not DB_AVAILABLE:
+        raise HTTPException(status_code=500, detail="Database not available")
+
+    db = get_db_manager()
+    if not db.get_project(project_id):
+        raise HTTPException(status_code=404, detail="Project not found")
+
+    text, reading = term.term, term.pronunciation
+    new_id = db.add_glossary_term(project_id=project_id, term=text, pronunciation=reading)
+
+    return {
+        "id": new_id,
+        "term": text,
+        "pronunciation": reading,
+        "success": True
+    }
+
+@app.get("/api/v1/projects/{project_id}/glossary")
+async def get_glossary(project_id: str):
+    """Return the project's glossary exactly as the database layer lists it.
+
+    Raises HTTPException 500 if the database is unavailable.
+    """
+    if not DB_AVAILABLE:
+        raise HTTPException(status_code=500, detail="Database not available")
+    return get_db_manager().list_glossary(project_id)
+
+@app.delete("/api/v1/glossary/{term_id}")
+async def delete_glossary_term(term_id: str):
+    """Delete the glossary term with the given id; raises HTTP 500 if the database is unavailable."""
+    if not DB_AVAILABLE:
+        raise HTTPException(status_code=500, detail="Database not available")
+
+    # The delete result is not inspected: success is reported whenever the call returns
+    get_db_manager().delete_glossary_term(term_id)
+    return {"success": True}
+
+# Phase 3: Entity Alignment API
+@app.post("/api/v1/projects/{project_id}/align-entities")
+async def align_project_entities(project_id: str, threshold: float = 0.85):
+    """Run entity alignment over a project and merge entities found to be similar.
+
+    `threshold` is forwarded to the aligner's similarity search. Raises
+    HTTPException 500 if the database or aligner is unavailable, 404 if
+    the project does not exist.
+    """
+    if not DB_AVAILABLE:
+        raise HTTPException(status_code=500, detail="Database not available")
+
+    aligner = get_aligner()
+    if not aligner:
+        raise HTTPException(status_code=500, detail="Entity aligner not available")
+
+    db = get_db_manager()
+    if not db.get_project(project_id):
+        raise HTTPException(status_code=404, detail="Project not found")
+
+    merged_pairs = []
+    for entity in db.list_project_entities(project_id):
+        # Skip entities that no longer exist (already merged away in this run)
+        if not db.get_entity(entity.id):
+            continue
+
+        similar = aligner.find_similar_entity(
+            project_id,
+            entity.name,
+            entity.definition,
+            exclude_id=entity.id,
+            threshold=threshold
+        )
+        if similar:
+            # Merge this entity into its match; the match is reported as "target"
+            db.merge_entities(similar.id, entity.id)
+            merged_pairs.append({
+                "source": entity.name,
+                "target": similar.name
+            })
+
+    return {
+        "success": True,
+        "merged_count": len(merged_pairs),
+        "merged_pairs": merged_pairs
+    }
+
@app.get("/api/v1/projects/{project_id}/entities")
 async def get_project_entities(project_id: str):
    """获取项目的全局实体列表"""
@@ -559,7 +897,7 @@ async def get_project_entities(project_id: str):
    
    db = get_db_manager()
    entities = db.list_project_entities(project_id)
-    return [{"id": e.id, "name": e.name, "type": e.type, "definition": e.definition} for e in entities]
+    return [{"id": e.id, "name": e.name, "type": e.type, "definition": e.definition, "aliases": e.aliases} for e in entities]


@app.get("/api/v1/projects/{project_id}/relations")
@@ -597,6 +935,7 @@ async def get_project_transcripts(project_id: str):
    return [{
        "id": t["id"],
        "filename": t["filename"],
+        "type": t.get("type", "audio"),
        "created_at": t["created_at"],
        "preview": t["full_text"][:100] + "..." if len(t["full_text"]) > 100 else t["full_text"]
    } for t in transcripts]
@@ -619,42 +958,18 @@ async def get_entity_mentions(entity_id: str):
        "confidence": m.confidence
    } for m in mentions]

-@app.post("/api/v1/entities/{entity_id}/merge")
-async def merge_entities_endpoint(entity_id: str, merge_req: EntityMergeRequest):
-    """合并两个实体"""
-    if not DB_AVAILABLE:
-        raise HTTPException(status_code=500, detail="Database not available")
-    
-    db = get_db_manager()
-    
-    # 验证两个实体都存在
-    source = db.get_entity(merge_req.source_entity_id)
-    target = db.get_entity(merge_req.target_entity_id)
-    
-    if not source or not target:
-        raise HTTPException(status_code=404, detail="Entity not found")
-    
-    result = db.merge_entities(merge_req.target_entity_id, merge_req.source_entity_id)
-    return {
-        "success": True,
-        "merged_entity": {
-            "id": result.id,
-            "name": result.name,
-            "type": result.type,
-            "definition": result.definition,
-            "aliases": result.aliases
-        }
-    }
-
 # Health check
@app.get("/health")
 async def health_check():
    return {
        "status": "ok",
        "version": "0.3.0",
+        "phase": "Phase 3 - Memory & Growth",  # human-readable label for this release phase
        "oss_available": OSS_AVAILABLE,
        "tingwu_available": TINGWU_AVAILABLE,
-        "db_available": DB_AVAILABLE
+        "db_available": DB_AVAILABLE,
+        "doc_processor_available": DOC_PROCESSOR_AVAILABLE,  # True when document_processor imported successfully
+        "aligner_available": ALIGNER_AVAILABLE  # True when entity_aligner imported successfully
    }

 # Serve frontend