feat: Phase 1 MVP 完成

- 实现实体和关系同时提取（LLM）
- 添加 transcripts/mentions/relations 数据持久化
- 新增 API: 关系列表、转录列表、实体提及位置
- 前端实体高亮显示和图谱联动
- 添加 STATUS.md 跟踪开发进度
2026-02-18 00:03:08 +08:00
parent 77d14e673f
commit 2a3081c151
5 changed files with 451 additions and 73 deletions
--- a/backend/main.py
+++ b/backend/main.py
@@ -114,20 +114,34 @@ def mock_transcribe() -> dict:
        ]
    }

-def extract_entities_with_llm(text: str) -> List[dict]:
-    """使用 Kimi API 提取实体"""
-    if not KIMI_API_KEY or not text:
-        return []
+def extract_entities_with_llm(text: str) -> tuple[List[dict], List[dict]]:
+    """使用 Kimi API 提取实体和关系
    
-    prompt = f"""从以下会议文本中提取关键实体，以 JSON 格式返回：
+    Returns:
+        (entities, relations): 实体列表和关系列表
+    """
+    if not KIMI_API_KEY or not text:
+        return [], []
+    
+    prompt = f"""从以下会议文本中提取关键实体和它们之间的关系，以 JSON 格式返回：

 文本：{text[:3000]}

 要求：
-1. 每个实体包含：name(名称), type(类型: PROJECT/TECH/PERSON/ORG/OTHER), definition(一句话定义)
-2. 只返回 JSON 数组
+1. entities: 每个实体包含 name(名称), type(类型: PROJECT/TECH/PERSON/ORG/OTHER), definition(一句话定义)
+2. relations: 每个关系包含 source(源实体名), target(目标实体名), type(关系类型: belongs_to/works_with/depends_on/mentions/related)
+3. 只返回 JSON 对象，格式: {{"entities": [...], "relations": [...]}}

-示例：[{{"name": "Project Alpha", "type": "PROJECT", "definition": "核心项目"}}]
+示例：
+{{
+  "entities": [
+    {{"name": "Project Alpha", "type": "PROJECT", "definition": "核心项目"}},
+    {{"name": "K8s", "type": "TECH", "definition": "Kubernetes容器编排平台"}}
+  ],
+  "relations": [
+    {{"source": "Project Alpha", "target": "K8s", "type": "depends_on"}}
+  ]
+}}
 """
    
    try:
@@ -142,13 +156,14 @@ def extract_entities_with_llm(text: str) -> List[dict]:
        content = result["choices"][0]["message"]["content"]
        
        import re
-        json_match = re.search(r'\[.*?\]', content, re.DOTALL)
+        json_match = re.search(r'\{{.*?\}}', content, re.DOTALL)
        if json_match:
-            return json.loads(json_match.group())
+            data = json.loads(json_match.group())
+            return data.get("entities", []), data.get("relations", [])
    except Exception as e:
        print(f"LLM extraction failed: {e}")
    
-    return []
+    return [], []

 def align_entity(project_id: str, name: str, db) -> Optional[Entity]:
    """实体对齐"""
@@ -202,12 +217,23 @@ async def upload_audio(project_id: str, file: UploadFile = File(...)):
    print(f"Processing: {file.filename}")
    tw_result = transcribe_audio(content, file.filename)
    
-    # 提取实体
-    print("Extracting entities...")
-    raw_entities = extract_entities_with_llm(tw_result["full_text"])
+    # 提取实体和关系
+    print("Extracting entities and relations...")
+    raw_entities, raw_relations = extract_entities_with_llm(tw_result["full_text"])
    
-    # 实体对齐
+    # 保存转录记录
+    transcript_id = str(uuid.uuid4())[:8]
+    db.save_transcript(
+        transcript_id=transcript_id,
+        project_id=project_id,
+        filename=file.filename,
+        full_text=tw_result["full_text"]
+    )
+    
+    # 实体对齐并保存
    aligned_entities = []
+    entity_name_to_id = {}  # 用于关系映射
+    
    for raw_ent in raw_entities:
        existing = align_entity(project_id, raw_ent["name"], db)
        
@@ -219,6 +245,7 @@ async def upload_audio(project_id: str, file: UploadFile = File(...)):
                definition=existing.definition,
                aliases=existing.aliases
            )
+            entity_name_to_id[raw_ent["name"]] = existing.id
        else:
            new_ent = db.create_entity(Entity(
                id=str(uuid.uuid4())[:8],
@@ -233,14 +260,47 @@ async def upload_audio(project_id: str, file: UploadFile = File(...)):
                type=new_ent.type,
                definition=new_ent.definition
            )
+            entity_name_to_id[raw_ent["name"]] = new_ent.id
        
        aligned_entities.append(ent_model)
+        
+        # 保存实体提及位置
+        full_text = tw_result["full_text"]
+        name = raw_ent["name"]
+        start_pos = 0
+        while True:
+            pos = full_text.find(name, start_pos)
+            if pos == -1:
+                break
+            mention = EntityMention(
+                id=str(uuid.uuid4())[:8],
+                entity_id=entity_name_to_id[name],
+                transcript_id=transcript_id,
+                start_pos=pos,
+                end_pos=pos + len(name),
+                text_snippet=full_text[max(0, pos-20):min(len(full_text), pos+len(name)+20)],
+                confidence=1.0
+            )
+            db.add_mention(mention)
+            start_pos = pos + 1
+    
+    # 保存关系
+    for rel in raw_relations:
+        source_id = entity_name_to_id.get(rel.get("source", ""))
+        target_id = entity_name_to_id.get(rel.get("target", ""))
+        if source_id and target_id:
+            db.create_relation(
+                project_id=project_id,
+                source_entity_id=source_id,
+                target_entity_id=target_id,
+                relation_type=rel.get("type", "related"),
+                evidence=tw_result["full_text"][:200],
+                transcript_id=transcript_id
+            )
    
    # 构建片段
    segments = [TranscriptSegment(**seg) for seg in tw_result["segments"]]
    
-    transcript_id = str(uuid.uuid4())[:8]
-    
    return AnalysisResult(
        transcript_id=transcript_id,
        project_id=project_id,
@@ -260,6 +320,64 @@ async def get_project_entities(project_id: str):
    entities = db.list_project_entities(project_id)
    return [{"id": e.id, "name": e.name, "type": e.type, "definition": e.definition} for e in entities]

+
+@app.get("/api/v1/projects/{project_id}/relations")
+async def get_project_relations(project_id: str):
+    """获取项目的实体关系列表"""
+    if not DB_AVAILABLE:
+        return []
+    
+    db = get_db_manager()
+    relations = db.list_project_relations(project_id)
+    
+    # 获取实体名称映射
+    entities = db.list_project_entities(project_id)
+    entity_map = {e.id: e.name for e in entities}
+    
+    return [{
+        "id": r["id"],
+        "source_id": r["source_entity_id"],
+        "source_name": entity_map.get(r["source_entity_id"], "Unknown"),
+        "target_id": r["target_entity_id"],
+        "target_name": entity_map.get(r["target_entity_id"], "Unknown"),
+        "type": r["relation_type"],
+        "evidence": r["evidence"]
+    } for r in relations]
+
+
+@app.get("/api/v1/projects/{project_id}/transcripts")
+async def get_project_transcripts(project_id: str):
+    """获取项目的转录列表"""
+    if not DB_AVAILABLE:
+        return []
+    
+    db = get_db_manager()
+    transcripts = db.list_project_transcripts(project_id)
+    return [{
+        "id": t["id"],
+        "filename": t["filename"],
+        "created_at": t["created_at"],
+        "preview": t["full_text"][:100] + "..." if len(t["full_text"]) > 100 else t["full_text"]
+    } for t in transcripts]
+
+
+@app.get("/api/v1/entities/{entity_id}/mentions")
+async def get_entity_mentions(entity_id: str):
+    """获取实体的所有提及位置"""
+    if not DB_AVAILABLE:
+        return []
+    
+    db = get_db_manager()
+    mentions = db.get_entity_mentions(entity_id)
+    return [{
+        "id": m.id,
+        "transcript_id": m.transcript_id,
+        "start_pos": m.start_pos,
+        "end_pos": m.end_pos,
+        "text_snippet": m.text_snippet,
+        "confidence": m.confidence
+    } for m in mentions]
+
@app.post("/api/v1/entities/{entity_id}/merge")
 async def merge_entities(entity_id: str, target_entity_id: str):
    """合并两个实体"""