feat: Phase 1 MVP 完成

- 实现实体和关系同时提取(LLM)
- 添加 transcripts/mentions/relations 数据持久化
- 新增 API: 关系列表、转录列表、实体提及位置
- 前端实体高亮显示和图谱联动
- 添加 STATUS.md 跟踪开发进度
This commit is contained in:
OpenClaw Bot
2026-02-18 00:03:08 +08:00
parent 77d14e673f
commit 2a3081c151
5 changed files with 451 additions and 73 deletions

View File

@@ -221,9 +221,81 @@ class DatabaseManager:
conn.close()
return [EntityMention(**dict(r)) for r in rows]
# Transcript operations
def save_transcript(self, transcript_id: str, project_id: str, filename: str, full_text: str):
    """Insert one row into the ``transcripts`` table.

    Args:
        transcript_id: Value stored in the ``id`` column.
        project_id: Value stored in the ``project_id`` column.
        filename: Value stored in the ``filename`` column.
        full_text: Value stored in the ``full_text`` column.

    The ``created_at`` column is set to ``datetime.now().isoformat()``
    (naive local time).

    Raises:
        Any error raised by the connection while inserting or committing
        is propagated unchanged; the connection is closed first.
    """
    conn = self.get_conn()
    try:
        now = datetime.now().isoformat()
        conn.execute(
            "INSERT INTO transcripts (id, project_id, filename, full_text, created_at) VALUES (?, ?, ?, ?, ?)",
            (transcript_id, project_id, filename, full_text, now)
        )
        conn.commit()
    finally:
        # Close even when the insert fails so the handle is never leaked.
        conn.close()
def get_transcript(self, transcript_id: str) -> Optional[dict]:
    """Fetch a single transcript row by its ``id``.

    Args:
        transcript_id: Value matched against the ``id`` column.

    Returns:
        The row converted with ``dict(row)``, or ``None`` when no row
        matches.
    """
    conn = self.get_conn()
    try:
        row = conn.execute("SELECT * FROM transcripts WHERE id = ?", (transcript_id,)).fetchone()
    finally:
        # Close even when the query fails so the handle is never leaked.
        conn.close()
    return dict(row) if row else None
def list_project_transcripts(self, project_id: str) -> List[dict]:
    """List every transcript row belonging to a project, newest first.

    Args:
        project_id: Value matched against the ``project_id`` column.

    Returns:
        One ``dict`` per matching row, ordered by ``created_at``
        descending; an empty list when nothing matches.
    """
    conn = self.get_conn()
    try:
        rows = conn.execute(
            "SELECT * FROM transcripts WHERE project_id = ? ORDER BY created_at DESC",
            (project_id,)
        ).fetchall()
    finally:
        # Close even when the query fails so the handle is never leaked.
        conn.close()
    return [dict(r) for r in rows]
# Relation operations
def create_relation(self, project_id: str, source_entity_id: str, target_entity_id: str,
                    relation_type: str = "related", evidence: str = "", transcript_id: str = "") -> str:
    """Insert one row into ``entity_relations`` and return its new id.

    Args:
        project_id: Value stored in the ``project_id`` column.
        source_entity_id: Value stored in the ``source_entity_id`` column.
        target_entity_id: Value stored in the ``target_entity_id`` column.
        relation_type: Relation label; defaults to ``"related"``.
        evidence: Supporting text; defaults to the empty string.
        transcript_id: Originating transcript id; defaults to the empty
            string.

    Returns:
        The generated relation id: the first 8 characters of a random
        UUID4 string.

    Raises:
        Any error raised by the connection while inserting or committing
        is propagated unchanged; the connection is closed first.
    """
    conn = self.get_conn()
    try:
        relation_id = str(uuid.uuid4())[:8]
        now = datetime.now().isoformat()
        conn.execute(
            """INSERT INTO entity_relations
(id, project_id, source_entity_id, target_entity_id, relation_type, evidence, transcript_id, created_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
            (relation_id, project_id, source_entity_id, target_entity_id, relation_type, evidence, transcript_id, now)
        )
        conn.commit()
    finally:
        # Close even when the insert fails so the handle is never leaked.
        conn.close()
    return relation_id
def get_entity_relations(self, entity_id: str) -> List[dict]:
    """List every relation in which an entity is the source or the target.

    Args:
        entity_id: Value matched against both ``source_entity_id`` and
            ``target_entity_id``.

    Returns:
        One ``dict`` per matching ``entity_relations`` row, ordered by
        ``created_at`` descending; an empty list when nothing matches.
    """
    conn = self.get_conn()
    try:
        rows = conn.execute(
            """SELECT * FROM entity_relations
WHERE source_entity_id = ? OR target_entity_id = ?
ORDER BY created_at DESC""",
            (entity_id, entity_id)
        ).fetchall()
    finally:
        # Close even when the query fails so the handle is never leaked.
        conn.close()
    return [dict(r) for r in rows]
def list_project_relations(self, project_id: str) -> List[dict]:
    """List every relation row belonging to a project, newest first.

    Args:
        project_id: Value matched against the ``project_id`` column.

    Returns:
        One ``dict`` per matching ``entity_relations`` row, ordered by
        ``created_at`` descending; an empty list when nothing matches.
    """
    conn = self.get_conn()
    try:
        rows = conn.execute(
            "SELECT * FROM entity_relations WHERE project_id = ? ORDER BY created_at DESC",
            (project_id,)
        ).fetchall()
    finally:
        # Close even when the query fails so the handle is never leaked.
        conn.close()
    return [dict(r) for r in rows]
# Singleton instance
_db_manager = None
def get_db_manager() -> DatabaseManager:
global _db_manager
if _db_manager is None:

View File

@@ -114,20 +114,34 @@ def mock_transcribe() -> dict:
]
}
def extract_entities_with_llm(text: str) -> List[dict]:
"""使用 Kimi API 提取实体"""
if not KIMI_API_KEY or not text:
return []
def extract_entities_with_llm(text: str) -> tuple[List[dict], List[dict]]:
"""使用 Kimi API 提取实体和关系
prompt = f"""从以下会议文本中提取关键实体,以 JSON 格式返回:
Returns:
(entities, relations): 实体列表和关系列表
"""
if not KIMI_API_KEY or not text:
return [], []
prompt = f"""从以下会议文本中提取关键实体和它们之间的关系,以 JSON 格式返回:
文本:{text[:3000]}
要求:
1. 每个实体包含name(名称), type(类型: PROJECT/TECH/PERSON/ORG/OTHER), definition(一句话定义)
2. 只返回 JSON 数组
1. entities: 每个实体包含 name(名称), type(类型: PROJECT/TECH/PERSON/ORG/OTHER), definition(一句话定义)
2. relations: 每个关系包含 source(源实体名), target(目标实体名), type(关系类型: belongs_to/works_with/depends_on/mentions/related)
3. 只返回 JSON 对象,格式: {{"entities": [...], "relations": [...]}}
示例:[{{"name": "Project Alpha", "type": "PROJECT", "definition": "核心项目"}}]
示例:
{{
"entities": [
{{"name": "Project Alpha", "type": "PROJECT", "definition": "核心项目"}},
{{"name": "K8s", "type": "TECH", "definition": "Kubernetes容器编排平台"}}
],
"relations": [
{{"source": "Project Alpha", "target": "K8s", "type": "depends_on"}}
]
}}
"""
try:
@@ -142,13 +156,14 @@ def extract_entities_with_llm(text: str) -> List[dict]:
content = result["choices"][0]["message"]["content"]
import re
json_match = re.search(r'\[.*?\]', content, re.DOTALL)
json_match = re.search(r'\{{.*?\}}', content, re.DOTALL)
if json_match:
return json.loads(json_match.group())
data = json.loads(json_match.group())
return data.get("entities", []), data.get("relations", [])
except Exception as e:
print(f"LLM extraction failed: {e}")
return []
return [], []
def align_entity(project_id: str, name: str, db) -> Optional[Entity]:
"""实体对齐"""
@@ -202,12 +217,23 @@ async def upload_audio(project_id: str, file: UploadFile = File(...)):
print(f"Processing: {file.filename}")
tw_result = transcribe_audio(content, file.filename)
# 提取实体
print("Extracting entities...")
raw_entities = extract_entities_with_llm(tw_result["full_text"])
# 提取实体和关系
print("Extracting entities and relations...")
raw_entities, raw_relations = extract_entities_with_llm(tw_result["full_text"])
# 实体对齐
# 保存转录记录
transcript_id = str(uuid.uuid4())[:8]
db.save_transcript(
transcript_id=transcript_id,
project_id=project_id,
filename=file.filename,
full_text=tw_result["full_text"]
)
# 实体对齐并保存
aligned_entities = []
entity_name_to_id = {} # 用于关系映射
for raw_ent in raw_entities:
existing = align_entity(project_id, raw_ent["name"], db)
@@ -219,6 +245,7 @@ async def upload_audio(project_id: str, file: UploadFile = File(...)):
definition=existing.definition,
aliases=existing.aliases
)
entity_name_to_id[raw_ent["name"]] = existing.id
else:
new_ent = db.create_entity(Entity(
id=str(uuid.uuid4())[:8],
@@ -233,14 +260,47 @@ async def upload_audio(project_id: str, file: UploadFile = File(...)):
type=new_ent.type,
definition=new_ent.definition
)
entity_name_to_id[raw_ent["name"]] = new_ent.id
aligned_entities.append(ent_model)
# 保存实体提及位置
full_text = tw_result["full_text"]
name = raw_ent["name"]
start_pos = 0
while True:
pos = full_text.find(name, start_pos)
if pos == -1:
break
mention = EntityMention(
id=str(uuid.uuid4())[:8],
entity_id=entity_name_to_id[name],
transcript_id=transcript_id,
start_pos=pos,
end_pos=pos + len(name),
text_snippet=full_text[max(0, pos-20):min(len(full_text), pos+len(name)+20)],
confidence=1.0
)
db.add_mention(mention)
start_pos = pos + 1
# 保存关系
for rel in raw_relations:
source_id = entity_name_to_id.get(rel.get("source", ""))
target_id = entity_name_to_id.get(rel.get("target", ""))
if source_id and target_id:
db.create_relation(
project_id=project_id,
source_entity_id=source_id,
target_entity_id=target_id,
relation_type=rel.get("type", "related"),
evidence=tw_result["full_text"][:200],
transcript_id=transcript_id
)
# 构建片段
segments = [TranscriptSegment(**seg) for seg in tw_result["segments"]]
transcript_id = str(uuid.uuid4())[:8]
return AnalysisResult(
transcript_id=transcript_id,
project_id=project_id,
@@ -260,6 +320,64 @@ async def get_project_entities(project_id: str):
entities = db.list_project_entities(project_id)
return [{"id": e.id, "name": e.name, "type": e.type, "definition": e.definition} for e in entities]
@app.get("/api/v1/projects/{project_id}/relations")
async def get_project_relations(project_id: str):
    """Return the project's entity relations with entity names resolved.

    Responds with an empty list when the database layer is unavailable.
    Each item carries the relation id, source/target entity ids and
    names, the relation type and the stored evidence text. An entity id
    that is not among the project's entities is reported with the name
    "Unknown".
    """
    if not DB_AVAILABLE:
        return []

    manager = get_db_manager()
    relation_rows = manager.list_project_relations(project_id)

    # Map entity id -> display name so each relation can be labelled.
    names_by_id = {}
    for entity in manager.list_project_entities(project_id):
        names_by_id[entity.id] = entity.name

    payload = []
    for row in relation_rows:
        relation_id = row["id"]
        source_id = row["source_entity_id"]
        target_id = row["target_entity_id"]
        payload.append({
            "id": relation_id,
            "source_id": source_id,
            "source_name": names_by_id.get(source_id, "Unknown"),
            "target_id": target_id,
            "target_name": names_by_id.get(target_id, "Unknown"),
            "type": row["relation_type"],
            "evidence": row["evidence"],
        })
    return payload
@app.get("/api/v1/projects/{project_id}/transcripts")
async def get_project_transcripts(project_id: str):
    """Return a summary of every transcript stored for the project.

    Responds with an empty list when the database layer is unavailable.
    Each item carries the transcript id, filename, creation timestamp
    and a preview: the full text when it is at most 100 characters,
    otherwise its first 100 characters followed by "...".
    """
    if not DB_AVAILABLE:
        return []

    manager = get_db_manager()
    summaries = []
    for record in manager.list_project_transcripts(project_id):
        summary = {
            "id": record["id"],
            "filename": record["filename"],
            "created_at": record["created_at"],
        }
        body = record["full_text"]
        if len(body) > 100:
            summary["preview"] = body[:100] + "..."
        else:
            summary["preview"] = body
        summaries.append(summary)
    return summaries
@app.get("/api/v1/entities/{entity_id}/mentions")
async def get_entity_mentions(entity_id: str):
    """Return every recorded mention position of an entity.

    Responds with an empty list when the database layer is unavailable.
    Each item carries the mention id, the transcript it belongs to, the
    start/end offsets, the surrounding text snippet and the confidence.
    """
    if not DB_AVAILABLE:
        return []

    manager = get_db_manager()
    found = manager.get_entity_mentions(entity_id)

    payload = []
    for mention in found:
        payload.append({
            "id": mention.id,
            "transcript_id": mention.transcript_id,
            "start_pos": mention.start_pos,
            "end_pos": mention.end_pos,
            "text_snippet": mention.text_snippet,
            "confidence": mention.confidence,
        })
    return payload
@app.post("/api/v1/entities/{entity_id}/merge")
async def merge_entities(entity_id: str, target_entity_id: str):
"""合并两个实体"""