feat: Phase 1 MVP 完成
- 实现实体和关系同时提取(LLM)
- 添加 transcripts/mentions/relations 数据持久化
- 新增 API: 关系列表、转录列表、实体提及位置
- 前端实体高亮显示和图谱联动
- 添加 STATUS.md 跟踪开发进度
This commit is contained in:
152
backend/main.py
152
backend/main.py
@@ -114,20 +114,34 @@ def mock_transcribe() -> dict:
|
||||
]
|
||||
}
|
||||
|
||||
def extract_entities_with_llm(text: str) -> List[dict]:
|
||||
"""使用 Kimi API 提取实体"""
|
||||
if not KIMI_API_KEY or not text:
|
||||
return []
|
||||
def extract_entities_with_llm(text: str) -> tuple[List[dict], List[dict]]:
|
||||
"""使用 Kimi API 提取实体和关系
|
||||
|
||||
prompt = f"""从以下会议文本中提取关键实体,以 JSON 格式返回:
|
||||
Returns:
|
||||
(entities, relations): 实体列表和关系列表
|
||||
"""
|
||||
if not KIMI_API_KEY or not text:
|
||||
return [], []
|
||||
|
||||
prompt = f"""从以下会议文本中提取关键实体和它们之间的关系,以 JSON 格式返回:
|
||||
|
||||
文本:{text[:3000]}
|
||||
|
||||
要求:
|
||||
1. 每个实体包含:name(名称), type(类型: PROJECT/TECH/PERSON/ORG/OTHER), definition(一句话定义)
|
||||
2. 只返回 JSON 数组
|
||||
1. entities: 每个实体包含 name(名称), type(类型: PROJECT/TECH/PERSON/ORG/OTHER), definition(一句话定义)
|
||||
2. relations: 每个关系包含 source(源实体名), target(目标实体名), type(关系类型: belongs_to/works_with/depends_on/mentions/related)
|
||||
3. 只返回 JSON 对象,格式: {{"entities": [...], "relations": [...]}}
|
||||
|
||||
示例:[{{"name": "Project Alpha", "type": "PROJECT", "definition": "核心项目"}}]
|
||||
示例:
|
||||
{{
|
||||
"entities": [
|
||||
{{"name": "Project Alpha", "type": "PROJECT", "definition": "核心项目"}},
|
||||
{{"name": "K8s", "type": "TECH", "definition": "Kubernetes容器编排平台"}}
|
||||
],
|
||||
"relations": [
|
||||
{{"source": "Project Alpha", "target": "K8s", "type": "depends_on"}}
|
||||
]
|
||||
}}
|
||||
"""
|
||||
|
||||
try:
|
||||
@@ -142,13 +156,14 @@ def extract_entities_with_llm(text: str) -> List[dict]:
|
||||
content = result["choices"][0]["message"]["content"]
|
||||
|
||||
import re
|
||||
json_match = re.search(r'\[.*?\]', content, re.DOTALL)
|
||||
json_match = re.search(r'\{{.*?\}}', content, re.DOTALL)
|
||||
if json_match:
|
||||
return json.loads(json_match.group())
|
||||
data = json.loads(json_match.group())
|
||||
return data.get("entities", []), data.get("relations", [])
|
||||
except Exception as e:
|
||||
print(f"LLM extraction failed: {e}")
|
||||
|
||||
return []
|
||||
return [], []
|
||||
|
||||
def align_entity(project_id: str, name: str, db) -> Optional[Entity]:
|
||||
"""实体对齐"""
|
||||
@@ -202,12 +217,23 @@ async def upload_audio(project_id: str, file: UploadFile = File(...)):
|
||||
print(f"Processing: {file.filename}")
|
||||
tw_result = transcribe_audio(content, file.filename)
|
||||
|
||||
# 提取实体
|
||||
print("Extracting entities...")
|
||||
raw_entities = extract_entities_with_llm(tw_result["full_text"])
|
||||
# 提取实体和关系
|
||||
print("Extracting entities and relations...")
|
||||
raw_entities, raw_relations = extract_entities_with_llm(tw_result["full_text"])
|
||||
|
||||
# 实体对齐
|
||||
# 保存转录记录
|
||||
transcript_id = str(uuid.uuid4())[:8]
|
||||
db.save_transcript(
|
||||
transcript_id=transcript_id,
|
||||
project_id=project_id,
|
||||
filename=file.filename,
|
||||
full_text=tw_result["full_text"]
|
||||
)
|
||||
|
||||
# 实体对齐并保存
|
||||
aligned_entities = []
|
||||
entity_name_to_id = {} # 用于关系映射
|
||||
|
||||
for raw_ent in raw_entities:
|
||||
existing = align_entity(project_id, raw_ent["name"], db)
|
||||
|
||||
@@ -219,6 +245,7 @@ async def upload_audio(project_id: str, file: UploadFile = File(...)):
|
||||
definition=existing.definition,
|
||||
aliases=existing.aliases
|
||||
)
|
||||
entity_name_to_id[raw_ent["name"]] = existing.id
|
||||
else:
|
||||
new_ent = db.create_entity(Entity(
|
||||
id=str(uuid.uuid4())[:8],
|
||||
@@ -233,14 +260,47 @@ async def upload_audio(project_id: str, file: UploadFile = File(...)):
|
||||
type=new_ent.type,
|
||||
definition=new_ent.definition
|
||||
)
|
||||
entity_name_to_id[raw_ent["name"]] = new_ent.id
|
||||
|
||||
aligned_entities.append(ent_model)
|
||||
|
||||
# 保存实体提及位置
|
||||
full_text = tw_result["full_text"]
|
||||
name = raw_ent["name"]
|
||||
start_pos = 0
|
||||
while True:
|
||||
pos = full_text.find(name, start_pos)
|
||||
if pos == -1:
|
||||
break
|
||||
mention = EntityMention(
|
||||
id=str(uuid.uuid4())[:8],
|
||||
entity_id=entity_name_to_id[name],
|
||||
transcript_id=transcript_id,
|
||||
start_pos=pos,
|
||||
end_pos=pos + len(name),
|
||||
text_snippet=full_text[max(0, pos-20):min(len(full_text), pos+len(name)+20)],
|
||||
confidence=1.0
|
||||
)
|
||||
db.add_mention(mention)
|
||||
start_pos = pos + 1
|
||||
|
||||
# 保存关系
|
||||
for rel in raw_relations:
|
||||
source_id = entity_name_to_id.get(rel.get("source", ""))
|
||||
target_id = entity_name_to_id.get(rel.get("target", ""))
|
||||
if source_id and target_id:
|
||||
db.create_relation(
|
||||
project_id=project_id,
|
||||
source_entity_id=source_id,
|
||||
target_entity_id=target_id,
|
||||
relation_type=rel.get("type", "related"),
|
||||
evidence=tw_result["full_text"][:200],
|
||||
transcript_id=transcript_id
|
||||
)
|
||||
|
||||
# 构建片段
|
||||
segments = [TranscriptSegment(**seg) for seg in tw_result["segments"]]
|
||||
|
||||
transcript_id = str(uuid.uuid4())[:8]
|
||||
|
||||
return AnalysisResult(
|
||||
transcript_id=transcript_id,
|
||||
project_id=project_id,
|
||||
@@ -260,6 +320,64 @@ async def get_project_entities(project_id: str):
|
||||
entities = db.list_project_entities(project_id)
|
||||
return [{"id": e.id, "name": e.name, "type": e.type, "definition": e.definition} for e in entities]
|
||||
|
||||
|
||||
@app.get("/api/v1/projects/{project_id}/relations")
async def get_project_relations(project_id: str):
    """Return the project's entity relations with entity names resolved.

    Returns an empty list when the database layer is unavailable. An entity
    id that is not present in the project's entity list is reported with the
    name "Unknown".
    """
    if not DB_AVAILABLE:
        return []

    manager = get_db_manager()
    relation_rows = manager.list_project_relations(project_id)

    # Map entity id -> name so each relation can carry readable endpoints.
    name_by_id = {}
    for entity in manager.list_project_entities(project_id):
        name_by_id[entity.id] = entity.name

    response = []
    for row in relation_rows:
        item = {"id": row["id"]}

        source_id = row["source_entity_id"]
        item["source_id"] = source_id
        item["source_name"] = name_by_id.get(source_id, "Unknown")

        target_id = row["target_entity_id"]
        item["target_id"] = target_id
        item["target_name"] = name_by_id.get(target_id, "Unknown")

        item["type"] = row["relation_type"]
        item["evidence"] = row["evidence"]
        response.append(item)

    return response
|
||||
|
||||
|
||||
@app.get("/api/v1/projects/{project_id}/transcripts")
async def get_project_transcripts(project_id: str):
    """Return the project's transcripts as summaries with a short preview.

    Returns an empty list when the database layer is unavailable. The
    preview is the full text when it has at most 100 characters; otherwise
    it is the first 100 characters followed by "...".
    """
    if not DB_AVAILABLE:
        return []

    manager = get_db_manager()

    summaries = []
    for record in manager.list_project_transcripts(project_id):
        summary = {
            "id": record["id"],
            "filename": record["filename"],
            "created_at": record["created_at"],
        }

        body = record["full_text"]
        if len(body) > 100:
            summary["preview"] = body[:100] + "..."
        else:
            summary["preview"] = body

        summaries.append(summary)

    return summaries
|
||||
|
||||
|
||||
@app.get("/api/v1/entities/{entity_id}/mentions")
async def get_entity_mentions(entity_id: str):
    """Return every recorded mention of the entity as a plain dict.

    Returns an empty list when the database layer is unavailable. Each dict
    copies the mention's id, transcript id, start/end positions, text
    snippet and confidence.
    """
    if not DB_AVAILABLE:
        return []

    # Response keys are identical to the mention attribute names, in the
    # order they appear in the output.
    exposed_fields = (
        "id",
        "transcript_id",
        "start_pos",
        "end_pos",
        "text_snippet",
        "confidence",
    )

    manager = get_db_manager()
    found = manager.get_entity_mentions(entity_id)

    serialized = []
    for mention in found:
        serialized.append(
            {field: getattr(mention, field) for field in exposed_fields}
        )
    return serialized
|
||||
|
||||
@app.post("/api/v1/entities/{entity_id}/merge")
|
||||
async def merge_entities(entity_id: str, target_entity_id: str):
|
||||
"""合并两个实体"""
|
||||
|
||||
Reference in New Issue
Block a user