diff --git a/STATUS.md b/STATUS.md new file mode 100644 index 0000000..709bc53 --- /dev/null +++ b/STATUS.md @@ -0,0 +1,69 @@ +# InsightFlow 开发状态 + +**最后更新**: 2026-02-18 + +## 当前阶段 + +Phase 1: 骨架与单体分析 (MVP) - **已完成 ✅** + +## 已完成 + +### 后端 (backend/) +- ✅ FastAPI 项目框架搭建 +- ✅ SQLite 数据库设计 (schema.sql) +- ✅ 数据库管理模块 (db_manager.py) +- ✅ 阿里云听悟 ASR 集成 (tingwu_client.py) +- ✅ OSS 上传模块 (oss_uploader.py) +- ✅ 实体提取与对齐逻辑 +- ✅ 关系提取(LLM 同时提取实体和关系) +- ✅ 项目 CRUD API +- ✅ 音频上传与分析 API +- ✅ 实体列表 API +- ✅ 关系列表 API +- ✅ 转录列表 API +- ✅ 实体提及位置 API +- ✅ transcripts 表数据写入 +- ✅ entity_mentions 表数据写入 +- ✅ entity_relations 表数据写入 + +### 前端 (frontend/) +- ✅ 项目管理页面 (index.html) +- ✅ 知识工作台页面 (workbench.html) +- ✅ D3.js 知识图谱可视化 +- ✅ 音频上传 UI +- ✅ 实体列表展示 +- ✅ 转录文本中实体高亮显示 +- ✅ 图谱与文本联动(点击实体双向高亮) + +### 基础设施 +- ✅ Dockerfile +- ✅ docker-compose.yml +- ✅ Git 仓库初始化 + +## Phase 2 计划 (交互与纠错工作台) - **即将开始** + +- 实体定义编辑功能 +- 实体合并功能 +- 关系编辑功能(添加/删除) +- 人工修正数据保存 +- 文本编辑器增强(支持编辑转录文本) + +## Phase 3 计划 (记忆与生长) + +- 多文件图谱融合 +- 实体对齐算法优化 +- PDF/DOCX 文档导入 +- 项目知识库面板 + +## 技术债务 + +- 听悟 SDK fallback 到 mock 需要更好的错误处理 +- 实体相似度匹配目前只是简单字符串包含,需要 embedding 方案 +- 前端需要状态管理(目前使用全局变量) +- 需要添加 API 文档 (OpenAPI/Swagger) + +## 部署信息 + +- 服务器: 122.51.127.111 +- 项目路径: /opt/projects/insightflow +- 端口: 18000 diff --git a/backend/db_manager.py b/backend/db_manager.py index 1d1973d..519a8c7 100644 --- a/backend/db_manager.py +++ b/backend/db_manager.py @@ -221,9 +221,81 @@ class DatabaseManager: conn.close() return [EntityMention(**dict(r)) for r in rows] + # Transcript operations + def save_transcript(self, transcript_id: str, project_id: str, filename: str, full_text: str): + """保存转录记录""" + conn = self.get_conn() + now = datetime.now().isoformat() + conn.execute( + "INSERT INTO transcripts (id, project_id, filename, full_text, created_at) VALUES (?, ?, ?, ?, ?)", + (transcript_id, project_id, filename, full_text, now) + ) + conn.commit() + conn.close() + + def get_transcript(self, transcript_id: str) -> Optional[dict]: + """获取转录记录""" + 
conn = self.get_conn() + row = conn.execute("SELECT * FROM transcripts WHERE id = ?", (transcript_id,)).fetchone() + conn.close() + if row: + return dict(row) + return None + + def list_project_transcripts(self, project_id: str) -> List[dict]: + """列出项目的所有转录""" + conn = self.get_conn() + rows = conn.execute( + "SELECT * FROM transcripts WHERE project_id = ? ORDER BY created_at DESC", + (project_id,) + ).fetchall() + conn.close() + return [dict(r) for r in rows] + + # Relation operations + def create_relation(self, project_id: str, source_entity_id: str, target_entity_id: str, + relation_type: str = "related", evidence: str = "", transcript_id: str = ""): + """创建实体关系""" + conn = self.get_conn() + relation_id = str(uuid.uuid4())[:8] + now = datetime.now().isoformat() + conn.execute( + """INSERT INTO entity_relations + (id, project_id, source_entity_id, target_entity_id, relation_type, evidence, transcript_id, created_at) + VALUES (?, ?, ?, ?, ?, ?, ?, ?)""", + (relation_id, project_id, source_entity_id, target_entity_id, relation_type, evidence, transcript_id, now) + ) + conn.commit() + conn.close() + return relation_id + + def get_entity_relations(self, entity_id: str) -> List[dict]: + """获取实体的所有关系""" + conn = self.get_conn() + rows = conn.execute( + """SELECT * FROM entity_relations + WHERE source_entity_id = ? OR target_entity_id = ? + ORDER BY created_at DESC""", + (entity_id, entity_id) + ).fetchall() + conn.close() + return [dict(r) for r in rows] + + def list_project_relations(self, project_id: str) -> List[dict]: + """列出项目的所有关系""" + conn = self.get_conn() + rows = conn.execute( + "SELECT * FROM entity_relations WHERE project_id = ? 
ORDER BY created_at DESC", + (project_id,) + ).fetchall() + conn.close() + return [dict(r) for r in rows] + + # Singleton instance _db_manager = None + def get_db_manager() -> DatabaseManager: global _db_manager if _db_manager is None: diff --git a/backend/main.py b/backend/main.py index ab5c6a6..470c305 100644 --- a/backend/main.py +++ b/backend/main.py @@ -114,20 +114,34 @@ def mock_transcribe() -> dict: ] } -def extract_entities_with_llm(text: str) -> List[dict]: - """使用 Kimi API 提取实体""" - if not KIMI_API_KEY or not text: - return [] +def extract_entities_with_llm(text: str) -> tuple[List[dict], List[dict]]: + """使用 Kimi API 提取实体和关系 - prompt = f"""从以下会议文本中提取关键实体,以 JSON 格式返回: + Returns: + (entities, relations): 实体列表和关系列表 + """ + if not KIMI_API_KEY or not text: + return [], [] + + prompt = f"""从以下会议文本中提取关键实体和它们之间的关系,以 JSON 格式返回: 文本:{text[:3000]} 要求: -1. 每个实体包含:name(名称), type(类型: PROJECT/TECH/PERSON/ORG/OTHER), definition(一句话定义) -2. 只返回 JSON 数组 +1. entities: 每个实体包含 name(名称), type(类型: PROJECT/TECH/PERSON/ORG/OTHER), definition(一句话定义) +2. relations: 每个关系包含 source(源实体名), target(目标实体名), type(关系类型: belongs_to/works_with/depends_on/mentions/related) +3. 
只返回 JSON 对象,格式: {{"entities": [...], "relations": [...]}} -示例:[{{"name": "Project Alpha", "type": "PROJECT", "definition": "核心项目"}}] +示例: +{{ + "entities": [ + {{"name": "Project Alpha", "type": "PROJECT", "definition": "核心项目"}}, + {{"name": "K8s", "type": "TECH", "definition": "Kubernetes容器编排平台"}} + ], + "relations": [ + {{"source": "Project Alpha", "target": "K8s", "type": "depends_on"}} + ] +}} """ try: @@ -142,13 +156,14 @@ def extract_entities_with_llm(text: str) -> List[dict]: content = result["choices"][0]["message"]["content"] import re - json_match = re.search(r'\[.*?\]', content, re.DOTALL) + json_match = re.search(r'\{.*\}', content, re.DOTALL) if json_match: - return json.loads(json_match.group()) + data = json.loads(json_match.group()) + return data.get("entities", []), data.get("relations", []) except Exception as e: print(f"LLM extraction failed: {e}") - return [] + return [], [] def align_entity(project_id: str, name: str, db) -> Optional[Entity]: """实体对齐""" @@ -202,12 +217,23 @@ async def upload_audio(project_id: str, file: UploadFile = File(...)): print(f"Processing: {file.filename}") tw_result = transcribe_audio(content, file.filename) - # 提取实体 - print("Extracting entities...") - raw_entities = extract_entities_with_llm(tw_result["full_text"]) + # 提取实体和关系 + print("Extracting entities and relations...") + raw_entities, raw_relations = extract_entities_with_llm(tw_result["full_text"]) - # 实体对齐 + # 保存转录记录 + transcript_id = str(uuid.uuid4())[:8] + db.save_transcript( + transcript_id=transcript_id, + project_id=project_id, + filename=file.filename, + full_text=tw_result["full_text"] + ) + + # 实体对齐并保存 aligned_entities = [] + entity_name_to_id = {} # 用于关系映射 + for raw_ent in raw_entities: existing = align_entity(project_id, raw_ent["name"], db) @@ -219,6 +245,7 @@ async def upload_audio(project_id: str, file: UploadFile = File(...)): definition=existing.definition, aliases=existing.aliases ) + entity_name_to_id[raw_ent["name"]] = existing.id else: 
new_ent = db.create_entity(Entity( id=str(uuid.uuid4())[:8], @@ -233,14 +260,47 @@ async def upload_audio(project_id: str, file: UploadFile = File(...)): type=new_ent.type, definition=new_ent.definition ) + entity_name_to_id[raw_ent["name"]] = new_ent.id aligned_entities.append(ent_model) + + # 保存实体提及位置 + full_text = tw_result["full_text"] + name = raw_ent["name"] + start_pos = 0 + while name: + pos = full_text.find(name, start_pos) + if pos == -1: + break + mention = EntityMention( + id=str(uuid.uuid4())[:8], + entity_id=entity_name_to_id[name], + transcript_id=transcript_id, + start_pos=pos, + end_pos=pos + len(name), + text_snippet=full_text[max(0, pos-20):min(len(full_text), pos+len(name)+20)], + confidence=1.0 + ) + db.add_mention(mention) + start_pos = pos + 1 + + # 保存关系 + for rel in raw_relations: + source_id = entity_name_to_id.get(rel.get("source", "")) + target_id = entity_name_to_id.get(rel.get("target", "")) + if source_id and target_id: + db.create_relation( + project_id=project_id, + source_entity_id=source_id, + target_entity_id=target_id, + relation_type=rel.get("type", "related"), + evidence=tw_result["full_text"][:200], + transcript_id=transcript_id + ) # 构建片段 segments = [TranscriptSegment(**seg) for seg in tw_result["segments"]] - transcript_id = str(uuid.uuid4())[:8] - return AnalysisResult( transcript_id=transcript_id, project_id=project_id, @@ -260,6 +320,64 @@ async def get_project_entities(project_id: str): entities = db.list_project_entities(project_id) return [{"id": e.id, "name": e.name, "type": e.type, "definition": e.definition} for e in entities] + +@app.get("/api/v1/projects/{project_id}/relations") +async def get_project_relations(project_id: str): + """获取项目的实体关系列表""" + if not DB_AVAILABLE: + return [] + + db = get_db_manager() + relations = db.list_project_relations(project_id) + + # 获取实体名称映射 + entities = db.list_project_entities(project_id) + entity_map = {e.id: e.name for e in entities} + + return [{ + "id": r["id"], + 
"source_id": r["source_entity_id"], + "source_name": entity_map.get(r["source_entity_id"], "Unknown"), + "target_id": r["target_entity_id"], + "target_name": entity_map.get(r["target_entity_id"], "Unknown"), + "type": r["relation_type"], + "evidence": r["evidence"] + } for r in relations] + + +@app.get("/api/v1/projects/{project_id}/transcripts") +async def get_project_transcripts(project_id: str): + """获取项目的转录列表""" + if not DB_AVAILABLE: + return [] + + db = get_db_manager() + transcripts = db.list_project_transcripts(project_id) + return [{ + "id": t["id"], + "filename": t["filename"], + "created_at": t["created_at"], + "preview": t["full_text"][:100] + "..." if len(t["full_text"]) > 100 else t["full_text"] + } for t in transcripts] + + +@app.get("/api/v1/entities/{entity_id}/mentions") +async def get_entity_mentions(entity_id: str): + """获取实体的所有提及位置""" + if not DB_AVAILABLE: + return [] + + db = get_db_manager() + mentions = db.get_entity_mentions(entity_id) + return [{ + "id": m.id, + "transcript_id": m.transcript_id, + "start_pos": m.start_pos, + "end_pos": m.end_pos, + "text_snippet": m.text_snippet, + "confidence": m.confidence + } for m in mentions] + @app.post("/api/v1/entities/{entity_id}/merge") async def merge_entities(entity_id: str, target_entity_id: str): """合并两个实体""" diff --git a/data/aws-learning-progress.json b/data/aws-learning-progress.json new file mode 100644 index 0000000..1828a4a --- /dev/null +++ b/data/aws-learning-progress.json @@ -0,0 +1,21 @@ +{ + "current_domain": "复杂工作负载设计", + "domain_index": 0, + "domains": [ + "复杂工作负载设计", + "高可用与容错", + "安全设计", + "成本优化", + "迁移与现代化", + "组织复杂性管理" + ], + "recent_topics": [ + "Multi-AZ vs Multi-Region", + "ASG Health Check", + "Storage Gateway", + "Transit Gateway", + "Transit Gateway 路由表" + ], + "total_learned": 5, + "last_updated": "2026-02-17T20:00:00+08:00" +} \ No newline at end of file diff --git a/frontend/app.js b/frontend/app.js index 55543a9..dc872a2 100644 --- a/frontend/app.js +++ 
b/frontend/app.js @@ -4,6 +4,8 @@ const API_BASE = '/api/v1'; let currentProject = null; let currentData = null; let selectedEntity = null; +let projectRelations = []; +let projectEntities = []; // Init document.addEventListener('DOMContentLoaded', () => { @@ -35,7 +37,7 @@ async function initWorkbench() { if (nameEl) nameEl.textContent = currentProject.name; initUpload(); - await loadProjectEntities(); + await loadProjectData(); } catch (err) { console.error('Init failed:', err); @@ -63,22 +65,26 @@ async function uploadAudio(file) { return await res.json(); } -async function loadProjectEntities() { +async function loadProjectData() { try { - const res = await fetch(`${API_BASE}/projects/${currentProject.id}/entities`); - if (!res.ok) return; - const entities = await res.json(); + // 并行加载实体和关系 + const [entitiesRes, relationsRes] = await Promise.all([ + fetch(`${API_BASE}/projects/${currentProject.id}/entities`), + fetch(`${API_BASE}/projects/${currentProject.id}/relations`) + ]); + + if (entitiesRes.ok) { + projectEntities = await entitiesRes.json(); + } + if (relationsRes.ok) { + projectRelations = await relationsRes.json(); + } currentData = { transcript_id: 'project_view', project_id: currentProject.id, segments: [], - entities: entities.map(e => ({ - id: e.id, - name: e.name, - type: e.type, - definition: e.definition || '' - })), + entities: projectEntities, full_text: '', created_at: new Date().toISOString() }; @@ -87,11 +93,11 @@ async function loadProjectEntities() { renderEntityList(); } catch (err) { - console.error('Load entities failed:', err); + console.error('Load project data failed:', err); } } -// Render transcript +// Render transcript with entity highlighting function renderTranscript() { const container = document.getElementById('transcriptContent'); if (!container || !currentData || !currentData.segments) return; @@ -103,8 +109,11 @@ function renderTranscript() { div.className = 'segment'; div.dataset.index = idx; + // 高亮实体 let text = 
seg.text; - const entities = findEntitiesInSegment(seg, idx); + const entities = findEntitiesInText(seg.text); + + // 按位置倒序替换,避免位置偏移 entities.sort((a, b) => b.start - a.start); entities.forEach(ent => { @@ -123,29 +132,50 @@ function renderTranscript() { }); } -function findEntitiesInSegment(seg, segIndex) { - if (!currentData || !currentData.entities) return []; +// 在文本中查找实体位置 +function findEntitiesInText(text) { + if (!projectEntities || projectEntities.length === 0) return []; - let offset = 0; - for (let i = 0; i < segIndex; i++) { - offset += currentData.segments[i].text.length + 1; - } + const found = []; + projectEntities.forEach(ent => { + const name = ent.name; + let pos = 0; + while (name && (pos = text.indexOf(name, pos)) !== -1) { + found.push({ + id: ent.id, + name: ent.name, + start: pos, + end: pos + name.length + }); + pos += 1; + } + + // 也检查别名 + if (ent.aliases && ent.aliases.length > 0) { + ent.aliases.forEach(alias => { + let aliasPos = 0; + while (alias && (aliasPos = text.indexOf(alias, aliasPos)) !== -1) { + found.push({ + id: ent.id, + name: alias, + start: aliasPos, + end: aliasPos + alias.length + }); + aliasPos += 1; + } + }); + } + }); - return currentData.entities.filter(ent => { - return ent.start >= offset && ent.end <= offset + seg.text.length; - }).map(ent => ({ - ...ent, - start: ent.start - offset, - end: ent.end - offset - })); + return found; } -// Render D3 graph +// Render D3 graph with relations function renderGraph() { const svg = d3.select('#graph-svg'); svg.selectAll('*').remove(); - if (!currentData || !currentData.entities || currentData.entities.length === 0) { + if (!projectEntities || projectEntities.length === 0) { svg.append('text') .attr('x', '50%') .attr('y', '50%') @@ -155,21 +185,32 @@ function renderGraph() { return; } - const width = svg.node().parentElement.clientWidth; - const height = svg.node().parentElement.clientHeight - 200; + const container = svg.node().parentElement; + const width = container.clientWidth; + const 
height = container.clientHeight - 200; svg.attr('width', width).attr('height', height); - const nodes = currentData.entities.map(e => ({ + const nodes = projectEntities.map(e => ({ id: e.id, name: e.name, type: e.type, + definition: e.definition, ...e })); - const links = []; - for (let i = 0; i < nodes.length - 1; i++) { - links.push({ source: nodes[i].id, target: nodes[i + 1].id }); + // 使用数据库中的关系 + const links = projectRelations.map(r => ({ + source: r.source_id, + target: r.target_id, + type: r.type + })).filter(r => r.source && r.target); + + // 如果没有关系,创建默认连接 + if (links.length === 0 && nodes.length > 1) { + for (let i = 0; i < Math.min(nodes.length - 1, 5); i++) { + links.push({ source: nodes[0].id, target: nodes[i + 1].id, type: 'related' }); + } } const colorMap = { @@ -181,18 +222,31 @@ function renderGraph() { }; const simulation = d3.forceSimulation(nodes) - .force('link', d3.forceLink(links).id(d => d.id).distance(100)) - .force('charge', d3.forceManyBody().strength(-300)) + .force('link', d3.forceLink(links).id(d => d.id).distance(120)) + .force('charge', d3.forceManyBody().strength(-400)) .force('center', d3.forceCenter(width / 2, height / 2)) - .force('collision', d3.forceCollide().radius(40)); + .force('collision', d3.forceCollide().radius(50)); + // 关系连线 const link = svg.append('g') .selectAll('line') .data(links) .enter().append('line') - .attr('stroke', '#333') - .attr('stroke-width', 1); + .attr('stroke', '#444') + .attr('stroke-width', 1.5) + .attr('stroke-opacity', 0.6); + // 关系标签 + const linkLabel = svg.append('g') + .selectAll('text') + .data(links) + .enter().append('text') + .attr('font-size', '10px') + .attr('fill', '#666') + .attr('text-anchor', 'middle') + .text(d => d.type); + + // 节点组 const node = svg.append('g') .selectAll('g') .data(nodes) @@ -204,18 +258,30 @@ function renderGraph() { .on('end', dragended)) .on('click', (e, d) => window.selectEntity(d.id)); + // 节点圆圈 node.append('circle') - .attr('r', 30) + .attr('r', 35) 
.attr('fill', d => colorMap[d.type] || '#666') .attr('stroke', '#fff') - .attr('stroke-width', 2); + .attr('stroke-width', 2) + .attr('class', 'node-circle'); + // 节点文字 node.append('text') - .text(d => d.name.length > 8 ? d.name.slice(0, 6) + '...' : d.name) + .text(d => d.name.length > 6 ? d.name.slice(0, 5) + '...' : d.name) .attr('text-anchor', 'middle') .attr('dy', 5) .attr('fill', '#fff') - .attr('font-size', '11px'); + .attr('font-size', '11px') + .attr('font-weight', '500'); + + // 节点类型图标 + node.append('text') + .attr('dy', -45) + .attr('text-anchor', 'middle') + .attr('fill', d => colorMap[d.type] || '#666') + .attr('font-size', '10px') + .text(d => d.type); simulation.on('tick', () => { link @@ -224,6 +290,10 @@ function renderGraph() { .attr('x2', d => d.target.x) .attr('y2', d => d.target.y); + linkLabel + .attr('x', d => (d.source.x + d.target.x) / 2) + .attr('y', d => (d.source.y + d.target.y) / 2); + node.attr('transform', d => `translate(${d.x},${d.y})`); }); @@ -252,14 +322,15 @@ function renderEntityList() { container.innerHTML = '
暂无实体,请上传音频文件
'; return; } - currentData.entities.forEach(ent => { + projectEntities.forEach(ent => { const div = document.createElement('div'); div.className = 'entity-item'; + div.dataset.id = ent.id; div.onclick = () => window.selectEntity(ent.id); div.innerHTML = ` @@ -274,21 +345,41 @@ function renderEntityList() { }); } -// Select entity +// Select entity - 联动高亮 window.selectEntity = function(entityId) { selectedEntity = entityId; - const entity = currentData && currentData.entities.find(e => e.id === entityId); + const entity = projectEntities.find(e => e.id === entityId); if (!entity) return; + // 高亮文本中的实体 document.querySelectorAll('.entity').forEach(el => { - el.style.background = el.dataset.id === entityId ? '#ff6b6b' : ''; + if (el.dataset.id === entityId) { + el.style.background = '#ff6b6b'; + el.style.color = '#fff'; + } else { + el.style.background = ''; + el.style.color = ''; + } }); - d3.selectAll('.node circle') + // 高亮图谱中的节点 + d3.selectAll('.node-circle') .attr('stroke', d => d.id === entityId ? '#ff6b6b' : '#fff') - .attr('stroke-width', d => d.id === entityId ? 4 : 2); + .attr('stroke-width', d => d.id === entityId ? 4 : 2) + .attr('r', d => d.id === entityId ? 40 : 35); - console.log('Selected:', entity.name); + // 高亮实体列表 + document.querySelectorAll('.entity-item').forEach(el => { + if (el.dataset.id === entityId) { + el.style.background = '#2a2a2a'; + el.style.borderLeft = '3px solid #ff6b6b'; + } else { + el.style.background = ''; + el.style.borderLeft = ''; + } + }); + + console.log('Selected:', entity.name, entity.definition); }; // Show/hide upload @@ -318,17 +409,24 @@ function initUpload() {${file.name}
+ASR转录 + 实体提取中
${err.message}
- +