Phase 3: Memory & Growth - Multi-file fusion, Entity alignment with embedding, Document import, Knowledge base panel
This commit is contained in:
@@ -1,7 +1,8 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
InsightFlow Database Manager
|
||||
InsightFlow Database Manager - Phase 3
|
||||
处理项目、实体、关系的持久化
|
||||
支持文档类型和多文件融合
|
||||
"""
|
||||
|
||||
import os
|
||||
@@ -166,6 +167,18 @@ class DatabaseManager:
|
||||
(target_id, source_id)
|
||||
)
|
||||
|
||||
# 更新关系 - source 作为 source_entity_id
|
||||
conn.execute(
|
||||
"UPDATE entity_relations SET source_entity_id = ? WHERE source_entity_id = ?",
|
||||
(target_id, source_id)
|
||||
)
|
||||
|
||||
# 更新关系 - source 作为 target_entity_id
|
||||
conn.execute(
|
||||
"UPDATE entity_relations SET target_entity_id = ? WHERE target_entity_id = ?",
|
||||
(target_id, source_id)
|
||||
)
|
||||
|
||||
# 删除源实体
|
||||
conn.execute("DELETE FROM entities WHERE id = ?", (source_id,))
|
||||
|
||||
@@ -222,13 +235,13 @@ class DatabaseManager:
|
||||
return [EntityMention(**dict(r)) for r in rows]
|
||||
|
||||
# Transcript operations
|
||||
def save_transcript(self, transcript_id: str, project_id: str, filename: str, full_text: str, transcript_type: str = "audio"):
    """Persist one transcript row.

    Args:
        transcript_id: primary key for the transcript row.
        project_id: owning project id.
        filename: original upload filename.
        full_text: complete transcript or extracted document text.
        transcript_type: "audio" (default) or "document" (Phase 3 import).
    """
    conn = self.get_conn()
    try:
        now = datetime.now().isoformat()
        conn.execute(
            "INSERT INTO transcripts (id, project_id, filename, full_text, type, created_at) VALUES (?, ?, ?, ?, ?, ?)",
            (transcript_id, project_id, filename, full_text, transcript_type, now)
        )
        conn.commit()
    finally:
        # Fix: the original leaked the connection when the INSERT raised.
        conn.close()
|
||||
@@ -388,6 +401,58 @@ class DatabaseManager:
|
||||
conn.close()
|
||||
|
||||
return dict(row) if row else None
|
||||
|
||||
# Phase 3: Glossary operations
def add_glossary_term(self, project_id: str, term: str, pronunciation: str = "") -> str:
    """Add a term to the project glossary.

    If the (project_id, term) pair already exists, its frequency counter is
    incremented and the existing id is returned; otherwise a new row with
    frequency 1 is inserted and its new id is returned.
    """
    conn = self.get_conn()
    try:
        # Check whether this term is already registered for the project.
        existing = conn.execute(
            "SELECT * FROM glossary WHERE project_id = ? AND term = ?",
            (project_id, term)
        ).fetchone()

        if existing:
            # Bump the usage counter on the existing row.
            conn.execute(
                "UPDATE glossary SET frequency = frequency + 1 WHERE id = ?",
                (existing['id'],)
            )
            conn.commit()
            return existing['id']

        term_id = str(uuid.uuid4())[:8]
        conn.execute(
            "INSERT INTO glossary (id, project_id, term, pronunciation, frequency) VALUES (?, ?, ?, ?, ?)",
            (term_id, project_id, term, pronunciation, 1)
        )
        conn.commit()
        return term_id
    finally:
        # Fix: the original leaked the connection when a query raised.
        conn.close()
|
||||
|
||||
def list_glossary(self, project_id: str) -> List[dict]:
    """Return all glossary terms for a project, most frequent first."""
    conn = self.get_conn()
    try:
        rows = conn.execute(
            "SELECT * FROM glossary WHERE project_id = ? ORDER BY frequency DESC",
            (project_id,)
        ).fetchall()
    finally:
        # Fix: close the connection even when the query raises.
        conn.close()
    return [dict(r) for r in rows]
|
||||
|
||||
def delete_glossary_term(self, term_id: str):
    """Delete a glossary term by id (no-op when the id does not exist)."""
    conn = self.get_conn()
    try:
        conn.execute("DELETE FROM glossary WHERE id = ?", (term_id,))
        conn.commit()
    finally:
        # Fix: close the connection even when the DELETE raises.
        conn.close()
|
||||
|
||||
# Phase 3: Get all entities for embedding
def get_all_entities_for_embedding(self, project_id: str) -> List[Entity]:
    """Return every entity in the project, for embedding computation.

    Thin named alias over the generic listing so the alignment code
    reads clearly at its call site.
    """
    return self.list_project_entities(project_id)
|
||||
|
||||
|
||||
# Singleton instance
|
||||
|
||||
180
backend/document_processor.py
Normal file
180
backend/document_processor.py
Normal file
@@ -0,0 +1,180 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Document Processor - Phase 3
|
||||
支持 PDF 和 DOCX 文档导入
|
||||
"""
|
||||
|
||||
import os
|
||||
import io
|
||||
from typing import Dict, Optional
|
||||
|
||||
class DocumentProcessor:
    """Extract plain text from uploaded documents (PDF / DOCX / TXT / MD)."""

    def __init__(self):
        # Extension -> extractor method; also the canonical list of
        # supported formats (see is_supported()).
        self.supported_formats = {
            '.pdf': self._extract_pdf,
            '.docx': self._extract_docx,
            '.doc': self._extract_docx,
            '.txt': self._extract_txt,
            '.md': self._extract_txt,
        }

    def process(self, content: bytes, filename: str) -> Dict[str, str]:
        """Extract and normalize the text of a document.

        Args:
            content: raw file bytes.
            filename: original filename (extension selects the extractor).

        Returns:
            {"text": extracted text, "format": extension, "filename": filename}

        Raises:
            ValueError: unsupported extension or extraction failure.
        """
        ext = os.path.splitext(filename.lower())[1]

        if ext not in self.supported_formats:
            raise ValueError(f"Unsupported file format: {ext}. Supported: {list(self.supported_formats.keys())}")

        extractor = self.supported_formats[ext]
        text = extractor(content)

        # Normalize whitespace / control characters.
        text = self._clean_text(text)

        return {
            "text": text,
            "format": ext,
            "filename": filename
        }

    def _extract_pdf(self, content: bytes) -> str:
        """Extract text from a PDF, preferring PyPDF2 with a pdfplumber fallback."""
        try:
            import PyPDF2
            pdf_file = io.BytesIO(content)
            reader = PyPDF2.PdfReader(pdf_file)

            text_parts = []
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text_parts.append(page_text)

            return "\n\n".join(text_parts)
        except ImportError:
            # PyPDF2 missing: try pdfplumber instead.
            try:
                import pdfplumber
                text_parts = []
                with pdfplumber.open(io.BytesIO(content)) as pdf:
                    for page in pdf.pages:
                        page_text = page.extract_text()
                        if page_text:
                            text_parts.append(page_text)
                return "\n\n".join(text_parts)
            except ImportError:
                raise ImportError("PDF processing requires PyPDF2 or pdfplumber. Install with: pip install PyPDF2")
        except Exception as e:
            raise ValueError(f"PDF extraction failed: {str(e)}")

    def _extract_docx(self, content: bytes) -> str:
        """Extract text from a DOCX: paragraphs first, then table cells joined with ' | '."""
        try:
            import docx
            doc_file = io.BytesIO(content)
            doc = docx.Document(doc_file)

            text_parts = []
            for para in doc.paragraphs:
                if para.text.strip():
                    text_parts.append(para.text)

            # Table contents: one line per row, cells separated by " | ".
            for table in doc.tables:
                for row in table.rows:
                    row_text = []
                    for cell in row.cells:
                        if cell.text.strip():
                            row_text.append(cell.text.strip())
                    if row_text:
                        text_parts.append(" | ".join(row_text))

            return "\n\n".join(text_parts)
        except ImportError:
            raise ImportError("DOCX processing requires python-docx. Install with: pip install python-docx")
        except Exception as e:
            raise ValueError(f"DOCX extraction failed: {str(e)}")

    def _extract_txt(self, content: bytes) -> str:
        """Decode plain text, trying several encodings in order."""
        encodings = ['utf-8', 'gbk', 'gb2312', 'latin-1']

        for encoding in encodings:
            try:
                return content.decode(encoding)
            except UnicodeDecodeError:
                continue

        # Last resort: latin-1 with errors ignored always succeeds.
        return content.decode('latin-1', errors='ignore')

    def _clean_text(self, text: str) -> str:
        """Normalize extracted text.

        Strips blank lines, collapses whitespace runs inside each paragraph,
        and removes control characters while preserving paragraph breaks.
        """
        if not text:
            return ""

        # Drop empty lines; each surviving line becomes its own paragraph.
        lines = text.split('\n')
        cleaned_lines = []

        for line in lines:
            line = line.strip()
            if line:
                cleaned_lines.append(line)

        # BUG FIX: the original then ran `' '.join(text.split())` over the
        # whole string, which collapsed the '\n\n' separators it had just
        # inserted and flattened the document to a single line. Collapse
        # whitespace per paragraph instead so paragraph structure survives.
        cleaned_lines = [' '.join(p.split()) for p in cleaned_lines]
        text = '\n\n'.join(cleaned_lines)

        # Remove control characters but keep newlines, carriage returns, tabs.
        text = ''.join(char for char in text if ord(char) >= 32 or char in '\n\r\t')

        return text.strip()

    def is_supported(self, filename: str) -> bool:
        """Return True when the filename's extension has an extractor."""
        ext = os.path.splitext(filename.lower())[1]
        return ext in self.supported_formats
|
||||
|
||||
|
||||
# Dependency-free text extractor (used for tests).
class SimpleTextExtractor:
    """Minimal extractor: decode bytes using a short list of encodings."""

    def extract(self, content: bytes, filename: str) -> str:
        """Decode `content`, trying utf-8 then gbk, falling back to latin-1."""
        for codec in ('utf-8', 'gbk', 'latin-1'):
            try:
                return content.decode(codec)
            except UnicodeDecodeError:
                continue

        # latin-1 maps every byte value, so this final decode cannot fail.
        return content.decode('latin-1', errors='ignore')
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Smoke test: run the processor over an in-memory text document.
    processor = DocumentProcessor()

    sample = "Hello World\n\nThis is a test document.\n\nMultiple paragraphs."
    result = processor.process(sample.encode('utf-8'), "test.txt")
    print(f"Text extraction test: {len(result['text'])} chars")
    print(result['text'][:100])
|
||||
372
backend/entity_aligner.py
Normal file
372
backend/entity_aligner.py
Normal file
@@ -0,0 +1,372 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Entity Aligner - Phase 3
|
||||
使用 embedding 进行实体对齐
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import httpx
|
||||
import numpy as np
|
||||
from typing import List, Optional, Dict
|
||||
from dataclasses import dataclass
|
||||
|
||||
# API Keys
|
||||
KIMI_API_KEY = os.getenv("KIMI_API_KEY", "")
|
||||
KIMI_BASE_URL = os.getenv("KIMI_BASE_URL", "https://api.kimi.com/coding")
|
||||
|
||||
@dataclass
class EntityEmbedding:
    """An entity paired with its embedding vector."""
    entity_id: str          # id of the entity row in the database
    name: str               # entity display name
    definition: str         # short definition used to build the embedding text
    embedding: List[float]  # dense vector returned by the embedding API
|
||||
|
||||
class EntityAligner:
    """Entity alignment via embedding cosine similarity.

    Falls back to simple string matching when the embedding API is not
    configured or a request fails.
    """

    def __init__(self, similarity_threshold: float = 0.85):
        # Minimum cosine similarity for two entities to be considered the same.
        self.similarity_threshold = similarity_threshold
        # Fix: keys are hash(text) values, i.e. ints (annotation said str).
        self.embedding_cache: Dict[int, List[float]] = {}

    def get_embedding(self, text: str) -> Optional[List[float]]:
        """Fetch an embedding for `text` from the Kimi API.

        Returns None when no API key is configured or the request fails.
        Results are cached in-process keyed by hash(text).
        """
        if not KIMI_API_KEY:
            return None

        cache_key = hash(text)
        if cache_key in self.embedding_cache:
            return self.embedding_cache[cache_key]

        try:
            response = httpx.post(
                f"{KIMI_BASE_URL}/v1/embeddings",
                headers={"Authorization": f"Bearer {KIMI_API_KEY}", "Content-Type": "application/json"},
                json={
                    "model": "k2p5",
                    "input": text[:500]  # keep request payload bounded
                },
                timeout=30.0
            )
            response.raise_for_status()
            result = response.json()

            embedding = result["data"][0]["embedding"]
            self.embedding_cache[cache_key] = embedding
            return embedding

        except Exception as e:
            # Best-effort: alignment degrades to string matching on failure.
            print(f"Embedding API failed: {e}")
            return None

    def compute_similarity(self, embedding1: List[float], embedding2: List[float]) -> float:
        """Return the cosine similarity of two vectors (0.0 for zero vectors)."""
        vec1 = np.array(embedding1)
        vec2 = np.array(embedding2)

        dot_product = np.dot(vec1, vec2)
        norm1 = np.linalg.norm(vec1)
        norm2 = np.linalg.norm(vec2)

        if norm1 == 0 or norm2 == 0:
            return 0.0

        return float(dot_product / (norm1 * norm2))

    def get_entity_text(self, name: str, definition: str = "") -> str:
        """Build the text sent to the embedding model: "name: definition"."""
        if definition:
            return f"{name}: {definition}"
        return name

    def find_similar_entity(
        self,
        project_id: str,
        name: str,
        definition: str = "",
        exclude_id: Optional[str] = None,
        threshold: Optional[float] = None
    ) -> Optional[object]:
        """Find an existing project entity similar to (name, definition).

        Uses embedding cosine similarity when available, otherwise falls back
        to simple string matching. Returns the best match scoring strictly
        above `threshold` (default: self.similarity_threshold), or None.
        """
        if threshold is None:
            threshold = self.similarity_threshold

        try:
            from db_manager import get_db_manager
            db = get_db_manager()
        except ImportError:
            return None

        # All candidate entities for the project.
        entities = db.get_all_entities_for_embedding(project_id)
        if not entities:
            return None

        query_text = self.get_entity_text(name, definition)
        query_embedding = self.get_embedding(query_text)

        if query_embedding is None:
            # Embedding API unavailable: degrade to string matching.
            return self._fallback_similarity_match(entities, name, exclude_id)

        best_match = None
        best_score = threshold  # candidates must score strictly above this

        for entity in entities:
            if exclude_id and entity.id == exclude_id:
                continue

            entity_text = self.get_entity_text(entity.name, entity.definition)
            entity_embedding = self.get_embedding(entity_text)
            if entity_embedding is None:
                continue

            similarity = self.compute_similarity(query_embedding, entity_embedding)
            if similarity > best_score:
                best_score = similarity
                best_match = entity

        return best_match

    def _fallback_similarity_match(
        self,
        entities: List[object],
        name: str,
        exclude_id: Optional[str] = None
    ) -> Optional[object]:
        """String-based fallback: exact name/alias match, then substring containment."""
        name_lower = name.lower()

        # 1. Exact match on name or any alias (case-insensitive).
        for entity in entities:
            if exclude_id and entity.id == exclude_id:
                continue
            if entity.name.lower() == name_lower:
                return entity
            if entity.aliases and name_lower in [a.lower() for a in entity.aliases]:
                return entity

        # 2. Substring containment in either direction.
        for entity in entities:
            if exclude_id and entity.id == exclude_id:
                continue
            if name_lower in entity.name.lower() or entity.name.lower() in name_lower:
                return entity

        return None

    def batch_align_entities(
        self,
        project_id: str,
        new_entities: List[Dict],
        threshold: Optional[float] = None
    ) -> List[Dict]:
        """Align a batch of raw entities against the project's entities.

        Args:
            project_id: project to align against.
            new_entities: [{"name": ..., "definition": ...}, ...]
            threshold: similarity threshold (default: self.similarity_threshold).

        Returns:
            One result per input: {"new_entity", "matched_entity", "similarity",
            "should_merge"} — matched_entity is None when nothing matched.
        """
        if threshold is None:
            threshold = self.similarity_threshold

        results = []

        for new_ent in new_entities:
            matched = self.find_similar_entity(
                project_id,
                new_ent["name"],
                new_ent.get("definition", ""),
                threshold=threshold
            )

            result = {
                "new_entity": new_ent,
                "matched_entity": None,
                "similarity": 0.0,
                "should_merge": False
            }

            if matched:
                # Recompute the score so it can be reported to the caller.
                query_text = self.get_entity_text(new_ent["name"], new_ent.get("definition", ""))
                matched_text = self.get_entity_text(matched.name, matched.definition)

                query_emb = self.get_embedding(query_text)
                matched_emb = self.get_embedding(matched_text)

                if query_emb and matched_emb:
                    similarity = self.compute_similarity(query_emb, matched_emb)
                    result["matched_entity"] = {
                        "id": matched.id,
                        "name": matched.name,
                        "type": matched.type,
                        "definition": matched.definition
                    }
                    result["similarity"] = similarity
                    result["should_merge"] = similarity >= threshold

            results.append(result)

        return results

    def suggest_entity_aliases(self, entity_name: str, entity_definition: str = "") -> List[str]:
        """Ask the LLM for likely aliases / short forms of an entity.

        Returns an empty list when no API key is configured or the call fails.
        """
        if not KIMI_API_KEY:
            return []

        prompt = f"""为以下实体生成可能的别名或简称:

实体名称:{entity_name}
定义:{entity_definition}

请返回 JSON 格式的别名列表:
{{"aliases": ["别名1", "别名2", "别名3"]}}

只返回 JSON,不要其他内容。"""

        try:
            response = httpx.post(
                f"{KIMI_BASE_URL}/v1/chat/completions",
                headers={"Authorization": f"Bearer {KIMI_API_KEY}", "Content-Type": "application/json"},
                json={
                    "model": "k2p5",
                    "messages": [{"role": "user", "content": prompt}],
                    "temperature": 0.3
                },
                timeout=30.0
            )
            response.raise_for_status()
            result = response.json()
            content = result["choices"][0]["message"]["content"]

            import re
            # BUG FIX: the original pattern r'\{{.*?\}}' required literal
            # double braces ({{ ... }}), which never match the single-brace
            # JSON object the model returns — aliases were always dropped.
            json_match = re.search(r'\{.*\}', content, re.DOTALL)
            if json_match:
                data = json.loads(json_match.group())
                return data.get("aliases", [])
        except Exception as e:
            print(f"Alias suggestion failed: {e}")

        return []
|
||||
|
||||
|
||||
# Cheap string similarity without embeddings.
def simple_similarity(str1: str, str2: str) -> float:
    """Return a similarity score in [0, 1] for two strings.

    Exact match -> 1.0; either string empty -> 0.0; case-insensitive
    substring containment -> 0.8; otherwise difflib's edit-based ratio.
    """
    if str1 == str2:
        return 1.0

    if not str1 or not str2:
        return 0.0

    a, b = str1.lower(), str2.lower()

    # One contains the other (case-insensitive).
    if a in b or b in a:
        return 0.8

    # Edit-distance style ratio from difflib.
    from difflib import SequenceMatcher
    return SequenceMatcher(None, a, b).ratio()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Smoke test: exercise the embedding call (if configured) and the
    # pure cosine-similarity math.
    aligner = EntityAligner()

    sample = "Kubernetes 容器编排平台"
    emb = aligner.get_embedding(sample)
    if emb:
        print(f"Embedding dimension: {len(emb)}")
        print(f"First 5 values: {emb[:5]}")
    else:
        print("Embedding API not available")

    v1 = [1.0, 0.0, 0.0]
    v2 = [0.9, 0.1, 0.0]
    sim = aligner.compute_similarity(v1, v2)
    print(f"Similarity: {sim:.4f}")
|
||||
389
backend/main.py
389
backend/main.py
@@ -1,7 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
InsightFlow Backend - Phase 3 (Production Ready)
|
||||
Knowledge Growth: Multi-file fusion + Entity Alignment
|
||||
InsightFlow Backend - Phase 3 (Memory & Growth)
|
||||
Knowledge Growth: Multi-file fusion + Entity Alignment + Document Import
|
||||
ASR: 阿里云听悟 + OSS
|
||||
"""
|
||||
|
||||
@@ -9,6 +9,7 @@ import os
|
||||
import json
|
||||
import httpx
|
||||
import uuid
|
||||
import re
|
||||
from fastapi import FastAPI, File, UploadFile, HTTPException, Form
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
@@ -35,6 +36,18 @@ try:
|
||||
except ImportError:
|
||||
DB_AVAILABLE = False
|
||||
|
||||
# Phase 3 optional dependencies: feature flags let the API degrade
# gracefully when a module (or its own third-party deps) is missing.
try:
    from document_processor import DocumentProcessor
    DOC_PROCESSOR_AVAILABLE = True
except ImportError:
    DOC_PROCESSOR_AVAILABLE = False

try:
    from entity_aligner import EntityAligner
    ALIGNER_AVAILABLE = True
except ImportError:
    ALIGNER_AVAILABLE = False
|
||||
|
||||
app = FastAPI(title="InsightFlow", version="0.3.0")
|
||||
|
||||
app.add_middleware(
|
||||
@@ -90,9 +103,29 @@ class EntityMergeRequest(BaseModel):
|
||||
source_entity_id: str
|
||||
target_entity_id: str
|
||||
|
||||
class GlossaryTermCreate(BaseModel):
    """Request body for adding a term to a project glossary."""
    term: str
    # Optional pronunciation hint — presumably for ASR biasing; confirm.
    pronunciation: Optional[str] = ""
|
||||
|
||||
# API Keys
|
||||
KIMI_API_KEY = os.getenv("KIMI_API_KEY", "")
|
||||
KIMI_BASE_URL = "https://api.kimi.com/coding"
|
||||
KIMI_BASE_URL = os.getenv("KIMI_BASE_URL", "https://api.kimi.com/coding")
|
||||
|
||||
# Phase 3: Entity Aligner singleton
_aligner = None  # lazily created on first use

def get_aligner():
    """Return the process-wide EntityAligner, or None when unavailable."""
    global _aligner
    if _aligner is None and ALIGNER_AVAILABLE:
        _aligner = EntityAligner()
    return _aligner
|
||||
|
||||
# Phase 3: Document Processor singleton
_doc_processor = None  # lazily created on first use

def get_doc_processor():
    """Return the process-wide DocumentProcessor, or None when unavailable."""
    global _doc_processor
    if _doc_processor is None and DOC_PROCESSOR_AVAILABLE:
        _doc_processor = DocumentProcessor()
    return _doc_processor
|
||||
|
||||
# Phase 2: Entity Edit API
|
||||
@app.put("/api/v1/entities/{entity_id}")
|
||||
@@ -406,12 +439,21 @@ def extract_entities_with_llm(text: str) -> tuple[List[dict], List[dict]]:
|
||||
|
||||
return [], []
|
||||
|
||||
def align_entity(project_id: str, name: str, db) -> Optional[Entity]:
|
||||
"""实体对齐"""
|
||||
def align_entity(project_id: str, name: str, db, definition: str = "") -> Optional[Entity]:
|
||||
"""实体对齐 - Phase 3: 使用 embedding 对齐"""
|
||||
# 1. 首先尝试精确匹配
|
||||
existing = db.get_entity_by_name(project_id, name)
|
||||
if existing:
|
||||
return existing
|
||||
|
||||
# 2. 使用 embedding 对齐(如果可用)
|
||||
aligner = get_aligner()
|
||||
if aligner:
|
||||
similar = aligner.find_similar_entity(project_id, name, definition)
|
||||
if similar:
|
||||
return similar
|
||||
|
||||
# 3. 回退到简单相似度匹配
|
||||
similar = db.find_similar_entities(project_id, name)
|
||||
if similar:
|
||||
return similar[0]
|
||||
@@ -443,7 +485,7 @@ async def list_projects():
|
||||
|
||||
@app.post("/api/v1/projects/{project_id}/upload", response_model=AnalysisResult)
|
||||
async def upload_audio(project_id: str, file: UploadFile = File(...)):
|
||||
"""上传音频到指定项目"""
|
||||
"""上传音频到指定项目 - Phase 3: 支持多文件融合"""
|
||||
if not DB_AVAILABLE:
|
||||
raise HTTPException(status_code=500, detail="Database not available")
|
||||
|
||||
@@ -471,12 +513,12 @@ async def upload_audio(project_id: str, file: UploadFile = File(...)):
|
||||
full_text=tw_result["full_text"]
|
||||
)
|
||||
|
||||
# 实体对齐并保存
|
||||
# 实体对齐并保存 - Phase 3: 使用增强对齐
|
||||
aligned_entities = []
|
||||
entity_name_to_id = {} # 用于关系映射
|
||||
|
||||
for raw_ent in raw_entities:
|
||||
existing = align_entity(project_id, raw_ent["name"], db)
|
||||
existing = align_entity(project_id, raw_ent["name"], db, raw_ent.get("definition", ""))
|
||||
|
||||
if existing:
|
||||
ent_model = EntityModel(
|
||||
@@ -551,6 +593,302 @@ async def upload_audio(project_id: str, file: UploadFile = File(...)):
|
||||
created_at=datetime.now().isoformat()
|
||||
)
|
||||
|
||||
# Phase 3: Document Upload API
@app.post("/api/v1/projects/{project_id}/upload-document")
async def upload_document(project_id: str, file: UploadFile = File(...)):
    """Upload a PDF/DOCX/TXT document into a project.

    Extracts text, stores it as a "document"-type transcript, runs LLM
    entity/relation extraction, aligns extracted entities against existing
    project entities, and records entity mentions by character offset.
    """
    if not DB_AVAILABLE:
        raise HTTPException(status_code=500, detail="Database not available")

    if not DOC_PROCESSOR_AVAILABLE:
        raise HTTPException(status_code=500, detail="Document processor not available")

    db = get_db_manager()
    project = db.get_project(project_id)
    if not project:
        raise HTTPException(status_code=404, detail="Project not found")

    content = await file.read()

    # Extract plain text from the uploaded bytes; extension picks the extractor.
    processor = get_doc_processor()
    try:
        result = processor.process(content, file.filename)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Document processing failed: {str(e)}")

    # Persist the extracted text as a transcript of type "document".
    transcript_id = str(uuid.uuid4())[:8]
    db.save_transcript(
        transcript_id=transcript_id,
        project_id=project_id,
        filename=file.filename,
        full_text=result["text"],
        transcript_type="document"
    )

    # LLM entity / relation extraction over the extracted text.
    raw_entities, raw_relations = extract_entities_with_llm(result["text"])

    # Align extracted entities against existing project entities, persist
    # new ones, and remember name -> id for relation wiring below.
    aligned_entities = []
    entity_name_to_id = {}

    for raw_ent in raw_entities:
        existing = align_entity(project_id, raw_ent["name"], db, raw_ent.get("definition", ""))

        if existing:
            # Re-use the existing entity.
            entity_name_to_id[raw_ent["name"]] = existing.id
            aligned_entities.append(EntityModel(
                id=existing.id,
                name=existing.name,
                type=existing.type,
                definition=existing.definition,
                aliases=existing.aliases
            ))
        else:
            # No match: create a fresh entity row.
            new_ent = db.create_entity(Entity(
                id=str(uuid.uuid4())[:8],
                project_id=project_id,
                name=raw_ent["name"],
                type=raw_ent.get("type", "OTHER"),
                definition=raw_ent.get("definition", "")
            ))
            entity_name_to_id[raw_ent["name"]] = new_ent.id
            aligned_entities.append(EntityModel(
                id=new_ent.id,
                name=new_ent.name,
                type=new_ent.type,
                definition=new_ent.definition
            ))

        # Record every occurrence of the entity name in the document text,
        # keeping a +/-20 character snippet for context.
        full_text = result["text"]
        name = raw_ent["name"]
        start_pos = 0
        while True:
            pos = full_text.find(name, start_pos)
            if pos == -1:
                break
            mention = EntityMention(
                id=str(uuid.uuid4())[:8],
                entity_id=entity_name_to_id[name],
                transcript_id=transcript_id,
                start_pos=pos,
                end_pos=pos + len(name),
                text_snippet=full_text[max(0, pos-20):min(len(full_text), pos+len(name)+20)],
                confidence=1.0
            )
            db.add_mention(mention)
            # Advance by one (not by len(name)) so overlapping hits are found.
            start_pos = pos + 1

    # Persist extracted relations whose endpoints were both aligned.
    for rel in raw_relations:
        source_id = entity_name_to_id.get(rel.get("source", ""))
        target_id = entity_name_to_id.get(rel.get("target", ""))
        if source_id and target_id:
            db.create_relation(
                project_id=project_id,
                source_entity_id=source_id,
                target_entity_id=target_id,
                relation_type=rel.get("type", "related"),
                # First 200 chars of the document serve as the evidence text.
                evidence=result["text"][:200],
                transcript_id=transcript_id
            )

    return {
        "transcript_id": transcript_id,
        "project_id": project_id,
        "filename": file.filename,
        "text_length": len(result["text"]),
        "entities": [e.dict() for e in aligned_entities],
        "created_at": datetime.now().isoformat()
    }
|
||||
|
||||
# Phase 3: Knowledge Base API
@app.get("/api/v1/projects/{project_id}/knowledge-base")
async def get_knowledge_base(project_id: str):
    """Return a project's full knowledge base in one payload.

    Bundles entities (with mention statistics), relations (with resolved
    endpoint names), the glossary, and transcript summaries for the
    knowledge-base panel.
    """
    if not DB_AVAILABLE:
        raise HTTPException(status_code=500, detail="Database not available")

    db = get_db_manager()
    project = db.get_project(project_id)
    if not project:
        raise HTTPException(status_code=404, detail="Project not found")

    # Load all project-scoped data.
    entities = db.list_project_entities(project_id)
    relations = db.list_project_relations(project_id)
    transcripts = db.list_project_transcripts(project_id)
    glossary = db.list_glossary(project_id)

    # Per-entity mention stats: how often, and in which transcripts.
    entity_stats = {}
    for ent in entities:
        mentions = db.get_entity_mentions(ent.id)
        entity_stats[ent.id] = {
            "mention_count": len(mentions),
            "transcript_ids": list(set([m.transcript_id for m in mentions]))
        }

    # id -> name lookup used to label relation endpoints.
    entity_map = {e.id: e.name for e in entities}

    return {
        "project": {
            "id": project.id,
            "name": project.name,
            "description": project.description
        },
        "stats": {
            "entity_count": len(entities),
            "relation_count": len(relations),
            "transcript_count": len(transcripts),
            "glossary_count": len(glossary)
        },
        "entities": [
            {
                "id": e.id,
                "name": e.name,
                "type": e.type,
                "definition": e.definition,
                "aliases": e.aliases,
                "mention_count": entity_stats.get(e.id, {}).get("mention_count", 0),
                "appears_in": entity_stats.get(e.id, {}).get("transcript_ids", [])
            }
            for e in entities
        ],
        "relations": [
            {
                "id": r["id"],
                "source_id": r["source_entity_id"],
                "source_name": entity_map.get(r["source_entity_id"], "Unknown"),
                "target_id": r["target_entity_id"],
                "target_name": entity_map.get(r["target_entity_id"], "Unknown"),
                "type": r["relation_type"],
                "evidence": r["evidence"]
            }
            for r in relations
        ],
        "glossary": [
            {
                "id": g["id"],
                "term": g["term"],
                "pronunciation": g["pronunciation"],
                "frequency": g["frequency"]
            }
            for g in glossary
        ],
        "transcripts": [
            {
                "id": t["id"],
                "filename": t["filename"],
                # Pre-Phase-3 rows have no type column value; default "audio".
                "type": t.get("type", "audio"),
                "created_at": t["created_at"]
            }
            for t in transcripts
        ]
    }
|
||||
|
||||
# Phase 3: Glossary API
@app.post("/api/v1/projects/{project_id}/glossary")
async def add_glossary_term(project_id: str, term: GlossaryTermCreate):
    """Register a term in the project glossary.

    Duplicate terms have their frequency incremented by the DB layer.
    """
    if not DB_AVAILABLE:
        raise HTTPException(status_code=500, detail="Database not available")

    db = get_db_manager()
    if not db.get_project(project_id):
        raise HTTPException(status_code=404, detail="Project not found")

    new_id = db.add_glossary_term(
        project_id=project_id,
        term=term.term,
        pronunciation=term.pronunciation
    )

    return {
        "id": new_id,
        "term": term.term,
        "pronunciation": term.pronunciation,
        "success": True
    }
|
||||
|
||||
@app.get("/api/v1/projects/{project_id}/glossary")
async def get_glossary(project_id: str):
    """Return the project's glossary, most frequent terms first."""
    if not DB_AVAILABLE:
        raise HTTPException(status_code=500, detail="Database not available")

    return get_db_manager().list_glossary(project_id)
|
||||
|
||||
@app.delete("/api/v1/glossary/{term_id}")
async def delete_glossary_term(term_id: str):
    """Delete a glossary term by id."""
    if not DB_AVAILABLE:
        raise HTTPException(status_code=500, detail="Database not available")

    get_db_manager().delete_glossary_term(term_id)
    return {"success": True}
|
||||
|
||||
# Phase 3: Entity Alignment API
@app.post("/api/v1/projects/{project_id}/align-entities")
async def align_project_entities(project_id: str, threshold: float = 0.85):
    """Run embedding-based alignment over the project and merge near-duplicates.

    For each entity still present, looks for a similar entity scoring above
    `threshold` and merges the current entity into it. Returns the merge
    count and the (source, target) name pairs that were merged.
    """
    if not DB_AVAILABLE:
        raise HTTPException(status_code=500, detail="Database not available")

    aligner = get_aligner()
    if not aligner:
        raise HTTPException(status_code=500, detail="Entity aligner not available")

    db = get_db_manager()
    entities = db.list_project_entities(project_id)

    merged_count = 0
    merged_pairs = []

    # Fix: original used `for i, entity in enumerate(entities)` with `i` unused.
    for entity in entities:
        # Skip entities that an earlier iteration already merged away.
        existing = db.get_entity(entity.id)
        if not existing:
            continue

        similar = aligner.find_similar_entity(
            project_id,
            entity.name,
            entity.definition,
            exclude_id=entity.id,
            threshold=threshold
        )

        if similar:
            # Merge the current entity into the surviving similar one.
            db.merge_entities(similar.id, entity.id)
            merged_count += 1
            merged_pairs.append({
                "source": entity.name,
                "target": similar.name
            })

    return {
        "success": True,
        "merged_count": merged_count,
        "merged_pairs": merged_pairs
    }
|
||||
|
||||
@app.get("/api/v1/projects/{project_id}/entities")
async def get_project_entities(project_id: str):
    """Return the project's global entity list.

    Each item carries id, name, type, definition and aliases.
    Raises HTTP 500 when the database layer is unavailable.
    """
    # NOTE(review): the original view elided some lines here; this guard
    # mirrors the pattern used by every other endpoint in this file.
    if not DB_AVAILABLE:
        raise HTTPException(status_code=500, detail="Database not available")

    db = get_db_manager()
    entities = db.list_project_entities(project_id)
    # Post-change shape: aliases are included in the payload.
    return [{"id": e.id, "name": e.name, "type": e.type, "definition": e.definition, "aliases": e.aliases} for e in entities]
|
||||
|
||||
|
||||
@app.get("/api/v1/projects/{project_id}/relations")
|
||||
@@ -597,6 +935,7 @@ async def get_project_transcripts(project_id: str):
|
||||
return [{
|
||||
"id": t["id"],
|
||||
"filename": t["filename"],
|
||||
"type": t.get("type", "audio"),
|
||||
"created_at": t["created_at"],
|
||||
"preview": t["full_text"][:100] + "..." if len(t["full_text"]) > 100 else t["full_text"]
|
||||
} for t in transcripts]
|
||||
@@ -619,42 +958,18 @@ async def get_entity_mentions(entity_id: str):
|
||||
"confidence": m.confidence
|
||||
} for m in mentions]
|
||||
|
||||
@app.post("/api/v1/entities/{entity_id}/merge")
async def merge_entities_endpoint(entity_id: str, merge_req: EntityMergeRequest):
    """Merge two entities: the request's source entity into its target.

    Returns the surviving (merged) entity; raises HTTP 404 when either
    entity is missing and HTTP 500 when the database is unavailable.
    """
    if not DB_AVAILABLE:
        raise HTTPException(status_code=500, detail="Database not available")

    manager = get_db_manager()

    # Both entities must exist before we attempt the merge.
    source_entity = manager.get_entity(merge_req.source_entity_id)
    target_entity = manager.get_entity(merge_req.target_entity_id)
    if not source_entity or not target_entity:
        raise HTTPException(status_code=404, detail="Entity not found")

    merged = manager.merge_entities(merge_req.target_entity_id, merge_req.source_entity_id)
    return {
        "success": True,
        "merged_entity": {
            "id": merged.id,
            "name": merged.name,
            "type": merged.type,
            "definition": merged.definition,
            "aliases": merged.aliases
        }
    }
|
||||
|
||||
# Health check
@app.get("/health")
async def health_check():
    """Report service status and availability of optional subsystems."""
    # The stale pre-change "db_available" entry (diff residue, a duplicate
    # dict key if kept) is dropped; only the Phase 3 payload remains.
    return {
        "status": "ok",
        "version": "0.3.0",
        "phase": "Phase 3 - Memory & Growth",
        "oss_available": OSS_AVAILABLE,
        "tingwu_available": TINGWU_AVAILABLE,
        "db_available": DB_AVAILABLE,
        "doc_processor_available": DOC_PROCESSOR_AVAILABLE,
        "aligner_available": ALIGNER_AVAILABLE
    }
|
||||
|
||||
# Serve frontend
|
||||
|
||||
24
backend/requirements.txt
Normal file
24
backend/requirements.txt
Normal file
@@ -0,0 +1,24 @@
|
||||
# InsightFlow Backend Dependencies
|
||||
|
||||
# Web Framework
|
||||
fastapi==0.109.0
|
||||
uvicorn[standard]==0.27.0
|
||||
python-multipart==0.0.6
|
||||
|
||||
# HTTP Client
|
||||
httpx==0.26.0
|
||||
|
||||
# Document Processing
|
||||
PyPDF2==3.0.1
|
||||
python-docx==1.1.0
|
||||
|
||||
# Data Processing
|
||||
numpy==1.26.3
|
||||
|
||||
# Aliyun SDK
|
||||
aliyun-python-sdk-core==2.14.0
|
||||
aliyun-python-sdk-oss==2.18.5
|
||||
oss2==2.18.5
|
||||
|
||||
# Utilities
|
||||
python-dotenv==1.0.0
|
||||
@@ -16,7 +16,9 @@ CREATE TABLE IF NOT EXISTS transcripts (
|
||||
project_id TEXT NOT NULL,
|
||||
filename TEXT,
|
||||
full_text TEXT,
|
||||
type TEXT DEFAULT 'audio', -- 'audio' 或 'document'
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
FOREIGN KEY (project_id) REFERENCES projects(id)
|
||||
);
|
||||
|
||||
@@ -29,6 +31,7 @@ CREATE TABLE IF NOT EXISTS entities (
|
||||
type TEXT,
|
||||
definition TEXT,
|
||||
aliases TEXT, -- JSON 数组:["别名1", "别名2"]
|
||||
embedding TEXT, -- JSON 数组:实体名称+定义的 embedding
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
FOREIGN KEY (project_id) REFERENCES projects(id)
|
||||
@@ -71,3 +74,12 @@ CREATE TABLE IF NOT EXISTS glossary (
|
||||
frequency INTEGER DEFAULT 1,
|
||||
FOREIGN KEY (project_id) REFERENCES projects(id)
|
||||
);
|
||||
|
||||
-- Create indexes to speed up the most common lookups:
-- per-project scans of entities/transcripts/relations/glossary,
-- name lookups on entities, and mention joins by entity/transcript.
CREATE INDEX IF NOT EXISTS idx_entities_project ON entities(project_id);
CREATE INDEX IF NOT EXISTS idx_entities_name ON entities(name);
CREATE INDEX IF NOT EXISTS idx_transcripts_project ON transcripts(project_id);
CREATE INDEX IF NOT EXISTS idx_mentions_entity ON entity_mentions(entity_id);
CREATE INDEX IF NOT EXISTS idx_mentions_transcript ON entity_mentions(transcript_id);
CREATE INDEX IF NOT EXISTS idx_relations_project ON entity_relations(project_id);
CREATE INDEX IF NOT EXISTS idx_glossary_project ON glossary(project_id);
|
||||
|
||||
Reference in New Issue
Block a user