Phase 7 Task 6 & 8: 高级搜索与发现 + 性能优化与扩展

- 新增 search_manager.py 搜索管理模块
  - FullTextSearch: 全文搜索引擎 (FTS5)
  - SemanticSearch: 语义搜索引擎 (sentence-transformers)
  - EntityPathDiscovery: 实体关系路径发现 (BFS/DFS)
  - KnowledgeGapDetector: 知识缺口检测器

- 新增 performance_manager.py 性能管理模块
  - CacheManager: Redis 缓存层 (支持内存回退)
  - DatabaseSharding: 数据库分片管理
  - TaskQueue: 异步任务队列 (Celery + Redis)
  - PerformanceMonitor: 性能监控器

- 更新 schema.sql 添加新表
  - search_indexes, embeddings, fts_transcripts
  - cache_stats, task_queue, performance_metrics, shard_mappings

- 更新 main.py 添加 API 端点
  - 搜索: /search/fulltext, /search/semantic, /entities/{id}/paths
  - 性能: /cache/stats, /performance/metrics, /tasks, /health

- 更新 requirements.txt 添加依赖
  - sentence-transformers==2.5.1
  - redis==5.0.1
  - celery==5.3.6

- 创建测试脚本和文档
  - test_phase7_task6_8.py
  - docs/PHASE7_TASK6_8_SUMMARY.md

Phase 7 全部完成!
This commit is contained in:
OpenClaw Bot
2026-02-24 18:15:35 +08:00
parent 7a2dc5f810
commit e4550b066e
6 changed files with 2328 additions and 8 deletions

View File

@@ -175,6 +175,30 @@ except ImportError as e:
print(f"Collaboration Manager import error: {e}")
COLLABORATION_AVAILABLE = False
# Phase 7 Task 6: Search Manager
try:
from search_manager import (
get_search_manager, SearchManager, FullTextSearch, SemanticSearch,
EntityPathDiscovery, KnowledgeGapDetector,
SearchOperator, SearchField
)
SEARCH_MANAGER_AVAILABLE = True
except ImportError as e:
print(f"Search Manager import error: {e}")
SEARCH_MANAGER_AVAILABLE = False
# Phase 7 Task 8: Performance Manager
try:
from performance_manager import (
get_performance_manager, PerformanceManager, CacheManager,
DatabaseSharding, TaskQueue, PerformanceMonitor,
CacheStats, TaskInfo, PerformanceMetric, TaskStatus, TaskPriority
)
PERFORMANCE_MANAGER_AVAILABLE = True
except ImportError as e:
print(f"Performance Manager import error: {e}")
PERFORMANCE_MANAGER_AVAILABLE = False
# FastAPI app with enhanced metadata for Swagger
app = FastAPI(
title="InsightFlow API",
@@ -7534,6 +7558,440 @@ async def check_project_permissions(project_id: str, user_id: str = "current_use
}
# ==================== Phase 7 Task 6: Advanced Search & Discovery ====================
class FullTextSearchRequest(BaseModel):
"""全文搜索请求"""
query: str
content_types: Optional[List[str]] = None
operator: str = "AND" # AND, OR, NOT
limit: int = 20
class SemanticSearchRequest(BaseModel):
"""语义搜索请求"""
query: str
content_types: Optional[List[str]] = None
threshold: float = 0.7
limit: int = 20
@app.post("/api/v1/search/fulltext", tags=["Search"])
async def fulltext_search(
project_id: str,
request: FullTextSearchRequest,
_=Depends(verify_api_key)
):
"""全文搜索"""
if not SEARCH_MANAGER_AVAILABLE:
raise HTTPException(status_code=500, detail="Search manager not available")
search_manager = get_search_manager()
try:
operator = SearchOperator(request.operator.upper())
except ValueError:
operator = SearchOperator.AND
results = search_manager.fulltext_search.search(
query=request.query,
project_id=project_id,
content_types=request.content_types,
operator=operator,
limit=request.limit
)
return {
"query": request.query,
"operator": request.operator,
"total": len(results),
"results": [{
"id": r.id,
"type": r.type,
"title": r.title,
"content": r.content,
"highlights": r.highlights,
"score": r.score
} for r in results]
}
@app.post("/api/v1/search/semantic", tags=["Search"])
async def semantic_search(
project_id: str,
request: SemanticSearchRequest,
_=Depends(verify_api_key)
):
"""语义搜索"""
if not SEARCH_MANAGER_AVAILABLE:
raise HTTPException(status_code=500, detail="Search manager not available")
search_manager = get_search_manager()
results = search_manager.semantic_search.search(
query=request.query,
project_id=project_id,
content_types=request.content_types,
threshold=request.threshold,
limit=request.limit
)
return {
"query": request.query,
"threshold": request.threshold,
"total": len(results),
"results": [{
"id": r.id,
"type": r.type,
"text": r.text,
"similarity": r.similarity
} for r in results]
}
@app.get("/api/v1/entities/{entity_id}/paths/{target_entity_id}", tags=["Search"])
async def find_entity_paths(
entity_id: str,
target_entity_id: str,
max_depth: int = 5,
find_all: bool = False,
_=Depends(verify_api_key)
):
"""查找实体关系路径"""
if not SEARCH_MANAGER_AVAILABLE:
raise HTTPException(status_code=500, detail="Search manager not available")
search_manager = get_search_manager()
if find_all:
paths = search_manager.path_discovery.find_all_paths(
source_entity_id=entity_id,
target_entity_id=target_entity_id,
max_depth=max_depth
)
else:
path = search_manager.path_discovery.find_shortest_path(
source_entity_id=entity_id,
target_entity_id=target_entity_id,
max_depth=max_depth
)
paths = [path] if path else []
return {
"source_entity_id": entity_id,
"target_entity_id": target_entity_id,
"path_count": len(paths),
"paths": [{
"path_id": p.path_id,
"path_length": p.path_length,
"nodes": p.nodes,
"edges": p.edges,
"confidence": p.confidence
} for p in paths]
}
@app.get("/api/v1/entities/{entity_id}/network", tags=["Search"])
async def get_entity_network(
entity_id: str,
depth: int = 2,
_=Depends(verify_api_key)
):
"""获取实体关系网络"""
if not SEARCH_MANAGER_AVAILABLE:
raise HTTPException(status_code=500, detail="Search manager not available")
search_manager = get_search_manager()
network = search_manager.path_discovery.get_entity_network(entity_id, depth)
return network
@app.get("/api/v1/projects/{project_id}/knowledge-gaps", tags=["Search"])
async def detect_knowledge_gaps(
project_id: str,
_=Depends(verify_api_key)
):
"""检测知识缺口"""
if not SEARCH_MANAGER_AVAILABLE:
raise HTTPException(status_code=500, detail="Search manager not available")
search_manager = get_search_manager()
gaps = search_manager.gap_detector.detect_gaps(project_id)
completeness = search_manager.gap_detector.get_completeness_score(project_id)
return {
"project_id": project_id,
"completeness": completeness,
"gap_count": len(gaps),
"gaps": [{
"gap_id": g.gap_id,
"gap_type": g.gap_type,
"entity_id": g.entity_id,
"entity_name": g.entity_name,
"description": g.description,
"severity": g.severity,
"suggestion": g.suggestion
} for g in gaps]
}
@app.post("/api/v1/projects/{project_id}/search/index", tags=["Search"])
async def index_project_for_search(
project_id: str,
_=Depends(verify_api_key)
):
"""为项目创建搜索索引"""
if not SEARCH_MANAGER_AVAILABLE:
raise HTTPException(status_code=500, detail="Search manager not available")
search_manager = get_search_manager()
success = search_manager.index_project_content(project_id)
if success:
return {"message": "Project indexed successfully", "project_id": project_id}
else:
raise HTTPException(status_code=500, detail="Failed to index project")
# ==================== Phase 7 Task 8: Performance & Scaling ====================
@app.get("/api/v1/cache/stats", tags=["Performance"])
async def get_cache_stats(
_=Depends(verify_api_key)
):
"""获取缓存统计"""
if not PERFORMANCE_MANAGER_AVAILABLE:
raise HTTPException(status_code=500, detail="Performance manager not available")
perf_manager = get_performance_manager()
stats = perf_manager.cache.get_stats()
return {
"total_keys": stats.total_keys,
"memory_usage_bytes": stats.memory_usage,
"hit_count": stats.hit_count,
"miss_count": stats.miss_count,
"hit_rate": stats.hit_rate,
"evicted_count": stats.evicted_count,
"expired_count": stats.expired_count
}
@app.post("/api/v1/cache/clear", tags=["Performance"])
async def clear_cache(
pattern: Optional[str] = None,
_=Depends(verify_api_key)
):
"""清除缓存"""
if not PERFORMANCE_MANAGER_AVAILABLE:
raise HTTPException(status_code=500, detail="Performance manager not available")
perf_manager = get_performance_manager()
success = perf_manager.cache.clear(pattern)
if success:
return {"message": "Cache cleared successfully", "pattern": pattern}
else:
raise HTTPException(status_code=500, detail="Failed to clear cache")
@app.get("/api/v1/performance/metrics", tags=["Performance"])
async def get_performance_metrics(
metric_type: Optional[str] = None,
endpoint: Optional[str] = None,
hours: int = 24,
limit: int = 1000,
_=Depends(verify_api_key)
):
"""获取性能指标"""
if not PERFORMANCE_MANAGER_AVAILABLE:
raise HTTPException(status_code=500, detail="Performance manager not available")
perf_manager = get_performance_manager()
start_time = (datetime.now() - timedelta(hours=hours)).isoformat()
metrics = perf_manager.monitor.get_metrics(
metric_type=metric_type,
endpoint=endpoint,
start_time=start_time,
limit=limit
)
return {
"period_hours": hours,
"total": len(metrics),
"metrics": [{
"id": m.id,
"metric_type": m.metric_type,
"endpoint": m.endpoint,
"duration_ms": m.duration_ms,
"status_code": m.status_code,
"timestamp": m.timestamp
} for m in metrics]
}
@app.get("/api/v1/performance/summary", tags=["Performance"])
async def get_performance_summary(
hours: int = 24,
_=Depends(verify_api_key)
):
"""获取性能汇总统计"""
if not PERFORMANCE_MANAGER_AVAILABLE:
raise HTTPException(status_code=500, detail="Performance manager not available")
perf_manager = get_performance_manager()
summary = perf_manager.monitor.get_summary_stats(hours)
return summary
@app.get("/api/v1/tasks/{task_id}/status", tags=["Performance"])
async def get_task_status(
task_id: str,
_=Depends(verify_api_key)
):
"""获取任务状态"""
if not PERFORMANCE_MANAGER_AVAILABLE:
raise HTTPException(status_code=500, detail="Performance manager not available")
perf_manager = get_performance_manager()
task = perf_manager.task_queue.get_task_status(task_id)
if not task:
raise HTTPException(status_code=404, detail="Task not found")
return {
"task_id": task.task_id,
"task_type": task.task_type,
"status": task.status,
"project_id": task.project_id,
"params": task.params,
"result": task.result,
"error": task.error,
"created_at": task.created_at,
"started_at": task.started_at,
"completed_at": task.completed_at,
"retry_count": task.retry_count,
"priority": task.priority
}
@app.get("/api/v1/tasks", tags=["Performance"])
async def list_tasks(
project_id: Optional[str] = None,
status: Optional[str] = None,
limit: int = 50,
_=Depends(verify_api_key)
):
"""列出任务"""
if not PERFORMANCE_MANAGER_AVAILABLE:
raise HTTPException(status_code=500, detail="Performance manager not available")
perf_manager = get_performance_manager()
tasks = perf_manager.task_queue.list_tasks(project_id, status, limit)
return {
"total": len(tasks),
"tasks": [{
"task_id": t.task_id,
"task_type": t.task_type,
"status": t.status,
"project_id": t.project_id,
"created_at": t.created_at,
"retry_count": t.retry_count,
"priority": t.priority
} for t in tasks]
}
@app.post("/api/v1/tasks/{task_id}/cancel", tags=["Performance"])
async def cancel_task(
task_id: str,
_=Depends(verify_api_key)
):
"""取消任务"""
if not PERFORMANCE_MANAGER_AVAILABLE:
raise HTTPException(status_code=500, detail="Performance manager not available")
perf_manager = get_performance_manager()
success = perf_manager.task_queue.cancel_task(task_id)
if success:
return {"message": "Task cancelled successfully", "task_id": task_id}
else:
raise HTTPException(status_code=400, detail="Failed to cancel task or task already completed")
@app.get("/api/v1/shards", tags=["Performance"])
async def list_shards(
_=Depends(verify_api_key)
):
"""列出数据库分片"""
if not PERFORMANCE_MANAGER_AVAILABLE:
raise HTTPException(status_code=500, detail="Performance manager not available")
perf_manager = get_performance_manager()
shards = perf_manager.sharding.get_shard_stats()
return {
"shard_count": len(shards),
"shards": [{
"shard_id": s.shard_id,
"entity_count": s.entity_count,
"db_path": s.db_path,
"created_at": s.created_at
} for s in shards]
}
@app.get("/api/v1/health", tags=["System"])
async def health_check():
"""健康检查"""
health = {
"status": "healthy",
"timestamp": datetime.now().isoformat(),
"components": {}
}
# 数据库检查
if DB_AVAILABLE:
try:
db = get_db_manager()
conn = db.get_conn()
conn.execute("SELECT 1")
conn.close()
health["components"]["database"] = "ok"
except Exception as e:
health["components"]["database"] = f"error: {str(e)}"
health["status"] = "degraded"
else:
health["components"]["database"] = "unavailable"
# 性能管理器检查
if PERFORMANCE_MANAGER_AVAILABLE:
try:
perf_manager = get_performance_manager()
perf_health = perf_manager.health_check()
health["components"].update(perf_health)
if perf_health.get("overall") != "healthy":
health["status"] = "degraded"
except Exception as e:
health["components"]["performance"] = f"error: {str(e)}"
health["status"] = "degraded"
# 搜索管理器检查
if SEARCH_MANAGER_AVAILABLE:
health["components"]["search"] = "available"
else:
health["components"]["search"] = "unavailable"
return health
# Serve frontend - MUST be last to not override API routes
app.mount("/", StaticFiles(directory="frontend", html=True), name="frontend")

File diff suppressed because it is too large Load Diff

View File

@@ -53,3 +53,10 @@ webdavclient3==3.14.6
# Phase 7 Task 3: Security & Compliance
cryptography==42.0.0
# Phase 7 Task 6: Advanced Search & Discovery
sentence-transformers==2.5.1
# Phase 7 Task 8: Performance Optimization & Scaling
redis==5.0.1
celery==5.3.6

View File

@@ -725,3 +725,123 @@ CREATE INDEX IF NOT EXISTS idx_change_history_entity ON change_history(entity_ty
CREATE INDEX IF NOT EXISTS idx_change_history_session ON change_history(session_id);
CREATE INDEX IF NOT EXISTS idx_team_members_project ON team_members(project_id);
CREATE INDEX IF NOT EXISTS idx_team_members_user ON team_members(user_id);
-- ============================================
-- Phase 7 Task 6: 高级搜索与发现
-- ============================================
-- 搜索索引表
CREATE TABLE IF NOT EXISTS search_indexes (
id TEXT PRIMARY KEY,
project_id TEXT NOT NULL,
content_type TEXT NOT NULL, -- transcript, entity, relation
content_id TEXT NOT NULL,
content_text TEXT NOT NULL,
tokens TEXT, -- JSON array of tokens
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (project_id) REFERENCES projects(id),
UNIQUE(project_id, content_type, content_id)
);
-- 文本 Embedding 表
CREATE TABLE IF NOT EXISTS embeddings (
id TEXT PRIMARY KEY,
project_id TEXT NOT NULL,
content_type TEXT NOT NULL, -- transcript, entity
content_id TEXT NOT NULL,
text TEXT NOT NULL,
embedding TEXT NOT NULL, -- JSON array of floats
model TEXT NOT NULL, -- 使用的模型名称
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (project_id) REFERENCES projects(id),
UNIQUE(project_id, content_type, content_id)
);
-- 全文搜索虚拟表 (FTS5)
CREATE VIRTUAL TABLE IF NOT EXISTS fts_transcripts USING fts5(
content_text,
content_id UNINDEXED,
project_id UNINDEXED,
content_type UNINDEXED
);
-- 搜索相关索引
CREATE INDEX IF NOT EXISTS idx_search_indexes_project ON search_indexes(project_id);
CREATE INDEX IF NOT EXISTS idx_search_indexes_type ON search_indexes(content_type);
CREATE INDEX IF NOT EXISTS idx_search_indexes_content ON search_indexes(content_id);
CREATE INDEX IF NOT EXISTS idx_embeddings_project ON embeddings(project_id);
CREATE INDEX IF NOT EXISTS idx_embeddings_type ON embeddings(content_type);
CREATE INDEX IF NOT EXISTS idx_embeddings_content ON embeddings(content_id);
CREATE INDEX IF NOT EXISTS idx_embeddings_model ON embeddings(model);
-- ============================================
-- Phase 7 Task 8: 性能优化与扩展
-- ============================================
-- 缓存统计表
CREATE TABLE IF NOT EXISTS cache_stats (
id TEXT PRIMARY KEY,
stat_date DATE NOT NULL,
cache_type TEXT NOT NULL, -- redis, memory
total_keys INTEGER DEFAULT 0,
memory_usage INTEGER DEFAULT 0, -- bytes
hit_count INTEGER DEFAULT 0,
miss_count INTEGER DEFAULT 0,
evicted_count INTEGER DEFAULT 0,
expired_count INTEGER DEFAULT 0,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
UNIQUE(stat_date, cache_type)
);
-- 任务队列表
CREATE TABLE IF NOT EXISTS task_queue (
id TEXT PRIMARY KEY,
task_type TEXT NOT NULL, -- audio_analysis, report_generation, entity_extraction
status TEXT NOT NULL, -- pending, running, success, failure, retry, revoked
project_id TEXT NOT NULL,
params TEXT NOT NULL, -- JSON
result TEXT, -- JSON
error TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
started_at TIMESTAMP,
completed_at TIMESTAMP,
retry_count INTEGER DEFAULT 0,
priority INTEGER DEFAULT 5, -- 0=high, 5=normal, 10=low
worker_id TEXT, -- 执行任务的 worker
FOREIGN KEY (project_id) REFERENCES projects(id)
);
-- 性能指标表
CREATE TABLE IF NOT EXISTS performance_metrics (
id TEXT PRIMARY KEY,
metric_type TEXT NOT NULL, -- api_response, db_query, cache_operation
endpoint TEXT NOT NULL,
duration_ms REAL NOT NULL,
status_code INTEGER,
timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
metadata TEXT -- JSON: 额外信息
);
-- 数据库分片映射表
CREATE TABLE IF NOT EXISTS shard_mappings (
id TEXT PRIMARY KEY,
project_id TEXT NOT NULL UNIQUE,
shard_id TEXT NOT NULL,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
migrated_at TIMESTAMP,
FOREIGN KEY (project_id) REFERENCES projects(id)
);
-- 性能相关索引
CREATE INDEX IF NOT EXISTS idx_cache_stats_date ON cache_stats(stat_date);
CREATE INDEX IF NOT EXISTS idx_task_queue_project ON task_queue(project_id);
CREATE INDEX IF NOT EXISTS idx_task_queue_status ON task_queue(status);
CREATE INDEX IF NOT EXISTS idx_task_queue_type ON task_queue(task_type);
CREATE INDEX IF NOT EXISTS idx_task_queue_created ON task_queue(created_at);
CREATE INDEX IF NOT EXISTS idx_metrics_type ON performance_metrics(metric_type);
CREATE INDEX IF NOT EXISTS idx_metrics_endpoint ON performance_metrics(endpoint);
CREATE INDEX IF NOT EXISTS idx_metrics_timestamp ON performance_metrics(timestamp);
CREATE INDEX IF NOT EXISTS idx_shard_mappings_project ON shard_mappings(project_id);
CREATE INDEX IF NOT EXISTS idx_shard_mappings_shard ON shard_mappings(shard_id);