Phase 7 Task 6 & 8: 高级搜索与发现 + 性能优化与扩展

- 新增 search_manager.py 搜索管理模块
  - FullTextSearch: 全文搜索引擎 (FTS5)
  - SemanticSearch: 语义搜索引擎 (sentence-transformers)
  - EntityPathDiscovery: 实体关系路径发现 (BFS/DFS)
  - KnowledgeGapDetector: 知识缺口检测器

- 新增 performance_manager.py 性能管理模块
  - CacheManager: Redis 缓存层 (支持内存回退)
  - DatabaseSharding: 数据库分片管理
  - TaskQueue: 异步任务队列 (Celery + Redis)
  - PerformanceMonitor: 性能监控器

- 更新 schema.sql 添加新表
  - search_indexes, embeddings, fts_transcripts
  - cache_stats, task_queue, performance_metrics, shard_mappings

- 更新 main.py 添加 API 端点
  - 搜索: /search/fulltext, /search/semantic, /entities/{id}/paths
  - 性能: /cache/stats, /performance/metrics, /tasks, /health

- 更新 requirements.txt 添加依赖
  - sentence-transformers==2.5.1
  - redis==5.0.1
  - celery==5.3.6

- 创建测试脚本和文档
  - test_phase7_task6_8.py
  - docs/PHASE7_TASK6_8_SUMMARY.md

Phase 7 全部完成!
This commit is contained in:
OpenClaw Bot
2026-02-24 18:15:35 +08:00
parent 7a2dc5f810
commit e4550b066e
6 changed files with 2328 additions and 8 deletions

View File

@@ -725,3 +725,123 @@ CREATE INDEX IF NOT EXISTS idx_change_history_entity ON change_history(entity_ty
-- Single-column lookup indexes: change_history by session,
-- team_members by project and by user.
CREATE INDEX IF NOT EXISTS idx_change_history_session
    ON change_history (session_id);

CREATE INDEX IF NOT EXISTS idx_team_members_project
    ON team_members (project_id);

CREATE INDEX IF NOT EXISTS idx_team_members_user
    ON team_members (user_id);
-- ============================================
-- Phase 7 Task 6: Advanced search and discovery
-- ============================================

-- Search index table: one row of searchable text per
-- (project_id, content_type, content_id) combination.
CREATE TABLE IF NOT EXISTS search_indexes (
    id           TEXT      PRIMARY KEY,
    project_id   TEXT      NOT NULL REFERENCES projects (id),
    -- Kind of indexed record: transcript, entity, relation
    content_type TEXT      NOT NULL,
    content_id   TEXT      NOT NULL,
    content_text TEXT      NOT NULL,
    -- JSON array of tokens
    tokens       TEXT,
    created_at   TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    -- NOTE(review): only the column default sets this value here; no trigger
    -- is visible in this section, so updates presumably set it explicitly.
    updated_at   TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    UNIQUE (project_id, content_type, content_id)
);
-- Text embedding table: one stored vector per
-- (project_id, content_type, content_id) combination.
CREATE TABLE IF NOT EXISTS embeddings (
    id           TEXT      PRIMARY KEY,
    project_id   TEXT      NOT NULL REFERENCES projects (id),
    -- Kind of embedded record: transcript, entity
    content_type TEXT      NOT NULL,
    content_id   TEXT      NOT NULL,
    text         TEXT      NOT NULL,
    -- JSON array of floats
    embedding    TEXT      NOT NULL,
    -- Name of the model that was used
    model        TEXT      NOT NULL,
    created_at   TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    UNIQUE (project_id, content_type, content_id)
);
-- Full-text search virtual table (FTS5).
-- content_text is the only column added to the full-text index; the three
-- UNINDEXED columns are stored with each row but are not searchable via MATCH.
CREATE VIRTUAL TABLE IF NOT EXISTS fts_transcripts
USING fts5(
    content_text,
    content_id UNINDEXED,
    project_id UNINDEXED,
    content_type UNINDEXED
);
-- Search-related indexes.
--
-- No standalone project_id index is created for search_indexes or embeddings:
-- each table declares UNIQUE(project_id, content_type, content_id), and the
-- automatic index behind that constraint already has project_id as its leading
-- column, so project_id lookups can use it. A separate single-column index
-- would be a left-prefix duplicate that only adds write and storage cost.
-- Databases created from an earlier schema may still contain
-- idx_search_indexes_project / idx_embeddings_project; they are harmless and
-- can be dropped in a migration.
CREATE INDEX IF NOT EXISTS idx_search_indexes_type
    ON search_indexes (content_type);
CREATE INDEX IF NOT EXISTS idx_search_indexes_content
    ON search_indexes (content_id);

CREATE INDEX IF NOT EXISTS idx_embeddings_type
    ON embeddings (content_type);
CREATE INDEX IF NOT EXISTS idx_embeddings_content
    ON embeddings (content_id);
CREATE INDEX IF NOT EXISTS idx_embeddings_model
    ON embeddings (model);
-- ============================================
-- Phase 7 Task 8: Performance optimization and scaling
-- ============================================

-- Cache statistics table: one row per (stat_date, cache_type).
CREATE TABLE IF NOT EXISTS cache_stats (
    id            TEXT      PRIMARY KEY,
    stat_date     DATE      NOT NULL,
    -- Cache backend: redis, memory
    cache_type    TEXT      NOT NULL,
    total_keys    INTEGER   DEFAULT 0,
    -- Measured in bytes
    memory_usage  INTEGER   DEFAULT 0,
    hit_count     INTEGER   DEFAULT 0,
    miss_count    INTEGER   DEFAULT 0,
    evicted_count INTEGER   DEFAULT 0,
    expired_count INTEGER   DEFAULT 0,
    created_at    TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at    TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    UNIQUE (stat_date, cache_type)
);
-- Task queue table: one row per queued background task.
CREATE TABLE IF NOT EXISTS task_queue (
    id           TEXT      PRIMARY KEY,
    -- Task kind: audio_analysis, report_generation, entity_extraction
    task_type    TEXT      NOT NULL,
    -- Lifecycle state: pending, running, success, failure, retry, revoked
    status       TEXT      NOT NULL,
    project_id   TEXT      NOT NULL REFERENCES projects (id),
    -- JSON
    params       TEXT      NOT NULL,
    -- JSON
    result       TEXT,
    error        TEXT,
    created_at   TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    started_at   TIMESTAMP,
    completed_at TIMESTAMP,
    retry_count  INTEGER   DEFAULT 0,
    -- 0=high, 5=normal, 10=low
    priority     INTEGER   DEFAULT 5,
    -- Worker that executes the task
    worker_id    TEXT
);
-- Performance metrics table: one row per recorded timing sample.
CREATE TABLE IF NOT EXISTS performance_metrics (
    id          TEXT      PRIMARY KEY,
    -- Sample kind: api_response, db_query, cache_operation
    metric_type TEXT      NOT NULL,
    endpoint    TEXT      NOT NULL,
    duration_ms REAL      NOT NULL,
    status_code INTEGER,
    timestamp   TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    -- JSON: additional information
    metadata    TEXT
);
-- Database shard mapping table: at most one shard assignment per project
-- (project_id is UNIQUE).
CREATE TABLE IF NOT EXISTS shard_mappings (
    id          TEXT      PRIMARY KEY,
    project_id  TEXT      NOT NULL UNIQUE REFERENCES projects (id),
    shard_id    TEXT      NOT NULL,
    created_at  TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    migrated_at TIMESTAMP
);
-- Performance-related indexes.
--
-- Two indexes are intentionally not created because a UNIQUE constraint
-- already provides an equivalent automatic index:
--   * cache_stats(stat_date): UNIQUE(stat_date, cache_type) has stat_date as
--     its leading column, so a single-column index is a left-prefix duplicate.
--   * shard_mappings(project_id): the column is declared UNIQUE, so an explicit
--     index on it is an exact duplicate.
-- Databases created from an earlier schema may still contain
-- idx_cache_stats_date / idx_shard_mappings_project; they are harmless and can
-- be dropped in a migration.
CREATE INDEX IF NOT EXISTS idx_task_queue_project
    ON task_queue (project_id);
CREATE INDEX IF NOT EXISTS idx_task_queue_status
    ON task_queue (status);
CREATE INDEX IF NOT EXISTS idx_task_queue_type
    ON task_queue (task_type);
CREATE INDEX IF NOT EXISTS idx_task_queue_created
    ON task_queue (created_at);

CREATE INDEX IF NOT EXISTS idx_metrics_type
    ON performance_metrics (metric_type);
CREATE INDEX IF NOT EXISTS idx_metrics_endpoint
    ON performance_metrics (endpoint);
CREATE INDEX IF NOT EXISTS idx_metrics_timestamp
    ON performance_metrics (timestamp);

CREATE INDEX IF NOT EXISTS idx_shard_mappings_shard
    ON shard_mappings (shard_id);