fix: auto-fix code issues (cron)
- 修复重复导入/字段
- 修复异常处理
- 修复PEP8格式问题
- 添加类型注解
This commit is contained in:
@@ -9,15 +9,14 @@ Phase 7 Task 6: Advanced Search & Discovery
|
||||
4. KnowledgeGapDetection - 知识缺口识别
|
||||
"""
|
||||
|
||||
import re
|
||||
import hashlib
|
||||
import json
|
||||
import math
|
||||
import re
|
||||
import sqlite3
|
||||
import hashlib
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Dict, Optional, Tuple, Set
|
||||
from datetime import datetime
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
|
||||
|
||||
@@ -46,10 +45,10 @@ class SearchResult:
|
||||
content_type: str # transcript, entity, relation
|
||||
project_id: str
|
||||
score: float
|
||||
highlights: List[Tuple[int, int]] = field(default_factory=list) # 高亮位置
|
||||
metadata: Dict = field(default_factory=dict)
|
||||
highlights: list[tuple[int, int]] = field(default_factory=list) # 高亮位置
|
||||
metadata: dict = field(default_factory=dict)
|
||||
|
||||
def to_dict(self) -> Dict:
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"id": self.id,
|
||||
"content": self.content,
|
||||
@@ -69,10 +68,10 @@ class SemanticSearchResult:
|
||||
content_type: str
|
||||
project_id: str
|
||||
similarity: float
|
||||
embedding: Optional[List[float]] = None
|
||||
metadata: Dict = field(default_factory=dict)
|
||||
embedding: list[float] | None = None
|
||||
metadata: dict = field(default_factory=dict)
|
||||
|
||||
def to_dict(self) -> Dict:
|
||||
def to_dict(self) -> dict:
|
||||
result = {
|
||||
"id": self.id,
|
||||
"content": self.content[:500] + "..." if len(self.content) > 500 else self.content,
|
||||
@@ -95,12 +94,12 @@ class EntityPath:
|
||||
target_entity_id: str
|
||||
target_entity_name: str
|
||||
path_length: int
|
||||
nodes: List[Dict] # 路径上的节点
|
||||
edges: List[Dict] # 路径上的边
|
||||
nodes: list[dict] # 路径上的节点
|
||||
edges: list[dict] # 路径上的边
|
||||
confidence: float
|
||||
path_description: str
|
||||
|
||||
def to_dict(self) -> Dict:
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"path_id": self.path_id,
|
||||
"source_entity_id": self.source_entity_id,
|
||||
@@ -120,15 +119,15 @@ class KnowledgeGap:
|
||||
"""知识缺口数据模型"""
|
||||
gap_id: str
|
||||
gap_type: str # missing_attribute, sparse_relation, isolated_entity, incomplete_entity
|
||||
entity_id: Optional[str]
|
||||
entity_name: Optional[str]
|
||||
entity_id: str | None
|
||||
entity_name: str | None
|
||||
description: str
|
||||
severity: str # high, medium, low
|
||||
suggestions: List[str]
|
||||
related_entities: List[str]
|
||||
metadata: Dict = field(default_factory=dict)
|
||||
suggestions: list[str]
|
||||
related_entities: list[str]
|
||||
metadata: dict = field(default_factory=dict)
|
||||
|
||||
def to_dict(self) -> Dict:
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"gap_id": self.gap_id,
|
||||
"gap_type": self.gap_type,
|
||||
@@ -149,8 +148,8 @@ class SearchIndex:
|
||||
content_id: str
|
||||
content_type: str
|
||||
project_id: str
|
||||
tokens: List[str]
|
||||
token_positions: Dict[str, List[int]] # 词 -> 位置列表
|
||||
tokens: list[str]
|
||||
token_positions: dict[str, list[int]] # 词 -> 位置列表
|
||||
created_at: str
|
||||
updated_at: str
|
||||
|
||||
@@ -162,7 +161,7 @@ class TextEmbedding:
|
||||
content_id: str
|
||||
content_type: str
|
||||
project_id: str
|
||||
embedding: List[float]
|
||||
embedding: list[float]
|
||||
model_name: str
|
||||
created_at: str
|
||||
|
||||
@@ -231,7 +230,7 @@ class FullTextSearch:
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
def _tokenize(self, text: str) -> List[str]:
|
||||
def _tokenize(self, text: str) -> list[str]:
|
||||
"""
|
||||
中文分词(简化版)
|
||||
|
||||
@@ -243,7 +242,7 @@ class FullTextSearch:
|
||||
tokens = re.findall(r'[\u4e00-\u9fa5]+|[a-z]+|\d+', text)
|
||||
return tokens
|
||||
|
||||
def _extract_positions(self, text: str, tokens: List[str]) -> Dict[str, List[int]]:
|
||||
def _extract_positions(self, text: str, tokens: list[str]) -> dict[str, list[int]]:
|
||||
"""提取每个词在文本中的位置"""
|
||||
positions = defaultdict(list)
|
||||
text_lower = text.lower()
|
||||
@@ -326,9 +325,9 @@ class FullTextSearch:
|
||||
print(f"索引创建失败: {e}")
|
||||
return False
|
||||
|
||||
def search(self, query: str, project_id: Optional[str] = None,
|
||||
content_types: Optional[List[str]] = None,
|
||||
limit: int = 20, offset: int = 0) -> List[SearchResult]:
|
||||
def search(self, query: str, project_id: str | None = None,
|
||||
content_types: list[str] | None = None,
|
||||
limit: int = 20, offset: int = 0) -> list[SearchResult]:
|
||||
"""
|
||||
全文搜索
|
||||
|
||||
@@ -358,7 +357,7 @@ class FullTextSearch:
|
||||
|
||||
return scored_results[offset:offset + limit]
|
||||
|
||||
def _parse_boolean_query(self, query: str) -> Dict:
|
||||
def _parse_boolean_query(self, query: str) -> dict:
|
||||
"""
|
||||
解析布尔查询
|
||||
|
||||
@@ -401,9 +400,9 @@ class FullTextSearch:
|
||||
"phrases": phrases
|
||||
}
|
||||
|
||||
def _execute_boolean_search(self, parsed_query: Dict,
|
||||
project_id: Optional[str] = None,
|
||||
content_types: Optional[List[str]] = None) -> List[Dict]:
|
||||
def _execute_boolean_search(self, parsed_query: dict,
|
||||
project_id: str | None = None,
|
||||
content_types: list[str] | None = None) -> list[dict]:
|
||||
"""执行布尔搜索"""
|
||||
conn = self._get_conn()
|
||||
|
||||
@@ -503,7 +502,7 @@ class FullTextSearch:
|
||||
return results
|
||||
|
||||
def _get_content_by_id(self, conn: sqlite3.Connection,
|
||||
content_id: str, content_type: str) -> Optional[str]:
|
||||
content_id: str, content_type: str) -> str | None:
|
||||
"""根据ID获取内容"""
|
||||
try:
|
||||
if content_type == "transcript":
|
||||
@@ -542,7 +541,7 @@ class FullTextSearch:
|
||||
return None
|
||||
|
||||
def _get_project_id(self, conn: sqlite3.Connection,
|
||||
content_id: str, content_type: str) -> Optional[str]:
|
||||
content_id: str, content_type: str) -> str | None:
|
||||
"""获取内容所属的项目ID"""
|
||||
try:
|
||||
if content_type == "transcript":
|
||||
@@ -567,7 +566,7 @@ class FullTextSearch:
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
def _score_results(self, results: List[Dict], parsed_query: Dict) -> List[SearchResult]:
|
||||
def _score_results(self, results: list[dict], parsed_query: dict) -> list[SearchResult]:
|
||||
"""计算搜索结果的相关性分数"""
|
||||
scored = []
|
||||
all_terms = parsed_query["and"] + parsed_query["or"] + parsed_query["phrases"]
|
||||
@@ -689,7 +688,7 @@ class FullTextSearch:
|
||||
print(f"删除索引失败: {e}")
|
||||
return False
|
||||
|
||||
def reindex_project(self, project_id: str) -> Dict:
|
||||
def reindex_project(self, project_id: str) -> dict:
|
||||
"""重新索引整个项目"""
|
||||
conn = self._get_conn()
|
||||
stats = {"transcripts": 0, "entities": 0, "relations": 0, "errors": 0}
|
||||
@@ -808,7 +807,7 @@ class SemanticSearch:
|
||||
"""检查语义搜索是否可用"""
|
||||
return self.model is not None and SENTENCE_TRANSFORMERS_AVAILABLE
|
||||
|
||||
def generate_embedding(self, text: str) -> Optional[List[float]]:
|
||||
def generate_embedding(self, text: str) -> list[float] | None:
|
||||
"""
|
||||
生成文本的 embedding 向量
|
||||
|
||||
@@ -878,9 +877,9 @@ class SemanticSearch:
|
||||
print(f"索引 embedding 失败: {e}")
|
||||
return False
|
||||
|
||||
def search(self, query: str, project_id: Optional[str] = None,
|
||||
content_types: Optional[List[str]] = None,
|
||||
top_k: int = 10, threshold: float = 0.5) -> List[SemanticSearchResult]:
|
||||
def search(self, query: str, project_id: str | None = None,
|
||||
content_types: list[str] | None = None,
|
||||
top_k: int = 10, threshold: float = 0.5) -> list[SemanticSearchResult]:
|
||||
"""
|
||||
语义搜索
|
||||
|
||||
@@ -959,7 +958,7 @@ class SemanticSearch:
|
||||
results.sort(key=lambda x: x.similarity, reverse=True)
|
||||
return results[:top_k]
|
||||
|
||||
def _get_content_text(self, content_id: str, content_type: str) -> Optional[str]:
|
||||
def _get_content_text(self, content_id: str, content_type: str) -> str | None:
|
||||
"""获取内容文本"""
|
||||
conn = self._get_conn()
|
||||
|
||||
@@ -1002,7 +1001,7 @@ class SemanticSearch:
|
||||
return None
|
||||
|
||||
def find_similar_content(self, content_id: str, content_type: str,
|
||||
top_k: int = 5) -> List[SemanticSearchResult]:
|
||||
top_k: int = 5) -> list[SemanticSearchResult]:
|
||||
"""
|
||||
查找与指定内容相似的内容
|
||||
|
||||
@@ -1107,7 +1106,7 @@ class EntityPathDiscovery:
|
||||
|
||||
def find_shortest_path(self, source_entity_id: str,
|
||||
target_entity_id: str,
|
||||
max_depth: int = 5) -> Optional[EntityPath]:
|
||||
max_depth: int = 5) -> EntityPath | None:
|
||||
"""
|
||||
查找两个实体之间的最短路径(BFS算法)
|
||||
|
||||
@@ -1181,7 +1180,7 @@ class EntityPathDiscovery:
|
||||
def find_all_paths(self, source_entity_id: str,
|
||||
target_entity_id: str,
|
||||
max_depth: int = 4,
|
||||
max_paths: int = 10) -> List[EntityPath]:
|
||||
max_paths: int = 10) -> list[EntityPath]:
|
||||
"""
|
||||
查找两个实体之间的所有路径(限制数量和深度)
|
||||
|
||||
@@ -1211,7 +1210,7 @@ class EntityPathDiscovery:
|
||||
paths = []
|
||||
|
||||
def dfs(current_id: str, target_id: str,
|
||||
path: List[str], visited: Set[str], depth: int):
|
||||
path: list[str], visited: set[str], depth: int):
|
||||
if depth > max_depth:
|
||||
return
|
||||
|
||||
@@ -1247,7 +1246,7 @@ class EntityPathDiscovery:
|
||||
# 构建路径对象
|
||||
return [self._build_path_object(path, project_id) for path in paths]
|
||||
|
||||
def _build_path_object(self, entity_ids: List[str],
|
||||
def _build_path_object(self, entity_ids: list[str],
|
||||
project_id: str) -> EntityPath:
|
||||
"""构建路径对象"""
|
||||
conn = self._get_conn()
|
||||
@@ -1312,7 +1311,7 @@ class EntityPathDiscovery:
|
||||
)
|
||||
|
||||
def find_multi_hop_relations(self, entity_id: str,
|
||||
max_hops: int = 3) -> List[Dict]:
|
||||
max_hops: int = 3) -> list[dict]:
|
||||
"""
|
||||
查找实体的多跳关系
|
||||
|
||||
@@ -1394,7 +1393,7 @@ class EntityPathDiscovery:
|
||||
return relations
|
||||
|
||||
def _get_path_to_entity(self, source_id: str, target_id: str,
|
||||
project_id: str, conn: sqlite3.Connection) -> List[str]:
|
||||
project_id: str, conn: sqlite3.Connection) -> list[str]:
|
||||
"""获取从源实体到目标实体的路径(简化版)"""
|
||||
# BFS 找路径
|
||||
visited = {source_id}
|
||||
@@ -1428,7 +1427,7 @@ class EntityPathDiscovery:
|
||||
|
||||
return []
|
||||
|
||||
def generate_path_visualization(self, path: EntityPath) -> Dict:
|
||||
def generate_path_visualization(self, path: EntityPath) -> dict:
|
||||
"""
|
||||
生成路径可视化数据
|
||||
|
||||
@@ -1467,7 +1466,7 @@ class EntityPathDiscovery:
|
||||
"confidence": path.confidence
|
||||
}
|
||||
|
||||
def analyze_path_centrality(self, project_id: str) -> List[Dict]:
|
||||
def analyze_path_centrality(self, project_id: str) -> list[dict]:
|
||||
"""
|
||||
分析项目中实体的路径中心性(桥接程度)
|
||||
|
||||
@@ -1558,7 +1557,7 @@ class KnowledgeGapDetection:
|
||||
conn.row_factory = sqlite3.Row
|
||||
return conn
|
||||
|
||||
def analyze_project(self, project_id: str) -> List[KnowledgeGap]:
|
||||
def analyze_project(self, project_id: str) -> list[KnowledgeGap]:
|
||||
"""
|
||||
分析项目中的知识缺口
|
||||
|
||||
@@ -1591,7 +1590,7 @@ class KnowledgeGapDetection:
|
||||
|
||||
return gaps
|
||||
|
||||
def _check_entity_attribute_completeness(self, project_id: str) -> List[KnowledgeGap]:
|
||||
def _check_entity_attribute_completeness(self, project_id: str) -> list[KnowledgeGap]:
|
||||
"""检查实体属性完整性"""
|
||||
conn = self._get_conn()
|
||||
gaps = []
|
||||
@@ -1661,7 +1660,7 @@ class KnowledgeGapDetection:
|
||||
conn.close()
|
||||
return gaps
|
||||
|
||||
def _check_relation_sparsity(self, project_id: str) -> List[KnowledgeGap]:
|
||||
def _check_relation_sparsity(self, project_id: str) -> list[KnowledgeGap]:
|
||||
"""检查关系稀疏度"""
|
||||
conn = self._get_conn()
|
||||
gaps = []
|
||||
@@ -1720,7 +1719,7 @@ class KnowledgeGapDetection:
|
||||
conn.close()
|
||||
return gaps
|
||||
|
||||
def _check_isolated_entities(self, project_id: str) -> List[KnowledgeGap]:
|
||||
def _check_isolated_entities(self, project_id: str) -> list[KnowledgeGap]:
|
||||
"""检查孤立实体(没有任何关系)"""
|
||||
conn = self._get_conn()
|
||||
gaps = []
|
||||
@@ -1756,7 +1755,7 @@ class KnowledgeGapDetection:
|
||||
conn.close()
|
||||
return gaps
|
||||
|
||||
def _check_incomplete_entities(self, project_id: str) -> List[KnowledgeGap]:
|
||||
def _check_incomplete_entities(self, project_id: str) -> list[KnowledgeGap]:
|
||||
"""检查不完整实体(缺少名称、类型或定义)"""
|
||||
conn = self._get_conn()
|
||||
gaps = []
|
||||
@@ -1788,7 +1787,7 @@ class KnowledgeGapDetection:
|
||||
conn.close()
|
||||
return gaps
|
||||
|
||||
def _check_missing_key_entities(self, project_id: str) -> List[KnowledgeGap]:
|
||||
def _check_missing_key_entities(self, project_id: str) -> list[KnowledgeGap]:
|
||||
"""检查可能缺失的关键实体"""
|
||||
conn = self._get_conn()
|
||||
gaps = []
|
||||
@@ -1841,7 +1840,7 @@ class KnowledgeGapDetection:
|
||||
conn.close()
|
||||
return gaps[:10] # 限制数量
|
||||
|
||||
def generate_completeness_report(self, project_id: str) -> Dict:
|
||||
def generate_completeness_report(self, project_id: str) -> dict:
|
||||
"""
|
||||
生成知识完整性报告
|
||||
|
||||
@@ -1898,7 +1897,7 @@ class KnowledgeGapDetection:
|
||||
"recommendations": self._generate_recommendations(gaps)
|
||||
}
|
||||
|
||||
def _generate_recommendations(self, gaps: List[KnowledgeGap]) -> List[str]:
|
||||
def _generate_recommendations(self, gaps: list[KnowledgeGap]) -> list[str]:
|
||||
"""生成改进建议"""
|
||||
recommendations = []
|
||||
|
||||
@@ -1941,8 +1940,8 @@ class SearchManager:
|
||||
self.path_discovery = EntityPathDiscovery(db_path)
|
||||
self.gap_detection = KnowledgeGapDetection(db_path)
|
||||
|
||||
def hybrid_search(self, query: str, project_id: Optional[str] = None,
|
||||
limit: int = 20) -> Dict:
|
||||
def hybrid_search(self, query: str, project_id: str | None = None,
|
||||
limit: int = 20) -> dict:
|
||||
"""
|
||||
混合搜索(全文 + 语义)
|
||||
|
||||
@@ -2014,7 +2013,7 @@ class SearchManager:
|
||||
"results": results[:limit]
|
||||
}
|
||||
|
||||
def index_project(self, project_id: str) -> Dict:
|
||||
def index_project(self, project_id: str) -> dict:
|
||||
"""
|
||||
为项目建立所有索引
|
||||
|
||||
@@ -2071,7 +2070,7 @@ class SearchManager:
|
||||
"semantic": semantic_stats
|
||||
}
|
||||
|
||||
def get_search_stats(self, project_id: Optional[str] = None) -> Dict:
|
||||
def get_search_stats(self, project_id: str | None = None) -> dict:
|
||||
"""获取搜索统计信息"""
|
||||
conn = sqlite3.connect(self.db_path)
|
||||
conn.row_factory = sqlite3.Row
|
||||
@@ -2126,28 +2125,28 @@ def get_search_manager(db_path: str = "insightflow.db") -> SearchManager:
|
||||
|
||||
|
||||
# 便捷函数
|
||||
def fulltext_search(query: str, project_id: Optional[str] = None,
|
||||
limit: int = 20) -> List[SearchResult]:
|
||||
def fulltext_search(query: str, project_id: str | None = None,
|
||||
limit: int = 20) -> list[SearchResult]:
|
||||
"""全文搜索便捷函数"""
|
||||
manager = get_search_manager()
|
||||
return manager.fulltext_search.search(query, project_id, limit=limit)
|
||||
|
||||
|
||||
def semantic_search(query: str, project_id: Optional[str] = None,
|
||||
top_k: int = 10) -> List[SemanticSearchResult]:
|
||||
def semantic_search(query: str, project_id: str | None = None,
|
||||
top_k: int = 10) -> list[SemanticSearchResult]:
|
||||
"""语义搜索便捷函数"""
|
||||
manager = get_search_manager()
|
||||
return manager.semantic_search.search(query, project_id, top_k=top_k)
|
||||
|
||||
|
||||
def find_entity_path(source_id: str, target_id: str,
|
||||
max_depth: int = 5) -> Optional[EntityPath]:
|
||||
max_depth: int = 5) -> EntityPath | None:
|
||||
"""查找实体路径便捷函数"""
|
||||
manager = get_search_manager()
|
||||
return manager.path_discovery.find_shortest_path(source_id, target_id, max_depth)
|
||||
|
||||
|
||||
def detect_knowledge_gaps(project_id: str) -> List[KnowledgeGap]:
|
||||
def detect_knowledge_gaps(project_id: str) -> list[KnowledgeGap]:
|
||||
"""知识缺口检测便捷函数"""
|
||||
manager = get_search_manager()
|
||||
return manager.gap_detection.analyze_project(project_id)
|
||||
|
||||
Reference in New Issue
Block a user