fix: auto-fix code issues (cron)

- 修复重复导入/字段
- 修复异常处理
- 修复PEP8格式问题
- 添加类型注解
- 修复缺失的urllib.parse导入
2026-02-28 06:03:09 +08:00
parent ff83cab6c7
commit fe3d64a1d2
41 changed files with 4501 additions and 1176 deletions
--- a/backend/search_manager.py
+++ b/backend/search_manager.py
@@ -19,6 +19,7 @@ from dataclasses import dataclass, field
 from datetime import datetime
 from enum import Enum

+
 class SearchOperator(Enum):
    """搜索操作符"""

@@ -26,6 +27,7 @@ class SearchOperator(Enum):
    OR = "OR"
    NOT = "NOT"

+
 # 尝试导入 sentence-transformers 用于语义搜索
 try:
    from sentence_transformers import SentenceTransformer
@@ -37,6 +39,7 @@ except ImportError:

 # ==================== 数据模型 ====================

+
@dataclass
 class SearchResult:
    """搜索结果数据模型"""
@@ -60,6 +63,7 @@ class SearchResult:
            "metadata": self.metadata,
        }

+
@dataclass
 class SemanticSearchResult:
    """语义搜索结果数据模型"""
@@ -85,6 +89,7 @@ class SemanticSearchResult:
            result["embedding_dim"] = len(self.embedding)
        return result

+
@dataclass
 class EntityPath:
    """实体关系路径数据模型"""
@@ -114,6 +119,7 @@ class EntityPath:
            "path_description": self.path_description,
        }

+
@dataclass
 class KnowledgeGap:
    """知识缺口数据模型"""
@@ -141,6 +147,7 @@ class KnowledgeGap:
            "metadata": self.metadata,
        }

+
@dataclass
 class SearchIndex:
    """搜索索引数据模型"""
@@ -154,6 +161,7 @@ class SearchIndex:
    created_at: str
    updated_at: str

+
@dataclass
 class TextEmbedding:
    """文本 Embedding 数据模型"""
@@ -166,8 +174,10 @@ class TextEmbedding:
    model_name: str
    created_at: str

+
 # ==================== 全文搜索 ====================

+
 class FullTextSearch:
    """
    全文搜索模块
@@ -222,10 +232,14 @@ class FullTextSearch:
        """)

        # 创建索引
-        conn.execute("CREATE INDEX IF NOT EXISTS idx_search_content ON search_indexes(content_id, content_type)")
+        conn.execute(
+            "CREATE INDEX IF NOT EXISTS idx_search_content ON search_indexes(content_id, content_type)"
+        )
        conn.execute("CREATE INDEX IF NOT EXISTS idx_search_project ON search_indexes(project_id)")
        conn.execute("CREATE INDEX IF NOT EXISTS idx_term_freq_term ON search_term_freq(term)")
-        conn.execute("CREATE INDEX IF NOT EXISTS idx_term_freq_project ON search_term_freq(project_id)")
+        conn.execute(
+            "CREATE INDEX IF NOT EXISTS idx_term_freq_project ON search_term_freq(project_id)"
+        )

        conn.commit()
        conn.close()
@@ -320,7 +334,14 @@ class FullTextSearch:
                    (term, content_id, content_type, project_id, frequency, positions)
                    VALUES (?, ?, ?, ?, ?, ?)
                """,
-                    (token, content_id, content_type, project_id, freq, json.dumps(positions, ensure_ascii=False)),
+                    (
+                        token,
+                        content_id,
+                        content_type,
+                        project_id,
+                        freq,
+                        json.dumps(positions, ensure_ascii=False),
+                    ),
                )

            conn.commit()
@@ -364,7 +385,7 @@ class FullTextSearch:
        # 排序和分页
        scored_results.sort(key=lambda x: x.score, reverse=True)

-        return scored_results[offset: offset + limit]
+        return scored_results[offset : offset + limit]

    def _parse_boolean_query(self, query: str) -> dict:
        """
@@ -405,7 +426,10 @@ class FullTextSearch:
        return {"and": and_terms + phrases, "or": or_terms, "not": not_terms, "phrases": phrases}

    def _execute_boolean_search(
-        self, parsed_query: dict, project_id: str | None = None, content_types: list[str] | None = None
+        self,
+        parsed_query: dict,
+        project_id: str | None = None,
+        content_types: list[str] | None = None,
    ) -> list[dict]:
        """执行布尔搜索"""
        conn = self._get_conn()
@@ -510,7 +534,8 @@ class FullTextSearch:
                    {
                        "id": content_id,
                        "content_type": content_type,
-                        "project_id": project_id or self._get_project_id(conn, content_id, content_type),
+                        "project_id": project_id
+                        or self._get_project_id(conn, content_id, content_type),
                        "content": content,
                        "terms": parsed_query["and"] + parsed_query["or"] + parsed_query["phrases"],
                    }
@@ -519,15 +544,21 @@ class FullTextSearch:
        conn.close()
        return results

-    def _get_content_by_id(self, conn: sqlite3.Connection, content_id: str, content_type: str) -> str | None:
+    def _get_content_by_id(
+        self, conn: sqlite3.Connection, content_id: str, content_type: str
+    ) -> str | None:
        """根据ID获取内容"""
        try:
            if content_type == "transcript":
-                row = conn.execute("SELECT full_text FROM transcripts WHERE id = ?", (content_id,)).fetchone()
+                row = conn.execute(
+                    "SELECT full_text FROM transcripts WHERE id = ?", (content_id,)
+                ).fetchone()
                return row["full_text"] if row else None

            elif content_type == "entity":
-                row = conn.execute("SELECT name, definition FROM entities WHERE id = ?", (content_id,)).fetchone()
+                row = conn.execute(
+                    "SELECT name, definition FROM entities WHERE id = ?", (content_id,)
+                ).fetchone()
                if row:
                    return f"{row['name']} {row['definition'] or ''}"
                return None
@@ -551,15 +582,23 @@ class FullTextSearch:
            print(f"获取内容失败: {e}")
            return None

-    def _get_project_id(self, conn: sqlite3.Connection, content_id: str, content_type: str) -> str | None:
+    def _get_project_id(
+        self, conn: sqlite3.Connection, content_id: str, content_type: str
+    ) -> str | None:
        """获取内容所属的项目ID"""
        try:
            if content_type == "transcript":
-                row = conn.execute("SELECT project_id FROM transcripts WHERE id = ?", (content_id,)).fetchone()
+                row = conn.execute(
+                    "SELECT project_id FROM transcripts WHERE id = ?", (content_id,)
+                ).fetchone()
            elif content_type == "entity":
-                row = conn.execute("SELECT project_id FROM entities WHERE id = ?", (content_id,)).fetchone()
+                row = conn.execute(
+                    "SELECT project_id FROM entities WHERE id = ?", (content_id,)
+                ).fetchone()
            elif content_type == "relation":
-                row = conn.execute("SELECT project_id FROM entity_relations WHERE id = ?", (content_id,)).fetchone()
+                row = conn.execute(
+                    "SELECT project_id FROM entity_relations WHERE id = ?", (content_id,)
+                ).fetchone()
            else:
                return None

@@ -673,12 +712,14 @@ class FullTextSearch:

            # 删除索引
            conn.execute(
-                "DELETE FROM search_indexes WHERE content_id = ? AND content_type = ?", (content_id, content_type)
+                "DELETE FROM search_indexes WHERE content_id = ? AND content_type = ?",
+                (content_id, content_type),
            )

            # 删除词频统计
            conn.execute(
-                "DELETE FROM search_term_freq WHERE content_id = ? AND content_type = ?", (content_id, content_type)
+                "DELETE FROM search_term_freq WHERE content_id = ? AND content_type = ?",
+                (content_id, content_type),
            )

            conn.commit()
@@ -696,7 +737,8 @@ class FullTextSearch:
        try:
            # 索引转录文本
            transcripts = conn.execute(
-                "SELECT id, project_id, full_text FROM transcripts WHERE project_id = ?", (project_id,)
+                "SELECT id, project_id, full_text FROM transcripts WHERE project_id = ?",
+                (project_id,),
            ).fetchall()

            for t in transcripts:
@@ -708,7 +750,8 @@ class FullTextSearch:

            # 索引实体
            entities = conn.execute(
-                "SELECT id, project_id, name, definition FROM entities WHERE project_id = ?", (project_id,)
+                "SELECT id, project_id, name, definition FROM entities WHERE project_id = ?",
+                (project_id,),
            ).fetchall()

            for e in entities:
@@ -743,8 +786,10 @@ class FullTextSearch:
        conn.close()
        return stats

+
 # ==================== 语义搜索 ====================

+
 class SemanticSearch:
    """
    语义搜索模块
@@ -756,7 +801,11 @@ class SemanticSearch:
    - 语义相似内容推荐
    """

-    def __init__(self, db_path: str = "insightflow.db", model_name: str = "paraphrase-multilingual-MiniLM-L12-v2"):
+    def __init__(
+        self,
+        db_path: str = "insightflow.db",
+        model_name: str = "paraphrase-multilingual-MiniLM-L12-v2",
+    ):
        self.db_path = db_path
        self.model_name = model_name
        self.model = None
@@ -793,7 +842,9 @@ class SemanticSearch:
            )
        """)

-        conn.execute("CREATE INDEX IF NOT EXISTS idx_embedding_content ON embeddings(content_id, content_type)")
+        conn.execute(
+            "CREATE INDEX IF NOT EXISTS idx_embedding_content ON embeddings(content_id, content_type)"
+        )
        conn.execute("CREATE INDEX IF NOT EXISTS idx_embedding_project ON embeddings(project_id)")

        conn.commit()
@@ -828,7 +879,9 @@ class SemanticSearch:
            print(f"生成 embedding 失败: {e}")
            return None

-    def index_embedding(self, content_id: str, content_type: str, project_id: str, text: str) -> bool:
+    def index_embedding(
+        self, content_id: str, content_type: str, project_id: str, text: str
+    ) -> bool:
        """
        为内容生成并保存 embedding

@@ -975,11 +1028,15 @@ class SemanticSearch:

        try:
            if content_type == "transcript":
-                row = conn.execute("SELECT full_text FROM transcripts WHERE id = ?", (content_id,)).fetchone()
+                row = conn.execute(
+                    "SELECT full_text FROM transcripts WHERE id = ?", (content_id,)
+                ).fetchone()
                result = row["full_text"] if row else None

            elif content_type == "entity":
-                row = conn.execute("SELECT name, definition FROM entities WHERE id = ?", (content_id,)).fetchone()
+                row = conn.execute(
+                    "SELECT name, definition FROM entities WHERE id = ?", (content_id,)
+                ).fetchone()
                result = f"{row['name']}: {row['definition']}" if row else None

            elif content_type == "relation":
@@ -992,7 +1049,11 @@ class SemanticSearch:
                       WHERE r.id = ?""",
                    (content_id,),
                ).fetchone()
-                result = f"{row['source_name']} {row['relation_type']} {row['target_name']}" if row else None
+                result = (
+                    f"{row['source_name']} {row['relation_type']} {row['target_name']}"
+                    if row
+                    else None
+                )

            else:
                result = None
@@ -1005,7 +1066,9 @@ class SemanticSearch:
            print(f"获取内容失败: {e}")
            return None

-    def find_similar_content(self, content_id: str, content_type: str, top_k: int = 5) -> list[SemanticSearchResult]:
+    def find_similar_content(
+        self, content_id: str, content_type: str, top_k: int = 5
+    ) -> list[SemanticSearchResult]:
        """
        查找与指定内容相似的内容

@@ -1076,7 +1139,10 @@ class SemanticSearch:
        """删除内容的 embedding"""
        try:
            conn = self._get_conn()
-            conn.execute("DELETE FROM embeddings WHERE content_id = ? AND content_type = ?", (content_id, content_type))
+            conn.execute(
+                "DELETE FROM embeddings WHERE content_id = ? AND content_type = ?",
+                (content_id, content_type),
+            )
            conn.commit()
            conn.close()
            return True
@@ -1084,8 +1150,10 @@ class SemanticSearch:
            print(f"删除 embedding 失败: {e}")
            return False

+
 # ==================== 实体关系路径发现 ====================

+
 class EntityPathDiscovery:
    """
    实体关系路径发现模块
@@ -1106,7 +1174,9 @@ class EntityPathDiscovery:
        conn.row_factory = sqlite3.Row
        return conn

-    def find_shortest_path(self, source_entity_id: str, target_entity_id: str, max_depth: int = 5) -> EntityPath | None:
+    def find_shortest_path(
+        self, source_entity_id: str, target_entity_id: str, max_depth: int = 5
+    ) -> EntityPath | None:
        """
        查找两个实体之间的最短路径（BFS算法）

@@ -1121,7 +1191,9 @@ class EntityPathDiscovery:
        conn = self._get_conn()

        # 获取项目ID
-        row = conn.execute("SELECT project_id FROM entities WHERE id = ?", (source_entity_id,)).fetchone()
+        row = conn.execute(
+            "SELECT project_id FROM entities WHERE id = ?", (source_entity_id,)
+        ).fetchone()

        if not row:
            conn.close()
@@ -1194,7 +1266,9 @@ class EntityPathDiscovery:
        conn = self._get_conn()

        # 获取项目ID
-        row = conn.execute("SELECT project_id FROM entities WHERE id = ?", (source_entity_id,)).fetchone()
+        row = conn.execute(
+            "SELECT project_id FROM entities WHERE id = ?", (source_entity_id,)
+        ).fetchone()

        if not row:
            conn.close()
@@ -1250,7 +1324,9 @@ class EntityPathDiscovery:
        # 获取实体信息
        nodes = []
        for entity_id in entity_ids:
-            row = conn.execute("SELECT id, name, type FROM entities WHERE id = ?", (entity_id,)).fetchone()
+            row = conn.execute(
+                "SELECT id, name, type FROM entities WHERE id = ?", (entity_id,)
+            ).fetchone()
            if row:
                nodes.append({"id": row["id"], "name": row["name"], "type": row["type"]})

@@ -1318,7 +1394,9 @@ class EntityPathDiscovery:
        conn = self._get_conn()

        # 获取项目ID
-        row = conn.execute("SELECT project_id, name FROM entities WHERE id = ?", (entity_id,)).fetchone()
+        row = conn.execute(
+            "SELECT project_id, name FROM entities WHERE id = ?", (entity_id,)
+        ).fetchone()

        if not row:
            conn.close()
@@ -1376,7 +1454,9 @@ class EntityPathDiscovery:
                                "hops": depth + 1,
                                "relation_type": neighbor["relation_type"],
                                "evidence": neighbor["evidence"],
-                                "path": self._get_path_to_entity(entity_id, neighbor_id, project_id, conn),
+                                "path": self._get_path_to_entity(
+                                    entity_id, neighbor_id, project_id, conn
+                                ),
                            }
                        )

@@ -1481,7 +1561,9 @@ class EntityPathDiscovery:
        conn = self._get_conn()

        # 获取所有实体
-        entities = conn.execute("SELECT id, name FROM entities WHERE project_id = ?", (project_id,)).fetchall()
+        entities = conn.execute(
+            "SELECT id, name FROM entities WHERE project_id = ?", (project_id,)
+        ).fetchall()

        # 计算每个实体作为桥梁的次数
        bridge_scores = []
@@ -1512,10 +1594,10 @@ class EntityPathDiscovery:
                    f"""
                    SELECT COUNT(*) as count
                    FROM entity_relations
-                    WHERE ((source_entity_id IN ({','.join(['?' for _ in neighbor_ids])})
-                       AND target_entity_id IN ({','.join(['?' for _ in neighbor_ids])}))
-                      OR (target_entity_id IN ({','.join(['?' for _ in neighbor_ids])})
-                       AND source_entity_id IN ({','.join(['?' for _ in neighbor_ids])})))
+                    WHERE ((source_entity_id IN ({",".join(["?" for _ in neighbor_ids])})
+                       AND target_entity_id IN ({",".join(["?" for _ in neighbor_ids])}))
+                      OR (target_entity_id IN ({",".join(["?" for _ in neighbor_ids])})
+                       AND source_entity_id IN ({",".join(["?" for _ in neighbor_ids])})))
                      AND project_id = ?
                """,
                    list(neighbor_ids) * 4 + [project_id],
@@ -1541,8 +1623,10 @@ class EntityPathDiscovery:
        bridge_scores.sort(key=lambda x: x["bridge_score"], reverse=True)
        return bridge_scores[:20]  # 返回前20

+
 # ==================== 知识缺口识别 ====================

+
 class KnowledgeGapDetection:
    """
    知识缺口识别模块
@@ -1603,7 +1687,8 @@ class KnowledgeGapDetection:

        # 获取项目的属性模板
        templates = conn.execute(
-            "SELECT id, name, type, is_required FROM attribute_templates WHERE project_id = ?", (project_id,)
+            "SELECT id, name, type, is_required FROM attribute_templates WHERE project_id = ?",
+            (project_id,),
        ).fetchall()

        if not templates:
@@ -1617,7 +1702,9 @@ class KnowledgeGapDetection:
            return []

        # 检查每个实体的属性完整性
-        entities = conn.execute("SELECT id, name FROM entities WHERE project_id = ?", (project_id,)).fetchall()
+        entities = conn.execute(
+            "SELECT id, name FROM entities WHERE project_id = ?", (project_id,)
+        ).fetchall()

        for entity in entities:
            entity_id = entity["id"]
@@ -1668,7 +1755,9 @@ class KnowledgeGapDetection:
        gaps = []

        # 获取所有实体及其关系数量
-        entities = conn.execute("SELECT id, name, type FROM entities WHERE project_id = ?", (project_id,)).fetchall()
+        entities = conn.execute(
+            "SELECT id, name, type FROM entities WHERE project_id = ?", (project_id,)
+        ).fetchall()

        for entity in entities:
            entity_id = entity["id"]
@@ -1807,13 +1896,17 @@ class KnowledgeGapDetection:
        gaps = []

        # 分析转录文本中频繁提及但未提取为实体的词
-        transcripts = conn.execute("SELECT full_text FROM transcripts WHERE project_id = ?", (project_id,)).fetchall()
+        transcripts = conn.execute(
+            "SELECT full_text FROM transcripts WHERE project_id = ?", (project_id,)
+        ).fetchall()

        # 合并所有文本
        all_text = " ".join([t["full_text"] or "" for t in transcripts])

        # 获取现有实体名称
-        existing_entities = conn.execute("SELECT name FROM entities WHERE project_id = ?", (project_id,)).fetchall()
+        existing_entities = conn.execute(
+            "SELECT name FROM entities WHERE project_id = ?", (project_id,)
+        ).fetchall()

        existing_names = {e["name"].lower() for e in existing_entities}

@@ -1838,7 +1931,10 @@ class KnowledgeGapDetection:
                        entity_name=None,
                        description=f"文本中频繁提及 '{entity}' 但未提取为实体（出现 {count} 次）",
                        severity="low",
-                        suggestions=[f"考虑将 '{entity}' 添加为实体", "检查实体提取算法是否需要优化"],
+                        suggestions=[
+                            f"考虑将 '{entity}' 添加为实体",
+                            "检查实体提取算法是否需要优化",
+                        ],
                        related_entities=[],
                        metadata={"mention_count": count},
                    )
@@ -1898,7 +1994,11 @@ class KnowledgeGapDetection:
                "relation_count": stats["relation_count"],
                "transcript_count": stats["transcript_count"],
            },
-            "gap_summary": {"total": len(gaps), "by_type": dict(gap_by_type), "by_severity": severity_count},
+            "gap_summary": {
+                "total": len(gaps),
+                "by_type": dict(gap_by_type),
+                "by_severity": severity_count,
+            },
            "top_gaps": [g.to_dict() for g in gaps[:10]],
            "recommendations": self._generate_recommendations(gaps),
        }
@@ -1929,8 +2029,10 @@ class KnowledgeGapDetection:

        return recommendations

+
 # ==================== 搜索管理器 ====================

+
 class SearchManager:
    """
    搜索管理器 - 统一入口
@@ -2035,7 +2137,8 @@ class SearchManager:

            # 索引转录文本
            transcripts = conn.execute(
-                "SELECT id, project_id, full_text FROM transcripts WHERE project_id = ?", (project_id,)
+                "SELECT id, project_id, full_text FROM transcripts WHERE project_id = ?",
+                (project_id,),
            ).fetchall()

            for t in transcripts:
@@ -2048,7 +2151,8 @@ class SearchManager:

            # 索引实体
            entities = conn.execute(
-                "SELECT id, project_id, name, definition FROM entities WHERE project_id = ?", (project_id,)
+                "SELECT id, project_id, name, definition FROM entities WHERE project_id = ?",
+                (project_id,),
            ).fetchall()

            for e in entities:
@@ -2076,9 +2180,9 @@ class SearchManager:
        ).fetchone()["count"]

        # 语义索引统计
-        semantic_count = conn.execute(f"SELECT COUNT(*) as count FROM embeddings {where_clause}", params).fetchone()[
-            "count"
-        ]
+        semantic_count = conn.execute(
+            f"SELECT COUNT(*) as count FROM embeddings {where_clause}", params
+        ).fetchone()["count"]

        # 按类型统计
        type_stats = {}
@@ -2101,9 +2205,11 @@ class SearchManager:
            "semantic_search_available": self.semantic_search.is_available(),
        }

+
 # 单例模式
 _search_manager = None

+
 def get_search_manager(db_path: str = "insightflow.db") -> SearchManager:
    """获取搜索管理器单例"""
    global _search_manager
@@ -2111,22 +2217,30 @@ def get_search_manager(db_path: str = "insightflow.db") -> SearchManager:
        _search_manager = SearchManager(db_path)
    return _search_manager

+
 # 便捷函数
-def fulltext_search(query: str, project_id: str | None = None, limit: int = 20) -> list[SearchResult]:
+def fulltext_search(
+    query: str, project_id: str | None = None, limit: int = 20
+) -> list[SearchResult]:
    """全文搜索便捷函数"""
    manager = get_search_manager()
    return manager.fulltext_search.search(query, project_id, limit=limit)

-def semantic_search(query: str, project_id: str | None = None, top_k: int = 10) -> list[SemanticSearchResult]:
+
+def semantic_search(
+    query: str, project_id: str | None = None, top_k: int = 10
+) -> list[SemanticSearchResult]:
    """语义搜索便捷函数"""
    manager = get_search_manager()
    return manager.semantic_search.search(query, project_id, top_k=top_k)

+
 def find_entity_path(source_id: str, target_id: str, max_depth: int = 5) -> EntityPath | None:
    """查找实体路径便捷函数"""
    manager = get_search_manager()
    return manager.path_discovery.find_shortest_path(source_id, target_id, max_depth)

+
 def detect_knowledge_gaps(project_id: str) -> list[KnowledgeGap]:
    """知识缺口检测便捷函数"""
    manager = get_search_manager()