fix: auto-fix code issues (cron)

- 修复重复导入/字段
- 修复异常处理
- 修复PEP8格式问题
- 添加类型注解
2026-03-02 12:14:39 +08:00
parent e23f1fec08
commit 98527c4de4
39 changed files with 8109 additions and 8147 deletions
--- a/backend/multimodal_entity_linker.py
+++ b/backend/multimodal_entity_linker.py
@@ -9,13 +9,13 @@ from dataclasses import dataclass
 from difflib import SequenceMatcher

 # Constants
-UUID_LENGTH  = 8  # UUID 截断长度
+UUID_LENGTH = 8  # UUID 截断长度

 # 尝试导入embedding库
 try:
-    NUMPY_AVAILABLE  = True
+    NUMPY_AVAILABLE = True
 except ImportError:
-    NUMPY_AVAILABLE  = False
+    NUMPY_AVAILABLE = False


@dataclass
@@ -30,11 +30,11 @@ class MultimodalEntity:
    source_id: str
    mention_context: str
    confidence: float
-    modality_features: dict  = None  # 模态特定特征
+    modality_features: dict = None  # 模态特定特征

    def __post_init__(self) -> None:
        if self.modality_features is None:
-            self.modality_features  = {}
+            self.modality_features = {}


@dataclass
@@ -78,7 +78,7 @@ class MultimodalEntityLinker:
    """多模态实体关联器 - 跨模态实体对齐和知识融合"""

    # 关联类型
-    LINK_TYPES  = {
+    LINK_TYPES = {
        "same_as": "同一实体",
        "related_to": "相关实体",
        "part_of": "组成部分",
@@ -86,16 +86,16 @@ class MultimodalEntityLinker:
    }

    # 模态类型
-    MODALITIES  = ["audio", "video", "image", "document"]
+    MODALITIES = ["audio", "video", "image", "document"]

-    def __init__(self, similarity_threshold: float  = 0.85) -> None:
+    def __init__(self, similarity_threshold: float = 0.85) -> None:
        """
        初始化多模态实体关联器

        Args:
            similarity_threshold: 相似度阈值
        """
-        self.similarity_threshold  = similarity_threshold
+        self.similarity_threshold = similarity_threshold

    def calculate_string_similarity(self, s1: str, s2: str) -> float:
        """
@@ -111,7 +111,7 @@ class MultimodalEntityLinker:
        if not s1 or not s2:
            return 0.0

-        s1, s2  = s1.lower().strip(), s2.lower().strip()
+        s1, s2 = s1.lower().strip(), s2.lower().strip()

        # 完全匹配
        if s1 == s2:
@@ -136,7 +136,7 @@ class MultimodalEntityLinker:
            (相似度, 匹配类型)
        """
        # 名称相似度
-        name_sim  = self.calculate_string_similarity(
+        name_sim = self.calculate_string_similarity(
            entity1.get("name", ""), entity2.get("name", "")
        )

@@ -145,8 +145,8 @@ class MultimodalEntityLinker:
            return 1.0, "exact"

        # 检查别名
-        aliases1  = set(a.lower() for a in entity1.get("aliases", []))
-        aliases2  = set(a.lower() for a in entity2.get("aliases", []))
+        aliases1 = set(a.lower() for a in entity1.get("aliases", []))
+        aliases2 = set(a.lower() for a in entity2.get("aliases", []))

        if aliases1 & aliases2:  # 有共同别名
            return 0.95, "alias_match"
@@ -157,12 +157,12 @@ class MultimodalEntityLinker:
            return 0.95, "alias_match"

        # 定义相似度
-        def_sim  = self.calculate_string_similarity(
+        def_sim = self.calculate_string_similarity(
            entity1.get("definition", ""), entity2.get("definition", "")
        )

        # 综合相似度
-        combined_sim  = name_sim * 0.7 + def_sim * 0.3
+        combined_sim = name_sim * 0.7 + def_sim * 0.3

        if combined_sim >= self.similarity_threshold:
            return combined_sim, "fuzzy"
@@ -170,7 +170,7 @@ class MultimodalEntityLinker:
        return combined_sim, "none"

    def find_matching_entity(
-        self, query_entity: dict, candidate_entities: list[dict], exclude_ids: set[str]  = None
+        self, query_entity: dict, candidate_entities: list[dict], exclude_ids: set[str] = None
    ) -> AlignmentResult | None:
        """
        在候选实体中查找匹配的实体
@@ -183,28 +183,28 @@ class MultimodalEntityLinker:
        Returns:
            对齐结果
        """
-        exclude_ids  = exclude_ids or set()
-        best_match  = None
-        best_similarity  = 0.0
+        exclude_ids = exclude_ids or set()
+        best_match = None
+        best_similarity = 0.0

        for candidate in candidate_entities:
            if candidate.get("id") in exclude_ids:
                continue

-            similarity, match_type  = self.calculate_entity_similarity(query_entity, candidate)
+            similarity, match_type = self.calculate_entity_similarity(query_entity, candidate)

            if similarity > best_similarity and similarity >= self.similarity_threshold:
-                best_similarity  = similarity
-                best_match  = candidate
-                best_match_type  = match_type
+                best_similarity = similarity
+                best_match = candidate
+                best_match_type = match_type

        if best_match:
            return AlignmentResult(
-                entity_id = query_entity.get("id"),
-                matched_entity_id = best_match.get("id"),
-                similarity = best_similarity,
-                match_type = best_match_type,
-                confidence = best_similarity,
+                entity_id=query_entity.get("id"),
+                matched_entity_id=best_match.get("id"),
+                similarity=best_similarity,
+                match_type=best_match_type,
+                confidence=best_similarity,
            )

        return None
@@ -230,10 +230,10 @@ class MultimodalEntityLinker:
        Returns:
            实体关联列表
        """
-        links  = []
+        links = []

        # 合并所有实体
-        all_entities  = {
+        all_entities = {
            "audio": audio_entities,
            "video": video_entities,
            "image": image_entities,
@@ -246,24 +246,24 @@ class MultimodalEntityLinker:
                if mod1 >= mod2:  # 避免重复比较
                    continue

-                entities1  = all_entities.get(mod1, [])
-                entities2  = all_entities.get(mod2, [])
+                entities1 = all_entities.get(mod1, [])
+                entities2 = all_entities.get(mod2, [])

                for ent1 in entities1:
                    # 在另一个模态中查找匹配
-                    result  = self.find_matching_entity(ent1, entities2)
+                    result = self.find_matching_entity(ent1, entities2)

                    if result and result.matched_entity_id:
-                        link  = EntityLink(
-                            id = str(uuid.uuid4())[:UUID_LENGTH],
-                            project_id = project_id,
-                            source_entity_id = ent1.get("id"),
-                            target_entity_id = result.matched_entity_id,
-                            link_type = "same_as" if result.similarity > 0.95 else "related_to",
-                            source_modality = mod1,
-                            target_modality = mod2,
-                            confidence = result.confidence,
-                            evidence = f"Cross-modal alignment: {result.match_type}",
+                        link = EntityLink(
+                            id=str(uuid.uuid4())[:UUID_LENGTH],
+                            project_id=project_id,
+                            source_entity_id=ent1.get("id"),
+                            target_entity_id=result.matched_entity_id,
+                            link_type="same_as" if result.similarity > 0.95 else "related_to",
+                            source_modality=mod1,
+                            target_modality=mod2,
+                            confidence=result.confidence,
+                            evidence=f"Cross-modal alignment: {result.match_type}",
                        )
                        links.append(link)

@@ -284,7 +284,7 @@ class MultimodalEntityLinker:
            融合结果
        """
        # 收集所有属性
-        fused_properties  = {
+        fused_properties = {
            "names": set(),
            "definitions": [],
            "aliases": set(),
@@ -293,7 +293,7 @@ class MultimodalEntityLinker:
            "contexts": [],
        }

-        merged_ids  = []
+        merged_ids = []

        for entity in linked_entities:
            merged_ids.append(entity.get("id"))
@@ -318,21 +318,21 @@ class MultimodalEntityLinker:
                fused_properties["contexts"].append(mention.get("mention_context"))

        # 选择最佳定义（最长的那个）
-        best_definition  = (
-            max(fused_properties["definitions"], key = len) if fused_properties["definitions"] else ""
+        best_definition = (
+            max(fused_properties["definitions"], key=len) if fused_properties["definitions"] else ""
        )

        # 选择最佳名称（最常见的那个）
        from collections import Counter

-        name_counts  = Counter(fused_properties["names"])
-        best_name  = name_counts.most_common(1)[0][0] if name_counts else ""
+        name_counts = Counter(fused_properties["names"])
+        best_name = name_counts.most_common(1)[0][0] if name_counts else ""

        # 构建融合结果
        return FusionResult(
-            canonical_entity_id = entity_id,
-            merged_entity_ids = merged_ids,
-            fused_properties = {
+            canonical_entity_id=entity_id,
+            merged_entity_ids=merged_ids,
+            fused_properties={
                "name": best_name,
                "definition": best_definition,
                "aliases": list(fused_properties["aliases"]),
@@ -340,8 +340,8 @@ class MultimodalEntityLinker:
                "modalities": list(fused_properties["modalities"]),
                "contexts": fused_properties["contexts"][:10],  # 最多10个上下文
            },
-            source_modalities = list(fused_properties["modalities"]),
-            confidence = min(1.0, len(linked_entities) * 0.2 + 0.5),
+            source_modalities=list(fused_properties["modalities"]),
+            confidence=min(1.0, len(linked_entities) * 0.2 + 0.5),
        )

    def detect_entity_conflicts(self, entities: list[dict]) -> list[dict]:
@@ -354,30 +354,30 @@ class MultimodalEntityLinker:
        Returns:
            冲突列表
        """
-        conflicts  = []
+        conflicts = []

        # 按名称分组
-        name_groups  = {}
+        name_groups = {}
        for entity in entities:
-            name  = entity.get("name", "").lower()
+            name = entity.get("name", "").lower()
            if name:
                if name not in name_groups:
-                    name_groups[name]  = []
+                    name_groups[name] = []
                name_groups[name].append(entity)

        # 检测同名但定义不同的实体
        for name, group in name_groups.items():
            if len(group) > 1:
                # 检查定义是否相似
-                definitions  = [e.get("definition", "") for e in group if e.get("definition")]
+                definitions = [e.get("definition", "") for e in group if e.get("definition")]

                if len(definitions) > 1:
                    # 计算定义之间的相似度
-                    sim_matrix  = []
+                    sim_matrix = []
                    for i, d1 in enumerate(definitions):
                        for j, d2 in enumerate(definitions):
                            if i < j:
-                                sim  = self.calculate_string_similarity(d1, d2)
+                                sim = self.calculate_string_similarity(d1, d2)
                                sim_matrix.append(sim)

                    # 如果定义相似度都很低，可能是冲突
@@ -394,7 +394,7 @@ class MultimodalEntityLinker:
        return conflicts

    def suggest_entity_merges(
-        self, entities: list[dict], existing_links: list[EntityLink]  = None
+        self, entities: list[dict], existing_links: list[EntityLink] = None
    ) -> list[dict]:
        """
        建议实体合并
@@ -406,13 +406,13 @@ class MultimodalEntityLinker:
        Returns:
            合并建议列表
        """
-        suggestions  = []
-        existing_pairs  = set()
+        suggestions = []
+        existing_pairs = set()

        # 记录已有的关联
        if existing_links:
            for link in existing_links:
-                pair  = tuple(sorted([link.source_entity_id, link.target_entity_id]))
+                pair = tuple(sorted([link.source_entity_id, link.target_entity_id]))
                existing_pairs.add(pair)

        # 检查所有实体对
@@ -422,12 +422,12 @@ class MultimodalEntityLinker:
                    continue

                # 检查是否已有关联
-                pair  = tuple(sorted([ent1.get("id"), ent2.get("id")]))
+                pair = tuple(sorted([ent1.get("id"), ent2.get("id")]))
                if pair in existing_pairs:
                    continue

                # 计算相似度
-                similarity, match_type  = self.calculate_entity_similarity(ent1, ent2)
+                similarity, match_type = self.calculate_entity_similarity(ent1, ent2)

                if similarity >= self.similarity_threshold:
                    suggestions.append(
@@ -441,7 +441,7 @@ class MultimodalEntityLinker:
                    )

        # 按相似度排序
-        suggestions.sort(key = lambda x: x["similarity"], reverse = True)
+        suggestions.sort(key=lambda x: x["similarity"], reverse=True)

        return suggestions

@@ -451,8 +451,8 @@ class MultimodalEntityLinker:
        entity_id: str,
        source_type: str,
        source_id: str,
-        mention_context: str  = "",
-        confidence: float  = 1.0,
+        mention_context: str = "",
+        confidence: float = 1.0,
    ) -> MultimodalEntity:
        """
        创建多模态实体记录
@@ -469,14 +469,14 @@ class MultimodalEntityLinker:
            多模态实体记录
        """
        return MultimodalEntity(
-            id = str(uuid.uuid4())[:UUID_LENGTH],
-            entity_id = entity_id,
-            project_id = project_id,
-            name = "",  # 将在后续填充
-            source_type = source_type,
-            source_id = source_id,
-            mention_context = mention_context,
-            confidence = confidence,
+            id=str(uuid.uuid4())[:UUID_LENGTH],
+            entity_id=entity_id,
+            project_id=project_id,
+            name="",  # 将在后续填充
+            source_type=source_type,
+            source_id=source_id,
+            mention_context=mention_context,
+            confidence=confidence,
        )

    def analyze_modality_distribution(self, multimodal_entities: list[MultimodalEntity]) -> dict:
@@ -489,7 +489,7 @@ class MultimodalEntityLinker:
        Returns:
            模态分布统计
        """
-        distribution  = {mod: 0 for mod in self.MODALITIES}
+        distribution = {mod: 0 for mod in self.MODALITIES}

        # 统计每个模态的实体数
        for me in multimodal_entities:
@@ -497,13 +497,13 @@ class MultimodalEntityLinker:
                distribution[me.source_type] += 1

        # 统计跨模态实体
-        entity_modalities  = {}
+        entity_modalities = {}
        for me in multimodal_entities:
            if me.entity_id not in entity_modalities:
-                entity_modalities[me.entity_id]  = set()
+                entity_modalities[me.entity_id] = set()
            entity_modalities[me.entity_id].add(me.source_type)

-        cross_modal_count  = sum(1 for mods in entity_modalities.values() if len(mods) > 1)
+        cross_modal_count = sum(1 for mods in entity_modalities.values() if len(mods) > 1)

        return {
            "modality_distribution": distribution,
@@ -517,12 +517,12 @@ class MultimodalEntityLinker:


 # Singleton instance
-_multimodal_entity_linker  = None
+_multimodal_entity_linker = None


-def get_multimodal_entity_linker(similarity_threshold: float  = 0.85) -> MultimodalEntityLinker:
+def get_multimodal_entity_linker(similarity_threshold: float = 0.85) -> MultimodalEntityLinker:
    """获取多模态实体关联器单例"""
    global _multimodal_entity_linker
    if _multimodal_entity_linker is None:
-        _multimodal_entity_linker  = MultimodalEntityLinker(similarity_threshold)
+        _multimodal_entity_linker = MultimodalEntityLinker(similarity_threshold)
    return _multimodal_entity_linker