fix: auto-fix code issues (cron)
- 修复重复导入/字段
- 修复异常处理
- 修复PEP8格式问题
- 修复语法错误(运算符空格问题)
- 修复类型注解格式
This commit is contained in:
@@ -9,13 +9,13 @@ from dataclasses import dataclass
|
||||
from difflib import SequenceMatcher
|
||||
|
||||
# Constants
|
||||
UUID_LENGTH = 8 # UUID 截断长度
|
||||
UUID_LENGTH = 8 # UUID 截断长度
|
||||
|
||||
# 尝试导入embedding库
|
||||
try:
|
||||
NUMPY_AVAILABLE = True
|
||||
NUMPY_AVAILABLE = True
|
||||
except ImportError:
|
||||
NUMPY_AVAILABLE = False
|
||||
NUMPY_AVAILABLE = False
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -30,11 +30,11 @@ class MultimodalEntity:
|
||||
source_id: str
|
||||
mention_context: str
|
||||
confidence: float
|
||||
modality_features: dict = None # 模态特定特征
|
||||
modality_features: dict = None # 模态特定特征
|
||||
|
||||
def __post_init__(self):
|
||||
def __post_init__(self) -> None:
|
||||
if self.modality_features is None:
|
||||
self.modality_features = {}
|
||||
self.modality_features = {}
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -78,7 +78,7 @@ class MultimodalEntityLinker:
|
||||
"""多模态实体关联器 - 跨模态实体对齐和知识融合"""
|
||||
|
||||
# 关联类型
|
||||
LINK_TYPES = {
|
||||
LINK_TYPES = {
|
||||
"same_as": "同一实体",
|
||||
"related_to": "相关实体",
|
||||
"part_of": "组成部分",
|
||||
@@ -86,16 +86,16 @@ class MultimodalEntityLinker:
|
||||
}
|
||||
|
||||
# 模态类型
|
||||
MODALITIES = ["audio", "video", "image", "document"]
|
||||
MODALITIES = ["audio", "video", "image", "document"]
|
||||
|
||||
def __init__(self, similarity_threshold: float = 0.85) -> None:
|
||||
def __init__(self, similarity_threshold: float = 0.85) -> None:
|
||||
"""
|
||||
初始化多模态实体关联器
|
||||
|
||||
Args:
|
||||
similarity_threshold: 相似度阈值
|
||||
"""
|
||||
self.similarity_threshold = similarity_threshold
|
||||
self.similarity_threshold = similarity_threshold
|
||||
|
||||
def calculate_string_similarity(self, s1: str, s2: str) -> float:
|
||||
"""
|
||||
@@ -111,7 +111,7 @@ class MultimodalEntityLinker:
|
||||
if not s1 or not s2:
|
||||
return 0.0
|
||||
|
||||
s1, s2 = s1.lower().strip(), s2.lower().strip()
|
||||
s1, s2 = s1.lower().strip(), s2.lower().strip()
|
||||
|
||||
# 完全匹配
|
||||
if s1 == s2:
|
||||
@@ -136,7 +136,7 @@ class MultimodalEntityLinker:
|
||||
(相似度, 匹配类型)
|
||||
"""
|
||||
# 名称相似度
|
||||
name_sim = self.calculate_string_similarity(
|
||||
name_sim = self.calculate_string_similarity(
|
||||
entity1.get("name", ""), entity2.get("name", "")
|
||||
)
|
||||
|
||||
@@ -145,8 +145,8 @@ class MultimodalEntityLinker:
|
||||
return 1.0, "exact"
|
||||
|
||||
# 检查别名
|
||||
aliases1 = set(a.lower() for a in entity1.get("aliases", []))
|
||||
aliases2 = set(a.lower() for a in entity2.get("aliases", []))
|
||||
aliases1 = set(a.lower() for a in entity1.get("aliases", []))
|
||||
aliases2 = set(a.lower() for a in entity2.get("aliases", []))
|
||||
|
||||
if aliases1 & aliases2: # 有共同别名
|
||||
return 0.95, "alias_match"
|
||||
@@ -157,12 +157,12 @@ class MultimodalEntityLinker:
|
||||
return 0.95, "alias_match"
|
||||
|
||||
# 定义相似度
|
||||
def_sim = self.calculate_string_similarity(
|
||||
def_sim = self.calculate_string_similarity(
|
||||
entity1.get("definition", ""), entity2.get("definition", "")
|
||||
)
|
||||
|
||||
# 综合相似度
|
||||
combined_sim = name_sim * 0.7 + def_sim * 0.3
|
||||
combined_sim = name_sim * 0.7 + def_sim * 0.3
|
||||
|
||||
if combined_sim >= self.similarity_threshold:
|
||||
return combined_sim, "fuzzy"
|
||||
@@ -170,7 +170,7 @@ class MultimodalEntityLinker:
|
||||
return combined_sim, "none"
|
||||
|
||||
def find_matching_entity(
|
||||
self, query_entity: dict, candidate_entities: list[dict], exclude_ids: set[str] = None
|
||||
self, query_entity: dict, candidate_entities: list[dict], exclude_ids: set[str] = None
|
||||
) -> AlignmentResult | None:
|
||||
"""
|
||||
在候选实体中查找匹配的实体
|
||||
@@ -183,28 +183,28 @@ class MultimodalEntityLinker:
|
||||
Returns:
|
||||
对齐结果
|
||||
"""
|
||||
exclude_ids = exclude_ids or set()
|
||||
best_match = None
|
||||
best_similarity = 0.0
|
||||
exclude_ids = exclude_ids or set()
|
||||
best_match = None
|
||||
best_similarity = 0.0
|
||||
|
||||
for candidate in candidate_entities:
|
||||
if candidate.get("id") in exclude_ids:
|
||||
continue
|
||||
|
||||
similarity, match_type = self.calculate_entity_similarity(query_entity, candidate)
|
||||
similarity, match_type = self.calculate_entity_similarity(query_entity, candidate)
|
||||
|
||||
if similarity > best_similarity and similarity >= self.similarity_threshold:
|
||||
best_similarity = similarity
|
||||
best_match = candidate
|
||||
best_match_type = match_type
|
||||
best_similarity = similarity
|
||||
best_match = candidate
|
||||
best_match_type = match_type
|
||||
|
||||
if best_match:
|
||||
return AlignmentResult(
|
||||
entity_id=query_entity.get("id"),
|
||||
matched_entity_id=best_match.get("id"),
|
||||
similarity=best_similarity,
|
||||
match_type=best_match_type,
|
||||
confidence=best_similarity,
|
||||
entity_id = query_entity.get("id"),
|
||||
matched_entity_id = best_match.get("id"),
|
||||
similarity = best_similarity,
|
||||
match_type = best_match_type,
|
||||
confidence = best_similarity,
|
||||
)
|
||||
|
||||
return None
|
||||
@@ -230,10 +230,10 @@ class MultimodalEntityLinker:
|
||||
Returns:
|
||||
实体关联列表
|
||||
"""
|
||||
links = []
|
||||
links = []
|
||||
|
||||
# 合并所有实体
|
||||
all_entities = {
|
||||
all_entities = {
|
||||
"audio": audio_entities,
|
||||
"video": video_entities,
|
||||
"image": image_entities,
|
||||
@@ -246,24 +246,24 @@ class MultimodalEntityLinker:
|
||||
if mod1 >= mod2: # 避免重复比较
|
||||
continue
|
||||
|
||||
entities1 = all_entities.get(mod1, [])
|
||||
entities2 = all_entities.get(mod2, [])
|
||||
entities1 = all_entities.get(mod1, [])
|
||||
entities2 = all_entities.get(mod2, [])
|
||||
|
||||
for ent1 in entities1:
|
||||
# 在另一个模态中查找匹配
|
||||
result = self.find_matching_entity(ent1, entities2)
|
||||
result = self.find_matching_entity(ent1, entities2)
|
||||
|
||||
if result and result.matched_entity_id:
|
||||
link = EntityLink(
|
||||
id=str(uuid.uuid4())[:UUID_LENGTH],
|
||||
project_id=project_id,
|
||||
source_entity_id=ent1.get("id"),
|
||||
target_entity_id=result.matched_entity_id,
|
||||
link_type="same_as" if result.similarity > 0.95 else "related_to",
|
||||
source_modality=mod1,
|
||||
target_modality=mod2,
|
||||
confidence=result.confidence,
|
||||
evidence=f"Cross-modal alignment: {result.match_type}",
|
||||
link = EntityLink(
|
||||
id = str(uuid.uuid4())[:UUID_LENGTH],
|
||||
project_id = project_id,
|
||||
source_entity_id = ent1.get("id"),
|
||||
target_entity_id = result.matched_entity_id,
|
||||
link_type = "same_as" if result.similarity > 0.95 else "related_to",
|
||||
source_modality = mod1,
|
||||
target_modality = mod2,
|
||||
confidence = result.confidence,
|
||||
evidence = f"Cross-modal alignment: {result.match_type}",
|
||||
)
|
||||
links.append(link)
|
||||
|
||||
@@ -284,7 +284,7 @@ class MultimodalEntityLinker:
|
||||
融合结果
|
||||
"""
|
||||
# 收集所有属性
|
||||
fused_properties = {
|
||||
fused_properties = {
|
||||
"names": set(),
|
||||
"definitions": [],
|
||||
"aliases": set(),
|
||||
@@ -293,7 +293,7 @@ class MultimodalEntityLinker:
|
||||
"contexts": [],
|
||||
}
|
||||
|
||||
merged_ids = []
|
||||
merged_ids = []
|
||||
|
||||
for entity in linked_entities:
|
||||
merged_ids.append(entity.get("id"))
|
||||
@@ -318,21 +318,21 @@ class MultimodalEntityLinker:
|
||||
fused_properties["contexts"].append(mention.get("mention_context"))
|
||||
|
||||
# 选择最佳定义(最长的那个)
|
||||
best_definition = (
|
||||
max(fused_properties["definitions"], key=len) if fused_properties["definitions"] else ""
|
||||
best_definition = (
|
||||
max(fused_properties["definitions"], key = len) if fused_properties["definitions"] else ""
|
||||
)
|
||||
|
||||
# 选择最佳名称(最常见的那个)
|
||||
from collections import Counter
|
||||
|
||||
name_counts = Counter(fused_properties["names"])
|
||||
best_name = name_counts.most_common(1)[0][0] if name_counts else ""
|
||||
name_counts = Counter(fused_properties["names"])
|
||||
best_name = name_counts.most_common(1)[0][0] if name_counts else ""
|
||||
|
||||
# 构建融合结果
|
||||
return FusionResult(
|
||||
canonical_entity_id=entity_id,
|
||||
merged_entity_ids=merged_ids,
|
||||
fused_properties={
|
||||
canonical_entity_id = entity_id,
|
||||
merged_entity_ids = merged_ids,
|
||||
fused_properties = {
|
||||
"name": best_name,
|
||||
"definition": best_definition,
|
||||
"aliases": list(fused_properties["aliases"]),
|
||||
@@ -340,8 +340,8 @@ class MultimodalEntityLinker:
|
||||
"modalities": list(fused_properties["modalities"]),
|
||||
"contexts": fused_properties["contexts"][:10], # 最多10个上下文
|
||||
},
|
||||
source_modalities=list(fused_properties["modalities"]),
|
||||
confidence=min(1.0, len(linked_entities) * 0.2 + 0.5),
|
||||
source_modalities = list(fused_properties["modalities"]),
|
||||
confidence = min(1.0, len(linked_entities) * 0.2 + 0.5),
|
||||
)
|
||||
|
||||
def detect_entity_conflicts(self, entities: list[dict]) -> list[dict]:
|
||||
@@ -354,30 +354,30 @@ class MultimodalEntityLinker:
|
||||
Returns:
|
||||
冲突列表
|
||||
"""
|
||||
conflicts = []
|
||||
conflicts = []
|
||||
|
||||
# 按名称分组
|
||||
name_groups = {}
|
||||
name_groups = {}
|
||||
for entity in entities:
|
||||
name = entity.get("name", "").lower()
|
||||
name = entity.get("name", "").lower()
|
||||
if name:
|
||||
if name not in name_groups:
|
||||
name_groups[name] = []
|
||||
name_groups[name] = []
|
||||
name_groups[name].append(entity)
|
||||
|
||||
# 检测同名但定义不同的实体
|
||||
for name, group in name_groups.items():
|
||||
if len(group) > 1:
|
||||
# 检查定义是否相似
|
||||
definitions = [e.get("definition", "") for e in group if e.get("definition")]
|
||||
definitions = [e.get("definition", "") for e in group if e.get("definition")]
|
||||
|
||||
if len(definitions) > 1:
|
||||
# 计算定义之间的相似度
|
||||
sim_matrix = []
|
||||
sim_matrix = []
|
||||
for i, d1 in enumerate(definitions):
|
||||
for j, d2 in enumerate(definitions):
|
||||
if i < j:
|
||||
sim = self.calculate_string_similarity(d1, d2)
|
||||
sim = self.calculate_string_similarity(d1, d2)
|
||||
sim_matrix.append(sim)
|
||||
|
||||
# 如果定义相似度都很低,可能是冲突
|
||||
@@ -394,7 +394,7 @@ class MultimodalEntityLinker:
|
||||
return conflicts
|
||||
|
||||
def suggest_entity_merges(
|
||||
self, entities: list[dict], existing_links: list[EntityLink] = None
|
||||
self, entities: list[dict], existing_links: list[EntityLink] = None
|
||||
) -> list[dict]:
|
||||
"""
|
||||
建议实体合并
|
||||
@@ -406,13 +406,13 @@ class MultimodalEntityLinker:
|
||||
Returns:
|
||||
合并建议列表
|
||||
"""
|
||||
suggestions = []
|
||||
existing_pairs = set()
|
||||
suggestions = []
|
||||
existing_pairs = set()
|
||||
|
||||
# 记录已有的关联
|
||||
if existing_links:
|
||||
for link in existing_links:
|
||||
pair = tuple(sorted([link.source_entity_id, link.target_entity_id]))
|
||||
pair = tuple(sorted([link.source_entity_id, link.target_entity_id]))
|
||||
existing_pairs.add(pair)
|
||||
|
||||
# 检查所有实体对
|
||||
@@ -422,12 +422,12 @@ class MultimodalEntityLinker:
|
||||
continue
|
||||
|
||||
# 检查是否已有关联
|
||||
pair = tuple(sorted([ent1.get("id"), ent2.get("id")]))
|
||||
pair = tuple(sorted([ent1.get("id"), ent2.get("id")]))
|
||||
if pair in existing_pairs:
|
||||
continue
|
||||
|
||||
# 计算相似度
|
||||
similarity, match_type = self.calculate_entity_similarity(ent1, ent2)
|
||||
similarity, match_type = self.calculate_entity_similarity(ent1, ent2)
|
||||
|
||||
if similarity >= self.similarity_threshold:
|
||||
suggestions.append(
|
||||
@@ -441,7 +441,7 @@ class MultimodalEntityLinker:
|
||||
)
|
||||
|
||||
# 按相似度排序
|
||||
suggestions.sort(key=lambda x: x["similarity"], reverse=True)
|
||||
suggestions.sort(key = lambda x: x["similarity"], reverse = True)
|
||||
|
||||
return suggestions
|
||||
|
||||
@@ -451,8 +451,8 @@ class MultimodalEntityLinker:
|
||||
entity_id: str,
|
||||
source_type: str,
|
||||
source_id: str,
|
||||
mention_context: str = "",
|
||||
confidence: float = 1.0,
|
||||
mention_context: str = "",
|
||||
confidence: float = 1.0,
|
||||
) -> MultimodalEntity:
|
||||
"""
|
||||
创建多模态实体记录
|
||||
@@ -469,14 +469,14 @@ class MultimodalEntityLinker:
|
||||
多模态实体记录
|
||||
"""
|
||||
return MultimodalEntity(
|
||||
id=str(uuid.uuid4())[:UUID_LENGTH],
|
||||
entity_id=entity_id,
|
||||
project_id=project_id,
|
||||
name="", # 将在后续填充
|
||||
source_type=source_type,
|
||||
source_id=source_id,
|
||||
mention_context=mention_context,
|
||||
confidence=confidence,
|
||||
id = str(uuid.uuid4())[:UUID_LENGTH],
|
||||
entity_id = entity_id,
|
||||
project_id = project_id,
|
||||
name = "", # 将在后续填充
|
||||
source_type = source_type,
|
||||
source_id = source_id,
|
||||
mention_context = mention_context,
|
||||
confidence = confidence,
|
||||
)
|
||||
|
||||
def analyze_modality_distribution(self, multimodal_entities: list[MultimodalEntity]) -> dict:
|
||||
@@ -489,7 +489,7 @@ class MultimodalEntityLinker:
|
||||
Returns:
|
||||
模态分布统计
|
||||
"""
|
||||
distribution = {mod: 0 for mod in self.MODALITIES}
|
||||
distribution = {mod: 0 for mod in self.MODALITIES}
|
||||
|
||||
# 统计每个模态的实体数
|
||||
for me in multimodal_entities:
|
||||
@@ -497,13 +497,13 @@ class MultimodalEntityLinker:
|
||||
distribution[me.source_type] += 1
|
||||
|
||||
# 统计跨模态实体
|
||||
entity_modalities = {}
|
||||
entity_modalities = {}
|
||||
for me in multimodal_entities:
|
||||
if me.entity_id not in entity_modalities:
|
||||
entity_modalities[me.entity_id] = set()
|
||||
entity_modalities[me.entity_id] = set()
|
||||
entity_modalities[me.entity_id].add(me.source_type)
|
||||
|
||||
cross_modal_count = sum(1 for mods in entity_modalities.values() if len(mods) > 1)
|
||||
cross_modal_count = sum(1 for mods in entity_modalities.values() if len(mods) > 1)
|
||||
|
||||
return {
|
||||
"modality_distribution": distribution,
|
||||
@@ -517,12 +517,12 @@ class MultimodalEntityLinker:
|
||||
|
||||
|
||||
# Singleton instance
|
||||
_multimodal_entity_linker = None
|
||||
_multimodal_entity_linker = None
|
||||
|
||||
|
||||
def get_multimodal_entity_linker(similarity_threshold: float = 0.85) -> MultimodalEntityLinker:
|
||||
def get_multimodal_entity_linker(similarity_threshold: float = 0.85) -> MultimodalEntityLinker:
|
||||
"""获取多模态实体关联器单例"""
|
||||
global _multimodal_entity_linker
|
||||
if _multimodal_entity_linker is None:
|
||||
_multimodal_entity_linker = MultimodalEntityLinker(similarity_threshold)
|
||||
_multimodal_entity_linker = MultimodalEntityLinker(similarity_threshold)
|
||||
return _multimodal_entity_linker
|
||||
|
||||
Reference in New Issue
Block a user