fix: auto-fix code issues (cron)
- 修复重复导入/字段
- 修复异常处理
- 修复 PEP8 格式问题
- 修复语法错误(运算符空格问题)
- 修复类型注解格式
This commit is contained in:
@@ -12,8 +12,8 @@ import httpx
|
||||
import numpy as np
|
||||
|
||||
# API Keys
|
||||
KIMI_API_KEY = os.getenv("KIMI_API_KEY", "")
|
||||
KIMI_BASE_URL = os.getenv("KIMI_BASE_URL", "https://api.kimi.com/coding")
|
||||
KIMI_API_KEY = os.getenv("KIMI_API_KEY", "")
|
||||
KIMI_BASE_URL = os.getenv("KIMI_BASE_URL", "https://api.kimi.com/coding")
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -27,9 +27,9 @@ class EntityEmbedding:
|
||||
class EntityAligner:
|
||||
"""实体对齐器 - 使用 embedding 进行相似度匹配"""
|
||||
|
||||
def __init__(self, similarity_threshold: float = 0.85):
|
||||
self.similarity_threshold = similarity_threshold
|
||||
self.embedding_cache: dict[str, list[float]] = {}
|
||||
def __init__(self, similarity_threshold: float = 0.85) -> None:
|
||||
self.similarity_threshold = similarity_threshold
|
||||
self.embedding_cache: dict[str, list[float]] = {}
|
||||
|
||||
def get_embedding(self, text: str) -> list[float] | None:
|
||||
"""
|
||||
@@ -45,25 +45,25 @@ class EntityAligner:
|
||||
return None
|
||||
|
||||
# 检查缓存
|
||||
cache_key = hash(text)
|
||||
cache_key = hash(text)
|
||||
if cache_key in self.embedding_cache:
|
||||
return self.embedding_cache[cache_key]
|
||||
|
||||
try:
|
||||
response = httpx.post(
|
||||
response = httpx.post(
|
||||
f"{KIMI_BASE_URL}/v1/embeddings",
|
||||
headers={
|
||||
headers = {
|
||||
"Authorization": f"Bearer {KIMI_API_KEY}",
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
json={"model": "k2p5", "input": text[:500]}, # 限制长度
|
||||
timeout=30.0,
|
||||
json = {"model": "k2p5", "input": text[:500]}, # 限制长度
|
||||
timeout = 30.0,
|
||||
)
|
||||
response.raise_for_status()
|
||||
result = response.json()
|
||||
result = response.json()
|
||||
|
||||
embedding = result["data"][0]["embedding"]
|
||||
self.embedding_cache[cache_key] = embedding
|
||||
embedding = result["data"][0]["embedding"]
|
||||
self.embedding_cache[cache_key] = embedding
|
||||
return embedding
|
||||
|
||||
except (httpx.HTTPError, json.JSONDecodeError, KeyError) as e:
|
||||
@@ -81,20 +81,20 @@ class EntityAligner:
|
||||
Returns:
|
||||
相似度分数 (0-1)
|
||||
"""
|
||||
vec1 = np.array(embedding1)
|
||||
vec2 = np.array(embedding2)
|
||||
vec1 = np.array(embedding1)
|
||||
vec2 = np.array(embedding2)
|
||||
|
||||
# 余弦相似度
|
||||
dot_product = np.dot(vec1, vec2)
|
||||
norm1 = np.linalg.norm(vec1)
|
||||
norm2 = np.linalg.norm(vec2)
|
||||
dot_product = np.dot(vec1, vec2)
|
||||
norm1 = np.linalg.norm(vec1)
|
||||
norm2 = np.linalg.norm(vec2)
|
||||
|
||||
if norm1 == 0 or norm2 == 0:
|
||||
return 0.0
|
||||
|
||||
return float(dot_product / (norm1 * norm2))
|
||||
|
||||
def get_entity_text(self, name: str, definition: str = "") -> str:
|
||||
def get_entity_text(self, name: str, definition: str = "") -> str:
|
||||
"""
|
||||
构建用于 embedding 的实体文本
|
||||
|
||||
@@ -113,9 +113,9 @@ class EntityAligner:
|
||||
self,
|
||||
project_id: str,
|
||||
name: str,
|
||||
definition: str = "",
|
||||
exclude_id: str | None = None,
|
||||
threshold: float | None = None,
|
||||
definition: str = "",
|
||||
exclude_id: str | None = None,
|
||||
threshold: float | None = None,
|
||||
) -> object | None:
|
||||
"""
|
||||
查找相似的实体
|
||||
@@ -131,54 +131,54 @@ class EntityAligner:
|
||||
相似的实体或 None
|
||||
"""
|
||||
if threshold is None:
|
||||
threshold = self.similarity_threshold
|
||||
threshold = self.similarity_threshold
|
||||
|
||||
try:
|
||||
from db_manager import get_db_manager
|
||||
|
||||
db = get_db_manager()
|
||||
db = get_db_manager()
|
||||
except ImportError:
|
||||
return None
|
||||
|
||||
# 获取项目的所有实体
|
||||
entities = db.get_all_entities_for_embedding(project_id)
|
||||
entities = db.get_all_entities_for_embedding(project_id)
|
||||
|
||||
if not entities:
|
||||
return None
|
||||
|
||||
# 获取查询实体的 embedding
|
||||
query_text = self.get_entity_text(name, definition)
|
||||
query_embedding = self.get_embedding(query_text)
|
||||
query_text = self.get_entity_text(name, definition)
|
||||
query_embedding = self.get_embedding(query_text)
|
||||
|
||||
if query_embedding is None:
|
||||
# 如果 embedding API 失败,回退到简单匹配
|
||||
return self._fallback_similarity_match(entities, name, exclude_id)
|
||||
|
||||
best_match = None
|
||||
best_score = threshold
|
||||
best_match = None
|
||||
best_score = threshold
|
||||
|
||||
for entity in entities:
|
||||
if exclude_id and entity.id == exclude_id:
|
||||
continue
|
||||
|
||||
# 获取实体的 embedding
|
||||
entity_text = self.get_entity_text(entity.name, entity.definition)
|
||||
entity_embedding = self.get_embedding(entity_text)
|
||||
entity_text = self.get_entity_text(entity.name, entity.definition)
|
||||
entity_embedding = self.get_embedding(entity_text)
|
||||
|
||||
if entity_embedding is None:
|
||||
continue
|
||||
|
||||
# 计算相似度
|
||||
similarity = self.compute_similarity(query_embedding, entity_embedding)
|
||||
similarity = self.compute_similarity(query_embedding, entity_embedding)
|
||||
|
||||
if similarity > best_score:
|
||||
best_score = similarity
|
||||
best_match = entity
|
||||
best_score = similarity
|
||||
best_match = entity
|
||||
|
||||
return best_match
|
||||
|
||||
def _fallback_similarity_match(
|
||||
self, entities: list[object], name: str, exclude_id: str | None = None
|
||||
self, entities: list[object], name: str, exclude_id: str | None = None
|
||||
) -> object | None:
|
||||
"""
|
||||
回退到简单的相似度匹配(不使用 embedding)
|
||||
@@ -191,7 +191,7 @@ class EntityAligner:
|
||||
Returns:
|
||||
最相似的实体或 None
|
||||
"""
|
||||
name_lower = name.lower()
|
||||
name_lower = name.lower()
|
||||
|
||||
# 1. 精确匹配
|
||||
for entity in entities:
|
||||
@@ -212,7 +212,7 @@ class EntityAligner:
|
||||
return None
|
||||
|
||||
def batch_align_entities(
|
||||
self, project_id: str, new_entities: list[dict], threshold: float | None = None
|
||||
self, project_id: str, new_entities: list[dict], threshold: float | None = None
|
||||
) -> list[dict]:
|
||||
"""
|
||||
批量对齐实体
|
||||
@@ -226,16 +226,16 @@ class EntityAligner:
|
||||
对齐结果列表 [{"new_entity": {...}, "matched_entity": {...}, "similarity": 0.9}]
|
||||
"""
|
||||
if threshold is None:
|
||||
threshold = self.similarity_threshold
|
||||
threshold = self.similarity_threshold
|
||||
|
||||
results = []
|
||||
results = []
|
||||
|
||||
for new_ent in new_entities:
|
||||
matched = self.find_similar_entity(
|
||||
project_id, new_ent["name"], new_ent.get("definition", ""), threshold=threshold
|
||||
matched = self.find_similar_entity(
|
||||
project_id, new_ent["name"], new_ent.get("definition", ""), threshold = threshold
|
||||
)
|
||||
|
||||
result = {
|
||||
result = {
|
||||
"new_entity": new_ent,
|
||||
"matched_entity": None,
|
||||
"similarity": 0.0,
|
||||
@@ -244,28 +244,28 @@ class EntityAligner:
|
||||
|
||||
if matched:
|
||||
# 计算相似度
|
||||
query_text = self.get_entity_text(new_ent["name"], new_ent.get("definition", ""))
|
||||
matched_text = self.get_entity_text(matched.name, matched.definition)
|
||||
query_text = self.get_entity_text(new_ent["name"], new_ent.get("definition", ""))
|
||||
matched_text = self.get_entity_text(matched.name, matched.definition)
|
||||
|
||||
query_emb = self.get_embedding(query_text)
|
||||
matched_emb = self.get_embedding(matched_text)
|
||||
query_emb = self.get_embedding(query_text)
|
||||
matched_emb = self.get_embedding(matched_text)
|
||||
|
||||
if query_emb and matched_emb:
|
||||
similarity = self.compute_similarity(query_emb, matched_emb)
|
||||
result["matched_entity"] = {
|
||||
similarity = self.compute_similarity(query_emb, matched_emb)
|
||||
result["matched_entity"] = {
|
||||
"id": matched.id,
|
||||
"name": matched.name,
|
||||
"type": matched.type,
|
||||
"definition": matched.definition,
|
||||
}
|
||||
result["similarity"] = similarity
|
||||
result["should_merge"] = similarity >= threshold
|
||||
result["similarity"] = similarity
|
||||
result["should_merge"] = similarity >= threshold
|
||||
|
||||
results.append(result)
|
||||
|
||||
return results
|
||||
|
||||
def suggest_entity_aliases(self, entity_name: str, entity_definition: str = "") -> list[str]:
|
||||
def suggest_entity_aliases(self, entity_name: str, entity_definition: str = "") -> list[str]:
|
||||
"""
|
||||
使用 LLM 建议实体的别名
|
||||
|
||||
@@ -279,7 +279,7 @@ class EntityAligner:
|
||||
if not KIMI_API_KEY:
|
||||
return []
|
||||
|
||||
prompt = f"""为以下实体生成可能的别名或简称:
|
||||
prompt = f"""为以下实体生成可能的别名或简称:
|
||||
|
||||
实体名称:{entity_name}
|
||||
定义:{entity_definition}
|
||||
@@ -290,28 +290,28 @@ class EntityAligner:
|
||||
只返回 JSON,不要其他内容。"""
|
||||
|
||||
try:
|
||||
response = httpx.post(
|
||||
response = httpx.post(
|
||||
f"{KIMI_BASE_URL}/v1/chat/completions",
|
||||
headers={
|
||||
headers = {
|
||||
"Authorization": f"Bearer {KIMI_API_KEY}",
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
json={
|
||||
json = {
|
||||
"model": "k2p5",
|
||||
"messages": [{"role": "user", "content": prompt}],
|
||||
"temperature": 0.3,
|
||||
},
|
||||
timeout=30.0,
|
||||
timeout = 30.0,
|
||||
)
|
||||
response.raise_for_status()
|
||||
result = response.json()
|
||||
content = result["choices"][0]["message"]["content"]
|
||||
result = response.json()
|
||||
content = result["choices"][0]["message"]["content"]
|
||||
|
||||
import re
|
||||
|
||||
json_match = re.search(r"\{{.*?\}}", content, re.DOTALL)
|
||||
json_match = re.search(r"\{{.*?\}}", content, re.DOTALL)
|
||||
if json_match:
|
||||
data = json.loads(json_match.group())
|
||||
data = json.loads(json_match.group())
|
||||
return data.get("aliases", [])
|
||||
except (httpx.HTTPError, json.JSONDecodeError, KeyError) as e:
|
||||
print(f"Alias suggestion failed: {e}")
|
||||
@@ -340,8 +340,8 @@ def simple_similarity(str1: str, str2: str) -> float:
|
||||
return 0.0
|
||||
|
||||
# 转换为小写
|
||||
s1 = str1.lower()
|
||||
s2 = str2.lower()
|
||||
s1 = str1.lower()
|
||||
s2 = str2.lower()
|
||||
|
||||
# 包含关系
|
||||
if s1 in s2 or s2 in s1:
|
||||
@@ -355,11 +355,11 @@ def simple_similarity(str1: str, str2: str) -> float:
|
||||
|
||||
if __name__ == "__main__":
|
||||
# 测试
|
||||
aligner = EntityAligner()
|
||||
aligner = EntityAligner()
|
||||
|
||||
# 测试 embedding
|
||||
test_text = "Kubernetes 容器编排平台"
|
||||
embedding = aligner.get_embedding(test_text)
|
||||
test_text = "Kubernetes 容器编排平台"
|
||||
embedding = aligner.get_embedding(test_text)
|
||||
if embedding:
|
||||
print(f"Embedding dimension: {len(embedding)}")
|
||||
print(f"First 5 values: {embedding[:5]}")
|
||||
@@ -367,7 +367,7 @@ if __name__ == "__main__":
|
||||
print("Embedding API not available")
|
||||
|
||||
# 测试相似度计算
|
||||
emb1 = [1.0, 0.0, 0.0]
|
||||
emb2 = [0.9, 0.1, 0.0]
|
||||
sim = aligner.compute_similarity(emb1, emb2)
|
||||
emb1 = [1.0, 0.0, 0.0]
|
||||
emb2 = [0.9, 0.1, 0.0]
|
||||
sim = aligner.compute_similarity(emb1, emb2)
|
||||
print(f"Similarity: {sim:.4f}")
|
||||
|
||||
Reference in New Issue
Block a user