Files
insightflow/backend/test_phase8_task8.py
AutoFix Bot e23f1fec08 fix: auto-fix code issues (cron)
- 修复重复导入/字段
- 修复异常处理
- 修复PEP8格式问题
- 修复语法错误(运算符空格问题)
- 修复类型注解格式
2026-03-02 06:09:49 +08:00

734 lines
28 KiB
Python

#!/usr/bin/env python3
"""
InsightFlow Phase 8 Task 8: Operations & Monitoring Test Script
运维与监控模块测试脚本
测试内容:
1. 实时告警系统(告警规则、告警渠道、告警触发、抑制聚合)
2. 容量规划与自动扩缩容
3. 灾备与故障转移
4. 成本优化
"""
import json
import os
import random
import sys
from datetime import datetime, timedelta

# Put this script's directory on sys.path *before* importing ops_manager.
# The insert only helps the import below if it has already run, e.g. when
# this module is loaded by a runner whose sys.path does not include the
# backend directory.
backend_dir = os.path.dirname(os.path.abspath(__file__))
if backend_dir not in sys.path:
    sys.path.insert(0, backend_dir)

from ops_manager import (  # noqa: E402  (must follow the sys.path setup)
    AlertChannelType,
    AlertRuleType,
    AlertSeverity,
    AlertStatus,
    ResourceType,
    get_ops_manager,
)
class TestOpsManager:
    """Scripted integration checks for the operations & monitoring manager.

    Every ``test_*`` method drives one feature area of the manager returned
    by ``get_ops_manager()``, reports each step through :meth:`log`, and
    removes the rows it created for ``self.tenant_id`` in a ``finally``
    block, so a failed step does not leave data behind for the next run.
    """

    def __init__(self) -> None:
        """Bind the shared ops manager and start with an empty result log."""
        self.manager = get_ops_manager()
        self.tenant_id = "test_tenant_001"
        # (message, success) pairs, in the order they were logged.
        self.test_results = []

    def log(self, message: str, success: bool = True) -> None:
        """Print one step result and remember it for the summary.

        Args:
            message: Human-readable description of the step.
            success: Whether the step passed.
        """
        # Distinct markers so passes and failures can be told apart.
        status = "✅" if success else "❌"
        print(f"{status} {message}")
        self.test_results.append((message, success))

    def _execute_sql(self, statements) -> None:
        """Run ``(sql, params)`` pairs on one connection, then commit."""
        with self.manager._get_db() as conn:
            for sql, params in statements:
                conn.execute(sql, params)
            conn.commit()

    def _cleanup(self, done_message: str, action) -> None:
        """Run *action* as best-effort cleanup and log how it went.

        This is called from ``finally`` blocks, so an ordinary exception
        raised by *action* is recorded as a failed step instead of being
        allowed to abort the remaining tests.

        Args:
            done_message: Message logged when the cleanup succeeds.
            action: Zero-argument callable performing the cleanup.
        """
        try:
            action()
        except Exception as e:
            self.log(f"Cleanup failed ({done_message}): {e}", success=False)
        else:
            self.log(done_message)

    def run_all_tests(self) -> None:
        """Run every test group in order, then print the summary."""
        print("=" * 60)
        print("InsightFlow Phase 8 Task 8: Operations & Monitoring Tests")
        print("=" * 60)
        # 1. Alerting
        self.test_alert_rules()
        self.test_alert_channels()
        self.test_alerts()
        # 2. Capacity planning and auto scaling
        self.test_capacity_planning()
        self.test_auto_scaling()
        # 3. Health checks and failover
        self.test_health_checks()
        self.test_failover()
        # 4. Backup and recovery
        self.test_backup()
        # 5. Cost optimization
        self.test_cost_optimization()
        self.print_summary()

    def test_alert_rules(self) -> None:
        """Create, fetch, list, update and delete alert rules."""
        print("\n📋 Testing Alert Rules...")
        # Ids are tracked so the rules can be removed even if a later
        # assertion fails.
        created_rule_ids = []

        def cleanup() -> None:
            for rule_id in created_rule_ids:
                self.manager.delete_alert_rule(rule_id)

        try:
            # Threshold-based rule
            rule1 = self.manager.create_alert_rule(
                tenant_id=self.tenant_id,
                name="CPU 使用率告警",
                description="当 CPU 使用率超过 80% 时触发告警",
                rule_type=AlertRuleType.THRESHOLD,
                severity=AlertSeverity.P1,
                metric="cpu_usage_percent",
                condition=">",
                threshold=80.0,
                duration=300,
                evaluation_interval=60,
                channels=[],
                labels={"service": "api", "team": "platform"},
                annotations={"summary": "CPU 使用率过高", "runbook": "https://wiki/runbooks/cpu"},
                created_by="test_user",
            )
            created_rule_ids.append(rule1.id)
            self.log(f"Created alert rule: {rule1.name} (ID: {rule1.id})")

            # Anomaly-detection rule
            rule2 = self.manager.create_alert_rule(
                tenant_id=self.tenant_id,
                name="内存异常检测",
                description="检测内存使用异常",
                rule_type=AlertRuleType.ANOMALY,
                severity=AlertSeverity.P2,
                metric="memory_usage_percent",
                condition=">",
                threshold=0.0,
                duration=600,
                evaluation_interval=300,
                channels=[],
                labels={"service": "database"},
                annotations={},
                created_by="test_user",
            )
            created_rule_ids.append(rule2.id)
            self.log(f"Created anomaly alert rule: {rule2.name} (ID: {rule2.id})")

            # Fetch a single rule
            fetched_rule = self.manager.get_alert_rule(rule1.id)
            assert fetched_rule is not None
            assert fetched_rule.name == rule1.name
            self.log(f"Fetched alert rule: {fetched_rule.name}")

            # List every rule of the tenant
            rules = self.manager.list_alert_rules(self.tenant_id)
            assert len(rules) >= 2
            self.log(f"Listed {len(rules)} alert rules for tenant")

            # Update a rule
            updated_rule = self.manager.update_alert_rule(
                rule1.id, threshold=85.0, description="更新后的描述"
            )
            assert updated_rule.threshold == 85.0
            self.log(f"Updated alert rule threshold to {updated_rule.threshold}")
        except Exception as e:
            self.log(f"Alert rules test failed: {e}", success=False)
        finally:
            self._cleanup("Deleted test alert rules", cleanup)

    def test_alert_channels(self) -> None:
        """Create Feishu, DingTalk and Slack channels, then fetch and list them."""
        print("\n📢 Testing Alert Channels...")

        def cleanup() -> None:
            # Re-list rather than track ids, so every channel currently
            # listed for the tenant is removed.
            channels = self.manager.list_alert_channels(self.tenant_id)
            self._execute_sql(
                [
                    ("DELETE FROM alert_channels WHERE id = ?", (channel.id,))
                    for channel in channels
                    if channel.tenant_id == self.tenant_id
                ]
            )

        try:
            # Feishu channel
            channel1 = self.manager.create_alert_channel(
                tenant_id=self.tenant_id,
                name="飞书告警",
                channel_type=AlertChannelType.FEISHU,
                config={
                    "webhook_url": "https://open.feishu.cn/open-apis/bot/v2/hook/test",
                    "secret": "test_secret",
                },
                severity_filter=["p0", "p1"],
            )
            self.log(f"Created Feishu channel: {channel1.name} (ID: {channel1.id})")

            # DingTalk channel
            channel2 = self.manager.create_alert_channel(
                tenant_id=self.tenant_id,
                name="钉钉告警",
                channel_type=AlertChannelType.DINGTALK,
                config={
                    # Query string must not contain spaces around "=".
                    "webhook_url": "https://oapi.dingtalk.com/robot/send?access_token=test",
                    "secret": "test_secret",
                },
                severity_filter=["p0", "p1", "p2"],
            )
            self.log(f"Created DingTalk channel: {channel2.name} (ID: {channel2.id})")

            # Slack channel
            channel3 = self.manager.create_alert_channel(
                tenant_id=self.tenant_id,
                name="Slack 告警",
                channel_type=AlertChannelType.SLACK,
                config={"webhook_url": "https://hooks.slack.com/services/test"},
                severity_filter=["p0", "p1", "p2", "p3"],
            )
            self.log(f"Created Slack channel: {channel3.name} (ID: {channel3.id})")

            # Fetch a single channel
            fetched_channel = self.manager.get_alert_channel(channel1.id)
            assert fetched_channel is not None
            assert fetched_channel.name == channel1.name
            self.log(f"Fetched alert channel: {fetched_channel.name}")

            # List every channel of the tenant
            channels = self.manager.list_alert_channels(self.tenant_id)
            assert len(channels) >= 3
            self.log(f"Listed {len(channels)} alert channels for tenant")
        except Exception as e:
            self.log(f"Alert channels test failed: {e}", success=False)
        finally:
            self._cleanup("Deleted test alert channels", cleanup)

    def test_alerts(self) -> None:
        """Insert an alert by hand, then list, acknowledge and resolve it."""
        print("\n🚨 Testing Alerts...")
        rule = None
        alert_id = None

        def cleanup() -> None:
            if rule is not None:
                self.manager.delete_alert_rule(rule.id)
            statements = []
            if alert_id is not None:
                statements.append(("DELETE FROM alerts WHERE id = ?", (alert_id,)))
            statements.append(
                ("DELETE FROM resource_metrics WHERE tenant_id = ?", (self.tenant_id,))
            )
            self._execute_sql(statements)

        try:
            # Rule the alert will reference
            rule = self.manager.create_alert_rule(
                tenant_id=self.tenant_id,
                name="测试告警规则",
                description="用于测试的告警规则",
                rule_type=AlertRuleType.THRESHOLD,
                severity=AlertSeverity.P1,
                metric="test_metric",
                condition=">",
                threshold=100.0,
                duration=60,
                evaluation_interval=60,
                channels=[],
                labels={},
                annotations={},
                created_by="test_user",
            )

            # Record resource metrics above the rule threshold
            for i in range(10):
                self.manager.record_resource_metric(
                    tenant_id=self.tenant_id,
                    resource_type=ResourceType.CPU,
                    resource_id="server-001",
                    metric_name="test_metric",
                    metric_value=110.0 + i,
                    unit="percent",
                    metadata={"region": "cn-north-1"},
                )
            self.log("Recorded 10 resource metrics")

            # Build an alert manually and insert it straight into the table
            from ops_manager import Alert

            alert_id = f"test_alert_{datetime.now().strftime('%Y%m%d%H%M%S')}"
            now = datetime.now().isoformat()
            alert = Alert(
                id=alert_id,
                rule_id=rule.id,
                tenant_id=self.tenant_id,
                severity=AlertSeverity.P1,
                status=AlertStatus.FIRING,
                title="测试告警",
                description="这是一条测试告警",
                metric="test_metric",
                value=120.0,
                threshold=100.0,
                labels={"test": "true"},
                annotations={},
                started_at=now,
                resolved_at=None,
                acknowledged_by=None,
                acknowledged_at=None,
                notification_sent={},
                suppression_count=0,
            )
            self._execute_sql(
                [
                    (
                        """
                        INSERT INTO alerts
                        (id, rule_id, tenant_id, severity, status, title, description,
                        metric, value, threshold, labels, annotations, started_at, notification_sent, suppression_count)
                        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                        """,
                        (
                            alert.id,
                            alert.rule_id,
                            alert.tenant_id,
                            alert.severity.value,
                            alert.status.value,
                            alert.title,
                            alert.description,
                            alert.metric,
                            alert.value,
                            alert.threshold,
                            json.dumps(alert.labels),
                            json.dumps(alert.annotations),
                            alert.started_at,
                            json.dumps(alert.notification_sent),
                            alert.suppression_count,
                        ),
                    )
                ]
            )
            self.log(f"Created test alert: {alert.id}")

            # List the tenant's alerts
            alerts = self.manager.list_alerts(self.tenant_id)
            assert len(alerts) >= 1
            self.log(f"Listed {len(alerts)} alerts for tenant")

            # Acknowledge
            self.manager.acknowledge_alert(alert_id, "test_user")
            fetched_alert = self.manager.get_alert(alert_id)
            assert fetched_alert.status == AlertStatus.ACKNOWLEDGED
            assert fetched_alert.acknowledged_by == "test_user"
            self.log(f"Acknowledged alert: {alert_id}")

            # Resolve
            self.manager.resolve_alert(alert_id)
            fetched_alert = self.manager.get_alert(alert_id)
            assert fetched_alert.status == AlertStatus.RESOLVED
            assert fetched_alert.resolved_at is not None
            self.log(f"Resolved alert: {alert_id}")
        except Exception as e:
            self.log(f"Alerts test failed: {e}", success=False)
        finally:
            self._cleanup("Cleaned up test data", cleanup)

    def test_capacity_planning(self) -> None:
        """Seed 30 days of CPU metrics, then create and list a capacity plan."""
        print("\n📊 Testing Capacity Planning...")
        try:
            # Seed one synthetic CPU sample per day for the last 30 days,
            # using a single connection for all inserts.
            base_time = datetime.now() - timedelta(days=30)
            self._execute_sql(
                [
                    (
                        """
                        INSERT INTO resource_metrics
                        (id, tenant_id, resource_type, resource_id, metric_name, metric_value, unit, timestamp)
                        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                        """,
                        (
                            f"cm_{i}",
                            self.tenant_id,
                            ResourceType.CPU.value,
                            "server-001",
                            "cpu_usage_percent",
                            50.0 + random.random() * 30,
                            "percent",
                            (base_time + timedelta(days=i)).isoformat(),
                        ),
                    )
                    for i in range(30)
                ]
            )
            self.log("Recorded 30 days of historical metrics")

            # Create a capacity plan predicting 30 days ahead
            prediction_date = (datetime.now() + timedelta(days=30)).strftime("%Y-%m-%d")
            plan = self.manager.create_capacity_plan(
                tenant_id=self.tenant_id,
                resource_type=ResourceType.CPU,
                current_capacity=100.0,
                prediction_date=prediction_date,
                confidence=0.85,
            )
            self.log(f"Created capacity plan: {plan.id}")
            self.log(f" Current capacity: {plan.current_capacity}")
            self.log(f" Predicted capacity: {plan.predicted_capacity}")
            self.log(f" Recommended action: {plan.recommended_action}")

            # List capacity plans
            plans = self.manager.get_capacity_plans(self.tenant_id)
            assert len(plans) >= 1
            self.log(f"Listed {len(plans)} capacity plans")
        except Exception as e:
            self.log(f"Capacity planning test failed: {e}", success=False)
        finally:
            self._cleanup(
                "Cleaned up capacity planning test data",
                lambda: self._execute_sql(
                    [
                        ("DELETE FROM capacity_plans WHERE tenant_id = ?", (self.tenant_id,)),
                        ("DELETE FROM resource_metrics WHERE tenant_id = ?", (self.tenant_id,)),
                    ]
                ),
            )

    def test_auto_scaling(self) -> None:
        """Create a scaling policy, evaluate it once and list scaling events."""
        print("\n⚖️ Testing Auto Scaling...")
        try:
            # Create the policy
            policy = self.manager.create_auto_scaling_policy(
                tenant_id=self.tenant_id,
                name="API 服务自动扩缩容",
                resource_type=ResourceType.CPU,
                min_instances=2,
                max_instances=10,
                target_utilization=0.7,
                scale_up_threshold=0.8,
                scale_down_threshold=0.3,
                scale_up_step=2,
                scale_down_step=1,
                cooldown_period=300,
            )
            self.log(f"Created auto scaling policy: {policy.name} (ID: {policy.id})")
            self.log(f" Min instances: {policy.min_instances}")
            self.log(f" Max instances: {policy.max_instances}")
            self.log(f" Target utilization: {policy.target_utilization}")

            # List policies
            policies = self.manager.list_auto_scaling_policies(self.tenant_id)
            assert len(policies) >= 1
            self.log(f"Listed {len(policies)} auto scaling policies")

            # Evaluate with utilization above the scale-up threshold
            event = self.manager.evaluate_scaling_policy(
                policy_id=policy.id, current_instances=3, current_utilization=0.85
            )
            if event:
                self.log(f"Scaling event triggered: {event.action.value}")
                self.log(f" From {event.from_count} to {event.to_count} instances")
                self.log(f" Reason: {event.reason}")
            else:
                self.log("No scaling action needed")

            # List scaling events
            events = self.manager.list_scaling_events(self.tenant_id)
            self.log(f"Listed {len(events)} scaling events")
        except Exception as e:
            self.log(f"Auto scaling test failed: {e}", success=False)
        finally:
            self._cleanup(
                "Cleaned up auto scaling test data",
                lambda: self._execute_sql(
                    [
                        ("DELETE FROM scaling_events WHERE tenant_id = ?", (self.tenant_id,)),
                        (
                            "DELETE FROM auto_scaling_policies WHERE tenant_id = ?",
                            (self.tenant_id,),
                        ),
                    ]
                ),
            )

    def test_health_checks(self) -> None:
        """Create HTTP and TCP health checks and list them."""
        print("\n💓 Testing Health Checks...")
        try:
            # HTTP health check
            check1 = self.manager.create_health_check(
                tenant_id=self.tenant_id,
                name="API 服务健康检查",
                target_type="service",
                target_id="api-service",
                check_type="http",
                check_config={"url": "https://api.insightflow.io/health", "expected_status": 200},
                interval=60,
                timeout=10,
                retry_count=3,
            )
            self.log(f"Created HTTP health check: {check1.name} (ID: {check1.id})")

            # TCP health check
            check2 = self.manager.create_health_check(
                tenant_id=self.tenant_id,
                name="数据库健康检查",
                target_type="database",
                target_id="postgres-001",
                check_type="tcp",
                check_config={"host": "db.insightflow.io", "port": 5432},
                interval=30,
                timeout=5,
                retry_count=2,
            )
            self.log(f"Created TCP health check: {check2.name} (ID: {check2.id})")

            # List health checks
            checks = self.manager.list_health_checks(self.tenant_id)
            assert len(checks) >= 2
            self.log(f"Listed {len(checks)} health checks")

            # Running a check needs network access, so only confirm that the
            # manager exposes the execution entry point.
            assert callable(getattr(self.manager, "execute_health_check", None))
            self.log("Health check execution method verified")
        except Exception as e:
            self.log(f"Health checks test failed: {e}", success=False)
        finally:
            self._cleanup(
                "Cleaned up health check test data",
                lambda: self._execute_sql(
                    [("DELETE FROM health_checks WHERE tenant_id = ?", (self.tenant_id,))]
                ),
            )

    def test_failover(self) -> None:
        """Create a failover config, initiate a failover and complete it."""
        print("\n🔄 Testing Failover...")
        try:
            # Create the failover configuration
            config = self.manager.create_failover_config(
                tenant_id=self.tenant_id,
                name="主备数据中心故障转移",
                primary_region="cn-north-1",
                secondary_regions=["cn-south-1", "cn-east-1"],
                failover_trigger="health_check_failed",
                auto_failover=False,
                failover_timeout=300,
                health_check_id=None,
            )
            self.log(f"Created failover config: {config.name} (ID: {config.id})")
            self.log(f" Primary region: {config.primary_region}")
            self.log(f" Secondary regions: {config.secondary_regions}")

            # List failover configurations
            configs = self.manager.list_failover_configs(self.tenant_id)
            assert len(configs) >= 1
            self.log(f"Listed {len(configs)} failover configs")

            # Initiate a failover
            event = self.manager.initiate_failover(
                config_id=config.id, reason="Primary region health check failed"
            )
            if event:
                self.log(f"Initiated failover: {event.id}")
                self.log(f" From: {event.from_region}")
                self.log(f" To: {event.to_region}")
                # Mark the failover as completed and read it back
                self.manager.update_failover_status(event.id, "completed")
                updated_event = self.manager.get_failover_event(event.id)
                assert updated_event.status == "completed"
                self.log("Failover completed")

            # List failover events
            events = self.manager.list_failover_events(self.tenant_id)
            self.log(f"Listed {len(events)} failover events")
        except Exception as e:
            self.log(f"Failover test failed: {e}", success=False)
        finally:
            self._cleanup(
                "Cleaned up failover test data",
                lambda: self._execute_sql(
                    [
                        ("DELETE FROM failover_events WHERE tenant_id = ?", (self.tenant_id,)),
                        ("DELETE FROM failover_configs WHERE tenant_id = ?", (self.tenant_id,)),
                    ]
                ),
            )

    def test_backup(self) -> None:
        """Create a backup job, execute it and try a restore from the record."""
        print("\n💾 Testing Backup & Recovery...")
        try:
            # Create the backup job
            job = self.manager.create_backup_job(
                tenant_id=self.tenant_id,
                name="每日数据库备份",
                backup_type="full",
                target_type="database",
                target_id="postgres-main",
                schedule="0 2 * * *",  # every day at 02:00
                retention_days=30,
                encryption_enabled=True,
                compression_enabled=True,
                storage_location="s3://insightflow-backups/",
            )
            self.log(f"Created backup job: {job.name} (ID: {job.id})")
            self.log(f" Schedule: {job.schedule}")
            self.log(f" Retention: {job.retention_days} days")

            # List backup jobs
            jobs = self.manager.list_backup_jobs(self.tenant_id)
            assert len(jobs) >= 1
            self.log(f"Listed {len(jobs)} backup jobs")

            # Execute the backup
            record = self.manager.execute_backup(job.id)
            if record:
                self.log(f"Executed backup: {record.id}")
                self.log(f" Status: {record.status.value}")
                self.log(f" Storage: {record.storage_path}")

            # List backup records
            records = self.manager.list_backup_records(self.tenant_id)
            self.log(f"Listed {len(records)} backup records")

            # Restore (simulated). Without a record there is nothing to
            # restore from, so report that explicitly instead of failing
            # with an AttributeError on ``record.id``.
            if record:
                restore_result = self.manager.restore_from_backup(record.id)
                self.log(f"Restore test result: {restore_result}")
            else:
                self.log("Restore test skipped: backup produced no record", success=False)
        except Exception as e:
            self.log(f"Backup test failed: {e}", success=False)
        finally:
            self._cleanup(
                "Cleaned up backup test data",
                lambda: self._execute_sql(
                    [
                        ("DELETE FROM backup_records WHERE tenant_id = ?", (self.tenant_id,)),
                        ("DELETE FROM backup_jobs WHERE tenant_id = ?", (self.tenant_id,)),
                    ]
                ),
            )

    def test_cost_optimization(self) -> None:
        """Record low utilization, then exercise reports, idle detection and suggestions."""
        print("\n💰 Testing Cost Optimization...")
        try:
            # Record utilization data for five under-used servers
            report_date = datetime.now().strftime("%Y-%m-%d")
            for i in range(5):
                self.manager.record_resource_utilization(
                    tenant_id=self.tenant_id,
                    resource_type=ResourceType.CPU,
                    resource_id=f"server-{i:03d}",
                    utilization_rate=0.05 + random.random() * 0.1,  # low utilization
                    peak_utilization=0.15,
                    avg_utilization=0.08,
                    idle_time_percent=0.85,
                    report_date=report_date,
                    recommendations=["Consider downsizing this resource"],
                )
            self.log("Recorded 5 resource utilization records")

            # Generate the cost report for the current month
            now = datetime.now()
            report = self.manager.generate_cost_report(
                tenant_id=self.tenant_id, year=now.year, month=now.month
            )
            self.log(f"Generated cost report: {report.id}")
            self.log(f" Period: {report.report_period}")
            self.log(f" Total cost: {report.total_cost} {report.currency}")
            self.log(f" Anomalies detected: {len(report.anomalies)}")

            # Detect idle resources
            idle_resources = self.manager.detect_idle_resources(self.tenant_id)
            self.log(f"Detected {len(idle_resources)} idle resources")

            # List idle resources
            idle_list = self.manager.get_idle_resources(self.tenant_id)
            for resource in idle_list:
                # Two adjacent f-strings: a replacement field split across
                # lines is a syntax error before Python 3.12.
                self.log(
                    f" Idle resource: {resource.resource_name} "
                    f"(est. cost: {resource.estimated_monthly_cost}/month)"
                )

            # Generate cost optimization suggestions
            suggestions = self.manager.generate_cost_optimization_suggestions(self.tenant_id)
            self.log(f"Generated {len(suggestions)} cost optimization suggestions")
            for suggestion in suggestions:
                self.log(f" Suggestion: {suggestion.title}")
                self.log(
                    f" Potential savings: {suggestion.potential_savings} {suggestion.currency}"
                )
                self.log(f" Confidence: {suggestion.confidence}")
                self.log(f" Difficulty: {suggestion.difficulty}")

            # List stored suggestions
            all_suggestions = self.manager.get_cost_optimization_suggestions(self.tenant_id)
            self.log(f"Listed {len(all_suggestions)} optimization suggestions")

            # Apply the first suggestion, if any
            if all_suggestions:
                applied = self.manager.apply_cost_optimization_suggestion(all_suggestions[0].id)
                if applied:
                    self.log(f"Applied optimization suggestion: {applied.title}")
                    assert applied.is_applied
                    assert applied.applied_at is not None
        except Exception as e:
            self.log(f"Cost optimization test failed: {e}", success=False)
        finally:
            self._cleanup(
                "Cleaned up cost optimization test data",
                lambda: self._execute_sql(
                    [
                        (
                            "DELETE FROM cost_optimization_suggestions WHERE tenant_id = ?",
                            (self.tenant_id,),
                        ),
                        ("DELETE FROM idle_resources WHERE tenant_id = ?", (self.tenant_id,)),
                        (
                            "DELETE FROM resource_utilizations WHERE tenant_id = ?",
                            (self.tenant_id,),
                        ),
                        ("DELETE FROM cost_reports WHERE tenant_id = ?", (self.tenant_id,)),
                    ]
                ),
            )

    def print_summary(self) -> None:
        """Print totals for all logged steps and list the failed ones."""
        print("\n" + "=" * 60)
        print("Test Summary")
        print("=" * 60)
        total = len(self.test_results)
        passed = sum(1 for _, success in self.test_results if success)
        failed = total - passed
        print(f"Total tests: {total}")
        print(f"Passed: {passed}")
        print(f"Failed: {failed}")
        if failed > 0:
            print("\nFailed tests:")
            for message, success in self.test_results:
                if not success:
                    print(message)
        print("=" * 60)
def main() -> int:
    """Run the full suite and return a process exit status.

    Returns:
        0 when every logged step succeeded, 1 if any step failed, so that
        shells and CI jobs can detect a failing run.
    """
    test = TestOpsManager()
    test.run_all_tests()
    # test_results holds (message, success) pairs; any False means failure.
    return 0 if all(success for _, success in test.test_results) else 1


if __name__ == "__main__":
    sys.exit(main())