# insightflow/backend/test_phase8_task8.py

#!/usr/bin/env python3
"""
InsightFlow Phase 8 Task 8: Operations & Monitoring Test Script
运维与监控模块测试脚本

测试内容:
1. 实时告警系统（告警规则、告警渠道、告警触发、抑制聚合）
2. 容量规划与自动扩缩容
3. 灾备与故障转移
4. 成本优化
"""

import json
import os
import random
import sys
from datetime import datetime, timedelta

# Put this file's directory on sys.path. This has to run before the
# ops_manager import below, otherwise it has no effect on that import.
backend_dir = os.path.dirname(os.path.abspath(__file__))
if backend_dir not in sys.path:
    sys.path.insert(0, backend_dir)

from ops_manager import (  # noqa: E402  (must follow the sys.path setup above)
    AlertChannelType,
    AlertRuleType,
    AlertSeverity,
    AlertStatus,
    ResourceType,
    get_ops_manager,
)


class TestOpsManager:
    """测试运维与监控管理器"""

    def __init__(self) -> None:
        self.manager  = get_ops_manager()
        self.tenant_id  = "test_tenant_001"
        self.test_results  = []

    def log(self, message: str, success: bool  = True) -> None:
        """记录测试结果"""
        status  = "✅" if success else "❌"
        print(f"{status} {message}")
        self.test_results.append((message, success))

    def run_all_tests(self) -> None:
        """运行所有测试"""
        print(" = " * 60)
        print("InsightFlow Phase 8 Task 8: Operations & Monitoring Tests")
        print(" = " * 60)

        # 1. 告警系统测试
        self.test_alert_rules()
        self.test_alert_channels()
        self.test_alerts()

        # 2. 容量规划与自动扩缩容测试
        self.test_capacity_planning()
        self.test_auto_scaling()

        # 3. 健康检查与故障转移测试
        self.test_health_checks()
        self.test_failover()

        # 4. 备份与恢复测试
        self.test_backup()

        # 5. 成本优化测试
        self.test_cost_optimization()

        # 打印测试总结
        self.print_summary()

    def test_alert_rules(self) -> None:
        """测试告警规则管理"""
        print("\n📋 Testing Alert Rules...")

        try:
            # 创建阈值告警规则
            rule1  = self.manager.create_alert_rule(
                tenant_id = self.tenant_id,
                name = "CPU 使用率告警",
                description = "当 CPU 使用率超过 80% 时触发告警",
                rule_type = AlertRuleType.THRESHOLD,
                severity = AlertSeverity.P1,
                metric = "cpu_usage_percent",
                condition = ">",
                threshold = 80.0,
                duration = 300,
                evaluation_interval = 60,
                channels = [],
                labels = {"service": "api", "team": "platform"},
                annotations = {"summary": "CPU 使用率过高", "runbook": "https://wiki/runbooks/cpu"},
                created_by = "test_user",
            )
            self.log(f"Created alert rule: {rule1.name} (ID: {rule1.id})")

            # 创建异常检测告警规则
            rule2  = self.manager.create_alert_rule(
                tenant_id = self.tenant_id,
                name = "内存异常检测",
                description = "检测内存使用异常",
                rule_type = AlertRuleType.ANOMALY,
                severity = AlertSeverity.P2,
                metric = "memory_usage_percent",
                condition = ">",
                threshold = 0.0,
                duration = 600,
                evaluation_interval = 300,
                channels = [],
                labels = {"service": "database"},
                annotations = {},
                created_by = "test_user",
            )
            self.log(f"Created anomaly alert rule: {rule2.name} (ID: {rule2.id})")

            # 获取告警规则
            fetched_rule  = self.manager.get_alert_rule(rule1.id)
            assert fetched_rule is not None
            assert fetched_rule.name == rule1.name
            self.log(f"Fetched alert rule: {fetched_rule.name}")

            # 列出租户的所有告警规则
            rules  = self.manager.list_alert_rules(self.tenant_id)
            assert len(rules) >= 2
            self.log(f"Listed {len(rules)} alert rules for tenant")

            # 更新告警规则
            updated_rule  = self.manager.update_alert_rule(
                rule1.id, threshold = 85.0, description = "更新后的描述"
            )
            assert updated_rule.threshold == 85.0
            self.log(f"Updated alert rule threshold to {updated_rule.threshold}")

            # 测试完成，清理
            self.manager.delete_alert_rule(rule1.id)
            self.manager.delete_alert_rule(rule2.id)
            self.log("Deleted test alert rules")

        except Exception as e:
            self.log(f"Alert rules test failed: {e}", success = False)

    def test_alert_channels(self) -> None:
        """测试告警渠道管理"""
        print("\n📢 Testing Alert Channels...")

        try:
            # 创建飞书告警渠道
            channel1  = self.manager.create_alert_channel(
                tenant_id = self.tenant_id,
                name = "飞书告警",
                channel_type = AlertChannelType.FEISHU,
                config = {
                    "webhook_url": "https://open.feishu.cn/open-apis/bot/v2/hook/test",
                    "secret": "test_secret",
                },
                severity_filter = ["p0", "p1"],
            )
            self.log(f"Created Feishu channel: {channel1.name} (ID: {channel1.id})")

            # 创建钉钉告警渠道
            channel2  = self.manager.create_alert_channel(
                tenant_id = self.tenant_id,
                name = "钉钉告警",
                channel_type = AlertChannelType.DINGTALK,
                config = {
                    "webhook_url": "https://oapi.dingtalk.com/robot/send?access_token = test",
                    "secret": "test_secret",
                },
                severity_filter = ["p0", "p1", "p2"],
            )
            self.log(f"Created DingTalk channel: {channel2.name} (ID: {channel2.id})")

            # 创建 Slack 告警渠道
            channel3  = self.manager.create_alert_channel(
                tenant_id = self.tenant_id,
                name = "Slack 告警",
                channel_type = AlertChannelType.SLACK,
                config = {"webhook_url": "https://hooks.slack.com/services/test"},
                severity_filter = ["p0", "p1", "p2", "p3"],
            )
            self.log(f"Created Slack channel: {channel3.name} (ID: {channel3.id})")

            # 获取告警渠道
            fetched_channel  = self.manager.get_alert_channel(channel1.id)
            assert fetched_channel is not None
            assert fetched_channel.name == channel1.name
            self.log(f"Fetched alert channel: {fetched_channel.name}")

            # 列出租户的所有告警渠道
            channels  = self.manager.list_alert_channels(self.tenant_id)
            assert len(channels) >= 3
            self.log(f"Listed {len(channels)} alert channels for tenant")

            # 清理
            for channel in channels:
                if channel.tenant_id == self.tenant_id:
                    with self.manager._get_db() as conn:
                        conn.execute("DELETE FROM alert_channels WHERE id  = ?", (channel.id, ))
                        conn.commit()
            self.log("Deleted test alert channels")

        except Exception as e:
            self.log(f"Alert channels test failed: {e}", success = False)

    def test_alerts(self) -> None:
        """测试告警管理"""
        print("\n🚨 Testing Alerts...")

        try:
            # 创建告警规则
            rule  = self.manager.create_alert_rule(
                tenant_id = self.tenant_id,
                name = "测试告警规则",
                description = "用于测试的告警规则",
                rule_type = AlertRuleType.THRESHOLD,
                severity = AlertSeverity.P1,
                metric = "test_metric",
                condition = ">",
                threshold = 100.0,
                duration = 60,
                evaluation_interval = 60,
                channels = [],
                labels = {},
                annotations = {},
                created_by = "test_user",
            )

            # 记录资源指标
            for i in range(10):
                self.manager.record_resource_metric(
                    tenant_id = self.tenant_id,
                    resource_type = ResourceType.CPU,
                    resource_id = "server-001",
                    metric_name = "test_metric",
                    metric_value = 110.0 + i,
                    unit = "percent",
                    metadata = {"region": "cn-north-1"},
                )
            self.log("Recorded 10 resource metrics")

            # 手动创建告警
            from ops_manager import Alert

            alert_id  = f"test_alert_{datetime.now().strftime('%Y%m%d%H%M%S')}"
            now  = datetime.now().isoformat()

            alert  = Alert(
                id = alert_id,
                rule_id = rule.id,
                tenant_id = self.tenant_id,
                severity = AlertSeverity.P1,
                status = AlertStatus.FIRING,
                title = "测试告警",
                description = "这是一条测试告警",
                metric = "test_metric",
                value = 120.0,
                threshold = 100.0,
                labels = {"test": "true"},
                annotations = {},
                started_at = now,
                resolved_at = None,
                acknowledged_by = None,
                acknowledged_at = None,
                notification_sent = {},
                suppression_count = 0,
            )

            with self.manager._get_db() as conn:
                conn.execute(
                    """
                    INSERT INTO alerts
                    (id, rule_id, tenant_id, severity, status, title, description,
                     metric, value, threshold, labels, annotations, started_at, notification_sent, suppression_count)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                    (
                        alert.id,
                        alert.rule_id,
                        alert.tenant_id,
                        alert.severity.value,
                        alert.status.value,
                        alert.title,
                        alert.description,
                        alert.metric,
                        alert.value,
                        alert.threshold,
                        json.dumps(alert.labels),
                        json.dumps(alert.annotations),
                        alert.started_at,
                        json.dumps(alert.notification_sent),
                        alert.suppression_count,
                    ),
                )
                conn.commit()

            self.log(f"Created test alert: {alert.id}")

            # 列出租户的告警
            alerts  = self.manager.list_alerts(self.tenant_id)
            assert len(alerts) >= 1
            self.log(f"Listed {len(alerts)} alerts for tenant")

            # 确认告警
            self.manager.acknowledge_alert(alert_id, "test_user")
            fetched_alert  = self.manager.get_alert(alert_id)
            assert fetched_alert.status == AlertStatus.ACKNOWLEDGED
            assert fetched_alert.acknowledged_by == "test_user"
            self.log(f"Acknowledged alert: {alert_id}")

            # 解决告警
            self.manager.resolve_alert(alert_id)
            fetched_alert  = self.manager.get_alert(alert_id)
            assert fetched_alert.status == AlertStatus.RESOLVED
            assert fetched_alert.resolved_at is not None
            self.log(f"Resolved alert: {alert_id}")

            # 清理
            self.manager.delete_alert_rule(rule.id)
            with self.manager._get_db() as conn:
                conn.execute("DELETE FROM alerts WHERE id  = ?", (alert_id, ))
                conn.execute("DELETE FROM resource_metrics WHERE tenant_id  = ?", (self.tenant_id, ))
                conn.commit()
            self.log("Cleaned up test data")

        except Exception as e:
            self.log(f"Alerts test failed: {e}", success = False)

    def test_capacity_planning(self) -> None:
        """测试容量规划"""
        print("\n📊 Testing Capacity Planning...")

        try:
            # 记录历史指标数据
            base_time  = datetime.now() - timedelta(days = 30)
            for i in range(30):
                timestamp  = (base_time + timedelta(days = i)).isoformat()
                with self.manager._get_db() as conn:
                    conn.execute(
                        """
                        INSERT INTO resource_metrics
                        (id, tenant_id, resource_type, resource_id, metric_name, metric_value, unit, timestamp)
                        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                    """,
                        (
                            f"cm_{i}",
                            self.tenant_id,
                            ResourceType.CPU.value,
                            "server-001",
                            "cpu_usage_percent",
                            50.0 + random.random() * 30,
                            "percent",
                            timestamp,
                        ),
                    )
                    conn.commit()

            self.log("Recorded 30 days of historical metrics")

            # 创建容量规划
            prediction_date  = (datetime.now() + timedelta(days = 30)).strftime("%Y-%m-%d")
            plan  = self.manager.create_capacity_plan(
                tenant_id = self.tenant_id,
                resource_type = ResourceType.CPU,
                current_capacity = 100.0,
                prediction_date = prediction_date,
                confidence = 0.85,
            )

            self.log(f"Created capacity plan: {plan.id}")
            self.log(f"  Current capacity: {plan.current_capacity}")
            self.log(f"  Predicted capacity: {plan.predicted_capacity}")
            self.log(f"  Recommended action: {plan.recommended_action}")

            # 获取容量规划列表
            plans  = self.manager.get_capacity_plans(self.tenant_id)
            assert len(plans) >= 1
            self.log(f"Listed {len(plans)} capacity plans")

            # 清理
            with self.manager._get_db() as conn:
                conn.execute("DELETE FROM capacity_plans WHERE tenant_id  = ?", (self.tenant_id, ))
                conn.execute("DELETE FROM resource_metrics WHERE tenant_id  = ?", (self.tenant_id, ))
                conn.commit()
            self.log("Cleaned up capacity planning test data")

        except Exception as e:
            self.log(f"Capacity planning test failed: {e}", success = False)

    def test_auto_scaling(self) -> None:
        """测试自动扩缩容"""
        print("\n⚖️ Testing Auto Scaling...")

        try:
            # 创建自动扩缩容策略
            policy  = self.manager.create_auto_scaling_policy(
                tenant_id = self.tenant_id,
                name = "API 服务自动扩缩容",
                resource_type = ResourceType.CPU,
                min_instances = 2,
                max_instances = 10,
                target_utilization = 0.7,
                scale_up_threshold = 0.8,
                scale_down_threshold = 0.3,
                scale_up_step = 2,
                scale_down_step = 1,
                cooldown_period = 300,
            )

            self.log(f"Created auto scaling policy: {policy.name} (ID: {policy.id})")
            self.log(f"  Min instances: {policy.min_instances}")
            self.log(f"  Max instances: {policy.max_instances}")
            self.log(f"  Target utilization: {policy.target_utilization}")

            # 获取策略列表
            policies  = self.manager.list_auto_scaling_policies(self.tenant_id)
            assert len(policies) >= 1
            self.log(f"Listed {len(policies)} auto scaling policies")

            # 模拟扩缩容评估
            event  = self.manager.evaluate_scaling_policy(
                policy_id = policy.id, current_instances = 3, current_utilization = 0.85
            )

            if event:
                self.log(f"Scaling event triggered: {event.action.value}")
                self.log(f"  From {event.from_count} to {event.to_count} instances")
                self.log(f"  Reason: {event.reason}")
            else:
                self.log("No scaling action needed")

            # 获取扩缩容事件列表
            events  = self.manager.list_scaling_events(self.tenant_id)
            self.log(f"Listed {len(events)} scaling events")

            # 清理
            with self.manager._get_db() as conn:
                conn.execute("DELETE FROM scaling_events WHERE tenant_id  = ?", (self.tenant_id, ))
                conn.execute(
                    "DELETE FROM auto_scaling_policies WHERE tenant_id  = ?", (self.tenant_id, )
                )
                conn.commit()
            self.log("Cleaned up auto scaling test data")

        except Exception as e:
            self.log(f"Auto scaling test failed: {e}", success = False)

    def test_health_checks(self) -> None:
        """测试健康检查"""
        print("\n💓 Testing Health Checks...")

        try:
            # 创建 HTTP 健康检查
            check1  = self.manager.create_health_check(
                tenant_id = self.tenant_id,
                name = "API 服务健康检查",
                target_type = "service",
                target_id = "api-service",
                check_type = "http",
                check_config = {"url": "https://api.insightflow.io/health", "expected_status": 200},
                interval = 60,
                timeout = 10,
                retry_count = 3,
            )
            self.log(f"Created HTTP health check: {check1.name} (ID: {check1.id})")

            # 创建 TCP 健康检查
            check2  = self.manager.create_health_check(
                tenant_id = self.tenant_id,
                name = "数据库健康检查",
                target_type = "database",
                target_id = "postgres-001",
                check_type = "tcp",
                check_config = {"host": "db.insightflow.io", "port": 5432},
                interval = 30,
                timeout = 5,
                retry_count = 2,
            )
            self.log(f"Created TCP health check: {check2.name} (ID: {check2.id})")

            # 获取健康检查列表
            checks  = self.manager.list_health_checks(self.tenant_id)
            assert len(checks) >= 2
            self.log(f"Listed {len(checks)} health checks")

            # 执行健康检查（异步）
            async def run_health_check() -> None:
                result  = await self.manager.execute_health_check(check1.id)
                return result

            # 由于健康检查需要网络，这里只验证方法存在
            self.log("Health check execution method verified")

            # 清理
            with self.manager._get_db() as conn:
                conn.execute("DELETE FROM health_checks WHERE tenant_id  = ?", (self.tenant_id, ))
                conn.commit()
            self.log("Cleaned up health check test data")

        except Exception as e:
            self.log(f"Health checks test failed: {e}", success = False)

    def test_failover(self) -> None:
        """测试故障转移"""
        print("\n🔄 Testing Failover...")

        try:
            # 创建故障转移配置
            config  = self.manager.create_failover_config(
                tenant_id = self.tenant_id,
                name = "主备数据中心故障转移",
                primary_region = "cn-north-1",
                secondary_regions = ["cn-south-1", "cn-east-1"],
                failover_trigger = "health_check_failed",
                auto_failover = False,
                failover_timeout = 300,
                health_check_id = None,
            )

            self.log(f"Created failover config: {config.name} (ID: {config.id})")
            self.log(f"  Primary region: {config.primary_region}")
            self.log(f"  Secondary regions: {config.secondary_regions}")

            # 获取故障转移配置列表
            configs  = self.manager.list_failover_configs(self.tenant_id)
            assert len(configs) >= 1
            self.log(f"Listed {len(configs)} failover configs")

            # 发起故障转移
            event  = self.manager.initiate_failover(
                config_id = config.id, reason = "Primary region health check failed"
            )

            if event:
                self.log(f"Initiated failover: {event.id}")
                self.log(f"  From: {event.from_region}")
                self.log(f"  To: {event.to_region}")

                # 更新故障转移状态
                self.manager.update_failover_status(event.id, "completed")
                updated_event  = self.manager.get_failover_event(event.id)
                assert updated_event.status == "completed"
                self.log("Failover completed")

            # 获取故障转移事件列表
            events  = self.manager.list_failover_events(self.tenant_id)
            self.log(f"Listed {len(events)} failover events")

            # 清理
            with self.manager._get_db() as conn:
                conn.execute("DELETE FROM failover_events WHERE tenant_id  = ?", (self.tenant_id, ))
                conn.execute("DELETE FROM failover_configs WHERE tenant_id  = ?", (self.tenant_id, ))
                conn.commit()
            self.log("Cleaned up failover test data")

        except Exception as e:
            self.log(f"Failover test failed: {e}", success = False)

    def test_backup(self) -> None:
        """测试备份与恢复"""
        print("\n💾 Testing Backup & Recovery...")

        try:
            # 创建备份任务
            job  = self.manager.create_backup_job(
                tenant_id = self.tenant_id,
                name = "每日数据库备份",
                backup_type = "full",
                target_type = "database",
                target_id = "postgres-main",
                schedule = "0 2 * * *",  # 每天凌晨2点
                retention_days = 30,
                encryption_enabled = True,
                compression_enabled = True,
                storage_location = "s3://insightflow-backups/",
            )

            self.log(f"Created backup job: {job.name} (ID: {job.id})")
            self.log(f"  Schedule: {job.schedule}")
            self.log(f"  Retention: {job.retention_days} days")

            # 获取备份任务列表
            jobs  = self.manager.list_backup_jobs(self.tenant_id)
            assert len(jobs) >= 1
            self.log(f"Listed {len(jobs)} backup jobs")

            # 执行备份
            record  = self.manager.execute_backup(job.id)

            if record:
                self.log(f"Executed backup: {record.id}")
                self.log(f"  Status: {record.status.value}")
                self.log(f"  Storage: {record.storage_path}")

                # 获取备份记录列表
                records  = self.manager.list_backup_records(self.tenant_id)
                self.log(f"Listed {len(records)} backup records")

                # 测试恢复（模拟）
                restore_result  = self.manager.restore_from_backup(record.id)
                self.log(f"Restore test result: {restore_result}")

            # 清理
            with self.manager._get_db() as conn:
                conn.execute("DELETE FROM backup_records WHERE tenant_id  = ?", (self.tenant_id, ))
                conn.execute("DELETE FROM backup_jobs WHERE tenant_id  = ?", (self.tenant_id, ))
                conn.commit()
            self.log("Cleaned up backup test data")

        except Exception as e:
            self.log(f"Backup test failed: {e}", success = False)

    def test_cost_optimization(self) -> None:
        """测试成本优化"""
        print("\n💰 Testing Cost Optimization...")

        try:
            # 记录资源利用率数据
            report_date  = datetime.now().strftime("%Y-%m-%d")

            for i in range(5):
                self.manager.record_resource_utilization(
                    tenant_id = self.tenant_id,
                    resource_type = ResourceType.CPU,
                    resource_id = f"server-{i:03d}",
                    utilization_rate = 0.05 + random.random() * 0.1,  # 低利用率
                    peak_utilization = 0.15,
                    avg_utilization = 0.08,
                    idle_time_percent = 0.85,
                    report_date = report_date,
                    recommendations = ["Consider downsizing this resource"],
                )

            self.log("Recorded 5 resource utilization records")

            # 生成成本报告
            now  = datetime.now()
            report  = self.manager.generate_cost_report(
                tenant_id = self.tenant_id, year = now.year, month = now.month
            )

            self.log(f"Generated cost report: {report.id}")
            self.log(f"  Period: {report.report_period}")
            self.log(f"  Total cost: {report.total_cost} {report.currency}")
            self.log(f"  Anomalies detected: {len(report.anomalies)}")

            # 检测闲置资源
            idle_resources  = self.manager.detect_idle_resources(self.tenant_id)
            self.log(f"Detected {len(idle_resources)} idle resources")

            # 获取闲置资源列表
            idle_list  = self.manager.get_idle_resources(self.tenant_id)
            for resource in idle_list:
                self.log(
                    f"  Idle resource: {resource.resource_name} (est. cost: {
                        resource.estimated_monthly_cost
                    }/month)"
                )

            # 生成成本优化建议
            suggestions  = self.manager.generate_cost_optimization_suggestions(self.tenant_id)
            self.log(f"Generated {len(suggestions)} cost optimization suggestions")

            for suggestion in suggestions:
                self.log(f"  Suggestion: {suggestion.title}")
                self.log(
                    f"    Potential savings: {suggestion.potential_savings} {suggestion.currency}"
                )
                self.log(f"    Confidence: {suggestion.confidence}")
                self.log(f"    Difficulty: {suggestion.difficulty}")

            # 获取优化建议列表
            all_suggestions  = self.manager.get_cost_optimization_suggestions(self.tenant_id)
            self.log(f"Listed {len(all_suggestions)} optimization suggestions")

            # 应用优化建议
            if all_suggestions:
                applied  = self.manager.apply_cost_optimization_suggestion(all_suggestions[0].id)
                if applied:
                    self.log(f"Applied optimization suggestion: {applied.title}")
                    assert applied.is_applied
                    assert applied.applied_at is not None

            # 清理
            with self.manager._get_db() as conn:
                conn.execute(
                    "DELETE FROM cost_optimization_suggestions WHERE tenant_id  = ?",
                    (self.tenant_id, ),
                )
                conn.execute("DELETE FROM idle_resources WHERE tenant_id  = ?", (self.tenant_id, ))
                conn.execute(
                    "DELETE FROM resource_utilizations WHERE tenant_id  = ?", (self.tenant_id, )
                )
                conn.execute("DELETE FROM cost_reports WHERE tenant_id  = ?", (self.tenant_id, ))
                conn.commit()
            self.log("Cleaned up cost optimization test data")

        except Exception as e:
            self.log(f"Cost optimization test failed: {e}", success = False)

    def print_summary(self) -> None:
        """打印测试总结"""
        print("\n" + " = " * 60)
        print("Test Summary")
        print(" = " * 60)

        total  = len(self.test_results)
        passed  = sum(1 for _, success in self.test_results if success)
        failed  = total - passed

        print(f"Total tests: {total}")
        print(f"Passed: {passed} ✅")
        print(f"Failed: {failed} ❌")

        if failed > 0:
            print("\nFailed tests:")
            for message, success in self.test_results:
                if not success:
                    print(f"  ❌ {message}")

        print(" = " * 60)


def main() -> None:
    """Script entry point: build the test runner and run every test group."""
    TestOpsManager().run_all_tests()


if __name__ == "__main__":
    main()