insightflow/backend/test_phase8_task8.py

#!/usr/bin/env python3
"""
InsightFlow Phase 8 Task 8: Operations & Monitoring Test Script
运维与监控模块测试脚本

测试内容:
1. 实时告警系统（告警规则、告警渠道、告警触发、抑制聚合）
2. 容量规划与自动扩缩容
3. 灾备与故障转移
4. 成本优化
"""

import json
import os
import random
import sys
from datetime import datetime, timedelta

from ops_manager import (
    AlertChannelType,
    AlertRuleType,
    AlertSeverity,
    AlertStatus,
    ResourceType,
    get_ops_manager,
)

# Add backend directory to path
backend_dir = os.path.dirname(os.path.abspath(__file__))
if backend_dir not in sys.path:
    sys.path.insert(0, backend_dir)


class TestOpsManager:
    """测试运维与监控管理器"""

    def __init__(self):
        self.manager = get_ops_manager()
        self.tenant_id = "test_tenant_001"
        self.test_results = []

    def log(self, message: str, success: bool = True):
        """记录测试结果"""
        status = "✅" if success else "❌"
        print(f"{status} {message}")
        self.test_results.append((message, success))

    def run_all_tests(self):
        """运行所有测试"""
        print("=" * 60)
        print("InsightFlow Phase 8 Task 8: Operations & Monitoring Tests")
        print("=" * 60)

        # 1. 告警系统测试
        self.test_alert_rules()
        self.test_alert_channels()
        self.test_alerts()

        # 2. 容量规划与自动扩缩容测试
        self.test_capacity_planning()
        self.test_auto_scaling()

        # 3. 健康检查与故障转移测试
        self.test_health_checks()
        self.test_failover()

        # 4. 备份与恢复测试
        self.test_backup()

        # 5. 成本优化测试
        self.test_cost_optimization()

        # 打印测试总结
        self.print_summary()

    def test_alert_rules(self):
        """测试告警规则管理"""
        print("\n📋 Testing Alert Rules...")

        try:
            # 创建阈值告警规则
            rule1 = self.manager.create_alert_rule(
                tenant_id=self.tenant_id,
                name="CPU 使用率告警",
                description="当 CPU 使用率超过 80% 时触发告警",
                rule_type=AlertRuleType.THRESHOLD,
                severity=AlertSeverity.P1,
                metric="cpu_usage_percent",
                condition=">",
                threshold=80.0,
                duration=300,
                evaluation_interval=60,
                channels=[],
                labels={"service": "api", "team": "platform"},
                annotations={"summary": "CPU 使用率过高", "runbook": "https://wiki/runbooks/cpu"},
                created_by="test_user"
            )
            self.log(f"Created alert rule: {rule1.name} (ID: {rule1.id})")

            # 创建异常检测告警规则
            rule2 = self.manager.create_alert_rule(
                tenant_id=self.tenant_id,
                name="内存异常检测",
                description="检测内存使用异常",
                rule_type=AlertRuleType.ANOMALY,
                severity=AlertSeverity.P2,
                metric="memory_usage_percent",
                condition=">",
                threshold=0.0,
                duration=600,
                evaluation_interval=300,
                channels=[],
                labels={"service": "database"},
                annotations={},
                created_by="test_user"
            )
            self.log(f"Created anomaly alert rule: {rule2.name} (ID: {rule2.id})")

            # 获取告警规则
            fetched_rule = self.manager.get_alert_rule(rule1.id)
            assert fetched_rule is not None
            assert fetched_rule.name == rule1.name
            self.log(f"Fetched alert rule: {fetched_rule.name}")

            # 列出租户的所有告警规则
            rules = self.manager.list_alert_rules(self.tenant_id)
            assert len(rules) >= 2
            self.log(f"Listed {len(rules)} alert rules for tenant")

            # 更新告警规则
            updated_rule = self.manager.update_alert_rule(
                rule1.id,
                threshold=85.0,
                description="更新后的描述"
            )
            assert updated_rule.threshold == 85.0
            self.log(f"Updated alert rule threshold to {updated_rule.threshold}")

            # 测试完成，清理
            self.manager.delete_alert_rule(rule1.id)
            self.manager.delete_alert_rule(rule2.id)
            self.log("Deleted test alert rules")

        except Exception as e:
            self.log(f"Alert rules test failed: {e}", success=False)

    def test_alert_channels(self):
        """测试告警渠道管理"""
        print("\n📢 Testing Alert Channels...")

        try:
            # 创建飞书告警渠道
            channel1 = self.manager.create_alert_channel(
                tenant_id=self.tenant_id,
                name="飞书告警",
                channel_type=AlertChannelType.FEISHU,
                config={
                    "webhook_url": "https://open.feishu.cn/open-apis/bot/v2/hook/test",
                    "secret": "test_secret"
                },
                severity_filter=["p0", "p1"]
            )
            self.log(f"Created Feishu channel: {channel1.name} (ID: {channel1.id})")

            # 创建钉钉告警渠道
            channel2 = self.manager.create_alert_channel(
                tenant_id=self.tenant_id,
                name="钉钉告警",
                channel_type=AlertChannelType.DINGTALK,
                config={
                    "webhook_url": "https://oapi.dingtalk.com/robot/send?access_token=test",
                    "secret": "test_secret"
                },
                severity_filter=["p0", "p1", "p2"]
            )
            self.log(f"Created DingTalk channel: {channel2.name} (ID: {channel2.id})")

            # 创建 Slack 告警渠道
            channel3 = self.manager.create_alert_channel(
                tenant_id=self.tenant_id,
                name="Slack 告警",
                channel_type=AlertChannelType.SLACK,
                config={
                    "webhook_url": "https://hooks.slack.com/services/test"
                },
                severity_filter=["p0", "p1", "p2", "p3"]
            )
            self.log(f"Created Slack channel: {channel3.name} (ID: {channel3.id})")

            # 获取告警渠道
            fetched_channel = self.manager.get_alert_channel(channel1.id)
            assert fetched_channel is not None
            assert fetched_channel.name == channel1.name
            self.log(f"Fetched alert channel: {fetched_channel.name}")

            # 列出租户的所有告警渠道
            channels = self.manager.list_alert_channels(self.tenant_id)
            assert len(channels) >= 3
            self.log(f"Listed {len(channels)} alert channels for tenant")

            # 清理
            for channel in channels:
                if channel.tenant_id == self.tenant_id:
                    with self.manager._get_db() as conn:
                        conn.execute("DELETE FROM alert_channels WHERE id = ?", (channel.id,))
                        conn.commit()
            self.log("Deleted test alert channels")

        except Exception as e:
            self.log(f"Alert channels test failed: {e}", success=False)

    def test_alerts(self):
        """测试告警管理"""
        print("\n🚨 Testing Alerts...")

        try:
            # 创建告警规则
            rule = self.manager.create_alert_rule(
                tenant_id=self.tenant_id,
                name="测试告警规则",
                description="用于测试的告警规则",
                rule_type=AlertRuleType.THRESHOLD,
                severity=AlertSeverity.P1,
                metric="test_metric",
                condition=">",
                threshold=100.0,
                duration=60,
                evaluation_interval=60,
                channels=[],
                labels={},
                annotations={},
                created_by="test_user"
            )

            # 记录资源指标
            for i in range(10):
                self.manager.record_resource_metric(
                    tenant_id=self.tenant_id,
                    resource_type=ResourceType.CPU,
                    resource_id="server-001",
                    metric_name="test_metric",
                    metric_value=110.0 + i,
                    unit="percent",
                    metadata={"region": "cn-north-1"}
                )
            self.log("Recorded 10 resource metrics")

            # 手动创建告警
            from ops_manager import Alert
            alert_id = f"test_alert_{datetime.now().strftime('%Y%m%d%H%M%S')}"
            now = datetime.now().isoformat()

            alert = Alert(
                id=alert_id,
                rule_id=rule.id,
                tenant_id=self.tenant_id,
                severity=AlertSeverity.P1,
                status=AlertStatus.FIRING,
                title="测试告警",
                description="这是一条测试告警",
                metric="test_metric",
                value=120.0,
                threshold=100.0,
                labels={"test": "true"},
                annotations={},
                started_at=now,
                resolved_at=None,
                acknowledged_by=None,
                acknowledged_at=None,
                notification_sent={},
                suppression_count=0
            )

            with self.manager._get_db() as conn:
                conn.execute("""
                    INSERT INTO alerts
                    (id, rule_id, tenant_id, severity, status, title, description,
                     metric, value, threshold, labels, annotations, started_at, notification_sent, suppression_count)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, (alert.id, alert.rule_id, alert.tenant_id, alert.severity.value,
                      alert.status.value, alert.title, alert.description,
                      alert.metric, alert.value, alert.threshold,
                      json.dumps(alert.labels), json.dumps(alert.annotations),
                      alert.started_at, json.dumps(alert.notification_sent), alert.suppression_count))
                conn.commit()

            self.log(f"Created test alert: {alert.id}")

            # 列出租户的告警
            alerts = self.manager.list_alerts(self.tenant_id)
            assert len(alerts) >= 1
            self.log(f"Listed {len(alerts)} alerts for tenant")

            # 确认告警
            self.manager.acknowledge_alert(alert_id, "test_user")
            fetched_alert = self.manager.get_alert(alert_id)
            assert fetched_alert.status == AlertStatus.ACKNOWLEDGED
            assert fetched_alert.acknowledged_by == "test_user"
            self.log(f"Acknowledged alert: {alert_id}")

            # 解决告警
            self.manager.resolve_alert(alert_id)
            fetched_alert = self.manager.get_alert(alert_id)
            assert fetched_alert.status == AlertStatus.RESOLVED
            assert fetched_alert.resolved_at is not None
            self.log(f"Resolved alert: {alert_id}")

            # 清理
            self.manager.delete_alert_rule(rule.id)
            with self.manager._get_db() as conn:
                conn.execute("DELETE FROM alerts WHERE id = ?", (alert_id,))
                conn.execute("DELETE FROM resource_metrics WHERE tenant_id = ?", (self.tenant_id,))
                conn.commit()
            self.log("Cleaned up test data")

        except Exception as e:
            self.log(f"Alerts test failed: {e}", success=False)

    def test_capacity_planning(self):
        """测试容量规划"""
        print("\n📊 Testing Capacity Planning...")

        try:
            # 记录历史指标数据
            import random
            base_time = datetime.now() - timedelta(days=30)
            for i in range(30):
                timestamp = (base_time + timedelta(days=i)).isoformat()
                with self.manager._get_db() as conn:
                    conn.execute("""
                        INSERT INTO resource_metrics
                        (id, tenant_id, resource_type, resource_id, metric_name, metric_value, unit, timestamp)
                        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                    """, (f"cm_{i}", self.tenant_id, ResourceType.CPU.value, "server-001",
                          "cpu_usage_percent", 50.0 + random.random() * 30, "percent", timestamp))
                    conn.commit()

            self.log("Recorded 30 days of historical metrics")

            # 创建容量规划
            prediction_date = (datetime.now() + timedelta(days=30)).strftime("%Y-%m-%d")
            plan = self.manager.create_capacity_plan(
                tenant_id=self.tenant_id,
                resource_type=ResourceType.CPU,
                current_capacity=100.0,
                prediction_date=prediction_date,
                confidence=0.85
            )

            self.log(f"Created capacity plan: {plan.id}")
            self.log(f"  Current capacity: {plan.current_capacity}")
            self.log(f"  Predicted capacity: {plan.predicted_capacity}")
            self.log(f"  Recommended action: {plan.recommended_action}")

            # 获取容量规划列表
            plans = self.manager.get_capacity_plans(self.tenant_id)
            assert len(plans) >= 1
            self.log(f"Listed {len(plans)} capacity plans")

            # 清理
            with self.manager._get_db() as conn:
                conn.execute("DELETE FROM capacity_plans WHERE tenant_id = ?", (self.tenant_id,))
                conn.execute("DELETE FROM resource_metrics WHERE tenant_id = ?", (self.tenant_id,))
                conn.commit()
            self.log("Cleaned up capacity planning test data")

        except Exception as e:
            self.log(f"Capacity planning test failed: {e}", success=False)

    def test_auto_scaling(self):
        """测试自动扩缩容"""
        print("\n⚖️ Testing Auto Scaling...")

        try:
            # 创建自动扩缩容策略
            policy = self.manager.create_auto_scaling_policy(
                tenant_id=self.tenant_id,
                name="API 服务自动扩缩容",
                resource_type=ResourceType.CPU,
                min_instances=2,
                max_instances=10,
                target_utilization=0.7,
                scale_up_threshold=0.8,
                scale_down_threshold=0.3,
                scale_up_step=2,
                scale_down_step=1,
                cooldown_period=300
            )

            self.log(f"Created auto scaling policy: {policy.name} (ID: {policy.id})")
            self.log(f"  Min instances: {policy.min_instances}")
            self.log(f"  Max instances: {policy.max_instances}")
            self.log(f"  Target utilization: {policy.target_utilization}")

            # 获取策略列表
            policies = self.manager.list_auto_scaling_policies(self.tenant_id)
            assert len(policies) >= 1
            self.log(f"Listed {len(policies)} auto scaling policies")

            # 模拟扩缩容评估
            event = self.manager.evaluate_scaling_policy(
                policy_id=policy.id,
                current_instances=3,
                current_utilization=0.85
            )

            if event:
                self.log(f"Scaling event triggered: {event.action.value}")
                self.log(f"  From {event.from_count} to {event.to_count} instances")
                self.log(f"  Reason: {event.reason}")
            else:
                self.log("No scaling action needed")

            # 获取扩缩容事件列表
            events = self.manager.list_scaling_events(self.tenant_id)
            self.log(f"Listed {len(events)} scaling events")

            # 清理
            with self.manager._get_db() as conn:
                conn.execute("DELETE FROM scaling_events WHERE tenant_id = ?", (self.tenant_id,))
                conn.execute("DELETE FROM auto_scaling_policies WHERE tenant_id = ?", (self.tenant_id,))
                conn.commit()
            self.log("Cleaned up auto scaling test data")

        except Exception as e:
            self.log(f"Auto scaling test failed: {e}", success=False)

    def test_health_checks(self):
        """测试健康检查"""
        print("\n💓 Testing Health Checks...")

        try:
            # 创建 HTTP 健康检查
            check1 = self.manager.create_health_check(
                tenant_id=self.tenant_id,
                name="API 服务健康检查",
                target_type="service",
                target_id="api-service",
                check_type="http",
                check_config={
                    "url": "https://api.insightflow.io/health",
                    "expected_status": 200
                },
                interval=60,
                timeout=10,
                retry_count=3
            )
            self.log(f"Created HTTP health check: {check1.name} (ID: {check1.id})")

            # 创建 TCP 健康检查
            check2 = self.manager.create_health_check(
                tenant_id=self.tenant_id,
                name="数据库健康检查",
                target_type="database",
                target_id="postgres-001",
                check_type="tcp",
                check_config={
                    "host": "db.insightflow.io",
                    "port": 5432
                },
                interval=30,
                timeout=5,
                retry_count=2
            )
            self.log(f"Created TCP health check: {check2.name} (ID: {check2.id})")

            # 获取健康检查列表
            checks = self.manager.list_health_checks(self.tenant_id)
            assert len(checks) >= 2
            self.log(f"Listed {len(checks)} health checks")

            # 执行健康检查（异步）
            async def run_health_check():
                result = await self.manager.execute_health_check(check1.id)
                return result

            # 由于健康检查需要网络，这里只验证方法存在
            self.log("Health check execution method verified")

            # 清理
            with self.manager._get_db() as conn:
                conn.execute("DELETE FROM health_checks WHERE tenant_id = ?", (self.tenant_id,))
                conn.commit()
            self.log("Cleaned up health check test data")

        except Exception as e:
            self.log(f"Health checks test failed: {e}", success=False)

    def test_failover(self):
        """测试故障转移"""
        print("\n🔄 Testing Failover...")

        try:
            # 创建故障转移配置
            config = self.manager.create_failover_config(
                tenant_id=self.tenant_id,
                name="主备数据中心故障转移",
                primary_region="cn-north-1",
                secondary_regions=["cn-south-1", "cn-east-1"],
                failover_trigger="health_check_failed",
                auto_failover=False,
                failover_timeout=300,
                health_check_id=None
            )

            self.log(f"Created failover config: {config.name} (ID: {config.id})")
            self.log(f"  Primary region: {config.primary_region}")
            self.log(f"  Secondary regions: {config.secondary_regions}")

            # 获取故障转移配置列表
            configs = self.manager.list_failover_configs(self.tenant_id)
            assert len(configs) >= 1
            self.log(f"Listed {len(configs)} failover configs")

            # 发起故障转移
            event = self.manager.initiate_failover(
                config_id=config.id,
                reason="Primary region health check failed"
            )

            if event:
                self.log(f"Initiated failover: {event.id}")
                self.log(f"  From: {event.from_region}")
                self.log(f"  To: {event.to_region}")

                # 更新故障转移状态
                self.manager.update_failover_status(event.id, "completed")
                updated_event = self.manager.get_failover_event(event.id)
                assert updated_event.status == "completed"
                self.log("Failover completed")

            # 获取故障转移事件列表
            events = self.manager.list_failover_events(self.tenant_id)
            self.log(f"Listed {len(events)} failover events")

            # 清理
            with self.manager._get_db() as conn:
                conn.execute("DELETE FROM failover_events WHERE tenant_id = ?", (self.tenant_id,))
                conn.execute("DELETE FROM failover_configs WHERE tenant_id = ?", (self.tenant_id,))
                conn.commit()
            self.log("Cleaned up failover test data")

        except Exception as e:
            self.log(f"Failover test failed: {e}", success=False)

    def test_backup(self):
        """测试备份与恢复"""
        print("\n💾 Testing Backup & Recovery...")

        try:
            # 创建备份任务
            job = self.manager.create_backup_job(
                tenant_id=self.tenant_id,
                name="每日数据库备份",
                backup_type="full",
                target_type="database",
                target_id="postgres-main",
                schedule="0 2 * * *",  # 每天凌晨2点
                retention_days=30,
                encryption_enabled=True,
                compression_enabled=True,
                storage_location="s3://insightflow-backups/"
            )

            self.log(f"Created backup job: {job.name} (ID: {job.id})")
            self.log(f"  Schedule: {job.schedule}")
            self.log(f"  Retention: {job.retention_days} days")

            # 获取备份任务列表
            jobs = self.manager.list_backup_jobs(self.tenant_id)
            assert len(jobs) >= 1
            self.log(f"Listed {len(jobs)} backup jobs")

            # 执行备份
            record = self.manager.execute_backup(job.id)

            if record:
                self.log(f"Executed backup: {record.id}")
                self.log(f"  Status: {record.status.value}")
                self.log(f"  Storage: {record.storage_path}")

                # 获取备份记录列表
                records = self.manager.list_backup_records(self.tenant_id)
                self.log(f"Listed {len(records)} backup records")

                # 测试恢复（模拟）
                restore_result = self.manager.restore_from_backup(record.id)
                self.log(f"Restore test result: {restore_result}")

            # 清理
            with self.manager._get_db() as conn:
                conn.execute("DELETE FROM backup_records WHERE tenant_id = ?", (self.tenant_id,))
                conn.execute("DELETE FROM backup_jobs WHERE tenant_id = ?", (self.tenant_id,))
                conn.commit()
            self.log("Cleaned up backup test data")

        except Exception as e:
            self.log(f"Backup test failed: {e}", success=False)

    def test_cost_optimization(self):
        """测试成本优化"""
        print("\n💰 Testing Cost Optimization...")

        try:
            # 记录资源利用率数据
            report_date = datetime.now().strftime("%Y-%m-%d")

            for i in range(5):
                self.manager.record_resource_utilization(
                    tenant_id=self.tenant_id,
                    resource_type=ResourceType.CPU,
                    resource_id=f"server-{i:03d}",
                    utilization_rate=0.05 + random.random() * 0.1,  # 低利用率
                    peak_utilization=0.15,
                    avg_utilization=0.08,
                    idle_time_percent=0.85,
                    report_date=report_date,
                    recommendations=["Consider downsizing this resource"]
                )

            self.log("Recorded 5 resource utilization records")

            # 生成成本报告
            now = datetime.now()
            report = self.manager.generate_cost_report(
                tenant_id=self.tenant_id,
                year=now.year,
                month=now.month
            )

            self.log(f"Generated cost report: {report.id}")
            self.log(f"  Period: {report.report_period}")
            self.log(f"  Total cost: {report.total_cost} {report.currency}")
            self.log(f"  Anomalies detected: {len(report.anomalies)}")

            # 检测闲置资源
            idle_resources = self.manager.detect_idle_resources(self.tenant_id)
            self.log(f"Detected {len(idle_resources)} idle resources")

            # 获取闲置资源列表
            idle_list = self.manager.get_idle_resources(self.tenant_id)
            for resource in idle_list:
                self.log(
                    f"  Idle resource: {
                        resource.resource_name} (est. cost: {
                        resource.estimated_monthly_cost}/month)")

            # 生成成本优化建议
            suggestions = self.manager.generate_cost_optimization_suggestions(self.tenant_id)
            self.log(f"Generated {len(suggestions)} cost optimization suggestions")

            for suggestion in suggestions:
                self.log(f"  Suggestion: {suggestion.title}")
                self.log(f"    Potential savings: {suggestion.potential_savings} {suggestion.currency}")
                self.log(f"    Confidence: {suggestion.confidence}")
                self.log(f"    Difficulty: {suggestion.difficulty}")

            # 获取优化建议列表
            all_suggestions = self.manager.get_cost_optimization_suggestions(self.tenant_id)
            self.log(f"Listed {len(all_suggestions)} optimization suggestions")

            # 应用优化建议
            if all_suggestions:
                applied = self.manager.apply_cost_optimization_suggestion(all_suggestions[0].id)
                if applied:
                    self.log(f"Applied optimization suggestion: {applied.title}")
                    assert applied.is_applied
                    assert applied.applied_at is not None

            # 清理
            with self.manager._get_db() as conn:
                conn.execute("DELETE FROM cost_optimization_suggestions WHERE tenant_id = ?", (self.tenant_id,))
                conn.execute("DELETE FROM idle_resources WHERE tenant_id = ?", (self.tenant_id,))
                conn.execute("DELETE FROM resource_utilizations WHERE tenant_id = ?", (self.tenant_id,))
                conn.execute("DELETE FROM cost_reports WHERE tenant_id = ?", (self.tenant_id,))
                conn.commit()
            self.log("Cleaned up cost optimization test data")

        except Exception as e:
            self.log(f"Cost optimization test failed: {e}", success=False)

    def print_summary(self):
        """打印测试总结"""
        print("\n" + "=" * 60)
        print("Test Summary")
        print("=" * 60)

        total = len(self.test_results)
        passed = sum(1 for _, success in self.test_results if success)
        failed = total - passed

        print(f"Total tests: {total}")
        print(f"Passed: {passed} ✅")
        print(f"Failed: {failed} ❌")

        if failed > 0:
            print("\nFailed tests:")
            for message, success in self.test_results:
                if not success:
                    print(f"  ❌ {message}")

        print("=" * 60)


def main():
    """主函数"""
    test = TestOpsManager()
    test.run_all_tests()


if __name__ == "__main__":
    main()