# insightflow/code_reviewer.py

#!/usr/bin/env python3
"""
InsightFlow 代码审查与自动修复脚本
"""

import ast
import re
from pathlib import Path


class CodeIssue:
    """One finding reported for a source file.

    Instances are plain records with public attributes: ``file_path``,
    ``line_no``, ``issue_type``, ``message``, ``severity`` and ``fixed``.
    ``fixed`` always starts as ``False``.
    """

    def __init__(self, file_path: str, line_no: int, issue_type: str, message: str, severity: str = "info"):
        """Store the location, category, description and severity of a finding."""
        self.file_path, self.line_no = file_path, line_no
        self.issue_type, self.message = issue_type, message
        # Expected values: "info", "warning" or "error".
        self.severity = severity
        self.fixed = False

    def __repr__(self):
        """Return ``SEVERITY: path:line - type: message``."""
        level = self.severity.upper()
        location = f"{self.file_path}:{self.line_no}"
        return f"{level}: {location} - {self.issue_type}: {self.message}"


class CodeReviewer:
    def __init__(self, base_path: str):
        self.base_path = Path(base_path)
        self.issues: list[CodeIssue] = []
        self.fixed_issues: list[CodeIssue] = []
        self.manual_review_issues: list[CodeIssue] = []

    def scan_all(self) -> None:
        """扫描所有 Python 文件"""
        for py_file in self.base_path.rglob("*.py"):
            if "__pycache__" in str(py_file):
                continue
            self.scan_file(py_file)

    def scan_file(self, file_path: Path) -> None:
        """扫描单个文件"""
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
                lines = content.split("\n")
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
            return

        rel_path = str(file_path.relative_to(self.base_path))

        # 1. 检查裸异常捕获
        self._check_bare_exceptions(content, lines, rel_path)

        # 2. 检查重复导入
        self._check_duplicate_imports(content, lines, rel_path)

        # 3. 检查 PEP8 问题
        self._check_pep8_issues(content, lines, rel_path)

        # 4. 检查未使用的导入
        self._check_unused_imports(content, lines, rel_path)

        # 5. 检查混合字符串格式化
        self._check_string_formatting(content, lines, rel_path)

        # 6. 检查魔法数字
        self._check_magic_numbers(content, lines, rel_path)

        # 7. 检查 SQL 注入风险
        self._check_sql_injection(content, lines, rel_path)

        # 8. 检查 CORS 配置
        self._check_cors_config(content, lines, rel_path)

        # 9. 检查敏感信息
        self._check_sensitive_info(content, lines, rel_path)

    def _check_bare_exceptions(self, content: str, lines: list[str], file_path: str) -> None:
        """Record a warning for every ``except:`` / ``except Exception:`` clause.

        Only the code part of each line (the text before the first ``#``) is
        matched. A clause followed by a trailing comment is therefore still
        detected, and a comment that merely ends in "except:" is not. Lines
        carrying a ``# noqa`` or ``# intentional`` marker are skipped.

        Args:
            content: Full text of the file (not used by this check).
            lines: The file split into lines; reported line numbers are 1-based.
            file_path: Path stored on each recorded issue.
        """
        clause = re.compile(r"except(?:\s+Exception)?\s*:")

        for i, line in enumerate(lines, 1):
            # An except clause contains no string literal, so the first '#'
            # always starts a comment.
            code = line.split("#", 1)[0].strip()
            if not clause.fullmatch(code):
                continue
            # Explicit opt-out markers left by the author.
            if "# noqa" in line or "# intentional" in line.lower():
                continue
            issue = CodeIssue(file_path, i, "bare_exception", "裸异常捕获，应该使用具体异常类型", "warning")
            self.issues.append(issue)

    def _check_duplicate_imports(self, content: str, lines: list[str], file_path: str) -> None:
        """Record a warning when the same name is imported more than once.

        The check is line based: each line that is an ``import ...`` or
        ``from ... import ...`` statement contributes the names written on
        that line. Trailing comments are ignored, and parentheses, line
        continuations and empty fragments (such as the one after a trailing
        comma) are not treated as names. Names on the continuation lines of a
        multi-line import are not inspected.

        Args:
            content: Full text of the file (not used by this check).
            lines: The file split into lines; reported line numbers are 1-based.
            file_path: Path stored on each recorded issue.
        """
        seen: dict[str, int] = {}
        for i, line in enumerate(lines, 1):
            # Import statements hold no string literals, so '#' starts a comment.
            code = line.split("#", 1)[0].strip()
            match = re.match(r"^(?:from\s+(\S+)\s+)?import\s+(.+)$", code)
            if not match:
                continue

            module = match.group(1) or ""
            for chunk in match.group(2).split(","):
                # Drop grouping parentheses and backslash continuations; what
                # remains is "name" or "name as alias" (or nothing at all).
                tokens = chunk.strip("() \t\\").split()
                if not tokens:
                    continue
                name = tokens[0]
                key = f"{module}.{name}" if module else name
                if key in seen:
                    issue = CodeIssue(file_path, i, "duplicate_import", f"重复导入: {key}", "warning")
                    self.issues.append(issue)
                seen[key] = i

    def _check_pep8_issues(self, content: str, lines: list[str], file_path: str) -> None:
        """Record layout findings: long lines, trailing whitespace, extra blank lines.

        A line is "too long" above 120 characters. A blank line counts as
        extra when both the line before it and the line after it are blank.
        """
        total = len(lines)

        def blank(index: int) -> bool:
            return lines[index].strip() == ""

        for i, line in enumerate(lines, 1):
            width = len(line)
            if width > 120:
                self.issues.append(
                    CodeIssue(file_path, i, "line_too_long", f"行长度 {width} 超过 120 字符", "info")
                )

            if line != line.rstrip():
                self.issues.append(CodeIssue(file_path, i, "trailing_whitespace", "行尾有空格", "info"))

            # i is 1-based, so in 0-based terms the previous line is i - 2,
            # the current one is i - 1 and the next one is i.
            if 1 < i < total and blank(i - 1) and blank(i - 2) and blank(i):
                self.issues.append(CodeIssue(file_path, i, "extra_blank_line", "多余的空行", "info"))

    def _check_unused_imports(self, content: str, lines: list[str], file_path: str) -> None:
        """Record an info finding for each imported name that is never referenced.

        The file is parsed with ``ast``; files that do not parse are skipped.
        A name counts as used when it appears as an ``ast.Name`` node anywhere
        in the module. For ``import a.b.c`` without an alias the name bound in
        the module is ``a``, so usage is looked up under ``a`` while the
        finding still reports the full dotted name.

        Names starting with an underscore, ``annotations`` and
        ``TYPE_CHECKING`` are never reported.

        Args:
            content: Full text of the file.
            lines: The file split into lines (not used by this check).
            file_path: Path stored on each recorded issue.
        """
        try:
            tree = ast.parse(content)
        except (SyntaxError, ValueError):
            # ValueError: some Python versions raise it for source with null bytes.
            return

        # reported name -> (name actually bound in the module, line number)
        imported_names: dict[str, tuple[str, int]] = {}
        used_names = set()

        for node in ast.walk(tree):
            if isinstance(node, ast.Import):
                for alias in node.names:
                    name = alias.asname if alias.asname else alias.name
                    # `import os.path` binds `os`; `import os.path as p` binds `p`.
                    bound = alias.asname if alias.asname else alias.name.split(".")[0]
                    imported_names[name] = (bound, node.lineno)
            elif isinstance(node, ast.ImportFrom):
                for alias in node.names:
                    name = alias.asname if alias.asname else alias.name
                    if name != "*":
                        imported_names[name] = (name, node.lineno)
            elif isinstance(node, ast.Name):
                used_names.add(node.id)

        for name, (bound, lineno) in imported_names.items():
            if bound in used_names or name.startswith("_"):
                continue
            # Common imports that exist for their side effect or for typing only.
            if name in ["annotations", "TYPE_CHECKING"]:
                continue
            issue = CodeIssue(file_path, lineno, "unused_import", f"未使用的导入: {name}", "info")
            self.issues.append(issue)

    def _check_string_formatting(self, content: str, lines: list[str], file_path: str) -> None:
        """Record one file-level finding when f-strings are mixed with other styles.

        The check is a per-line text heuristic:

        * an f-string is an ``f`` / ``rf`` / ``fr`` prefix (any case) directly
          before a quote and not preceded by a word character, so a plain
          string that merely ends in "f" (``"pdf"``) is not counted;
        * %-formatting is a ``%s``/``%d``/``%f``/``%r`` on a line that does
          not also contain digits followed by ``%``;
        * ``str.format`` is the text ``.format(``.

        The finding is attached to line 0 because it concerns the whole file.

        Args:
            content: Full text of the file (not used by this check).
            lines: The file split into lines.
            file_path: Path stored on the recorded issue.
        """
        fstring_prefix = re.compile(r'(?<!\w)(?:[fF][rR]?|[rR][fF])["\']')

        has_fstring = False
        has_percent = False
        has_format = False

        for line in lines:
            if fstring_prefix.search(line):
                has_fstring = True
            if re.search(r"%[sdfr]", line) and not re.search(r"\d+%", line):
                has_percent = True
            if ".format(" in line:
                has_format = True

        if has_fstring and (has_percent or has_format):
            issue = CodeIssue(file_path, 0, "mixed_formatting", "文件混合使用多种字符串格式化方式，建议统一为 f-string", "info")
            self.issues.append(issue)

    def _check_magic_numbers(self, content: str, lines: list[str], file_path: str) -> None:
        """Record an info finding for numeric literals that look like magic numbers.

        Each line is cut at the first ``#`` and matched against four patterns:
        ``timeout = N``, ``limit = N``, ``port = N`` (case-insensitive) and a
        generic ``= N`` for numbers of three or more digits that are not
        followed by a colon. The number reported is the one the pattern
        itself captured. A number is skipped when it has fewer than three
        digits, when it is in the allow-list of common values, or when it was
        already reported for the same line.

        Args:
            content: Full text of the file (not used by this check).
            lines: The file split into lines; reported line numbers are 1-based.
            file_path: Path stored on each recorded issue.
        """
        # Values common enough (HTTP statuses, sizes, ports) to be left alone.
        allowed = {200, 404, 500, 401, 403, 429, 1000, 1024, 2048, 4096, 8080, 3000, 8000}

        # Specific patterns come first so their message wins over the generic one.
        magic_patterns = [
            (re.compile(r"timeout\s*=\s*(\d+)", re.IGNORECASE), "timeout 魔法数字"),
            (re.compile(r"limit\s*=\s*(\d+)", re.IGNORECASE), "limit 魔法数字"),
            (re.compile(r"port\s*=\s*(\d+)", re.IGNORECASE), "port 魔法数字"),
            # (?!\d) stops the engine from backtracking to a shorter prefix of
            # the number; (?!\s*:) leaves out values directly followed by ':'.
            (re.compile(r"=\s*(\d{3,})(?!\d)(?!\s*:)"), "可能的魔法数字"),
        ]

        for i, line in enumerate(lines, 1):
            # Text heuristic: everything after the first '#' is treated as a comment.
            code_part = line.split("#")[0]
            if not code_part.strip():
                continue

            reported = set()
            for pattern, msg in magic_patterns:
                for match in pattern.finditer(code_part):
                    digits = match.group(1)
                    if len(digits) < 3:
                        continue
                    num = int(digits)
                    if num in allowed or num in reported:
                        continue
                    reported.add(num)
                    issue = CodeIssue(file_path, i, "magic_number", f"{msg}: {num}", "info")
                    self.issues.append(issue)

    def _check_sql_injection(self, content: str, lines: list[str], file_path: str) -> None:
        """Queue ``execute(...)`` calls with string-built SQL for manual review.

        A line is queued when either of these holds:

        * the first argument of ``execute(`` is an f-string and the line
          contains a ``{`` (an interpolated value);
        * the first argument is a string literal or f-string, the line
          contains ``%s`` and contains no ``?`` placeholder.

        This is a single-line text heuristic; findings go to
        ``manual_review_issues`` because a human has to confirm them.

        Args:
            content: Full text of the file (not used by this check).
            lines: The file split into lines; reported line numbers are 1-based.
            file_path: Path stored on each recorded issue.
        """
        fstring_call = re.compile(r'execute\s*\(\s*(?:[fF][rR]?|[rR][fF])["\']')
        percent_call = re.compile(r'execute\s*\(\s*["\'].*%s')

        for i, line in enumerate(lines, 1):
            uses_fstring = bool(fstring_call.search(line))
            # %s inside the SQL text with no '?' placeholder anywhere on the line.
            percent_style = (
                (uses_fstring or bool(percent_call.search(line)))
                and "%s" in line
                and "?" not in line
            )
            # An f-string only matters when it actually interpolates something.
            interpolated = uses_fstring and "{" in line

            if percent_style or interpolated:
                issue = CodeIssue(file_path, i, "sql_injection_risk", "可能的 SQL 注入风险 - 需要人工确认", "error")
                self.manual_review_issues.append(issue)

    def _check_cors_config(self, content: str, lines: list[str], file_path: str) -> None:
        """Queue wildcard ``allow_origins`` settings for manual review.

        A line is queued when it contains ``allow_origins`` together with a
        one-element list holding ``*``, written with either quote style and
        optional spaces inside the brackets (``["*"]``, ``['*']``, ``[ "*" ]``).

        Args:
            content: Full text of the file (not used by this check).
            lines: The file split into lines; reported line numbers are 1-based.
            file_path: Path stored on each recorded issue.
        """
        wildcard_list = re.compile(r'\[\s*["\']\*["\']\s*\]')

        for i, line in enumerate(lines, 1):
            if "allow_origins" in line and wildcard_list.search(line):
                issue = CodeIssue(file_path, i, "cors_wildcard", "CORS 允许所有来源 - 需要人工确认", "warning")
                self.manual_review_issues.append(issue)

    def _check_sensitive_info(self, content: str, lines: list[str], file_path: str) -> None:
        """Queue assignments that look like hard-coded secrets for manual review.

        A line qualifies when a name ending in password/secret/key/token
        (case-insensitive) is assigned a non-empty quoted string, unless the
        line mentions an environment lookup or the line contains a quoted
        placeholder (only asterisks, or text wrapped in ``<...>``).
        """
        assignment = r'(password|secret|key|token)\s*=\s*["\'][^"\']+["\']'
        env_markers = ("os.getenv", "environ", "getenv")
        placeholders = (r'["\']\*+["\']', r'["\']<[^"\']*>["\']')

        for i, line in enumerate(lines, 1):
            if not re.search(assignment, line, re.IGNORECASE):
                continue
            # Values read from the environment are not hard-coded.
            if any(marker in line for marker in env_markers):
                continue
            # Masked or template values are common false positives.
            if any(re.search(pattern, line) for pattern in placeholders):
                continue
            self.manual_review_issues.append(
                CodeIssue(file_path, i, "hardcoded_secret", "可能的硬编码敏感信息 - 需要人工确认", "error")
            )

    def auto_fix(self) -> None:
        """自动修复问题"""
        # 按文件分组问题
        issues_by_file: dict[str, list[CodeIssue]] = {}
        for issue in self.issues:
            if issue.file_path not in issues_by_file:
                issues_by_file[issue.file_path] = []
            issues_by_file[issue.file_path].append(issue)

        for file_path, issues in issues_by_file.items():
            full_path = self.base_path / file_path
            if not full_path.exists():
                continue

            try:
                with open(full_path, "r", encoding="utf-8") as f:
                    content = f.read()
                    lines = content.split("\n")
            except Exception as e:
                print(f"Error reading {full_path}: {e}")
                continue

            original_lines = lines.copy()

            # 修复行尾空格
            for issue in issues:
                if issue.issue_type == "trailing_whitespace":
                    idx = issue.line_no - 1
                    if 0 <= idx < len(lines):
                        lines[idx] = lines[idx].rstrip()
                        issue.fixed = True

            # 修复裸异常
            for issue in issues:
                if issue.issue_type == "bare_exception":
                    idx = issue.line_no - 1
                    if 0 <= idx < len(lines):
                        line = lines[idx]
                        # 将 except: 改为 except Exception:
                        if re.search(r"except\s*:\s*$", line.strip()):
                            lines[idx] = line.replace("except:", "except Exception:")
                            issue.fixed = True
                        elif re.search(r"except\s+Exception\s*:\s*$", line.strip()):
                            # 已经是 Exception，但可能需要更具体
                            pass

            # 如果文件有修改，写回
            if lines != original_lines:
                with open(full_path, "w", encoding="utf-8") as f:
                    f.write("\n".join(lines))
                print(f"Fixed issues in {file_path}")

        # 移动到已修复列表
        self.fixed_issues = [i for i in self.issues if i.fixed]
        self.issues = [i for i in self.issues if not i.fixed]

    def generate_report(self) -> str:
        """生成审查报告"""
        report = []
        report.append("# InsightFlow 代码审查报告")
        report.append(f"\n扫描路径: {self.base_path}")
        report.append(f"扫描时间: {__import__('datetime').datetime.now().isoformat()}")
        report.append("\n## 已自动修复的问题\n")

        if self.fixed_issues:
            report.append(f"共修复 {len(self.fixed_issues)} 个问题:\n")
            for issue in self.fixed_issues:
                report.append(f"- ✅ {issue.file_path}:{issue.line_no} - {issue.issue_type}: {issue.message}")
        else:
            report.append("无")

        report.append("\n## 需要人工确认的问题\n")
        if self.manual_review_issues:
            report.append(f"共发现 {len(self.manual_review_issues)} 个问题:\n")
            for issue in self.manual_review_issues:
                report.append(f"- ⚠️ {issue.file_path}:{issue.line_no} - {issue.issue_type}: {issue.message}")
        else:
            report.append("无")

        report.append("\n## 建议手动修复的问题\n")
        if self.issues:
            report.append(f"共发现 {len(self.issues)} 个问题:\n")
            for issue in self.issues:
                report.append(f"- 📝 {issue.file_path}:{issue.line_no} - {issue.issue_type}: {issue.message}")
        else:
            report.append("无")

        return "\n".join(report)


def main(base_path: str = "/root/.openclaw/workspace/projects/insightflow/backend") -> "CodeReviewer":
    """Scan a source tree, apply the automatic fixes and write the report.

    Progress is printed to stdout. The report is written as
    ``CODE_REVIEW_REPORT.md`` into the parent directory of ``base_path``.

    Args:
        base_path: Root directory to scan. Defaults to the InsightFlow
            backend checkout this script was written for.

    Returns:
        The ``CodeReviewer`` holding the findings after auto-fixing.
    """
    reviewer = CodeReviewer(base_path)

    print("开始扫描代码...")
    reviewer.scan_all()

    print(f"发现 {len(reviewer.issues)} 个可自动修复问题")
    print(f"发现 {len(reviewer.manual_review_issues)} 个需要人工确认的问题")

    print("\n开始自动修复...")
    reviewer.auto_fix()

    print(f"\n已修复 {len(reviewer.fixed_issues)} 个问题")

    # Write the report next to (not inside) the scanned directory.
    report = reviewer.generate_report()
    report_path = Path(base_path).parent / "CODE_REVIEW_REPORT.md"
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(report)
    print(f"\n报告已保存到: {report_path}")

    return reviewer


# Run the review only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()