import json
import shutil
import subprocess
import sys
from dataclasses import asdict, dataclass
from datetime import datetime
from pathlib import Path

import click
from rich.console import Console
from rich.table import Table


@dataclass
class EvaluationResult:
    """Outcome of one evaluation run, bucketed into four disjoint categories.

    Each field holds the set of test-case paths that ended in that category.
    Entries loaded through ``from_dict`` / ``import_json`` are stored exactly
    as they appear in the input (plain strings when they come from JSON); no
    conversion to ``Path`` is performed.

    Raises:
        ValueError: on construction, if any path appears in more than one
            category.
    """

    success: set[Path]
    failure: set[Path]
    compile_error: set[Path]
    error: set[Path]

    def __post_init__(self):
        # A path may belong to at most one category. Intersecting all four
        # sets at once would only catch a path present in *every* category,
        # so track what has been seen and collect any repeat instead.
        seen = set()
        duplicated = set()
        for category in ('success', 'failure', 'compile_error', 'error'):
            paths = set(getattr(self, category))
            duplicated |= seen & paths
            seen |= paths
        if duplicated:
            sample = ', '.join(sorted(str(p) for p in duplicated)[:5])
            raise ValueError(
                'Malformed evaluation result. Paths appear in multiple categories. '
                f'({len(duplicated)} affected, e.g. {sample})'
            )

    @classmethod
    def from_dict(cls, data: dict[str, list[Path]]) -> 'EvaluationResult':
        """Build a result from a mapping of category name to list of paths.

        Missing categories are treated as empty.
        """
        return cls(
            success=set(data.get('success', [])),
            failure=set(data.get('failure', [])),
            compile_error=set(data.get('compile_error', [])),
            error=set(data.get('error', [])),
        )

    @classmethod
    def import_json(cls, json_path: Path) -> 'EvaluationResult':
        """Load a result from a JSON file shaped like ``export_json`` output."""
        with json_path.open("r", encoding="utf-8") as f:
            return cls.from_dict(json.load(f))

    def to_dict(self):
        """Return a ``{category: set_of_paths}`` dict (a copy, via ``asdict``)."""
        return asdict(self)

    def export_json(self, json_path: Path):
        """Write the result to ``json_path`` as JSON with sorted path lists.

        Entries are written as strings so that ``Path`` members, which
        ``json`` cannot serialize directly, are handled too.
        """
        jsonable_dict = {
            category: sorted(str(p) for p in getattr(self, category))
            for category in ('success', 'failure', 'compile_error', 'error')
        }
        with json_path.open("w", encoding="utf-8") as f:
            json.dump(jsonable_dict, f, indent=2)
# --- Constants and Configuration ---
# Project root is the parent directory of this script's location
PROJECT_ROOT = Path(__file__).parent.parent.resolve()
HARNESS_DIR = PROJECT_ROOT / ".eval_harness"
CACHE_DIR = HARNESS_DIR / "results_cache"
LOCAL_WORKSPACE = HARNESS_DIR / "local"
# Default value of the --python-version option.
SUPPORTED_PYTHON_VERSIONS = ('3.6', '3.7', '3.8', '3.9', '3.10', '3.11', '3.12', '3.13')

# Rich console for pretty printing
console = Console()


def _get_cache_path(commit_hash: str, eval_file_list_path: Path, python_version: str) -> Path:
    """Return the cache file for a (python version, commit, input file) triple.

    The parent directory is created as a side effect. Only the input file's
    *name* takes part in the key, so editing the input file in place does not
    invalidate an existing cache entry.
    """
    cache_path = CACHE_DIR / python_version / commit_hash / eval_file_list_path.with_suffix('.json').name
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    return cache_path


def _as_text(stream) -> str:
    """Return captured process output as ``str`` (``bytes`` when text=False)."""
    if isinstance(stream, bytes):
        return stream.decode(errors="replace")
    return stream or ""


def run_command(command, cwd=None, capture_output=False, text=True):
    """Run a command, exiting the program with status 1 if it fails.

    Args:
        command: Argument list (run directly) or a string (run through the
            shell). Prefer a list; never build a shell string from user input.
        cwd: Working directory for the child process.
        capture_output: Capture stdout/stderr instead of inheriting them.
        text: Decode captured output as text.

    Returns:
        The ``subprocess.CompletedProcess`` of the successful run.
    """
    # Local import: only needed on the error paths, and keeps this block
    # independent of the file's top-level import list.
    from rich.markup import escape

    try:
        return subprocess.run(
            command,
            cwd=cwd,
            check=True,
            capture_output=capture_output,
            text=text,
            shell=isinstance(command, str),
        )
    except subprocess.CalledProcessError as e:
        # Everything interpolated below is escaped so that brackets in the
        # command or its output are not interpreted as rich markup.
        console.print(f"[bold red]Error running command:[/bold red] {escape(str(command))}")
        console.print(f"[red]Return Code:[/red] {e.returncode}")
        # Show both streams: tools such as pip put progress on stdout and the
        # actual failure on stderr. Nothing is available when not captured.
        captured = "\n".join(part for part in (_as_text(e.stdout), _as_text(e.stderr)) if part)
        if captured:
            console.print(f"[red]Output:[/red]\n{escape(captured)}")
        sys.exit(1)
    except FileNotFoundError:
        # This error is more common on Windows if git isn't in the PATH
        cmd_name = command.split()[0] if isinstance(command, str) else command[0]
        console.print(
            f"[bold red]Error: Command not found.[/bold red] Is '{escape(str(cmd_name))}' in your system's PATH?"
        )
        sys.exit(1)


def _resolve_commit_hash(ref: str) -> str:
    """Resolve any git revision (hash, branch, tag, HEAD) to a short commit hash.

    Runs inside PROJECT_ROOT so the result does not depend on the caller's
    working directory. An unknown revision terminates the program through
    ``run_command``.
    """
    return run_command(
        ["git", "rev-parse", "--short", "--verify", f"{ref}^{{commit}}"],
        cwd=PROJECT_ROOT,
        capture_output=True,
    ).stdout.strip()


def get_head_commit_hash():
    """Gets the short hash of the current HEAD commit."""
    return _resolve_commit_hash("HEAD")


def setup_workspace(workspace_path: Path, version_name: str, commit_hash: str = '') -> tuple[Path, Path]:
    """Prepare a clean workspace (source tree + virtualenv) for an evaluation run.

    Args:
        workspace_path: Directory to (re)create; any existing content is removed.
        version_name: ``"local"`` copies the current project state; any other
            value exports ``commit_hash`` from git.
        commit_hash: Revision to export when ``version_name`` is not ``"local"``.

    Returns:
        ``(code_dir, venv_dir)`` inside ``workspace_path``.
    """
    console.print(f"\n[bold cyan]Setting up '{version_name}' workspace...[/bold cyan]")

    # Clean up previous workspace if it exists
    if workspace_path.exists():
        shutil.rmtree(workspace_path)
    workspace_path.mkdir(parents=True)

    code_dir = workspace_path / "code"
    venv_dir = workspace_path / "venv"

    # Handle OS-specific executable paths
    if sys.platform == "win32":
        pip_executable = venv_dir / "Scripts" / "pip.exe"
    else:
        pip_executable = venv_dir / "bin" / "pip"

    # 1. Get the source code
    if version_name == "local":
        console.print(" -> Copying current project state...")
        # Ignore git, the harness, and other noise
        shutil.copytree(
            PROJECT_ROOT,
            code_dir,
            ignore=shutil.ignore_patterns(".git", ".eval_harness", "__pycache__", "*.pyc", ".idea"),
        )
    else:
        console.print(f" -> Exporting code from {version_name} ({commit_hash})...")
        code_dir.mkdir()
        # Export with `git archive`, then unpack. Two list-argument commands
        # instead of a `git archive | tar` shell pipeline: nothing is
        # interpolated into a shell, paths with spaces work, and a failing
        # `git archive` is reported instead of being masked by tar's status.
        archive_path = workspace_path / "source.tar"
        run_command(
            ["git", "archive", "--format=tar", f"--output={archive_path}", commit_hash],
            cwd=PROJECT_ROOT,
        )
        run_command(["tar", "-xf", str(archive_path), "-C", str(code_dir)])
        archive_path.unlink()

    # 2. Create virtual environment
    console.print(f" -> Creating virtual environment at [italic]{venv_dir}[/italic]...")
    run_command([sys.executable, "-m", "venv", str(venv_dir)])

    # 3. Install dependencies
    console.print(" -> Installing project dependencies...")
    run_command([str(pip_executable), "install", "-e", "."], cwd=code_dir, capture_output=True)

    return code_dir, venv_dir


def run_evaluation(workspace_path: Path, venv_dir: Path, input_file: Path, python_version: str) -> EvaluationResult:
    """Runs the cflow.py evaluation script within a given workspace.

    The script is executed with the workspace's virtualenv interpreter and is
    expected to leave a ``results.json`` behind; the program exits with
    status 1 if it does not.
    """
    version_name = workspace_path.name
    console.print(f"\n[bold green]Running evaluation for '{version_name}' on Python {python_version}...[/bold green]")

    code_dir = workspace_path / "code"
    output_dir = workspace_path / "output" / python_version  # Use a sub-dir for version-specific output
    output_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): presumably cflow.py writes to <prefix>/<version>/<stem>_0/;
    # that layout is not visible from this file - confirm against cflow.py.
    results_file = output_dir / python_version / f"{input_file.stem}_0" / "results.json"

    # Clean previous results for this version if they exist
    if results_file.exists():
        results_file.unlink()

    cflow_script = code_dir / "dev_scripts" / "cflow.py"
    if sys.platform == "win32":
        python_executable = venv_dir / "Scripts" / "python.exe"
    else:
        python_executable = venv_dir / "bin" / "python"

    command = [
        str(python_executable),
        str(cflow_script),
        str(input_file),
        "--version",
        python_version,
        "--prefix",
        str(output_dir),
    ]
    run_command(command)

    if not results_file.exists():
        console.print(
            f"[bold red]Error:[/bold red] Evaluation for '{version_name}' finished but 'results.json' was not created."
        )
        sys.exit(1)

    return EvaluationResult.import_json(results_file)


def compare_and_report(
    commit_results: EvaluationResult,
    local_results: EvaluationResult,
    report_path: Path,
    compare_to_commit: str,
    python_version: str,
):
    """Compares two sets of results and prints a detailed report to console and a file.

    The report has three parts: a from/to movement matrix, the list of paths
    for every category change, and the paths present on only one side.
    """
    # Local import keeps this block independent of the top-level import list.
    from rich.markup import escape

    categories = ["success", "failure", "compile_error", "error"]

    def pretty(category: str) -> str:
        return category.replace('_', ' ').title()

    # path -> category lookup for each side
    commit_map = {path: cat for cat, paths in commit_results.to_dict().items() for path in paths}
    local_map = {path: cat for cat, paths in local_results.to_dict().items() for path in paths}
    all_paths = set(commit_map) | set(local_map)

    # Group once by (from, to); the matrix and the detail lists both read from
    # this instead of rescanning every path for every category pair.
    moves = {(src, dst): [] for src in categories for dst in categories}
    for path in all_paths:
        src, dst = commit_map.get(path), local_map.get(path)
        if src is not None and dst is not None:
            moves[(src, dst)].append(path)

    with report_path.open("w", encoding="utf-8") as f:
        report_console = Console(file=f)

        def emit(renderable):
            """Print the same renderable to the terminal and the report file."""
            console.print(renderable)
            report_console.print(renderable)

        title = f"[bold magenta]Evaluation Comparison Report (Python {python_version})[/bold magenta]"
        console.print(f"\n\n{title}")
        report_console.print(title)

        # 1. Movement Matrix
        table = Table(title="Evaluation Movement Matrix")
        table.add_column(f"From ({compare_to_commit})", justify="right", style="cyan", no_wrap=True)
        for category in categories:
            table.add_column(f"To (Local)\n{pretty(category)}", justify="center")

        for from_cat in categories:
            row = [pretty(from_cat)]
            for to_cat in categories:
                count = len(moves[(from_cat, to_cat)])
                if count == 0:
                    row.append("[bright_black]-[/bright_black]")
                    continue
                if from_cat == to_cat:
                    style = "blue"
                elif from_cat == "success":
                    style = "bold red"  # Regression from success
                elif to_cat == "success":
                    style = "bold green"  # Improvement to success
                else:
                    style = "tan"  # Side-move
                sign = '+' if from_cat != to_cat else ''
                row.append(f"[{style}]{sign}{count}[/{style}]")
            table.add_row(*row)
        emit(table)

        # 2. Detailed Deltas by Movement Category
        for from_cat in categories:
            for to_cat in categories:
                if from_cat == to_cat:
                    continue
                moved_paths = sorted(moves[(from_cat, to_cat)])
                if not moved_paths:
                    continue
                # Determine style and title
                if from_cat == "success":
                    style = "bold red"  # Regression
                elif to_cat == "success":
                    style = "bold green"  # Improvement
                else:
                    style = "bold yellow"  # Side-move
                emit(f"\n[{style}]{pretty(from_cat)} -> {pretty(to_cat)}[/{style}]")
                for p in moved_paths:
                    # Escape so brackets in a path are not parsed as markup.
                    emit(f"- {escape(str(p))}")

        # 3. New and Removed Items
        new_items = sorted(p for p in all_paths if commit_map.get(p) is None)
        removed_items = sorted(p for p in all_paths if local_map.get(p) is None)

        def print_list_section(title, items, format_func):
            if not items:
                return
            emit(f"\n{title}")
            for item in items:
                emit(format_func(item))

        print_list_section(
            "\n[bold blue]New Items[/bold blue]",
            new_items,
            lambda p: f"- {escape(str(p))} (Added as [cyan]{local_map.get(p)}[/cyan])",
        )
        print_list_section(
            "[bold gray50]Removed Items[/bold gray50]",
            removed_items,
            lambda p: f"- {escape(str(p))} (Removed from [cyan]{commit_map.get(p)}[/cyan])",
        )

    console.print(f"\n-> Comparison report saved to [italic]{report_path}[/italic]")


@click.command()
@click.option(
    '--input-file',
    required=True,
    type=click.Path(exists=True, dir_okay=False, resolve_path=True, path_type=Path),
    help='Path to the input file listing test cases.',
)
@click.option(
    '--python-version',
    'python_versions',
    multiple=True,
    type=str,
    help='Python version to evaluate. Can be specified multiple times. Defaults to all supported versions.',
    default=SUPPORTED_PYTHON_VERSIONS,
)
@click.option(
    '--compare-to-commit',
    type=str,
    help='The git commit hash to compare to. Defaults to HEAD.',
    default='HEAD',
)
@click.option(
    '--no-cache',
    is_flag=True,
    default=False,
    help='Force re-evaluation of the comparison commit for all specified Python versions.',
)
def main(input_file: Path, python_versions: tuple[str, ...], compare_to_commit: str, no_cache: bool):
    """
    An evaluation framework to compare the performance of the current project state
    against a previous git commit.
    """
    HARNESS_DIR.mkdir(exist_ok=True)
    run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Keep the revision as typed for display; use the resolved short hash for
    # cache keys and workspace names. Every revision goes through git, so
    # branch and tag names are not truncated or lowercased into something
    # invalid, and the same commit always maps to the same cache entry.
    commit_version = compare_to_commit
    if compare_to_commit.lower() == 'head':
        compare_to_commit = get_head_commit_hash()
        console.print(f"[bold green]Resolved HEAD to commit {compare_to_commit}.[/bold green]")
    else:
        compare_to_commit = _resolve_commit_hash(compare_to_commit)
        console.print(f"[bold green]Resolved {commit_version} to commit {compare_to_commit}.[/bold green]")

    COMMIT_WORKSPACE = HARNESS_DIR / compare_to_commit

    # Always setup the local workspace
    _, local_venv_dir = setup_workspace(LOCAL_WORKSPACE, "local")

    # Only setup the commit workspace on demand
    commit_venv_dir = None

    for python_version in python_versions:
        # "rule" is not a rich style token and made the whole style invalid
        # (heading rendered unstyled); use the valid bold + color style.
        console.print(f"\n[bold dark_orange]Processing Python Version: {python_version}[/bold dark_orange]")

        # --- Commit Evaluation ---
        cached_result_file = _get_cache_path(compare_to_commit, input_file, python_version)
        if not no_cache and cached_result_file.exists():
            console.print(
                f"[bold green]Using cached result for ({compare_to_commit}) on Python {python_version}...[/bold green]"
            )
            commit_results = EvaluationResult.import_json(cached_result_file)
        else:
            if commit_venv_dir is None:
                _, commit_venv_dir = setup_workspace(COMMIT_WORKSPACE, commit_version, compare_to_commit)
            commit_results = run_evaluation(COMMIT_WORKSPACE, commit_venv_dir, input_file, python_version)
            commit_results.export_json(cached_result_file)
            console.print(f"-> Caching result to [italic]{cached_result_file}[/italic]")

        # --- Local Evaluation ---
        local_results = run_evaluation(LOCAL_WORKSPACE, local_venv_dir, input_file, python_version)

        # --- Save Local Results Artifact ---
        # CACHE_DIR / python_version already exists: _get_cache_path created it.
        local_artifact_path = CACHE_DIR / python_version / f"local_results_{run_timestamp}.json"
        local_results.export_json(local_artifact_path)
        console.print(f"-> Local results saved to [italic]{local_artifact_path}[/italic]")

        # --- Comparison ---
        report_artifact_path = CACHE_DIR / python_version / f"comparison_report_{run_timestamp}.txt"
        compare_and_report(commit_results, local_results, report_artifact_path, compare_to_commit, python_version)

    # --- Final Cleanup ---
    # Reached only on success: a failing command exits via run_command and
    # leaves the workspaces on disk.
    console.print("\n[bold]Cleaning up workspaces...[/bold]")
    if commit_venv_dir is not None:
        shutil.rmtree(COMMIT_WORKSPACE)
    shutil.rmtree(LOCAL_WORKSPACE)
    console.print("Done.")


if __name__ == "__main__":
    main()