From a142dc69f124c7ce58ffa2e284fcf765a7f6cb92 Mon Sep 17 00:00:00 2001
From: Josh Wiedemeier
Date: Fri, 29 Aug 2025 14:12:55 -0500
Subject: [PATCH] cflow eval script improvements

---
 dev_scripts/evaluate_cflow.py | 249 ++++++++++++++++++++++++----------
 1 file changed, 174 insertions(+), 75 deletions(-)

diff --git a/dev_scripts/evaluate_cflow.py b/dev_scripts/evaluate_cflow.py
index 2a7d77f..72ab1ff 100644
--- a/dev_scripts/evaluate_cflow.py
+++ b/dev_scripts/evaluate_cflow.py
@@ -8,17 +8,62 @@ from rich.console import Console
 from rich.table import Table
 from datetime import datetime
+from dataclasses import dataclass, asdict
+
+
+@dataclass
+class EvaluationResult:
+    success: set[Path]
+    failure: set[Path]
+    compile_error: set[Path]
+    error: set[Path]
+
+    @classmethod
+    def from_dict(cls, data: dict[str, list[str]]) -> 'EvaluationResult':
+        return cls(
+            success = set(Path(p) for p in data.get('success', [])),
+            failure = set(Path(p) for p in data.get('failure', [])),
+            compile_error = set(Path(p) for p in data.get('compile_error', [])),
+            error = set(Path(p) for p in data.get('error', [])),
+        )
+
+    @classmethod
+    def import_json(cls, json_path: Path) -> 'EvaluationResult':
+        with json_path.open("r") as f:
+            return cls.from_dict(json.load(f))
+
+    def to_dict(self):
+        return asdict(self)
+
+    def export_json(self, json_path: Path):
+        # Convert Path objects back to strings so the dict is JSON-serializable
+        jsonable_dict = {
+            'success': sorted(str(p) for p in self.success),
+            'failure': sorted(str(p) for p in self.failure),
+            'compile_error': sorted(str(p) for p in self.compile_error),
+            'error': sorted(str(p) for p in self.error),
+        }
+        with json_path.open("w") as f:
+            json.dump(jsonable_dict, f, indent=2)
+
+    def __post_init__(self):
+        # Every path must appear in exactly one category
+        all_paths = [*self.success, *self.failure, *self.compile_error, *self.error]
+        assert len(all_paths) == len(set(all_paths)), 'Malformed evaluation result. Paths appear in multiple categories.'
+
+
 # --- Constants and Configuration ---
 # Project root is the parent directory of this script's location
 PROJECT_ROOT = Path(__file__).parent.parent.resolve()
 HARNESS_DIR = PROJECT_ROOT / ".eval_harness"
 CACHE_DIR = HARNESS_DIR / "results_cache"
-HEAD_WORKSPACE = HARNESS_DIR / "head"
 LOCAL_WORKSPACE = HARNESS_DIR / "local"
+SUPPORTED_PYTHON_VERSIONS = ('3.6', '3.7', '3.8', '3.9', '3.10', '3.11', '3.12', '3.13')
+
 # Rich console for pretty printing
 console = Console()

+def _get_cache_path(commit_hash: str, eval_file_list_path: Path, python_version: str) -> Path:
+    cache_path = CACHE_DIR / python_version / commit_hash / eval_file_list_path.with_suffix('.json').name
+    cache_path.parent.mkdir(parents=True, exist_ok=True)
+    return cache_path

 def run_command(command, cwd=None, capture_output=False, text=True):
     """A helper to run a shell command and handle errors."""
@@ -51,7 +96,7 @@ def get_head_commit_hash():
     return run_command(["git", "rev-parse", "--short", "HEAD"], capture_output=True).stdout.strip()


-def setup_workspace(workspace_path: Path, version_name: str, head_commit_hash: str = None):
+def setup_workspace(workspace_path: Path, version_name: str, commit_hash: str = ''):
     """Prepares a clean workspace for an evaluation run."""
     console.print(f"\n[bold cyan]Setting up '{version_name}' workspace...[/bold cyan]")

@@ -66,13 +111,7 @@ def setup_workspace(workspace_path: Path, version_name: str, head_commit_hash: s
     pip_executable = venv_dir / "Scripts" / "pip.exe" if sys.platform == "win32" else venv_dir / "bin" / "pip"

     # 1. Get the source code
-    if version_name == "head":
-        console.print(f" -> Exporting code from HEAD ({head_commit_hash})...")
-        code_dir.mkdir()
-        # Using git archive is a clean way to export the repo content
-        git_archive_command = f"git archive {head_commit_hash} | tar -x -C {code_dir}"
-        run_command(git_archive_command, cwd=PROJECT_ROOT)
-    else: # "local"
+    if version_name == "local":
         console.print(" -> Copying current project state...")
         # Ignore git, the harness, and other noise
         shutil.copytree(
@@ -80,6 +119,12 @@ def setup_workspace(workspace_path: Path, version_name: str, head_commit_hash: s
             code_dir,
             ignore=shutil.ignore_patterns(".git", ".eval_harness", "__pycache__", "*.pyc", ".idea"),
         )
+    else:
+        console.print(f" -> Exporting code from {version_name} ({commit_hash})...")
+        code_dir.mkdir()
+        # Using git archive is a clean way to export the repo content
+        git_archive_command = f"git archive {commit_hash} | tar -x -C {code_dir}"
+        run_command(git_archive_command, cwd=PROJECT_ROOT)

     # 2. Create virtual environment
     console.print(f" -> Creating virtual environment at [italic]{venv_dir}[/italic]...")
@@ -92,7 +137,7 @@ def setup_workspace(workspace_path: Path, version_name: str, head_commit_hash: s
     return code_dir, venv_dir


-def run_evaluation(workspace_path: Path, venv_dir: Path, input_file: Path, python_version: str):
+def run_evaluation(workspace_path: Path, venv_dir: Path, input_file: Path, python_version: str) -> EvaluationResult:
     """Runs the cflow.py evaluation script within a given workspace."""
     version_name = workspace_path.name
     console.print(f"\n[bold green]Running evaluation for '{version_name}' on Python {python_version}...[/bold green]")
@@ -125,126 +170,180 @@ def run_evaluation(workspace_path: Path, venv_dir: Path, input_file: Path, pytho
         console.print(f"[bold red]Error:[/bold red] Evaluation for '{version_name}' finished but 'results.json' was not created.")
         sys.exit(1)

-    with open(results_file) as f:
-        return json.load(f)
+    return EvaluationResult.import_json(results_file)

-
-def compare_and_report(head_results, local_results, report_path: Path):
+def compare_and_report(commit_results: EvaluationResult, local_results: EvaluationResult, report_path: Path, compare_to_commit: str):
     """Compares two sets of results and prints a detailed report to console and a file."""
-    # Setup a new console to capture the report text to a file
     with report_path.open("w", encoding="utf-8") as f:
         report_console = Console(file=f, width=120, record=True)

         title = "[bold magenta]Evaluation Comparison Report[/bold magenta]"
-        console.print("\n\n" + title)
+        console.print(f"\n\n{title}")
         report_console.print(title)

-        # 1. Summary Table
-        table = Table(title="Comparison Summary")
-        table.add_column("Category", justify="right", style="cyan", no_wrap=True)
-        table.add_column("HEAD", justify="center", style="green")
-        table.add_column("Local", justify="center", style="yellow")
-        table.add_column("Change", justify="center")
+        categories = ["success", "failure", "compile_error", "error"]
+        commit_dict = commit_results.to_dict()
+        local_dict = local_results.to_dict()

-        categories = sorted(list(set(list(head_results.keys()) + list(local_results.keys()))))
-        for cat in categories:
-            head_count = len(head_results.get(cat, []))
-            local_count = len(local_results.get(cat, []))
-            change = local_count - head_count
-            change_str = f"[red]+{change}[/red]" if change > 0 else f"[green]{change}[/green]" if change < 0 else "0"
-            table.add_row(cat.replace("_", " ").title(), str(head_count), str(local_count), change_str)
+        # 1. Movement Matrix
+        table = Table(title="Evaluation Movement Matrix")
+        table.add_column(f"From ({compare_to_commit})", justify="right", style="cyan", no_wrap=True)
+        for category in categories:
+            table.add_column(
+                f"To (Local)\n{category.replace('_', ' ').title()}",
+                justify="center",
+            )
+
+        commit_map = {path: cat for cat, paths in commit_dict.items() for path in paths}
+        local_map = {path: cat for cat, paths in local_dict.items() for path in paths}
+        all_paths = set(commit_map.keys()) | set(local_map.keys())
+
+        movement_matrix = {cat: {cat2: 0 for cat2 in categories} for cat in categories}
+        for path in all_paths:
+            from_cat = commit_map.get(path)
+            to_cat = local_map.get(path)
+            if from_cat and to_cat:
+                movement_matrix[from_cat][to_cat] += 1
+
+        for from_cat in categories:
+            row = [from_cat.replace("_", " ").title()]
+            for to_cat in categories:
+                count = movement_matrix[from_cat][to_cat]
+                if count == 0:
+                    row.append("[bright_black]-[/bright_black]")
+                    continue
+
+                if from_cat == to_cat:
+                    style = "blue"
+                elif from_cat == "success":
+                    style = "bold red"  # Regression from success
+                elif to_cat == "success":
+                    style = "bold green"  # Improvement to success
+                else:
+                    style = "tan"  # Side-move
+                row.append(f"[{style}]{'+' if from_cat != to_cat else ''}{count}[/{style}]")
+            table.add_row(*row)

         console.print(table)
         report_console.print(table)

-        # 2. Detailed Deltas
-        head_map = {path: cat for cat, paths in head_results.items() for path in paths}
-        local_map = {path: cat for cat, paths in local_results.items() for path in paths}
-        all_paths = set(head_map.keys()) | set(local_map.keys())
+        # 2. Detailed Deltas by Movement Category
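+        # For each ordered pair of distinct categories, list the paths that moved
+        # from that category in the comparison commit to the other category locally.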
+        for from_cat in categories:
+            for to_cat in categories:
+                if from_cat == to_cat:
+                    continue

-        regressions = sorted([(path, local_map.get(path)) for path in all_paths if head_map.get(path) == "success" and local_map.get(path) != "success"])
-        improvements = sorted([(path, head_map.get(path)) for path in all_paths if local_map.get(path) == "success" and head_map.get(path) != "success"])
-        side_moves = sorted([(path, head_map.get(path), local_map.get(path)) for path in all_paths if head_map.get(path) != local_map.get(path) and "success" not in [head_map.get(path), local_map.get(path)]])
+                moved_paths = sorted([
+                    p for p in all_paths
+                    if commit_map.get(p) == from_cat and local_map.get(p) == to_cat
+                ])

-        def print_section(title, items, format_func):
+                if not moved_paths:
+                    continue
+
+                # Determine style and title
+                if from_cat == "success":
+                    style = "bold red"  # Regression
+                elif to_cat == "success":
+                    style = "bold green"  # Improvement
+                else:
+                    style = "bold yellow"  # Side-move
+
+                title = f"[{style}]{from_cat.replace('_', ' ').title()} -> {to_cat.replace('_', ' ').title()}[/{style}]"
+                console.print(f"\n{title}")
+                report_console.print(f"\n{title}")
+                for p in moved_paths:
+                    console.print(f"- {p}")
+                    report_console.print(f"- {p}")
+
+        # 3. New and Removed Items
+        new_items = sorted([p for p in all_paths if commit_map.get(p) is None])
+        removed_items = sorted([p for p in all_paths if local_map.get(p) is None])
+
+        def print_list_section(title, items, format_func):
             if items:
                 console.print(f"\n{title}")
                 report_console.print(f"\n{title}")
                 for item in items:
-                    line = format_func(*item)
+                    line = format_func(item)
                     console.print(line)
                     report_console.print(line)
+
+        print_list_section(
+            "[bold blue]New Items[/bold blue]",
+            new_items,
+            lambda p: f"- {p} (Added as [cyan]{local_map.get(p)}[/cyan])",
+        )
+
+        print_list_section(
+            "[bold gray50]Removed Items[/bold gray50]",
+            removed_items,
+            lambda p: f"- {p} (Removed from [cyan]{commit_map.get(p)}[/cyan])",
+        )

-        print_section("[bold red]Regressions (Success -> Other)[/bold red]", regressions, lambda p, new: f"- {p} ([green]success[/green] -> [yellow]{new}[/yellow])")
-        print_section("[bold green]Improvements (Other -> Success)[/bold green]", improvements, lambda p, old: f"- {p} ([yellow]{old}[/yellow] -> [green]success[/green])")
-        print_section("[bold yellow]Side Moves (Error -> Error)[/bold yellow]", side_moves, lambda p, old, new: f"- {p} ([cyan]{old}[/cyan] -> [cyan]{new}[/cyan])")
-
     console.print(f"\n-> Comparison report saved to [italic]{report_path}[/italic]")


 @click.command()
 @click.option('--input-file', required=True, type=click.Path(exists=True, dir_okay=False, resolve_path=True, path_type=Path), help='Path to the input file listing test cases.')
-@click.option('--python-version', 'python_versions', required=True, multiple=True, type=str, help='Python version to evaluate. Can be specified multiple times.')
-@click.option('--no-cache', is_flag=True, default=False, help='Force re-evaluation of the HEAD commit for all specified Python versions.')
-def main(input_file, python_versions, no_cache):
+@click.option('--python-version', 'python_versions', multiple=True, type=str, help='Python version to evaluate. Can be specified multiple times. Defaults to all supported versions.', default=SUPPORTED_PYTHON_VERSIONS)
+@click.option('--compare-to-commit', type=str, help='The git commit hash to compare to. Defaults to HEAD.', default='HEAD')
+@click.option('--no-cache', is_flag=True, default=False, help='Force re-evaluation of the comparison commit for all specified Python versions.')
+def main(input_file: Path, python_versions: list[str], compare_to_commit: str, no_cache: bool):
     """
     An evaluation framework to compare the performance of the current project
-    state against the most recent git commit (HEAD).
+    state against a previous git commit.
     """
     HARNESS_DIR.mkdir(exist_ok=True)
-    CACHE_DIR.mkdir(exist_ok=True)
     run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-    head_commit = get_head_commit_hash()
-    input_filename = Path(input_file).name
+    commit_version = compare_to_commit
+    if compare_to_commit.lower() == 'head':
+        compare_to_commit = get_head_commit_hash()
+        console.print(f"[bold green]Resolved HEAD to commit {compare_to_commit}.[/bold green]")
+    else:
+        compare_to_commit = compare_to_commit[:7].lower()  # shorten and lowercase for consistency

-    # Determine if we need to setup the HEAD workspace
-    head_is_setup = False
-    head_venv_dir = None
-    if no_cache or not all((CACHE_DIR / f"{head_commit}_{input_filename}_{py_ver}.json").exists() for py_ver in python_versions):
-        _, head_venv_dir = setup_workspace(HEAD_WORKSPACE, "head", head_commit)
-        head_is_setup = True
+    COMMIT_WORKSPACE = HARNESS_DIR / compare_to_commit

     # Always setup the local workspace
     _, local_venv_dir = setup_workspace(LOCAL_WORKSPACE, "local")

+    # Only setup the commit workspace on demand
+    commit_venv_dir = None
+
     for python_version in python_versions:
         console.print(f"\n[bold rule dark_orange]Processing Python Version: {python_version}[/bold rule dark_orange]")

-        # --- HEAD Evaluation ---
-        cache_filename = f"{head_commit}_{input_filename}_{python_version}.json"
-        cached_result_file = CACHE_DIR / cache_filename
+        # --- Commit Evaluation ---
+        cached_result_file = _get_cache_path(compare_to_commit, input_file, python_version)
         if not no_cache and cached_result_file.exists():
-            console.print(f"[bold green]Using cached result for HEAD ({head_commit}) on Python {python_version}...[/bold green]")
-            with open(cached_result_file) as f:
-                head_results = json.load(f)
+            console.print(f"[bold green]Using cached result for commit {compare_to_commit} on Python {python_version}...[/bold green]")
+            commit_results = EvaluationResult.import_json(cached_result_file)
         else:
-            if not head_is_setup: # Should not happen if logic is correct, but as a safeguard
-                _, head_venv_dir = setup_workspace(HEAD_WORKSPACE, "head", head_commit)
-                head_is_setup = True
-            assert head_venv_dir is not None
-            head_results = run_evaluation(HEAD_WORKSPACE, head_venv_dir, input_file, python_version)
-            with open(cached_result_file, "w") as f:
-                json.dump(head_results, f, indent=2)
+            if commit_venv_dir is None:
+                _, commit_venv_dir = setup_workspace(COMMIT_WORKSPACE, commit_version, compare_to_commit)
+
+            assert commit_venv_dir is not None
+            commit_results = run_evaluation(COMMIT_WORKSPACE, commit_venv_dir, input_file, python_version)
+            commit_results.export_json(cached_result_file)
             console.print(f"-> Caching result to [italic]{cached_result_file}[/italic]")

         # --- Local Evaluation ---
         local_results = run_evaluation(LOCAL_WORKSPACE, local_venv_dir, input_file, python_version)

         # --- Save Local Results Artifact ---
-        local_artifact_path = CACHE_DIR / f"local_results_{run_timestamp}_{python_version}.json"
-        with open(local_artifact_path, "w") as f:
-            json.dump(local_results, f, indent=2)
+        local_artifact_path = CACHE_DIR / python_version / f"local_results_{run_timestamp}.json"
f"local_results_{run_timestamp}.json" + local_results.export_json(local_artifact_path) console.print(f"-> Local results saved to [italic]{local_artifact_path}[/italic]") # --- Comparison --- - report_artifact_path = CACHE_DIR / f"comparison_report_{run_timestamp}_{python_version}.txt" - compare_and_report(head_results, local_results, report_artifact_path) + report_artifact_path = CACHE_DIR / python_version / f"comparison_report_{run_timestamp}.txt" + compare_and_report(commit_results, local_results, report_artifact_path, compare_to_commit) # --- Final Cleanup --- console.print("\n[bold]Cleaning up workspaces...[/bold]") - if head_is_setup: - shutil.rmtree(HEAD_WORKSPACE) + if commit_venv_dir is not None: + shutil.rmtree(COMMIT_WORKSPACE) shutil.rmtree(LOCAL_WORKSPACE) console.print("Done.")