rank functions

KRRT7 · KRRT7 · commit 08464c49e0a2 · 2025-06-27T17:36:04.000-07:00
diff --git a/codeflash/benchmarking/function_ranker.py b/codeflash/benchmarking/function_ranker.py
@@ -0,0 +1,137 @@
+from __future__ import annotations
+
+import sqlite3
+from typing import TYPE_CHECKING
+
+from codeflash.cli_cmds.console import logger
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from codeflash.discovery.functions_to_optimize import FunctionToOptimize
+
+
+class FunctionRanker:
+    """Ranks functions for optimization based on trace data using ttX scoring.
+
+    ttX = own_time + (time_spent_in_callees x call_count)
+
+    This prioritizes functions that:
+    1. Take significant time themselves (own_time)
+    2. Are called frequently and have expensive subcalls (time_spent_in_callees x call_count)
+
+    Stats come from the ``pstats`` table of the trace SQLite file and are cached after the first load.
+    """
+
+    def __init__(self, trace_file_path: Path) -> None:
+        """Store the trace database path; nothing is read until stats are first requested."""
+        self.trace_file_path = trace_file_path
+        # None = not loaded yet; a dict (possibly empty) = a load was already attempted.
+        self._function_stats: dict[str, dict] | None = None
+
+    def _load_function_stats(self) -> dict[str, dict]:
+        """Load function timing statistics from trace database (cached after the first call).
+
+        Returns an empty dict, after logging a warning, when the trace cannot be read.
+        """
+        if self._function_stats is not None:
+            return self._function_stats
+
+        # closing() releases the connection on exit; sqlite3's own context manager only
+        # commits or rolls back and would leave the database handle open.
+        from contextlib import closing
+
+        self._function_stats = {}
+
+        try:
+            with closing(sqlite3.connect(self.trace_file_path)) as conn:
+                cursor = conn.execute("""
+                    SELECT
+                        filename,
+                        line_number,
+                        function,
+                        class_name,
+                        call_count_nonrecursive,
+                        total_time_ns,
+                        cumulative_time_ns
+                    FROM pstats
+                    WHERE call_count_nonrecursive > 0
+                """)
+
+                for filename, line_number, function_name, class_name, call_count, total_ns, cumulative_ns in cursor:
+                    has_class = bool(class_name and class_name.strip())
+                    qualified_name = f"{class_name}.{function_name}" if has_class else function_name
+
+                    # total_time_ns is taken as the function's own time (no subtraction needed here);
+                    # the rest of the cumulative time is attributed to callees.
+                    own_time_ns = total_ns
+                    time_in_callees_ns = cumulative_ns - total_ns
+
+                    self._function_stats[f"{filename}:{qualified_name}"] = {
+                        "filename": filename,
+                        "function_name": function_name,
+                        "qualified_name": qualified_name,
+                        "class_name": class_name,
+                        "line_number": line_number,
+                        "call_count": call_count,
+                        "own_time_ns": own_time_ns,
+                        "cumulative_time_ns": cumulative_ns,
+                        "time_in_callees_ns": time_in_callees_ns,
+                        # ttX = own_time + (time_spent_in_callees x call_count)
+                        "ttx_score": own_time_ns + (time_in_callees_ns * call_count),
+                    }
+
+                logger.debug(f"Loaded timing stats for {len(self._function_stats)} functions from trace")
+
+        except Exception as e:
+            # Best effort: on any failure fall back to empty stats, so every score becomes 0.
+            logger.warning(f"Failed to load function stats from trace file {self.trace_file_path}: {e}")
+            self._function_stats = {}
+
+        return self._function_stats
+
+    def get_function_ttx_score(self, function_to_optimize: FunctionToOptimize) -> float:
+        """Return the ttX score of ``function_to_optimize``.
+
+        Returns 0.0 when the function is absent from the trace data, so it is ranked last.
+        """
+        summary = self.get_function_stats_summary(function_to_optimize)
+        return summary["ttx_score"] if summary is not None else 0.0
+
+    def rank_functions(self, functions_to_optimize: list[FunctionToOptimize]) -> list[FunctionToOptimize]:
+        """Return the functions sorted by ttX score, highest first.
+
+        The sort is stable, so functions with equal scores keep their input order. The top 10
+        entries are logged at info level.
+        """
+        function_scores = [(func, self.get_function_ttx_score(func)) for func in functions_to_optimize]
+
+        # Sort by ttX score descending (highest impact first)
+        function_scores.sort(key=lambda x: x[1], reverse=True)
+
+        logger.info("Function ranking by ttX score:")
+        for position, (func, score) in enumerate(function_scores[:10], start=1):  # Top 10
+            logger.info(f"  {position}. {func.qualified_name} (ttX: {score:.0f}ns)")
+
+        ranked_functions = [func for func, _ in function_scores]
+        logger.info(f"Ranked {len(ranked_functions)} functions by optimization priority")
+
+        return ranked_functions
+
+    def get_function_stats_summary(self, function_to_optimize: FunctionToOptimize) -> dict | None:
+        """Return the stats dict recorded for ``function_to_optimize``, or None if it was not traced.
+
+        The lookup tries the qualified name first, then falls back to the bare function name.
+        """
+        stats = self._load_function_stats()
+
+        possible_keys = (
+            f"{function_to_optimize.file_path}:{function_to_optimize.qualified_name}",
+            f"{function_to_optimize.file_path}:{function_to_optimize.function_name}",
+        )
+
+        for key in possible_keys:
+            if key in stats:
+                return stats[key]
+
+        return None
diff --git a/codeflash/discovery/functions_to_optimize.py b/codeflash/discovery/functions_to_optimize.py
@@ -163,14 +163,15 @@ def get_functions_to_optimize(
         "Only one of optimize_all, replay_test, or file should be provided"
     )
     functions: dict[str, list[FunctionToOptimize]]
+    trace_file_path: Path | None = None
     with warnings.catch_warnings():
         warnings.simplefilter(action="ignore", category=SyntaxWarning)
         if optimize_all:
             logger.info("Finding all functions in the module '%s'…", optimize_all)
             console.rule()
             functions = get_all_files_and_functions(Path(optimize_all))
         elif replay_test:
-            functions = get_all_replay_test_functions(
+            functions, trace_file_path = get_all_replay_test_functions(
                 replay_test=replay_test, test_cfg=test_cfg, project_root_path=project_root
             )
         elif file is not None:
@@ -206,6 +207,28 @@ def get_functions_to_optimize(
         filtered_modified_functions, functions_count = filter_functions(
             functions, test_cfg.tests_root, ignore_paths, project_root, module_root, previous_checkpoint_functions
         )
+
+        if trace_file_path and trace_file_path.exists():
+            from codeflash.benchmarking.function_ranker import FunctionRanker
+
+            ranker = FunctionRanker(trace_file_path)
+
+            all_functions = []
+            for file_functions in filtered_modified_functions.values():
+                all_functions.extend(file_functions)
+
+            if all_functions:
+                ranked_functions = ranker.rank_functions(all_functions)
+
+                ranked_dict = {}
+                for func in ranked_functions:
+                    if func.file_path not in ranked_dict:
+                        ranked_dict[func.file_path] = []
+                    ranked_dict[func.file_path].append(func)
+
+                filtered_modified_functions = ranked_dict
+                logger.info(f"Ranked {len(all_functions)} functions by optimization priority using trace data")
+
         logger.info(f"Found {functions_count} function{'s' if functions_count > 1 else ''} to optimize")
         if optimize_all:
             three_min_in_ns = int(1.8e11)
@@ -272,7 +295,34 @@ def find_all_functions_in_file(file_path: Path) -> dict[Path, list[FunctionToOpt
 
 def get_all_replay_test_functions(
     replay_test: list[Path], test_cfg: TestConfig, project_root_path: Path
-) -> dict[Path, list[FunctionToOptimize]]:
+) -> tuple[dict[Path, list[FunctionToOptimize]], Path]:
+    trace_file_path: Path | None = None
+    for replay_test_file in replay_test:
+        try:
+            with replay_test_file.open("r", encoding="utf8") as f:
+                tree = ast.parse(f.read())
+                for node in ast.walk(tree):
+                    if isinstance(node, ast.Assign):
+                        for target in node.targets:
+                            if (
+                                isinstance(target, ast.Name)
+                                and target.id == "trace_file_path"
+                                and isinstance(node.value, ast.Constant)
+                                and isinstance(node.value.value, str)
+                            ):
+                                trace_file_path = Path(node.value.value)
+                                break
+                        if trace_file_path:
+                            break
+            if trace_file_path:
+                break
+        except Exception as e:
+            logger.warning(f"Error parsing replay test file {replay_test_file}: {e}")
+
+    if not trace_file_path:
+        logger.error("Could not find trace_file_path in replay test files.")
+        exit_with_message("Could not find trace_file_path in replay test files.")
+
     function_tests, _ = discover_unit_tests(test_cfg, discover_only_these_tests=replay_test)
     # Get the absolute file paths for each function, excluding class name if present
     filtered_valid_functions = defaultdict(list)
@@ -317,7 +367,7 @@ def get_all_replay_test_functions(
         if filtered_list:
             filtered_valid_functions[file_path] = filtered_list
 
-    return filtered_valid_functions
+    return filtered_valid_functions, trace_file_path
 
 
 def is_git_repo(file_path: str) -> bool: