From a5570e04610c7e0369de1fdd9c779921dffe9505 Mon Sep 17 00:00:00 2001 From: Max Novich Date: Tue, 1 Apr 2025 12:40:57 -0700 Subject: [PATCH 1/4] Example selector v1 --- src/ai_migrate/__init__.py | 12 +++ src/ai_migrate/example_selector.py | 133 +++++++++++++++++++++++++++++ src/ai_migrate/migrate.py | 20 ++++- 3 files changed, 164 insertions(+), 1 deletion(-) create mode 100644 src/ai_migrate/example_selector.py diff --git a/src/ai_migrate/__init__.py b/src/ai_migrate/__init__.py index e69de29..e29c0cc 100644 --- a/src/ai_migrate/__init__.py +++ b/src/ai_migrate/__init__.py @@ -0,0 +1,12 @@ +from .migrate import run +from .manifest import Manifest, FileEntry, FileGroup +from .example_selector import ExampleSelectionResult, select_relevant_examples + +__all__ = [ + "run", + "Manifest", + "FileEntry", + "FileGroup", + "ExampleSelectionResult", + "select_relevant_examples", +] diff --git a/src/ai_migrate/example_selector.py b/src/ai_migrate/example_selector.py new file mode 100644 index 0000000..d1dc541 --- /dev/null +++ b/src/ai_migrate/example_selector.py @@ -0,0 +1,133 @@ +"""Example selection logic for optimizing migration context.""" + +from dataclasses import dataclass +from typing import List +from pathlib import Path +from .migrate import DefaultClient, MigrationExample + +EXAMPLE_SELECTION_PROMPT = """You are an expert at analyzing code migration patterns. Your task is to select the most relevant examples for migrating specific target files. + +Analyze the target files and all available example pairs. Then select only the examples that demonstrate patterns and transformations that will be most helpful for migrating the target. + +Consider: +1. Language features and syntax +2. Similar patterns or structures +3. Related functionality or domain +4. Migration complexity and scope + +IMPORTANT: You must provide your response in the following JSON format: + +{ + "analysis": "Brief analysis of what needs to be migrated in the target files", + "selected_examples": [ + { + "id": "Example number (integer)", + "reason": "Detailed justification for selecting this example" + } + ], + "excluded_examples": [ + { + "id": "Example number (integer)", + "reason": "Brief reason for excluding this example" + } + ] +} + +Notes: +- Example numbers should be integers (1, 2, 3, etc.) +- Provide clear, specific reasons for each selection and exclusion +- Focus on selecting examples that demonstrate patterns needed for this specific migration +- Ensure the response is valid JSON that can be parsed programmatically""" + + +@dataclass +class ExampleSelectionResult: + selected_examples: List[MigrationExample] + analysis: str + selection_reasons: dict[str, str] + exclusion_reasons: dict[str, str] + + +async def select_relevant_examples( + target_files: List[str], + available_examples: List[MigrationExample], + client: DefaultClient, +) -> ExampleSelectionResult: + target_content = [] + for target_file in target_files: + path = Path(target_file) + content = path.read_text() + target_content.append(f"### `{path.name}`\n```\n{content}\n```") + + examples_content = [] + for i, example in enumerate(available_examples, 1): + example_files = [] + for old_file in example.old_files: + example_files.append(f"### `{old_file.name}`\n```\n{old_file.content}\n```") + for new_file in example.new_files: + if new_file.name == old_file.name: + example_files.append( + f"### `{new_file.name}` (migrated)\n```\n{new_file.content}\n```" + ) + break + examples_content.append( + f"Example {i} ({example.name}):\n" + "\n".join(example_files) + ) + + prompt = ( + f"{EXAMPLE_SELECTION_PROMPT}\n\n" + f"Target Files:\n{chr(10).join(target_content)}\n\n" + f"Available Examples:\n\n{chr(10).join(examples_content)}" + ) + + messages = [ + {"role": "system", "content": EXAMPLE_SELECTION_PROMPT}, + {"role": "user", "content": prompt}, + ] + + response, _ = await client.generate_completion(messages=messages, temperature=0.1) + content = response["choices"][0]["message"]["content"] + + import json + try: + result = json.loads(content) + except json.JSONDecodeError as e: + raise ValueError(f"Failed to parse LLM response as JSON: {e}\nResponse: {content}") + + # Extract analysis + analysis = result.get("analysis", "") + + # Process selected examples + selected_indices = [] + selection_reasons = {} + for example in result.get("selected_examples", []): + try: + idx = int(example["id"]) - 1 # Convert from 1-based to 0-based indexing + if 0 <= idx < len(available_examples): + selected_indices.append(idx) + selection_reasons[available_examples[idx].name] = example["reason"] + except (ValueError, KeyError) as e: + print(f"Warning: Invalid selected example format: {example}, Error: {e}") + + # Process excluded examples + exclusion_reasons = {} + for example in result.get("excluded_examples", []): + try: + idx = int(example["id"]) - 1 # Convert from 1-based to 0-based indexing + if 0 <= idx < len(available_examples): + exclusion_reasons[available_examples[idx].name] = example["reason"] + except (ValueError, KeyError) as e: + print(f"Warning: Invalid excluded example format: {example}, Error: {e}") + + selected_examples = [ + available_examples[i] + for i in selected_indices + if 0 <= i < len(available_examples) + ] + + return ExampleSelectionResult( + selected_examples=selected_examples, + analysis=analysis, + selection_reasons=selection_reasons, + exclusion_reasons=exclusion_reasons, + ) \ No newline at end of file diff --git a/src/ai_migrate/migrate.py b/src/ai_migrate/migrate.py index 5b3c8d7..286bb5c 100644 --- a/src/ai_migrate/migrate.py +++ b/src/ai_migrate/migrate.py @@ -539,6 +539,25 @@ async def _run( if not examples: raise FileNotFoundError("No valid example pairs found in examples directory") + from .example_selector import select_relevant_examples + + log("[agent] Running example selection analysis...") + selection_result = await select_relevant_examples(target_files, examples, client) + + log("\nExample Selection Analysis:") + log(selection_result.analysis) + log("\nSelected Examples:") + for example in selection_result.selected_examples: + reason = selection_result.selection_reasons.get( + example.name, "No specific reason provided" + ) + log(f"- {example.name}: {reason}") + log("\nExcluded Examples:") + for name, reason in selection_result.exclusion_reasons.items(): + log(f"- {name}: {reason}") + + examples = selection_result.selected_examples + system_prompt = Path(system_prompt).read_text() # TODO: Have some kind of configuration driven controls for how basename is transformed @@ -547,7 +566,6 @@ async def _run( target_basename.replace("-", " ").replace("_", " ").title().replace(" ", "") ) - # Create target MigrationExample target_file_contents = [] for i, target_file in enumerate(target_files): full_path = Path(target_file).absolute() From f85387ec856f454fa7674c28c6dd438a9ac317fa Mon Sep 17 00:00:00 2001 From: Max Novich Date: Tue, 1 Apr 2025 13:21:22 -0700 Subject: [PATCH 2/4] fixed code block handling --- src/ai_migrate/example_selector.py | 18 ++++++++---- src/ai_migrate/migrate.py | 44 ++---------------------------- src/ai_migrate/test_migrate.py | 3 +- src/ai_migrate/utils.py | 43 +++++++++++++++++++++++++++++ 4 files changed, 59 insertions(+), 49 deletions(-) diff --git a/src/ai_migrate/example_selector.py b/src/ai_migrate/example_selector.py index d1dc541..fa8682e 100644 --- a/src/ai_migrate/example_selector.py +++ b/src/ai_migrate/example_selector.py @@ -1,5 +1,6 @@ """Example selection logic for optimizing migration context.""" +import json from dataclasses import dataclass from typing import List from pathlib import Path @@ -87,12 +88,19 @@ async def select_relevant_examples( response, _ = await client.generate_completion(messages=messages, temperature=0.1) content = response["choices"][0]["message"]["content"] - - import json + + from .utils import extract_code_blocks + try: - result = json.loads(content) + parsed_result = extract_code_blocks(content) + if parsed_result.code_blocks: + result = json.loads(parsed_result.code_blocks[0].code) + else: + result = json.loads(content) except json.JSONDecodeError as e: - raise ValueError(f"Failed to parse LLM response as JSON: {e}\nResponse: {content}") + raise ValueError( + f"Failed to parse LLM response as JSON: {e}\nResponse: {content}" + ) # Extract analysis analysis = result.get("analysis", "") @@ -130,4 +138,4 @@ async def select_relevant_examples( analysis=analysis, selection_reasons=selection_reasons, exclusion_reasons=exclusion_reasons, - ) \ No newline at end of file + ) diff --git a/src/ai_migrate/migrate.py b/src/ai_migrate/migrate.py index 286bb5c..1ca3c80 100644 --- a/src/ai_migrate/migrate.py +++ b/src/ai_migrate/migrate.py @@ -10,6 +10,8 @@ import subprocess from typing import Any, Iterable, Optional +from .utils import extract_code_blocks + from pydantic_ai.messages import ToolCallPart from pydantic_ai.tools import Tool from pydantic_ai import RunContext @@ -455,48 +457,6 @@ async def run( ) -@dataclass -class CodeBlock: - filename: str | None - code: str - - -@dataclass -class CodeResponseResult: - code_blocks: list[CodeBlock] - other_text: str - - -def extract_code_blocks(markdown, replacement="") -> CodeResponseResult: - lines = markdown.splitlines() - filename = None - line_it = iter(lines) - result = CodeResponseResult([], "") - other_text = [] - - for line in line_it: - if line.lstrip().startswith("### ") and line.count("`") == 2: - start = line.find("`") - end = line.find("`", start + 1) - filename = line[start + 1 : end] - elif line.lstrip().startswith("```"): - code = [] - for line in line_it: - if line.lstrip().startswith("```"): - break - code.append(line) - result.code_blocks.append(CodeBlock(filename, "\n".join(code))) - filename = None - other_text.append(replacement) - else: - other_text.append(line) - - if other_text: - result.other_text = "\n".join(other_text) - - return result - - class FailedPreVerification(Exception): pass diff --git a/src/ai_migrate/test_migrate.py b/src/ai_migrate/test_migrate.py index 7f8ba9a..7169311 100644 --- a/src/ai_migrate/test_migrate.py +++ b/src/ai_migrate/test_migrate.py @@ -6,11 +6,10 @@ FileContent, migrate_prompt, extract_code_blocks, - CodeResponseResult, - CodeBlock, MigrationExample, read_file_pairs_from, ) +from ai_migrate.utils import CodeResponseResult, CodeBlock @pytest.fixture diff --git a/src/ai_migrate/utils.py b/src/ai_migrate/utils.py index 881b6dc..0fd913f 100644 --- a/src/ai_migrate/utils.py +++ b/src/ai_migrate/utils.py @@ -1,5 +1,6 @@ """Utilities for generating system prompts and other common tasks.""" +from dataclasses import dataclass from typing import Optional, List from pydantic import BaseModel @@ -12,6 +13,48 @@ class PRDetails(BaseModel): deletions: int +@dataclass +class CodeBlock: + filename: str | None + code: str + + +@dataclass +class CodeResponseResult: + code_blocks: list[CodeBlock] + other_text: str + + +def extract_code_blocks(markdown, replacement="") -> CodeResponseResult: + lines = markdown.splitlines() + filename = None + line_it = iter(lines) + result = CodeResponseResult([], "") + other_text = [] + + for line in line_it: + if line.lstrip().startswith("### ") and line.count("`") == 2: + start = line.find("`") + end = line.find("`", start + 1) + filename = line[start + 1 : end] + elif line.lstrip().startswith("```"): + code = [] + for line in line_it: + if line.lstrip().startswith("```"): + break + code.append(line) + result.code_blocks.append(CodeBlock(filename, "\n".join(code))) + filename = None + other_text.append(replacement) + else: + other_text.append(line) + + if other_text: + result.other_text = "\n".join(other_text) + + return result + + async def generate_system_prompt( description: str, pr_details: Optional[PRDetails] = None ) -> str: From f3ca5e116cc0a64f706bed9a3ce3969a5563f9d6 Mon Sep 17 00:00:00 2001 From: Max Novich Date: Tue, 1 Apr 2025 15:24:52 -0700 Subject: [PATCH 3/4] fix tests --- src/ai_migrate/example_selector.py | 30 +++++++++++++++++-- .../test/eval_test_data/response1.txt | 18 ++++++----- .../test/eval_test_data/response2.txt | 8 +++++ 3 files changed, 46 insertions(+), 10 deletions(-) create mode 100644 src/ai_migrate/test/eval_test_data/response2.txt diff --git a/src/ai_migrate/example_selector.py b/src/ai_migrate/example_selector.py index fa8682e..7653ae4 100644 --- a/src/ai_migrate/example_selector.py +++ b/src/ai_migrate/example_selector.py @@ -4,7 +4,7 @@ from dataclasses import dataclass from typing import List from pathlib import Path -from .migrate import DefaultClient, MigrationExample +from .migrate import DefaultClient, MigrationExample, log EXAMPLE_SELECTION_PROMPT = """You are an expert at analyzing code migration patterns. Your task is to select the most relevant examples for migrating specific target files. @@ -87,8 +87,34 @@ async def select_relevant_examples( ] response, _ = await client.generate_completion(messages=messages, temperature=0.1) - content = response["choices"][0]["message"]["content"] + log("[agent] Raw LLM response:", response) + + try: + if "choices" in response and response["choices"]: + message = response["choices"][0].get("message", {}) + if isinstance(message, dict): + content = message.get("content", "") + else: + content = str(message) + else: + content = str(response) + + if not content.strip(): + log("[agent] Warning: Empty content received from LLM") + content = json.dumps( + { + "analysis": "No analysis provided - LLM returned empty response", + "selected_examples": [], + "excluded_examples": [], + } + ) + except Exception as e: + raise ValueError( + f"Failed to extract content from LLM response: {e}\nResponse: {response}" + ) + + log("[agent] Parsed content:", content) from .utils import extract_code_blocks try: diff --git a/src/ai_migrate/test/eval_test_data/response1.txt b/src/ai_migrate/test/eval_test_data/response1.txt index 59a4a58..9c3d45a 100644 --- a/src/ai_migrate/test/eval_test_data/response1.txt +++ b/src/ai_migrate/test/eval_test_data/response1.txt @@ -1,8 +1,10 @@ -Here's the migrated code: - -### `example.py` -```python -def new_function(): - print("This is the migrated version") - return "New implementation" -``` \ No newline at end of file +{ + "analysis": "The target file contains a simple function that needs to be migrated. The function prints a message and returns a string.", + "selected_examples": [ + { + "id": 1, + "reason": "This example demonstrates the pattern needed for migrating a simple function with print statement and return value." + } + ], + "excluded_examples": [] +} \ No newline at end of file diff --git a/src/ai_migrate/test/eval_test_data/response2.txt b/src/ai_migrate/test/eval_test_data/response2.txt new file mode 100644 index 0000000..59a4a58 --- /dev/null +++ b/src/ai_migrate/test/eval_test_data/response2.txt @@ -0,0 +1,8 @@ +Here's the migrated code: + +### `example.py` +```python +def new_function(): + print("This is the migrated version") + return "New implementation" +``` \ No newline at end of file From 54b6ad247af0d2f90cc464b76fd0ac04451428d1 Mon Sep 17 00:00:00 2001 From: Max Novich Date: Tue, 1 Apr 2025 16:04:52 -0700 Subject: [PATCH 4/4] format stuff --- src/ai_migrate/example_selector.py | 49 ++++++++++++------------------ 1 file changed, 20 insertions(+), 29 deletions(-) diff --git a/src/ai_migrate/example_selector.py b/src/ai_migrate/example_selector.py index 7653ae4..29e8c51 100644 --- a/src/ai_migrate/example_selector.py +++ b/src/ai_migrate/example_selector.py @@ -1,10 +1,8 @@ -"""Example selection logic for optimizing migration context.""" - import json from dataclasses import dataclass from typing import List from pathlib import Path -from .migrate import DefaultClient, MigrationExample, log +from .migrate import DefaultClient, MigrationExample EXAMPLE_SELECTION_PROMPT = """You are an expert at analyzing code migration patterns. Your task is to select the most relevant examples for migrating specific target files. @@ -88,8 +86,6 @@ async def select_relevant_examples( response, _ = await client.generate_completion(messages=messages, temperature=0.1) - log("[agent] Raw LLM response:", response) - try: if "choices" in response and response["choices"]: message = response["choices"][0].get("message", {}) @@ -101,7 +97,6 @@ async def select_relevant_examples( content = str(response) if not content.strip(): - log("[agent] Warning: Empty content received from LLM") content = json.dumps( { "analysis": "No analysis provided - LLM returned empty response", @@ -113,8 +108,6 @@ async def select_relevant_examples( raise ValueError( f"Failed to extract content from LLM response: {e}\nResponse: {response}" ) - - log("[agent] Parsed content:", content) from .utils import extract_code_blocks try: @@ -128,30 +121,28 @@ async def select_relevant_examples( f"Failed to parse LLM response as JSON: {e}\nResponse: {content}" ) - # Extract analysis analysis = result.get("analysis", "") - # Process selected examples - selected_indices = [] - selection_reasons = {} - for example in result.get("selected_examples", []): - try: - idx = int(example["id"]) - 1 # Convert from 1-based to 0-based indexing - if 0 <= idx < len(available_examples): - selected_indices.append(idx) - selection_reasons[available_examples[idx].name] = example["reason"] - except (ValueError, KeyError) as e: - print(f"Warning: Invalid selected example format: {example}, Error: {e}") - - # Process excluded examples - exclusion_reasons = {} - for example in result.get("excluded_examples", []): - try: - idx = int(example["id"]) - 1 # Convert from 1-based to 0-based indexing + def _process_example_list(examples, available_examples, is_selected=True): + indices = [] + reasons_dict = {} + + for example in examples: + idx = int(example["id"]) - 1 if 0 <= idx < len(available_examples): - exclusion_reasons[available_examples[idx].name] = example["reason"] - except (ValueError, KeyError) as e: - print(f"Warning: Invalid excluded example format: {example}, Error: {e}") + if is_selected: + indices.append(idx) + reasons_dict[available_examples[idx].name] = example["reason"] + + return indices, reasons_dict + + selected_indices, selection_reasons = _process_example_list( + result.get("selected_examples", []), available_examples, is_selected=True + ) + + _, exclusion_reasons = _process_example_list( + result.get("excluded_examples", []), available_examples, is_selected=False + ) selected_examples = [ available_examples[i]