Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions src/ai_migrate/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from .migrate import run
from .manifest import Manifest, FileEntry, FileGroup
from .example_selector import ExampleSelectionResult, select_relevant_examples

__all__ = [
"run",
"Manifest",
"FileEntry",
"FileGroup",
"ExampleSelectionResult",
"select_relevant_examples",
]
Comment thread
Kvadratni marked this conversation as resolved.
158 changes: 158 additions & 0 deletions src/ai_migrate/example_selector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
import json
from dataclasses import dataclass
from typing import List
from pathlib import Path
from .migrate import DefaultClient, MigrationExample

EXAMPLE_SELECTION_PROMPT = """You are an expert at analyzing code migration patterns. Your task is to select the most relevant examples for migrating specific target files.

Analyze the target files and all available example pairs. Then select only the examples that demonstrate patterns and transformations that will be most helpful for migrating the target.

Consider:
1. Language features and syntax
2. Similar patterns or structures
3. Related functionality or domain
4. Migration complexity and scope

IMPORTANT: You must provide your response in the following JSON format:

{
"analysis": "Brief analysis of what needs to be migrated in the target files",
"selected_examples": [
{
"id": "Example number (integer)",
"reason": "Detailed justification for selecting this example"
}
],
"excluded_examples": [
{
"id": "Example number (integer)",
"reason": "Brief reason for excluding this example"
}
]
}

Notes:
- Example numbers should be integers (1, 2, 3, etc.)
- Provide clear, specific reasons for each selection and exclusion
- Focus on selecting examples that demonstrate patterns needed for this specific migration
- Ensure the response is valid JSON that can be parsed programmatically"""


@dataclass
class ExampleSelectionResult:
Comment thread
Kvadratni marked this conversation as resolved.
selected_examples: List[MigrationExample]
Comment thread
Kvadratni marked this conversation as resolved.
analysis: str
selection_reasons: dict[str, str]
exclusion_reasons: dict[str, str]


async def select_relevant_examples(
target_files: List[str],
available_examples: List[MigrationExample],
client: DefaultClient,
) -> ExampleSelectionResult:
target_content = []
for target_file in target_files:
path = Path(target_file)
content = path.read_text()
target_content.append(f"### `{path.name}`\n```\n{content}\n```")

examples_content = []
for i, example in enumerate(available_examples, 1):
example_files = []
for old_file in example.old_files:
example_files.append(f"### `{old_file.name}`\n```\n{old_file.content}\n```")
for new_file in example.new_files:
if new_file.name == old_file.name:
example_files.append(
f"### `{new_file.name}` (migrated)\n```\n{new_file.content}\n```"
)
break
examples_content.append(
f"Example {i} ({example.name}):\n" + "\n".join(example_files)
)

prompt = (
f"{EXAMPLE_SELECTION_PROMPT}\n\n"
f"Target Files:\n{chr(10).join(target_content)}\n\n"
Comment thread
Kvadratni marked this conversation as resolved.
f"Available Examples:\n\n{chr(10).join(examples_content)}"
)

messages = [
{"role": "system", "content": EXAMPLE_SELECTION_PROMPT},
{"role": "user", "content": prompt},
]

response, _ = await client.generate_completion(messages=messages, temperature=0.1)

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

openai allows you to specify json as an output format, which is more reliable than asking for it. you can even specify the format.

Copy link
Copy Markdown
Contributor Author

@Kvadratni Kvadratni Apr 2, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, but we aren't using claude? or you mean openAi style client?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we are using databricks which provides an openai client independent of the model we choose I think. of course that's a leaky abstraction so it might not work, but we should give it a shot. it's a useful trick in general

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

gotcha

try:
if "choices" in response and response["choices"]:
message = response["choices"][0].get("message", {})
if isinstance(message, dict):
content = message.get("content", "")
else:
content = str(message)
else:
content = str(response)
Comment on lines +90 to +97
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's happening here?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is different format handling, but now I realize it's redundant.


if not content.strip():
content = json.dumps(
{
"analysis": "No analysis provided - LLM returned empty response",
"selected_examples": [],
"excluded_examples": [],
}
)
except Exception as e:
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

avoid catching Exception, the stack trace should have enough info here

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ack

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

come on, don't catch general Exceptions

raise ValueError(
f"Failed to extract content from LLM response: {e}\nResponse: {response}"
)
from .utils import extract_code_blocks
Comment thread
Kvadratni marked this conversation as resolved.

try:
parsed_result = extract_code_blocks(content)
Comment thread
Kvadratni marked this conversation as resolved.
if parsed_result.code_blocks:
result = json.loads(parsed_result.code_blocks[0].code)
else:
result = json.loads(content)
except json.JSONDecodeError as e:
raise ValueError(
f"Failed to parse LLM response as JSON: {e}\nResponse: {content}"
)

analysis = result.get("analysis", "")

def _process_example_list(examples, available_examples, is_selected=True):
indices = []
reasons_dict = {}

for example in examples:
idx = int(example["id"]) - 1
if 0 <= idx < len(available_examples):
if is_selected:
indices.append(idx)
reasons_dict[available_examples[idx].name] = example["reason"]

return indices, reasons_dict

selected_indices, selection_reasons = _process_example_list(
result.get("selected_examples", []), available_examples, is_selected=True
)

_, exclusion_reasons = _process_example_list(
result.get("excluded_examples", []), available_examples, is_selected=False
)

selected_examples = [
available_examples[i]
for i in selected_indices
if 0 <= i < len(available_examples)
]

return ExampleSelectionResult(
selected_examples=selected_examples,
analysis=analysis,
selection_reasons=selection_reasons,
exclusion_reasons=exclusion_reasons,
)
64 changes: 21 additions & 43 deletions src/ai_migrate/migrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
import subprocess
from typing import Any, Iterable, Optional

from .utils import extract_code_blocks

from pydantic_ai.messages import ToolCallPart
from pydantic_ai.tools import Tool
from pydantic_ai import RunContext
Expand Down Expand Up @@ -455,48 +457,6 @@ async def run(
)


@dataclass
class CodeBlock:
filename: str | None
code: str


@dataclass
class CodeResponseResult:
code_blocks: list[CodeBlock]
other_text: str


def extract_code_blocks(markdown, replacement="<code>") -> CodeResponseResult:
lines = markdown.splitlines()
filename = None
line_it = iter(lines)
result = CodeResponseResult([], "")
other_text = []

for line in line_it:
if line.lstrip().startswith("### ") and line.count("`") == 2:
start = line.find("`")
end = line.find("`", start + 1)
filename = line[start + 1 : end]
elif line.lstrip().startswith("```"):
code = []
for line in line_it:
if line.lstrip().startswith("```"):
break
code.append(line)
result.code_blocks.append(CodeBlock(filename, "\n".join(code)))
filename = None
other_text.append(replacement)
else:
other_text.append(line)

if other_text:
result.other_text = "\n".join(other_text)

return result


class FailedPreVerification(Exception):
pass

Expand Down Expand Up @@ -539,6 +499,25 @@ async def _run(
if not examples:
raise FileNotFoundError("No valid example pairs found in examples directory")

from .example_selector import select_relevant_examples
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same


log("[agent] Running example selection analysis...")
selection_result = await select_relevant_examples(target_files, examples, client)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should we put it behind a flag?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hm good point. which would be the default tho?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this can throw exceptions, which would stop the whole thing. is that what we want?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if it is not under feature flag - probably yes as there will be no examples.
In case we put it under a flag - probably not


log("\nExample Selection Analysis:")
log(selection_result.analysis)
log("\nSelected Examples:")
for example in selection_result.selected_examples:
reason = selection_result.selection_reasons.get(
example.name, "No specific reason provided"
)
log(f"- {example.name}: {reason}")
log("\nExcluded Examples:")
for name, reason in selection_result.exclusion_reasons.items():
log(f"- {name}: {reason}")

examples = selection_result.selected_examples

system_prompt = Path(system_prompt).read_text()

# TODO: Have some kind of configuration driven controls for how basename is transformed
Expand All @@ -547,7 +526,6 @@ async def _run(
target_basename.replace("-", " ").replace("_", " ").title().replace(" ", "")
)

# Create target MigrationExample
target_file_contents = []
for i, target_file in enumerate(target_files):
full_path = Path(target_file).absolute()
Expand Down
18 changes: 10 additions & 8 deletions src/ai_migrate/test/eval_test_data/response1.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
Here's the migrated code:

### `example.py`
```python
def new_function():
print("This is the migrated version")
return "New implementation"
```
{
"analysis": "The target file contains a simple function that needs to be migrated. The function prints a message and returns a string.",
"selected_examples": [
{
"id": 1,
"reason": "This example demonstrates the pattern needed for migrating a simple function with print statement and return value."
}
],
"excluded_examples": []
}
8 changes: 8 additions & 0 deletions src/ai_migrate/test/eval_test_data/response2.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
Here's the migrated code:

### `example.py`
```python
def new_function():
print("This is the migrated version")
return "New implementation"
```
3 changes: 1 addition & 2 deletions src/ai_migrate/test_migrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,10 @@
FileContent,
migrate_prompt,
extract_code_blocks,
CodeResponseResult,
CodeBlock,
MigrationExample,
read_file_pairs_from,
)
from ai_migrate.utils import CodeResponseResult, CodeBlock


@pytest.fixture
Expand Down
43 changes: 43 additions & 0 deletions src/ai_migrate/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Utilities for generating system prompts and other common tasks."""

from dataclasses import dataclass
from typing import Optional, List
from pydantic import BaseModel

Expand All @@ -12,6 +13,48 @@ class PRDetails(BaseModel):
deletions: int


@dataclass
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW I am not a fan of having files called utils. It can just mean anything. maybe we can just call this code_handling and move all code handling here? now it just a mixed bag of things max built.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

point taken.

class CodeBlock:
filename: str | None
code: str


@dataclass
class CodeResponseResult:
code_blocks: list[CodeBlock]
other_text: str


def extract_code_blocks(markdown, replacement="<code>") -> CodeResponseResult:
lines = markdown.splitlines()
filename = None
line_it = iter(lines)
result = CodeResponseResult([], "")
other_text = []

for line in line_it:
if line.lstrip().startswith("### ") and line.count("`") == 2:
start = line.find("`")
end = line.find("`", start + 1)
filename = line[start + 1 : end]
elif line.lstrip().startswith("```"):
code = []
for line in line_it:
if line.lstrip().startswith("```"):
break
code.append(line)
result.code_blocks.append(CodeBlock(filename, "\n".join(code)))
filename = None
other_text.append(replacement)
else:
other_text.append(line)

if other_text:
result.other_text = "\n".join(other_text)

return result


async def generate_system_prompt(
description: str, pr_details: Optional[PRDetails] = None
) -> str:
Expand Down