-
Notifications
You must be signed in to change notification settings - Fork 4
Smarter example selection #18
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,12 @@ | ||
| from .migrate import run | ||
| from .manifest import Manifest, FileEntry, FileGroup | ||
| from .example_selector import ExampleSelectionResult, select_relevant_examples | ||
|
|
||
| __all__ = [ | ||
| "run", | ||
| "Manifest", | ||
| "FileEntry", | ||
| "FileGroup", | ||
| "ExampleSelectionResult", | ||
| "select_relevant_examples", | ||
| ] | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,158 @@ | ||
| import json | ||
| from dataclasses import dataclass | ||
| from typing import List | ||
| from pathlib import Path | ||
| from .migrate import DefaultClient, MigrationExample | ||
|
|
||
| EXAMPLE_SELECTION_PROMPT = """You are an expert at analyzing code migration patterns. Your task is to select the most relevant examples for migrating specific target files. | ||
|
|
||
| Analyze the target files and all available example pairs. Then select only the examples that demonstrate patterns and transformations that will be most helpful for migrating the target. | ||
|
|
||
| Consider: | ||
| 1. Language features and syntax | ||
| 2. Similar patterns or structures | ||
| 3. Related functionality or domain | ||
| 4. Migration complexity and scope | ||
|
|
||
| IMPORTANT: You must provide your response in the following JSON format: | ||
|
|
||
| { | ||
| "analysis": "Brief analysis of what needs to be migrated in the target files", | ||
| "selected_examples": [ | ||
| { | ||
| "id": "Example number (integer)", | ||
| "reason": "Detailed justification for selecting this example" | ||
| } | ||
| ], | ||
| "excluded_examples": [ | ||
| { | ||
| "id": "Example number (integer)", | ||
| "reason": "Brief reason for excluding this example" | ||
| } | ||
| ] | ||
| } | ||
|
|
||
| Notes: | ||
| - Example numbers should be integers (1, 2, 3, etc.) | ||
| - Provide clear, specific reasons for each selection and exclusion | ||
| - Focus on selecting examples that demonstrate patterns needed for this specific migration | ||
| - Ensure the response is valid JSON that can be parsed programmatically""" | ||
|
|
||
|
|
||
| @dataclass | ||
| class ExampleSelectionResult: | ||
|
Kvadratni marked this conversation as resolved.
|
||
| selected_examples: List[MigrationExample] | ||
|
Kvadratni marked this conversation as resolved.
|
||
| analysis: str | ||
| selection_reasons: dict[str, str] | ||
| exclusion_reasons: dict[str, str] | ||
|
|
||
|
|
||
| async def select_relevant_examples( | ||
| target_files: List[str], | ||
| available_examples: List[MigrationExample], | ||
| client: DefaultClient, | ||
| ) -> ExampleSelectionResult: | ||
| target_content = [] | ||
| for target_file in target_files: | ||
| path = Path(target_file) | ||
| content = path.read_text() | ||
| target_content.append(f"### `{path.name}`\n```\n{content}\n```") | ||
|
|
||
| examples_content = [] | ||
| for i, example in enumerate(available_examples, 1): | ||
| example_files = [] | ||
| for old_file in example.old_files: | ||
| example_files.append(f"### `{old_file.name}`\n```\n{old_file.content}\n```") | ||
| for new_file in example.new_files: | ||
| if new_file.name == old_file.name: | ||
| example_files.append( | ||
| f"### `{new_file.name}` (migrated)\n```\n{new_file.content}\n```" | ||
| ) | ||
| break | ||
| examples_content.append( | ||
| f"Example {i} ({example.name}):\n" + "\n".join(example_files) | ||
| ) | ||
|
|
||
| prompt = ( | ||
| f"{EXAMPLE_SELECTION_PROMPT}\n\n" | ||
| f"Target Files:\n{chr(10).join(target_content)}\n\n" | ||
|
Kvadratni marked this conversation as resolved.
|
||
| f"Available Examples:\n\n{chr(10).join(examples_content)}" | ||
| ) | ||
|
|
||
| messages = [ | ||
| {"role": "system", "content": EXAMPLE_SELECTION_PROMPT}, | ||
| {"role": "user", "content": prompt}, | ||
| ] | ||
|
|
||
| response, _ = await client.generate_completion(messages=messages, temperature=0.1) | ||
|
|
||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. openai allows you to specify json as an output format, which is more reliable than asking for it. you can even specify the format.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah, but we aren't using claude? or you mean openAi style client?
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we are using databricks which provides an openai client independent of the model we choose I think. of course that's a leaky abstraction so it might not work, but we should give it a shot. it's a useful trick in general
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. gotcha |
||
| try: | ||
| if "choices" in response and response["choices"]: | ||
| message = response["choices"][0].get("message", {}) | ||
| if isinstance(message, dict): | ||
| content = message.get("content", "") | ||
| else: | ||
| content = str(message) | ||
| else: | ||
| content = str(response) | ||
|
Comment on lines
+90
to
+97
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What's happening here?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is different format handling, but now I realize it's redundant. |
||
|
|
||
| if not content.strip(): | ||
| content = json.dumps( | ||
| { | ||
| "analysis": "No analysis provided - LLM returned empty response", | ||
| "selected_examples": [], | ||
| "excluded_examples": [], | ||
| } | ||
| ) | ||
| except Exception as e: | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. avoid catching Exception, the stack trace should have enough info here
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ack
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. come on, don't catch general Exceptions |
||
| raise ValueError( | ||
| f"Failed to extract content from LLM response: {e}\nResponse: {response}" | ||
| ) | ||
| from .utils import extract_code_blocks | ||
|
Kvadratni marked this conversation as resolved.
|
||
|
|
||
| try: | ||
| parsed_result = extract_code_blocks(content) | ||
|
Kvadratni marked this conversation as resolved.
|
||
| if parsed_result.code_blocks: | ||
| result = json.loads(parsed_result.code_blocks[0].code) | ||
| else: | ||
| result = json.loads(content) | ||
| except json.JSONDecodeError as e: | ||
| raise ValueError( | ||
| f"Failed to parse LLM response as JSON: {e}\nResponse: {content}" | ||
| ) | ||
|
|
||
| analysis = result.get("analysis", "") | ||
|
|
||
| def _process_example_list(examples, available_examples, is_selected=True): | ||
| indices = [] | ||
| reasons_dict = {} | ||
|
|
||
| for example in examples: | ||
| idx = int(example["id"]) - 1 | ||
| if 0 <= idx < len(available_examples): | ||
| if is_selected: | ||
| indices.append(idx) | ||
| reasons_dict[available_examples[idx].name] = example["reason"] | ||
|
|
||
| return indices, reasons_dict | ||
|
|
||
| selected_indices, selection_reasons = _process_example_list( | ||
| result.get("selected_examples", []), available_examples, is_selected=True | ||
| ) | ||
|
|
||
| _, exclusion_reasons = _process_example_list( | ||
| result.get("excluded_examples", []), available_examples, is_selected=False | ||
| ) | ||
|
|
||
| selected_examples = [ | ||
| available_examples[i] | ||
| for i in selected_indices | ||
| if 0 <= i < len(available_examples) | ||
| ] | ||
|
|
||
| return ExampleSelectionResult( | ||
| selected_examples=selected_examples, | ||
| analysis=analysis, | ||
| selection_reasons=selection_reasons, | ||
| exclusion_reasons=exclusion_reasons, | ||
| ) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10,6 +10,8 @@ | |
| import subprocess | ||
| from typing import Any, Iterable, Optional | ||
|
|
||
| from .utils import extract_code_blocks | ||
|
|
||
| from pydantic_ai.messages import ToolCallPart | ||
| from pydantic_ai.tools import Tool | ||
| from pydantic_ai import RunContext | ||
|
|
@@ -455,48 +457,6 @@ async def run( | |
| ) | ||
|
|
||
|
|
||
| @dataclass | ||
| class CodeBlock: | ||
| filename: str | None | ||
| code: str | ||
|
|
||
|
|
||
| @dataclass | ||
| class CodeResponseResult: | ||
| code_blocks: list[CodeBlock] | ||
| other_text: str | ||
|
|
||
|
|
||
| def extract_code_blocks(markdown, replacement="<code>") -> CodeResponseResult: | ||
| lines = markdown.splitlines() | ||
| filename = None | ||
| line_it = iter(lines) | ||
| result = CodeResponseResult([], "") | ||
| other_text = [] | ||
|
|
||
| for line in line_it: | ||
| if line.lstrip().startswith("### ") and line.count("`") == 2: | ||
| start = line.find("`") | ||
| end = line.find("`", start + 1) | ||
| filename = line[start + 1 : end] | ||
| elif line.lstrip().startswith("```"): | ||
| code = [] | ||
| for line in line_it: | ||
| if line.lstrip().startswith("```"): | ||
| break | ||
| code.append(line) | ||
| result.code_blocks.append(CodeBlock(filename, "\n".join(code))) | ||
| filename = None | ||
| other_text.append(replacement) | ||
| else: | ||
| other_text.append(line) | ||
|
|
||
| if other_text: | ||
| result.other_text = "\n".join(other_text) | ||
|
|
||
| return result | ||
|
|
||
|
|
||
| class FailedPreVerification(Exception): | ||
| pass | ||
|
|
||
|
|
@@ -539,6 +499,25 @@ async def _run( | |
| if not examples: | ||
| raise FileNotFoundError("No valid example pairs found in examples directory") | ||
|
|
||
| from .example_selector import select_relevant_examples | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same |
||
|
|
||
| log("[agent] Running example selection analysis...") | ||
| selection_result = await select_relevant_examples(target_files, examples, client) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should we put it behind a flag?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hm good point. which would be the default tho?
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this can throw exceptions, which would stop the whole thing. is that what we want?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if it is not under feature flag - probably yes as there will be no examples. |
||
|
|
||
| log("\nExample Selection Analysis:") | ||
| log(selection_result.analysis) | ||
| log("\nSelected Examples:") | ||
| for example in selection_result.selected_examples: | ||
| reason = selection_result.selection_reasons.get( | ||
| example.name, "No specific reason provided" | ||
| ) | ||
| log(f"- {example.name}: {reason}") | ||
| log("\nExcluded Examples:") | ||
| for name, reason in selection_result.exclusion_reasons.items(): | ||
| log(f"- {name}: {reason}") | ||
|
|
||
| examples = selection_result.selected_examples | ||
|
|
||
| system_prompt = Path(system_prompt).read_text() | ||
|
|
||
| # TODO: Have some kind of configuration driven controls for how basename is transformed | ||
|
|
@@ -547,7 +526,6 @@ async def _run( | |
| target_basename.replace("-", " ").replace("_", " ").title().replace(" ", "") | ||
| ) | ||
|
|
||
| # Create target MigrationExample | ||
| target_file_contents = [] | ||
| for i, target_file in enumerate(target_files): | ||
| full_path = Path(target_file).absolute() | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,8 +1,10 @@ | ||
| Here's the migrated code: | ||
|
|
||
| ### `example.py` | ||
| ```python | ||
| def new_function(): | ||
| print("This is the migrated version") | ||
| return "New implementation" | ||
| ``` | ||
| { | ||
| "analysis": "The target file contains a simple function that needs to be migrated. The function prints a message and returns a string.", | ||
| "selected_examples": [ | ||
| { | ||
| "id": 1, | ||
| "reason": "This example demonstrates the pattern needed for migrating a simple function with print statement and return value." | ||
| } | ||
| ], | ||
| "excluded_examples": [] | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,8 @@ | ||
| Here's the migrated code: | ||
|
|
||
| ### `example.py` | ||
| ```python | ||
| def new_function(): | ||
| print("This is the migrated version") | ||
| return "New implementation" | ||
| ``` |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,6 @@ | ||
| """Utilities for generating system prompts and other common tasks.""" | ||
|
|
||
| from dataclasses import dataclass | ||
| from typing import Optional, List | ||
| from pydantic import BaseModel | ||
|
|
||
|
|
@@ -12,6 +13,48 @@ class PRDetails(BaseModel): | |
| deletions: int | ||
|
|
||
|
|
||
| @dataclass | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. FWIW I am not a fan of having files called utils. It can just mean anything. maybe we can just call this code_handling and move all code handling here? now it just a mixed bag of things max built.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. point taken. |
||
| class CodeBlock: | ||
| filename: str | None | ||
| code: str | ||
|
|
||
|
|
||
| @dataclass | ||
| class CodeResponseResult: | ||
| code_blocks: list[CodeBlock] | ||
| other_text: str | ||
|
|
||
|
|
||
| def extract_code_blocks(markdown, replacement="<code>") -> CodeResponseResult: | ||
| lines = markdown.splitlines() | ||
| filename = None | ||
| line_it = iter(lines) | ||
| result = CodeResponseResult([], "") | ||
| other_text = [] | ||
|
|
||
| for line in line_it: | ||
| if line.lstrip().startswith("### ") and line.count("`") == 2: | ||
| start = line.find("`") | ||
| end = line.find("`", start + 1) | ||
| filename = line[start + 1 : end] | ||
| elif line.lstrip().startswith("```"): | ||
| code = [] | ||
| for line in line_it: | ||
| if line.lstrip().startswith("```"): | ||
| break | ||
| code.append(line) | ||
| result.code_blocks.append(CodeBlock(filename, "\n".join(code))) | ||
| filename = None | ||
| other_text.append(replacement) | ||
| else: | ||
| other_text.append(line) | ||
|
|
||
| if other_text: | ||
| result.other_text = "\n".join(other_text) | ||
|
|
||
| return result | ||
|
|
||
|
|
||
| async def generate_system_prompt( | ||
| description: str, pr_details: Optional[PRDetails] = None | ||
| ) -> str: | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.