diff --git a/apps/docs/faq.mdx b/apps/docs/faq.mdx index 4f03b215f..e17b45506 100644 --- a/apps/docs/faq.mdx +++ b/apps/docs/faq.mdx @@ -6,7 +6,7 @@ description: Frequently Asked Questions The cost per run varies greatly based on the complexity of the task, the size of the repository, and the number of files that need to be changed. - For most tasks, you can expect to pay between `$0.50` -> `$3.00` when using Claude Sonnet 4. + For most tasks, you can expect to pay between `$0.50` -> `$3.00` when using Claude Opus 4.5. For the same tasks running on Claude Opus 4/4.1, you can expect to pay between `$1.50` -> `$9.00`. Always remember to monitor your runs if you're cost conscious. The most expensive run I've seen Open SWE complete was ~50M Opus 4 tokens, costing `$25.00`. @@ -53,3 +53,4 @@ description: Frequently Asked Questions Yes! We're always looking for contributors to help us improve Open SWE. Feel free to pick up an [open issue](https://github.com/langchain-ai/open-swe/issues) or submit a pull request with a new feature or bug fix. + diff --git a/apps/docs/usage/best-practices.mdx b/apps/docs/usage/best-practices.mdx index 5fbe35033..c29a83407 100644 --- a/apps/docs/usage/best-practices.mdx +++ b/apps/docs/usage/best-practices.mdx @@ -39,7 +39,7 @@ Submit separate requests for different features or fixes. This allows Open SWE t ## Model Selection -- **Claude Sonnet 4 (Default)**: The default model for planning, writing code, and reviewing changes. This model offers the best balance of performance, speed and cost. +- **Claude Opus 4.5 (Default)**: The default model for planning, writing code, and reviewing changes. This model offers the best balance of performance, speed and cost. - **Claude Opus 4.1**: A larger, more powerful model for difficult, or open-ended tasks. Opus 4.1 is more expensive and slower, but will provide better results for complex tasks. 
### Avoid Other Models @@ -50,7 +50,7 @@ Although Open SWE allows you to select any model from Anthropic, OpenAI and Goog ### `open-swe` vs `open-swe-max` -**`open-swe`**: Uses Claude Sonnet 4 +**`open-swe`**: Uses Claude Opus 4.5 - Suitable for most development tasks - Faster execution @@ -81,3 +81,5 @@ If you're running Open SWE against an open-ended or very complex task, you may w In development environments, append `-dev` to all labels (e.g., `open-swe-dev`, `open-swe-auto-dev`). + + diff --git a/apps/open-swe/src/graphs/manager/nodes/classify-message/prompts.ts b/apps/open-swe/src/graphs/manager/nodes/classify-message/prompts.ts index 0d2101650..12c96aeba 100644 --- a/apps/open-swe/src/graphs/manager/nodes/classify-message/prompts.ts +++ b/apps/open-swe/src/graphs/manager/nodes/classify-message/prompts.ts @@ -71,10 +71,10 @@ Your source code is available in the GitHub repository: https://github.com/langc The website you're accessible through is: https://swe.langchain.com Your documentation is available at: https://docs.langchain.com/labs/swe You can be invoked by both the web app, or by adding a label to a GitHub issue. These label options are: -- \`open-swe\` - trigger a standard Open SWE task. It will interrupt after generating a plan, and the user must approve it before it can continue. Uses Claude Sonnet 4 for all LLM requests. -- \`open-swe-auto\` - trigger an 'auto' Open SWE task. It will not interrupt after generating a plan, and instead it will auto-approve the plan, and continue to the programming step without user approval. Uses Claude Sonnet 4 for all LLM requests. -- \`open-swe-max\` - this label acts the same as \`open-swe\`, except it uses a larger, more powerful model for the planning and programming steps: Claude Opus 4.1. It still uses Claude Sonnet 4 for the reviewer step. 
-- \`open-swe-max-auto\` - this label acts the same as \`open-swe-auto\`, except it uses a larger, more powerful model for the planning and programming steps: Claude Opus 4.1. It still uses Claude Sonnet 4 for the reviewer step. +- \`open-swe\` - trigger a standard Open SWE task. It will interrupt after generating a plan, and the user must approve it before it can continue. Uses Claude Opus 4.5 for all LLM requests. +- \`open-swe-auto\` - trigger an 'auto' Open SWE task. It will not interrupt after generating a plan, and instead it will auto-approve the plan, and continue to the programming step without user approval. Uses Claude Opus 4.5 for all LLM requests. +- \`open-swe-max\` - this label acts the same as \`open-swe\`, except it uses a larger, more powerful model for the planning and programming steps: Claude Opus 4.1. It still uses Claude Opus 4.5 for the reviewer step. +- \`open-swe-max-auto\` - this label acts the same as \`open-swe-auto\`, except it uses a larger, more powerful model for the planning and programming steps: Claude Opus 4.1. It still uses Claude Opus 4.5 for the reviewer step. Only provide this information if requested by the user. For example, if the user asks what you can do, you should provide the above information in your response. 
diff --git a/apps/open-swe/src/utils/llms/model-manager.ts b/apps/open-swe/src/utils/llms/model-manager.ts index 27d59cf71..c3e24e9c8 100644 --- a/apps/open-swe/src/utils/llms/model-manager.ts +++ b/apps/open-swe/src/utils/llms/model-manager.ts @@ -379,23 +379,23 @@ export class ModelManager { ): ModelLoadConfig | null { const defaultModels: Record> = { anthropic: { - [LLMTask.PLANNER]: "claude-sonnet-4-0", - [LLMTask.PROGRAMMER]: "claude-sonnet-4-0", - [LLMTask.REVIEWER]: "claude-sonnet-4-0", - [LLMTask.ROUTER]: "claude-3-5-haiku-latest", - [LLMTask.SUMMARIZER]: "claude-sonnet-4-0", + [LLMTask.PLANNER]: "claude-opus-4-5", + [LLMTask.PROGRAMMER]: "claude-opus-4-5", + [LLMTask.REVIEWER]: "claude-opus-4-5", + [LLMTask.ROUTER]: "claude-haiku-4-5", + [LLMTask.SUMMARIZER]: "claude-opus-4-5", }, "google-genai": { - [LLMTask.PLANNER]: "gemini-2.5-flash", - [LLMTask.PROGRAMMER]: "gemini-2.5-pro", - [LLMTask.REVIEWER]: "gemini-2.5-flash", - [LLMTask.ROUTER]: "gemini-2.5-flash", - [LLMTask.SUMMARIZER]: "gemini-2.5-pro", + [LLMTask.PLANNER]: "gemini-3-pro-preview", + [LLMTask.PROGRAMMER]: "gemini-3-pro-preview", + [LLMTask.REVIEWER]: "gemini-flash-latest", + [LLMTask.ROUTER]: "gemini-flash-latest", + [LLMTask.SUMMARIZER]: "gemini-3-pro-preview", }, openai: { - [LLMTask.PLANNER]: "gpt-5", - [LLMTask.PROGRAMMER]: "gpt-5", - [LLMTask.REVIEWER]: "gpt-5", + [LLMTask.PLANNER]: "gpt-5-codex", + [LLMTask.PROGRAMMER]: "gpt-5-codex", + [LLMTask.REVIEWER]: "gpt-5-codex", [LLMTask.ROUTER]: "gpt-5-nano", [LLMTask.SUMMARIZER]: "gpt-5-mini", }, diff --git a/apps/web/src/components/v2/token-usage.tsx b/apps/web/src/components/v2/token-usage.tsx index 5f72384ca..024486981 100644 --- a/apps/web/src/components/v2/token-usage.tsx +++ b/apps/web/src/components/v2/token-usage.tsx @@ -85,6 +85,16 @@ function getModelPricingPlaceholder(model: string): { outputPrice: 15.0, cachePrice: 0.3, }, + "anthropic:claude-sonnet-4-5": { + inputPrice: 3.0, + outputPrice: 15.0, + cachePrice: 0.3, + }, + 
"anthropic:claude-opus-4-5": { + inputPrice: 5.0, + outputPrice: 25.0, + cachePrice: 0.3, + }, // Claude 3.7 models "anthropic:claude-3-7-sonnet": { diff --git a/packages/shared/src/__tests__/caching.test.ts b/packages/shared/src/__tests__/caching.test.ts index fd15e4f9a..fcb8df564 100644 --- a/packages/shared/src/__tests__/caching.test.ts +++ b/packages/shared/src/__tests__/caching.test.ts @@ -5,7 +5,7 @@ describe("tokenDataReducer", () => { it("should merge objects with the same model string", () => { const state: ModelTokenData[] = [ { - model: "anthropic:claude-sonnet-4-0", + model: "anthropic:claude-sonnet-4-5", cacheCreationInputTokens: 100, cacheReadInputTokens: 50, inputTokens: 200, @@ -22,7 +22,7 @@ describe("tokenDataReducer", () => { const update: ModelTokenData[] = [ { - model: "anthropic:claude-sonnet-4-0", + model: "anthropic:claude-sonnet-4-5", cacheCreationInputTokens: 25, cacheReadInputTokens: 15, inputTokens: 75, @@ -44,10 +44,10 @@ describe("tokenDataReducer", () => { // Find the merged anthropic model const mergedAnthropic = result.find( - (data) => data.model === "anthropic:claude-sonnet-4-0", + (data) => data.model === "anthropic:claude-sonnet-4-5", ); expect(mergedAnthropic).toEqual({ - model: "anthropic:claude-sonnet-4-0", + model: "anthropic:claude-sonnet-4-5", cacheCreationInputTokens: 125, // 100 + 25 cacheReadInputTokens: 65, // 50 + 15 inputTokens: 275, // 200 + 75 @@ -82,7 +82,7 @@ describe("tokenDataReducer", () => { it("should return update array when state is undefined", () => { const update: ModelTokenData[] = [ { - model: "anthropic:claude-sonnet-4-0", + model: "anthropic:claude-sonnet-4-5", cacheCreationInputTokens: 100, cacheReadInputTokens: 50, inputTokens: 200, @@ -98,7 +98,7 @@ describe("tokenDataReducer", () => { it("should handle empty update array", () => { const state: ModelTokenData[] = [ { - model: "anthropic:claude-sonnet-4-0", + model: "anthropic:claude-sonnet-4-5", cacheCreationInputTokens: 100, cacheReadInputTokens: 
50, inputTokens: 200, @@ -114,7 +114,7 @@ describe("tokenDataReducer", () => { it("should handle multiple updates for the same model", () => { const state: ModelTokenData[] = [ { - model: "anthropic:claude-sonnet-4-0", + model: "anthropic:claude-sonnet-4-5", cacheCreationInputTokens: 100, cacheReadInputTokens: 50, inputTokens: 200, @@ -124,14 +124,14 @@ describe("tokenDataReducer", () => { const update: ModelTokenData[] = [ { - model: "anthropic:claude-sonnet-4-0", + model: "anthropic:claude-sonnet-4-5", cacheCreationInputTokens: 25, cacheReadInputTokens: 15, inputTokens: 75, outputTokens: 60, }, { - model: "anthropic:claude-sonnet-4-0", + model: "anthropic:claude-sonnet-4-5", cacheCreationInputTokens: 10, cacheReadInputTokens: 5, inputTokens: 30, @@ -143,7 +143,7 @@ describe("tokenDataReducer", () => { expect(result).toHaveLength(1); expect(result[0]).toEqual({ - model: "anthropic:claude-sonnet-4-0", + model: "anthropic:claude-sonnet-4-5", cacheCreationInputTokens: 135, // 100 + 25 + 10 cacheReadInputTokens: 70, // 50 + 15 + 5 inputTokens: 305, // 200 + 75 + 30 diff --git a/packages/shared/src/open-swe/llm-task.ts b/packages/shared/src/open-swe/llm-task.ts index e287e7a8d..f1864ae07 100644 --- a/packages/shared/src/open-swe/llm-task.ts +++ b/packages/shared/src/open-swe/llm-task.ts @@ -29,23 +29,23 @@ export enum LLMTask { export const TASK_TO_CONFIG_DEFAULTS_MAP = { [LLMTask.PLANNER]: { - modelName: "anthropic:claude-sonnet-4-0", + modelName: "anthropic:claude-opus-4-5", temperature: 0, }, [LLMTask.PROGRAMMER]: { - modelName: "anthropic:claude-sonnet-4-0", + modelName: "anthropic:claude-opus-4-5", temperature: 0, }, [LLMTask.REVIEWER]: { - modelName: "anthropic:claude-sonnet-4-0", + modelName: "anthropic:claude-opus-4-5", temperature: 0, }, [LLMTask.ROUTER]: { - modelName: "anthropic:claude-3-5-haiku-latest", + modelName: "anthropic:claude-haiku-4-5", temperature: 0, }, [LLMTask.SUMMARIZER]: { - modelName: "anthropic:claude-3-5-haiku-latest", + modelName: 
"anthropic:claude-haiku-4-5", temperature: 0, }, }; diff --git a/packages/shared/src/open-swe/models.ts b/packages/shared/src/open-swe/models.ts index d0ec8043a..7751dc5b9 100644 --- a/packages/shared/src/open-swe/models.ts +++ b/packages/shared/src/open-swe/models.ts @@ -8,10 +8,18 @@ export const MODEL_OPTIONS = [ // label: "Claude Opus 4 (Extended Thinking)", // value: "anthropic:extended-thinking:claude-opus-4-0", // }, + { + label: "Claude Sonnet 4.5", + value: "anthropic:claude-sonnet-4-5", + }, { label: "Claude Sonnet 4", value: "anthropic:claude-sonnet-4-0", }, + { + label: "Claude Opus 4.5", + value: "anthropic:claude-opus-4-5", + }, { label: "Claude Opus 4.1", value: "anthropic:claude-opus-4-1", diff --git a/packages/shared/src/open-swe/types.ts b/packages/shared/src/open-swe/types.ts index 4bf9338f9..2f3d00d31 100644 --- a/packages/shared/src/open-swe/types.ts +++ b/packages/shared/src/open-swe/types.ts @@ -323,7 +323,7 @@ export const GraphConfigurationMetadata: { plannerModelName: { x_open_swe_ui_config: { type: "select", - default: "anthropic:claude-sonnet-4-0", + default: "anthropic:claude-opus-4-5", description: "The model to use for planning tasks. This model should be very good at generating code, and have strong context understanding and reasoning capabilities. It will be used for the most complex tasks throughout the agent.", options: MODEL_OPTIONS_NO_THINKING, @@ -342,7 +342,7 @@ export const GraphConfigurationMetadata: { programmerModelName: { x_open_swe_ui_config: { type: "select", - default: "anthropic:claude-sonnet-4-0", + default: "anthropic:claude-opus-4-5", description: "The model to use for programming/other advanced technical tasks. This model should be very good at generating code, and have strong context understanding and reasoning capabilities. 
It will be used for the most complex tasks throughout the agent.", options: MODEL_OPTIONS_NO_THINKING, @@ -361,7 +361,7 @@ export const GraphConfigurationMetadata: { reviewerModelName: { x_open_swe_ui_config: { type: "select", - default: "anthropic:claude-sonnet-4-0", + default: "anthropic:claude-opus-4-5", description: "The model to use for reviewer tasks. This model should be very good at generating code, and have strong context understanding and reasoning capabilities. It will be used for the most complex tasks throughout the agent.", options: MODEL_OPTIONS_NO_THINKING, @@ -380,7 +380,7 @@ export const GraphConfigurationMetadata: { routerModelName: { x_open_swe_ui_config: { type: "select", - default: "anthropic:claude-3-5-haiku-latest", + default: "anthropic:claude-haiku-4-5", description: "The model to use for routing tasks, and other simple generations. This model should be good at tool calling/structured output.", options: MODEL_OPTIONS, @@ -399,7 +399,7 @@ export const GraphConfigurationMetadata: { summarizerModelName: { x_open_swe_ui_config: { type: "select", - default: "anthropic:claude-sonnet-4-0", + default: "anthropic:claude-opus-4-5", description: "The model to use for summarizing the conversation history, or extracting key context from large inputs. This model should have strong context retention/understanding capabilities, and should be good at tool calling/structured output.", options: MODEL_OPTIONS_NO_THINKING, @@ -537,7 +537,7 @@ export const GraphConfiguration = z.object({ /** * The model ID to use for programming/other advanced technical tasks. - * @default "anthropic:claude-sonnet-4-0" + * @default "anthropic:claude-opus-4-5" */ plannerModelName: withLangGraph(z.string().optional(), { metadata: GraphConfigurationMetadata.plannerModelName, @@ -552,7 +552,7 @@ export const GraphConfiguration = z.object({ /** * The model ID to use for programming/other advanced technical tasks. 
- * @default "anthropic:claude-sonnet-4-0" + * @default "anthropic:claude-opus-4-5" */ programmerModelName: withLangGraph(z.string().optional(), { metadata: GraphConfigurationMetadata.programmerModelName, @@ -567,7 +567,7 @@ export const GraphConfiguration = z.object({ /** * The model ID to use for programming/other advanced technical tasks. - * @default "anthropic:claude-sonnet-4-0" + * @default "anthropic:claude-opus-4-5" */ reviewerModelName: withLangGraph(z.string().optional(), { metadata: GraphConfigurationMetadata.reviewerModelName, @@ -582,7 +582,7 @@ export const GraphConfiguration = z.object({ /** * The model ID to use for routing tasks. - * @default "anthropic:claude-3-5-haiku-latest" + * @default "anthropic:claude-haiku-4-5" */ routerModelName: withLangGraph(z.string().optional(), { metadata: GraphConfigurationMetadata.routerModelName, @@ -597,7 +597,7 @@ export const GraphConfiguration = z.object({ /** * The model ID to use for summarizing the conversation history, or extracting key context from large inputs. - * @default "anthropic:claude-sonnet-4-0" + * @default "anthropic:claude-opus-4-5" */ summarizerModelName: withLangGraph(z.string().optional(), { metadata: GraphConfigurationMetadata.summarizerModelName,