diff --git a/README.md b/README.md
index e72e9d1..386d54b 100644
--- a/README.md
+++ b/README.md
@@ -141,6 +141,49 @@ result = await agent.ainvoke(
 )
 ```
 
+### File System Support
+
+You can attach files to the sandbox's in-memory filesystem and use them for data analysis:
+
+```python
+import asyncio
+
+from langchain_sandbox import PyodideSandboxTool
+from langgraph.prebuilt import create_react_agent
+
+sales_data = """...csv_data"""
+
+# Define the sandbox tool with filesystem support
+sandbox_tool = PyodideSandboxTool(
+    allow_net=True,
+    files={"sales.csv": sales_data},
+)
+
+# Create an agent with the sandbox tool
+agent = create_react_agent(
+    "anthropic:claude-3-7-sonnet-latest", [sandbox_tool]
+)
+
+query = """Please analyze the sales data and tell me:
+1. What is the total revenue by category?
+2. Which region has the highest sales?
+3. What are the top 3 best-selling products by revenue?
+
+Use pandas to read the CSV file and perform the analysis."""
+
+async def run_agent(query: str):
+    # Stream agent outputs
+    async for chunk in agent.astream({"messages": query}):
+        print(chunk)
+        print("\n")
+
+if __name__ == "__main__":
+    # Run the agent
+    asyncio.run(run_agent(query))
+```
+
 #### Stateful Tool
 
 > [!important]
@@ -192,16 +235,15 @@ second_result = await agent.ainvoke(
 )
 ```
 
-
-
 See full examples here:
 
 * [ReAct agent](examples/react_agent.py)
 * [CodeAct agent](examples/codeact_agent.py)
+* [ReAct agent with CSV](examples/react_agent_with_csv.py)
 
 ## 🧩 Components
 
 The sandbox consists of two main components:
 
 - **`pyodide-sandbox-js`**: JavaScript/TypeScript module using Deno to provide the core sandboxing functionality.
-- **`sandbox-py`**: Contains `PyodideSandbox` which just wraps the JavaScript/TypeScript module and executes it as a subprocess.
+- **`sandbox-py`**: Contains `PyodideSandbox`, which wraps the JavaScript/TypeScript module and executes it as a subprocess.
\ No newline at end of file
diff --git a/examples/react_agent_with_csv.py b/examples/react_agent_with_csv.py
new file mode 100644
index 0000000..923418c
--- /dev/null
+++ b/examples/react_agent_with_csv.py
@@ -0,0 +1,47 @@
+# pip install langchain-sandbox langgraph "langchain[anthropic]"
+import asyncio
+
+from langchain_sandbox import PyodideSandboxTool
+from langgraph.prebuilt import create_react_agent
+
+sales_data = """date,product,category,quantity,price,region
+2024-01-15,Laptop,Electronics,2,1299.99,North
+2024-01-16,Chair,Furniture,1,249.50,South
+2024-01-16,T-shirt,Clothing,5,29.99,East
+2024-01-17,Laptop,Electronics,1,1299.99,West
+2024-01-18,Phone,Electronics,3,799.99,North
+2024-01-19,Desk,Furniture,2,399.99,South
+2024-01-20,Jeans,Clothing,4,79.99,East
+2024-01-21,Tablet,Electronics,2,499.99,West
+2024-01-22,Sofa,Furniture,1,899.99,North
+2024-01-23,Shoes,Clothing,3,129.99,South"""
+
+# Define the sandbox tool with filesystem support
+sandbox_tool = PyodideSandboxTool(
+    allow_net=True,
+    files={
+        "sales.csv": sales_data
+    }
+)
+
+# Create an agent with the sandbox tool
+agent = create_react_agent(
+    "anthropic:claude-3-7-sonnet-latest", [sandbox_tool]
+)
+
+query = """Please analyze the sales data and tell me:
+1. What is the total revenue by category?
+2. Which region has the highest sales?
+3. What are the top 3 best-selling products by revenue?
+ +Use pandas to read the CSV file and perform the analysis.""" + +async def run_agent(query: str): + # Stream agent outputs + async for chunk in agent.astream({"messages": query}): + print(chunk) + print("\n") + +if __name__ == "__main__": + # Run the agent + asyncio.run(run_agent(query)) diff --git a/libs/pyodide-sandbox-js/main.ts b/libs/pyodide-sandbox-js/main.ts index b9b7b88..31105bc 100644 --- a/libs/pyodide-sandbox-js/main.ts +++ b/libs/pyodide-sandbox-js/main.ts @@ -13,6 +13,9 @@ import datetime import importlib import json import sys +import os +import base64 +from pathlib import Path from typing import Union, TypedDict, List, Any, Callable, Literal try: @@ -24,11 +27,91 @@ import pyodide_js # noqa sys.setrecursionlimit(400) - class InstallEntry(TypedDict): module: str package: str +def perform_fs_operation(op) -> dict: + """Filesystem operation function for file operations. + + Supports only essential operations needed for the binary streaming protocol: + - read: Read file contents (text or binary) + - write: Write file contents (text or binary) + - list: List directory contents + - mkdir: Create directories + """ + try: + if hasattr(op, 'to_py'): + op = op.to_py() + + operation = op.get("operation") + path = op.get("path") + content = op.get("content") + encoding = op.get("encoding", "utf-8") + + if operation == "read": + if os.path.exists(path): + if encoding == "binary": + with open(path, "rb") as f: + content = base64.b64encode(f.read()).decode('ascii') + return {"success": True, "content": content, "is_binary": True} + else: + with open(path, "r", encoding=encoding) as f: + content = f.read() + return {"success": True, "content": content, "is_binary": False} + else: + return {"success": False, "error": f"File not found: {path}"} + + elif operation == "write": + parent_dir = os.path.dirname(path) + if parent_dir and not os.path.exists(parent_dir): + os.makedirs(parent_dir, exist_ok=True) + + if encoding == "binary": + content = base64.b64decode(content) + with open(path, "wb") as f: + f.write(content) + else: + with open(path, "w", encoding=encoding) as f: + f.write(content) + + if os.path.exists(path): + return {"success": True} + else: + return {"success": False, "error": f"Failed to create file at {path}"} + + elif operation == "list": + if os.path.exists(path): + items = [] + for item in os.listdir(path): + item_path = os.path.join(path, item) + stat_info = os.stat(item_path) + items.append({ + "name": item, + "is_dir": os.path.isdir(item_path), + "is_file": os.path.isfile(item_path), + "size": stat_info.st_size, + "modified": stat_info.st_mtime + }) + return {"success": True, "items": items} + else: + return {"success": False, "error": f"Directory not found: {path}"} + + elif operation == "mkdir": + try: + os.makedirs(path, exist_ok=True) + if os.path.exists(path): + return {"success": True} + else: + return {"success": False, "error": "Failed to create directory"} + except Exception as e: + return {"success": False, "error": f"Error creating directory: {str(e)}"} + + else: + return {"success": False, "error": f"Unknown operation: {operation}"} + + except Exception as e: + return {"success": False, "error": str(e)} def find_imports_to_install(imports: list[str]) -> list[InstallEntry]: """ @@ -57,7 +140,6 @@ def find_imports_to_install(imports: list[str]) -> list[InstallEntry]: ) return to_install - async def install_imports( source_code_or_imports: Union[str, list[str]], additional_packages: list[str] = [], @@ -75,14 +157,14 @@ async def install_imports( try: imports: 
list[str] = find_imports(source_code_or_imports)
         except SyntaxError:
-            return
+            return []
     else:
         imports: list[str] = source_code_or_imports
 
     to_install = find_imports_to_install(imports)
     # Merge with additional packages
     for package in additional_packages:
-        if package not in to_install:
+        if package not in [entry["package"] for entry in to_install]:
             to_install.append(dict(module=package, package=package))
 
     if to_install:
@@ -100,7 +182,6 @@ async def install_imports(
                 break  # Fail fast
 
     return to_install
 
-
 def load_session_bytes(session_bytes: bytes) -> list[str]:
     """Load the session module."""
     import dill
@@ -109,7 +190,6 @@ def load_session_bytes(session_bytes: bytes) -> list[str]:
     buffer = io.BytesIO(session_bytes.to_py())
     dill.session.load_session(filename=buffer)
 
-
 def dump_session_bytes() -> bytes:
     """Dump the session module."""
     import dill
@@ -119,7 +199,6 @@ def dump_session_bytes() -> bytes:
     dill.session.dump_session(filename=buffer)
     return buffer.getvalue()
 
-
 def robust_serialize(obj):
     """Recursively converts an arbitrary Python object into a JSON-serializable
     structure.
@@ -157,7 +236,6 @@ def robust_serialize(obj):
     # return a dictionary with type indicator and repr.
     return {"type": "not_serializable", "repr": repr(obj)}
 
-
 def dumps(result: Any) -> str:
     """Get the result of the session."""
     result = robust_serialize(result)
@@ -170,25 +248,231 @@ interface SessionMetadata {
   packages: string[];
 }
 
+interface FileSystemOptions {
+  enableFileSystem?: boolean;
+}
+
 interface PyodideResult {
   success: boolean;
-  result?: any;
+  result?: unknown;
   stdout?: string[];
   stderr?: string[];
   error?: string;
   jsonResult?: string;
   sessionBytes?: Uint8Array;
   sessionMetadata?: SessionMetadata;
+  fileSystemOperations?: Record<string, unknown>[];
+  fileSystemInfo?: {
+    type: "memfs";
+    mountPoint: string;
+    workingDirectory: string;
+    mounted: boolean;
+  };
+}
+
+interface FileSystemOperation {
+  operation: "read" | "write" | "list" | "mkdir"; // Removed: "exists" | "remove" | "copy"
+  path: string;
+  content?: string | Uint8Array;
+  encoding?: string;
+  destination?: string;
+}
+
+/**
+ * Resolves a relative path within the sandbox environment.
+ */
+function resolvePathInSandbox(
+  inputPath: string,
+  mountPoint: string = "/sandbox"
+): string {
+  // If already absolute, return as is
+  if (inputPath.startsWith("/")) {
+    return inputPath;
+  }
+
+  // Resolve directly in mount point
+  if (inputPath.startsWith("./")) {
+    const cleanPath = inputPath.substring(2);
+    return `${mountPoint}/${cleanPath}`.replace(/\/+/g, "/");
+  } else if (inputPath.startsWith("../")) {
+    return `${mountPoint}/${inputPath}`.replace(/\/+/g, "/");
+  } else {
+    return `${mountPoint}/${inputPath}`.replace(/\/+/g, "/");
+  }
+}
+
+/**
+ * Setup memory filesystem environment in Python.
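+ * Creates the /sandbox mount point, changes the Python working directory to
+ * it, and exposes MOUNT_POINT plus a resolve_path() helper to user code via
+ * __main__ (see the Python block below for the authoritative behavior).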
+ */
+function setupFileSystem(pyodide: unknown): void {
+  const mountPoint = "/sandbox";
+
+  (pyodide as { runPython: (code: string) => void }).runPython(`
+import os
+import sys
+
+# Setup memory filesystem environment
+MOUNT_POINT = "${mountPoint}"
+
+# Ensure directory exists
+os.makedirs(MOUNT_POINT, exist_ok=True)
+
+# Change to mount point
+os.chdir(MOUNT_POINT)
+
+# Make variables available globally
+sys.modules['__main__'].MOUNT_POINT = MOUNT_POINT
+
+# Add helper function for path resolution
+def resolve_path(path):
+    """Resolve a path relative to the sandbox"""
+    if isinstance(path, str) and path.startswith("/"):
+        return path
+    return os.path.join(MOUNT_POINT, path)
+
+sys.modules['__main__'].resolve_path = resolve_path
+  `);
 }
 
-async function initPyodide(pyodide: any): Promise<void> {
-  const sys = pyodide.pyimport("sys");
-  const pathlib = pyodide.pyimport("pathlib");
+function initPyodide(pyodide: unknown): void {
+  const sys = (pyodide as { pyimport: (name: string) => unknown }).pyimport("sys");
+  const pathlib = (pyodide as { pyimport: (name: string) => unknown }).pyimport("pathlib");
   const dirPath = "/tmp/pyodide_worker_runner/";
-  sys.path.append(dirPath);
-  pathlib.Path(dirPath).mkdir();
-  pathlib.Path(dirPath + "prepare_env.py").write_text(prepareEnvCode);
+  (sys as { path: { append: (path: string) => void } }).path.append(dirPath);
+  (pathlib as { Path: (path: string) => { mkdir: () => void; write_text: (text: string) => void } }).Path(dirPath).mkdir();
+  (pathlib as { Path: (path: string) => { mkdir: () => void; write_text: (text: string) => void } }).Path(dirPath + "prepare_env.py").write_text(prepareEnvCode);
+
+  // Ensure sandbox mount point exists
+  try {
+    (pyodide as { FS: { mkdirTree: (path: string) => void } }).FS.mkdirTree("/sandbox");
+  } catch (_e) {
+    // Directory might already exist, which is fine
+  }
+
+  setupFileSystem(pyodide);
+}
+
+/**
+ * Read attached files from stdin using the PSB binary protocol.
+ */
+async function processStreamedFiles(pyodide: unknown): Promise<Record<string, unknown>[]> {
+  const results: Record<string, unknown>[] = [];
+
+  // Read binary protocol header
+  const headerBuffer = new Uint8Array(8);
+  const headerRead = await Deno.stdin.read(headerBuffer);
+
+  if (!headerRead || headerRead < 8) {
+    // No stdin data or insufficient data
+    return results;
+  }
+
+  // Check magic header
+  const magic = new TextDecoder().decode(headerBuffer.slice(0, 3));
+  const version = headerBuffer[3];
+  if (magic !== "PSB" || version !== 1) {
+    throw new Error(`Invalid PSB header: ${magic} v${version}`);
+  }
+
+  // Get metadata length
+  const metadataLength = new DataView(headerBuffer.buffer).getUint32(4, false);
+
+  // Read metadata
+  const metadataBuffer = new Uint8Array(metadataLength);
+  const metadataRead = await Deno.stdin.read(metadataBuffer);
+
+  if (!metadataRead || metadataRead < metadataLength) {
+    throw new Error("Failed to read metadata");
+  }
+
+  // Parse metadata
+  const metadata = JSON.parse(new TextDecoder().decode(metadataBuffer)) as {
+    directories?: string[];
+    files?: Array<{ path: string; size: number; binary: boolean }>;
+  };
+
+  // Process directories first
+  if (metadata.directories) {
+    for (const dir of metadata.directories) {
+      const resolvedPath = resolvePathInSandbox(dir, "/sandbox");
+      try {
+        (pyodide as { FS: { mkdirTree: (path: string) => void } }).FS.mkdirTree(resolvedPath);
+        results.push({
+          success: true,
+          operation: "mkdir",
+          path: dir
+        });
+      } catch (error) {
+        const errorMsg = error instanceof Error ? error.message : String(error);
+        results.push({
+          success: false,
+          error: errorMsg,
+          operation: "mkdir",
+          path: dir
+        });
+      }
+    }
+  }
+
+  // Process files
+  if (metadata.files && metadata.files.length > 0) {
+    for (const fileInfo of metadata.files) {
+      const resolvedPath = resolvePathInSandbox(fileInfo.path, "/sandbox");
+
+      // Create parent directories if needed
+      const parentDir = resolvedPath.substring(0, resolvedPath.lastIndexOf("/"));
+      if (parentDir) {
+        try {
+          (pyodide as { FS: { mkdirTree: (path: string) => void } }).FS.mkdirTree(parentDir);
+        } catch (_e) {
+          // Directory might already exist
+        }
+      }
+
+      try {
+        // Read file data
+        const fileBuffer = new Uint8Array(fileInfo.size);
+        let bytesRead = 0;
+
+        // Read in chunks to handle large files efficiently
+        while (bytesRead < fileInfo.size) {
+          const chunkSize = Math.min(65536, fileInfo.size - bytesRead);
+          const chunkBuffer = new Uint8Array(chunkSize);
+          const readResult = await Deno.stdin.read(chunkBuffer);
+
+          if (readResult === null) {
+            throw new Error(`Unexpected end of stream at ${bytesRead}/${fileInfo.size} bytes`);
+          }
+
+          // Copy to the main buffer
+          fileBuffer.set(chunkBuffer.subarray(0, readResult), bytesRead);
+          bytesRead += readResult;
+        }
+
+        // Write to PyFS
+        (pyodide as { FS: { writeFile: (path: string, data: Uint8Array) => void } }).FS.writeFile(resolvedPath, fileBuffer);
+
+        results.push({
+          success: true,
+          operation: "write",
+          path: fileInfo.path,
+          size: bytesRead,
+          binary: fileInfo.binary
+        });
+      } catch (error) {
+        const errorMsg = error instanceof Error ? error.message : String(error);
+        results.push({
+          success: false,
+          error: errorMsg,
+          operation: "write",
+          path: fileInfo.path
+        });
+      }
+    }
+  }
+
+  return results;
 }
 
 async function runPython(
@@ -197,27 +481,29 @@ async function runPython(
     stateful?: boolean;
     sessionBytes?: string;
     sessionMetadata?: string;
-  }
+  } = {}
 ): Promise<PyodideResult> {
   const output: string[] = [];
   const err_output: string[] = [];
 
   const originalLog = console.log;
-  console.log = (...args: any[]) => {}
+  console.log = (..._args: unknown[]) => {}
 
   try {
     const pyodide = await loadPyodide({
      stdout: (msg) => output.push(msg),
      stderr: (msg) => err_output.push(msg),
-    })
+    });
+
    await pyodide.loadPackage(["micropip"], {
      messageCallback: () => {},
      errorCallback: (msg: string) => {
        output.push(`install error: ${msg}`)
      },
    });
 
-    await initPyodide(pyodide);
-    // Determine session directory
+    initPyodide(pyodide);
+
+    // Determine session metadata
     let sessionMetadata: SessionMetadata;
     if (options.sessionMetadata) {
       sessionMetadata = JSON.parse(options.sessionMetadata);
@@ -227,25 +513,38 @@ async function runPython(
         lastModified: new Date().toISOString(),
         packages: [],
       };
-    };
+    }
 
     let sessionData: Uint8Array | null = null;
     if (options.sessionBytes && !options.sessionMetadata) {
       console.error("sessionMetadata is required when providing sessionBytes");
       return {
         success: false,
         error: "sessionMetadata is required when providing sessionBytes"
       };
     }
-
-    // Import our prepared environment module
-    const prepare_env = pyodide.pyimport("prepare_env");
-    // Prepare additional packages to install (include dill)
+
+    // Import prepared environment module
+    const prepare_env = (pyodide as { pyimport: (name: string) => unknown }).pyimport("prepare_env");
+
+    let fileSystemResults: Record<string, unknown>[] = [];
+
+    if (!Deno.stdin.isTerminal()) {
+      fileSystemResults = await processStreamedFiles(pyodide);
+    }
+
+    // Prepare packages to install (include dill)
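+    // (dill is what load_session_bytes/dump_session_bytes in prepare_env use
+    // to snapshot and restore interpreter state between stateful runs)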
["dill"] : []; const additionalPackagesToInstall = options.sessionBytes ? [...new Set([...defaultPackages, ...sessionMetadata.packages])] : defaultPackages; - let installErrors: string[] = [] + const installErrors: string[] = [] - const installedPackages = await prepare_env.install_imports( + const installedPackages = await (prepare_env as { + install_imports: ( + code: string, + packages: string[], + callback: (event: string, data: string) => void + ) => Promise; + }).install_imports( pythonCode, additionalPackagesToInstall, (event_type: string, data: string) => { @@ -271,17 +570,22 @@ async function runPython( if (options.sessionBytes) { sessionData = Uint8Array.from(JSON.parse(options.sessionBytes)); // Run session preamble - await prepare_env.load_session_bytes(sessionData); + await (prepare_env as { load_session_bytes: (data: Uint8Array) => Promise }) + .load_session_bytes(sessionData); } - const packages = installedPackages.map((pkg: any) => pkg.get("package")); + const packages = installedPackages.map((pkg: unknown) => + (pkg as { get?: (key: string) => string }).get?.("package") as string + ); // Restore the original console.log function console.log = originalLog; + // Run the Python code - const rawValue = await pyodide.runPythonAsync(pythonCode); + const rawValue = await (pyodide as { runPythonAsync: (code: string) => Promise }).runPythonAsync(pythonCode); // Dump result to string - const jsonValue = await prepare_env.dumps(rawValue); + const jsonValue = await (prepare_env as { dumps: (value: unknown) => Promise }) + .dumps(rawValue); // Update session metadata with installed packages sessionMetadata.packages = [ @@ -291,25 +595,41 @@ async function runPython( if (options.stateful) { // Save session state to sessionBytes - sessionData = await prepare_env.dump_session_bytes() as Uint8Array; - }; + sessionData = await (prepare_env as { dump_session_bytes: () => Promise }) + .dump_session_bytes(); + } + + // Process stdout - join array to string for consistent handling + const stdoutString = output.join('\n'); + // Return the result with stdout and stderr output const result: PyodideResult = { success: true, result: rawValue, jsonResult: jsonValue, - stdout: output, + stdout: stdoutString ? [stdoutString] : [], stderr: err_output, sessionMetadata: sessionMetadata, }; + if (options.stateful && sessionData) { result["sessionBytes"] = sessionData; } + + // Add filesystem info + result["fileSystemOperations"] = fileSystemResults; + result["fileSystemInfo"] = { + type: "memfs", + mountPoint: "/sandbox", + workingDirectory: "", + mounted: true + }; + return result; - } catch (error: any) { + } catch (error: unknown) { return { success: false, - error: error.message, + error: error instanceof Error ? 
       stdout: output,
       stderr: err_output
     };
   }
@@ -329,7 +649,11 @@ async function main(): Promise<void> {
       m: "session-metadata",
     },
     boolean: ["help", "version", "stateful"],
-    default: { help: false, version: false, stateful: false },
+    default: {
+      help: false,
+      version: false,
+      stateful: false
+    },
   });
 
   if (flags.help) {
@@ -338,14 +662,14 @@
 pyodide-sandbox ${pkgVersion}
 Run Python code in a sandboxed environment using Pyodide
 
 OPTIONS:
-  -c, --code              Python code to execute
-  -f, --file              Path to Python file to execute
-  -s, --stateful          Use a stateful session
-  -b, --session-bytes     Session bytes
-  -m, --session-metadata  Session metadata
-  -h, --help              Display help
-  -V, --version           Display version
-`);
+  -c, --code              Python code to execute
+  -f, --file              Path to Python file to execute
+  -s, --stateful          Use a stateful session
+  -b, --session-bytes     Session bytes
+  -m, --session-metadata  Session metadata
+  -h, --help              Display help
+  -V, --version           Display version
+`);
     return;
   }
 
   if (flags.version) {
     console.log(pkgVersion);
     return
   }
 
-  const options = {
-    code: flags.code,
-    file: flags.file,
-    stateful: flags.stateful,
-    sessionBytes: flags["session-bytes"],
-    sessionMetadata: flags["session-metadata"],
-  };
-
-  if (!options.code && !options.file) {
-    console.error(
-      "Error: You must provide Python code using either -c/--code or -f/--file option.\nUse --help for usage information."
-    );
-    Deno.exit(1);
-  }
-
   // Get Python code from file or command line argument
   let pythonCode = "";
-  if (options.file) {
+  if (flags.file) {
     try {
       // Resolve relative or absolute file path
-      const filePath = options.file.startsWith("/")
-        ? options.file
-        : join(Deno.cwd(), options.file);
+      const filePath = flags.file.startsWith("/")
+        ? flags.file
+        : join(Deno.cwd(), flags.file);
       pythonCode = await Deno.readTextFile(filePath);
-    } catch (error: any) {
-      console.error(`Error reading file ${options.file}:`, error.message);
+    } catch (error: unknown) {
+      const errorMessage = error instanceof Error ? error.message : String(error);
+      console.error(`Error reading file ${flags.file}:`, errorMessage);
       Deno.exit(1);
     }
   } else {
     // Process code from command line (replacing escaped newlines)
-    pythonCode = options.code?.replace(/\\n/g, "\n") ?? "";
+    pythonCode = flags.code?.replace(/\\n/g, "\n") ?? "";
   }
 
+  if (!pythonCode) {
+    console.error(
+      "Error: You must provide Python code using either -c/--code or -f/--file option.\nUse --help for usage information."
+    );
+    Deno.exit(1);
+  }
+
+  // Run the code
   const result = await runPython(pythonCode, {
-    stateful: options.stateful,
-    sessionBytes: options.sessionBytes,
-    sessionMetadata: options.sessionMetadata,
+    stateful: flags.stateful,
+    sessionBytes: flags["session-bytes"],
+    sessionMetadata: flags["session-metadata"],
   });
 
-  // Exit with error code if Python execution failed
   // Create output JSON with stdout, stderr, and result
-  const outputJson = {
-    stdout: result.stdout?.join('') || null,
-    stderr: result.success ? (result.stderr?.join('') || null) : result.error || null,
+  const outputJson: Record<string, unknown> = {
+    stdout: result.stdout?.join('\n') || "",
+    stderr: result.success ? (result.stderr?.join('\n') || null) : result.error || null,
    result: result.success ? 
JSON.parse(result.jsonResult || 'null') : null, success: result.success, sessionBytes: result.sessionBytes, sessionMetadata: result.sessionMetadata, }; + // Include filesystem info if used + if (result.fileSystemInfo) { + outputJson.fileSystemInfo = result.fileSystemInfo; + } + if (result.fileSystemOperations) { + outputJson.fileSystemOperations = result.fileSystemOperations; + } + // Output as JSON to stdout console.log(JSON.stringify(outputJson)); @@ -416,12 +741,10 @@ OPTIONS: // If this module is run directly if (import.meta.main) { - // Override the global environment variables that Deno's permission prompts look for - // to suppress color-related permission prompts main().catch((err) => { console.error("Unhandled error:", err); Deno.exit(1); }); } -export { runPython }; +export { runPython, resolvePathInSandbox, type FileSystemOperation, type FileSystemOptions }; \ No newline at end of file diff --git a/libs/pyodide-sandbox-js/main_test.ts b/libs/pyodide-sandbox-js/main_test.ts index 5aafc05..3b335d5 100644 --- a/libs/pyodide-sandbox-js/main_test.ts +++ b/libs/pyodide-sandbox-js/main_test.ts @@ -1,5 +1,5 @@ -import { assertEquals, assertNotEquals, assertExists } from "@std/assert"; -import { runPython } from "./main.ts"; +import { assertEquals, assertExists, assertNotEquals } from "@std/assert"; +import { runPython, resolvePathInSandbox } from "./main.ts"; Deno.test("runPython simple test", async () => { const result = await runPython("x = 2 + 3; x", {}); @@ -10,11 +10,19 @@ Deno.test("runPython simple test", async () => { Deno.test("runPython with stdout", async () => { const result = await runPython("x = 5; print(x); x", {}); assertEquals(result.success, true); - assertEquals(result.stdout?.join(''), "5"); + assertEquals(result.stdout?.join('').trim(), "5"); assertEquals(JSON.parse(result.jsonResult || "null"), 5); assertEquals(result.stderr?.length, 0); }); +Deno.test("runPython with error - name error", async () => { + const result = await runPython("undefined_variable", {}); + assertEquals(result.success, false); + assertExists(result.error); + // Check that error contains NameError + assertEquals(result.error?.includes("NameError"), true); +}); + Deno.test("runPython with error - division by zero", async () => { const result = await runPython("x = 1/0", {}); assertEquals(result.success, false); @@ -26,14 +34,273 @@ Deno.test("runPython with error - syntax error", async () => { const result = await runPython("x = 5; y = x +", {}); assertEquals(result.success, false); assertNotEquals(result.error?.length, 0); - // Check that error contains SyntaxError assertEquals(result.error?.includes("SyntaxError"), true); }); -Deno.test("runPython with error - name error", async () => { - const result = await runPython("undefined_variable", {}); - assertEquals(result.success, false); - assertExists(result.error); - // Check that error contains NameError - assertEquals(result.error?.includes("NameError"), true); +Deno.test("resolvePathInSandbox - basic resolution", () => { + assertEquals(resolvePathInSandbox("config.json"), "/sandbox/config.json"); + assertEquals(resolvePathInSandbox("./logs/app.log"), "/sandbox/logs/app.log"); + assertEquals(resolvePathInSandbox("../shared/data.txt"), "/sandbox/../shared/data.txt"); + assertEquals(resolvePathInSandbox("/tmp/absolute.txt"), "/tmp/absolute.txt"); }); + +// Helper function to create stdin data for filesystem operations +function createFilesystemStdin( + files: Array<{ path: string; content: string | Uint8Array; binary?: boolean }>, + 
directories: string[] = []
+): Uint8Array {
+  // Convert files to the expected format
+  const fileInfos = files.map(f => {
+    const contentBytes = typeof f.content === 'string'
+      ? new TextEncoder().encode(f.content)
+      : f.content;
+
+    return {
+      path: f.path,
+      size: contentBytes.length,
+      binary: f.binary || false,
+      content: contentBytes
+    };
+  });
+
+  // Create metadata
+  const metadata = {
+    files: fileInfos.map(f => ({
+      path: f.path,
+      size: f.size,
+      binary: f.binary
+    })),
+    directories: directories
+  };
+
+  const metadataJson = new TextEncoder().encode(JSON.stringify(metadata));
+
+  // Create header: "PSB" + version + metadata size (4 bytes)
+  const header = new Uint8Array(8);
+  header.set(new TextEncoder().encode("PSB"), 0);
+  header[3] = 1; // version
+
+  // Set metadata length (big endian)
+  const dataView = new DataView(header.buffer);
+  dataView.setUint32(4, metadataJson.length, false);
+
+  // Combine header + metadata + file contents
+  const totalSize = header.length + metadataJson.length +
+    fileInfos.reduce((sum, f) => sum + f.content.length, 0);
+
+  const result = new Uint8Array(totalSize);
+  let offset = 0;
+
+  result.set(header, offset);
+  offset += header.length;
+
+  result.set(metadataJson, offset);
+  offset += metadataJson.length;
+
+  for (const fileInfo of fileInfos) {
+    result.set(fileInfo.content, offset);
+    offset += fileInfo.content.length;
+  }
+
+  return result;
+}
+
+// Mock Deno.stdin for filesystem tests
+async function runPythonWithFiles(
+  code: string,
+  files: Array<{ path: string; content: string | Uint8Array; binary?: boolean }> = [],
+  directories: string[] = [],
+  options: Record<string, unknown> = {}
+) {
+  if (files.length === 0 && directories.length === 0) {
+    return await runPython(code, options);
+  }
+
+  // Create the stdin data
+  const stdinData = createFilesystemStdin(files, directories);
+
+  // Mock stdin for this test
+  const originalIsTerminal = Deno.stdin.isTerminal;
+  const originalRead = Deno.stdin.read;
+  let dataOffset = 0;
+
+  // Mock isTerminal to return false (indicating we have stdin data)
+  Deno.stdin.isTerminal = () => false;
+
+  // Mock stdin.read to return our data
+  Deno.stdin.read = (buffer: Uint8Array): Promise<number | null> => {
+    if (dataOffset >= stdinData.length) {
+      return Promise.resolve(null);
+    }
+
+    const remaining = stdinData.length - dataOffset;
+    const toRead = Math.min(buffer.length, remaining);
+
+    buffer.set(stdinData.subarray(dataOffset, dataOffset + toRead));
+    dataOffset += toRead;
+
+    return Promise.resolve(toRead);
+  };
+
+  try {
+    return await runPython(code, options);
+  } finally {
+    // Restore original functions
+    Deno.stdin.isTerminal = originalIsTerminal;
+    Deno.stdin.read = originalRead;
+  }
+}
+
+Deno.test("FileSystem - operations", async () => {
+  const files = [
+    {
+      path: "config.json",
+      content: '{"app": "test", "version": "1.0"}'
+    },
+    {
+      path: "data/output.txt",
+      content: "Hello World\nLine 2"
+    }
+  ];
+
+  const directories = ["data"];
+
+  const result = await runPythonWithFiles(`
+import os
+import json
+
+# Read config file
+with open("config.json", "r") as f:
+    config = json.load(f)
+
+# Read data file
+with open("data/output.txt", "r") as f:
+    content = f.read()
+
+# List files
+root_files = os.listdir(".")
+data_files = os.listdir("data")
+
+result = {
+    "config": config,
+    "content": content.strip(),
+    "root_files": sorted(root_files),
+    "data_files": sorted(data_files),
+    "working_dir": os.getcwd()
+}
+
+result
+  `, files, directories);
+
+  assertEquals(result.success, true);
+  const resultObj = JSON.parse(result.jsonResult || "null");
+
+  assertEquals(resultObj.config.app, "test");
+  assertEquals(resultObj.content, "Hello World\nLine 2");
+  assertEquals(resultObj.root_files, ["config.json", "data"]);
+  assertEquals(resultObj.data_files, ["output.txt"]);
+  assertEquals(resultObj.working_dir, "/sandbox");
+});
+
+Deno.test("FileSystem - binary operations", async () => {
+  // Create binary content - "Binary data" encoded as bytes
+  const binaryContent = new TextEncoder().encode("Binary data");
+
+  const files = [
+    {
+      path: "test.bin",
+      content: binaryContent,
+      binary: true
+    }
+  ];
+
+  const result = await runPythonWithFiles(`
+import os
+
+# Read binary file
+with open("test.bin", "rb") as f:
+    binary_content = f.read()
+
+# Decode content
+decoded = binary_content.decode('utf-8')
+
+result = {
+    "file_exists": os.path.exists("test.bin"),
+    "decoded_content": decoded,
+    "is_binary_match": decoded == "Binary data",
+    "working_dir": os.getcwd()
+}
+
+result
+  `, files);
+
+  assertEquals(result.success, true);
+  const resultObj = JSON.parse(result.jsonResult || "null");
+  assertEquals(resultObj.file_exists, true);
+  assertEquals(resultObj.decoded_content, "Binary data");
+  assertEquals(resultObj.is_binary_match, true);
+  assertEquals(resultObj.working_dir, "/sandbox");
+});
+
+Deno.test("FileSystem - memfs directory structure", async () => {
+  const files = [
+    {
+      path: "project/src/main.py",
+      content: "print('Hello from memfs!')"
+    },
+    {
+      path: "project/README.md",
+      content: "# My Project\nRunning in memfs"
+    }
+  ];
+
+  const directories = ["project", "project/src"];
+
+  const result = await runPythonWithFiles(`
+import os
+
+# Navigate and check structure
+project_exists = os.path.exists("project")
+src_exists = os.path.exists("project/src")
+main_py_exists = os.path.exists("project/src/main.py")
+readme_exists = os.path.exists("project/README.md")
+
+# Read files
+with open("project/src/main.py", "r") as f:
+    main_content = f.read()
+
+with open("project/README.md", "r") as f:
+    readme_content = f.read()
+
+# List structure
+project_files = sorted(os.listdir("project"))
+src_files = sorted(os.listdir("project/src"))
+
+result = {
+    "project_exists": project_exists,
+    "src_exists": src_exists,
+    "main_py_exists": main_py_exists,
+    "readme_exists": readme_exists,
+    "main_content": main_content.strip(),
+    "readme_content": readme_content.strip(),
+    "project_files": project_files,
+    "src_files": src_files,
+    "working_dir": os.getcwd()
+}
+
+result
+  `, files, directories);
+
+  assertEquals(result.success, true);
+  const resultObj = JSON.parse(result.jsonResult || "null");
+
+  assertEquals(resultObj.project_exists, true);
+  assertEquals(resultObj.src_exists, true);
+  assertEquals(resultObj.main_py_exists, true);
+  assertEquals(resultObj.readme_exists, true);
+  assertEquals(resultObj.main_content, "print('Hello from memfs!')");
+  assertEquals(resultObj.readme_content, "# My Project\nRunning in memfs");
+  assertEquals(resultObj.project_files, ["README.md", "src"]);
+  assertEquals(resultObj.src_files, ["main.py"]);
+  assertEquals(resultObj.working_dir, "/sandbox");
+});
\ No newline at end of file
diff --git a/libs/sandbox-py/README.md b/libs/sandbox-py/README.md
index e37ba58..386d54b 100644
--- a/libs/sandbox-py/README.md
+++ b/libs/sandbox-py/README.md
@@ -34,6 +34,11 @@ LangChain Sandbox provides a secure environment for executing untrusted Python c
 
 ## 💡 Example Usage
 
+
+> [!warning]
+> Use `allow_net` to restrict which network requests the sandboxed code can make and reduce the risk of SSRF attacks:
+> https://docs.deno.com/runtime/fundamentals/security/#network-access
+
 ```python
 from langchain_sandbox import PyodideSandbox
 
@@ -136,6 +141,49 @@ result = await agent.ainvoke(
 )
 ```
 
+### File System Support
+
+You can attach files to the sandbox's in-memory filesystem and use them for data analysis:
+
+```python
+import asyncio
+
+from langchain_sandbox import PyodideSandboxTool
+from langgraph.prebuilt import create_react_agent
+
+sales_data = """...csv_data"""
+
+# Define the sandbox tool with filesystem support
+sandbox_tool = PyodideSandboxTool(
+    allow_net=True,
+    files={"sales.csv": sales_data},
+)
+
+# Create an agent with the sandbox tool
+agent = create_react_agent(
+    "anthropic:claude-3-7-sonnet-latest", [sandbox_tool]
+)
+
+query = """Please analyze the sales data and tell me:
+1. What is the total revenue by category?
+2. Which region has the highest sales?
+3. What are the top 3 best-selling products by revenue?
+
+Use pandas to read the CSV file and perform the analysis."""
+
+async def run_agent(query: str):
+    # Stream agent outputs
+    async for chunk in agent.astream({"messages": query}):
+        print(chunk)
+        print("\n")
+
+if __name__ == "__main__":
+    # Run the agent
+    asyncio.run(run_agent(query))
+```
+
 #### Stateful Tool
 
 > [!important]
@@ -187,16 +235,15 @@ second_result = await agent.ainvoke(
 )
 ```
 
-
-
 See full examples here:
 
 * [ReAct agent](examples/react_agent.py)
 * [CodeAct agent](examples/codeact_agent.py)
+* [ReAct agent with CSV](examples/react_agent_with_csv.py)
 
 ## 🧩 Components
 
 The sandbox consists of two main components:
 
 - **`pyodide-sandbox-js`**: JavaScript/TypeScript module using Deno to provide the core sandboxing functionality.
-- **`sandbox-py`**: Contains `PyodideSandbox` which just wraps the JavaScript/TypeScript module and executes it as a subprocess.
+- **`sandbox-py`**: Contains `PyodideSandbox`, which wraps the JavaScript/TypeScript module and executes it as a subprocess.
\ No newline at end of file diff --git a/libs/sandbox-py/langchain_sandbox/__init__.py b/libs/sandbox-py/langchain_sandbox/__init__.py index ab8c43a..526a70f 100644 --- a/libs/sandbox-py/langchain_sandbox/__init__.py +++ b/libs/sandbox-py/langchain_sandbox/__init__.py @@ -6,8 +6,4 @@ SyncPyodideSandbox, ) -__all__ = [ - "PyodideSandbox", - "PyodideSandboxTool", - "SyncPyodideSandbox", -] +__all__ = ["PyodideSandbox", "PyodideSandboxTool", "SyncPyodideSandbox"] diff --git a/libs/sandbox-py/langchain_sandbox/pyodide.py b/libs/sandbox-py/langchain_sandbox/pyodide.py index cc1f579..19e3377 100644 --- a/libs/sandbox-py/langchain_sandbox/pyodide.py +++ b/libs/sandbox-py/langchain_sandbox/pyodide.py @@ -15,7 +15,7 @@ from langchain_core.messages import ToolMessage from langchain_core.runnables import RunnableConfig from langchain_core.tools import BaseTool, InjectedToolCallId -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, PrivateAttr logger = logging.getLogger(__name__) @@ -34,10 +34,13 @@ class CodeExecutionResult: execution_time: float session_metadata: dict | None = None session_bytes: bytes | None = None + filesystem_info: dict | None = None + filesystem_operations: list[dict] | None = None # Published package name PKG_NAME = "jsr:@langchain/pyodide-sandbox@0.0.4" +# PKG_NAME = "../pyodide-sandbox-js/main.ts" def build_permission_flag( @@ -80,6 +83,7 @@ class BasePyodideSandbox: - Support for execution timeouts to prevent infinite loops - Memory usage monitoring - Process isolation via Deno's security sandbox + - In-memory filesystem with file attachment capabilities The sandbox supports fine-grained permission control through its initializer: - Restrict network access to specific domains @@ -100,6 +104,8 @@ def __init__( allow_ffi: list[str] | bool = False, node_modules_dir: str = "auto", skip_deno_check: bool = False, + files: dict[str, str | bytes] | None = None, + directories: list[str] | None = None, ) -> None: """Initialize the sandbox with specific Deno permissions. @@ -126,7 +132,7 @@ def __init__( - List[str]: Read access restricted to specific paths, e.g. ["/tmp/sandbox", "./data"] - By default allows read from node_modules + By default allows read from node_modules and other required paths allow_write: File system write access configuration: - False: No file system write access (default, most secure) @@ -134,7 +140,7 @@ def __init__( - List[str]: Write access restricted to specific paths, e.g. ["/tmp/sandbox/output"] - By default allows write to node_modules + By default allows write to node_modules and other required paths allow_net: Network access configuration: - False: No network access (default, most secure) @@ -157,10 +163,15 @@ def __init__( node_modules_dir: Directory for Node.js modules. Set to "auto" to use the default directory for Deno modules. skip_deno_check: If True, skip the check for Deno installation. + files: Dictionary of files to attach to the sandbox filesystem. + Keys are file paths, values are file contents (str or bytes). + directories: List of directory paths to create in the sandbox filesystem. 
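+                For example, ``files={"data.csv": "a,b\n1,2"}`` together with
+                ``directories=["outputs"]`` creates ``/sandbox/data.csv`` and
+                ``/sandbox/outputs`` before any user code runs.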
""" self.stateful = stateful - # Configure permissions - self.permissions = [] + # List to store file information for binary streaming + self._sandbox_files = [] + # List to store directory paths + self._sandbox_dirs = list(directories) if directories else [] if not skip_deno_check: # Check if Deno is installed @@ -197,6 +208,105 @@ def __init__( self.permissions.append(f"--node-modules-dir={node_modules_dir}") + # Process files if provided during initialization + if files: + for path, content in files.items(): + self._process_file(path, content) + + def _process_file( + self, + path: str, + content: str | bytes, + ) -> None: + """Process a file for attachment during initialization only. + + Args: + path: Path where the file should be available in the sandbox + content: File content as string (for text files) or bytes (for binary files) + + Raises: + TypeError: If content is neither string nor bytes + """ + if isinstance(content, str): + # Text file - convert to bytes + content_bytes = content.encode("utf-8") + self._sandbox_files.append( + { + "path": path, + "content": content_bytes, + "size": len(content_bytes), + "binary": False, + } + ) + logger.debug( + "Processed text file: %s (%d bytes)", + path, + len(content_bytes), + ) + elif isinstance(content, bytes): + # Binary file + self._sandbox_files.append( + {"path": path, "content": content, "size": len(content), "binary": True} + ) + logger.debug( + "Processed binary file: %s (%d bytes)", + path, + len(content), + ) + else: + msg = f"Content must be either a string or bytes, got {type(content)}" + raise TypeError(msg) + + @property + def _attached_files(self) -> list[str]: + """Get list of attached file paths. + + Returns: + List of file paths currently attached to the sandbox + """ + return [f["path"] for f in self._sandbox_files] + + def _prepare_stdin_data(self) -> bytes | None: + """Prepare data to be sent via stdin using binary streaming protocol. + + Creates a binary stream containing filesystem data when files or directories + are attached. Uses the PSB (Pyodide Sandbox Binary) protocol format: + - Header: "PSB" + version(1 byte) + metadata_length(4 bytes) + - Metadata: JSON describing files and directories + - Content: Raw binary content of all files in sequence + + Returns: + Binary data to send via stdin, or None if no filesystem operations + """ + # Use binary protocol if we have files or directories + if not self._sandbox_files and not self._sandbox_dirs: + # No files, return None to avoid sending stdin + return None + + # Format: "PSB" + version + length(4 bytes) + metadata JSON + file data + metadata = { + "files": [ + {"path": f["path"], "size": f["size"], "binary": f["binary"]} + for f in self._sandbox_files + ], + "directories": self._sandbox_dirs, + } + + metadata_json = json.dumps(metadata).encode("utf-8") + + # Create header: "PSB" + version + metadata size (4 bytes) + header = b"PSB\x01" + len(metadata_json).to_bytes(4, byteorder="big") + + # Concatenate header + metadata + result = bytearray(header) + result.extend(metadata_json) + + # Add file contents directly as binary data + for file_info in self._sandbox_files: + result.extend(file_info["content"]) + + return bytes(result) + def _build_command( self, code: str, @@ -248,11 +358,58 @@ def _build_command( return cmd +def _process_execution_output( + stdout_text: str, + stderr_bytes: bytes, +) -> tuple[ + str, str, Any, str, dict | None, dict | None, list[dict] | None, bytes | None +]: + """Process execution output and return parsed results. 
+ + Returns: + Tuple of (stdout, stderr, result, status, session_metadata, + filesystem_info, filesystem_operations, session_bytes) + """ + if stdout_text: + try: + full_result = json.loads(stdout_text) + stdout = full_result.get("stdout", "") + stderr = full_result.get("stderr", "") + result = full_result.get("result", None) + status = "success" if full_result.get("success", False) else "error" + session_metadata = full_result.get("sessionMetadata", None) + filesystem_info = full_result.get("fileSystemInfo", None) + filesystem_operations = full_result.get("fileSystemOperations", None) + + # Convert array of bytes to Python bytes + session_bytes_array = full_result.get("sessionBytes", None) + session_bytes = bytes(session_bytes_array) if session_bytes_array else None + + return ( + stdout, + stderr, + result, + status, + session_metadata, + filesystem_info, + filesystem_operations, + session_bytes, + ) + except json.JSONDecodeError as e: + status = "error" + stderr = f"Failed to parse output as JSON: {e}\nRaw output: {stdout_text}" + return ("", stderr, None, status, None, None, None, None) + + stderr = stderr_bytes.decode("utf-8", errors="replace") + return ("", stderr, None, "error", None, None, None, None) + + class PyodideSandbox(BasePyodideSandbox): """Asynchronous implementation of PyodideSandbox. This class provides an asynchronous interface for executing Python code in a - sandboxed Deno environment using Pyodide. + sandboxed Deno environment using Pyodide. It supports file attachment and + in-memory filesystem operations via binary streaming. """ async def execute( @@ -282,11 +439,8 @@ async def execute( CodeExecutionResult containing execution results and metadata """ start_time = time.time() - stdout = "" - stderr = "" - result = None - status: Literal["success", "error"] = "success" + # Build the command with all necessary arguments cmd = self._build_command( code, session_bytes=session_bytes, @@ -294,56 +448,76 @@ async def execute( memory_limit_mb=memory_limit_mb, ) - # Create and run the subprocess - process = await asyncio.create_subprocess_exec( - *cmd, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) + # Prepare stdin data with filesystem operations (always binary streaming) + stdin_data = self._prepare_stdin_data() try: - # Wait for process with a timeout + # Configure process + process = await asyncio.create_subprocess_exec( + *cmd, + stdin=asyncio.subprocess.PIPE if stdin_data else None, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + + # Send stdin data if we have filesystem operations + communicate_args = {} + if stdin_data: + communicate_args["input"] = stdin_data + + # Wait for the process with timeout stdout_bytes, stderr_bytes = await asyncio.wait_for( - process.communicate(), + process.communicate(**communicate_args), timeout=timeout_seconds, ) - stdout = stdout_bytes.decode("utf-8", errors="replace") - - if stdout: - # stdout encodes the full result from the sandbox. - # including stdout, stderr, and the json result. 
- full_result = json.loads(stdout) - stdout = full_result.get("stdout", None) - stderr = full_result.get("stderr", None) - result = full_result.get("result", None) - status = "success" if full_result.get("success", False) else "error" - session_metadata = full_result.get("sessionMetadata", None) - # Convert the Uint8Array to Python bytes - session_bytes_array = full_result.get("sessionBytes", None) - session_bytes = ( - bytes(session_bytes_array) if session_bytes_array else None - ) - else: - stderr = stderr_bytes.decode("utf-8", errors="replace") - status = "error" + + # Process the output + stdout_text = stdout_bytes.decode("utf-8", errors="replace") + ( + stdout, + stderr, + result, + status, + session_metadata, + filesystem_info, + filesystem_operations, + session_bytes, + ) = _process_execution_output(stdout_text, stderr_bytes) + except asyncio.TimeoutError: - process.kill() - await process.wait() + if process: + process.kill() + await process.wait() status = "error" stderr = f"Execution timed out after {timeout_seconds} seconds" - except asyncio.CancelledError: - # Optionally: log cancellation if needed - pass + stdout = "" + result = None + session_metadata = None + filesystem_info = None + filesystem_operations = None + session_bytes = None + except (OSError, subprocess.SubprocessError) as e: + status = "error" + stderr = f"Error during execution: {e!s}" + stdout = "" + result = None + session_metadata = None + filesystem_info = None + filesystem_operations = None + session_bytes = None + end_time = time.time() return CodeExecutionResult( status=status, execution_time=end_time - start_time, - stdout=stdout or None, + stdout=stdout, stderr=stderr or None, result=result, session_metadata=session_metadata, session_bytes=session_bytes, + filesystem_info=filesystem_info, + filesystem_operations=filesystem_operations, ) @@ -351,6 +525,7 @@ class SyncPyodideSandbox(BasePyodideSandbox): """Synchronous version of PyodideSandbox. This class provides a synchronous interface to the PyodideSandbox functionality. + It supports the same features as the asynchronous version but in a blocking manner. """ def execute( @@ -378,11 +553,8 @@ def execute( CodeExecutionResult containing execution results and metadata """ start_time = time.time() - stdout = "" - result = None - stderr: str - status: Literal["success", "error"] + # Build command cmd = self._build_command( code, session_bytes=session_bytes, @@ -390,61 +562,70 @@ def execute( memory_limit_mb=memory_limit_mb, ) + # Prepare stdin data with filesystem operations (always binary streaming) + stdin_data = self._prepare_stdin_data() + try: - # Run the subprocess with timeout + # Execute the subprocess with stdin data # Ignoring S603 for subprocess.run as the cmd is built safely. # Untrusted input comes from `code` parameter, which should be # escaped properly as we are **not** using shell=True. 
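+            # stdin_data is None when no files or directories are attached;
+            # subprocess.run(input=None) then sends nothing on stdin.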
            process = subprocess.run(  # noqa: S603
                 cmd,
+                input=stdin_data,
                 capture_output=True,
                 text=False,  # Keep as bytes for proper decoding
                 timeout=timeout_seconds,
                 check=False,  # Don't raise on non-zero exit
             )
 
-            stdout_bytes = process.stdout
-            stderr_bytes = process.stderr
-
-            stdout = stdout_bytes.decode("utf-8", errors="replace")
-
-            if stdout:
-                # stdout encodes the full result from the sandbox
-                # including stdout, stderr, and the json result
-                full_result = json.loads(stdout)
-                stdout = full_result.get("stdout", None)
-                stderr = full_result.get("stderr", None)
-                result = full_result.get("result", None)
-                status = "success" if full_result.get("success", False) else "error"
-                session_metadata = full_result.get("sessionMetadata", None)
-                # Convert the Uint8Array to Python bytes
-                session_bytes_array = full_result.get("sessionBytes", None)
-                session_bytes = (
-                    bytes(session_bytes_array) if session_bytes_array else None
-                )
-            else:
-                stderr = stderr_bytes.decode("utf-8", errors="replace")
-                status = "error"
+            # Process the output
+            stdout_text = process.stdout.decode("utf-8", errors="replace")
+            (
+                stdout,
+                stderr,
+                result,
+                status,
+                session_metadata,
+                filesystem_info,
+                filesystem_operations,
+                session_bytes,
+            ) = _process_execution_output(stdout_text, process.stderr)
 
         except subprocess.TimeoutExpired:
             status = "error"
             stderr = f"Execution timed out after {timeout_seconds} seconds"
+            stdout = ""
+            result = None
+            session_metadata = None
+            filesystem_info = None
+            filesystem_operations = None
+            session_bytes = None
+        except (OSError, subprocess.SubprocessError) as e:
+            status = "error"
+            stderr = f"Error during execution: {e!s}"
+            stdout = ""
+            result = None
+            session_metadata = None
+            filesystem_info = None
+            filesystem_operations = None
+            session_bytes = None
 
         end_time = time.time()
 
         return CodeExecutionResult(
             status=status,
             execution_time=end_time - start_time,
-            stdout=stdout or None,
+            stdout=stdout,
             stderr=stderr or None,
             result=result,
             session_metadata=session_metadata,
             session_bytes=session_bytes,
+            filesystem_info=filesystem_info,
+            filesystem_operations=filesystem_operations,
         )
 
 
 class PyodideSandboxTool(BaseTool):
-    """Tool for running python code in a PyodideSandbox.
+    r"""Tool for running Python code in a PyodideSandbox.
 
     If you use a stateful sandbox (PyodideSandboxTool(stateful=True)),
     the state between code executions (to variables, imports,
@@ -455,19 +636,25 @@ class PyodideSandboxTool(BaseTool):
     inside a LangGraph graph with a checkpointer, and has to be used
     with the prebuilt `create_react_agent` or `ToolNode`.
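+
+    Files passed via ``files=`` are written into the sandbox's in-memory
+    filesystem under /sandbox (the working directory) before your code runs,
+    so they can be opened with plain relative paths like ``open("data.csv")``.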
- Example: stateless sandbox usage + Example: stateless sandbox usage with file attachment ```python from langgraph.prebuilt import create_react_agent from langchain_sandbox import PyodideSandboxTool - tool = PyodideSandboxTool(allow_net=True) + # Attach CSV data to the sandbox + csv_data = "name,age\nJohn,30\nJane,25" + tool = PyodideSandboxTool( + allow_net=True, + files={"data.csv": csv_data} + ) + agent = create_react_agent( "anthropic:claude-3-7-sonnet-latest", tools=[tool], ) result = await agent.ainvoke( - {"messages": [{"role": "user", "content": "what's 5 + 7?"}]}, + {"messages": [{"role": "user", "content": "analyze the data in data.csv"}]}, ) ``` @@ -477,7 +664,7 @@ class PyodideSandboxTool(BaseTool): from langgraph.prebuilt import create_react_agent from langgraph.prebuilt.chat_agent_executor import AgentState from langgraph.checkpoint.memory import InMemorySaver - from langchain_sandbox import PyodideSandboxTool, PyodideSandbox + from langchain_sandbox import PyodideSandboxTool class State(AgentState): session_bytes: bytes @@ -508,28 +695,43 @@ class State(AgentState): """ name: str = "python_code_sandbox" - description: str = ( - "A secure Python code sandbox. Use this to execute python commands.\n" - "- Input should be a valid python command.\n" - "- To return output, you should print it out with `print(...)`.\n" - "- Don't use f-strings when printing outputs.\n" - "- If you need to make web requests, use `httpx.AsyncClient`." + + # Field description with default value + description: str = Field( + default="A secure Python code sandbox with filesystem support." ) # Mirror the PyodideSandbox constructor arguments - stateful: bool = False - allow_env: list[str] | bool = False - allow_read: list[str] | bool = False - allow_write: list[str] | bool = False - allow_net: list[str] | bool = False - allow_run: list[str] | bool = False - allow_ffi: list[str] | bool = False - timeout_seconds: float | None - """Timeout for code execution in seconds. By default set to 60 seconds.""" - node_modules_dir: str = "auto" - - _sandbox: PyodideSandbox - _sync_sandbox: SyncPyodideSandbox + stateful: bool = Field(default=False) + allow_env: list[str] | bool = Field(default=False) + allow_read: list[str] | bool = Field(default=False) + allow_write: list[str] | bool = Field(default=False) + allow_net: list[str] | bool = Field(default=False) + allow_run: list[str] | bool = Field(default=False) + allow_ffi: list[str] | bool = Field(default=False) + timeout_seconds: float | None = Field( + default=60.0, + description="Timeout for code execution in seconds. " + "By default set to 60 seconds.", + ) + node_modules_dir: str = Field(default="auto") + + # Private attributes using PrivateAttr + _description_template: str = PrivateAttr( + default=( + "A secure Python code sandbox with filesystem support. 
" + "Use this to execute python commands.\n" + "- Input should be a valid python command.\n" + "- To return output, you should print it out with `print(...)`.\n" + "- Don't use f-strings when printing outputs.\n" + "- If you need to make web requests, use `httpx.AsyncClient`.\n" + "- Files can be read/written using standard Python file operations.\n" + "{available_files}" + ) + ) + _sandbox: PyodideSandbox | None = PrivateAttr(default=None) + _sync_sandbox: SyncPyodideSandbox | None = PrivateAttr(default=None) + _custom_description: bool = PrivateAttr(default=False) def __init__( self, @@ -537,6 +739,9 @@ def __init__( stateful: bool = False, timeout_seconds: float | None = 60, allow_net: list[str] | bool = False, + files: dict[str, str | bytes] | None = None, + directories: list[str] | None = None, + description: str | None = None, **kwargs: dict[str, Any], ) -> None: """Initialize the tool. @@ -553,14 +758,27 @@ def __init__( Depending on your use case, you can restrict the network access to only the URLs you need (e.g., required to set up micropip / pyodide). Please refer to pyodide documentation for more details. + files: Dictionary of files to attach to the sandbox filesystem. + Keys are file paths, values are file contents (str or bytes). + directories: List of directory paths to create in the sandbox filesystem. + description: Custom description template for the tool. **kwargs: Other attributes will be passed to the PyodideSandbox """ - super().__init__( - stateful=stateful, - timeout_seconds=timeout_seconds, - allow_net=allow_net, - **kwargs, - ) + # Prepare arguments for super().__init__ + init_kwargs = { + "stateful": stateful, + "timeout_seconds": timeout_seconds, + "allow_net": allow_net, + "allow_env": kwargs.get("allow_env", False), + "allow_read": kwargs.get("allow_read", False), + "allow_write": kwargs.get("allow_write", False), + "allow_run": kwargs.get("allow_run", False), + "allow_ffi": kwargs.get("allow_ffi", False), + "node_modules_dir": kwargs.get("node_modules_dir", "auto"), + } + + # Call super().__init__() first + super().__init__(**init_kwargs) if self.stateful: try: @@ -588,7 +806,15 @@ class PyodideSandboxToolInput(BaseModel): code: str = Field(description="Code to execute.") - self.args_schema: type[BaseModel] = PyodideSandboxToolInput + self.args_schema = PyodideSandboxToolInput + + # Set up custom description if provided + if description is not None: + self._custom_description = True + self._description_template = description + self.description = description + + # Create sandbox instances after initialization self._sandbox = PyodideSandbox( stateful=self.stateful, allow_env=self.allow_env, @@ -598,7 +824,10 @@ class PyodideSandboxToolInput(BaseModel): allow_run=self.allow_run, allow_ffi=self.allow_ffi, node_modules_dir=self.node_modules_dir, + files=files, + directories=directories, ) + # Initialize sync sandbox with deno check skipped since async sandbox already # checked self._sync_sandbox = SyncPyodideSandbox( @@ -611,8 +840,41 @@ class PyodideSandboxToolInput(BaseModel): allow_ffi=self.allow_ffi, node_modules_dir=self.node_modules_dir, skip_deno_check=True, # Skip deno check since async sandbox already checked + files=files, + directories=directories, ) + if not self._custom_description or ( + "{available_files}" in self._description_template + ): + self.description = self._build_description() + + def _build_description(self) -> str: + """Build the complete description string with attached files information. 
+ + Returns: + Complete description string including file information + """ + if ( + self._custom_description + and "{available_files}" not in self._description_template + ): + return self._description_template + + # Use the property from the base class to get attached files + files = self._sandbox._attached_files + if files: + available_files = ( + "\n\nATTACHED FILES AVAILABLE:\n" + + "\n".join(f" • {p}" for p in files) + + "\nThese files are already loaded and ready to use " + "with pandas, open(), etc." + ) + else: + available_files = "" + + return self._description_template.format(available_files=available_files) + def _run( self, code: str, @@ -621,7 +883,22 @@ def _run( config: RunnableConfig | None = None, run_manager: CallbackManagerForToolRun | None = None, ) -> Any: # noqa: ANN401 - """Use the tool synchronously.""" + """Use the tool synchronously. + + Args: + code: The code to execute in the sandbox + state: State object containing session information + (required for stateful mode) + tool_call_id: ID of the tool call for message creation + config: Configuration for the tool execution + run_manager: Callback manager for the tool run + + Returns: + Tool execution result or LangGraph Command in stateful mode + + Raises: + ValueError: If required state keys are missing in stateful mode + """ if self.stateful: required_keys = {"session_bytes", "session_metadata", "messages"} actual_keys = set(state) if isinstance(state, dict) else set(state.__dict__) @@ -683,7 +960,21 @@ async def _arun( config: RunnableConfig | None = None, run_manager: AsyncCallbackManagerForToolRun | None = None, ) -> Any: # noqa: ANN401 - """Use the tool synchronously.""" + """Use the tool asynchronously. + + Args: + code: The code to execute in the sandbox + state: State object containing session information (required for stateful mode) + tool_call_id: ID of the tool call for message creation + config: Configuration for the tool execution + run_manager: Callback manager for the tool run + + Returns: + Tool execution result or LangGraph Command in stateful mode + + Raises: + ValueError: If required state keys are missing in stateful mode + """ if self.stateful: required_keys = {"session_bytes", "session_metadata", "messages"} actual_keys = set(state) if isinstance(state, dict) else set(state.__dict__) diff --git a/libs/sandbox-py/tests/unit_tests/test_pyodide_sandbox.py b/libs/sandbox-py/tests/unit_tests/test_pyodide_sandbox.py index 89e6635..fe06949 100644 --- a/libs/sandbox-py/tests/unit_tests/test_pyodide_sandbox.py +++ b/libs/sandbox-py/tests/unit_tests/test_pyodide_sandbox.py @@ -20,6 +20,17 @@ def pyodide_package(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setattr("langchain_sandbox.pyodide.PKG_NAME", local_script) +@pytest.fixture +def mock_csv_data() -> str: + """Sample sales data for testing.""" + return """date,product_id,category,quantity,price,customer_id,region +2024-01-15,P001,Electronics,2,499.99,C123,North +2024-01-16,P002,Furniture,1,899.50,C124,South +2024-01-16,P003,Clothing,5,59.99,C125,East +2024-01-17,P001,Electronics,1,499.99,C126,West +2024-01-18,P004,Electronics,3,299.99,C127,North""" + + def get_default_sandbox(stateful: bool = False) -> PyodideSandbox: """Get default PyodideSandbox instance for testing.""" return PyodideSandbox( @@ -49,7 +60,7 @@ def get_default_sync_sandbox(stateful: bool = False) -> SyncPyodideSandbox: async def test_stdout_sessionless(pyodide_package: None) -> None: """Test without a session ID.""" sandbox = get_default_sandbox() - # Execute a simple piece 
+    # Execute a simple piece of code asynchronously
     result = await sandbox.execute("x = 5; print(x); x")
     assert result.status == "success"
     assert result.stdout == "5"
@@ -78,7 +89,7 @@ async def test_session_state_persistence_basic(pyodide_package: None) -> None:
     assert result1.result is None
     assert result2.status == "success", f"Encountered error: {result2.stderr}"
     assert result2.stdout == "10"
-    assert result1.result is None
+    assert result2.result is None
 
 
 async def test_pyodide_sandbox_error_handling(pyodide_package: None) -> None:
@@ -136,7 +147,7 @@ def test_sync_session_state_persistence_basic(pyodide_package: None) -> None:
     assert result1.result is None
     assert result2.status == "success", f"Encountered error: {result2.stderr}"
     assert result2.stdout == "10"
-    assert result1.result is None
+    assert result2.result is None
 
 
 def test_sync_pyodide_sandbox_error_handling(pyodide_package: None) -> None:
@@ -165,35 +176,316 @@ def test_sync_pyodide_sandbox_timeout(pyodide_package: None) -> None:
     assert "timed out" in result.stderr.lower()
 
 
-def test_pyodide_sandbox_tool() -> None:
+def test_pyodide_sandbox_tool(pyodide_package: None) -> None:
     """Test synchronous invocation of PyodideSandboxTool."""
-    tool = PyodideSandboxTool(stateful=False, allow_net=True)
-    result = tool.invoke("x = 5; print(x)")
+    # allow_read=True is required for Deno to access Pyodide WASM files
+    tool = PyodideSandboxTool(
+        stateful=False,
+        allow_net=True,
+        allow_read=True,
+    )
+    result = tool.invoke({"code": "x = 5; print(x)"})
     assert result == "5"
-    result = tool.invoke("x = 5; print(1); print(2)")
-    assert result == "12"
+    result = tool.invoke({"code": "x = 5; print(1); print(2)"})
+    assert result == "1\n2"
 
 
 def test_pyodide_timeout() -> None:
     """Test synchronous invocation of PyodideSandboxTool with timeout."""
-    tool = PyodideSandboxTool(stateful=False, timeout_seconds=0.1, allow_net=True)
-    result = tool.invoke("while True: pass")
-    assert result == "Error during execution: Execution timed out after 0.1 seconds"
+    tool = PyodideSandboxTool(
+        stateful=False,
+        allow_net=True,
+        timeout_seconds=0.1,
+    )
+    result = tool.invoke({"code": "while True: pass"})
+    assert "timed out after 0.1 seconds" in result
 
 
-async def test_async_pyodide_sandbox_tool() -> None:
-    """Test synchronous invocation of PyodideSandboxTool."""
-    tool = PyodideSandboxTool(stateful=False, allow_net=True)
-    result = await tool.ainvoke("x = 5; print(x)")
+async def test_async_pyodide_sandbox_tool(pyodide_package: None) -> None:
+    """Test asynchronous invocation of PyodideSandboxTool."""
+    # allow_read=True is required for Deno to access Pyodide WASM files
+    tool = PyodideSandboxTool(
+        stateful=False,
+        allow_net=True,
+        allow_read=True,
+    )
+    result = await tool.ainvoke({"code": "x = 5; print(x)"})
     assert result == "5"
-    result = await tool.ainvoke("x = 5; print(1); print(2)")
-    # TODO: Need to preserve newlines in the output  # noqa: FIX002, TD002
-    # https://github.com/langchain-ai/langchain-sandbox/issues/26
-    assert result == "12"
+    result = await tool.ainvoke({"code": "x = 5; print(1); print(2)"})
+    assert result == "1\n2"
 
 
 async def test_async_pyodide_timeout() -> None:
-    """Test synchronous invocation of PyodideSandboxTool with timeout."""
+    """Test asynchronous invocation of PyodideSandboxTool with timeout."""
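+    # NOTE: 0.1s is presumably chosen so the infinite loop trips the timeout
+    # quickly; the assertion below matches a substring rather than the full
+    # message so the test stays robust if the error wording changes.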
+    tool = PyodideSandboxTool(
+        stateful=False,
+        allow_net=True,
+        timeout_seconds=0.1,
+    )
+    result = await tool.ainvoke({"code": "while True: pass"})
+    assert "timed out after 0.1 seconds" in result
+
+
+async def test_filesystem_basic_operations(pyodide_package: None) -> None:
+    """Test basic filesystem operations."""
+    # allow_read=True is required for Deno to access Pyodide WASM files
+    sandbox = PyodideSandbox(
+        allow_net=True,
+        allow_read=True,
+        files={"test.txt": "Hello, World!", "data.json": '{"key": "value"}'},
+        directories=["output"],
+    )
+
+    code = """
+import os
+import json
+
+# Read files
+with open("test.txt", "r") as f:
+    txt_content = f.read()
+
+with open("data.json", "r") as f:
+    json_data = json.load(f)
+
+# Create new file in pre-created directory
+with open("output/result.txt", "w") as f:
+    f.write("Processing complete!")
+
+# List files
+root_files = sorted(os.listdir("."))
+output_files = sorted(os.listdir("output"))
+
+print(f"Text: {txt_content}")
+print(f"JSON key: {json_data['key']}")
+print(f"Root files: {root_files}")
+print(f"Output files: {output_files}")
+
+# Read the created file to verify it was written
+with open("output/result.txt", "r") as f:
+    created_content = f.read()
+print(f"Created file content: {created_content}")
+"""
+
+    result = await sandbox.execute(code)
+    assert result.status == "success", f"Execution failed: {result.stderr}"
+    assert "Hello, World!" in result.stdout
+    assert "value" in result.stdout
+    assert "Processing complete!" in result.stdout
+
+
+def test_filesystem_tool_usage(pyodide_package: None) -> None:
+    """Test filesystem with PyodideSandboxTool."""
+    # Attach CSV data using files parameter in constructor
+    csv_data = "name,age\nAlice,30\nBob,25"
+    tool = PyodideSandboxTool(
+        allow_net=True, allow_read=True, files={"users.csv": csv_data}
+    )
+
+    code = """
+import csv
+
+users = []
+with open("users.csv", "r") as f:
+    reader = csv.DictReader(f)
+    for row in reader:
+        users.append(row)
+
+for user in users:
+    print(f"{user['name']} is {user['age']} years old")
+"""
+
+    result = tool.invoke({"code": code})
+    assert "Alice is 30 years old" in result
+    assert "Bob is 25 years old" in result
+
+
+async def test_binary_file_operations(pyodide_package: None) -> None:
+    """Test binary file operations."""
+    # Create some binary data
+    binary_data = b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01"
+
+    # allow_read=True is required for Deno to access Pyodide WASM files
+    sandbox = PyodideSandbox(
+        allow_net=True, allow_read=True, files={"image.png": binary_data}
+    )
+
+    code = """
+import base64
+
+# Read binary file
+with open("image.png", "rb") as f:
+    data = f.read()
+
+# Check if it's the PNG header
+is_png = data.startswith(b'\\x89PNG')
+size = len(data)
+
+print(f"Is PNG: {is_png}")
+print(f"Size: {size} bytes")
+"""
+
+    result = await sandbox.execute(code)
+    assert result.status == "success", f"Execution failed: {result.stderr}"
+    assert "Is PNG: True" in result.stdout
+    # Verify the size matches the binary data size
+    assert f"Size: {len(binary_data)} bytes" in result.stdout
+
+
+async def test_large_file_attachment(pyodide_package: None) -> None:
+    """Test attaching a large file to the sandbox."""
+    # Generate a test file with a simple pattern
+    size_mb = 5  # 5MB is sufficient to test streaming
+    size_bytes = size_mb * 1024 * 1024
+
+    # Generate test content
+    large_data = bytes([i % 256 for i in range(size_bytes)])
+
+    # allow_read=True is required for Deno to access Pyodide WASM files
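+    # NOTE: 5 MB is assumed to be large enough to exercise the chunked
+    # streaming attachment path (see the binary streaming protocol in
+    # main.ts) while keeping the test fast.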
+ allow_net=True, allow_read=True, files={"large_file.bin": large_data} + ) + + # Verify that the file was attached correctly + code = """ +import os + +file_path = "large_file.bin" +exists = os.path.exists(file_path) +size = os.path.getsize(file_path) if exists else 0 + +print(f"File exists: {exists}") +print(f"File size: {size} bytes") +print("Verification completed successfully!") +""" + + # Execute the code that verifies the file + result = await sandbox.execute(code) + + assert result.status == "success", f"Failed to verify file: {result.stderr}" + assert "File exists: True" in result.stdout + assert f"File size: {size_bytes} bytes" in result.stdout + assert "Verification completed successfully!" in result.stdout + + +def test_description_custom_without_files(pyodide_package: None) -> None: + """Test custom description without files.""" + custom_description = "Use Python to analyze data. No fancy stuff." + + tool = PyodideSandboxTool(allow_net=True, description=custom_description) + + # Verify the custom description is used and doesn't have file info + assert tool.description == custom_description + assert "ATTACHED FILES AVAILABLE" not in tool.description + + +def test_description_custom_with_files(pyodide_package: None) -> None: + """Test custom description with files.""" + custom_description = "Custom Python sandbox with {available_files}" + + # Create tool with files in constructor + tool = PyodideSandboxTool( + allow_net=True, + description=custom_description, + files={"data.csv": "a,b\n1,2", "config.json": '{"setting": true}'}, + ) + + # Verify description contains both custom text and file info + assert "Custom Python sandbox with" in tool.description + assert "ATTACHED FILES AVAILABLE" in tool.description + assert "data.csv" in tool.description + assert "config.json" in tool.description + + +def test_description_default(pyodide_package: None) -> None: + """Test default description behavior.""" + tool = PyodideSandboxTool(allow_net=True) + + # Check default description + assert "A secure Python code sandbox with filesystem support" in tool.description + assert "ATTACHED FILES AVAILABLE" not in tool.description + + # Create a new tool with files to test description update + tool_with_files = PyodideSandboxTool( + allow_net=True, files={"test.txt": "Hello world"} + ) + + # Verify description was updated with file info + assert ( + "A secure Python code sandbox with filesystem support" + in tool_with_files.description + ) + assert "ATTACHED FILES AVAILABLE" in tool_with_files.description + assert "test.txt" in tool_with_files.description + + +def test_directories_creation(pyodide_package: None) -> None: + """Test directory creation via constructor.""" + tool = PyodideSandboxTool( + allow_net=True, allow_read=True, directories=["data", "output", "logs/app"] + ) + + code = """ +import os + +# Check if directories exist +data_exists = os.path.exists("data") and os.path.isdir("data") +output_exists = os.path.exists("output") and os.path.isdir("output") +logs_exists = os.path.exists("logs") and os.path.isdir("logs") +logs_app_exists = os.path.exists("logs/app") and os.path.isdir("logs/app") + +# List root directory +root_items = sorted(os.listdir(".")) + +print(f"Data directory exists: {data_exists}") +print(f"Output directory exists: {output_exists}") +print(f"Logs directory exists: {logs_exists}") +print(f"Logs/app directory exists: {logs_app_exists}") +print(f"Root items: {root_items}") +""" + + result = tool.invoke({"code": code}) + assert "Data directory exists: True" in result + 
assert "Output directory exists: True" in result + assert "Logs directory exists: True" in result + assert "Logs/app directory exists: True" in result + + +def test_combined_files_and_directories(pyodide_package: None) -> None: + """Test using both files and directories together.""" + tool = PyodideSandboxTool( + allow_net=True, + allow_read=True, + files={"config.json": '{"app": "test"}', "data/input.txt": "Hello World"}, + directories=["output", "logs"], + ) + + code = """ +import os +import json + +# Read config file +with open("config.json", "r") as f: + config = json.load(f) + +# Read input file +with open("data/input.txt", "r") as f: + content = f.read() + +# Write to output directory +with open("output/result.txt", "w") as f: + f.write(f"App: {config['app']}, Content: {content}") + +# Check what was created +output_files = os.listdir("output") +root_items = sorted([item for item in os.listdir(".") if not item.startswith(".")]) + +print(f"Config app: {config['app']}") +print(f"Input content: {content}") +print(f"Output files: {output_files}") +print(f"Root items: {root_items}") +""" + + result = tool.invoke({"code": code}) + assert "Config app: test" in result + assert "Input content: Hello World" in result + assert "result.txt" in result diff --git a/libs/sandbox-py/uv.lock b/libs/sandbox-py/uv.lock index 6441963..dcdb6d8 100644 --- a/libs/sandbox-py/uv.lock +++ b/libs/sandbox-py/uv.lock @@ -439,7 +439,7 @@ wheels = [ [[package]] name = "langchain-sandbox" -version = "0.0.5" +version = "0.0.6" source = { editable = "." } dependencies = [ { name = "langchain-core" },