Minor prompting changes

ludomitch · ludomitch · commit 700232110d27 · 2025-05-10T23:24:50.000-07:00
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -38,6 +38,7 @@ repos:
       - id: codespell
         additional_dependencies: [".[toml]"]
         exclude_types: [jupyter]
+        exclude: ".*\\.csv$"
   - repo: https://github.com/pappasam/toml-sort
     rev: v0.24.2
     hooks:
diff --git a/src/fhda/notebook_env.py b/src/fhda/notebook_env.py
@@ -112,7 +112,7 @@ async def close(self):
 
 class NBEnvironment(Environment[NBEnvironmentState]):
     NOTEBOOK_NAME: ClassVar[str] = "notebook.ipynb"
-    EXEC_TIMEOUT: ClassVar[float | None] = 300.0
+    EXEC_TIMEOUT: ClassVar[float | None] = 600.0
 
     state: NBEnvironmentState
 
diff --git a/src/fhda/prompts.py b/src/fhda/prompts.py
@@ -25,22 +25,36 @@
 """
 
 # Guidelines for R code output optimization
-R_OUTPUT_RECOMMENDATION_PROMPT = """
-R-Specific Guidelines:
+R_SPECIFIC_GUIDELINES = """Guidelines for using the R programming language:
 1. Load packages using this format to minimize verbose output:
    ```r
    if (!requireNamespace("package_name", quietly = TRUE)) {{
      install.packages("package_name")
    }}
    suppressPackageStartupMessages(library(package_name))
    ```
+2. You must use the tidyverse wherever possible: dplyr, tidyr, ggplot2, readr, stringr, forcats, purrr, tibble, and lubridate.
 
-2. For data operations, suppress messages about column name repairs:
+3. All plots must be made using ggplot2. Here is an example of how to make a plot:
+   ```r
+   # Create a density scatter plot of FSC-A vs SSC-A
+   plot_data <- as.data.frame(dmso_data[, c("FSC-A", "SSC-A")])
+   scatter_plot <- ggplot2::ggplot(plot_data, ggplot2::aes(x = `FSC-A`, y = `SSC-A`)) +
+     ggplot2::geom_hex(bins = 100) +
+     ggplot2::scale_fill_viridis_c(trans = "log10") +
+     ggplot2::labs(
+       title = "FSC-A vs SSC-A Density Plot (DMSO Control)",
+       x = "FSC-A",
+       y = "SSC-A"
+     ) +
+     ggplot2::theme_minimal()
+   ```
+4. Use explicit namespace qualification for functions. For example, use dplyr::select() instead of select().
+
+5. For data operations, suppress messages about column name repairs:
    ```r
    variable_name <- read_excel("<fpath>.csv", col_names = FALSE, .name_repair = "minimal")
    ```
-
-3. Very important: always use the tidyverse package where possible.
 """
 
 
@@ -101,7 +115,7 @@
 
 1. Load Data and Perform Descriptive Statistics:
 <analysis_planning>
-- Identify which data files are most relevant to resolving the task. List these files.
+- Identify which data files are most relevant to resolving the task.
 - Plan how to load these files efficiently in {language}.
 - List the specific descriptive statistics you plan to use (e.g., summary(), str(), head()).
 - Consider potential issues like missing data or unexpected formats. How will you handle each?
@@ -197,7 +211,7 @@
 {CHAIN_OF_THOUGHT_AGNOSTIC}
 {SUBMIT_ANSWER_HYPOTHESIS}
 {GENERAL_NOTEBOOK_GUIDELINES}
-{R_OUTPUT_RECOMMENDATION_PROMPT}
+{R_SPECIFIC_GUIDELINES}
 """
 # MCQ
 MCQ_PROMPT_TEMPLATE = f"""
@@ -209,7 +223,7 @@
 {CHAIN_OF_THOUGHT_AGNOSTIC}
 {SUBMIT_ANSWER_MCQ}
 {GENERAL_NOTEBOOK_GUIDELINES}
-{R_OUTPUT_RECOMMENDATION_PROMPT}
+{R_SPECIFIC_GUIDELINES}
 """
 # Open answer
 OPEN_PROMPT_TEMPLATE = f"""
@@ -222,5 +236,5 @@
 {CHAIN_OF_THOUGHT_AGNOSTIC}
 {SUBMIT_ANSWER_OPEN}
 {GENERAL_NOTEBOOK_GUIDELINES}
-{R_OUTPUT_RECOMMENDATION_PROMPT}
+{R_SPECIFIC_GUIDELINES}
 """
diff --git a/tutorial/datasets/brain_size_data.csv b/tutorial/datasets/brain_size_data.csv
diff --git a/tutorial/example.ipynb b/tutorial/example.ipynb
@@ -64,7 +64,7 @@
     "    {prompts.GENERAL_NOTEBOOK_GUIDELINES.format(language=language.name)}\"\"\"\n",
     "\n",
     "    if language == NBLanguage.R:\n",
-    "        augmented_task += f\"\\n{prompts.R_OUTPUT_RECOMMENDATION_PROMPT}\"\n",
+    "        augmented_task += f\"\\n{prompts.R_SPECIFIC_GUIDELINES}\"\n",
     "\n",
     "    dae = DataAnalysisEnv(\n",
     "        problem_id=f\"data-analysis-task-{task_hash}\",\n",
diff --git a/tutorial/platform_api.ipynb b/tutorial/platform_api.ipynb
@@ -55,7 +55,9 @@
    "source": [
     "# Load your dataset – note you only have to do this once\n",
     "# File path can be an absolute path or a relative path to either a directory or a file containing the dataset\n",
-    "client.upload_file(JOB_NAME, file_path=\"dataset\", upload_id=UPLOAD_ID)"
+    "client.upload_file(\n",
+    "    JOB_NAME, file_path=\"datasets/brain_size_data.csv\", upload_id=UPLOAD_ID\n",
+    ")"
    ]
   },
   {
@@ -92,7 +94,7 @@
     "\n",
     "# This is extra R prompting to avoid long R output blocks – also feel free to discard this\n",
     "if LANGUAGE == \"R\":\n",
-    "    task += f\"\\n{prompts.R_OUTPUT_RECOMMENDATION_PROMPT}\""
+    "    task += f\"\\n{prompts.R_SPECIFIC_GUIDELINES}\""
    ]
   },
   {

Original file line number	Diff line number	Diff line change
`@@ -55,7 +55,9 @@`
`55`	`55`	`"source": [`
`56`	`56`	`"# Load your dataset – note you only have to do this once\n",`
`57`	`57`	`"# File path can be an absolute path or a relative path to either a directory or a file containing the dataset\n",`
`58`		`- "client.upload_file(JOB_NAME, file_path=\"dataset\", upload_id=UPLOAD_ID)"`
	`58`	`+ "client.upload_file(\n",`
	`59`	`+ " JOB_NAME, file_path=\"datasets/brain_size_data.csv\", upload_id=UPLOAD_ID\n",`
	`60`	`+ ")"`
`59`	`61`	`]`
`60`	`62`	`},`
`61`	`63`	`{`
`@@ -92,7 +94,7 @@`
`92`	`94`	`"\n",`
`93`	`95`	`"# This is extra R prompting to avoid long R output blocks – also feel free to discard this\n",`
`94`	`96`	`"if LANGUAGE == \"R\":\n",`
`95`		`- " task += f\"\\n{prompts.R_OUTPUT_RECOMMENDATION_PROMPT}\""`
	`97`	`+ " task += f\"\\n{prompts.R_SPECIFIC_GUIDELINES}\""`
`96`	`98`	`]`
`97`	`99`	`},`
`98`	`100`	`{`