
Commit 095df6e

Tutorial notebooks (#23)
1 parent 1d85d8f commit 095df6e

File tree: 12 files changed, +1244 −1019 lines

pyproject.toml

Lines changed: 4 additions & 9 deletions
@@ -8,21 +8,16 @@ authors = [
 ]
 dependencies = [
     "aiodocker==0.24.0",
-    "fhaviary[server]==0.18.1",
-    "fh-llm-client==0.0.11",
-    "ldp==0.23.0",
+    "fhaviary[server]==0.19.0",
+    "ldp==0.26.0",
     "pandas==2.2.3",
     "numpy==2.2.3",
     "matplotlib==3.10.0",
-    "scipy==1.15.2",
-    "seaborn==0.13.2",
-    "scikit-learn==1.6.1",
-    "statsmodels==0.14.4",
     "aiofiles==24.1.0",
     "google-auth==2.38.0",
     "google-cloud-storage==3.0.0",
     "google-cloud-secret-manager==2.23.0",
-    "crow-client==0.3.6",
+    "futurehouse-client==0.3.18",
     "jupyter==1.1.1",
     "nbconvert==7.16.6",
     "notebook==7.3.2",
@@ -52,4 +47,4 @@ run_expt = 'scripts.configurable:_run_expt'
 package-dir = {"" = "src"}
 
 [tool.setuptools.packages.find]
-where = ["src"]
+where = ["src"]
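To confirm that a reinstalled environment actually picked up the bumped pins, a quick check with Python's standard importlib.metadata could look like this (a minimal sketch, not part of this commit):

```python
from importlib.metadata import PackageNotFoundError, version

# Key distribution pins updated in this commit's pyproject.toml.
EXPECTED = {
    "fhaviary": "0.19.0",
    "ldp": "0.26.0",
    "futurehouse-client": "0.3.18",
}

for dist, want in EXPECTED.items():
    try:
        have = version(dist)
    except PackageNotFoundError:
        print(f"{dist}: not installed (want {want})")
    else:
        print(f"{dist}: {have}", "(ok)" if have == want else f"(want {want})")
```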

src/fhda/Dockerfile.pinned

Lines changed: 9 additions & 4 deletions
@@ -75,14 +75,19 @@ RUN mamba install -c conda-forge -c bioconda -y \
     bioconductor-apeglm=1.24.0
 
 
-COPY requirements.txt .
-RUN mamba install -c conda-forge --file requirements.txt -y
+COPY kernel_requirements.txt .
+
+# Install conda packages first
+RUN mamba install -c conda-forge --file kernel_requirements.txt -y
+
+# Install pip packages
+RUN pip install aiodocker ldp==0.26.0 fhaviary[server]==0.19.0 futurehouse-client==0.3.14
 
 # Certain tools are not easily installable via conda. A common practice for
 # bioinformaticians is to use udocker to run certain heavy duty omics processing
 # tools in an isolated environment
-RUN udocker --allow-root install && \
-    udocker --allow-root pull ezlabgva/busco:v5.8.0_cv1
+# RUN udocker --allow-root install && \
+#     udocker --allow-root pull ezlabgva/busco:v5.8.0_cv1
 
 WORKDIR /workspace
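The pip layer above can be spot-checked once the image exists. A minimal sketch, assuming the image was built as e.g. `docker build -f src/fhda/Dockerfile.pinned -t fhda-kernel .` (the tag is hypothetical):

```python
import subprocess

IMAGE = "fhda-kernel"  # hypothetical tag; see the build command above

# Ask pip inside the container which versions the pip layer installed.
result = subprocess.run(
    ["docker", "run", "--rm", IMAGE, "pip", "show",
     "ldp", "fhaviary", "futurehouse-client"],
    capture_output=True, text=True, check=True,
)
print(result.stdout)
```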

src/fhda/kernel_requirements.txt

Lines changed: 3 additions & 15 deletions
@@ -1,18 +1,16 @@
-aiodocker
 anndata==0.11.1
 biopython==1.84
-datasets
 ete3==3.1.3
+fcsparser==0.2.8
+cython==3.0.12
 gseapy==1.1.4
-fhaviary[server] >= 0.18.0
 keras==3.7.0
-ldp
 jupyter==1.0.0
 matplotlib==3.10.0
 matplotlib-venn==1.1.1
 mygene==3.2.2
 nbconvert==7.16.4
-numpy==2.0.2
+numpy==1.26.4 # Pinned lower for fcsparser <2
 optuna==4.1.0
 openpyxl==3.1.5
 pandas==2.2.3
@@ -24,13 +22,3 @@ seaborn==0.13.2
 scikit-learn==1.6.0
 statsmodels==0.14.4
 umap-learn==0.5.7
-aiofiles
-google-auth
-google-cloud-storage
-google-cloud-secret-manager
-google-crc32c
-httpx
-pydantic
-requests
-tqdm
-crow-client
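fcsparser is new here, and the inline comment explains the numpy downgrade: numpy is pinned lower because fcsparser <2 does not support numpy 2.x. A minimal sketch of reading a flow-cytometry file with the pinned fcsparser (the file path is illustrative, not from this repo):

```python
import fcsparser  # fcsparser==0.2.8, as pinned above

# Parse an FCS file into a metadata dict and a pandas DataFrame of events.
# "dmso_control.fcs" is an illustrative path; substitute a real .fcs file.
meta, data = fcsparser.parse("dmso_control.fcs", reformat_meta=True)

print(meta["$TOT"])                         # total number of recorded events
print(data[["FSC-A", "SSC-A"]].describe())  # summary of the scatter channels
```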

src/fhda/prompts.py

Lines changed: 71 additions & 35 deletions
@@ -18,30 +18,42 @@
 """
 
 CAPSULE_SYSTEM_PROMPT_QUERY = """
-You are an expert data scientist.
-Your task is to create a comprehensive Jupyter notebook named 'notebook.ipynb' that thoroughly analyzes data to answer a user query
-The notebook should contain all necessary artifacts (plots, tables, print outputs, code commentary) to fully answer the query.
+You are an expert bioinformatician and seasoned biological data scientist.
+Your task is to create a comprehensive Jupyter notebook named 'notebook.ipynb' that analyzes data to answer a user query.
+The notebook should contain all necessary artifacts (plots, tables, print outputs) to fully answer these questions.
+Take your time to think through the question and the data before writing any code; explore the data rigorously and defend your conclusions.
 """
 
 # Guidelines for R code output optimization
-R_OUTPUT_RECOMMENDATION_PROMPT = """
-R-Specific Guidelines:
+R_SPECIFIC_GUIDELINES = """Guidelines for using the R programming language:
 1. Load packages using this format to minimize verbose output:
 ```r
 if (!requireNamespace("package_name", quietly = TRUE)) {{
     install.packages("package_name")
 }}
 suppressPackageStartupMessages(library(package_name))
 ```
+2. You must use the tidyverse wherever possible: dplyr, tidyr, ggplot2, readr, stringr, forcats, purrr, tibble, and lubridate.
 
-2. For data operations, suppress messages about column name repairs:
-```r
-variable_name <- read_excel("<fpath>.csv", col_names = FALSE, .name_repair = "minimal")
-```
+3. All plots must be made using ggplot2. Here is an example of how to make a plot:
+
+# Create a density scatter plot of FSC-A vs SSC-A
+plot_data <- as.data.frame(dmso_data[, c("FSC-A", "SSC-A")])
+scatter_plot <- ggplot2::ggplot(plot_data, ggplot2::aes(x = `FSC-A`, y = `SSC-A`)) +
+    ggplot2::geom_hex(bins = 100) +
+    ggplot2::scale_fill_viridis_c(trans = "log10") +
+    ggplot2::labs(
+        title = "FSC-A vs SSC-A Density Plot (DMSO Control)",
+        x = "FSC-A",
+        y = "SSC-A"
+    ) +
+    ggplot2::theme_minimal()
+
+4. Use explicit namespace qualification for functions. For example, use dplyr::select() instead of select().
 
-3. When printing dataframes, always wrap them in print() statements:
+5. For data operations, suppress messages about column name repairs:
 ```r
-print(head(dataframe))
+variable_name <- read_excel("<fpath>.csv", col_names = FALSE, .name_repair = "minimal")
 ```
 """
 
@@ -54,13 +66,13 @@
 - Check dataframe shapes before printing. Use head() for large dataframes.
 - Ensure each cell executes successfully before moving to the next.
 - Assume you already have the packages you need installed and only install new ones if you receive errors.
-- If you need to install packages, use mamba or conda.
-IMPORTANT: R vs Python vs bash
-- You can use either Python, R or bash cells to complete the analysis.
-- All cells are by default Python cells. However, you can use both bash and R cells by adding %%bash or %%R to the first line of the cell.
-- The first cell has already been loaded with %load_ext rpy2.ipython so you can use %%R cells from the second cell onwards
+- If you need to install packages, use pip or mamba.
+- All cells are by default {language} cells. Use {language} or bash tools for all analysis.
+- You can use bash cells by adding %%bash to the first line of the cell or running a subprocess.
+- You can only create code cells, no markdown cells.
 """
 
+
 AVOID_IMAGES = """
 AVOID USING PLOTS/IMAGES. USE TABLES AND PRINT OUTPUTS INSTEAD AS MUCH AS POSSIBLE.
 """
@@ -101,19 +113,10 @@
 CHAIN_OF_THOUGHT_AGNOSTIC = """
 Follow these steps to create your notebook, using chain-of-thought reasoning at each stage:
 
-1. List Directory Contents:
-<analysis_planning>
-- Consider how to use the list_workdir tool to recursively list the directory contents.
-- Think about how to organize and present this information clearly in the notebook.
-- List potential challenges in interpreting the directory structure.
-- Consider how the directory structure might inform your approach to the analysis.
-</analysis_planning>
-Place the output of the list_workdir tool inside <directory_contents> tags.
-
-2. Load Data and Perform Descriptive Statistics:
+1. Load Data and Perform Descriptive Statistics:
 <analysis_planning>
-- Identify which data files are most relevant to resolving the task. List these files.
-- Plan how to load these files efficiently in R or Python.
+- Identify which data files are most relevant to resolving the task.
+- Plan how to load these files efficiently in {language}.
 - List the specific descriptive statistics you plan to use (e.g., summary(), str(), head()).
 - Consider potential issues like missing data or unexpected formats. How will you handle each?
 - Plan how to present this information clearly in the notebook.
@@ -122,7 +125,7 @@
 </analysis_planning>
 Execute your plan to load data and perform descriptive statistics.
 
-3. Develop Analysis Plan:
+2. Develop Analysis Plan:
 <analysis_planning>
 - Break down each task into testable components. List these components.
 - For each component, list appropriate statistical tests or visualizations.
@@ -135,9 +138,9 @@
 </analysis_planning>
 Write out your analysis plan as comments in the notebook.
 
-4. Execute Analysis Plan:
+3. Execute Analysis Plan:
 <analysis_planning>
-- For each step in your analysis plan, list the R, Python or bash functions and libraries you'll use.
+- For each step in your analysis plan, list the {language} or bash functions and libraries you'll use.
 - Think about how to structure your code for readability and efficiency.
 - Plan how to document your code with clear comments.
 - Consider how to present results clearly, using tables or visualizations where appropriate.
@@ -147,7 +150,7 @@
 </analysis_planning>
 Execute your analysis plan, creating new cells as needed.
 
-5. Conclude and Submit Answer:
+4. Conclude and Submit Answer:
 <thought_process>
 - Reflect on how your results relate to the original task.
 - Consider any limitations or uncertainties in your analysis.
@@ -163,6 +166,14 @@
 [Use the submit_answer tool to submit your final answer as a single string either "True" or "False"]
 Remember, the final notebook should contain all necessary artifacts (plots, tables, print outputs) to solve the task provided.
 """
+SUBMIT_ANSWER_SINGLE = """
+[Use the submit_answer tool to submit your final answer as a single string]
+Example output:
+```
+submit_answer("CD94") or submit_answer("-1.23")
+```
+Remember, the final notebook should contain all necessary artifacts (plots, tables, print outputs) to solve the task provided.
+"""
 SUBMIT_ANSWER_OPEN = """
 [Use the submit_answer tool to submit your final answer as a JSON dictionary with keys as the question number and values as a short answer]
 Example output:
@@ -200,7 +211,7 @@
 {CHAIN_OF_THOUGHT_AGNOSTIC}
 {SUBMIT_ANSWER_HYPOTHESIS}
 {GENERAL_NOTEBOOK_GUIDELINES}
-{R_OUTPUT_RECOMMENDATION_PROMPT}
+{R_SPECIFIC_GUIDELINES}
 """
 # MCQ
 MCQ_PROMPT_TEMPLATE = f"""
@@ -212,7 +223,7 @@
 {CHAIN_OF_THOUGHT_AGNOSTIC}
 {SUBMIT_ANSWER_MCQ}
 {GENERAL_NOTEBOOK_GUIDELINES}
-{R_OUTPUT_RECOMMENDATION_PROMPT}
+{R_SPECIFIC_GUIDELINES}
 """
 # Open answer
 OPEN_PROMPT_TEMPLATE = f"""
@@ -225,5 +236,30 @@
 {CHAIN_OF_THOUGHT_AGNOSTIC}
 {SUBMIT_ANSWER_OPEN}
 {GENERAL_NOTEBOOK_GUIDELINES}
-{R_OUTPUT_RECOMMENDATION_PROMPT}
+{R_SPECIFIC_GUIDELINES}
+"""
+
+CONTINUATION_PROMPT_TEMPLATE = f"""
+{GENERAL_NOTEBOOK_GUIDELINES}
+
+You have been provided with a notebook previously generated by an agent based on a user's research question.
+
+This was the user's research question:
+<previous_research_question>
+{{previous_research_question}}
+</previous_research_question>
+
+This was the final answer generated by the previous agent:
+<previous_final_answer>
+{{previous_final_answer}}
+</previous_final_answer>
+
+The user has now tasked you with addressing a new query:
+<query>
+{{query}}
+</query>
+
+Please make any edits required to the notebook and the answer to address the new query. Be extremely diligent and ensure that the notebook is fully updated.
+Note you may have to re-run all cells one by one if the user query involved updating one of the intermediate cells and subsequent cells depend on it.
+Once you have updated the notebook and addressed the user's query, use the submit_answer tool to submit your final answer.
 """
