1+ """
2+ STORM Wiki pipeline powered by local model hosted by Ollama server and You.com or Bing search engine.
3+ You need to set up the following environment variables to run this script:
4+ - YDC_API_KEY: You.com API key; or, BING_SEARCH_API_KEY: Bing Search API key
5+ You also need to have a Ollama server running with the llama3 model or other. Specify `--url`, `--port` and `--model` accordingly.
6+
7+ Output will be structured as below
8+ args.output_dir/
9+ topic_name/ # topic_name will follow convention of underscore-connected topic name w/o space and slash
10+ conversation_log.json # Log of information-seeking conversation
11+ raw_search_results.json # Raw search results from search engine
12+ direct_gen_outline.txt # Outline directly generated with LLM's parametric knowledge
13+ storm_gen_outline.txt # Outline refined with collected information
14+ url_to_info.json # Sources that are used in the final article
15+ storm_gen_article.txt # Final article generated
16+ storm_gen_article_polished.txt # Polished final article (if args.do_polish_article is True)
17+ """
import os
import sys
from argparse import ArgumentParser

from dspy import Example

sys.path.append('./src')
from lm import OllamaClient
from rm import YouRM, BingSearch
from storm_wiki.engine import STORMWikiRunnerArguments, STORMWikiRunner, STORMWikiLMConfigs
from utils import load_api_key


def main(args):
    load_api_key(toml_file_path='secrets.toml')
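    # load_api_key reads secrets.toml and exposes its entries as environment variables.
    # A minimal secrets.toml might look like this (key names per the module docstring;
    # only the key matching your --retriever choice is needed):
    #   YDC_API_KEY = "your-you.com-api-key"
    #   BING_SEARCH_API_KEY = "your-bing-search-api-key"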
    lm_configs = STORMWikiLMConfigs()

    ollama_kwargs = {
        "model": args.model,
        "port": args.port,
        "url": args.url,
        "stop": ('\n\n---',)  # dspy uses "\n\n---" to separate examples. Open models sometimes generate this.
    }

    # Each pipeline stage gets its own client; max_tokens is sized to that stage's expected
    # output (article polishing rewrites the whole article, so it gets the largest budget).
    conv_simulator_lm = OllamaClient(max_tokens=500, **ollama_kwargs)
    question_asker_lm = OllamaClient(max_tokens=500, **ollama_kwargs)
    outline_gen_lm = OllamaClient(max_tokens=400, **ollama_kwargs)
    article_gen_lm = OllamaClient(max_tokens=700, **ollama_kwargs)
    article_polish_lm = OllamaClient(max_tokens=4000, **ollama_kwargs)

    lm_configs.set_conv_simulator_lm(conv_simulator_lm)
    lm_configs.set_question_asker_lm(question_asker_lm)
    lm_configs.set_outline_gen_lm(outline_gen_lm)
    lm_configs.set_article_gen_lm(article_gen_lm)
    lm_configs.set_article_polish_lm(article_polish_lm)

    engine_args = STORMWikiRunnerArguments(
        output_dir=args.output_dir,
        max_conv_turn=args.max_conv_turn,
        max_perspective=args.max_perspective,
        search_top_k=args.search_top_k,
        max_thread_num=args.max_thread_num,
    )

    # STORM is a knowledge curation system which consumes information from the retrieval module.
    # Currently, the information source is the Internet, and we use a search engine API as the retrieval module.
    if args.retriever == 'bing':
        rm = BingSearch(bing_search_api=os.getenv('BING_SEARCH_API_KEY'), k=engine_args.search_top_k)
    elif args.retriever == 'you':
        rm = YouRM(ydc_api_key=os.getenv('YDC_API_KEY'), k=engine_args.search_top_k)
    else:
        raise ValueError('Please pass --retriever with a valid choice (bing or you).')

    runner = STORMWikiRunner(engine_args, lm_configs, rm)

    # Open LMs are generally weaker at following the output format.
    # One way to mitigate this is to add a one-shot example to the prompt to exemplify the desired output format.
    # For example, we can add the following examples to the two prompts used in StormPersonaGenerator.
    # Note that each example should be an object of dspy.Example with fields matching the InputField
    # and OutputField in the prompt (i.e., dspy.Signature).
    find_related_topic_example = Example(
        topic="Knowledge Curation",
        related_topics="https://en.wikipedia.org/wiki/Knowledge_management\n"
                       "https://en.wikipedia.org/wiki/Information_science\n"
                       "https://en.wikipedia.org/wiki/Library_science\n"
    )
    gen_persona_example = Example(
        topic="Knowledge Curation",
        examples="Title: Knowledge management\n"
                 "Table of Contents: History\nResearch\nDimensions\nStrategies\nMotivations\nKM technologies"
                 "\nKnowledge barriers\nKnowledge retention\nKnowledge audit\nKnowledge protection\n"
                 " Knowledge protection methods\n  Formal methods\n  Informal methods\n"
                 " Balancing knowledge protection and knowledge sharing\n Knowledge protection risks",
        personas="1. Historian of Knowledge Systems: This editor will focus on the history and evolution of knowledge curation. They will provide context on how knowledge curation has changed over time and its impact on modern practices.\n"
                 "2. Information Science Professional: With insights from 'Information science', this editor will explore the foundational theories, definitions, and philosophy that underpin knowledge curation.\n"
                 "3. Digital Librarian: This editor will delve into the specifics of how digital libraries operate, including software, metadata, and digital preservation.\n"
                 "4. Technical Expert: This editor will focus on the technical aspects of knowledge curation, such as common features of content management systems.\n"
                 "5. Museum Curator: The museum curator will contribute expertise on the curation of physical items and the transition of these practices into the digital realm."
    )
    runner.storm_knowledge_curation_module.persona_generator.create_writer_with_persona.find_related_topic.demos = [
        find_related_topic_example]
    runner.storm_knowledge_curation_module.persona_generator.create_writer_with_persona.gen_persona.demos = [
        gen_persona_example]

    # A trade-off of adding a one-shot example is that it increases the input length of the prompt. Also, some
    # examples may be very long (e.g., an example for writing a section based on the given information), which may
    # confuse the model. For these cases, you can create a pseudo-example that is short and easy to understand to
    # steer the model's output format.
    # For example, we can add the following pseudo-examples to the prompts used in WritePageOutlineFromConv and
    # ConvToSection.
    write_page_outline_example = Example(
        topic="Example Topic",
        conv="Wikipedia Writer: ...\nExpert: ...\nWikipedia Writer: ...\nExpert: ...",
        old_outline="# Section 1\n## Subsection 1\n## Subsection 2\n"
                    "# Section 2\n## Subsection 1\n## Subsection 2\n"
                    "# Section 3",
        outline="# New Section 1\n## New Subsection 1\n## New Subsection 2\n"
                "# New Section 2\n"
                "# New Section 3\n## New Subsection 1\n## New Subsection 2\n## New Subsection 3"
    )
    runner.storm_outline_generation_module.write_outline.write_page_outline.demos = [write_page_outline_example]
    write_section_example = Example(
        info="[1]\nInformation in document 1\n[2]\nInformation in document 2\n[3]\nInformation in document 3",
        topic="Example Topic",
        section="Example Section",
        output="# Example Topic\n## Subsection 1\n"
               "This is an example sentence [1]. This is another example sentence [2][3].\n"
               "## Subsection 2\nThis is one more example sentence [1]."
    )
    runner.storm_article_generation.section_gen.write_section.demos = [write_section_example]

    topic = input('Topic: ')
    runner.run(
        topic=topic,
        do_research=args.do_research,
        do_generate_outline=args.do_generate_outline,
        do_generate_article=args.do_generate_article,
        do_polish_article=args.do_polish_article,
    )
    runner.post_run()
    runner.summary()


if __name__ == '__main__':
    parser = ArgumentParser()
    # global arguments
    parser.add_argument('--url', type=str, default='http://localhost',
                        help='URL of the Ollama server.')
    parser.add_argument('--port', type=int, default=11434,
                        help='Port of the Ollama server.')
    parser.add_argument('--model', type=str, default='llama3:latest',
                        help='Model served by the Ollama server.')
    parser.add_argument('--output-dir', type=str, default='./results/ollama',
                        help='Directory to store the outputs.')
    parser.add_argument('--max-thread-num', type=int, default=3,
                        help='Maximum number of threads to use. The information-seeking part and the article '
                             'generation part can be sped up by using multiple threads. Consider reducing it if '
                             'you keep getting an "Exceed rate limit" error when calling the LM API.')
    parser.add_argument('--retriever', type=str, choices=['bing', 'you'],
                        help='The search engine API to use for retrieving information.')
    # stage of the pipeline
    parser.add_argument('--do-research', action='store_true',
                        help='If True, simulate conversations to research the topic; otherwise, load the results.')
    parser.add_argument('--do-generate-outline', action='store_true',
                        help='If True, generate an outline for the topic; otherwise, load the results.')
    parser.add_argument('--do-generate-article', action='store_true',
                        help='If True, generate an article for the topic; otherwise, load the results.')
    parser.add_argument('--do-polish-article', action='store_true',
                        help='If True, polish the article by adding a summarization section and (optionally) '
                             'removing duplicate content.')
    # hyperparameters for the pre-writing stage
    parser.add_argument('--max-conv-turn', type=int, default=3,
                        help='Maximum number of questions in conversational question asking.')
    parser.add_argument('--max-perspective', type=int, default=3,
                        help='Maximum number of perspectives to consider in perspective-guided question asking.')
    parser.add_argument('--search-top-k', type=int, default=3,
                        help='Top k search results to consider for each search query.')
    # hyperparameters for the writing stage
    parser.add_argument('--retrieve-top-k', type=int, default=3,
                        help='Top k collected references for each section title.')
    parser.add_argument('--remove-duplicate', action='store_true',
                        help='If True, remove duplicate content from the article.')

    main(parser.parse_args())