 from extract_thinker.llm_engine import LLMEngine
 from extract_thinker.utils import add_classification_structure, extract_thinking_json

-# Add these constants at the top of the file, after the imports
-DYNAMIC_PROMPT_TEMPLATE = """Please provide your thinking process within <think> tags, followed by your JSON output.
+# Helper to build the dynamic prompt used when `is_dynamic=True`.
+# We expose it as a standalone function so that callers (or subclasses)
+# can supply their own variants if needed.

-JSON structure:
-{prompt}

-OUTPUT example:
-<think>
-Your step-by-step reasoning and analysis goes here...
-</think>
+def build_dynamic_prompt(structure: str, *, think_tag: str = "think") -> str:
+    """Return the dynamic prompt used for classification style requests.

-##JSON OUTPUT
-{{
-...
-}}
-"""
+    Args:
+        structure: The JSON structure/fields to be returned by the model.
+        think_tag: The XML-style tag that should wrap the model's chain-of-thought.
+
+    This helper allows downstream users to customise the surrounding text
+    (for example, changing the tag name or adding extra instructions) rather
+    than editing a hard-coded string inside *llm.py*.
+    """
+
+    return (
+        f"Please provide your thinking process within <{think_tag}> tags, "
+        "followed by your JSON output.\n\n"
+        "JSON structure:\n"
+        f"{structure}\n\n"
+        "OUTPUT example:\n"
+        f"<{think_tag}>\n"
+        "Your step-by-step reasoning and analysis goes here...\n"
+        f"</{think_tag}>\n\n"
+        "##JSON OUTPUT\n"
+        "{\n...\n}"  # placeholder keeps JSON fence out of model context
+    )

 class LLM:
     TIMEOUT = 3000  # Timeout in milliseconds
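
For reference, a minimal sketch of calling the new helper with a non-default tag; the `reason` tag, the sample structure string, and the module path `extract_thinker.llm` are illustrative assumptions, not something this diff defines:

    from extract_thinker.llm import build_dynamic_prompt  # assumed module path

    structure = '{"category": "...", "confidence": "..."}'  # example structure text
    prompt = build_dynamic_prompt(structure, think_tag="reason")
    # The returned prompt instructs the model to wrap its reasoning in
    # <reason>...</reason> before emitting the JSON block described by `structure`.
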
@@ -34,6 +47,11 @@ class LLM:
     MIN_THINKING_BUDGET = 1200  # Minimum thinking budget
     DEFAULT_OUTPUT_TOKENS = 32000

+    # A single default completion-token limit that is accepted by the vast
+    # majority of models. If a model supports more (or you need fewer), pass
+    # `token_limit=` when instantiating `LLM` to override this value.
+    DEFAULT_MAX_COMPLETION_TOKENS = 8000
+
     def __init__(
         self,
         model: str,
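
A hedged usage sketch of tightening the cap per instance (the model name is a placeholder, and only the `model` and `token_limit` arguments are shown here):

    # Cap completions below the 8000-token default for this instance.
    llm = LLM("gpt-4o-mini", token_limit=4000)
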
@@ -194,7 +212,7 @@ def request(
         working_messages = messages.copy()
         if self.is_dynamic and response_model:
             structure = add_classification_structure(response_model)
-            prompt = DYNAMIC_PROMPT_TEMPLATE.format(prompt=structure)
+            prompt = build_dynamic_prompt(structure)
             working_messages.append({
                 "role": "system",
                 "content": prompt
@@ -219,11 +237,11 @@ def request(

     def _request_with_router(self, messages: List[Dict[str, str]], response_model: Optional[str]) -> Any:
         """Handle request using router with or without thinking parameter"""
-        max_tokens = self.DEFAULT_OUTPUT_TOKENS
+        max_tokens = self._get_model_max_tokens()
         if self.token_limit is not None:
-            max_tokens = self.token_limit
+            max_tokens = min(self.token_limit, max_tokens)
         elif self.is_thinking:
-            max_tokens = self.thinking_token_limit
+            max_tokens = min(self.thinking_token_limit, max_tokens) if self.thinking_token_limit else max_tokens

         params = {
             "model": self.model,
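
The same three-branch resolution is repeated in `_request_direct` and `raw_completion` below. Reduced to a standalone sketch (the function name and sample values are illustrative only), the new behaviour is:

    def resolve_max_tokens(default_cap, token_limit=None,
                           is_thinking=False, thinking_token_limit=None):
        # Mirrors the branch above: an explicit token_limit wins but is clamped
        # to the default cap; otherwise an available thinking budget is clamped
        # the same way; otherwise the cap itself is used.
        max_tokens = default_cap
        if token_limit is not None:
            max_tokens = min(token_limit, max_tokens)
        elif is_thinking:
            max_tokens = min(thinking_token_limit, max_tokens) if thinking_token_limit else max_tokens
        return max_tokens

    resolve_max_tokens(8000)                                                # 8000
    resolve_max_tokens(8000, token_limit=4000)                              # 4000
    resolve_max_tokens(8000, token_limit=32000)                             # 8000 (clamped)
    resolve_max_tokens(8000, is_thinking=True, thinking_token_limit=12000)  # 8000 (clamped)
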
@@ -248,11 +266,11 @@ def _request_with_router(self, messages: List[Dict[str, str]], response_model: O

     def _request_direct(self, messages: List[Dict[str, str]], response_model: Optional[str]) -> Any:
         """Handle direct request with or without thinking parameter"""
-        max_tokens = self.DEFAULT_OUTPUT_TOKENS
+        max_tokens = self._get_model_max_tokens()
         if self.token_limit is not None:
-            max_tokens = self.token_limit
+            max_tokens = min(self.token_limit, max_tokens)
         elif self.is_thinking:
-            max_tokens = self.thinking_token_limit
+            max_tokens = min(self.thinking_token_limit, max_tokens) if self.thinking_token_limit else max_tokens

         base_params = {
             "model": self.model,
@@ -293,11 +311,11 @@ def raw_completion(self, messages: List[Dict[str, str]]) -> str:
         except Exception as e:
             raise ValueError(f"Failed to extract from source: {str(e)}")

-        max_tokens = self.DEFAULT_OUTPUT_TOKENS
+        max_tokens = self._get_model_max_tokens()
         if self.token_limit is not None:
-            max_tokens = self.token_limit
+            max_tokens = min(self.token_limit, max_tokens)
         elif self.is_thinking:
-            max_tokens = self.thinking_token_limit
+            max_tokens = min(self.thinking_token_limit, max_tokens) if self.thinking_token_limit else max_tokens

         params = {
             "model": self.model,
@@ -325,4 +343,14 @@ def raw_completion(self, messages: List[Dict[str, str]]) -> str:

     def set_timeout(self, timeout_ms: int) -> None:
         """Set the timeout value for LLM requests in milliseconds."""
-        self.TIMEOUT = timeout_ms
+        self.TIMEOUT = timeout_ms
+
+    def _get_model_max_tokens(self) -> int:
+        """Return the default maximum completion-token limit.
+
+        This constant (DEFAULT_MAX_COMPLETION_TOKENS) is meant to work for ~99%
+        of models. If you need a different value, supply `token_limit=` when
+        creating the `LLM` instance.
+        """
+
+        return self.DEFAULT_MAX_COMPLETION_TOKENS
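
Because the cap now sits behind a single method, a subclass can adjust it in one place instead of editing each request path; a hedged sketch (the subclass name and the 16000 figure are made up for illustration):

    class LargeOutputLLM(LLM):
        # Hypothetical subclass for a model known to accept longer completions;
        # the three request paths pick this value up automatically.
        def _get_model_max_tokens(self) -> int:
            return 16000
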