
Commit 98a5eaa

Merge pull request #350 from meghana1090/main
updating token limits
2 parents cf5abdb + ce49e4e commit 98a5eaa

2 files changed: +57 -29 lines changed

extract_thinker/llm.py

Lines changed: 52 additions & 24 deletions
@@ -6,22 +6,35 @@
 from extract_thinker.llm_engine import LLMEngine
 from extract_thinker.utils import add_classification_structure, extract_thinking_json

-# Add these constants at the top of the file, after the imports
-DYNAMIC_PROMPT_TEMPLATE = """Please provide your thinking process within <think> tags, followed by your JSON output.
+# Helper to build the dynamic prompt used when `is_dynamic=True`.
+# We expose it as a standalone function so that callers (or subclasses)
+# can supply their own variants if needed.

-JSON structure:
-{prompt}

-OUTPUT example:
-<think>
-Your step-by-step reasoning and analysis goes here...
-</think>
+def build_dynamic_prompt(structure: str, *, think_tag: str = "think") -> str:
+    """Return the dynamic prompt used for classification style requests.

-##JSON OUTPUT
-{{
-    ...
-}}
-"""
+    Args:
+        structure: The JSON structure/fields to be returned by the model.
+        think_tag: The XML-style tag that should wrap the model's chain-of-thought.
+
+    This helper allows downstream users to customise the surrounding text
+    (for example, changing the tag name or adding extra instructions) rather
+    than editing a hard-coded string inside *llm.py*.
+    """
+
+    return (
+        f"Please provide your thinking process within <{think_tag}> tags, "
+        "followed by your JSON output.\n\n"
+        "JSON structure:\n"
+        f"{structure}\n\n"
+        "OUTPUT example:\n"
+        f"<{think_tag}>\n"
+        "Your step-by-step reasoning and analysis goes here...\n"
+        f"</{think_tag}>\n\n"
+        "##JSON OUTPUT\n"
+        "{\n ...\n}"  # placeholder keeps JSON fence out of model context
+    )


 class LLM:
     TIMEOUT = 3000  # Timeout in milliseconds
@@ -34,6 +47,11 @@ class LLM:
     MIN_THINKING_BUDGET = 1200  # Minimum thinking budget
     DEFAULT_OUTPUT_TOKENS = 32000

+    # A single default completion-token limit that is accepted by the vast
+    # majority of models. If a model supports more (or you need fewer), pass
+    # `token_limit=` when instantiating `LLM` to override this value.
+    DEFAULT_MAX_COMPLETION_TOKENS = 8000
+
     def __init__(
         self,
         model: str,
@@ -194,7 +212,7 @@ def request(
         working_messages = messages.copy()
         if self.is_dynamic and response_model:
             structure = add_classification_structure(response_model)
-            prompt = DYNAMIC_PROMPT_TEMPLATE.format(prompt=structure)
+            prompt = build_dynamic_prompt(structure)
            working_messages.append({
                 "role": "system",
                 "content": prompt
@@ -219,11 +237,11 @@ def request(

     def _request_with_router(self, messages: List[Dict[str, str]], response_model: Optional[str]) -> Any:
         """Handle request using router with or without thinking parameter"""
-        max_tokens = self.DEFAULT_OUTPUT_TOKENS
+        max_tokens = self._get_model_max_tokens()
         if self.token_limit is not None:
-            max_tokens = self.token_limit
+            max_tokens = min(self.token_limit, max_tokens)
         elif self.is_thinking:
-            max_tokens = self.thinking_token_limit
+            max_tokens = min(self.thinking_token_limit, max_tokens) if self.thinking_token_limit else max_tokens

         params = {
             "model": self.model,
@@ -248,11 +266,11 @@ def _request_with_router(self, messages: List[Dict[str, str]], response_model: O

     def _request_direct(self, messages: List[Dict[str, str]], response_model: Optional[str]) -> Any:
         """Handle direct request with or without thinking parameter"""
-        max_tokens = self.DEFAULT_OUTPUT_TOKENS
+        max_tokens = self._get_model_max_tokens()
         if self.token_limit is not None:
-            max_tokens = self.token_limit
+            max_tokens = min(self.token_limit, max_tokens)
         elif self.is_thinking:
-            max_tokens = self.thinking_token_limit
+            max_tokens = min(self.thinking_token_limit, max_tokens) if self.thinking_token_limit else max_tokens

         base_params = {
             "model": self.model,
@@ -293,11 +311,11 @@ def raw_completion(self, messages: List[Dict[str, str]]) -> str:
         except Exception as e:
             raise ValueError(f"Failed to extract from source: {str(e)}")

-        max_tokens = self.DEFAULT_OUTPUT_TOKENS
+        max_tokens = self._get_model_max_tokens()
         if self.token_limit is not None:
-            max_tokens = self.token_limit
+            max_tokens = min(self.token_limit, max_tokens)
         elif self.is_thinking:
-            max_tokens = self.thinking_token_limit
+            max_tokens = min(self.thinking_token_limit, max_tokens) if self.thinking_token_limit else max_tokens

         params = {
             "model": self.model,
@@ -325,4 +343,14 @@ def raw_completion(self, messages: List[Dict[str, str]]) -> str:

     def set_timeout(self, timeout_ms: int) -> None:
         """Set the timeout value for LLM requests in milliseconds."""
-        self.TIMEOUT = timeout_ms
+        self.TIMEOUT = timeout_ms
+
+    def _get_model_max_tokens(self) -> int:
+        """Return the default maximum completion-token limit.
+
+        This constant (DEFAULT_MAX_COMPLETION_TOKENS) is meant to work for ~99 %
+        of models. If you need a different value, supply `token_limit=` when
+        creating the `LLM` instance.
+        """
+
+        return self.DEFAULT_MAX_COMPLETION_TOKENS
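
As a quick illustration of the two changes above, the sketch below builds the dynamic prompt with a custom tag and creates an `LLM` with an explicit completion-token cap. It assumes `build_dynamic_prompt` is importable from `extract_thinker.llm` alongside `LLM` and that the constructor accepts a `token_limit=` keyword, as the in-file comments suggest; the JSON structure string is a made-up placeholder, not part of this commit.

# Minimal usage sketch (assumptions noted above; not part of this commit).
from extract_thinker.llm import LLM, build_dynamic_prompt

# Build the dynamic system prompt with a custom chain-of-thought tag.
structure = '{"invoice_number": "...", "total": "..."}'  # hypothetical structure
prompt = build_dynamic_prompt(structure, think_tag="reasoning")
# prompt now asks for <reasoning>...</reasoning> followed by the JSON output.

# Cap completion tokens explicitly; the request paths clamp the value via
# min(token_limit, DEFAULT_MAX_COMPLETION_TOKENS), so 4000 is what gets sent.
llm = LLM("gpt-4o", token_limit=4000)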

tests/test_llm_backends.py

Lines changed: 5 additions & 5 deletions
@@ -3,7 +3,7 @@

 def test_litellm_backend():
     """Test default LiteLLM backend"""
-    llm = LLM("gpt-4", backend=llm_engine.LITELLM)
+    llm = LLM("gpt-4o", backend=llm_engine.LITELLM)
     assert llm.backend == llm_engine.LITELLM
     assert llm.client is not None
     assert llm.agent is None
@@ -12,7 +12,7 @@ def test_pydanticai_backend():
     """Test PydanticAI backend if available"""
     try:
         import pydantic_ai
-        llm = LLM("gpt-4", backend=llm_engine.PYDANTIC_AI)
+        llm = LLM("gpt-4o", backend=llm_engine.PYDANTIC_AI)
         assert llm.backend == llm_engine.PYDANTIC_AI
         assert llm.client is None
         assert llm.agent is not None
@@ -22,13 +22,13 @@ def test_pydanticai_backend():
 def test_invalid_backend():
     """Test invalid backend type raises error"""
     with pytest.raises(TypeError):
-        LLM("gpt-4", backend="invalid")  # Should be LLMBackend enum
+        LLM("gpt-4o", backend="invalid")  # Should be LLMBackend enum

 def test_router_with_pydanticai():
     """Test router not supported with PydanticAI"""
     from litellm import Router
-    router = Router(model_list=[{"model_name": "gpt-4"}])
+    router = Router(model_list=[{"model_name": "gpt-4o"}])

-    llm = LLM("gpt-4", backend=llm_engine.PYDANTIC_AI)
+    llm = LLM("gpt-4o", backend=llm_engine.PYDANTIC_AI)
     with pytest.raises(ValueError, match="Router is only supported with LITELLM backend"):
         llm.load_router(router)
