diff --git a/humanpages/.gitignore b/humanpages/.gitignore new file mode 100644 index 00000000..c18dd8d8 --- /dev/null +++ b/humanpages/.gitignore @@ -0,0 +1 @@ +__pycache__/ diff --git a/humanpages/LICENSE b/humanpages/LICENSE new file mode 100644 index 00000000..48e8eb46 --- /dev/null +++ b/humanpages/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2025 TinyFish, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/humanpages/Makefile b/humanpages/Makefile new file mode 100644 index 00000000..730e1b2c --- /dev/null +++ b/humanpages/Makefile @@ -0,0 +1,65 @@ +.PHONY: all format lint test tests integration_tests docker_tests help extended_tests + +# Default target executed when no arguments are given to make. +all: help + +# Define a variable for the test file path. +TEST_FILE ?= tests/unit_tests/ +integration_test integration_tests: TEST_FILE = tests/integration_tests/ + + +# unit tests are run with the --disable-socket flag to prevent network calls +test tests: + poetry run pytest --disable-socket --allow-unix-socket $(TEST_FILE) + +test_watch: + poetry run ptw --now . -- --snapshot-update -vv $(TEST_FILE) + +# integration tests are run without the --disable-socket flag to allow network calls +integration_test integration_tests: + poetry run pytest $(TEST_FILE) + +###################### +# LINTING AND FORMATTING +###################### + +# Define a variable for Python and notebook files. +PYTHON_FILES=. +MYPY_CACHE=.mypy_cache +lint format: PYTHON_FILES=. 
+BASE_REF ?= main +lint_diff format_diff: PYTHON_FILES=$(shell git diff --relative --name-only --diff-filter=d $(BASE_REF)...HEAD | grep -E '\.py$$|\.ipynb$$') +lint_package: PYTHON_FILES=agentql_humanpages +lint_tests: PYTHON_FILES=tests +lint_tests: MYPY_CACHE=.mypy_cache_test + +lint lint_diff lint_package lint_tests: + [ "$(PYTHON_FILES)" = "" ] || poetry run ruff check $(PYTHON_FILES) + [ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES) --diff + [ "$(PYTHON_FILES)" = "" ] || { mkdir -p $(MYPY_CACHE) && poetry run mypy $(PYTHON_FILES) --cache-dir $(MYPY_CACHE); } + +format format_diff: + [ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES) + [ "$(PYTHON_FILES)" = "" ] || poetry run ruff check --select I --fix $(PYTHON_FILES) + +spell_check: + poetry run codespell --toml pyproject.toml + +spell_fix: + poetry run codespell --toml pyproject.toml -w + +check_imports: $(shell find agentql_humanpages -name '*.py') + poetry run python ./scripts/check_imports.py $^ + +###################### +# HELP +###################### + +help: + @echo '----' + @echo 'check_imports - check imports' + @echo 'format - run code formatters' + @echo 'lint - run linters' + @echo 'test - run unit tests' + @echo 'tests - run unit tests' + @echo 'test TEST_FILE= - run all tests in file' diff --git a/humanpages/README.md b/humanpages/README.md new file mode 100644 index 00000000..158512db --- /dev/null +++ b/humanpages/README.md @@ -0,0 +1,117 @@ +# agentql-humanpages + +An integration package connecting [AgentQL](https://www.agentql.com/) and [Human Pages](https://humanpages.ai) for human-in-the-loop web data extraction. + +When AgentQL's automated extraction fails -- due to anti-bot protections, CAPTCHAs, empty results, or any other blocker -- the task is automatically delegated to a human worker via the Human Pages platform. + +## Installation + +```bash +pip install -U agentql-humanpages +``` + +You need to configure both API keys: + +- `AGENTQL_API_KEY` -- get one from the [AgentQL Dev Portal](https://dev.agentql.com) +- `HUMANPAGES_API_KEY` -- get one from [Human Pages](https://humanpages.ai) + +## Quick Start + +```python +from agentql_humanpages import HumanFallbackAgent + +agent = HumanFallbackAgent( + agentql_api_key="your-agentql-key", + humanpages_api_key="your-humanpages-key", +) + +result = agent.extract( + url="https://example.com/products", + query="{ products[] { name price } }", +) + +if result["source"] == "agentql": + print("Extracted via AgentQL:", result["data"]) +else: + print("Extracted via human:", result["messages"]) +``` + +## HumanFallbackAgent + +The main entry point. Attempts AgentQL extraction first, then falls back to Human Pages. 
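+
+If both the automated attempt and the human fallback fail -- no workers are available, job creation fails, or the job times out -- `extract()` raises `RuntimeError` or `TimeoutError`. A minimal sketch of handling this, reusing the agent from the Quick Start:
+
+```python
+try:
+    result = agent.extract(
+        url="https://example.com/products",
+        query="{ products[] { name price } }",
+    )
+except (RuntimeError, TimeoutError) as exc:
+    # No human was available, job creation failed, or the job timed out.
+    result = {"source": "error", "detail": str(exc)}
+```
+
+The constructor accepts the API keys plus the defaults used for every human fallback job: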
+ +```python +agent = HumanFallbackAgent( + agentql_api_key="...", # or set AGENTQL_API_KEY env var + humanpages_api_key="...", # or set HUMANPAGES_API_KEY env var + price_usdc=5.0, # default price for human jobs + deadline_hours=24, # default deadline for human jobs +) +``` + +### extract() + +```python +result = agent.extract( + url="https://example.com", + query="{ products[] { name price } }", # AgentQL query + # OR + prompt="Get all product names and prices", # Natural language + fallback_description="Custom instructions for the human worker", + price_usdc=10.0, # override default price + deadline_hours=12, # override default deadline +) +``` + +Returns a dict with: +- `source`: `"agentql"` or `"humanpages"` +- `data`: extracted data (when source is agentql) +- `job_id`, `status`, `messages`: job details (when source is humanpages) + +### aextract() + +Async version of `extract()` with the same interface. + +## HumanPagesClient + +Lower-level client for the Human Pages REST API: + +```python +from agentql_humanpages import HumanPagesClient + +client = HumanPagesClient(api_key="your-key") + +# Search for available humans +humans = client.search_humans(skill="web task", available=True) + +# Create a job +job = client.create_job( + human_id=humans[0]["id"], + title="Extract product data", + description="Visit example.com and extract all product names and prices.", + price_usdc=5.0, + deadline_hours=24, +) + +# Check job status +status = client.get_job_status(job["id"]) + +# Get messages +messages = client.get_job_messages(job["id"]) +``` + +All methods have async counterparts (`asearch_humans`, `acreate_job`, `aget_job_status`, `aget_job_messages`). + +## Run Tests + +Unit tests (no network calls): + +```bash +make test +``` + +Integration tests (requires valid API keys): + +```bash +make integration_tests +``` diff --git a/humanpages/agentql_humanpages/__init__.py b/humanpages/agentql_humanpages/__init__.py new file mode 100644 index 00000000..b55ac9fa --- /dev/null +++ b/humanpages/agentql_humanpages/__init__.py @@ -0,0 +1,28 @@ +from importlib import metadata + +from agentql_humanpages.agent import HumanFallbackAgent +from agentql_humanpages.client import HumanPagesClient +from agentql_humanpages.const import ( + DEFAULT_DEADLINE_HOURS, + DEFAULT_POLL_INTERVAL_SECONDS, + DEFAULT_PRICE_USDC, + DEFAULT_TIMEOUT_SECONDS, + HUMANPAGES_BASE_URL, +) + +try: + __version__ = metadata.version(__package__) +except metadata.PackageNotFoundError: + # Case where package metadata is not available. 
+ __version__ = "0.1.0" + +__all__ = [ + "DEFAULT_DEADLINE_HOURS", + "DEFAULT_POLL_INTERVAL_SECONDS", + "DEFAULT_PRICE_USDC", + "DEFAULT_TIMEOUT_SECONDS", + "HUMANPAGES_BASE_URL", + "HumanFallbackAgent", + "HumanPagesClient", + "__version__", +] diff --git a/humanpages/agentql_humanpages/agent.py b/humanpages/agentql_humanpages/agent.py new file mode 100644 index 00000000..84dd1bc2 --- /dev/null +++ b/humanpages/agentql_humanpages/agent.py @@ -0,0 +1,365 @@ +"""Human Fallback Agent -- uses AgentQL for web extraction with Human Pages fallback.""" + +import asyncio +import logging +import os +import time +from typing import Any, Optional + +import httpx + +from agentql_humanpages.client import HumanPagesClient +from agentql_humanpages.const import ( + AGENTQL_DEFAULT_MODE, + AGENTQL_DEFAULT_TIMEOUT_SECONDS, + AGENTQL_EXTRACT_DATA_ENDPOINT, + DEFAULT_DEADLINE_HOURS, + DEFAULT_MAX_POLL_ATTEMPTS, + DEFAULT_POLL_INTERVAL_SECONDS, + DEFAULT_PRICE_USDC, + REQUEST_ORIGIN, +) +from agentql_humanpages.messages import ( + AGENTQL_EXTRACTION_FAILED, + JOB_CREATION_FAILED_ERROR, + JOB_TIMEOUT_ERROR, + NO_HUMANS_AVAILABLE_ERROR, + UNSET_AGENTQL_API_KEY_ERROR, + UNSET_HUMANPAGES_API_KEY_ERROR, +) + +logger = logging.getLogger(__name__) + + +class HumanFallbackAgent: + """Agent that uses AgentQL for web data extraction with Human Pages as a fallback. + + When AgentQL extraction fails (network errors, anti-bot blocks, CAPTCHAs, + or empty results), this agent automatically delegates the task to a human + worker via the Human Pages platform. + + Setup: + Set ``AGENTQL_API_KEY`` and ``HUMANPAGES_API_KEY`` environment variables, + or pass them directly. + + .. code-block:: bash + + export AGENTQL_API_KEY="your-agentql-key" + export HUMANPAGES_API_KEY="your-humanpages-key" + + Instantiation: + .. code-block:: python + + from agentql_humanpages import HumanFallbackAgent + + agent = HumanFallbackAgent( + agentql_api_key="your-agentql-key", + humanpages_api_key="your-humanpages-key", + ) + + Usage: + .. code-block:: python + + result = agent.extract( + url="https://example.com/products", + query="{ products[] { name price } }", + ) + """ + + def __init__( + self, + agentql_api_key: Optional[str] = None, + humanpages_api_key: Optional[str] = None, + agentql_timeout: int = AGENTQL_DEFAULT_TIMEOUT_SECONDS, + agentql_mode: str = AGENTQL_DEFAULT_MODE, + humanpages_base_url: Optional[str] = None, + price_usdc: float = DEFAULT_PRICE_USDC, + deadline_hours: int = DEFAULT_DEADLINE_HOURS, + poll_interval: int = DEFAULT_POLL_INTERVAL_SECONDS, + max_poll_attempts: int = DEFAULT_MAX_POLL_ATTEMPTS, + ) -> None: + """Initialize the Human Fallback Agent. + + Args: + agentql_api_key: AgentQL API key. Falls back to AGENTQL_API_KEY env var. + humanpages_api_key: Human Pages API key. Falls back to HUMANPAGES_API_KEY env var. + agentql_timeout: Timeout in seconds for AgentQL requests. + agentql_mode: AgentQL response mode ('fast' or 'standard'). + humanpages_base_url: Override the Human Pages base URL. + price_usdc: Default price in USDC for human fallback jobs. + deadline_hours: Default deadline in hours for human fallback jobs. + poll_interval: Seconds between polling for job completion. + max_poll_attempts: Maximum number of poll attempts before timing out. 
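+        Raises:
+            ValueError: If an AgentQL or Human Pages API key is neither passed
+                in nor set via the corresponding environment variable.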
+ """ + self._agentql_api_key = agentql_api_key or os.getenv("AGENTQL_API_KEY") + if not self._agentql_api_key: + raise ValueError(UNSET_AGENTQL_API_KEY_ERROR) + + hp_api_key = humanpages_api_key or os.getenv("HUMANPAGES_API_KEY") + if not hp_api_key: + raise ValueError(UNSET_HUMANPAGES_API_KEY_ERROR) + + hp_kwargs: dict[str, Any] = {"api_key": hp_api_key} + if humanpages_base_url: + hp_kwargs["base_url"] = humanpages_base_url + self._hp_client = HumanPagesClient(**hp_kwargs) + + self._agentql_timeout = agentql_timeout + self._agentql_mode = agentql_mode + self._price_usdc = price_usdc + self._deadline_hours = deadline_hours + self._poll_interval = poll_interval + self._max_poll_attempts = max_poll_attempts + + def _agentql_extract( + self, + url: str, + query: Optional[str] = None, + prompt: Optional[str] = None, + ) -> dict[str, Any]: + """Attempt data extraction via the AgentQL REST API.""" + payload: dict[str, Any] = { + "url": url, + "query": query, + "prompt": prompt, + "params": {"mode": self._agentql_mode}, + "metadata": {}, + } + headers = { + "X-API-Key": self._agentql_api_key, + "Content-Type": "application/json", + "X-TF-Request-Origin": REQUEST_ORIGIN, + } + response = httpx.post( + AGENTQL_EXTRACT_DATA_ENDPOINT, + headers=headers, + json=payload, + timeout=self._agentql_timeout, + ) + response.raise_for_status() + return response.json() + + async def _agentql_extract_async( + self, + url: str, + query: Optional[str] = None, + prompt: Optional[str] = None, + ) -> dict[str, Any]: + """Async version of AgentQL extraction.""" + payload: dict[str, Any] = { + "url": url, + "query": query, + "prompt": prompt, + "params": {"mode": self._agentql_mode}, + "metadata": {}, + } + headers = { + "X-API-Key": self._agentql_api_key, + "Content-Type": "application/json", + "X-TF-Request-Origin": REQUEST_ORIGIN, + } + async with httpx.AsyncClient() as client: + response = await client.post( + AGENTQL_EXTRACT_DATA_ENDPOINT, + headers=headers, + json=payload, + timeout=self._agentql_timeout, + ) + response.raise_for_status() + return response.json() + + def _delegate_to_human( + self, + url: str, + description: str, + price_usdc: Optional[float] = None, + deadline_hours: Optional[int] = None, + ) -> dict[str, Any]: + """Create a Human Pages job and poll until completion or timeout.""" + humans = self._hp_client.search_humans(skill="web task", available=True) + if not humans: + raise RuntimeError(NO_HUMANS_AVAILABLE_ERROR) + + human_id = humans[0]["id"] + price = price_usdc if price_usdc is not None else self._price_usdc + deadline = deadline_hours if deadline_hours is not None else self._deadline_hours + + try: + job = self._hp_client.create_job( + human_id=human_id, + title=f"Extract data from {url}", + description=description, + price_usdc=price, + deadline_hours=deadline, + ) + except (ValueError, httpx.HTTPError) as e: + raise RuntimeError(JOB_CREATION_FAILED_ERROR.format(detail=str(e))) from e + + job_id = job["id"] + for _ in range(self._max_poll_attempts): + status = self._hp_client.get_job_status(job_id) + if status.get("status") == "completed": + messages = self._hp_client.get_job_messages(job_id) + return { + "source": "humanpages", + "job_id": job_id, + "status": status, + "messages": messages, + } + if status.get("status") in ("cancelled", "expired", "failed"): + return { + "source": "humanpages", + "job_id": job_id, + "status": status, + "messages": [], + } + time.sleep(self._poll_interval) + + raise TimeoutError(JOB_TIMEOUT_ERROR.format(job_id=job_id)) + + async def 
_adelegate_to_human( + self, + url: str, + description: str, + price_usdc: Optional[float] = None, + deadline_hours: Optional[int] = None, + ) -> dict[str, Any]: + """Async version: create a Human Pages job and poll until completion or timeout.""" + humans = await self._hp_client.asearch_humans(skill="web task", available=True) + if not humans: + raise RuntimeError(NO_HUMANS_AVAILABLE_ERROR) + + human_id = humans[0]["id"] + price = price_usdc if price_usdc is not None else self._price_usdc + deadline = deadline_hours if deadline_hours is not None else self._deadline_hours + + try: + job = await self._hp_client.acreate_job( + human_id=human_id, + title=f"Extract data from {url}", + description=description, + price_usdc=price, + deadline_hours=deadline, + ) + except (ValueError, httpx.HTTPError) as e: + raise RuntimeError(JOB_CREATION_FAILED_ERROR.format(detail=str(e))) from e + + job_id = job["id"] + for _ in range(self._max_poll_attempts): + status = await self._hp_client.aget_job_status(job_id) + if status.get("status") == "completed": + messages = await self._hp_client.aget_job_messages(job_id) + return { + "source": "humanpages", + "job_id": job_id, + "status": status, + "messages": messages, + } + if status.get("status") in ("cancelled", "expired", "failed"): + return { + "source": "humanpages", + "job_id": job_id, + "status": status, + "messages": [], + } + await asyncio.sleep(self._poll_interval) + + raise TimeoutError(JOB_TIMEOUT_ERROR.format(job_id=job_id)) + + def extract( + self, + url: str, + query: Optional[str] = None, + prompt: Optional[str] = None, + fallback_description: Optional[str] = None, + price_usdc: Optional[float] = None, + deadline_hours: Optional[int] = None, + ) -> dict[str, Any]: + """Extract data from a URL using AgentQL, falling back to Human Pages on failure. + + First attempts extraction via the AgentQL REST API. If that fails for any + reason (network error, anti-bot block, empty result), the task is delegated + to a human worker on Human Pages. + + Args: + url: The URL to extract data from. + query: An AgentQL query string (mutually exclusive with prompt). + prompt: A natural language description of the data to extract. + fallback_description: Custom description for the human fallback job. + If not provided, one is generated from the query/prompt. + price_usdc: Override the default price for the human fallback job. + deadline_hours: Override the default deadline for the human fallback job. + + Returns: + A dict with keys: + - ``source``: Either ``"agentql"`` or ``"humanpages"``. + - ``data``: The extracted data (when source is agentql). + - ``job_id``, ``status``, ``messages``: Job details (when source is humanpages). + """ + if bool(query) == bool(prompt): + raise ValueError("Exactly one of 'query' or 'prompt' must be provided.") + + # Attempt AgentQL extraction + try: + result = self._agentql_extract(url=url, query=query, prompt=prompt) + data = result.get("data") + if data: + return {"source": "agentql", "data": data} + logger.info("AgentQL returned empty data for %s, falling back to human.", url) + except (httpx.HTTPError, ValueError) as e: + logger.info( + AGENTQL_EXTRACTION_FAILED.format(url=url, detail=str(e)) + ) + + # Build fallback description + if not fallback_description: + task_detail = query if query else prompt + fallback_description = ( + f"Please visit {url} and extract the following data:\n\n{task_detail}\n\n" + f"Return the results as structured JSON." 
+ ) + + return self._delegate_to_human( + url=url, + description=fallback_description, + price_usdc=price_usdc, + deadline_hours=deadline_hours, + ) + + async def aextract( + self, + url: str, + query: Optional[str] = None, + prompt: Optional[str] = None, + fallback_description: Optional[str] = None, + price_usdc: Optional[float] = None, + deadline_hours: Optional[int] = None, + ) -> dict[str, Any]: + """Async version of extract. See extract() for full documentation.""" + if bool(query) == bool(prompt): + raise ValueError("Exactly one of 'query' or 'prompt' must be provided.") + + try: + result = await self._agentql_extract_async(url=url, query=query, prompt=prompt) + data = result.get("data") + if data: + return {"source": "agentql", "data": data} + logger.info("AgentQL returned empty data for %s, falling back to human.", url) + except (httpx.HTTPError, ValueError) as e: + logger.info( + AGENTQL_EXTRACTION_FAILED.format(url=url, detail=str(e)) + ) + + if not fallback_description: + task_detail = query if query else prompt + fallback_description = ( + f"Please visit {url} and extract the following data:\n\n{task_detail}\n\n" + f"Return the results as structured JSON." + ) + + return await self._adelegate_to_human( + url=url, + description=fallback_description, + price_usdc=price_usdc, + deadline_hours=deadline_hours, + ) diff --git a/humanpages/agentql_humanpages/client.py b/humanpages/agentql_humanpages/client.py new file mode 100644 index 00000000..eaf9faea --- /dev/null +++ b/humanpages/agentql_humanpages/client.py @@ -0,0 +1,263 @@ +"""Human Pages REST API client.""" + +import os +from typing import Any, Optional +from urllib.parse import quote + +import httpx + +from agentql_humanpages.const import ( + CREATE_JOB_ENDPOINT, + DEFAULT_DEADLINE_HOURS, + DEFAULT_PRICE_USDC, + DEFAULT_TIMEOUT_SECONDS, + HUMANPAGES_BASE_URL, + JOB_MESSAGES_ENDPOINT, + JOB_STATUS_ENDPOINT, + SEARCH_HUMANS_ENDPOINT, +) +from agentql_humanpages.messages import ( + HUMANPAGES_UNAUTHORIZED_ERROR, + UNSET_HUMANPAGES_API_KEY_ERROR, +) + + +class HumanPagesClient: + """Client for the Human Pages REST API. + + Provides methods to search for available humans, create jobs, + check job status, and retrieve job messages. + + Setup: + Set the ``HUMANPAGES_API_KEY`` environment variable or pass the key directly. + + .. code-block:: bash + + export HUMANPAGES_API_KEY="your-api-key" + + Instantiation: + .. 
code-block:: python + + client = HumanPagesClient() + # or + client = HumanPagesClient(api_key="your-api-key") + """ + + def __init__( + self, + api_key: Optional[str] = None, + base_url: str = HUMANPAGES_BASE_URL, + timeout: int = DEFAULT_TIMEOUT_SECONDS, + ) -> None: + self._api_key = api_key or os.getenv("HUMANPAGES_API_KEY") + if not self._api_key: + raise ValueError(UNSET_HUMANPAGES_API_KEY_ERROR) + self._base_url = base_url.rstrip("/") + self._timeout = timeout + + def _headers(self) -> dict[str, str]: + return { + "Content-Type": "application/json", + "X-Agent-Key": self._api_key, + } + + def _handle_error(self, e: httpx.HTTPStatusError) -> None: + if e.response.status_code == httpx.codes.UNAUTHORIZED: + raise ValueError(HUMANPAGES_UNAUTHORIZED_ERROR) from e + msg = e.response.text + try: + error_json = e.response.json() + if isinstance(error_json, dict): + msg = error_json.get("error", error_json.get("message", str(error_json))) + else: + msg = str(error_json) + except (ValueError, TypeError): + msg = f"HTTP {e}" + raise ValueError(msg) from e + + def search_humans( + self, + skill: str = "web task", + available: bool = True, + ) -> list[dict[str, Any]]: + """Search for available humans with a given skill. + + Args: + skill: The skill to search for (default: "web task"). + available: Whether to filter for available humans only. + + Returns: + A list of human profiles matching the search criteria. + """ + params = {"skill": skill, "available": str(available).lower()} + try: + response = httpx.get( + f"{self._base_url}{SEARCH_HUMANS_ENDPOINT}", + params=params, + headers=self._headers(), + timeout=self._timeout, + ) + response.raise_for_status() + except httpx.HTTPStatusError as e: + self._handle_error(e) + return response.json() + + def create_job( + self, + human_id: str, + title: str, + description: str, + price_usdc: float = DEFAULT_PRICE_USDC, + deadline_hours: int = DEFAULT_DEADLINE_HOURS, + ) -> dict[str, Any]: + """Create a new job for a human to complete. + + Args: + human_id: The ID of the human to assign the job to. + title: A short title for the job. + description: A detailed description of what needs to be done. + price_usdc: The price in USDC to pay for the job. + deadline_hours: The number of hours to complete the job. + + Returns: + The created job object including its ID and status. + """ + payload = { + "humanId": human_id, + "title": title, + "description": description, + "priceUsdc": price_usdc, + "deadlineHours": deadline_hours, + } + try: + response = httpx.post( + f"{self._base_url}{CREATE_JOB_ENDPOINT}", + json=payload, + headers=self._headers(), + timeout=self._timeout, + ) + response.raise_for_status() + except httpx.HTTPStatusError as e: + self._handle_error(e) + return response.json() + + def get_job_status(self, job_id: str) -> dict[str, Any]: + """Check the status of a job. + + Args: + job_id: The ID of the job to check. + + Returns: + The job object with its current status. + """ + url = f"{self._base_url}{JOB_STATUS_ENDPOINT.format(job_id=quote(job_id, safe=''))}" + try: + response = httpx.get( + url, + headers=self._headers(), + timeout=self._timeout, + ) + response.raise_for_status() + except httpx.HTTPStatusError as e: + self._handle_error(e) + return response.json() + + def get_job_messages(self, job_id: str) -> list[dict[str, Any]]: + """Retrieve messages exchanged on a job. + + Args: + job_id: The ID of the job. + + Returns: + A list of message objects for the job. 
+ """ + url = f"{self._base_url}{JOB_MESSAGES_ENDPOINT.format(job_id=quote(job_id, safe=''))}" + try: + response = httpx.get( + url, + headers=self._headers(), + timeout=self._timeout, + ) + response.raise_for_status() + except httpx.HTTPStatusError as e: + self._handle_error(e) + return response.json() + + async def asearch_humans( + self, + skill: str = "web task", + available: bool = True, + ) -> list[dict[str, Any]]: + """Async version of search_humans.""" + params = {"skill": skill, "available": str(available).lower()} + async with httpx.AsyncClient() as client: + try: + response = await client.get( + f"{self._base_url}{SEARCH_HUMANS_ENDPOINT}", + params=params, + headers=self._headers(), + timeout=self._timeout, + ) + response.raise_for_status() + except httpx.HTTPStatusError as e: + self._handle_error(e) + return response.json() + + async def acreate_job( + self, + human_id: str, + title: str, + description: str, + price_usdc: float = DEFAULT_PRICE_USDC, + deadline_hours: int = DEFAULT_DEADLINE_HOURS, + ) -> dict[str, Any]: + """Async version of create_job.""" + payload = { + "humanId": human_id, + "title": title, + "description": description, + "priceUsdc": price_usdc, + "deadlineHours": deadline_hours, + } + async with httpx.AsyncClient() as client: + try: + response = await client.post( + f"{self._base_url}{CREATE_JOB_ENDPOINT}", + json=payload, + headers=self._headers(), + timeout=self._timeout, + ) + response.raise_for_status() + except httpx.HTTPStatusError as e: + self._handle_error(e) + return response.json() + + async def aget_job_status(self, job_id: str) -> dict[str, Any]: + """Async version of get_job_status.""" + url = f"{self._base_url}{JOB_STATUS_ENDPOINT.format(job_id=quote(job_id, safe=''))}" + async with httpx.AsyncClient() as client: + try: + response = await client.get( + url, + headers=self._headers(), + timeout=self._timeout, + ) + response.raise_for_status() + except httpx.HTTPStatusError as e: + self._handle_error(e) + return response.json() + + async def aget_job_messages(self, job_id: str) -> list[dict[str, Any]]: + """Async version of get_job_messages.""" + url = f"{self._base_url}{JOB_MESSAGES_ENDPOINT.format(job_id=quote(job_id, safe=''))}" + async with httpx.AsyncClient() as client: + try: + response = await client.get( + url, + headers=self._headers(), + timeout=self._timeout, + ) + response.raise_for_status() + except httpx.HTTPStatusError as e: + self._handle_error(e) + return response.json() diff --git a/humanpages/agentql_humanpages/const.py b/humanpages/agentql_humanpages/const.py new file mode 100644 index 00000000..11ed4197 --- /dev/null +++ b/humanpages/agentql_humanpages/const.py @@ -0,0 +1,18 @@ +HUMANPAGES_BASE_URL = "https://humanpages.ai" + +SEARCH_HUMANS_ENDPOINT = "/api/humans/search" +CREATE_JOB_ENDPOINT = "/api/jobs" +JOB_STATUS_ENDPOINT = "/api/jobs/{job_id}" +JOB_MESSAGES_ENDPOINT = "/api/jobs/{job_id}/messages" + +DEFAULT_TIMEOUT_SECONDS = 120 +DEFAULT_PRICE_USDC = 5.0 +DEFAULT_DEADLINE_HOURS = 24 +DEFAULT_POLL_INTERVAL_SECONDS = 30 +DEFAULT_MAX_POLL_ATTEMPTS = 120 + +AGENTQL_EXTRACT_DATA_ENDPOINT = "https://api.agentql.com/v1/query-data" +AGENTQL_DEFAULT_MODE = "fast" +AGENTQL_DEFAULT_TIMEOUT_SECONDS = 900 + +REQUEST_ORIGIN = "humanpages" diff --git a/humanpages/agentql_humanpages/messages.py b/humanpages/agentql_humanpages/messages.py new file mode 100644 index 00000000..fb7cf1cb --- /dev/null +++ b/humanpages/agentql_humanpages/messages.py @@ -0,0 +1,33 @@ +UNSET_AGENTQL_API_KEY_ERROR = ( + "No AgentQL API key provided. 
Set the `agentql_api_key` argument or the " + "`AGENTQL_API_KEY` environment variable. " + "Create an API key at https://dev.agentql.com." +) + +UNSET_HUMANPAGES_API_KEY_ERROR = ( + "No Human Pages API key provided. Set the `humanpages_api_key` argument or the " + "`HUMANPAGES_API_KEY` environment variable. " + "Get an API key at https://humanpages.ai." +) + +NO_HUMANS_AVAILABLE_ERROR = ( + "No humans are currently available for this task on Human Pages. " + "Try again later or adjust the task description." +) + +JOB_CREATION_FAILED_ERROR = "Failed to create a job on Human Pages: {detail}" + +JOB_TIMEOUT_ERROR = ( + "The Human Pages job did not complete within the allowed time. " + "Job ID: {job_id}" +) + +AGENTQL_EXTRACTION_FAILED = ( + "AgentQL extraction failed for URL {url}: {detail}. " + "Falling back to Human Pages." +) + +HUMANPAGES_UNAUTHORIZED_ERROR = ( + "Invalid Human Pages API key. " + "Please provide a valid key from https://humanpages.ai." +) diff --git a/humanpages/agentql_humanpages/py.typed b/humanpages/agentql_humanpages/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/humanpages/docs/human_fallback.ipynb b/humanpages/docs/human_fallback.ipynb new file mode 100644 index 00000000..849377e9 --- /dev/null +++ b/humanpages/docs/human_fallback.ipynb @@ -0,0 +1,141 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# AgentQL + Human Pages: Human-in-the-Loop Data Extraction\n", + "\n", + "This notebook demonstrates how to use the `agentql-humanpages` integration to extract structured data from web pages with an automatic human fallback.\n", + "\n", + "## Setup\n", + "\n", + "Install the package and set your API keys:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -qU agentql-humanpages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.environ[\"AGENTQL_API_KEY\"] = \"your-agentql-key\"\n", + "os.environ[\"HUMANPAGES_API_KEY\"] = \"your-humanpages-key\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using HumanFallbackAgent\n", + "\n", + "The `HumanFallbackAgent` tries AgentQL first. If extraction fails, it automatically creates a job on Human Pages." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from agentql_humanpages import HumanFallbackAgent\n", + "\n", + "agent = HumanFallbackAgent()\n", + "\n", + "# Extract with an AgentQL query\n", + "result = agent.extract(\n", + " url=\"https://www.ycombinator.com/companies\",\n", + " query=\"{ companies[] { name description batch } }\",\n", + ")\n", + "\n", + "print(f\"Source: {result['source']}\")\n", + "if result[\"source\"] == \"agentql\":\n", + " print(f\"Data: {result['data']}\")\n", + "else:\n", + " print(f\"Job ID: {result['job_id']}\")\n", + " print(f\"Status: {result['status']}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using Natural Language Prompts\n", + "\n", + "Instead of an AgentQL query, you can use a natural language prompt:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = agent.extract(\n", + " url=\"https://example.com/products\",\n", + " prompt=\"Extract all product names, prices, and ratings\",\n", + ")\n", + "print(result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using the HumanPagesClient Directly\n", + "\n", + "For more control, use the `HumanPagesClient` to interact with the Human Pages API directly:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from agentql_humanpages import HumanPagesClient\n", + "\n", + "client = HumanPagesClient()\n", + "\n", + "# Search for available humans\n", + "humans = client.search_humans(skill=\"web task\")\n", + "print(f\"Found {len(humans)} available humans\")\n", + "\n", + "if humans:\n", + " # Create a job\n", + " job = client.create_job(\n", + " human_id=humans[0][\"id\"],\n", + " title=\"Extract product catalog\",\n", + " description=\"Visit example.com/products and extract all product names and prices as JSON.\",\n", + " price_usdc=5.0,\n", + " deadline_hours=24,\n", + " )\n", + " print(f\"Created job: {job['id']}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/humanpages/examples/human_fallback_scraper/README.md b/humanpages/examples/human_fallback_scraper/README.md new file mode 100644 index 00000000..351ccf59 --- /dev/null +++ b/humanpages/examples/human_fallback_scraper/README.md @@ -0,0 +1,43 @@ +# Human Fallback Scraper + +This example extracts company data from Y Combinator's company directory. If AgentQL's automated extraction fails (e.g., due to anti-bot measures), the task is automatically delegated to a human worker on [Human Pages](https://humanpages.ai). + +## Run the script + +- Install the AgentQL Human Pages integration: + +```bash +pip install agentql-humanpages +``` + +- Configure the `AGENTQL_API_KEY` environment variable. You can get your AgentQL API key [here](https://dev.agentql.com/api-keys) + +```bash +export AGENTQL_API_KEY= +``` + +- Configure the `HUMANPAGES_API_KEY` environment variable. You can get your Human Pages API key at [humanpages.ai](https://humanpages.ai) + +```bash +export HUMANPAGES_API_KEY= +``` + +- Run the following command from the project's folder: + +```bash +python3 human_fallback_scraper.py +``` + +## How it works + +1. 
The `HumanFallbackAgent` first attempts to extract data using AgentQL's REST API. +2. If extraction fails (HTTP error, timeout, empty results), the agent automatically: + - Searches for an available human worker on Human Pages + - Creates a job with the extraction task description + - Polls for completion and returns the results +3. The returned result includes a `source` field indicating whether data came from `"agentql"` or `"humanpages"`. + +## Learn more + +- [AgentQL Documentation](https://docs.agentql.com/) +- [Human Pages API](https://humanpages.ai) diff --git a/humanpages/examples/human_fallback_scraper/human_fallback_scraper.py b/humanpages/examples/human_fallback_scraper/human_fallback_scraper.py new file mode 100644 index 00000000..f656fa2a --- /dev/null +++ b/humanpages/examples/human_fallback_scraper/human_fallback_scraper.py @@ -0,0 +1,47 @@ +"""Example: Extract product data with AgentQL, falling back to Human Pages. + +This script demonstrates using the HumanFallbackAgent to extract structured +product data from a web page. If AgentQL cannot extract the data (due to +anti-bot protections, CAPTCHAs, or other issues), the task is automatically +delegated to a human worker on Human Pages. + +Prerequisites: + pip install agentql-humanpages + +Environment variables: + AGENTQL_API_KEY - Your AgentQL API key (https://dev.agentql.com) + HUMANPAGES_API_KEY - Your Human Pages API key (https://humanpages.ai) +""" + +import json + +from agentql_humanpages import HumanFallbackAgent + +# Initialize the agent with both API keys (reads from env vars by default) +agent = HumanFallbackAgent() + +# Define the target URL and the data to extract +url = "https://www.ycombinator.com/companies" +query = """ +{ + companies[] { + name + description + batch + location + } +} +""" + +# Extract data -- AgentQL tries first, Human Pages is the fallback +result = agent.extract(url=url, query=query) + +# Check which source provided the data +if result["source"] == "agentql": + print("Data extracted via AgentQL:") + print(json.dumps(result["data"], indent=2)) +else: + print(f"Data extracted via Human Pages (job ID: {result['job_id']}):") + print(f"Job status: {result['status']['status']}") + for message in result["messages"]: + print(f" Message: {message.get('content', '')}") diff --git a/humanpages/pyproject.toml b/humanpages/pyproject.toml new file mode 100644 index 00000000..3b6d7311 --- /dev/null +++ b/humanpages/pyproject.toml @@ -0,0 +1,72 @@ +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" + +[tool.poetry] +name = "agentql-humanpages" +version = "0.1.0" +description = "An integration package connecting AgentQL and Human Pages for human-in-the-loop fallback" +authors = [] +readme = "README.md" +repository = "https://github.com/tinyfish-io/agentql-integrations" +license = "MIT" + +[tool.mypy] +disallow_untyped_defs = true + +[tool.poetry.urls] +"Source Code" = "https://github.com/tinyfish-io/agentql-integrations/tree/main/humanpages" + +[tool.poetry.dependencies] +python = ">=3.10,<4.0" +agentql = "^1.8.1" +httpx = "^0.28.1" +pydantic = "^2.4.0" + +[tool.ruff.lint] +select = ["E", "F", "I", "T201"] + +[tool.coverage.run] +omit = ["tests/*"] + +[tool.pytest.ini_options] +addopts = "--strict-markers --strict-config --durations=5" +markers = [ + "compile: mark placeholder test used to compile integration tests without running them", +] +asyncio_mode = "auto" + +[tool.poetry.group.test] +optional = true + +[tool.poetry.group.codespell] +optional = true + 
+[tool.poetry.group.test_integration] +optional = true + +[tool.poetry.group.lint] +optional = true + +[tool.poetry.group.dev] +optional = true + +[tool.poetry.group.dev.dependencies] + +[tool.poetry.group.test.dependencies] +pytest = "^7.4.3" +pytest-asyncio = "^0.23.2" +pytest-socket = "^0.7.0" +pytest-watcher = "^0.3.4" +respx = "^0.21.1" + +[tool.poetry.group.codespell.dependencies] +codespell = "^2.2.6" + +[tool.poetry.group.test_integration.dependencies] + +[tool.poetry.group.lint.dependencies] +ruff = "^0.5" + +[tool.poetry.group.typing.dependencies] +mypy = "^1.10" diff --git a/humanpages/tests/__init__.py b/humanpages/tests/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/humanpages/tests/__init__.py @@ -0,0 +1 @@ + diff --git a/humanpages/tests/integration_tests/__init__.py b/humanpages/tests/integration_tests/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/humanpages/tests/integration_tests/__init__.py @@ -0,0 +1 @@ + diff --git a/humanpages/tests/unit_tests/__init__.py b/humanpages/tests/unit_tests/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/humanpages/tests/unit_tests/__init__.py @@ -0,0 +1 @@ + diff --git a/humanpages/tests/unit_tests/test_agent.py b/humanpages/tests/unit_tests/test_agent.py new file mode 100644 index 00000000..45822fe0 --- /dev/null +++ b/humanpages/tests/unit_tests/test_agent.py @@ -0,0 +1,191 @@ +"""Unit tests for the HumanFallbackAgent.""" + +from unittest.mock import patch + +import httpx +import pytest + +from agentql_humanpages.agent import HumanFallbackAgent + + +def _mock_response(status_code: int, json_data: object, method: str = "POST", url: str = "https://api.agentql.com/v1/query-data") -> httpx.Response: + """Create a mock httpx.Response with a request attached.""" + request = httpx.Request(method, url) + return httpx.Response(status_code, json=json_data, request=request) + + +@pytest.fixture() +def agent() -> HumanFallbackAgent: + """Create an agent with test keys.""" + return HumanFallbackAgent( + agentql_api_key="test-agentql-key", + humanpages_api_key="test-hp-key", + poll_interval=0, # No waiting in tests + max_poll_attempts=2, + ) + + +class TestHumanFallbackAgentInit: + def test_init_with_keys(self) -> None: + agent = HumanFallbackAgent( + agentql_api_key="aql-key", + humanpages_api_key="hp-key", + ) + assert agent._agentql_api_key == "aql-key" + + def test_init_from_env(self) -> None: + with patch.dict("os.environ", { + "AGENTQL_API_KEY": "env-aql", + "HUMANPAGES_API_KEY": "env-hp", + }): + agent = HumanFallbackAgent() + assert agent._agentql_api_key == "env-aql" + + def test_init_raises_without_agentql_key(self) -> None: + with patch.dict("os.environ", {}, clear=True): + with pytest.raises(ValueError, match="No AgentQL API key"): + HumanFallbackAgent(humanpages_api_key="hp-key") + + def test_init_raises_without_humanpages_key(self) -> None: + with patch.dict("os.environ", {}, clear=True): + with pytest.raises(ValueError, match="No Human Pages API key"): + HumanFallbackAgent(agentql_api_key="aql-key") + + +class TestExtractWithAgentQL: + def test_extract_success_via_agentql(self, agent: HumanFallbackAgent) -> None: + agentql_response = { + "data": {"products": [{"name": "Widget", "price": 9.99}]}, + "metadata": {"request_id": "abc123"}, + } + resp = _mock_response(200, agentql_response) + + with patch.object(httpx, "post", return_value=resp): + result = agent.extract( + url="https://example.com/products", + query="{ products[] { name price } }", + 
            )
+
+        assert result["source"] == "agentql"
+        assert result["data"]["products"][0]["name"] == "Widget"
+
+    def test_extract_requires_query_or_prompt(self, agent: HumanFallbackAgent) -> None:
+        with pytest.raises(ValueError, match="Exactly one of 'query' or 'prompt'"):
+            agent.extract(url="https://example.com")
+
+
+class TestFallbackToHuman:
+    def test_fallback_on_agentql_http_error(self, agent: HumanFallbackAgent) -> None:
+        """When AgentQL returns an error, the agent falls back to Human Pages."""
+        agentql_error = _mock_response(500, {"error": "Internal Server Error"})
+
+        mock_humans = [{"id": "h1", "name": "Alice"}]
+        mock_job = {"id": "job-1", "status": "pending"}
+        mock_status = {"id": "job-1", "status": "completed"}
+        mock_messages = [{"content": '{"products": [{"name": "Widget"}]}'}]
+
+        hp_client = agent._hp_client
+        with (
+            patch.object(
+                httpx, "post",
+                side_effect=httpx.HTTPStatusError(
+                    "500", request=agentql_error.request, response=agentql_error
+                ),
+            ),
+            patch.object(hp_client, "search_humans", return_value=mock_humans),
+            patch.object(hp_client, "create_job", return_value=mock_job),
+            patch.object(hp_client, "get_job_status", return_value=mock_status),
+            patch.object(hp_client, "get_job_messages", return_value=mock_messages),
+        ):
+            result = agent.extract(
+                url="https://example.com",
+                query="{ products[] { name } }",
+            )
+
+        assert result["source"] == "humanpages"
+        assert result["job_id"] == "job-1"
+        assert len(result["messages"]) == 1
+
+    def test_fallback_on_agentql_empty_data(self, agent: HumanFallbackAgent) -> None:
+        """When AgentQL returns empty data, the agent falls back to Human Pages."""
+        resp = _mock_response(200, {"data": None, "metadata": {}})
+
+        mock_humans = [{"id": "h1", "name": "Bob"}]
+        mock_job = {"id": "job-2", "status": "pending"}
+        mock_status = {"id": "job-2", "status": "completed"}
+        mock_messages = [{"content": "result data"}]
+
+        hp_client = agent._hp_client
+        with (
+            patch.object(httpx, "post", return_value=resp),
+            patch.object(hp_client, "search_humans", return_value=mock_humans),
+            patch.object(hp_client, "create_job", return_value=mock_job),
+            patch.object(hp_client, "get_job_status", return_value=mock_status),
+            patch.object(hp_client, "get_job_messages", return_value=mock_messages),
+        ):
+            result = agent.extract(
+                url="https://example.com",
+                prompt="Get all product names",
+            )
+
+        assert result["source"] == "humanpages"
+
+    def test_fallback_no_humans_available(self, agent: HumanFallbackAgent) -> None:
+        """When no humans are available, a RuntimeError is raised."""
+        resp = _mock_response(200, {"data": None, "metadata": {}})
+
+        hp_client = agent._hp_client
+        with (
+            patch.object(httpx, "post", return_value=resp),
+            patch.object(hp_client, "search_humans", return_value=[]),
+        ):
+            with pytest.raises(RuntimeError, match="No humans are currently available"):
+                agent.extract(
+                    url="https://example.com",
+                    query="{ data }",
+                )
+
+    def test_fallback_job_timeout(self, agent: HumanFallbackAgent) -> None:
+        """When the job does not complete in time, a TimeoutError is raised."""
+        resp = _mock_response(200, {"data": None, "metadata": {}})
+
+        mock_humans = [{"id": "h1", "name": "Carol"}]
+        mock_job = {"id": "job-3", "status": "pending"}
+        mock_status = {"id": "job-3", "status": "in_progress"}
+
+        hp_client = agent._hp_client
+        with (
+            patch.object(httpx, "post", return_value=resp),
+            patch.object(hp_client, "search_humans", return_value=mock_humans),
+            patch.object(hp_client, "create_job", return_value=mock_job),
+            patch.object(hp_client,
"get_job_status", return_value=mock_status), + ): + with pytest.raises(TimeoutError, match="did not complete"): + agent.extract( + url="https://example.com", + query="{ data }", + ) + + def test_fallback_cancelled_job(self, agent: HumanFallbackAgent) -> None: + """When a job is cancelled, results are returned with empty messages.""" + resp = _mock_response(200, {"data": None, "metadata": {}}) + + mock_humans = [{"id": "h1", "name": "Dave"}] + mock_job = {"id": "job-4", "status": "pending"} + mock_status = {"id": "job-4", "status": "cancelled"} + + hp_client = agent._hp_client + with ( + patch.object(httpx, "post", return_value=resp), + patch.object(hp_client, "search_humans", return_value=mock_humans), + patch.object(hp_client, "create_job", return_value=mock_job), + patch.object(hp_client, "get_job_status", return_value=mock_status), + ): + result = agent.extract( + url="https://example.com", + query="{ data }", + ) + + assert result["source"] == "humanpages" + assert result["status"]["status"] == "cancelled" + assert result["messages"] == [] diff --git a/humanpages/tests/unit_tests/test_client.py b/humanpages/tests/unit_tests/test_client.py new file mode 100644 index 00000000..2dc4e3ac --- /dev/null +++ b/humanpages/tests/unit_tests/test_client.py @@ -0,0 +1,120 @@ +"""Unit tests for the HumanPagesClient.""" + +from unittest.mock import patch + +import httpx +import pytest + +from agentql_humanpages.client import HumanPagesClient +from agentql_humanpages.messages import ( + HUMANPAGES_UNAUTHORIZED_ERROR, + UNSET_HUMANPAGES_API_KEY_ERROR, +) + + +def _mock_response(status_code: int, json_data: object, method: str = "GET", url: str = "https://humanpages.ai") -> httpx.Response: + """Create a mock httpx.Response with a request attached.""" + request = httpx.Request(method, url) + return httpx.Response(status_code, json=json_data, request=request) + + +class TestHumanPagesClientInit: + def test_init_with_api_key(self) -> None: + client = HumanPagesClient(api_key="test-key-123") + assert client._api_key == "test-key-123" + + def test_init_from_env(self) -> None: + with patch.dict("os.environ", {"HUMANPAGES_API_KEY": "env-key-456"}): + client = HumanPagesClient() + assert client._api_key == "env-key-456" + + def test_init_raises_without_key(self) -> None: + with patch.dict("os.environ", {}, clear=True): + with pytest.raises(ValueError, match="No Human Pages API key"): + HumanPagesClient() + + def test_init_custom_base_url(self) -> None: + client = HumanPagesClient(api_key="k", base_url="https://custom.example.com/") + assert client._base_url == "https://custom.example.com" + + +class TestHumanPagesClientSearchHumans: + def test_search_humans_success(self) -> None: + mock_humans = [{"id": "h1", "name": "Alice", "skills": ["web task"]}] + resp = _mock_response(200, mock_humans) + + client = HumanPagesClient(api_key="test-key") + with patch.object(httpx, "get", return_value=resp): + result = client.search_humans(skill="web task") + + assert result == mock_humans + + def test_search_humans_unauthorized(self) -> None: + resp = _mock_response(401, {"error": "unauthorized"}) + + client = HumanPagesClient(api_key="bad-key") + with patch.object(httpx, "get", side_effect=httpx.HTTPStatusError( + "401", request=resp.request, response=resp + )): + with pytest.raises(ValueError, match="Invalid Human Pages API key"): + client.search_humans() + + +class TestHumanPagesClientCreateJob: + def test_create_job_success(self) -> None: + mock_job = {"id": "job-1", "status": "pending"} + resp = _mock_response(200, 
mock_job, method="POST") + + client = HumanPagesClient(api_key="test-key") + with patch.object(httpx, "post", return_value=resp): + result = client.create_job( + human_id="h1", + title="Extract data", + description="Get product info from example.com", + ) + + assert result == mock_job + + def test_create_job_with_custom_params(self) -> None: + mock_job = {"id": "job-2", "status": "pending"} + resp = _mock_response(200, mock_job, method="POST") + + client = HumanPagesClient(api_key="test-key") + with patch.object(httpx, "post", return_value=resp) as mock_post: + client.create_job( + human_id="h1", + title="Extract data", + description="Get data", + price_usdc=10.0, + deadline_hours=48, + ) + + call_kwargs = mock_post.call_args + payload = call_kwargs.kwargs["json"] + assert payload["priceUsdc"] == 10.0 + assert payload["deadlineHours"] == 48 + + +class TestHumanPagesClientJobStatus: + def test_get_job_status(self) -> None: + mock_status = {"id": "job-1", "status": "completed"} + resp = _mock_response(200, mock_status) + + client = HumanPagesClient(api_key="test-key") + with patch.object(httpx, "get", return_value=resp): + result = client.get_job_status("job-1") + + assert result["status"] == "completed" + + +class TestHumanPagesClientJobMessages: + def test_get_job_messages(self) -> None: + mock_messages = [{"id": "m1", "content": "Done!", "sender": "human"}] + resp = _mock_response(200, mock_messages) + + client = HumanPagesClient(api_key="test-key") + with patch.object(httpx, "get", return_value=resp): + result = client.get_job_messages("job-1") + + assert len(result) == 1 + assert result[0]["content"] == "Done!"
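+
+
+# The async methods (asearch_humans, acreate_job, aget_job_status, aget_job_messages)
+# are not exercised above. A possible sketch of an async unit test, stubbing the HTTP
+# layer with respx (already declared in the test dependencies); the URL mirrors
+# HUMANPAGES_BASE_URL and SEARCH_HUMANS_ENDPOINT from const.py, and pytest-asyncio's
+# asyncio_mode = "auto" (pyproject.toml) picks up the async test function.
+class TestHumanPagesClientAsync:
+    async def test_asearch_humans_success(self) -> None:
+        # In a real suite this import would live with the others at the top of the module.
+        import respx
+
+        mock_humans = [{"id": "h1", "name": "Alice", "skills": ["web task"]}]
+        with respx.mock:
+            respx.get("https://humanpages.ai/api/humans/search").mock(
+                return_value=httpx.Response(200, json=mock_humans)
+            )
+            client = HumanPagesClient(api_key="test-key")
+            result = await client.asearch_humans(skill="web task")
+
+        assert result == mock_humans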