Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 34 additions & 86 deletions scrapegraphai/telemetry/telemetry.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,3 @@
"""
This module contains code that relates to sending ScrapeGraphAI usage telemetry.

To disable sending telemetry there are three ways:

1. Set it to false programmatically in your driver:
>>> from scrapegraphai import telemetry
>>> telemetry.disable_telemetry()
2. Set it to `false` in ~/.scrapegraphai.conf under `DEFAULT`
[DEFAULT]
telemetry_enabled = False
3. Set SCRAPEGRAPHAI_TELEMETRY_ENABLED=false as an environment variable:
SCRAPEGRAPHAI_TELEMETRY_ENABLED=false python run.py
or:
export SCRAPEGRAPHAI_TELEMETRY_ENABLED=false
"""

import configparser
import functools
import importlib.metadata
Expand All @@ -27,17 +10,19 @@
from typing import Callable, Dict
from urllib import request

# Load version
VERSION = importlib.metadata.version("scrapegraphai")
STR_VERSION = ".".join([str(i) for i in VERSION])
HOST = "https://eu.i.posthog.com"
TRACK_URL = f"{HOST}/capture/" # https://posthog.com/docs/api/post-only-endpoints
API_KEY = "phc_orsfU4aHhtpTSLVcUE2hdUkQDLM4OEQZndKGFBKMEtn"

# Proxy service endpoint that receives telemetry events (used instead of posting to PostHog directly)
PROXY_URL = "https://scrapegraph-proxy.onrender.com/capture/"

TIMEOUT = 2
DEFAULT_CONFIG_LOCATION = os.path.expanduser("~/.scrapegraphai.conf")

logger = logging.getLogger(__name__)


# Everything below remains mostly the same
def _load_config(config_location: str) -> configparser.ConfigParser:
config = configparser.ConfigParser()
try:
Expand All @@ -59,28 +44,22 @@ def _load_config(config_location: str) -> configparser.ConfigParser:
return config


def _check_config_and_environ_for_telemetry_flag(
telemetry_default: bool, config_obj: configparser.ConfigParser
) -> bool:
telemetry_enabled = telemetry_default
def _check_config_and_environ_for_telemetry_flag(default_value: bool, config_obj):
telemetry_enabled = default_value
if "telemetry_enabled" in config_obj["DEFAULT"]:
try:
telemetry_enabled = config_obj.getboolean("DEFAULT", "telemetry_enabled")
except ValueError as e:
logger.debug(
f"""Unable to parse value for
`telemetry_enabled` from config. Encountered {e}"""
)
except Exception:
pass

if os.environ.get("SCRAPEGRAPHAI_TELEMETRY_ENABLED") is not None:
env_value = os.environ.get("SCRAPEGRAPHAI_TELEMETRY_ENABLED")
config_obj["DEFAULT"]["telemetry_enabled"] = env_value
try:
telemetry_enabled = config_obj.getboolean("DEFAULT", "telemetry_enabled")
except ValueError as e:
logger.debug(
f"""Unable to parse value for `SCRAPEGRAPHAI_TELEMETRY_ENABLED`
from environment. Encountered {e}"""
telemetry_enabled = config_obj.getboolean(
"DEFAULT", "telemetry_enabled"
)
except Exception:
pass
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Environment variable telemetry override is completely broken

The environment variable SCRAPEGRAPHAI_TELEMETRY_ENABLED check is broken. The code checks if the variable is set (os.environ.get(...) is not None) but then reads from config_obj instead of using the actual environment variable value. The original code first assigned the env var value into config_obj["DEFAULT"]["telemetry_enabled"] = env_value before reading it back, but this assignment was removed. Now setting SCRAPEGRAPHAI_TELEMETRY_ENABLED=false won't actually disable telemetry since the env var value is never used.

Fix in Cursor Fix in Web


return telemetry_enabled


Expand All @@ -90,87 +69,70 @@ def _check_config_and_environ_for_telemetry_flag(
CALL_COUNTER = 0
MAX_COUNT_SESSION = 1000


BASE_PROPERTIES = {
"os_type": os.name,
"os_version": platform.platform(),
"python_version": f"{platform.python_version()}/{platform.python_implementation()}",
"distinct_id": g_anonymous_id,
"scrapegraphai_version": VERSION,
"telemetry_version": "0.0.3",
"telemetry_version": "0.0.4-proxy",
}


def disable_telemetry():
"""
function for disabling telemetry
"""
global g_telemetry_enabled
g_telemetry_enabled = False


def is_telemetry_enabled() -> bool:
"""
function for checking if telemetry is enabled
"""
if g_telemetry_enabled:
global CALL_COUNTER
if CALL_COUNTER == 0:
logger.debug(
"Note: ScrapeGraphAI collects anonymous usage data to improve the library. "
"You can disable telemetry by setting SCRAPEGRAPHAI_TELEMETRY_ENABLED=false or "
"by editing ~/.scrapegraphai.conf."
)
CALL_COUNTER += 1
if CALL_COUNTER > MAX_COUNT_SESSION:
return False
return True
else:
return False
return False


# ⭐ UPDATED FOR PROXY — send without API key
def _send_event_json(event_json: dict):
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {API_KEY}",
"User-Agent": f"scrapegraphai/{STR_VERSION}",
}
try:
data = json.dumps(event_json).encode()
req = request.Request(TRACK_URL, data=data, headers=headers)
req = request.Request(PROXY_URL, data=data, headers=headers)

with request.urlopen(req, timeout=TIMEOUT) as f:
res = f.read()
response_body = f.read()
if f.code != 200:
raise RuntimeError(res)
raise RuntimeError(response_body)
except Exception as e:
logger.debug(f"Failed to send telemetry data: {e}")
logger.debug(f"Failed to send telemetry data to proxy: {e}")
else:
logger.debug(f"Telemetry data sent: {data}")
logger.debug(f"Telemetry payload forwarded to proxy: {data}")


def send_event_json(event_json: dict):
"""
function for sending event json
"""
if not g_telemetry_enabled:
raise RuntimeError("Telemetry tracking is disabled!")
try:
th = threading.Thread(target=_send_event_json, args=(event_json,))
th.start()
except Exception as e:
logger.debug(f"Failed to send telemetry data in a thread: {e}")
logger.debug(f"Telemetry dispatch thread failed: {e}")


def log_event(event: str, properties: Dict[str, any]):
"""
function for logging the events
"""
if is_telemetry_enabled():
event_json = {
"api_key": API_KEY,
payload = {
"event": event,
"distinct_id": g_anonymous_id,
"properties": {**BASE_PROPERTIES, **properties},
}
send_event_json(event_json)
send_event_json(payload)


def log_graph_execution(
Expand All @@ -188,10 +150,7 @@ def log_graph_execution(
exception: str = None,
total_tokens: int = None,
):
"""
function for logging the graph execution
"""
properties = {
props = {
"graph_name": graph_name,
"source": source,
"prompt": prompt,
Expand All @@ -207,26 +166,15 @@ def log_graph_execution(
"total_tokens": total_tokens,
"type": "community-library",
}
log_event("graph_execution", properties)
log_event("graph_execution", props)


def capture_function_usage(call_fn: Callable) -> Callable:
"""
function that captures the usage
"""

@functools.wraps(call_fn)
def wrapped_fn(*args, **kwargs):
try:
return call_fn(*args, **kwargs)
finally:
if is_telemetry_enabled():
try:
function_name = call_fn.__name__
log_event("function_usage", {"function_name": function_name})
except Exception as e:
logger.debug(
f"Failed to send telemetry for function usage. Encountered: {e}"
)

return wrapped_fn
log_event("function_usage", {"function_name": call_fn.__name__})
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Removed error handling exposes race condition in decorator

The capture_function_usage decorator no longer wraps the log_event call in a try/except block. send_event_json raises RuntimeError when telemetry is disabled, and there is a race condition: g_telemetry_enabled could be set to False (via disable_telemetry()) between the is_telemetry_enabled() check and the send_event_json call. Any such exception now propagates out of the finally block, where it replaces the decorated function's return value or masks the exception it raised. The original code caught these errors and only logged them at debug level.

Fix in Cursor Fix in Web

return wrapped_fn
Loading