-
-
Notifications
You must be signed in to change notification settings - Fork 213
[ENH] experimental design for model indexing framework #1583
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
20e37a2
d79fbe5
937f03e
dc15c66
91c138c
778b87e
abc0f49
cd4495e
5a6161f
f6f050a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
| """Global get dispatch utility.""" | ||
|
|
||
| # currently just a forward to models | ||
| # to discuss and possibly | ||
| # todo: add global get utility here | ||
| # in general, e.g., datasets will not have same name as models etc | ||
| from __future__ import annotations | ||
|
|
||
| from openml.models import get | ||
|
|
||
| __all__ = ["get"] |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,6 @@ | ||
| """Module of base classes.""" | ||
|
|
||
| from openml.base._base import OpenMLBase | ||
| from openml.base._base_pkg import _BasePkg | ||
|
|
||
| __all__ = ["_BasePkg", "OpenMLBase"] |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,118 @@ | ||
| """Base Packager class.""" | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import inspect | ||
| import sys | ||
| import textwrap | ||
| from pathlib import Path | ||
|
|
||
| from skbase.base import BaseObject | ||
| from skbase.utils.dependencies import _check_estimator_deps | ||
|
|
||
|
|
||
class _BasePkg(BaseObject):
    """Base class for package objects that can be materialized and serialized.

    Subclasses describe themselves through ``_tags`` and implement
    ``_materialize``. ``materialize`` checks dependencies first and then
    delegates to ``_materialize``; ``serialize`` returns the source code of the
    concrete class, optionally compressed.
    """

    _tags = {
        "python_dependencies": None,
        "python_version": None,
        # package register and manifest
        "pkg_id": None,  # object id contained, "__multiple" if multiple
        "pkg_obj": "reference",  # or "code"
        "pkg_obj_type": None,  # openml API type
        "pkg_compression": "zlib",  # compression
        "pkg_pypi_name": None,  # PyPI package name of objects
    }

    def __init__(self):
        super().__init__()

    def materialize(self):
        """Check dependencies, then return the result of ``_materialize``.

        Raises
        ------
        ModuleNotFoundError
            if the dependency check of this package fails
        """
        try:
            _check_estimator_deps(obj=self)
        except ModuleNotFoundError as e:
            # prettier message, so the reference is to the pkg_id
            # currently, we cannot simply pass the object name to skbase
            # in the error message, so this is a hack:
            # the first 11 characters of the skbase message are dropped
            # todo: fix this in scikit-base
            msg = str(e)
            if len(msg) > 11:
                msg = msg[11:]
            raise ModuleNotFoundError(msg) from e

        return self._materialize()

    def _materialize(self):
        """Return the packaged object; must be overridden by subclasses."""
        # NotImplementedError subclasses RuntimeError, so callers that
        # catch RuntimeError keep working.
        raise NotImplementedError("abstract method")

    def serialize(self):
        """Return the source of this object's class, optionally compressed.

        Returns
        -------
        str or bytes
            the class source as ``str`` if the ``pkg_compression`` tag is
            ``None`` or ``"None"``; otherwise the return value of
            ``compress`` from the module named by the tag, applied to the
            UTF-8 encoded source.
        """
        # local import keeps this method self-contained
        import importlib

        cls_str = class_to_source(type(self))
        compress_method = self.get_tag("pkg_compression")
        if compress_method in [None, "None"]:
            return cls_str

        # Resolve the compression module by name instead of building code
        # strings for exec/eval (raised in review): nothing is executed as
        # code, and it does not rely on exec binding a local name that a
        # later eval can see, which is not guaranteed inside a function.
        module = importlib.import_module(compress_method)
        return module.compress(cls_str.encode("utf-8"))
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Couldn't we just use importlib for this? Something like this: module = importlib.import_module(compress_method)
cls_bytes = cls_str.encode("utf-8")
return module.compress(cls_bytes) |
||
|
|
||
|
|
||
def _has_source(obj) -> bool:
    """Return True if ``obj`` belongs to an imported module backed by a ``.py`` file.

    This is a cheap pre-check only: it does not guarantee that
    ``inspect.getsource(obj)`` succeeds (e.g., for a class created with
    ``type(...)`` inside a regular module).
    """
    module_name = getattr(obj, "__module__", None)
    if not module_name or module_name not in sys.modules:
        return False

    file = getattr(sys.modules[module_name], "__file__", None)
    if not file:
        return False

    return Path(file).suffix == ".py"


def _source_or_none(obj):
    """Return the dedented source of ``obj``, or None if it is not retrievable."""
    if not _has_source(obj):
        return None
    try:
        return textwrap.dedent(inspect.getsource(obj))
    except (OSError, TypeError):
        # inspect could not locate a definition, e.g., dynamically built object
        return None


def class_to_source(cls) -> str:
    """Return full source definition of python class as string.

    Parameters
    ----------
    cls : class to serialize

    Returns
    -------
    str : complete definition of cls, as str.
        Imports are not contained or serialized.
        If the source cannot be retrieved via ``inspect``, a definition is
        reconstructed from the non-dunder entries of ``cls.__dict__``.
    """
    # Fast path: class has retrievable source
    source = _source_or_none(cls)
    if source is not None:
        return source

    # Fallback for dynamically created classes
    bases = [base.__name__ for base in cls.__bases__ if base is not object]
    base_str = f"({', '.join(bases)})" if bases else ""
    lines = [f"class {cls.__name__}{base_str}:"]

    for name, value in cls.__dict__.items():
        if name.startswith("__") and name.endswith("__"):
            continue

        if inspect.isfunction(value):
            method_src = _source_or_none(value)
            if method_src is None:
                # no source available: emit a stub with the same name
                lines.append(f"    def {name}(self): ...")
            else:
                lines.append(textwrap.indent(method_src, "    "))
        else:
            lines.append(f"    {name} = {value!r}")

    # only the header was emitted, so the class body would be empty
    if len(lines) == 1:
        lines.append("    pass")

    return "\n".join(lines)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| """Module with packaging adapters.""" | ||
|
|
||
| from openml.models._get import get | ||
|
|
||
| __all__ = ["get"] |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,62 @@ | ||
| """Model retrieval utility.""" | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| from functools import lru_cache | ||
|
|
||
|
|
||
def get(id: str):
    """Look up a packaged model by its identifier and materialize it.

    Parameters
    ----------
    id : str
        unique identifier of the object to retrieve

    Returns
    -------
    object
        result of calling ``materialize`` on the package registered under ``id``

    Raises
    ------
    ValueError
        if no package is registered under ``id``
    ModuleNotFoundError
        if dependencies of object to retrieve are not satisfied
    """
    pkg_cls = _id_lookup().get(id)
    if pkg_cls is not None:
        package = pkg_cls(id)
        return package.materialize()

    raise ValueError(f"Error in openml.get, object with package id {id} does not exist.")
|
|
||
|
|
||
| # todo: need to generalize this later to more types | ||
| # currently intentionally retrieves only classifiers | ||
| # todo: replace this, optionally, by database backend | ||
def _id_lookup(obj_type=None):
    """Return a copy of the cached id-to-package lookup.

    A copy is handed out so that callers cannot mutate the cached dict.
    """
    cached_lookup = _id_lookup_cached(obj_type=obj_type)
    return cached_lookup.copy()
|
|
||
|
|
||
@lru_cache
def _id_lookup_cached(obj_type=None):
    """Build the mapping from package id to package class (cached).

    Classes whose ``pkg_id`` class tag is ``"__multiple"`` are registered
    once per id returned by their ``contained_ids``; all other classes are
    registered under their ``pkg_id`` tag.
    """
    registry = {}

    for pkg_cls in _all_objects(obj_type=obj_type):
        pkg_id = pkg_cls.get_class_tag("pkg_id")

        if pkg_id == "__multiple":
            for contained_id in pkg_cls.contained_ids():
                registry[contained_id] = pkg_cls
        else:
            registry[pkg_id] = pkg_cls

    return registry
|
|
||
|
|
||
@lru_cache
def _all_objects(obj_type=None):
    """Collect the classifier package classes found in ``openml`` (cached).

    ``obj_type`` is accepted but not used: the lookup is always restricted
    to ``_ModelPkgClassifier``.
    """
    # imported here rather than at module level, as in the original
    from skbase.lookup import all_objects

    from openml.models.apis._classifier import _ModelPkgClassifier

    lookup_kwargs = {
        "object_types": _ModelPkgClassifier,
        "package_name": "openml",
        "return_names": False,
    }
    return all_objects(**lookup_kwargs)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| """Module with packaging adapters.""" | ||
|
|
||
| from openml.models.apis._classifier import _ModelPkgClassifier | ||
|
|
||
| __all__ = ["_ModelPkgClassifier"] |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,25 @@ | ||
| """Base package for sklearn classifiers.""" | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| from openml.models.base import _OpenmlModelPkg | ||
|
|
||
|
|
||
class _ModelPkgClassifier(_OpenmlModelPkg):
    """Package base class for objects of API type ``"classifier"``."""

    # tags specific to API type
    _tags = {"pkg_obj_type": "classifier"}

    def get_obj_tags(self):
        """Return tags of the object as a dictionary.

        Currently a placeholder that always returns an empty dict.
        """
        return {}  # this needs to be implemented

    def get_obj_param_names(self):
        """Return parameter names of the object as a list.

        The materialized object is called without arguments and the keys of
        its ``get_params()`` result are returned.

        Returns
        -------
        list: names of object parameters
        """
        obj_cls = self.materialize()
        instance = obj_cls()
        params = instance.get_params()
        return list(params.keys())
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| """Module with packaging adapters.""" | ||
|
|
||
| from openml.models.base._base import _OpenmlModelPkg | ||
|
|
||
| __all__ = ["_OpenmlModelPkg"] |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,65 @@ | ||
| """Base model package class.""" | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| from openml.base import _BasePkg | ||
|
|
||
|
|
||
class _OpenmlModelPkg(_BasePkg):
    """Base class for model packages.

    A package holds its payload in ``_obj``. How the payload is turned into
    an object is selected by the ``pkg_obj`` tag:

    * ``"reference"``: ``_obj`` is passed to skbase ``_safe_import``
    * ``"code"``: ``_obj`` is python source that is executed and must bind
      the name ``obj``

    Packages whose ``pkg_id`` tag is ``"__multiple"`` keep several payloads
    in ``_obj_dict``, keyed by id.
    """

    _obj = None
    # intended to be overridden by subclasses, not mutated in place
    _obj_dict = {}

    def __init__(self, id=None):
        super().__init__()

        pkg_id = self.get_tag("pkg_id")
        if pkg_id == "__multiple":
            # select the payload for the requested id; None if unknown
            self._obj = self._obj_dict.get(id, None)

    @classmethod
    def contained_ids(cls):
        """Return list of ids of objects contained in this package.

        Returns
        -------
        ids : list of str
            list of unique identifiers of objects contained in this package
        """
        pkg_id = cls.get_class_tag("pkg_id")
        if pkg_id != "__multiple":
            return [pkg_id]
        return list(cls._obj_dict.keys())

    def _materialize(self):
        """Return the object described by ``_obj``, according to tag ``pkg_obj``.

        Raises
        ------
        ValueError
            if ``_obj`` is None, or if the ``pkg_obj`` tag has an unsupported value
        NameError
            if ``pkg_obj`` is ``"code"`` and the executed source does not
            bind the name ``obj``
        """
        pkg_obj = self.get_tag("pkg_obj")

        _obj = self._obj

        if _obj is None:
            raise ValueError(
                "Error in materialize. "
                "Either _materialize must be implemented, or "
                "the _obj attribute must be not None."
            )

        if pkg_obj == "reference":
            from skbase.utils.dependencies import _safe_import

            pkg_name = self.get_tag("pkg_pypi_name")

            return _safe_import(_obj, pkg_name=pkg_name)

        if pkg_obj == "code":
            # Review asked where ``obj`` is defined: names bound by exec
            # inside a function are not visible as locals afterwards, so the
            # source is run in an explicit namespace and ``obj`` is read
            # from it. The namespace starts as a copy of the module globals
            # so the source resolves the same names as before.
            # SECURITY NOTE: this executes the stored source as-is; it must
            # only ever be used with trusted package definitions.
            namespace = dict(globals())
            exec(_obj, namespace)
            if "obj" not in namespace:
                raise NameError(
                    'Error in materialize, package source with pkg_obj "code" '
                    'must define the name "obj".'
                )
            return namespace["obj"]

        # elif pkg_obj == "craft":
        #    identify and call appropriate craft method

        raise ValueError(
            'Error in package tag "pkg_obj", '
            'must be one of "reference", "code", "craft", '
            f"but found value {pkg_obj}, of type {type(pkg_obj)}"
        )
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| """Sklearn classification models.""" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,15 @@ | ||
| """Auto-sklearn classifier.""" | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| from openml.models.apis import _ModelPkgClassifier | ||
|
|
||
|
|
||
class OpenmlPkg__AutoSklearnClassifier(_ModelPkgClassifier):
    """Package entry registered under the id ``"AutoSklearnClassifier"``."""

    _tags = {
        # lookup key of this package
        "pkg_id": "AutoSklearnClassifier",
        # dependency and PyPI name both refer to auto-sklearn
        "python_dependencies": "auto-sklearn",
        "pkg_pypi_name": "auto-sklearn",
    }

    # dotted location of the packaged object
    _obj = "autosklearn.classification.AutoSklearnClassifier"
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This just selects the compression algorithm, right?