diff --git a/hed/__init__.py b/hed/__init__.py index 107ce4ad..430800b7 100644 --- a/hed/__init__.py +++ b/hed/__init__.py @@ -1,7 +1,7 @@ from hed.models.hed_string import HedString from hed.models.hed_tag import HedTag from hed.errors.error_reporter import get_printable_issue_string -from hed.errors.exceptions import HedFileError, HedExceptions +from hed.errors.exceptions import HedFileError, HedExceptions, HedQueryError from hed.models.base_input import BaseInput from hed.models.spreadsheet_input import SpreadsheetInput diff --git a/hed/errors/exceptions.py b/hed/errors/exceptions.py index d63c78b6..e77486e4 100644 --- a/hed/errors/exceptions.py +++ b/hed/errors/exceptions.py @@ -66,3 +66,12 @@ def __init__(self, code, message, filename, issues=None): self.issues = issues if self.issues is None: self.issues = [] + + +class HedQueryError(ValueError): + """Exception raised when a HED query string cannot be parsed. + + Inherits from :class:`ValueError` so that existing ``except ValueError`` handlers + continue to work, while allowing callers that need finer-grained control to + catch only query parse errors with ``except HedQueryError``. + """ diff --git a/hed/models/__init__.py b/hed/models/__init__.py index c176e8b6..f2dea62d 100644 --- a/hed/models/__init__.py +++ b/hed/models/__init__.py @@ -1,4 +1,51 @@ -"""Data structures for HED tag handling.""" +"""HED data models: strings, tags, groups, inputs, queries, and definitions. + +This module provides the core data structures used to represent, validate, and +transform HED-annotated data. A loaded :class:`~hed.schema.HedSchema` (from +``hed.schema``) is typically passed in when constructing these objects. 
+ +Typical usage +------------- +Parse and validate a raw HED string:: + + from hed.schema import load_schema_version + from hed.models import HedString + + schema = load_schema_version("8.3.0") + hs = HedString("Sensory-event, (Action, Move/Flexion)", schema) + issues = hs.validate(schema) + +Load a BIDS events file with a sidecar:: + + from hed.models import TabularInput, Sidecar + + sidecar = Sidecar("task-rest_events.json", name="MySidecar") + events = TabularInput("sub-01_task-rest_events.tsv", sidecar=sidecar) + issues = events.validate(schema) + +Search HED annotations with a query:: + + from hed.models import QueryHandler + + query = QueryHandler("Sensory-event && Action") + matches = query.search(hs) + +Key exports +----------- +- :class:`HedString` — a parsed HED annotation string (root of the parse tree). +- :class:`HedTag` — a single HED tag with schema linkage and canonical form. +- :class:`HedGroup` — a parenthesised group of tags and nested groups. +- :class:`TabularInput` — a BIDS-style TSV events file with optional sidecar. +- :class:`Sidecar` — a BIDS JSON sidecar mapping column values to HED strings. +- :class:`SpreadsheetInput` — an Excel / TSV spreadsheet with HED columns. +- :class:`TimeseriesInput` — a continuous time-series file with HED annotations. +- :class:`DefinitionDict` — a collection of resolved HED Def/Def-expand definitions. +- :class:`QueryHandler` — compile and execute queries against HED strings. +- :func:`get_query_handlers` / :func:`search_hed_objs` — convenience helpers for + batch querying. +- :func:`convert_to_form`, :func:`shrink_defs`, :func:`expand_defs`, + :func:`process_def_expands` — DataFrame-level HED transformation utilities. 
+""" from .base_input import BaseInput from .column_mapper import ColumnMapper diff --git a/hed/models/column_mapper.py b/hed/models/column_mapper.py index ddeb5663..57bfc378 100644 --- a/hed/models/column_mapper.py +++ b/hed/models/column_mapper.py @@ -9,6 +9,7 @@ import copy from collections import Counter +from functools import partial PANDAS_COLUMN_PREFIX_TO_IGNORE = "Unnamed: " NO_WARN_COLUMNS = ["onset", "duration"] @@ -110,20 +111,14 @@ def get_transformers(self): if isinstance(assign_to_column, int): if self._column_map: assign_to_column = self._column_map[assign_to_column] - else: - assign_to_column = assign_to_column if column.column_type == ColumnType.Ignore: continue elif column.column_type == ColumnType.Value: value_str = column.hed_dict - from functools import partial - final_transformers[assign_to_column] = partial(self._value_handler, value_str) elif column.column_type == ColumnType.Categorical: need_categorical.append(column.column_name) category_values = column.hed_dict - from functools import partial - final_transformers[assign_to_column] = partial(self._category_handler, category_values) else: final_transformers[assign_to_column] = lambda x: x diff --git a/hed/models/def_expand_gather.py b/hed/models/def_expand_gather.py index 883cd141..adca41ee 100644 --- a/hed/models/def_expand_gather.py +++ b/hed/models/def_expand_gather.py @@ -32,7 +32,10 @@ def add_def(self, def_tag, def_expand_group): def_extension = def_tag.extension.split("/") existing_contents = self.actual_contents.get(def_extension[1], None) if existing_contents and existing_contents != orig_group: - raise ValueError("Invalid Definition") + raise ValueError( + f"Definition '{def_extension[0]}' has conflicting contents for value '{def_extension[1]}': " + f"existing={existing_contents} vs new={orig_group}" + ) elif existing_contents: return self.actual_contents[def_extension[1]] = orig_group.copy() @@ -42,14 +45,20 @@ def add_def(self, def_tag, def_expand_group): tag for tag in 
orig_group.get_all_tags() if tag.extension == def_extension[1] and tag.is_takes_value_tag() ] if len(matching_tags) == 0: - raise ValueError("Invalid Definition") + raise ValueError( + f"Definition '{def_extension[0]}': no takes-value tag with extension '{def_extension[1]}' " + f"found in group {orig_group}" + ) matching_names = {tag.short_base_tag for tag in matching_tags} if self.matching_names is not None: self.matching_names = self.matching_names & matching_names else: self.matching_names = matching_names if len(self.matching_names) == 0: - raise ValueError("Invalid Definition") + raise ValueError( + f"Definition '{def_extension[0]}': no tag name is consistently the takes-value tag across " + f"all observed values — candidate names were {matching_names}" + ) def resolve_definition(self): """Try to resolve the definition based on the information available. @@ -84,7 +93,10 @@ def resolve_definition(self): self.resolved_definition = candidate_contents return True if len(candidate_tags) == 0 or (1 < len(candidate_tags) < len(tuple_list)): - raise ValueError("Invalid Definition") + raise ValueError( + f"Definition '{self.def_tag_name}': could not resolve a unique takes-value tag — " + f"found {len(candidate_tags)} candidate(s) across {len(tuple_list)} value(s)" + ) return False def get_definition_string(self): diff --git a/hed/models/hed_group.py b/hed/models/hed_group.py index 87de6948..bf80dd9a 100644 --- a/hed/models/hed_group.py +++ b/hed/models/hed_group.py @@ -133,7 +133,10 @@ def remove(self, items_to_remove: Iterable[Union[HedTag, "HedGroup"]]): item._parent = None def __copy__(self): - raise ValueError("Cannot make shallow copies of HedGroups") + raise copy.Error( + "Shallow copy of HedGroup is not supported: _parent pointers would alias the original. " + "Use .copy() for a deep copy." + ) def copy(self) -> "HedGroup": """Return a deep copy of this group. 
diff --git a/hed/models/hed_string.py b/hed/models/hed_string.py index 678e9bd7..4cdd4577 100644 --- a/hed/models/hed_string.py +++ b/hed/models/hed_string.py @@ -34,6 +34,17 @@ def __init__(self, hed_string, hed_schema, def_dict=None, _contents=None): try: contents = self.split_into_groups(hed_string, hed_schema, def_dict) except ValueError: + # ValueError is raised by split_into_groups for structurally malformed + # strings (mismatched or misordered parentheses). Rather than raising + # here, we fall back to an empty parse tree so that the object can be + # passed to the validator, which will independently detect and report + # the structural error through check_count_tag_group_parentheses / + # check_delimiter_issues_in_hed_string on the raw string. + # + # Callers that construct HedString without running it through + # HedValidator will receive an empty children list with no error + # indication. Always validate after construction if correctness is + # required. contents = [] super().__init__(hed_string, contents=contents, startpos=0, endpos=len(hed_string)) self._schema = hed_schema diff --git a/hed/models/query_handler.py b/hed/models/query_handler.py index c6e3ebed..e4473b2f 100644 --- a/hed/models/query_handler.py +++ b/hed/models/query_handler.py @@ -12,6 +12,7 @@ ExpressionExactMatch, ) from hed.models.query_util import Token +from hed.errors.exceptions import HedQueryError class QueryHandler: @@ -76,7 +77,7 @@ def _get_next_token(self): """Returns the current token and advances the counter""" self.at_token += 1 if self.at_token >= len(self.tokens): - raise ValueError("Parse error in get next token") + raise HedQueryError("Parse error in get next token") return self.tokens[self.at_token] def _next_token_is(self, kinds): @@ -94,7 +95,7 @@ def _parse(self, expression_string): expr = self._handle_or_op() if self.at_token + 1 != len(self.tokens): - raise ValueError("Parse error in search string") + raise HedQueryError("Parse error in search string") return 
expr @@ -137,7 +138,7 @@ def _handle_negation(self): if next_token == Token.LogicalNegation: interior = self._handle_grouping_op() if "?" in str(interior): - raise ValueError( + raise HedQueryError( "Cannot negate wildcards, or expressions that contain wildcards." "Use {required_expression : optional_expression}." ) @@ -152,13 +153,13 @@ def _handle_grouping_op(self): expr = self._handle_or_op() next_token = self._next_token_is([Token.LogicalGroupEnd]) if next_token != Token.LogicalGroupEnd: - raise ValueError("Parse error: Missing closing paren") + raise HedQueryError("Parse error: Missing closing paren") elif next_token == Token.DescendantGroup: interior = self._handle_or_op() expr = ExpressionDescendantGroup(next_token, right=interior) next_token = self._next_token_is([Token.DescendantGroupEnd]) if next_token != Token.DescendantGroupEnd: - raise ValueError("Parse error: Missing closing square bracket") + raise HedQueryError("Parse error: Missing closing square bracket") elif next_token == Token.ExactMatch: interior = self._handle_or_op() expr = ExpressionExactMatch(next_token, right=interior) @@ -172,14 +173,14 @@ def _handle_grouping_op(self): expr.left = optional_portion next_token = self._next_token_is([Token.ExactMatchEnd]) if "~" in str(expr): - raise ValueError( + raise HedQueryError( "Cannot use negation in exact matching groups," " as it's not clear what is being matched.\n" "{thing and ~(expression)} is allowed." ) if next_token is None: - raise ValueError("Parse error: Missing closing curly bracket") + raise HedQueryError("Parse error: Missing closing curly bracket") else: next_token = self._get_next_token() if next_token and next_token.kind == Token.Wildcard: diff --git a/hed/schema/__init__.py b/hed/schema/__init__.py index 4ef671e3..f381bd6b 100644 --- a/hed/schema/__init__.py +++ b/hed/schema/__init__.py @@ -1,4 +1,44 @@ -"""Data structures for handling the HED schema.""" +"""HED schema loading, caching, and introspection. 
+ +This module exposes everything needed to load and inspect HED schemas — the +vocabularies that define valid HED tags. + +Typical usage +------------- +Load a released schema by version number (auto-downloaded and cached):: + + from hed.schema import load_schema_version + schema = load_schema_version("8.3.0") + +Load a schema from a local file or URL:: + + from hed.schema import load_schema + schema = load_schema("/path/to/HED8.3.0.xml") + +Load a library schema alongside a standard schema:: + + schema = load_schema_version(["8.3.0", "sc:score_1.0.0"]) + +Key exports +----------- +- :class:`HedSchema` — a single loaded schema; use it to validate tags. +- :class:`HedSchemaGroup` — two or more schemas used together (base + libraries). +- :func:`load_schema` — load from a file path or URL. +- :func:`load_schema_version` — load by version string(s), with caching. +- :func:`from_string` — parse a schema from an in-memory string. +- :func:`from_dataframes` — reconstruct a schema from TSV DataFrames. +- :class:`HedKey` / :class:`HedSectionKey` — enumerations of schema attribute and + section names used when querying schema entries. +- :func:`get_hed_versions` — list versions available in the local cache. +- :func:`get_hed_xml_version` — read the HED version string from an XML schema file on disk. +- :func:`cache_xml_versions` — pre-populate the local cache from the HED GitHub + releases. + +See also +-------- +``hed.models`` for data structures that *use* a loaded schema (HedString, HedTag, +TabularInput, etc.). 
+""" from .hed_schema import HedSchema from .hed_schema_entry import HedSchemaEntry, UnitClassEntry, UnitEntry, HedTagEntry diff --git a/hed/schema/hed_schema.py b/hed/schema/hed_schema.py index 491d22f9..044c415d 100644 --- a/hed/schema/hed_schema.py +++ b/hed/schema/hed_schema.py @@ -446,39 +446,16 @@ def __eq__(self, other): if other is None: return False if self.get_save_header_attributes() != other.get_save_header_attributes(): - # print(f"Header attributes not equal: '{self.get_save_header_attributes()}' vs '{other.get_save_header_attributes()}'") return False if self.has_duplicates() != other.has_duplicates(): - # print(f"Duplicates: '{self.has_duplicates()}' vs '{other.has_duplicates()}'") return False if self.prologue.strip() != other.prologue.strip(): - # print(f"PROLOGUE NOT EQUAL: '{self.prologue.strip()}' vs '{other.prologue.strip()}'") return False if self.epilogue.strip() != other.epilogue.strip(): - # print(f"EPILOGUE NOT EQUAL: '{self.epilogue.strip()}' vs '{other.epilogue.strip()}'") return False if self._sections != other._sections: - # This block is useful for debugging when modifying the schema class itself. 
- # for section1, section2 in zip(self._sections.values(), other._sections.values()): - # if section1 != section2: - # dict1 = section1.all_names - # dict2 = section2.all_names - # if dict1 != dict2: - # print(f"DICT {section1._section_key} NOT EQUAL") - # key_union = set(list(dict1.keys()) + list(dict2.keys())) - # for key in key_union: - # if key not in dict1: - # print(f"{key} not in dict1") - # continue - # if key not in dict2: - # print(f"{key} not in dict2") - # continue - # if dict1[key] != dict2[key]: - # s = f"{key} unmatched: '{str(dict1[key].name)}' vs '{str(dict2[key].name)}'" - # print(s) return False if self._namespace != other._namespace: - # print(f"NAMESPACE NOT EQUAL: '{self._namespace}' vs '{other._namespace}'") return False return True diff --git a/hed/schema/hed_schema_entry.py b/hed/schema/hed_schema_entry.py index ea6806c3..196dfee4 100644 --- a/hed/schema/hed_schema_entry.py +++ b/hed/schema/hed_schema_entry.py @@ -104,7 +104,6 @@ def _set_attribute_value(self, attribute, attribute_value): # todo: remove this patch and redo the code # This check doesn't need to be done if the schema is valid. if attribute not in self._section.valid_attributes: - # print(f"Unknown attribute {attribute}") if self._unknown_attributes is None: self._unknown_attributes = {} self._unknown_attributes[attribute] = attribute_value @@ -406,8 +405,10 @@ def _finalize_inherited_attributes(self): # Replace the list with a copy we can modify. self.inherited_attributes = self.attributes.copy() for attribute in self._section.inheritable_attributes: - if self._check_inherited_attribute(attribute): - self.inherited_attributes[attribute] = self._check_inherited_attribute(attribute, True) + value = self._check_inherited_attribute(attribute, return_value=True) + # None means "not found in the hierarchy"; attribute values themselves are never None. 
+ if value is not None: + self.inherited_attributes[attribute] = value def finalize_entry(self, schema): """Called once after schema loading to set state. diff --git a/hed/schema/schema_io/schema2base.py b/hed/schema/schema_io/schema2base.py index ca0a57e0..719ea3ba 100644 --- a/hed/schema/schema_io/schema2base.py +++ b/hed/schema/schema_io/schema2base.py @@ -5,6 +5,38 @@ class Schema2Base: + """Template base class for all HED schema serializers (mediawiki, XML, JSON, TSV). + + Subclasses implement the format-specific ``_output_*`` hooks; the shared + traversal logic lives entirely in :meth:`process_schema`. + + **Extension points** — override these hooks in each subclass (required unless marked optional): + + * ``_initialize_output`` — reset/create the output container + * ``_output_header`` — write the HED header line / element + * ``_output_prologue`` — write the prologue block + * ``_output_tags`` — write the main tag hierarchy + * ``_output_units`` — write unit-class definitions + * ``_output_section`` — write a generic named section (unit modifiers, value classes, attributes, properties) + * ``_output_annotations`` — write annotation-class definitions + * ``_output_extras`` — optional hook for format-specific extra sections (default: no-op) + * ``_output_epilogue`` — write the epilogue block + * ``_output_footer`` — write any closing structure + + .. note:: **Adding a new schema section type** + + :meth:`process_schema` is the *single source of truth* for section traversal + order. When a new :class:`~hed.schema.hed_schema_constants.HedSectionKey` + is added to the schema, **every** step below must be completed: + + 1. Add the new ``_output_*`` call to :meth:`process_schema` (this file). + 2. Implement the hook in **all four** serializer subclasses: + ``schema2wiki.py``, ``schema2xml.py``, ``schema2json.py``, + ``schema2df.py``. + 3. Add a matching reader branch in the corresponding ``*2schema.py`` + loader(s) so round-trips stay symmetric. 
+ """ + def __init__(self): # Placeholder output variable self.output = None @@ -15,15 +47,29 @@ def __init__(self): self._schema = None def process_schema(self, hed_schema, save_merged=False): - """Takes a HedSchema object and returns it in the inherited form(MEDIAWIKI, XML, etc) + """Convert a HedSchema object to the subclass's output format (mediawiki, XML, JSON, or TSV). + + This method owns the **canonical section-traversal order** for all serializers. + Each ``_output_*`` call delegates to the format-specific subclass hook. + + .. warning:: + If a new :class:`~hed.schema.hed_schema_constants.HedSectionKey` is added + to the schema, a new ``_output_*`` call must be inserted here *and* the + matching hook must be implemented in each of the four serializer subclasses + (``schema2wiki``, ``schema2xml``, ``schema2json``, ``schema2df``). Parameters: - hed_schema (HedSchema): The schema to be processed. - save_merged (bool): If True, save as merged schema if has "withStandard". + hed_schema (HedSchema): The schema to be serialized. + save_merged (bool): If True, serialize as a merged (fully expanded) schema + when the schema has a ``withStandard`` attribute; ignored for standard + schemas (which are always saved fully). Returns: - Any: Varies based on inherited class + Any: Format-dependent output object (string, ElementTree, dict, or DataFrame + dict depending on the subclass). + Raises: + HedFileError: If the schema cannot be saved (e.g., merged multi-library schema). """ if not hed_schema.can_save(): raise HedFileError( @@ -78,7 +124,12 @@ def _output_annotations(self, hed_schema): raise NotImplementedError("This needs to be defined in the subclass") def _output_extras(self, hed_schema): - raise NotImplementedError("This needs to be defined in the subclass") + """Optional hook for format-specific sections not covered by the standard traversal. + + The base implementation is a deliberate no-op. Subclasses that need to + emit additional content (e.g. 
the header-attributes sheet in TSV) override + this method; subclasses that have nothing extra can safely omit it. + """ def _output_epilogue(self, epilogue): raise NotImplementedError("This needs to be defined in the subclass") diff --git a/hed/validator/reserved_checker.py b/hed/validator/reserved_checker.py index da445f7c..b18dcb07 100644 --- a/hed/validator/reserved_checker.py +++ b/hed/validator/reserved_checker.py @@ -177,9 +177,3 @@ def get_incompatible(self, tag, reserved_tags) -> list: return incompatible # Additional methods for other checks should be implemented here following similar patterns. - - -if __name__ == "__main__": - checker = ReservedChecker.get_instance() - print("ReservedChecker initialized successfully.") - print(checker.special_names) diff --git a/hed/validator/util/char_util.py b/hed/validator/util/char_util.py index 38ba1520..4fc915ce 100644 --- a/hed/validator/util/char_util.py +++ b/hed/validator/util/char_util.py @@ -221,15 +221,3 @@ def _get_rex_dict(): json_path = os.path.realpath(os.path.join(current_dir, CLASS_REX_FILENAME)) with open(json_path, "r", encoding="utf-8") as f: return json.load(f) - - -if __name__ == "__main__": - # Example input string - input_string = "Hello World123" - - # Class name (e.g., "nameClass" or "testClass") - class_name = "nameClass" - - # Call the function and print the result - # problem_indices = get_problem_chars(input_string, class_name, json_data) - # print(problem_indices) diff --git a/tests/models/test_hed_group.py b/tests/models/test_hed_group.py index f7b8e403..df597ed0 100644 --- a/tests/models/test_hed_group.py +++ b/tests/models/test_hed_group.py @@ -113,6 +113,31 @@ def test_sorted_structure(self): self.assertEqual(str(original_hed_string), str(hed_string)) self.assertIsNot(sorted_hed_string, hed_string) + def test_shallow_copy_raises_copy_error(self): + """copy.copy() on a HedGroup must raise copy.Error, not ValueError or any other type. 
+ + Shallow copy is blocked because _parent pointers would alias the original, producing + an internally inconsistent object. The correct deep-copy path is .copy(). + """ + hed_string = HedString("(Tag1, Tag2)", self.hed_schema) + group = hed_string.get_first_group() + with self.assertRaises(copy.Error): + copy.copy(group) + + def test_deep_copy_succeeds(self): + """copy.deepcopy() and .copy() must work correctly despite _parent back-references.""" + hed_string = HedString("(Tag1, Tag2)", self.hed_schema) + group = hed_string.get_first_group() + + deep = copy.deepcopy(group) + self.assertIsNot(deep, group) + self.assertEqual(str(deep), str(group)) + + # .copy() is the documented public API for deep-copying a HedGroup + via_copy = group.copy() + self.assertIsNot(via_copy, group) + self.assertEqual(str(via_copy), str(group)) + if __name__ == "__main__": unittest.main() diff --git a/tests/models/test_query_handler.py b/tests/models/test_query_handler.py index 1f8d3139..9ac7287e 100644 --- a/tests/models/test_query_handler.py +++ b/tests/models/test_query_handler.py @@ -1,6 +1,7 @@ import unittest from hed.models.hed_string import HedString from hed.models.query_handler import QueryHandler +from hed.errors.exceptions import HedQueryError import os from hed import schema from hed import HedTag @@ -44,6 +45,24 @@ def test_broken_search_strings(self): QueryHandler(string) self.assertTrue(context.exception.args[0]) + def test_broken_search_strings_raise_hed_query_error(self): + """Parse errors must raise HedQueryError specifically, not a plain ValueError. + + HedQueryError subclasses ValueError for backward compatibility, so both + assertRaises(HedQueryError) and assertRaises(ValueError) must hold. 
+ """ + broken_strings = ["A &&", "(A && B", "&& B", "A, ", ", A", "A)"] + for string in broken_strings: + with self.subTest(query=string): + # Must be the specific subclass, not a plain ValueError + with self.assertRaises(HedQueryError): + QueryHandler(string) + + def test_hed_query_error_is_value_error(self): + """HedQueryError must be catchable as ValueError for backward compatibility.""" + with self.assertRaises(ValueError): + QueryHandler("(unclosed") + def test_finding_tags(self): test_strings = { "Item, (Clear-throat)": True,