Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
15 commits
Select commit Hold shift + click to select a range
760b9e1
CU-869ckx6dr: Add extra test to trainer to make sure it tests on mult…
github-actions[bot] Mar 24, 2026
4104863
CU-869ckx6dr: Add new method for reuse of entities when getting based…
github-actions[bot] Mar 24, 2026
e3c6f77
CU-869ckx6dr: Add simple test for entity persistence in document
github-actions[bot] Mar 24, 2026
214c9b1
CU-869ckx6dr: Small addition to test
github-actions[bot] Mar 24, 2026
3832ecb
CU-869ckx6dr: Prepare document with appropriate entities at training …
github-actions[bot] Mar 24, 2026
523d6b9
CU-869ckx6dr: Update tests to work with new setup
github-actions[bot] Mar 24, 2026
97a532c
CU-869ckx6dr: Add a new test for entities in add_and_train_concept.
github-actions[bot] Mar 24, 2026
5ff0464
CU-869ckx6dr: Add deprecation warning to old / unused entity_from_toke…
github-actions[bot] Mar 24, 2026
5cd5862
CU-869ckx6dr: Add deprecation warning to old / unused entity_from_tok…
github-actions[bot] Mar 24, 2026
c3c3822
CU-869ckx6dr: Deprecate unused method on a protocol level as well
github-actions[bot] Mar 24, 2026
b03adc3
CU-869ckx6dr: Fix linting issue
github-actions[bot] Mar 24, 2026
242c602
CU-869ckx6dr: Fix minor issues in test-time supervised training data
github-actions[bot] Mar 24, 2026
1ac9df9
CU-869ckx6dr: Add new test for order of training examples
github-actions[bot] Mar 24, 2026
ec32f5d
CU-869ckx6dr: Minor changes to trainer tests
github-actions[bot] Mar 24, 2026
46c9b88
CU-869ckx6dr: Allow a little longer for the relcat tutorial to run
github-actions[bot] Mar 24, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/medcat-v2-tutorials_main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -83,4 +83,4 @@ jobs:
- name: Smoke test tutorial
run: |
pytest --capture=no --collect-only --nbmake ${{ matrix.part }}
pytest --capture=no --nbmake -n=auto --nbmake-kernel=smoketests --nbmake-timeout=1800 ${{ matrix.part }}
pytest --capture=no --nbmake -n=auto --nbmake-kernel=smoketests --nbmake-timeout=2400 ${{ matrix.part }}
29 changes: 29 additions & 0 deletions medcat-v2/medcat/pipeline/pipeline.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import Optional, Iterable, Union
import logging
import os
import warnings

from medcat.utils.defaults import COMPONENTS_FOLDER
from medcat.tokenizing.tokenizers import BaseTokenizer, create_tokenizer
Expand Down Expand Up @@ -43,8 +44,19 @@ def create_entity(self, doc: MutableDocument,
doc, token_start_index, token_end_index, label)

def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
    """Create an entity from the given tokens (deprecated).

    Emits a ``DeprecationWarning`` and then forwards the call to the
    wrapped tokenizer's ``entity_from_tokens``.

    Args:
        tokens (list[MutableToken]): The tokens to build the entity from.

    Returns:
        MutableEntity: Whatever the wrapped tokenizer returns.
    """
    # NOTE: adjacent string literals are joined verbatim, so each fragment
    #       that is followed by another one must end with a space.
    warnings.warn(
        "The `medcat.pipeline.pipeline.entity_from_tokens` method is "
        "deprecated and subject to removal in a future release. Please "
        "use `medcat.pipeline.pipeline.entity_from_tokens_in_doc` instead.",
        DeprecationWarning,
        # attribute the warning to the caller rather than to this wrapper
        stacklevel=2
    )
    return self.tokenizer.entity_from_tokens(tokens)

def entity_from_tokens_in_doc(
        self, tokens: list[MutableToken], doc: MutableDocument) -> MutableEntity:
    """Get the entity for the given tokens within the given document.

    The call is forwarded unchanged to the wrapped tokenizer's
    ``entity_from_tokens_in_doc``.

    Args:
        tokens (list[MutableToken]): The tokens to use.
        doc (MutableDocument): The document for these tokens.

    Returns:
        MutableEntity: Whatever the wrapped tokenizer returns.
    """
    delegate = self.tokenizer
    return delegate.entity_from_tokens_in_doc(tokens, doc)

def __call__(self, text: str) -> MutableDocument:
doc = self.tokenizer(text)
for comp in self.components:
Expand Down Expand Up @@ -342,6 +354,23 @@ def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
"""
return self._tokenizer.entity_from_tokens(tokens)

def entity_from_tokens_in_doc(self, tokens: list[MutableToken],
                              doc: MutableDocument) -> MutableEntity:
    """Get the entity from the list of tokens in a document.

    This effectively turns a list of (consecutive) tokens into an
    entity. It is also designed to reuse existing instances on the
    document instead of creating new ones.

    The work itself is done by the pipeline's tokenizer; this method
    only forwards the call.

    Args:
        tokens (list[MutableToken]): The tokens to use.
        doc (MutableDocument): The document for these tokens.

    Returns:
        MutableEntity: The resulting entity.
    """
    tokenizer = self._tokenizer
    return tokenizer.entity_from_tokens_in_doc(tokens, doc)

def get_component(self, ctype: CoreComponentType) -> CoreComponent:
"""Get the core component by the component type.

Expand Down
26 changes: 26 additions & 0 deletions medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import re
from typing import cast, Optional, Iterator, overload, Union, Any, Type
from collections import defaultdict
import warnings

from medcat.tokenizing.tokens import (
BaseToken, BaseEntity, BaseDocument,
Expand Down Expand Up @@ -340,13 +341,38 @@ def create_entity(self, doc: MutableDocument,
# return Entity(span)

def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
    """Create a new entity spanning the given tokens (deprecated).

    Emits a ``DeprecationWarning`` and then builds the entity. Use
    ``entity_from_tokens_in_doc`` instead, which reuses an entity that
    already exists on the document where possible.

    Args:
        tokens (list[MutableToken]): The tokens to build the entity from.

    Raises:
        ValueError: If no tokens are given.

    Returns:
        MutableEntity: The newly created entity.
    """
    # NOTE: adjacent string literals are joined verbatim, so the
    #       separating spaces need to be part of the fragments.
    warnings.warn(
        "The `medcat.tokenizing.tokenizers.Tokenizer.entity_from_tokens` method is"
        " deprecated and subject to removal in a future release. Please use "
        "`medcat.tokenizing.tokenizers.Tokenizer.entity_from_tokens_in_doc` "
        "instead.",
        DeprecationWarning,
        # attribute the warning to the caller rather than to this method
        stacklevel=2
    )
    return self._new_entity_from_tokens(tokens)

def _new_entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
    """Build a new entity from the tokens without a deprecation warning.

    Shared by the deprecated ``entity_from_tokens`` and by
    ``entity_from_tokens_in_doc`` so that the latter does not warn.

    Args:
        tokens (list[MutableToken]): The tokens to build the entity from.

    Raises:
        ValueError: If no tokens are given.

    Returns:
        MutableEntity: The newly created entity.
    """
    if not tokens:
        raise ValueError("Need at least one token for an entity")
    # the document is taken from the first token
    doc = cast(Token, tokens[0])._doc
    start_index = doc._tokens.index(tokens[0])
    end_index = doc._tokens.index(tokens[-1])
    return _entity_from_tokens(doc, tokens, start_index, end_index)

def _get_existing_entity(self, tokens: list[MutableToken],
                         doc: MutableDocument) -> Optional[MutableEntity]:
    """Find an entity on the document that matches the token span.

    The NER entities are checked first, then the linked entities.

    Args:
        tokens (list[MutableToken]): The tokens to look for.
        doc (MutableDocument): The document whose entities are searched.

    Returns:
        Optional[MutableEntity]: The first entity whose start / end index
            equal the index of the first / last token, or None.
    """
    if not tokens:
        return None
    first_index = tokens[0].base.index
    # NOTE(review): this assumes an entity's `end_index` is the index of
    #               its last token (inclusive) - TODO confirm
    last_index = tokens[-1].base.index
    for ents in (doc.ner_ents, doc.linked_ents):
        for ent in ents:
            if (ent.base.start_index == first_index and
                    ent.base.end_index == last_index):
                return ent
    return None

def entity_from_tokens_in_doc(self, tokens: list[MutableToken],
                              doc: MutableDocument) -> MutableEntity:
    """Get the entity for the tokens, reusing an existing one if possible.

    Args:
        tokens (list[MutableToken]): The tokens to use.
        doc (MutableDocument): The document for these tokens.

    Raises:
        ValueError: If no tokens are given.

    Returns:
        MutableEntity: The matching entity already on the document, or a
            newly created one if there is no match.
    """
    existing_ent = self._get_existing_entity(tokens, doc)
    # explicit None check so the result does not depend on entity truthiness
    if existing_ent is not None:
        return existing_ent
    # build directly; going through the deprecated `entity_from_tokens`
    # would warn users who are already on the recommended API
    return self._new_entity_from_tokens(tokens)

def _get_tokens_matches(self, text: str) -> list[re.Match[str]]:
    """Collect every match of ``self.REGEX`` in the text.

    Args:
        text (str): The text to search.

    Returns:
        list[re.Match[str]]: The matches, in the order ``finditer``
            yields them.
    """
    return list(self.REGEX.finditer(text))
Expand Down
26 changes: 26 additions & 0 deletions medcat-v2/medcat/tokenizing/spacy_impl/tokenizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import shutil
import logging
import warnings

import spacy
from spacy.tokens import Span
Expand Down Expand Up @@ -77,13 +78,38 @@ def create_entity(self, doc: MutableDocument,
return Entity(span)

def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
    """Create a new entity spanning the given tokens (deprecated).

    Emits a ``DeprecationWarning`` and then builds the entity. Use
    ``entity_from_tokens_in_doc`` instead, which reuses an entity that
    already exists on the document where possible.

    Args:
        tokens (list[MutableToken]): The tokens to build the entity from.

    Raises:
        ValueError: If no tokens are given.

    Returns:
        MutableEntity: The newly created entity.
    """
    # NOTE: adjacent string literals are joined verbatim, so the
    #       separating spaces need to be part of the fragments.
    warnings.warn(
        "The `medcat.tokenizing.tokenizers.Tokenizer.entity_from_tokens` method is"
        " deprecated and subject to removal in a future release. Please use "
        "`medcat.tokenizing.tokenizers.Tokenizer.entity_from_tokens_in_doc` "
        "instead.",
        DeprecationWarning,
        # attribute the warning to the caller rather than to this method
        stacklevel=2
    )
    return self._new_entity_from_tokens(tokens)

def _new_entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
    """Build a new entity from the tokens without a deprecation warning.

    Shared by the deprecated ``entity_from_tokens`` and by
    ``entity_from_tokens_in_doc`` so that the latter does not warn.

    Args:
        tokens (list[MutableToken]): The tokens to build the entity from.

    Raises:
        ValueError: If no tokens are given.

    Returns:
        MutableEntity: The newly created entity.
    """
    if not tokens:
        raise ValueError("Need at least one token for an entity")
    spacy_tokens = cast(list[Token], tokens)
    # the span runs from the first token up to and including the last one
    span = Span(spacy_tokens[0]._delegate.doc, spacy_tokens[0].index,
                spacy_tokens[-1].index + 1)
    return Entity(span)

def _get_existing_entity(self, tokens: list[MutableToken],
                         doc: MutableDocument) -> Optional[MutableEntity]:
    """Find an entity on the document that matches the token span.

    The NER entities are checked first, then the linked entities.

    Args:
        tokens (list[MutableToken]): The tokens to look for.
        doc (MutableDocument): The document whose entities are searched.

    Returns:
        Optional[MutableEntity]: The first entity whose start / end index
            equal the index of the first / last token, or None.
    """
    if not tokens:
        return None
    first_index = tokens[0].base.index
    # NOTE(review): this assumes an entity's `end_index` is the index of
    #               its last token (inclusive), whereas the span above is
    #               built with an exclusive end - TODO confirm
    last_index = tokens[-1].base.index
    for ents in (doc.ner_ents, doc.linked_ents):
        for ent in ents:
            if (ent.base.start_index == first_index and
                    ent.base.end_index == last_index):
                return ent
    return None

def entity_from_tokens_in_doc(self, tokens: list[MutableToken],
                              doc: MutableDocument) -> MutableEntity:
    """Get the entity for the tokens, reusing an existing one if possible.

    Args:
        tokens (list[MutableToken]): The tokens to use.
        doc (MutableDocument): The document for these tokens.

    Raises:
        ValueError: If no tokens are given.

    Returns:
        MutableEntity: The matching entity already on the document, or a
            newly created one if there is no match.
    """
    existing_ent = self._get_existing_entity(tokens, doc)
    # explicit None check so the result does not depend on entity truthiness
    if existing_ent is not None:
        return existing_ent
    # build directly; going through the deprecated `entity_from_tokens`
    # would warn users who are already on the recommended API
    return self._new_entity_from_tokens(tokens)

def __call__(self, text: str) -> MutableDocument:
if self._avoid_pipe:
doc = Document(self._nlp.make_doc(text))
Expand Down
11 changes: 9 additions & 2 deletions medcat-v2/medcat/tokenizing/tokenizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,22 @@ def create_entity(self, doc: MutableDocument,
pass

def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
    """Get an entity from the list of tokens.

    Deprecated: use entity_from_tokens_in_doc instead.

    Args:
        tokens (list[MutableToken]): List of tokens.

    Returns:
        MutableEntity: The resulting entity.
    """
    pass

def entity_from_tokens_in_doc(self, tokens: list[MutableToken],
                              doc: MutableDocument) -> MutableEntity:
    """Get an entity from the list of tokens in the specified document.

    Implementations are meant to reuse entities where possible.
    This definition is an interface stub without behaviour of its own.

    Args:
        tokens (list[MutableToken]): List of tokens.
        doc (MutableDocument): The document for these tokens.

    Returns:
        MutableEntity: The resulting entity.
    """
    ...

def __call__(self, text: str) -> MutableDocument:
    """Turn the given text into a document.

    This definition is an interface stub without behaviour of its own.

    Args:
        text (str): The input text.

    Returns:
        MutableDocument: The resulting document.
    """
    ...
Expand Down
20 changes: 17 additions & 3 deletions medcat-v2/medcat/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from medcat.utils.data_utils import make_mc_train_test, get_false_positives
from medcat.utils.filters import project_filters
from medcat.data.mctexport import (
MedCATTrainerExport, MedCATTrainerExportProject,
MedCATTrainerExport, MedCATTrainerExportAnnotation, MedCATTrainerExportProject,
MedCATTrainerExportDocument, count_all_annotations, iter_anns)
from medcat.preprocessors.cleaners import prepare_name, NameDescriptor
from medcat.components.types import CoreComponentType, TrainableComponent
Expand Down Expand Up @@ -397,6 +397,20 @@ def _train_supervised_for_project(self,
docs, current_document, train_from_false_positives,
devalue_others)

def _prepare_doc_with_anns(
        self, doc: MutableDocument,
        anns: list[MedCATTrainerExportAnnotation]) -> None:
    """Replace the document's entities with ones built from annotations.

    For every annotation the tokens between its 'start' and 'end' are
    looked up on the document and turned into an entity through the
    pipeline. The entities end up in annotation order.

    Args:
        doc (MutableDocument): The document to update in place.
        anns (list[MedCATTrainerExportAnnotation]): The annotations.
    """
    # all entities are resolved before anything is cleared, so the lookup
    # still sees the entities that are currently on the document
    annotated_ents = [
        self._pipeline.entity_from_tokens_in_doc(
            doc.get_tokens(ann['start'], ann['end']), doc)
        for ann in anns
    ]
    # same entities for both NER and linking, but kept in separate lists
    for target in (doc.ner_ents, doc.linked_ents):
        target.clear()
        target.extend(annotated_ents)

def _train_supervised_for_project2(self,
docs: list[MedCATTrainerExportDocument],
current_document: int,
Expand All @@ -412,17 +426,17 @@ def _train_supervised_for_project2(self,
with temp_changed_config(self.config.components.linking,
'train', False):
mut_doc = self.caller(doc['text'])
self._prepare_doc_with_anns(mut_doc, doc['annotations'])

# Compatibility with old output where annotations are a list
for ann in doc['annotations']:
for ann, mut_entity in zip(doc['annotations'], mut_doc.linked_ents):
if ann.get('killed', False):
continue
logger.info(" Annotation %s (%s) [%d:%d]",
ann['value'], ann['cui'], ann['start'], ann['end'])
cui = ann['cui']
start = ann['start']
end = ann['end']
mut_entity = mut_doc.get_tokens(start, end)
if not mut_entity:
logger.warning(
"When looking for CUI '%s' (value '%s') [%d...%d] "
Expand Down
4 changes: 2 additions & 2 deletions medcat-v2/tests/resources/supervised_mct_export.json
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@
{
"cui": "C04",
"start": 81,
"end": 87,
"end": 88,
"value": "fittest"
}
],
Expand Down Expand Up @@ -125,7 +125,7 @@
"id": "ID-3",
"last_modified": "2024-08-21",
"name": "Doc#4",
"text": "The RHS male is healthy as considered by all available tests. There are no indications that the patient is not fittest."
"text": "The RHS male is healthy as considered by all available tests. There are no indications that the patient is not fittest"
}
],
"id": "Project#0",
Expand Down
18 changes: 17 additions & 1 deletion medcat-v2/tests/test_cat.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from contextlib import contextmanager

from medcat import cat
from medcat.data.mctexport import count_all_annotations, iter_anns
from medcat.data.model_card import ModelCard
from medcat.vocab import Vocab
from medcat.config import Config
Expand Down Expand Up @@ -576,7 +577,7 @@ class CATSupTrainingTests(CATUnsupTrainingTests):
os.path.dirname(__file__), 'resources', 'supervised_mct_export.json'
)
# NOTE: should remain consistent unless we change the model or data
EXPECTED_HASH = "7bfe01e8e36eb07d"
EXPECTED_HASH = "9c299628c9e6c220"

@classmethod
def _get_cui_counts(cls) -> dict[str, int]:
Expand Down Expand Up @@ -620,6 +621,21 @@ def test_clearing_training_works(self):
self.assertEqual(self.cat.config.meta.unsup_trained, [])
self.assertEqual(self.cat.config.meta.sup_trained, [])

def test_training_happens_in_correct_order(self):
    """Check that training visits the annotations in export order.

    ``add_and_train_concept`` is replaced with a mock so that the
    ``mut_entity`` keyword argument of each call can be compared,
    position by position, with the annotations from ``iter_anns``.
    """
    with captured_state_cdb(self.cat.cdb):
        trainer_patch = unittest.mock.patch.object(
            self.cat.trainer, "add_and_train_concept")
        with trainer_patch as train_mock:
            self._perform_training()
    mct_export = self._get_data()
    trained_ents = []
    for call in train_mock.call_args_list:
        trained_ents.append(call.kwargs['mut_entity'])
    # equal lengths are checked first since zip would stop at the shorter one
    self.assertEqual(len(trained_ents), count_all_annotations(mct_export))
    for (_, _, ann), ent in zip(iter_anns(mct_export), trained_ents):
        with self.subTest(f"Ann: {ann} vs Ent: {ent}"):
            self.assertEqual(ann['start'], ent.base.start_char_index)
            self.assertEqual(ann['end'], ent.base.end_char_index)


class CATWithDictNERSupTrainingTests(CATSupTrainingTests):
from medcat.components.types import CoreComponentType
Expand Down
Loading
Loading