Skip to content

medspacy.util

This module will contain helper functions and classes for common clinical processing tasks which will be used in many medspaCy components.

_build_pipe_names(enable, disable=None)

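Resolve which pipeline components are enabled and which are disabled from the names given in 'enable' and 'disable'. 'enable' must always be provided (an empty or None value raises a ValueError): "all" or "default" selects the corresponding predefined set of pipe names, and any other value is treated as a custom selection of components.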

Parameters:

Name Type Description Default
enable Union[str, Iterable[str]]

"all" loads components from ALL_PIPE_NAMES. "default" loads components from DEFAULT_PIPE_NAMES. Otherwise, loads the given list of components as the enabled components.

required
disable Optional[Iterable[str]]

An optional collection of component names to disable. These names are removed from the enabled components (set difference).

None

Returns:

Type Description
Tuple[Set[str], Set[str]]

A complete list of enabled and disabled components, with all components listed and empty intersection.

Source code in medspacy/util.py
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
def _build_pipe_names(
    enable: Union[str, Iterable[str]], disable: Optional[Iterable[str]] = None
) -> Tuple[Set[str], Set[str]]:
    """
    Implement logic based on the pipenames defined in 'enable' and 'disable'. If enable and disable are both None,
    then it will load the default pipenames. Otherwise, will allow custom selection of components.

    Args:
        enable: "all" loads components from ALL_PIPE_NAMES. "default" loads components from DEFAULT_PIPE_NAMES.
            Otherwise, loads he list of components as components.
        disable: The optional list of components to disable. Set difference of enable.

    Returns:
        A complete list of enabled and disabled components, with all components listed and empty intersection.
    """
    if not enable:
        raise ValueError(
            "Enable cannot be none, please specify 'all', 'default' or a list of components."
        )

    # cannot allow lists of enabled and disabled components, what happens if "context" is both enabled and disabled?
    if (not isinstance(enable, str) and isinstance(enable, Iterable)) and isinstance(
        disable, Iterable
    ):
        raise ValueError("Both enable and disable cannot be collections of components.")

    # set which components are enabled first
    if enable == "all":
        enable = ALL_PIPE_NAMES
    elif enable == "default":
        enable = DEFAULT_PIPE_NAMES
    else:
        enable = set(enable)

    # then find the difference with deactivated components
    if disable is not None:
        enable = enable.difference(set(disable))
    else:
        disable = set()  # otherwise disable is empty

    return enable, disable

load(model='default', medspacy_enable='default', medspacy_disable=None, language_code='en', load_rules=True, quickumls_path=None, **model_kwargs)

Load a spaCy language object with medSpaCy pipeline components. By default, the base model will be a blank 'en' model with the following components:

- "medspacy_tokenizer": A customized, more aggressive tokenizer than the default spaCy tokenizer. This is set to nlp.tokenizer and is not loaded as a pipeline component.
- "medspacy_pyrush": PyRuSH Sentencizer for sentence splitting
- "medspacy_target_matcher": TargetMatcher for extended pattern matching
- "medspacy_context": ConText for attribute assertion
- "medspacy_quickumls": QuickUMLS for UMLS concept mapping

Args:

- model: The base spaCy model to load. If 'default', will instantiate from a blank 'en' model. If it is a spaCy language model, then it will simply add medspaCy components to the existing pipeline. If it is a string other than 'default', passes the string to spacy.load(model, **model_kwargs).
- medspacy_enable: Specifies which components to enable in the medspacy pipeline. If "default", will load all components found in DEFAULT_PIPE_NAMES. These represent the simplest components used in a clinical NLP pipeline: tokenization, sentence detection, concept identification, and ConText. If "all", all components in medspaCy will be loaded. If a collection of strings, the components specified will be loaded.
- medspacy_disable: A collection of component names to exclude. Requires that medspacy_enable is "all" or "default" rather than a collection of component names.
- language_code: Language code to use (ISO code) as a default for loading additional resources. See documentation and also the /resources directory to see which resources might be available in each language. Default is "en" for English.
- load_rules: Whether to include default rules for available components. If True, sectionizer and context will both be loaded with default rules. Default is True.
- quickumls_path: Path to QuickUMLS dictionaries if it is included in the pipeline.
- model_kwargs: Optional model keyword arguments to pass to spacy.load().

Returns:

Type Description

A spaCy Language object containing the specified medspacy components.

Source code in medspacy/util.py
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
def load(
    model: Union[Literal["default"], str, Language] = "default",
    medspacy_enable: Union[Literal["all", "default"], Iterable[str]] = "default",
    medspacy_disable: Optional[Iterable[str]] = None,
    language_code: str = "en",
    load_rules: bool = True,
    quickumls_path: Optional[str] = None,
    **model_kwargs,
) -> Language:
    """Load a spaCy language object with medSpaCy pipeline components.

    Depending on which names are enabled, the following components are set up, in this order:
        - "medspacy_tokenizer": A customized, more aggressive tokenizer than the default spaCy tokenizer. This is
            set to `nlp.tokenizer` and is not loaded as a pipeline component.
        - "medspacy_preprocessor": A Preprocessor wrapping the current tokenizer. It also replaces `nlp.tokenizer`.
        - "medspacy_pyrush": PyRuSH Sentencizer for sentence splitting
        - "medspacy_target_matcher": TargetMatcher for extended pattern matching
        - "medspacy_quickumls": QuickUMLS for UMLS concept mapping
        - "medspacy_context": ConText for attribute assertion
        - "medspacy_sectionizer"
        - "medspacy_postprocessor"
        - "medspacy_doc_consumer"

    Args:
        model: The base spaCy model to load. If 'default', will instantiate from a blank 'en' model. If it is a spaCy
            language model, then it will simply add medspaCy components to the existing pipeline. If it is a string
            other than 'default', passes the string to spacy.load(model, **model_kwargs).
        medspacy_enable: Specifies which components to enable in the medspacy pipeline. If "default", will load all
            components found in `DEFAULT_PIPE_NAMES`. If "all", all components in medspaCy will be loaded. If a
            collection of strings, the components specified will be loaded.
        medspacy_disable: A collection of component names to exclude. Cannot be combined with a collection of names
            passed as `medspacy_enable`.
        language_code: Language code to use (ISO code) as a default for loading additional resources.  See
            documentation and also the /resources directory to see which resources might be available in each
            language. Default is "en" for English.
        load_rules: Whether to include default rules for available components. If True, sectionizer and context will
            both be loaded with default rules. Default is True.
        quickumls_path: Path to QuickUMLS dictionaries if it is included in the pipeline. If None, the demo
            directory returned by `get_quickumls_demo_dir(language_code)` is used.
        model_kwargs: Optional model keyword arguments to pass to spacy.load().

    Returns:
        A spaCy Language object containing the specified medspacy components.

    Raises:
        ValueError: If `model` is neither a string nor a spaCy Language object, or if the enable/disable
            selection is rejected by `_build_pipe_names`.
    """

    enabled, _ = _build_pipe_names(medspacy_enable, medspacy_disable)

    if model == "default":
        # NOTE(review): the blank model is always English, even when language_code differs — confirm intended.
        nlp = spacy.blank("en")
    elif isinstance(model, Language):
        nlp = model
    elif isinstance(model, str):
        nlp = spacy.load(model, **model_kwargs)
    else:
        # A single formatted message; passing the type as a second argument would render the error as a tuple.
        raise ValueError(
            "model must be 'default', the name of a spaCy model, or an actual spaCy Language object, "
            f"not {type(model)}"
        )

    if "medspacy_tokenizer" in enabled:
        from .custom_tokenizer import create_medspacy_tokenizer

        nlp.tokenizer = create_medspacy_tokenizer(nlp)

    if "medspacy_preprocessor" in enabled:
        from .preprocess import Preprocessor

        # The preprocessor wraps whichever tokenizer is active at this point and takes its place.
        nlp.tokenizer = Preprocessor(nlp.tokenizer)

    if "medspacy_pyrush" in enabled:
        pyrush_path = path.join(
            Path(__file__).resolve().parents[1],
            "resources",
            language_code.lower(),
            "rush_rules.tsv",
        )
        nlp.add_pipe("medspacy_pyrush", config={"rules_path": pyrush_path})

    if "medspacy_target_matcher" in enabled:
        nlp.add_pipe("medspacy_target_matcher")

    if "medspacy_quickumls" in enabled:
        if quickumls_path is None:
            quickumls_path = get_quickumls_demo_dir(language_code)

            print(
                "Loading QuickUMLS resources from a Medspacy-distributed SAMPLE of UMLS data from here: "
                f"{quickumls_path}"
            )

        nlp.add_pipe("medspacy_quickumls", config={"quickumls_fp": quickumls_path})

    # Context and sectionizer share the same rule configuration: when default rules are not
    # requested, "rules" is explicitly set to None.
    rule_config = {"language_code": language_code}
    if load_rules is not True:
        rule_config["rules"] = None

    if "medspacy_context" in enabled:
        nlp.add_pipe("medspacy_context", config=dict(rule_config))

    if "medspacy_sectionizer" in enabled:
        nlp.add_pipe("medspacy_sectionizer", config=dict(rule_config))

    if "medspacy_postprocessor" in enabled:
        nlp.add_pipe("medspacy_postprocessor")

    if "medspacy_doc_consumer" in enabled:
        nlp.add_pipe("medspacy_doc_consumer")

    return nlp

tuple_overlaps(a, b)

Calculates whether two tuples overlap. Assumes tuples are sorted to be like spans (start, end)

Parameters:

Name Type Description Default
a Tuple[int, int]

A tuple representing a span (start, end).

required
b Tuple[int, int]

A tuple representing a span (start, end).

required

Returns:

Type Description

Whether the tuples overlap.

Source code in medspacy/util.py
192
193
194
195
196
197
198
199
200
201
202
203
def tuple_overlaps(a: Tuple[int, int], b: Tuple[int, int]) -> bool:
    """
    Calculates whether two tuples overlap. Assumes tuples are sorted to be like spans (start, end), with the end
    treated as exclusive: spans that merely touch, such as (0, 5) and (5, 10), do not overlap.

    Args:
        a: A tuple representing a span (start, end).
        b: A tuple representing a span (start, end).

    Returns:
        Whether the tuples overlap.
    """
    a_start, a_end = a[0], a[1]
    b_start, b_end = b[0], b[1]
    return (
        a_start <= b_start < a_end  # b starts inside a
        or a_start < b_end <= a_end  # b ends inside a
        # a starts inside b; without this, a span fully enclosed by b, e.g. a=(2, 3), b=(1, 5), is missed
        or b_start <= a_start < b_end
    )