Skip to content

Single Page

All docs as a single page.

MedSpaCy

_extensions

This module will set extension attributes and methods for medspaCy. Examples include custom methods like span._.window()

any_context_attribute(span)

Return True if any of the ConText assertion attributes (is_negated, is_historical, etc.) is True.

Source code in medspacy/_extensions.py
130
131
132
def any_context_attribute(span):
    """Report whether at least one ConText assertion attribute of a span is set.

    Reads the mapping exposed as ``span._.context_attributes`` and returns True
    when any of its values (is_negated, is_historical, etc.) is truthy.
    """
    attribute_values = span._.context_attributes.values()
    return any(attribute_values)

data_to_rows(data)

Unzip column-wise data from doc._.data into rows

Source code in medspacy/_extensions.py
243
244
245
246
247
def data_to_rows(data):
    """Transpose the column-wise mapping from doc._.data into a list of rows.

    Every value of ``data`` is treated as one column. The i-th returned tuple
    holds the i-th element of each column, in the mapping's key order; ``zip``
    stops at the shortest column.
    """
    columns = data.values()
    return list(zip(*columns))

get_context_attributes(span)

Return a dict of all ConText assertion attributes (is_negated, is_historical, etc.) and their values.

Source code in medspacy/_extensions.py
110
111
112
113
114
115
116
117
def get_context_attributes(span):
    """Collect all ConText assertion attributes (is_negated, is_historical, etc.) of a span.

    The attribute names are taken from the module-level ``_context_attributes``
    collection and each value is read with ``span._.get``. Returns a dict
    mapping attribute name to its current value.
    """
    return {name: span._.get(name) for name in _context_attributes}

get_extensions()

Get a dictionary of medspaCy extensions for the Token, Span, and Doc classes, keyed by class name.

Source code in medspacy/_extensions.py
43
44
45
46
47
48
49
def get_extensions():
    """Get the medspaCy extensions for the Token, Span, and Doc classes.

    Returns a dict with the keys "Token", "Span" and "Doc"; each value is
    whatever the matching ``get_*_extensions`` helper returns.
    """
    token_extensions = get_token_extensions()
    span_extensions = get_span_extensions()
    doc_extensions = get_doc_extensions()
    return {"Token": token_extensions, "Span": span_extensions, "Doc": doc_extensions}

get_span_literal(span)

Get the literal value from an entity's TargetRule, which is set when an entity is extracted by TargetMatcher. If the span does not have a TargetRule, it returns the lower-cased text.

Source code in medspacy/_extensions.py
120
121
122
123
124
125
126
127
def get_span_literal(span):
    """Return the literal of the TargetRule attached to a span.

    ``span._.target_rule`` is set when an entity is extracted by TargetMatcher.
    When it is None, the span's lower-cased text is returned instead.
    """
    rule = span._.target_rule
    if rule is not None:
        return rule.literal
    # No rule attached: fall back to the surface text, normalized to lower case.
    return span.text.lower()

get_window_span(span, n=1, left=True, right=True)

Get a Span of a window of text containing a span. Args: n (int): Number of tokens on each side of a span to return. Default 1. left (bool): Whether to include the span preceding a span. Default True. right (bool): Whether to include the span following a span. Default True. Returns: a spaCy Span

Source code in medspacy/_extensions.py
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
def get_window_span(span, n=1, left=True, right=True):
    """Get a Span made of a span plus a window of neighbouring tokens.

    Args:
        n (int): Number of tokens on each side of the span to include.
            Default 1.
        left (bool): Whether to include the tokens preceding the span.
            Default True.
        right (bool): Whether to include the tokens following the span.
            Default True.
    Returns:
        a spaCy Span
    """
    # Each boundary is clamped so the window never runs past either end of the doc.
    first = max(span.start - n, 0) if left else span.start
    last = min(span.end + n, len(span.doc)) if right else span.end
    return span.doc[first:last]

get_window_token(token, n=1, left=True, right=True)

Get a Span of a window of text containing a token. Args: n (int): Number of tokens on each side of token to return. Default 1. left (bool): Whether to include the span preceding token. Default True. right (bool): Whether to include the span following token. Default True. Returns: a spaCy Span

Source code in medspacy/_extensions.py
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
def get_window_token(token, n=1, left=True, right=True):
    """Get a Span made of a token plus a window of neighbouring tokens.

    Args:
        n (int): Number of tokens on each side of the token to include.
            Default 1.
        left (bool): Whether to include the tokens preceding the token.
            Default True.
        right (bool): Whether to include the tokens following the token.
            Default True.
    Returns:
        a spaCy Span
    """
    position = token.i
    # The end index is exclusive, hence the extra +1 to keep the token itself in the window.
    first = max(position - n, 0) if left else position
    last = min(position + n + 1, len(token.doc)) if right else position + 1
    return token.doc[first:last]

set_extensions()

Set custom medspaCy extensions for Token, Span, and Doc classes.

Source code in medspacy/_extensions.py
11
12
13
14
15
def set_extensions():
    """Register the custom medspaCy extensions on the Token, Span, and Doc classes.

    Runs the three per-class registration helpers in Token, Span, Doc order.
    """
    registrars = (set_token_extensions, set_span_extensions, set_doc_extensions)
    for register in registrars:
        register()

common

base_rule

BaseRule

BaseRule is the basic class for the rules contained in the MedspacyMatcher class. It contains the basic structure for a rule to be used by the spaCy matchers or by the RegexMatcher class in order to produce match tuples for processing by a component such as the Sectionizer, ContextComponent or TargetMatcher

Source code in medspacy/common/base_rule.py
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
class BaseRule:
    """
    Common base for the rules held by the MedspacyMatcher class.

    A BaseRule carries the pieces a rule needs in order to be used by the spaCy matchers or by the RegexMatcher
    class, so that match tuples can be produced for a component such as the Sectionizer, ContextComponent or
    TargetMatcher.
    """

    def __init__(
        self,
        literal: str,
        category: str,
        pattern: Optional[Union[str, List[Dict[str, str]]]] = None,
        on_match: Optional[Callable[[Matcher, Doc, int, List[Tuple[int, int, int]]], Any]] = None,
        metadata: Optional[Dict[Any, Any]] = None,
    ):
        """
        Store the definition of a medspaCy rule (base for rules such as TargetRule and ConTextRule).

        Args:
            literal: Plaintext form of the pattern. Either a human-readable description of a more complex pattern
                or, when `pattern` is None, the phrase used in a spaCy PhraseMatcher by the MedspacyMatcher.
            category: Category of the match. Corresponds to ent.label_ for entities.
            pattern: Optional spaCy pattern used instead of `literal`. A list selects spaCy token-based pattern
                matching on token attributes; a string selects medspaCy's RegexMatcher; None selects phrase
                matching on `literal`. See https://spacy.io/usage/rule-based-matching.
            on_match: Optional callable taking 4 arguments: `(matcher, doc, i, matches)`. See
                https://spacy.io/usage/rule-based-matching#on_match
            metadata: Optional dictionary of any extra metadata.
        """
        # What the rule matches and how it is labelled.
        self.literal = literal
        self.category = category
        self.pattern = pattern
        # Optional extras, stored exactly as given.
        self.on_match = on_match
        self.metadata = metadata
__init__(literal, category, pattern=None, on_match=None, metadata=None)

Base class for medspaCy rules such as TargetRule and ConTextRule.

Parameters:

Name Type Description Default
literal str

The plaintext form of the pattern. Can be a human-readable form of a more complex pattern or, if pattern is None, the literal is used in a spaCy PhraseMatcher by the MedspacyMatcher.

required
category str

The category for the match. Corresponds to ent.label_ for entities.

required
pattern Optional[Union[str, List[Dict[str, str]]]]

A list or string to use as a spaCy pattern rather than literal. If a list, will use spaCy token-based pattern matching to match using token attributes. If a string, will use medspaCy's RegexMatcher. If None, will use literal as the pattern for phrase matching. For more information, see https://spacy.io/usage/rule-based-matching.

None
on_match Optional[Callable[[Matcher, Doc, int, List[Tuple[int, int, int]]], Any]]

An optional callback function or other callable which takes 4 arguments: (matcher, doc, i, matches). For more information, see https://spacy.io/usage/rule-based-matching#on_match

None
metadata Optional[Dict[Any, Any]]

Optional dictionary of any extra metadata.

None
Source code in medspacy/common/base_rule.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
def __init__(
    self,
    literal: str,
    category: str,
    pattern: Optional[Union[str, List[Dict[str, str]]]] = None,
    on_match: Optional[Callable[[Matcher, Doc, int, List[Tuple[int, int, int]]], Any]] = None,
    metadata: Optional[Dict[Any, Any]] = None,
):
    """
    Store the definition of a medspaCy rule (base for rules such as TargetRule and ConTextRule).

    Args:
        literal: Plaintext form of the pattern. Either a human-readable description of a more complex pattern
            or, when `pattern` is None, the phrase used in a spaCy PhraseMatcher by the MedspacyMatcher.
        category: Category of the match. Corresponds to ent.label_ for entities.
        pattern: Optional spaCy pattern used instead of `literal`. A list selects spaCy token-based pattern
            matching on token attributes; a string selects medspaCy's RegexMatcher; None selects phrase
            matching on `literal`. See https://spacy.io/usage/rule-based-matching.
        on_match: Optional callable taking 4 arguments: `(matcher, doc, i, matches)`. See
            https://spacy.io/usage/rule-based-matching#on_match
        metadata: Optional dictionary of any extra metadata.
    """
    # What the rule matches and how it is labelled.
    self.literal = literal
    self.category = category
    self.pattern = pattern
    # Optional extras, stored exactly as given.
    self.on_match = on_match
    self.metadata = metadata

medspacy_matcher

MedspacyMatcher

MedspacyMatcher is a class which combines spaCy's Matcher and PhraseMatcher classes along with medspaCy's RegexMatcher and acts as one single matcher using 3 different types of rules: - Exact phrases - List of dictionaries for matching on token attributes (see https://spacy.io/usage/rule-based-matching#matcher) - Regular expression matches. Note that regular-expression matching is not natively supported by spaCy and could result in unexpected matched spans if match boundaries do not align with token boundaries. Rules can be defined by any class which inherits from medspacy.common.BaseRule, such as: medspacy.target_matcher.TargetRule medspacy.context.ConTextRule

Source code in medspacy/common/medspacy_matcher.py
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
class MedspacyMatcher:
    """
    MedspacyMatcher is a class which combines spaCy's Matcher and PhraseMatcher classes along with medspaCy's
    RegexMatcher and acts as one single matcher using 3 different types of rules:
        - Exact phrases
        - List of dictionaries for matching on token attributes (see https://spacy.io/usage/rule-based-matching#matcher)
        - Regular expression matches. Note that regular-expression matching is not natively supported by spaCy and could
                result in unexpected matched spans if match boundaries do not align with token boundaries.
    Rules can be defined by any class which inherits from medspacy.common.BaseRule, such as:
        medspacy.target_matcher.TargetRule
        medspacy.context.ConTextRule
    """

    # Default component name; an instance overrides it with the `name` passed to __init__.
    name = "medspacy_matcher"

    def __init__(
        self, nlp: Language, name: str = "medspacy_matcher", phrase_matcher_attr: str = "LOWER", prune: bool = True
    ):
        """
        Creates a MedspacyMatcher.

        Args:
            nlp: A spaCy Language model.
            name: The name of the component. Stored on the instance as `name`.
            phrase_matcher_attr: The attribute to use for spaCy's PhraseMatcher. Default is 'LOWER'.
            prune: Whether to prune matches that overlap or are substrings of another match. For example, if "no history
                of" and "history of" are both matches, setting prune to True would drop "history of". Default is True.
        """
        # Previously the `name` argument was accepted but silently dropped.
        self.name = name
        self.nlp = nlp.tokenizer  # preserve only the tokenizer for creating phrasematcher rules
        self._rule_ids = set()
        self._labels = set()
        self._rule_map = dict()
        self._prune = prune
        self.__matcher = Matcher(nlp.vocab)
        self.__phrase_matcher = PhraseMatcher(nlp.vocab, attr=phrase_matcher_attr)
        self.__regex_matcher = RegexMatcher(nlp.vocab)

        # Running counter that makes every generated rule id unique.
        self.__rule_count = 0
        self.__phrase_matcher_attr = phrase_matcher_attr

    @property
    def rules(self) -> List[BaseRule]:
        """
        The list of rules used by the MedspacyMatcher.

        Returns:
            A list of rules, all of which inherit from BaseRule.
        """
        return list(self._rule_map.values())

    @property
    def rule_map(self) -> Dict[str, BaseRule]:
        """
        The dictionary mapping a rule's id to the rule object.

        Returns:
            A dictionary mapping the rule's id to the rule.
        """
        return self._rule_map

    @property
    def labels(self) -> Set[str]:
        """
        The set of labels available to the matcher.

        Returns:
            A set of labels containing the labels for all the rules added to the matcher.
        """
        return self._labels

    def add(self, rules: Iterable[BaseRule]):
        """
        Adds a collection of rules to the matcher. Rules must inherit from `medspacy.common.BaseRule`.

        Each rule is validated before any matcher state is touched, so a rejected rule leaves no trace in
        `labels` or `rule_map`. Rules that precede the rejected one in `rules` remain registered.

        Args:
            rules: A collection of rules. Each rule must inherit from `medspacy.common.BaseRule`.

        Raises:
            TypeError: If a rule does not inherit from `medspacy.common.BaseRule`.
            ValueError: If a rule's pattern is neither None, a string, nor a list.
        """
        for rule in rules:
            if not isinstance(rule, BaseRule):
                raise TypeError("Rules must inherit from medspacy.common.BaseRule.")
            pattern = rule.pattern
            if pattern is not None and not isinstance(pattern, (str, list)):
                raise ValueError(
                    f"The pattern argument must be either a string or a list, not {type(rule.pattern)}"
                )

            self._labels.add(rule.category)
            rule_id = f"{rule.category}_{self.__rule_count}"
            rule._rule_id = rule_id
            self._rule_map[rule_id] = rule

            if isinstance(pattern, str):
                # A string pattern is a regular expression.
                self.__regex_matcher.add(rule_id, [pattern], rule.on_match)
            elif isinstance(pattern, list):
                # A list pattern is a spaCy token-attribute pattern.
                self.__matcher.add(rule_id, [pattern], on_match=rule.on_match)
            else:
                if self.__phrase_matcher_attr.lower() == "lower":
                    # only lowercase when the phrase matcher is looking for lowercase matches.
                    text = rule.literal.lower()
                else:
                    # otherwise, expect users to handle phrases as aligned with their non-default phrase matching scheme
                    # this prevents .lower() from blocking matches on attrs like ORTH or UPPER
                    text = rule.literal
                self.__phrase_matcher.add(
                    rule_id,
                    [self.nlp(text)],
                    on_match=rule.on_match,
                )
            self.__rule_count += 1

    def __call__(self, doc: Doc) -> List[Tuple[int, int, int]]:
        """
        Call MedspacyMatcher on a doc and return a single list of matches. If the matcher was created with
        prune=True, the combined matches are passed through `prune_overlapping_matches` before being returned.

        Args:
            doc: The spaCy Doc to process.

        Returns:
            A list of tuples, each containing 3 ints representing the individual match (match_id, start, end).
        """
        # Token-pattern matches first, then phrase matches, then regex matches.
        matches = self.__matcher(doc)
        matches += self.__phrase_matcher(doc)
        matches += self.__regex_matcher(doc)
        if self._prune:
            matches = prune_overlapping_matches(matches)
        return matches
labels property

The set of labels available to the matcher.

Returns:

Type Description
Set[str]

A set of labels containing the labels for all the rules added to the matcher.

rule_map property

The dictionary mapping a rule's id to the rule object.

Returns:

Type Description
Dict[str, BaseRule]

A dictionary mapping the rule's id to the rule.

rules property

The list of rules used by the MedspacyMatcher.

Returns:

Type Description
List[BaseRule]

A list of rules, all of which inherit from BaseRule.

__call__(doc)

Call MedspacyMatcher on a doc and return a single list of matches. If self.prune is True, in the case of overlapping matches the longest will be returned.

Parameters:

Name Type Description Default
doc Doc

The spaCy Doc to process.

required

Returns:

Type Description
List[Tuple[int, int, int]]

A list of tuples, each containing 3 ints representing the individual match (match_id, start, end).

Source code in medspacy/common/medspacy_matcher.py
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
def __call__(self, doc: Doc) -> List[Tuple[int, int, int]]:
    """
    Run all three underlying matchers on a doc and merge their results into one list. When pruning is
    enabled on this matcher, the merged list is passed through `prune_overlapping_matches` first.

    Args:
        doc: The spaCy Doc to process.

    Returns:
        A list of tuples, each containing 3 ints representing the individual match (match_id, start, end).
    """
    # Start from the token-pattern matches, then append phrase and regex matches in that order.
    found = self.__matcher(doc)
    found += self.__phrase_matcher(doc)
    found += self.__regex_matcher(doc)
    if not self._prune:
        return found
    return prune_overlapping_matches(found)
__init__(nlp, name='medspacy_matcher', phrase_matcher_attr='LOWER', prune=True)

Creates a MedspacyMatcher.

Parameters:

Name Type Description Default
nlp Language

A spaCy Language model.

required
name str

The name of the component.

'medspacy_matcher'
phrase_matcher_attr str

The attribute to use for spaCy's PhraseMatcher. Default is 'LOWER'.

'LOWER'
prune bool

Whether to prune matches that overlap or are substrings of another match. For example, if "no history of" and "history of" are both matches, setting prune to True would drop "history of". Default is True.

True
Source code in medspacy/common/medspacy_matcher.py
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
def __init__(
    self, nlp: Language, name: str = "medspacy_matcher", phrase_matcher_attr: str = "LOWER", prune: bool = True
):
    """
    Creates a MedspacyMatcher.

    Args:
        nlp: A spaCy Language model.
        name: The name of the component. Stored on the instance as `name`.
        phrase_matcher_attr: The attribute to use for spaCy's PhraseMatcher. Default is 'LOWER'.
        prune: Whether to prune matches that overlap or are substrings of another match. For example, if "no history
            of" and "history of" are both matches, setting prune to True would drop "history of". Default is True.
    """
    # Previously the `name` argument was accepted but silently dropped.
    self.name = name
    self.nlp = nlp.tokenizer  # preserve only the tokenizer for creating phrasematcher rules
    self._rule_ids = set()
    self._labels = set()
    self._rule_map = dict()
    self._prune = prune
    self.__matcher = Matcher(nlp.vocab)
    self.__phrase_matcher = PhraseMatcher(nlp.vocab, attr=phrase_matcher_attr)
    self.__regex_matcher = RegexMatcher(nlp.vocab)

    # Running counter that makes every generated rule id unique.
    self.__rule_count = 0
    self.__phrase_matcher_attr = phrase_matcher_attr
add(rules)

Adds a collection of rules to the matcher. Rules must inherit from medspacy.common.BaseRule.

Parameters:

Name Type Description Default
rules Iterable[BaseRule]

A collection of rules. Each rule must inherit from medspacy.common.BaseRule.

required
Source code in medspacy/common/medspacy_matcher.py
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
def add(self, rules: Iterable[BaseRule]):
    """
    Adds a collection of rules to the matcher. Rules must inherit from `medspacy.common.BaseRule`.

    Each rule is validated before any matcher state is touched, so a rejected rule leaves no trace in
    `labels` or `rule_map`. Rules that precede the rejected one in `rules` remain registered.

    Args:
        rules: A collection of rules. Each rule must inherit from `medspacy.common.BaseRule`.

    Raises:
        TypeError: If a rule does not inherit from `medspacy.common.BaseRule`.
        ValueError: If a rule's pattern is neither None, a string, nor a list.
    """
    for rule in rules:
        if not isinstance(rule, BaseRule):
            raise TypeError("Rules must inherit from medspacy.common.BaseRule.")
        pattern = rule.pattern
        if pattern is not None and not isinstance(pattern, (str, list)):
            raise ValueError(
                f"The pattern argument must be either a string or a list, not {type(rule.pattern)}"
            )

        self._labels.add(rule.category)
        rule_id = f"{rule.category}_{self.__rule_count}"
        rule._rule_id = rule_id
        self._rule_map[rule_id] = rule

        if isinstance(pattern, str):
            # A string pattern is a regular expression.
            self.__regex_matcher.add(rule_id, [pattern], rule.on_match)
        elif isinstance(pattern, list):
            # A list pattern is a spaCy token-attribute pattern.
            self.__matcher.add(rule_id, [pattern], on_match=rule.on_match)
        else:
            if self.__phrase_matcher_attr.lower() == "lower":
                # only lowercase when the phrase matcher is looking for lowercase matches.
                text = rule.literal.lower()
            else:
                # otherwise, expect users to handle phrases as aligned with their non-default phrase matching scheme
                # this prevents .lower() from blocking matches on attrs like ORTH or UPPER
                text = rule.literal
            self.__phrase_matcher.add(
                rule_id,
                [self.nlp(text)],
                on_match=rule.on_match,
            )
        self.__rule_count += 1

regex_matcher

RegexMatcher

The RegexMatcher is an alternative to spaCy's native Matcher and PhraseMatcher classes and allows matching based on typical regular expressions over the underlying doc text rather than spacy token attributes.

This can be useful for allowing more traditional text matching methods, but can lead to issues if the matched spans in the text do not line up with spacy token boundaries. In this case, the RegexMatcher will by default resolve to the nearest token boundaries by expanding to the left and right. This behavior can be configured using resolve_start and resolve_end. To avoid this, consider using a list of dicts, such as in a spacy Matcher. For more information, see: https://spacy.io/usage/rule-based-matching

Examples of resolve_start/resolve_end: In the string 'SERVICE: Radiology' the pattern 'ICE: Rad' would match in the middle of the tokens 'SERVICE' and 'Radiology'. SpaCy would normally return None. The RegexMatcher will expand in the following ways: resolve_start='left': The resulting span will start at 'SERVICE' -> 'SERVICE: Radiology' resolve_start='right': The resulting span will start at ':' -> ': Radiology' resolve_end='left': The resulting span will end at ':' -> 'SERVICE:' resolve_end='right': The resulting span will end at 'Radiology' -> 'SERVICE: Radiology'

Source code in medspacy/common/regex_matcher.py
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
class RegexMatcher:
    """
    The RegexMatcher is an alternative to spaCy's native Matcher and PhraseMatcher classes and allows matching based on
    typical regular expressions over the underlying doc text rather than spacy token attributes.

    This can be useful for allowing more traditional text matching methods, but can lead to issues if the matched spans
    in the text do not line up with spacy token boundaries. In this case, the RegexMatcher will by default resolve to
    the nearest token boundaries by expanding to the left and right. This behavior can be configured using
    `resolve_start` and `resolve_end`. To avoid this, consider using a list of dicts, such as in a spacy Matcher.
    For more information, see: https://spacy.io/usage/rule-based-matching

    Examples of resolve_start/resolve_end:
    In the string 'SERVICE: Radiology' the pattern 'ICE: Rad' would match in the middle of the tokens
    'SERVICE' and 'Radiology'. SpaCy would normally return None. The RegexMatcher will expand in the following ways:
    resolve_start='left': The resulting span will start at 'SERVICE' -> 'SERVICE: Radiology'
    resolve_start='right': The resulting span will start at ':' -> ': Radiology'
    resolve_end='left': The resulting span will end at ':' -> 'SERVICE:'
    resolve_end='right': The resulting span will end at 'Radiology' -> 'SERVICE: Radiology'

    """

    def __init__(
        self,
        vocab: Vocab,
        flags: re.RegexFlag = re.IGNORECASE,
        resolve_start: str = "left",
        resolve_end: str = "right",
    ):
        """
        Creates a new RegexMatcher.

        Args:
            vocab: A spaCy model vocabulary
            flags: Regular expression flag. Default re.IGNORECASE
            resolve_start: How to resolve if the start character index of a match does not align with spacy token
                boundaries. If 'left', will find the nearest token boundary to the left of the unmatched character
                index, leading to a longer than expected span. If 'right', will find the nearest token boundary to the
                right of the unmatched character index, leading to a shorter than expected span.  Default 'left'.
            resolve_end: How to resolve if the end character index of a match does not align with spacy token
                boundaries. If 'left', will find the nearest token boundary to the left of the unmatched character
                index, leading to a shorter than expected span. If 'right', will find the nearest token boundary to the
                right of the unmatched character index, leading to a longer than expected span. Default 'right'.
        """
        self.vocab = vocab
        self.flags = flags
        self.resolve_start = resolve_start
        self.resolve_end = resolve_end
        # Both dicts are keyed by the vocab's string-store value for the match id.
        self._patterns = {}
        self._callbacks = {}
        self.labels = set()
        self._rule_item_mapping = dict()

    def add(
        self,
        match_id: str,
        regex_rules: Iterable[str],
        on_match: Optional[
            Callable[[Matcher, Doc, int, List[Tuple[int, int, int]]], Any]
        ] = None,
    ):
        """
        Add a rule with one or more regex patterns to one match id.

        Patterns are compiled with the flags given at construction time and appended to any patterns already
        registered under `match_id`. Whenever at least one pattern is added, `on_match` replaces the callback
        previously stored for that id.

        Args:
            match_id: The name of the pattern.
            regex_rules: The list of regex strings to associate with `match_id`.
            on_match: An optional callback function or other callable which takes 4 arguments: `(matcher, doc, i,
                matches)`. For more information, see https://spacy.io/usage/rule-based-matching#on_match
        """
        if match_id not in self.vocab:
            self.vocab.strings.add(match_id)
        # Look the key up once instead of on every loop iteration.
        key = self.vocab.strings[match_id]
        compiled = self._patterns.setdefault(key, [])
        for pattern in regex_rules:
            compiled.append(re.compile(pattern, flags=self.flags))
            self._callbacks[key] = on_match

    def get(self, key):
        """Return the compiled patterns registered under `key`, or an empty list if there are none."""
        return self._patterns.get(self.vocab.strings[key], [])

    def __call__(self, doc: Doc) -> List[Tuple[int, int, int]]:
        """
        Call the RegexMatcher on a spaCy Doc.

        Every regex hit is converted to a token span. Hits that do not align with token boundaries are resolved
        with `get_token_for_char` according to `resolve_start` / `resolve_end`; hits that resolve to an empty
        span are dropped. The rule's `on_match` callback is invoked once for every match that is actually added.

        Args:
            doc: The spaCy doc to process.

        Returns:
            The list of match tuples (match_id, start, end).
        """
        matches = []
        text = doc.text_with_ws
        for (match_id, patterns) in self._patterns.items():
            for pattern in patterns:
                on_match = self._callbacks[match_id]
                for re_match in pattern.finditer(text):
                    span = doc.char_span(re_match.start(), re_match.end())
                    if span is None:
                        # NOTE(review): only a missing end token is handled below; if get_token_for_char can
                        # also return None for the start, `start.i` would raise. Confirm against the helper.
                        start = get_token_for_char(
                            doc, re_match.start(), resolve=self.resolve_start
                        )
                        end = get_token_for_char(
                            doc, re_match.end(), resolve=self.resolve_end
                        )
                        end_index = len(doc) if end is None else end.i
                        span = doc[start.i : end_index]
                    # If it's an empty span, then that means that the token resolution
                    # must have resulted in no tokens being included.
                    # Don't add the match, and don't fire the callback either: there is
                    # no new entry in `matches` for it to refer to.
                    if not len(span):
                        continue
                    matches.append((match_id, span.start, span.end))
                    # If a callback function was defined,
                    # call it according to the spaCy API:
                    # https://spacy.io/usage/rule-based-matching#on_match
                    if on_match is not None:
                        on_match(self, doc, len(matches) - 1, matches)

        return matches
__call__(doc)

Call the RegexMatcher on a spaCy Doc.

Parameters:

Name Type Description Default
doc Doc

The spaCy doc to process.

required

Returns:

Type Description
List[Tuple[int, int, int]]

The list of match tuples (match_id, start, end).

Source code in medspacy/common/regex_matcher.py
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
def __call__(self, doc: Doc) -> List[Tuple[int, int, int]]:
    """
    Call the RegexMatcher on a spaCy Doc.

    Every compiled pattern is run over `doc.text_with_ws`. A regex match whose character offsets line up with
    token boundaries is converted directly with `doc.char_span`; otherwise each offset is snapped to a token
    with `get_token_for_char` using `resolve_start` / `resolve_end`. Matches that resolve to an empty span are
    dropped and do not trigger the callback.

    Args:
        doc: The spaCy doc to process.

    Returns:
        The list of match tuples (match_id, start, end).
    """
    matches = []
    # The text does not change while matching, so read it once.
    text = doc.text_with_ws
    for match_id, patterns in self._patterns.items():
        # Look the callback up once per match id. `.get` tolerates ids that
        # have patterns registered but no callback entry.
        on_match = self._callbacks.get(match_id)
        for pattern in patterns:
            for re_match in pattern.finditer(text):
                span = doc.char_span(re_match.start(), re_match.end())
                if span is None:
                    start = get_token_for_char(
                        doc, re_match.start(), resolve=self.resolve_start
                    )
                    end = get_token_for_char(
                        doc, re_match.end(), resolve=self.resolve_end
                    )
                    # `None` means the offset resolved past the last token
                    # boundary, i.e. to the end of the doc.
                    start_index = len(doc) if start is None else start.i
                    end_index = len(doc) if end is None else end.i
                    span = doc[start_index:end_index]
                # If it's an empty span, then that means that the token resolution
                # must have resulted in no tokens being included.
                # Don't add the match, and don't notify the callback either:
                # there is no new entry in `matches` for it to point at.
                if not len(span):
                    continue
                matches.append((match_id, span.start, span.end))
                # If a callback function was defined,
                # call it according to the spaCy API:
                # https://spacy.io/usage/rule-based-matching#on_match
                if on_match is not None:
                    on_match(self, doc, len(matches) - 1, matches)

    return matches
__init__(vocab, flags=re.IGNORECASE, resolve_start='left', resolve_end='right')

Creates a new RegexMatcher.

Parameters:

Name Type Description Default
vocab Vocab

A spaCy model vocabulary

required
flags RegexFlag

Regular expression flag. Default re.IGNORECASE

IGNORECASE
resolve_start str

How to resolve if the start character index of a match does not align with spacy token boundaries. If 'left', will find the nearest token boundary to the left of the unmatched character index, leading to a longer than expected span. If 'right', will find the nearest token boundary to the right of the unmatched character index, leading to a shorter than expected span. Default 'left'.

'left'
resolve_end str

How to resolve if the end character index of a match does not align with spacy token boundaries. If 'left', will find the nearest token boundary to the left of the unmatched character index, leading to a shorter than expected span. If 'right', will find the nearest token boundary to the right of the unmatched character index, leading to a longer than expected span. Default 'right'.

'right'
Source code in medspacy/common/regex_matcher.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
def __init__(
    self,
    vocab: Vocab,
    flags: re.RegexFlag = re.IGNORECASE,
    resolve_start: str = "left",
    resolve_end: str = "right",
):
    """
    Set up an empty RegexMatcher.

    Args:
        vocab: A spaCy model vocabulary.
        flags: Regular expression flag(s) stored on the matcher. Default re.IGNORECASE.
        resolve_start: What to do when the first character of a regex match falls inside a token instead of on
            a token boundary. 'left' snaps to the token boundary on the left (the span grows); 'right' snaps to
            the boundary on the right (the span shrinks). Default 'left'.
        resolve_end: What to do when the last character of a regex match falls inside a token instead of on a
            token boundary. 'left' snaps to the token boundary on the left (the span shrinks); 'right' snaps to
            the boundary on the right (the span grows). Default 'right'.
    """
    # Caller-supplied configuration.
    self.vocab, self.flags = vocab, flags
    self.resolve_start, self.resolve_end = resolve_start, resolve_end

    # Bookkeeping containers; all start out empty.
    self.labels = set()
    self._patterns = dict()
    self._callbacks = dict()
    self._rule_item_mapping = {}
add(match_id, regex_rules, on_match=None)

Add a rule with one or more regex patterns to one match id.

Parameters:

Name Type Description Default
match_id str

The name of the pattern.

required
regex_rules Iterable[str]

The list of regex strings to associate with match_id.

required
on_match Optional[Callable[[Matcher, Doc, int, List[Tuple[int, int, int]]], Any]]

An optional callback function or other callable which takes 4 arguments: (matcher, doc, i, matches). For more information, see https://spacy.io/usage/rule-based-matching#on_match

None
Source code in medspacy/common/regex_matcher.py
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
def add(
    self,
    match_id: str,
    regex_rules: Iterable[str],
    on_match: Optional[
        Callable[[Matcher, Doc, int, List[Tuple[int, int, int]]], Any]
    ] = None,
):
    """
    Add a rule with one or more regex patterns to one match id.

    The patterns are compiled with the matcher's `flags` and appended to any patterns already registered for
    `match_id`. The callback for `match_id` is always (re)set to `on_match`, even when `regex_rules` is empty.
    If any pattern fails to compile, nothing is registered.

    Args:
        match_id: The name of the pattern.
        regex_rules: The list of regex strings to associate with `match_id`.
        on_match: An optional callback function or other callable which takes 4 arguments: `(matcher, doc, i,
            matches)`. For more information, see https://spacy.io/usage/rule-based-matching#on_match

    Raises:
        re.error: If one of the strings in `regex_rules` is not a valid regular expression.
    """
    # NOTE: a RuntimeWarning about regex patterns not being natively supported
    # by spaCy used to be considered here; it is intentionally not emitted.
    if match_id not in self.vocab:
        self.vocab.strings.add(match_id)
    # Resolve the string-store key once instead of on every use.
    key = self.vocab.strings[match_id]

    # Compile everything before touching internal state so that an invalid
    # pattern cannot leave a half-registered rule behind.
    compiled = [re.compile(pattern, flags=self.flags) for pattern in regex_rules]

    self._patterns.setdefault(key, []).extend(compiled)
    # Registered outside the pattern loop so the callback entry exists even
    # when `regex_rules` is empty.
    self._callbacks[key] = on_match

util

This module will contain helper functions and classes for common clinical processing tasks which will be used in medspaCy's matcher objects.

get_token_for_char(doc, char_idx, resolve='left')

Get the token index that best matches a particular character index. Because regex find returns a character index and spaCy matches must align with token boundaries, each character index must be converted into a token index.

Parameters:

Name Type Description Default
doc Doc

The spaCy Doc to search in.

required
char_idx int

The character index to find the corresponding token for.

required
resolve str

The resolution type. "left" snaps the character to the token on the left, which precedes char_idx. "right" snaps the character to the token on the right, which follows char_idx.

'left'

Returns:

Type Description
Union[Token, None]

The token that best fits the character index based on the resolution type.

Source code in medspacy/common/util.py
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
def get_token_for_char(
    doc: Doc, char_idx: int, resolve: str = "left"
) -> Union[Token, None]:
    """
    Get the token that best matches a particular character index. Because regex find returns a character index and
    spaCy matches must align with token boundaries, each character index must be converted into a token.

    Args:
        doc: The spaCy Doc to search in.
        char_idx: The character index to find the corresponding token for. Must lie between 0 and
            `len(doc.text_with_ws)` inclusive.
        resolve: The resolution type, used when `char_idx` is not the start offset of a token. "left" snaps to
            the token which precedes `char_idx`. "right" snaps to the token which follows `char_idx`.

    Returns:
        The token that best fits the character index based on the resolution type. Returns None when resolving
        to the right past the start of the final token (meaning "end of the doc"), or when resolving to the left
        in a doc that has no tokens.

    Raises:
        ValueError: If `char_idx` is negative or beyond the end of the text, or if a resolution is needed and
            `resolve` is neither "left" nor "right".
    """
    if char_idx < 0:
        # 0 is a valid index, so the bound is inclusive.
        raise ValueError("char_idx must be >= 0")
    text_length = len(doc.text_with_ws)
    if char_idx > text_length:
        raise ValueError(
            "char_idx {0} is out of range for text with length {1}".format(
                char_idx, text_length
            )
        )
    for i, token in enumerate(doc):
        if char_idx > token.idx:
            continue
        if char_idx == token.idx:
            return token
        # `token` is the first token starting after char_idx, so char_idx
        # falls somewhere between the start of doc[i - 1] and the start of doc[i].
        if resolve == "left":
            # Clamp at 0 so an index before the first token cannot wrap
            # around to the last token through doc[-1].
            return doc[max(i - 1, 0)]
        if resolve == "right":
            return doc[i]
        raise ValueError("resolve must be either 'left' or 'right'")
    # Otherwise, we've reached the end of the doc, so this must be the final token
    # If resolving to the left, return the final token (None if there are no tokens)
    # If resolving to the right, return None, meaning it should go to the end of the doc
    if resolve == "left":
        return doc[-1] if len(doc) else None
    if resolve == "right":
        return None
    # Same contract as inside the loop: an unknown resolution type is an error,
    # not a silent None.
    raise ValueError("resolve must be either 'left' or 'right'")

matches_to_spans(doc, matches, set_label=True)

Converts all identified matches to spans.

Parameters:

Name Type Description Default
doc Doc

The spaCy doc corresponding to the matches.

required
matches List[Tuple[int, int, int]]

The list of match Tuples (match_id, start, end).

required
set_label bool

Whether to assign a label to the span based off the source rule. Default is True.

True

Returns:

Type Description
List[Span]

A list of spacy spans corresponding to the input matches.

Source code in medspacy/common/util.py
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
def matches_to_spans(
    doc: Doc, matches: List[Tuple[int, int, int]], set_label: bool = True
) -> List[Span]:
    """
    Build one spaCy Span per match tuple.

    Args:
        doc: The spaCy doc the matches were found in.
        matches: Match tuples of the form (match_id, start, end).
        set_label: If True (the default), each span's label is the string that `doc.vocab.strings` holds for
            the match id. If False, the label passed to the span is None.

    Returns:
        The spans, in the same order as `matches`.
    """

    def label_for(rule_id):
        # Resolve the label lazily so the string store is only consulted
        # when labels were actually requested.
        return doc.vocab.strings[rule_id] if set_label else None

    return [
        Span(doc, start=start, end=end, label=label_for(rule_id))
        for rule_id, start, end in matches
    ]

overlaps(a, b)

Checks whether two match Tuples out of spacy matchers overlap.

Parameters:

Name Type Description Default
a Tuple[int, int, int]

A match Tuple (match_id, start, end).

required
b Tuple[int, int, int]

A match Tuple (match_id, start, end).

required

Returns:

Type Description
bool

Whether the tuples overlap.

Source code in medspacy/common/util.py
147
148
149
150
151
152
153
154
155
156
157
158
159
160
def overlaps(a: Tuple[int, int, int], b: Tuple[int, int, int]) -> bool:
    """
    Tell whether two spaCy matcher results cover overlapping token ranges.

    The match ids are ignored; only the (start, end) parts are handed to `tuple_overlaps`.

    Args:
        a: A match Tuple (match_id, start, end).
        b: A match Tuple (match_id, start, end).

    Returns:
        Whether the tuples overlap.
    """
    (_, start_a, end_a), (_, start_b, end_b) = a, b
    return tuple_overlaps((start_a, end_a), (start_b, end_b))

prune_overlapping_matches(matches, strategy='longest')

Prunes overlapping matches from a list of spaCy match tuples (match_id, start, end).

Parameters:

Name Type Description Default
matches List[Tuple[int, int, int]]

A list of match tuples of form (match_id, start, end).

required
strategy str

The pruning strategy to use. At this time, the only available option is "longest" and will keep the longest of any two overlapping spans. Other behavior will be added in a future update.

'longest'

Returns:

Type Description
List[Tuple[int, int, int]]

The pruned list of matches.

Source code in medspacy/common/util.py
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
def prune_overlapping_matches(
    matches: List[Tuple[int, int, int]], strategy: str = "longest"
) -> List[Tuple[int, int, int]]:
    """
    Prunes overlapping matches from a list of spaCy match tuples (match_id, start, end).

    The matches are sorted by (start, end) and walked pairwise: when two neighbours overlap, only the longer
    one is kept (the earlier one wins a tie) and the walk resumes after the pair. The pass is repeated on its
    own output until a pass removes nothing.

    Args:
        matches: A list of match tuples of form (match_id, start, end).
        strategy: The pruning strategy to use. At this time, the only available option is "longest" and will keep the
            longest of any two overlapping spans. Other behavior will be added in a future update.

    Returns:
        The pruned list of matches, sorted by (start, end). An empty input is returned unchanged.

    Raises:
        NotImplementedError: If `strategy` is anything other than "longest".
    """
    if strategy != "longest":
        raise NotImplementedError(
            "No other filtering strategy has been implemented. Coming in a future update."
        )

    if len(matches) == 0:
        return matches

    def span_length(match):
        return match[2] - match[1]

    # Repeat the pairwise pass until it reaches a fixpoint. This is a loop
    # rather than recursion so that many passes cannot exhaust the call stack.
    remaining = matches
    while True:
        ordered = sorted(remaining, key=lambda m: (m[1], m[2]))
        kept = []
        total = len(ordered)
        # Walk by index; popping from the front of a list is O(n) per pop.
        pos = 0
        while pos < total:
            current = ordered[pos]
            if pos + 1 == total:
                # Last element with no neighbour left to compare against.
                kept.append(current)
                break
            following = ordered[pos + 1]
            if overlaps(current, following):
                # Keep the longer span and skip past both members of the pair.
                kept.append(max(current, following, key=span_length))
                pos += 2
            else:
                kept.append(current)
                pos += 1

        # Nothing was removed in this pass, so the result is stable.
        if len(kept) == len(remaining):
            return kept
        remaining = kept

span_contains(span, target, regex=True, case_insensitive=True)

Return True if a Span object contains a target phrase.

Parameters:

Name Type Description Default
span Union[Doc, Span]

A spaCy Doc or Span, such as an entity in doc.ents

required
target str

A target phrase or iterable of phrases to check in span.text.lower().

required
regex bool

Whether to search the span using a regular expression rather than a literal string. Default is True.

True
case_insensitive bool

Whether the matching is case-insensitive. Default is True.

True
Source code in medspacy/common/util.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
def span_contains(
    span: Union[Doc, Span],
    target: str,
    regex: bool = True,
    case_insensitive: bool = True,
) -> bool:
    """
    Check whether the text of a Doc or Span contains a target phrase.

    Args:
        span: A spaCy Doc or Span, such as an entity in doc.ents. Only its `text` is inspected.
        target: A single phrase, or an iterable of phrases; with an iterable, one hit is enough.
        regex: When exactly True (the default), each phrase is treated as a regular expression and searched
            with `re.search`. Otherwise each phrase is looked up as a literal substring.
        case_insensitive: Whether the matching is case-insensitive. Default is True.

    Returns:
        True if the phrase (or any of the phrases) is found in `span.text`, False otherwise.
    """
    if regex is True:
        re_flags = re.IGNORECASE if case_insensitive else 0

        def is_present(candidate):
            return re.search(candidate, span.text, flags=re_flags) is not None

    elif case_insensitive:

        def is_present(candidate):
            return candidate.lower() in span.text.lower()

    else:

        def is_present(candidate):
            return candidate in span.text

    # A lone string is a single phrase, not an iterable of characters.
    if isinstance(target, str):
        return is_present(target)

    return any(is_present(candidate) for candidate in target)

context

ConText

The ConText for spaCy processing.

This component matches modifiers in a Doc, defines their scope, and identifies edges between targets and modifiers. It sets two spaCy extensions: Span._.modifiers, a list of ConTextModifier objects which modify a target Span; and Doc._.context_graph, a ConText graph object which contains the targets, modifiers, and edges between them.

Source code in medspacy/context/context.py
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
@Language.factory("medspacy_context")
class ConText:
    """
    The ConText for spaCy processing.

    This component matches modifiers in a Doc, defines their scope, and identifies edges between targets and modifiers.
    Sets two spaCy extensions:
            - Span._.modifiers: a tuple of ConTextModifier objects which modify a target Span
            - Doc._.context_graph: a ConText graph object which contains the targets,
                modifiers, and edges between them.
    """

    def __init__(
        self,
        nlp: Language,
        name: str = "medspacy_context",
        rules: Optional[str] = "default",
        language_code: str = 'en',
        phrase_matcher_attr: str = "LOWER",
        allowed_types: Optional[Set[str]] = None,
        excluded_types: Optional[Set[str]] = None,
        terminating_types: Optional[Dict[str, Iterable[str]]] = None,
        max_scope: Optional[int] = None,
        max_targets: Optional[int] = None,
        prune_on_modifier_overlap: bool = True,
        prune_on_target_overlap: bool = False,
        span_attrs: Union[
            Literal["default"], Dict[str, Dict[str, Any]], None
        ] = "default",
        input_span_type: Union[Literal["ents", "group"]] = "ents",
        span_group_name: str = "medspacy_spans",
    ):
        """
        Creates a new ConText object.

        Args:
            nlp: A SpaCy Language object.
            name: The name of the component.
            rules: The rules to load. Default is "default", loads rules packaged with medspaCy that are derived from
                original ConText rules and years of practical applications at the US Department of Veterans Affairs.  If
                None, no rules are loaded. Otherwise, must be a path to a json file containing rules. Add ConTextRules
                directly through `ConText.add`.
            language_code: Language code to use (ISO code) as a default for loading resources.  See documentation
                and also the /resources directory to see which resources might be available in each language.
                Default is "en" for English.
            phrase_matcher_attr: The token attribute to use for PhraseMatcher for rules where `pattern` is None. Default
                is 'LOWER'.
            allowed_types: A global list of types included by context. Rules will operate on only spans with these
                labels.
            excluded_types: A global list of types excluded by context. Rules will not operate on spans with these
                labels.
            terminating_types: A global map of types to the types that can terminate them. This can be used to apply
                terminations to all rules of a particular type rather than adding to every rule individually in the
                ContextRule object.
            max_scope: The number of tokens around a modifier in a target can be modified. Default value is None,
                Context will use the sentence boundaries. If a value greater than zero, applies the window globally.
                Both options will be overridden by a more specific value in a ContextRule.
            max_targets: The maximum number of targets a modifier can modify. Default value is None, context will modify
                all targets in its scope. If a value greater than zero, applies this value globally. Both options will
                be overridden by a more specific value in a ContextRule.
            prune_on_modifier_overlap: Whether to prune modifiers which are substrings of another modifier. If True,
                will drop substrings completely. For example, if "no history of"  and "history of" are both
                ConTextRules,both will match the text "no history of afib", but only "no  history of" should modify
                afib. Default True.
            prune_on_target_overlap: Whether to remove any matched modifiers which overlap with target entities. If
                False, any overlapping modifiers will not modify the overlapping entity but will still modify any other
                targets in its scope. Default False.
            span_attrs: The optional span attributes to modify. Default option "default" uses attributes in
                `DEFAULT_ATTRIBUTES`. If a dictionary, format is mapping context modifier categories to a dictionary
                containing the attribute name and the value to set the attribute to when a  span is modified by a
                modifier of that category. If None, no attributes will be modified.
            input_span_type: "ents" or "group". Where to look for targets. "ents" will modify attributes of spans
                in doc.ents. "group" will modify attributes of spans in the span group specified by `span_group_name`.
            span_group_name: The name of the span group used when `input_span_type` is "group". Default is
                "medspacy_spans".

        Raises:
            ValueError: If `max_scope` is given but is not an integer greater than 0, if a custom `span_attrs`
                names a Span extension that has not been registered, or if `input_span_type` /
                `span_group_name` fail the checks in their property setters.
        """
        self.nlp = nlp
        self.name = name
        self.prune_on_modifier_overlap = prune_on_modifier_overlap
        self.prune_on_target_overlap = prune_on_target_overlap
        # These two go through the validating property setters below.
        self.input_span_type = input_span_type
        self.span_group_name = span_group_name
        self.context_attributes_mapping = None

        # <three directories above this file>/resources/<language_code>/context_rules.json
        self.DEFAULT_RULES_FILEPATH = path.join(
            Path(__file__).resolve().parents[2], "resources", language_code.lower(), "context_rules.json"
        )

        self.__matcher = MedspacyMatcher(
            nlp,
            name=name,
            phrase_matcher_attr=phrase_matcher_attr,
            prune=prune_on_modifier_overlap,
        )

        if span_attrs == "default":
            self.context_attributes_mapping = DEFAULT_ATTRIBUTES
            self.register_default_attributes()
        elif span_attrs:
            # Custom attributes must already exist as Span extensions; fail
            # early instead of at the first document.
            for _, attr_dict in span_attrs.items():
                for attr_name in attr_dict.keys():
                    if not Span.has_extension(attr_name):
                        raise ValueError(
                            f"Custom extension {attr_name} has not been set. Please ensure Span.set_extension is "
                            f"called for your pipeline's custom extensions."
                        )
            self.context_attributes_mapping = span_attrs

        self.register_graph_attributes()

        if max_scope is not None:
            if not (isinstance(max_scope, int) and max_scope > 0):
                raise ValueError(
                    f"'max_scope' must be an integer greater than 0, not the current value: {max_scope}"
                )
        self.max_scope = max_scope

        self.allowed_types = allowed_types
        self.excluded_types = excluded_types
        self.max_targets = max_targets

        # Categories are compared upper-cased in `add`, so normalise the keys here.
        self.terminating_types = dict()
        if terminating_types:
            self.terminating_types = {
                k.upper(): v for (k, v) in terminating_types.items()
            }

        rule_path = None
        if rules == "default":
            rule_path = self.DEFAULT_RULES_FILEPATH
        else:
            rule_path = rules

        # A falsy `rules` (e.g. None) means "start without any rules".
        if rule_path:
            self.add(ConTextRule.from_json(rule_path))

    @property
    def rules(self):
        """
        Returns list of ConTextRules available to context.
        """
        return self.__matcher.rules

    @property
    def categories(self):
        """
        Returns list of categories available that Context might produce.
        """
        return self.__matcher.labels

    @property
    def input_span_type(self):
        """
        The input source of entities for the component. Must be either "ents" corresponding to doc.ents or "group" for
        a spaCy span group.

        Returns:
            The input type, "ents" or "group".
        """
        return self._input_span_type

    @input_span_type.setter
    def input_span_type(self, val):
        if not (val == "ents" or val == "group"):
            raise ValueError('input_type must be "ents" or "group".')
        self._input_span_type = val

    @property
    def span_group_name(self) -> str:
        """
        The name of the span group used by this component. If `input_span_type` is "group", calling this component
        will use spans in the span group with this name.

        Returns:
            The span group name.
        """
        return self._span_group_name

    @span_group_name.setter
    def span_group_name(self, name: str):
        if not name or not isinstance(name, str):
            raise ValueError("Span group name must be a string.")
        self._span_group_name = name

    def add(self, rules):
        """
        Adds ConTextRules to Context.

        Global settings (`allowed_types`, `excluded_types`, `max_scope`, `max_targets`) are copied onto every rule
        that does not define its own value, and the global `terminating_types` for the rule's category are added
        to the rule's `terminated_by` set, before the rules are handed to the matcher.

        Args:
            rules: A single ConTextRule or a collection of ConTextRules to add to ConText.

        Raises:
            TypeError: If any item in `rules` is not a ConTextRule.
        """
        if isinstance(rules, ConTextRule):
            rules = [rules]
        else:
            # `rules` is walked here and again by the matcher; materialise it
            # so a one-shot iterable (e.g. a generator) is not already
            # exhausted when the matcher receives it.
            rules = list(rules)
        for rule in rules:
            if not isinstance(rule, ConTextRule):
                raise TypeError(f"Rules must be of type ConTextRule, not {type(rule)}.")

            # If global attributes like allowed_types and max_scope are defined,
            # check if the ConTextRule has them defined. If not, set to the global
            for attr in (
                "allowed_types",
                "excluded_types",
                "max_scope",
                "max_targets",
            ):
                value = getattr(self, attr)
                if value is None:  # No global value set
                    continue
                if (
                    getattr(rule, attr) is None
                ):  # If the direction itself has it defined, don't override
                    setattr(rule, attr, value)

            # Check custom termination points
            if rule.category.upper() in self.terminating_types:
                for other_modifier in self.terminating_types[rule.category.upper()]:
                    rule.terminated_by.add(other_modifier.upper())

        self.__matcher.add(rules)

    @classmethod
    def register_graph_attributes(cls):
        """
        Registers spaCy attribute extensions: Span._.modifiers and Doc._.context_graph.
        """
        try:
            Span.set_extension("modifiers", default=(), force=True)
            Doc.set_extension("context_graph", default=None, force=True)
        except ValueError:  # Extension already set
            pass

    @classmethod
    def register_default_attributes(cls):
        """
        Registers the default values for the Span attributes defined in `DEFAULT_ATTRIBUTES`.
        """
        for attr_name in [
            "is_negated",
            "is_uncertain",
            "is_historical",
            "is_hypothetical",
            "is_family",
        ]:
            try:
                Span.set_extension(attr_name, default=False)
            except ValueError:  # Extension already set
                pass

    def set_context_attributes(self, edges):
        """
        Adds Span-level attributes to targets with modifiers.

        Args:
            edges: The edges of the ContextGraph to modify, as (target, modifier) pairs.
        """
        for (target, modifier) in edges:
            if modifier.category in self.context_attributes_mapping:
                attr_dict = self.context_attributes_mapping[modifier.category]
                for attr_name, attr_value in attr_dict.items():
                    setattr(target._, attr_name, attr_value)

    def __call__(self, doc, targets: Optional[str] = None) -> Doc:
        """
        Applies the ConText algorithm to a Doc.

        Args:
            doc: The spaCy Doc to process.
            targets: The optional custom attribute extension on doc to run over. Must contain an iterable of Span
                objects. When omitted, targets come from `doc.ents` or from the configured span group, depending
                on `input_span_type`.

        Returns:
            The processed spaCy Doc.
        """
        if not targets and self.input_span_type == "ents":
            targets = doc.ents
        elif not targets and self.input_span_type == "group":
            targets = doc.spans[self.span_group_name]
        elif targets:
            targets = getattr(doc._, targets)
        # Store data in ConTextGraph object
        # TODO: move some of this over to ConTextGraph
        # NOTE(review): the graph's keyword is called `prune_on_modifier_overlap`
        # but receives `prune_on_target_overlap`; confirm against ConTextGraph
        # that this pairing is intended.
        context_graph = ConTextGraph(
            prune_on_modifier_overlap=self.prune_on_target_overlap
        )

        context_graph.targets = targets

        context_graph.modifiers = []
        matches = self.__matcher(doc)

        for (match_id, start, end) in matches:
            # Get the ConTextRule object defining this modifier
            rule = self.__matcher.rule_map[self.nlp.vocab[match_id].text]
            modifier = ConTextModifier(rule, start, end, doc, max_scope=self.max_scope)
            context_graph.modifiers.append(modifier)

        context_graph.update_scopes()
        context_graph.apply_modifiers()

        # Link targets to their modifiers
        for target, modifier in context_graph.edges:
            target._.modifiers += (modifier,)

        # If attributes need to be modified
        if self.context_attributes_mapping:
            self.set_context_attributes(context_graph.edges)

        doc._.context_graph = context_graph

        return doc

categories property

Returns list of categories available that Context might produce.

input_span_type property writable

The input source of entities for the component. Must be either "ents" corresponding to doc.ents or "group" for a spaCy span group.

Returns:

Type Description

The input type, "ents" or "group".

rules property

Returns list of ConTextRules available to context.

span_group_name property writable

The name of the span group used by this component. If input_span_type is "group", calling this component will use spans in the span group with this name.

Returns:

Type Description
str

The span group name.

__call__(doc, targets=None)

Applies the ConText algorithm to a Doc.

Parameters:

Name Type Description Default
doc

The spaCy Doc to process.

required
targets str

The optional custom attribute extension on doc to run over. Must contain an iterable of Span objects

None

Returns:

Type Description
Doc

The processed spaCy Doc.

Source code in medspacy/context/context.py
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
def __call__(self, doc, targets: str = None) -> Doc:
    """
    Run the ConText algorithm over a Doc and record which modifiers apply to which target spans.

    Args:
        doc: The spaCy Doc to process.
        targets: Optional name of a custom extension attribute on `doc._` that holds an iterable of Span
            objects to use as targets. When omitted, targets are taken from `doc.ents` or from the span group
            named by `span_group_name`, depending on `input_span_type`.

    Returns:
        The same Doc, with `doc._.context_graph` set and `_.modifiers` extended on every modified target.
    """
    # An explicitly named extension wins; otherwise fall back to the configured source of spans.
    if targets:
        targets = getattr(doc._, targets)
    elif self.input_span_type == "ents":
        targets = doc.ents
    elif self.input_span_type == "group":
        targets = doc.spans[self.span_group_name]

    # Store data in ConTextGraph object
    # TODO: move some of this over to ConTextGraph
    # NOTE: the graph's `prune_on_modifier_overlap` flag controls dropping modifiers that overlap a target,
    # which is why it receives this component's `prune_on_target_overlap` setting.
    graph = ConTextGraph(prune_on_modifier_overlap=self.prune_on_target_overlap)
    graph.targets = targets

    # One ConTextModifier per matcher hit; the match id resolves (via the vocab) to the name of the
    # ConTextRule that produced the match.
    graph.modifiers = [
        ConTextModifier(
            self.__matcher.rule_map[self.nlp.vocab[match_id].text],
            start,
            end,
            doc,
            max_scope=self.max_scope,
        )
        for match_id, start, end in self.__matcher(doc)
    ]

    graph.update_scopes()
    graph.apply_modifiers()

    # Link targets to their modifiers
    for span, context_modifier in graph.edges:
        span._.modifiers += (context_modifier,)

    # Only touch span attributes when an attribute mapping has been configured
    if self.context_attributes_mapping:
        self.set_context_attributes(graph.edges)

    doc._.context_graph = graph
    return doc

__init__(nlp, name='medspacy_context', rules='default', language_code='en', phrase_matcher_attr='LOWER', allowed_types=None, excluded_types=None, terminating_types=None, max_scope=None, max_targets=None, prune_on_modifier_overlap=True, prune_on_target_overlap=False, span_attrs='default', input_span_type='ents', span_group_name='medspacy_spans')

Creates a new ConText object.

Parameters:

Name Type Description Default
nlp Language

A SpaCy Language object.

required
name str

The name of the component.

'medspacy_context'
rules Optional[str]

The rules to load. Default is "default", loads rules packaged with medspaCy that are derived from original ConText rules and years of practical applications at the US Department of Veterans Affairs. If None, no rules are loaded. Otherwise, must be a path to a json file containing rules. Add ConTextRules directly through ConText.add.

'default'
language_code str

Language code to use (ISO code) as a default for loading resources. See documentation and also the /resources directory to see which resources might be available in each language. Default is "en" for English.

'en'
phrase_matcher_attr str

The token attribute to use for PhraseMatcher for rules where pattern is None. Default is 'LOWER'.

'LOWER'
allowed_types Optional[Set[str]]

A global list of types included by context. Rules will operate on only spans with these labels.

None
excluded_types Optional[Set[str]]

A global list of types excluded by context. Rules will not operate on spans with these labels.

None
terminating_types Optional[Dict[str, Iterable[str]]]

A global map of types to the types that can terminate them. This can be used to apply terminations to all rules of a particular type rather than adding to every rule individually in the ContextRule object.

None
max_scope Optional[int]

The number of tokens around a modifier within which a target can be modified. Default value is None, in which case ConText uses the sentence boundaries. If a value greater than zero, applies the window globally. Both options will be overridden by a more specific value in a ConTextRule.

None
max_targets Optional[int]

The maximum number of targets a modifier can modify. Default value is None, context will modify all targets in its scope. If a value greater than zero, applies this value globally. Both options will be overridden by a more specific value in a ContextRule.

None
prune_on_modifier_overlap bool

Whether to prune modifiers which are substrings of another modifier. If True, will drop substrings completely. For example, if "no history of" and "history of" are both ConTextRules, both will match the text "no history of afib", but only "no history of" should modify afib. Default True.

True
prune_on_target_overlap bool

Whether to remove any matched modifiers which overlap with target entities. If False, any overlapping modifiers will not modify the overlapping entity but will still modify any other targets in its scope. Default False.

False
span_attrs Union[Literal['default'], Dict[str, Dict[str, Any]], None]

The optional span attributes to modify. Default option "default" uses attributes in DEFAULT_ATTRIBUTES. If a dictionary, format is mapping context modifier categories to a dictionary containing the attribute name and the value to set the attribute to when a span is modified by a modifier of that category. If None, no attributes will be modified.

'default'
input_span_type Union[Literal['ents', 'group']]

"ents" or "group". Where to look for targets. "ents" will modify attributes of spans in doc.ents. "group" will modify attributes of spans in the span group specified by span_group_name.

'ents'
span_group_name str

The name of the span group used when input_span_type is "group". Default is "medspacy_spans".

'medspacy_spans'
Source code in medspacy/context/context.py
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
def __init__(
    self,
    nlp: Language,
    name: str = "medspacy_context",
    rules: Optional[str] = "default",
    language_code: str = 'en',
    phrase_matcher_attr: str = "LOWER",
    allowed_types: Optional[Set[str]] = None,
    excluded_types: Optional[Set[str]] = None,
    terminating_types: Optional[Dict[str, Iterable[str]]] = None,
    max_scope: Optional[int] = None,
    max_targets: Optional[int] = None,
    prune_on_modifier_overlap: bool = True,
    prune_on_target_overlap: bool = False,
    span_attrs: Union[
        Literal["default"], Dict[str, Dict[str, Any]], None
    ] = "default",
    input_span_type: Union[Literal["ents", "group"]] = "ents",
    span_group_name: str = "medspacy_spans",
):
    """
    Creates a new ConText object.

    Args:
        nlp: A SpaCy Language object.
        name: The name of the component.
        rules: The rules to load. Default is "default", loads rules packaged with medspaCy that are derived from
            original ConText rules and years of practical applications at the US Department of Veterans Affairs.
            If None, no rules are loaded. Otherwise, must be a path to a json file containing rules. Add
            ConTextRules directly through `ConText.add`.
        language_code: Language code to use (ISO code) as a default for loading resources. See documentation
            and also the /resources directory to see which resources might be available in each language.
            Default is "en" for English.
        phrase_matcher_attr: The token attribute to use for PhraseMatcher for rules where `pattern` is None.
            Default is 'LOWER'.
        allowed_types: A global list of types included by context. Rules will operate on only spans with these
            labels.
        excluded_types: A global list of types excluded by context. Rules will not operate on spans with these
            labels.
        terminating_types: A global map of types to the types that can terminate them. This can be used to apply
            terminations to all rules of a particular type rather than adding to every rule individually in the
            ConTextRule object. Keys are upper-cased when stored.
        max_scope: The number of tokens around a modifier within which a target can be modified. Default value
            is None, in which case ConText uses the sentence boundaries. If given, must be an integer greater
            than zero and is applied globally. Both options will be overridden by a more specific value in a
            ConTextRule.
        max_targets: The maximum number of targets a modifier can modify. Default value is None, context will
            modify all targets in its scope. If a value greater than zero, applies this value globally. Both
            options will be overridden by a more specific value in a ConTextRule.
        prune_on_modifier_overlap: Whether to prune modifiers which are substrings of another modifier. If True,
            will drop substrings completely. For example, if "no history of" and "history of" are both
            ConTextRules, both will match the text "no history of afib", but only "no history of" should modify
            afib. Default True.
        prune_on_target_overlap: Whether to remove any matched modifiers which overlap with target entities. If
            False, any overlapping modifiers will not modify the overlapping entity but will still modify any
            other targets in its scope. Default False.
        span_attrs: The optional span attributes to modify. Default option "default" uses attributes in
            `DEFAULT_ATTRIBUTES`. If a dictionary, format is mapping context modifier categories to a dictionary
            containing the attribute name and the value to set the attribute to when a span is modified by a
            modifier of that category. If None, no attributes will be modified.
        input_span_type: "ents" or "group". Where to look for targets. "ents" will modify attributes of spans
            in doc.ents. "group" will modify attributes of spans in the span group specified by
            `span_group_name`.
        span_group_name: The name of the span group used when `input_span_type` is "group". Default is
            "medspacy_spans".

    Raises:
        ValueError: If `span_attrs` names a Span extension that has not been registered, or if `max_scope` is
            given but is not an integer greater than 0.
    """
    self.nlp = nlp
    self.name = name
    self.prune_on_modifier_overlap = prune_on_modifier_overlap
    self.prune_on_target_overlap = prune_on_target_overlap
    self.input_span_type = input_span_type
    self.span_group_name = span_group_name
    self.context_attributes_mapping = None

    # Packaged rules live under <two directories above this file's directory>/resources/<language>/.
    self.DEFAULT_RULES_FILEPATH = path.join(
        Path(__file__).resolve().parents[2], "resources", language_code.lower(), "context_rules.json"
    )

    # Modifier-vs-modifier pruning is delegated to the matcher.
    self.__matcher = MedspacyMatcher(
        nlp,
        name=name,
        phrase_matcher_attr=phrase_matcher_attr,
        prune=prune_on_modifier_overlap,
    )

    if span_attrs == "default":
        self.context_attributes_mapping = DEFAULT_ATTRIBUTES
        self.register_default_attributes()
    elif span_attrs:
        # A custom mapping may only reference Span extensions the caller has already registered.
        for attr_dict in span_attrs.values():
            for attr_name in attr_dict:
                if not Span.has_extension(attr_name):
                    raise ValueError(
                        f"Custom extension {attr_name} has not been set. Please ensure Span.set_extension is "
                        f"called for your pipeline's custom extensions."
                    )
        self.context_attributes_mapping = span_attrs

    self.register_graph_attributes()

    if max_scope is not None and not (isinstance(max_scope, int) and max_scope > 0):
        raise ValueError(
            f"'max_scope' must be an integer greater than 0, not the current value: {max_scope}"
        )
    self.max_scope = max_scope

    self.allowed_types = allowed_types
    self.excluded_types = excluded_types
    self.max_targets = max_targets

    # Category names are compared upper-cased in `add`, so normalize the keys here.
    self.terminating_types = dict()
    if terminating_types:
        self.terminating_types = {
            k.upper(): v for (k, v) in terminating_types.items()
        }

    # "default" selects the packaged rule file; any other truthy value is used as a path; None loads nothing.
    rule_path = self.DEFAULT_RULES_FILEPATH if rules == "default" else rules
    if rule_path:
        self.add(ConTextRule.from_json(rule_path))

add(rules)

Adds ConTextRules to Context.

Parameters:

Name Type Description Default
rules

A single ConTextRule or a collection of ConTextRules to add to ConText.

required
Source code in medspacy/context/context.py
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
def add(self, rules):
    """
    Adds ConTextRules to ConText.

    Before the rules are handed to the matcher, each rule inherits any global setting (`allowed_types`,
    `excluded_types`, `max_scope`, `max_targets`) that it does not define itself, and has the globally
    configured terminating categories for its category added to `terminated_by`.

    Args:
        rules: A single ConTextRule or an iterable of ConTextRules to add.

    Raises:
        TypeError: If any item in `rules` is not a ConTextRule.
    """
    if isinstance(rules, ConTextRule):
        rules = [rules]
    else:
        # Materialize once: the rules are walked here and then passed on to the matcher, so a one-shot
        # iterable (e.g. a generator) would otherwise arrive at the matcher already exhausted.
        rules = list(rules)

    for rule in rules:
        if not isinstance(rule, ConTextRule):
            raise TypeError(f"Rules must be of type ConTextRule, not {type(rule)}.")

        # If global attributes like allowed_types and max_scope are defined,
        # check if the ConTextRule has them defined. If not, set to the global
        for attr in (
            "allowed_types",
            "excluded_types",
            "max_scope",
            "max_targets",
        ):
            value = getattr(self, attr)
            if value is None:  # No global value set
                continue
            # A value defined on the rule itself takes precedence over the global one
            if getattr(rule, attr) is None:
                setattr(rule, attr, value)

        # Check custom termination points
        category = rule.category.upper()
        if category in self.terminating_types:
            for other_modifier in self.terminating_types[category]:
                rule.terminated_by.add(other_modifier.upper())

    self.__matcher.add(rules)

register_default_attributes() classmethod

Registers the default values for the Span attributes defined in DEFAULT_ATTRIBUTES.

Source code in medspacy/context/context.py
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
@classmethod
def register_default_attributes(cls):
    """
    Register the default assertion attributes as Span extensions, each defaulting to False.

    Attributes that are already registered are left as they are.
    """
    default_flags = (
        "is_negated",
        "is_uncertain",
        "is_historical",
        "is_hypothetical",
        "is_family",
    )
    for flag in default_flags:
        try:
            Span.set_extension(flag, default=False)
        except ValueError:
            # Extension already set; keep the existing registration
            continue

register_graph_attributes() classmethod

Registers spaCy attribute extensions: Span._.modifiers and Doc._.context_graph.

Source code in medspacy/context/context.py
245
246
247
248
249
250
251
252
253
254
@classmethod
def register_graph_attributes(cls):
    """
    Register the extensions ConText writes its results to: `Span._.modifiers` (default: empty tuple) and
    `Doc._.context_graph` (default: None). Both are registered with `force=True`.
    """
    graph_extensions = (
        (Span, "modifiers", ()),
        (Doc, "context_graph", None),
    )
    try:
        for owner, ext_name, ext_default in graph_extensions:
            owner.set_extension(ext_name, default=ext_default, force=True)
    except ValueError:  # Extension already set
        pass

set_context_attributes(edges)

Adds Span-level attributes to targets with modifiers.

Parameters:

Name Type Description Default
edges

The edges of the ContextGraph to modify.

required
Source code in medspacy/context/context.py
273
274
275
276
277
278
279
280
281
282
283
284
def set_context_attributes(self, edges):
    """
    Sets Span-level attributes on every target whose modifier category has a configured mapping.

    Args:
        edges: The (target, modifier) edges of the ConTextGraph to apply.
    """
    mapping = self.context_attributes_mapping
    for span, context_modifier in edges:
        category = context_modifier.category
        if category not in mapping:
            # No attributes are configured for this modifier category
            continue
        for name, value in mapping[category].items():
            setattr(span._, name, value)

ConTextGraph

The ConTextGraph class defines the internal structure of the ConText algorithm. It stores a collection of modifiers, matched with ConTextRules, and targets from some other source such as the TargetMatcher or a spaCy NER model.

Each modifier can have some number of associated targets that it modifies. This relationship is stored as edges of the graph.

Source code in medspacy/context/context_graph.py
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
class ConTextGraph:
    """
    The ConTextGraph class defines the internal structure of the ConText algorithm. It stores a collection of modifiers,
    matched with ConTextRules, and targets from some other source such as the TargetMatcher or a spaCy NER model.

    Each modifier can have some number of associated targets that it modifies. This relationship is stored as edges
    of the graph, each edge being a (target, modifier) pair.
    """

    def __init__(
        self,
        targets: Optional[List[Span]] = None,
        modifiers: Optional[List[ConTextModifier]] = None,
        edges: Optional[List] = None,
        prune_on_modifier_overlap: bool = False,
    ):
        """
        Creates a new ConTextGraph object.

        Args:
            targets: The spans that context might modify. Defaults to an empty list.
            modifiers: A list of ConTextModifiers that might modify the targets. Defaults to an empty list.
            edges: A list of (target, modifier) edges representing the modification relationship. Defaults to an
                empty list.
            prune_on_modifier_overlap: Whether `apply_modifiers` should first discard every modifier whose span
                overlaps a target.
        """
        self.targets = targets if targets is not None else []
        self.modifiers = modifiers if modifiers is not None else []
        self.edges = edges if edges is not None else []
        self.prune_on_modifier_overlap = prune_on_modifier_overlap

    def update_scopes(self):
        """
        Update the scope of all ConTextModifier.

        For each modifier in a list of ConTextModifiers, check against each other
        modifier to see if one of the modifiers should update the other.
        This allows neighboring similar modifiers to extend each other's
        scope and allows "terminate" modifiers to end a modifier's scope.
        """
        # Visit every unordered pair once and let each member limit the other.
        for i, modifier1 in enumerate(self.modifiers):
            for modifier2 in self.modifiers[i + 1:]:
                # TODO: Add modifier -> modifier edges
                modifier1.limit_scope(modifier2)
                modifier2.limit_scope(modifier1)

    def apply_modifiers(self):
        """
        Checks each target/modifier pair. If modifier modifies target,
        create an edge between them.

        When `prune_on_modifier_overlap` is set, modifiers whose span overlaps any target are removed from
        `self.modifiers` first. The resulting (target, modifier) pairs replace `self.edges`.
        """
        if self.prune_on_modifier_overlap:
            # Walk backwards so popping an entry does not shift the indices still to be visited.
            for i in range(len(self.modifiers) - 1, -1, -1):
                modifier = self.modifiers[i]
                for target in self.targets:
                    if tuple_overlaps(
                        (target.start, target.end), modifier.modifier_span
                    ):
                        self.modifiers.pop(i)
                        break

        for target in self.targets:
            for modifier in self.modifiers:
                if modifier.modifies(target):
                    modifier.modify(target)

        # Now do a second pass and reduce the number of targets
        # for any modifiers with a max_targets int
        edges = []
        for modifier in self.modifiers:
            modifier.reduce_targets()
            for target in modifier._targets:
                edges.append((target, modifier))

        self.edges = edges

    def __repr__(self):
        return f"<ConTextGraph> with {len(self.targets)} targets and {len(self.modifiers)} modifiers"

    def serialized_representation(self) -> Dict[str, Any]:
        """
        Returns the serialized representation of the ConTextGraph: a dict of its instance attributes.

        The dict is a shallow copy, so adding, removing or rebinding keys on it does not alter the graph.
        """
        return dict(self.__dict__)

    @classmethod
    def from_serialized_representation(cls, serialized_representation) -> ConTextGraph:
        """
        Creates the ConTextGraph from the serialized representation.

        Args:
            serialized_representation: A dict whose keys are the keyword arguments of the constructor, as
                produced by `serialized_representation`.

        Returns:
            A new instance of the class this method is called on.
        """
        # Build through `cls` so that a subclass is restored as that subclass.
        return cls(**serialized_representation)

__init__(targets=None, modifiers=None, edges=None, prune_on_modifier_overlap=False)

Creates a new ConTextGraph object.

Parameters:

Name Type Description Default
targets Optional[List[Span]]

The spans that context might modify.

None
modifiers Optional[List[ConTextModifier]]

A list of ConTextModifiers that might modify the targets.

None
edges Optional[List]

A list of edges between targets and modifiers representing the modification relationship.

None
prune_on_modifier_overlap bool

Whether to prune modifiers that overlap a target before modifiers are applied.

False
Source code in medspacy/context/context_graph.py
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
def __init__(
    self,
    targets: Optional[List[Span]] = None,
    modifiers: Optional[List[ConTextModifier]] = None,
    edges: Optional[List] = None,
    prune_on_modifier_overlap: bool = False,
):
    """
    Creates a new ConTextGraph object.

    Args:
        targets: The spans that context might modify. An empty list is used when omitted.
        modifiers: A list of ConTextModifiers that might modify the targets. An empty list is used when
            omitted.
        edges: A list of edges between targets and modifiers representing the modification relationship. An
            empty list is used when omitted.
        prune_on_modifier_overlap: Flag stored on the graph for later use when modifiers are applied.
    """
    # Only None is replaced; an explicitly passed (even empty) container is kept as-is.
    self.targets = [] if targets is None else targets
    self.modifiers = [] if modifiers is None else modifiers
    self.edges = [] if edges is None else edges
    self.prune_on_modifier_overlap = prune_on_modifier_overlap

apply_modifiers()

Checks each target/modifier pair. If modifier modifies target, create an edge between them.

Source code in medspacy/context/context_graph.py
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
def apply_modifiers(self):
    """
    Link every target to the modifiers that modify it and store the result in `self.edges`.

    When `prune_on_modifier_overlap` is set, any modifier whose span overlaps a target is removed from
    `self.modifiers` before targets are assigned.
    """
    if self.prune_on_modifier_overlap:
        # Go from the back so that popping an entry leaves the remaining indices valid.
        for index in reversed(range(len(self.modifiers))):
            candidate = self.modifiers[index]
            if any(
                tuple_overlaps((span.start, span.end), candidate.modifier_span)
                for span in self.targets
            ):
                self.modifiers.pop(index)

    # First pass: let each modifier collect every target it modifies.
    for span in self.targets:
        for candidate in self.modifiers:
            if candidate.modifies(span):
                candidate.modify(span)

    # Second pass: trim each modifier's targets (for modifiers with a max_targets int)
    # and emit one (target, modifier) edge per remaining target.
    collected = []
    for candidate in self.modifiers:
        candidate.reduce_targets()
        collected.extend((span, candidate) for span in candidate._targets)

    self.edges = collected

from_serialized_representation(serialized_representation) classmethod

Creates the ConTextGraph from the serialized representation

Source code in medspacy/context/context_graph.py
 98
 99
100
101
102
103
104
105
@classmethod
def from_serialized_representation(cls, serialized_representation) -> ConTextGraph:
    """
    Creates the ConTextGraph from the serialized representation.

    Args:
        serialized_representation: A dict whose items are passed to the constructor as keyword arguments.

    Returns:
        A new instance of the class this method is called on.
    """
    # Build through `cls` rather than a hard-coded class so that a subclass is restored as that subclass.
    return cls(**serialized_representation)

serialized_representation()

Returns the serialized representation of the ConTextGraph

Source code in medspacy/context/context_graph.py
92
93
94
95
96
def serialized_representation(self) -> Dict[str, Any]:
    """
    Returns the serialized representation of the ConTextGraph: a dict of its instance attributes.

    The dict is a shallow copy, so adding, removing or rebinding keys on it does not alter the graph.
    """
    return dict(self.__dict__)

update_scopes()

Update the scope of all ConTextModifier.

For each modifier in a list of ConTextModifiers, check against each other modifier to see if one of the modifiers should update the other. This allows neighboring similar modifiers to extend each other's scope and allows "terminate" modifiers to end a modifier's scope.

Source code in medspacy/context/context_graph.py
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
def update_scopes(self):
    """
    Let every pair of modifiers adjust each other's scope.

    Each unordered pair of modifiers is visited once, in list order, and both members call `limit_scope`
    on the other. This allows neighboring similar modifiers to extend each other's scope and allows
    "terminate" modifiers to end a modifier's scope.
    """
    all_modifiers = self.modifiers
    for position, earlier in enumerate(all_modifiers):
        for later in all_modifiers[position + 1:]:
            # TODO: Add modifier -> modifier edges
            earlier.limit_scope(later)
            later.limit_scope(earlier)

ConTextModifier

Represents a concept found by ConText in a document. An instance of this class is the result of ConTextRule matching text in a Doc.

Source code in medspacy/context/context_modifier.py
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
class ConTextModifier:
    """
    Represents a concept found by ConText in a document. An instance of this class is the result of ConTextRule matching
    text in a Doc.

    Only token indices are stored: the matched span ``[start, end)`` and the scope ``[scope_start, scope_end)``. The
    Doc passed to ``__init__`` is used to compute the scope but no reference to it is kept.
    """

    def __init__(
        self,
        context_rule: ConTextRule,
        start: int,
        end: int,
        doc: Doc,
        scope_start: Optional[int] = None,
        scope_end: Optional[int] = None,
        max_scope: Optional[int] = None,
    ):
        """
        Create a new ConTextModifier from a document span. Each modifier represents a span in the text and a surrounding
        window. Spans such as entities or other members of span groups that occur within this window can be modified by
        this ConTextModifier.

        Args:
            context_rule: The ConTextRule object which defines the modifier.
            start: The start token index.
            end: The end token index (non-inclusive).
            doc: The spaCy Doc which contains this span. This is needed to initialize the modifier but is not
                maintained. May be None, in which case the scope is left exactly as given.
            scope_start: The start token index of the scope.
            scope_end: The end index of the scope.
            max_scope: When truthy, the initial scope is taken from a token window around the modifier instead of
                from the sentence containing it (see `__set_scope`).
        """
        self._context_rule = context_rule
        self._start = start
        self._end = end

        self._targets = []
        self._num_targets = 0

        self._max_scope = max_scope
        self._scope_start = scope_start
        self._scope_end = scope_end
        # Only compute a scope when one was not fully supplied (e.g. when deserializing, both are given).
        if doc is not None and (self._scope_end is None or self._scope_start is None):
            self.__set_scope(doc)

    @property
    def modifier_span(self) -> Tuple[int, int]:
        """
        The (start, end) token indices covered by this match. `end` is non-inclusive.
        """
        return self._start, self._end

    @property
    def rule(self) -> ConTextRule:
        """
        Returns the associated context rule.
        """
        return self._context_rule

    @property
    def direction(self) -> str:
        """
        Returns the direction of the associated rule.
        """
        return self.rule.direction

    @property
    def category(self) -> str:
        """
        Returns the category of the associated rule.
        """
        return self.rule.category

    @property
    def scope_span(self) -> Tuple[int, int]:
        """
        Returns the (scope_start, scope_end) token indices of the scope. Either value may be None if no scope has
        been set.
        """
        return self._scope_start, self._scope_end

    @property
    def allowed_types(self) -> Set[str]:
        """
        Returns the allowed target types of the associated rule.
        """
        return self.rule.allowed_types

    @property
    def excluded_types(self) -> Set[str]:
        """
        Returns the excluded target types of the associated rule.
        """
        return self.rule.excluded_types

    @property
    def num_targets(self) -> int:
        """
        Returns the number of targets currently attached to this modifier.
        """
        return self._num_targets

    @property
    def max_targets(self) -> Union[int, None]:
        """
        Returns the maximum number of targets allowed by the associated rule.
        """
        return self.rule.max_targets

    @property
    def max_scope(self) -> Union[int, None]:
        """
        Returns the maximum scope of the associated rule (not the `max_scope` value passed to `__init__`).
        """
        return self.rule.max_scope

    def __set_scope(self, doc: Doc):
        """
        Applies the direction of the ConTextRule which generated this ConTextModifier to define a scope. If
        self._max_scope is falsy, then the default scope is the sentence which it occurs in whichever direction defined
        by self.direction. For example, if the direction is "forward", the scope will be [self.end: sentence.end]. If
        the direction is "backward", it will be [sentence.start: self.start].

        If self.max_scope is not None and the length of the default scope is longer than self.max_scope, it will be
        reduced to self.max_scope.

        Args:
            doc: The spaCy doc to use to set scope.

        Raises:
            ValueError: If the sentence containing the modifier is None.
        """
        # If ConText is set to use defined windows, do that instead of sentence splitting
        if self._max_scope:
            # NOTE(review): the switch is self._max_scope but the window size is the rule's max_scope;
            # presumably the caller keeps the two in sync - confirm.
            full_scope_span = doc[self._start : self._end]._.window(
                n=self.rule.max_scope
            )
        # Otherwise, use the sentence
        else:
            full_scope_span = doc[self._start].sent
            if full_scope_span is None:
                raise ValueError(
                    "ConText failed because sentence boundaries have not been set. Add an upstream component such as the "
                    "dependency parser, Sentencizer, or PyRuSH to detect sentence boundaries or initialize ConText with "
                    "`max_scope` set to a value greater than 0."
                )

        if self.direction.lower() == "forward":
            self._scope_start, self._scope_end = self._end, full_scope_span.end
            if (
                self.max_scope is not None
                and (self._scope_end - self._scope_start) > self.max_scope
            ):
                self._scope_end = self._end + self.max_scope

        elif self.direction.lower() == "backward":
            self._scope_start, self._scope_end = (
                full_scope_span.start,
                self._start,
            )
            if (
                self.max_scope is not None
                and (self._scope_end - self._scope_start) > self.max_scope
            ):
                self._scope_start = self._start - self.max_scope

        else:  # bidirectional
            self._scope_start, self._scope_end = (
                full_scope_span.start,
                full_scope_span.end,
            )

            # Set the max scope on either side
            # Backwards
            if (
                self.max_scope is not None
                and (self._start - self._scope_start) > self.max_scope
            ):
                self._scope_start = self._start - self.max_scope
            # Forwards
            if (
                self.max_scope is not None
                and (self._scope_end - self._end) > self.max_scope
            ):
                self._scope_end = self._end + self.max_scope

    def update_scope(self, span: Span):
        """
        Changes the scope of self to be the given spaCy span.

        Args:
            span: a spaCy Span which contains the scope which a modifier should cover.
        """
        self._scope_start = span.start
        self._scope_end = span.end

    def limit_scope(self, other: ConTextModifier) -> bool:
        """
        If self and other have the same category or if other has a directionality of 'terminate', use the span of other
        to update the scope of self. Limiting the scope of two modifiers of the same category reduces the number of
        modifiers. For example, in 'no evidence of CHF, no pneumonia', 'pneumonia' will only be modified by 'no', not
        'no evidence of'. 'terminate' modifiers limit the scope of a modifier like 'no evidence of' in 'no evidence of
        CHF, but there is pneumonia'

        Args:
            other: The modifier to check against.

        Returns:
            Whether the other modifier modified the scope of self.
        """
        if not tuple_overlaps(self.scope_span, other.scope_span):
            return False
        if self.direction.upper() == "TERMINATE":
            return False
        # Check if the other modifier is a type which can modify self
        # or if they are the same category. If not, don't reduce scope.
        if (
            (other.direction.upper() != "TERMINATE")
            and (other.category.upper() not in self.rule.terminated_by)
            and (other.category.upper() != self.category.upper())
        ):
            return False

        # If two modifiers have the same category but modify different target types,
        # don't limit scope.
        if self.category == other.category and (
            (self.allowed_types != other.allowed_types)
            or (self.excluded_types != other.excluded_types)
        ):
            return False

        orig_scope = self.scope_span
        # `other > self` compares start indices and `other < self` compares end indices
        # (see the comparison methods below).
        if self.direction.lower() in ("forward", "bidirectional"):
            if other > self:
                self._scope_end = min(self._scope_end, other.modifier_span[0])
        if self.direction.lower() in ("backward", "bidirectional"):
            if other < self:
                self._scope_start = max(self._scope_start, other.modifier_span[1])
        return orig_scope != self.scope_span

    def modifies(self, target: Span) -> bool:
        """
        Checks whether the target is within the modifier scope and if self is allowed to modify target.

        Args:
            target: a spaCy span representing a target concept.

        Returns:
            Whether the target is within `scope_span` and if self is allowed to modify the target.
        """
        # TERMINATE and PSEUDO modifiers never modify targets. The direction is normalized so the check is
        # case-insensitive, like the direction checks in `limit_scope` and `__set_scope`.
        if self.direction.upper() in ("TERMINATE", "PSEUDO"):
            return False
        # If the target and modifier overlap, meaning at least one token
        # was extracted as both a target and modifier, return False
        # to avoid self-modifying concepts
        if tuple_overlaps(self.modifier_span, (target.start, target.end)):
            return False
        if not self.allows(target.label_.upper()):
            return False
        if not tuple_overlaps(self.scope_span, (target.start, target.end)):
            return False
        # The rule's optional callback has the final say.
        return bool(self.on_modifies(target))

    def allows(self, target_label: str) -> bool:
        """
        Returns whether a modifier is able to modify a target type.

        Args:
            target_label: The target type to check.

        Returns:
            Whether the modifier is allowed to modify a target of the specified type. If `self.allowed_types` is not
            None, True only if `target_label` is in it. Otherwise, if `self.excluded_types` is not None, True only if
            `target_label` is not in it. True when both are None.
        """
        if self.allowed_types is not None:
            return target_label in self.allowed_types
        if self.excluded_types is not None:
            return target_label not in self.excluded_types
        return True

    def on_modifies(self, target: Span) -> bool:
        """
        If the ConTextRule used to define a ConTextModifier has an `on_modifies` callback function, evaluate and return
        either True or False.

        The callback receives the target, the modifier span, and the span of tokens strictly between the two.

        Args:
            target: The spaCy span to evaluate.

        Returns:
            The result of the `on_modifies` callback for the rule. True if the callback is None.

        Raises:
            ValueError: If the callback returns something other than True or False.
        """
        if self.rule.on_modifies is None:
            return True
        # Find the span in between the target and modifier: it runs from the end of whichever comes first
        # to the start of whichever comes second, so neither the target nor the modifier is included.
        start = min(target.end, self._end)
        end = max(target.start, self._start)
        span_between = target.doc[start:end]
        rslt = self.rule.on_modifies(
            target, target.doc[self._start : self._end], span_between
        )
        if rslt not in (True, False):
            raise ValueError(
                "The on_modifies function must return either True or False indicating "
                "whether a modify modifies a target. Actual value: {0}".format(rslt)
            )
        return rslt

    def modify(self, target: Span):
        """
        Add target to the list of self._targets and increment self._num_targets.

        Args:
            target: The spaCy span to add.
        """
        self._targets.append(target)
        self._num_targets += 1

    def reduce_targets(self):
        """
        Reduces the number of targets to the n-closest targets based on the value of `self.max_targets`. If
        `self.max_targets` is None, no pruning is done.

        Distance is measured in tokens between the modifier and the target. Targets at equal distance keep
        their original relative order.
        """
        if self.max_targets is None or self.num_targets <= self.max_targets:
            return

        def distance(target):
            return min(abs(self._start - target.end), abs(target.start - self._end))

        # `sorted` is stable and returns a list, so `modify` can still append afterwards.
        self._targets = sorted(self._targets, key=distance)[: self.max_targets]
        self._num_targets = len(self._targets)

    # Ordering: `>` and `>=` compare start indices, `<` and `<=` compare end indices.
    def __gt__(self, other: ConTextModifier):
        return self._start > other.modifier_span[0]

    def __ge__(self, other):
        return self._start >= other.modifier_span[0]

    def __lt__(self, other):
        return self._end < other.modifier_span[1]

    def __le__(self, other):
        return self._end <= other.modifier_span[1]

    def __len__(self):
        """Number of tokens in the matched modifier span."""
        return self._end - self._start

    def __repr__(self):
        return f"<ConTextModifier> [{self._start}, {self._end}, {self.category}]"

    def serialized_representation(self):
        """
        Serialized Representation of the modifier

        Returns:
            A dict whose keys match the keyword arguments of `__init__` (except `doc`), with the rule converted by
            its `to_dict` method. Targets are not included.
        """
        dict_repr = dict()
        dict_repr["context_rule"] = self.rule.to_dict()
        dict_repr["start"] = self._start
        dict_repr["end"] = self._end
        dict_repr["max_scope"] = self._max_scope
        dict_repr["scope_start"] = self._scope_start
        dict_repr["scope_end"] = self._scope_end

        return dict_repr

    @classmethod
    def from_serialized_representation(
        cls, serialized_representation
    ) -> ConTextModifier:
        """
        Instantiates the class from the serialized representation

        Args:
            serialized_representation: A dict as produced by `serialized_representation`. It is not modified.

        Returns:
            A new modifier. No Doc is available here, so the scope is taken as stored and not recomputed.
        """
        # Work on a shallow copy so the caller's dict can be reused.
        kwargs = dict(serialized_representation)
        kwargs["context_rule"] = ConTextRule.from_dict(kwargs["context_rule"])
        kwargs["doc"] = None

        return cls(**kwargs)

allowed_types property

Returns the associated allowed types.

category property

Returns the associated category.

direction property

Returns the associated direction.

excluded_types property

Returns the associated excluded types.

max_scope property

Returns the associated maximum scope.

max_targets property

Returns the associated maximum number of targets.

modifier_span property

The (start, end) token indices covered by this match; the end index is non-inclusive.

num_targets property

Returns the associated number of targets.

rule property

Returns the associated context rule.

scope_span property

Returns the associated scope.

__init__(context_rule, start, end, doc, scope_start=None, scope_end=None, max_scope=None)

Create a new ConTextModifier from a document span. Each modifier represents a span in the text and a surrounding window. Spans such as entities or other members of span groups that occur within this window can be modified by this ConTextModifier.

Parameters:

Name Type Description Default
context_rule ConTextRule

The ConTextRule object which defines the modifier.

required
start int

The start token index.

required
end int

The end token index (non-inclusive).

required
doc Doc

The spaCy Doc which contains this span. This is needed to initialize the modifier but is not maintained.

required
scope_start Optional[int]

The start token index of the scope.

None
scope_end Optional[int]

The end index of the scope.

None
max_scope Optional[int]

Whether to use scope values rather than sentence boundaries for modifications.

None
Source code in medspacy/context/context_modifier.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
def __init__(
    self,
    context_rule: ConTextRule,
    start: int,
    end: int,
    doc: Doc,
    scope_start: Optional[int] = None,
    scope_end: Optional[int] = None,
    max_scope: Optional[int] = None,
):
    """
    Build a ConTextModifier for the token range ``[start, end)`` matched by a rule.

    A modifier covers a span of text plus a surrounding scope; spans (entities or span-group members) that fall
    inside that scope can be modified by it.

    Args:
        context_rule: The ConTextRule object which defines the modifier.
        start: The start token index.
        end: The end token index (non-inclusive).
        doc: The spaCy Doc which contains this span. Only used to compute the scope; it is not stored.
        scope_start: The start token index of the scope.
        scope_end: The end index of the scope.
        max_scope: Stored as-is; when truthy the scope computation uses a token window instead of the sentence.
    """
    # What matched, and where.
    self._context_rule = context_rule
    self._start, self._end = start, end

    # Targets are attached later through `modify`.
    self._targets = []
    self._num_targets = 0

    # Scope as supplied by the caller.
    self._max_scope = max_scope
    self._scope_start, self._scope_end = scope_start, scope_end

    # Derive the scope from the doc only when the caller left part of it out.
    scope_incomplete = self._scope_end is None or self._scope_start is None
    if scope_incomplete and doc is not None:
        self.__set_scope(doc)

__set_scope(doc)

Applies the direction of the ConTextRule which generated this ConTextModifier to define a scope. If self._max_scope is None, then the default scope is the sentence which it occurs in whichever direction defined by self.direction. For example, if the direction is "forward", the scope will be [self.end: sentence.end]. If the direction is "backward", it will be [sentence.start: self.start].

If self.max_scope is not None and the length of the default scope is longer than self.max_scope, it will be reduced to self.max_scope.

Parameters:

Name Type Description Default
doc Doc

The spaCy doc to use to set scope.

required
Source code in medspacy/context/context_modifier.py
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
def __set_scope(self, doc: Doc):
    """
    Compute the initial scope of this modifier from the rule's direction.

    The widest possible scope is either a token window around the modifier (when self._max_scope is truthy) or the
    sentence containing the modifier's first token. A "forward" modifier then covers [self.end: full.end], a
    "backward" modifier covers [full.start: self.start], and any other direction covers the full span.

    If self.max_scope is not None, each side of the scope is trimmed to at most self.max_scope tokens away from the
    modifier.

    Args:
        doc: The spaCy doc to use to set scope.

    Raises:
        ValueError: If the sentence containing the modifier is None.
    """
    if self._max_scope:
        # A fixed window around the modifier replaces sentence splitting.
        widest = doc[self._start : self._end]._.window(n=self.rule.max_scope)
    else:
        widest = doc[self._start].sent
        if widest is None:
            raise ValueError(
                "ConText failed because sentence boundaries have not been set. Add an upstream component such as the "
                "dependency parser, Sentencizer, or PyRuSH to detect sentence boundaries or initialize ConText with "
                "`max_scope` set to a value greater than 0."
            )

    direction = self.direction.lower()
    limit = self.max_scope

    # Forward modifiers start right after themselves; backward modifiers stop right before themselves.
    left = self._end if direction == "forward" else widest.start
    right = self._start if direction == "backward" else widest.end

    if limit is not None:
        # Trim the side(s) this direction actually extends into.
        if direction != "forward" and (self._start - left) > limit:
            left = self._start - limit
        if direction != "backward" and (right - self._end) > limit:
            right = self._end + limit

    self._scope_start, self._scope_end = left, right

allows(target_label)

Returns whether a modifier is able to modify a target type.

Parameters:

Name Type Description Default
target_label str

The target type to check.

required

Returns:

Type Description
bool

Whether the modifier is allowed to modify a target of the specified type. True if target_label in

bool

self.allowed_types or if target_label not in self.excluded_types. False otherwise.

Source code in medspacy/context/context_modifier.py
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
def allows(self, target_label: str) -> bool:
    """
    Decide whether this modifier may modify a target of the given type.

    Args:
        target_label: The target type to check.

    Returns:
        If `self.allowed_types` is not None, whether `target_label` is in it. Otherwise, if `self.excluded_types`
        is not None, whether `target_label` is absent from it. True when neither is set.
    """
    allowed = self.allowed_types
    if allowed is not None:
        # An allow-list takes precedence over any exclude-list.
        return target_label in allowed

    excluded = self.excluded_types
    return True if excluded is None else target_label not in excluded

from_serialized_representation(serialized_representation) classmethod

Instantiates the class from the serialized representation

Source code in medspacy/context/context_modifier.py
379
380
381
382
383
384
385
386
387
388
389
390
391
@classmethod
def from_serialized_representation(
    cls, serialized_representation
) -> ConTextModifier:
    """
    Instantiates the class from the serialized representation

    Args:
        serialized_representation: A dict as produced by `serialized_representation`. It is not modified.

    Returns:
        A new modifier. No Doc is available here, so the scope is taken as stored and not recomputed.
    """
    # Work on a shallow copy: replacing "context_rule" and adding "doc" in the caller's dict would
    # make it unusable for a second call.
    kwargs = dict(serialized_representation)
    kwargs["context_rule"] = ConTextRule.from_dict(kwargs["context_rule"])
    kwargs["doc"] = None

    # Use `cls` so subclasses deserialize to their own type.
    return cls(**kwargs)

limit_scope(other)

If self and other have the same category or if other has a directionality of 'terminate', use the span of other to update the scope of self. Limiting the scope of two modifiers of the same category reduces the number of modifiers. For example, in 'no evidence of CHF, no pneumonia', 'pneumonia' will only be modified by 'no', not 'no evidence of'. 'terminate' modifiers limit the scope of a modifier like 'no evidence of' in 'no evidence of CHF, but there is pneumonia'

Parameters:

Name Type Description Default
other ConTextModifier

The modifier to check against.

required

Returns:

Type Description
bool

Whether the other modifier modified the scope of self.

Source code in medspacy/context/context_modifier.py
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
def limit_scope(self, other: ConTextModifier) -> bool:
    """
    Shrink the scope of self so that it stops at `other`, when `other` is entitled to cut it off.

    `other` can limit self when it is a 'terminate' modifier, when its category is listed in the rule's
    `terminated_by`, or when both share a category. For example, in 'no evidence of CHF, no pneumonia', 'pneumonia'
    will only be modified by 'no', not 'no evidence of'; and in 'no evidence of CHF, but there is pneumonia' the
    terminating 'but' ends the scope of 'no evidence of'.

    Args:
        other: The modifier to check against.

    Returns:
        Whether the other modifier modified the scope of self.
    """
    # Modifiers whose scopes never meet cannot affect each other.
    if not tuple_overlaps(self.scope_span, other.scope_span):
        return False
    # A terminating modifier is never limited itself.
    if self.direction.upper() == "TERMINATE":
        return False

    may_limit = (
        other.direction.upper() == "TERMINATE"
        or other.category.upper() in self.rule.terminated_by
        or other.category.upper() == self.category.upper()
    )
    if not may_limit:
        return False

    # Same category but different target types: leave both scopes alone.
    if self.category == other.category:
        if (
            self.allowed_types != other.allowed_types
            or self.excluded_types != other.excluded_types
        ):
            return False

    scope_before = self.scope_span
    own_direction = self.direction.lower()
    if own_direction in ("forward", "bidirectional") and other > self:
        # `other` starts after self: the scope must end where `other` begins.
        self._scope_end = min(self._scope_end, other.modifier_span[0])
    if own_direction in ("backward", "bidirectional") and other < self:
        # `other` ends before self: the scope must start where `other` ends.
        self._scope_start = max(self._scope_start, other.modifier_span[1])
    return scope_before != self.scope_span

modifies(target)

Checks whether the target is within the modifier scope and if self is allowed to modify target.

Parameters:

Name Type Description Default
target Span

a spaCy span representing a target concept.

required

Returns:

Type Description
bool

Whether the target is within modifier_scope and if self is allowed to modify the target.

Source code in medspacy/context/context_modifier.py
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
def modifies(self, target: Span) -> bool:
    """
    Checks whether the target is within the modifier scope and if self is allowed to modify target.

    Args:
        target: a spaCy span representing a target concept.

    Returns:
        Whether the target is within `scope_span` and if self is allowed to modify the target.
    """
    # TERMINATE and PSEUDO modifiers never modify targets. The direction is normalized so the check is
    # case-insensitive, like the other direction checks in this class; it is also the cheapest test, so do it first.
    if self.direction.upper() in ("TERMINATE", "PSEUDO"):
        return False
    # If the target and modifier overlap, meaning at least one token
    # was extracted as both a target and modifier, return False
    # to avoid self-modifying concepts
    if tuple_overlaps(self.modifier_span, (target.start, target.end)):
        return False
    if not self.allows(target.label_.upper()):
        return False
    if not tuple_overlaps(self.scope_span, (target.start, target.end)):
        return False
    # The rule's optional callback has the final say.
    return bool(self.on_modifies(target))

modify(target)

Add target to the list of self._targets and increment self._num_targets.

Parameters:

Name Type Description Default
target Span

The spaCy span to add.

required
Source code in medspacy/context/context_modifier.py
321
322
323
324
325
326
327
328
329
def modify(self, target: Span):
    """
    Register `target` as modified by this modifier.

    The span is appended to self._targets and the running count self._num_targets goes up by one.

    Args:
        target: The spaCy span to add.
    """
    targets = self._targets
    targets.append(target)
    self._num_targets = self._num_targets + 1

on_modifies(target)

If the ConTextRule used to define a ConTextModifier has an on_modifies callback function, evaluate and return either True or False.

Parameters:

Name Type Description Default
target Span

The spaCy span to evaluate.

required

Returns:

Type Description
bool

The result of the on_modifies callback for the rule. True if the callback is None.

Source code in medspacy/context/context_modifier.py
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
def on_modifies(self, target: Span) -> bool:
    """
    If the ConTextRule used to define a ConTextModifier has an `on_modifies` callback function, evaluate and return
    either True or False.

    The callback receives the target, the modifier span, and the span of tokens strictly between the two.

    Args:
        target: The spaCy span to evaluate.

    Returns:
        The result of the `on_modifies` callback for the rule. True if the callback is None.

    Raises:
        ValueError: If the callback returns something other than True or False.
    """
    if self.rule.on_modifies is None:
        return True
    # Find the span in between the target and modifier: it runs from the end of whichever comes first
    # to the start of whichever comes second. Using the modifier's *start* as the upper bound keeps the
    # modifier's own tokens out of the span when the target precedes the modifier.
    start = min(target.end, self._end)
    end = max(target.start, self._start)
    span_between = target.doc[start:end]
    rslt = self.rule.on_modifies(
        target, target.doc[self._start : self._end], span_between
    )
    if rslt not in (True, False):
        raise ValueError(
            "The on_modifies function must return either True or False indicating "
            "whether a modify modifies a target. Actual value: {0}".format(rslt)
        )
    return rslt

reduce_targets()

Reduces the number of targets to the n-closest targets based on the value of self.max_targets. If self.max_targets is None, no pruning is done.

Source code in medspacy/context/context_modifier.py
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
def reduce_targets(self):
    """
    Reduces the number of targets to the n-closest targets based on the value of `self.max_targets`. If
    `self.max_targets` is None, no pruning is done.

    Distance is measured in tokens between the modifier and the target. Targets at equal distance keep
    their original relative order.
    """
    if self.max_targets is None or self.num_targets <= self.max_targets:
        return

    def distance(target):
        return min(abs(self._start - target.end), abs(target.start - self._end))

    # `sorted` is stable and returns a list, so targets can still be appended afterwards
    # (unpacking a zip here would leave a tuple behind).
    self._targets = sorted(self._targets, key=distance)[: self.max_targets]
    self._num_targets = len(self._targets)

serialized_representation()

Serialized Representation of the modifier

Source code in medspacy/context/context_modifier.py
365
366
367
368
369
370
371
372
373
374
375
376
377
def serialized_representation(self):
    """
    Build a plain-dict snapshot of the modifier.

    The keys mirror the keyword arguments of the constructor (minus `doc`); the rule is converted with its own
    `to_dict` method. Targets are not part of the snapshot.
    """
    return {
        "context_rule": self.rule.to_dict(),
        "start": self._start,
        "end": self._end,
        "max_scope": self._max_scope,
        "scope_start": self._scope_start,
        "scope_end": self._scope_end,
    }

update_scope(span)

Changes the scope of self to be the given spaCy span.

Parameters:

Name Type Description Default
span Span

a spaCy Span which contains the scope which a modifier should cover.

required
Source code in medspacy/context/context_modifier.py
193
194
195
196
197
198
199
200
201
def update_scope(self, span: Span):
    """
    Replace the current scope with the boundaries of `span`.

    Args:
        span: a spaCy Span which contains the scope which a modifier should cover.
    """
    new_start, new_end = span.start, span.end
    self._scope_start = new_start
    self._scope_end = new_end

ConTextRule

Bases: BaseRule

A ConTextRule defines a ConText modifier. ConTextRules are rules which define which spans are extracted as modifiers and how they behave, such as the phrase to be matched, the category/semantic class, the direction of the modifier in the text, and what types of target spans can be modified.

Source code in medspacy/context/context_rule.py
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
class ConTextRule(BaseRule):
    """
    A ConTextRule defines a ConText modifier. ConTextRules are rules which define which spans are extracted as modifiers
    and how they behave, such as the phrase to be matched, the category/semantic class, the direction of the modifier in
    the text, and what types of target spans can be modified.
    """

    # Directions accepted by __init__ (compared after upper-casing the argument).
    _ALLOWED_DIRECTIONS = (
        "FORWARD",
        "BACKWARD",
        "BIDIRECTIONAL",
        "TERMINATE",
        "PSEUDO",
    )
    # Keys accepted by from_dict and exported by to_dict.
    _ALLOWED_KEYS = {
        "literal",
        "direction",
        "pattern",
        "category",
        "metadata",
        "allowed_types",
        "excluded_types",
        "max_targets",
        "max_scope",
    }

    def __init__(
        self,
        literal: str,
        category: str,
        pattern: Optional[Union[str, List[Dict[str, str]]]] = None,
        direction: str = "BIDIRECTIONAL",
        on_match: Optional[
            Callable[[Matcher, Doc, int, List[Tuple[int, int, int]]], Any]
        ] = None,
        on_modifies: Optional[Callable[[Span, Span, Span], bool]] = None,
        allowed_types: Optional[Set[str]] = None,
        excluded_types: Optional[Set[str]] = None,
        max_scope: Optional[int] = None,
        max_targets: Optional[int] = None,
        terminated_by: Optional[Set[str]] = None,
        metadata: Optional[Dict[Any, Any]] = None,
    ):
        """
        Creates a ConTextRule object.

        The primary arguments of `literal`, `category`, and `direction` define the span of text to be matched, the
        semantic category, and the direction within the sentence in which the modifier operates.
        Other arguments specify additional custom logic such as:
            - Additional control over what text can be matched as a modifier (pattern and on_match)
            - Which types of targets can be modified (allowed_types, excluded_types)
            - The scope size and number of targets that a modifier can modify (max_targets, max_scope)
            - Other logic for terminating a span or for allowing a modifier to modify a target (on_modifies,
            terminated_by)

        Args:
            literal: The string representation of a concept. If `pattern` is None, this string will be lower-cased and
                matched to the lower-case string. If `pattern` is not None, this argument will not be used for matching
                but can be used as a reference as the rule name.
            category: The semantic class of the matched span. This corresponds to the `label_` attribute of an entity.
            pattern: A list or string to use as a spaCy pattern rather than `literal`. If a list, will use spaCy
                token-based pattern matching to match using token attributes. If a string, will use medspaCy's
                RegexMatcher. If None, will use `literal` as the pattern for phrase matching. For more information, see
                https://spacy.io/usage/rule-based-matching.
            direction: The directionality or action of a modifier. This defines which part of a sentence a modifier will
                include as its scope. Entities within the scope will be considered to be modified.
                Valid values are:
                - "FORWARD": Scope will begin after the end of a modifier and move to the right
                - "BACKWARD": Scope will begin before the beginning of a modifier and move to the left
                - "BIDIRECTIONAL": Scope will expand on either side of a modifier
                - "TERMINATE": A special direction to limit any other modifiers if this phrase is in its scope. Example:
                    "no evidence of chf but there is pneumonia": "but" will prevent "no evidence of" from modifying
                    "pneumonia"
                - "PSEUDO": A special direction which will not modify any targets. This can be used for differentiating
                    superstrings of modifiers. Example: A modifier with literal="negative attitude" will prevent the
                    phrase "negative" in "She has a negative attitude about her treatment" from being extracted as a
                    modifier.
            on_match: An optional callback function or other callable which takes 4 arguments: `(matcher, doc, i,
                matches)`. For more information, see https://spacy.io/usage/rule-based-matching#on_match
            on_modifies: Callback function to run when building an edge between a target and a modifier. This allows
                specifying custom logic for allowing or preventing certain modifiers from modifying certain targets. The
                callable should take 3 arguments:
                    target: The spaCy Span from doc.ents (ie., 'Evidence of pneumonia')
                    modifier: The spaCy Span covered in a resulting modifier (ie., 'no evidence of')
                    span_between: The Span between the target and modifier in question.
                Should return either True or False. If returns False, then the modifier will not modify the target.
            allowed_types: A collection of target labels to allow a modifier to modify. If None, will apply to any type
                not specifically excluded in excluded_types. Only one of allowed_types and excluded_types can be used.
                An error will be thrown if both are not None.
            excluded_types: A collection of target labels which this modifier cannot modify. If None, will apply to all
                target types unless allowed_types is not None.
            max_scope: A number of tokens to explicitly limit the size of the modifier's scope. If None, the scope will
                include the entire sentence in the direction of `direction` and the entire sentence for "BIDIRECTIONAL".
                This is useful for requiring modifiers be very close to a concept in the text or for preventing long
                modifier ranges caused by sentence splitting problems.
            max_targets: The maximum number of targets which a modifier can modify. If None, will modify all targets in
                its scope.
            terminated_by: An optional collection of other modifier categories which will terminate the scope of this
                modifier. If None, only "TERMINATE" will do this. Example: if a ConTextRule defining "positive for" has
                terminated_by={"NEGATED_EXISTENCE"}, then in the sentence "positive for flu, negative for RSV", the
                positive modifier will modify "flu" but will be terminated by "negative for" and will not modify "RSV".
                This helps prevent multiple conflicting modifiers from distributing too far across a sentence.
            metadata: Optional dictionary of any extra metadata.

        Raises:
            ValueError: If both allowed_types and excluded_types are given, if max_targets or max_scope is not a
                positive number, if terminated_by is a plain string, or if direction is not one of
                `_ALLOWED_DIRECTIONS`.
        """
        super().__init__(literal, category.upper(), pattern, on_match, metadata)
        self.on_modifies = on_modifies

        if allowed_types is not None and excluded_types is not None:
            raise ValueError(
                "A ConTextRule was instantiated with non-null values for both allowed_types and excluded_types. "
                "Only one of these can be non-null."
            )
        # Labels are stored upper-cased.
        if allowed_types is not None:
            self.allowed_types = {label.upper() for label in allowed_types}
        else:
            self.allowed_types = None
        if excluded_types is not None:
            self.excluded_types = {label.upper() for label in excluded_types}
        else:
            self.excluded_types = None

        # Zero is rejected as well as negatives, so the messages say "> 0".
        if max_targets is not None and max_targets <= 0:
            raise ValueError("max_targets must be > 0 or None.")
        self.max_targets = max_targets
        if max_scope is not None and max_scope <= 0:
            raise ValueError("max_scope must be > 0 or None.")
        self.max_scope = max_scope

        if terminated_by is None:
            terminated_by = set()
        else:
            # A bare string would be iterated character by character below, so reject it explicitly.
            if isinstance(terminated_by, str):
                raise ValueError(
                    f"terminated_by must be an iterable, such as a list or set, not {terminated_by}."
                )
            terminated_by = {string.upper() for string in terminated_by}

        self.terminated_by = terminated_by

        # NOTE(review): metadata is also passed to BaseRule.__init__ above; kept here to preserve behavior.
        self.metadata = metadata

        normalized_direction = direction.upper()
        if normalized_direction not in self._ALLOWED_DIRECTIONS:
            raise ValueError(
                "Direction {0} not recognized. Must be one of: {1}".format(
                    direction, self._ALLOWED_DIRECTIONS
                )
            )
        self.direction = normalized_direction

    @classmethod
    def from_json(cls, filepath) -> List[ConTextRule]:
        """
        Reads in a lexicon of modifiers from a JSON file under the key `context_rules`.

        The file is read as UTF-8 so that rule literals containing non-ASCII characters load the same way on every
        platform. Rules are built through `cls.from_dict`, so calling this on a subclass yields subclass instances.

        Args:
            filepath: The .json file containing modifier rules. Must contain `context_rules` key containing the rule
                JSONs.

        Returns:
            A list of ConTextRules objects read from the JSON.

        Raises:
            KeyError: If the JSON document has no `context_rules` key.
            ValueError: If a rule object contains keys outside `_ALLOWED_KEYS` (raised by `from_dict`).
        """
        with open(filepath, encoding="utf-8") as file:
            modifier_data = json.load(file)
        return [cls.from_dict(data) for data in modifier_data["context_rules"]]

    @classmethod
    def from_dict(cls, rule_dict) -> ConTextRule:
        """
        Reads a dictionary into a ConTextRule.

        Args:
            rule_dict: The dictionary to convert. Its keys are passed to the constructor as keyword arguments and
                must all be in `_ALLOWED_KEYS`.

        Returns:
            The ConTextRule created from the dictionary (an instance of `cls`).

        Raises:
            ValueError: If `rule_dict` contains a key which is not in `_ALLOWED_KEYS`.
        """
        invalid_keys = set(rule_dict.keys()).difference(cls._ALLOWED_KEYS)
        if invalid_keys:
            msg = (
                "JSON object contains invalid keys: {0}.\n"
                "Must be one of: {1}".format(invalid_keys, cls._ALLOWED_KEYS)
            )
            raise ValueError(msg)
        # Build through cls rather than a hard-coded class name so subclasses round-trip correctly.
        return cls(**rule_dict)

    def to_dict(self):
        """
        Converts this ConTextRule to a python dictionary. Used when writing context rules to a json file.

        Only the attributes named in `_ALLOWED_KEYS` are exported, and attributes whose value is None are left out.
        Keys are visited in sorted order and set values are converted to sorted lists, so the result is the same from
        run to run.

        Returns:
            The dictionary containing the ConTextRule info.
        """
        rule_dict = {}
        for key in sorted(self._ALLOWED_KEYS):
            value = self.__dict__.get(key)
            if value is None:
                continue
            if isinstance(value, set):
                # The label sets built in __init__ hold upper-cased strings, so sorting them is well-defined.
                value = sorted(value)
            rule_dict[key] = value
        return rule_dict

    @classmethod
    def to_json(cls, context_rules: List[ConTextRule], filepath: str):
        """
        Writes ConTextRules to a json file under the key `context_rules`.

        Args:
            context_rules: a list of ConTextRules that will be written to a file.
            filepath: the .json file to contain modifier rules
        """
        data = {"context_rules": [rule.to_dict() for rule in context_rules]}
        with open(filepath, "w", encoding="utf-8") as file:
            json.dump(data, file, indent=4)

    def __repr__(self):
        return (
            f"ConTextRule(literal='{self.literal}', category='{self.category}', pattern={self.pattern}, "
            f"direction='{self.direction}')"
        )

__init__(literal, category, pattern=None, direction='BIDIRECTIONAL', on_match=None, on_modifies=None, allowed_types=None, excluded_types=None, max_scope=None, max_targets=None, terminated_by=None, metadata=None)

Creates a ConTextRule object.

The primary arguments of literal, category, and direction define the span of text to be matched, the semantic category, and the direction within the sentence in which the modifier operates. Other arguments specify additional custom logic such as: - Additional control over what text can be matched as a modifier (pattern and on_match) - Which types of targets can be modified (allowed_types, excluded_types) - The scope size and number of targets that a modifier can modify (max_targets, max_scope) - Other logic for terminating a span or for allowing a modifier to modify a target (on_modifies, terminated_by)

Parameters:

Name Type Description Default
literal str

The string representation of a concept. If pattern is None, this string will be lower-cased and matched to the lower-case string. If pattern is not None, this argument will not be used for matching but can be used as a reference as the rule name.

required
category str

The semantic class of the matched span. This corresponds to the label_ attribute of an entity.

required
pattern Optional[Union[str, List[Dict[str, str]]]]

A list or string to use as a spaCy pattern rather than literal. If a list, will use spaCy token-based pattern matching to match using token attributes. If a string, will use medspaCy's RegexMatcher. If None, will use literal as the pattern for phrase matching. For more information, see https://spacy.io/usage/rule-based-matching.

None
direction str

The directionality or action of a modifier. This defines which part of a sentence a modifier will include as its scope. Entities within the scope will be considered to be modified. Valid values are: - "FORWARD": Scope will begin after the end of a modifier and move to the right - "BACKWARD": Scope will begin before the beginning of a modifier and move to the left - "BIDIRECTIONAL": Scope will expand on either side of a modifier - "TERMINATE": A special direction to limit any other modifiers if this phrase is in its scope. Example: "no evidence of chf but there is pneumonia": "but" will prevent "no evidence of" from modifying "pneumonia" - "PSEUDO": A special direction which will not modify any targets. This can be used for differentiating superstrings of modifiers. Example: A modifier with literal="negative attitude" will prevent the phrase "negative" in "She has a negative attitude about her treatment" from being extracted as a modifier.

'BIDIRECTIONAL'
on_match Optional[Callable[[Matcher, Doc, int, List[Tuple[int, int, int]]], Any]]

An optional callback function or other callable which takes 4 arguments: (matcher, doc, i, matches). For more information, see https://spacy.io/usage/rule-based-matching#on_match

None
on_modifies Optional[Callable[[Span, Span, Span], bool]]

Callback function to run when building an edge between a target and a modifier. This allows specifying custom logic for allowing or preventing certain modifiers from modifying certain targets. The callable should take 3 arguments: target: The spaCy Span from doc.ents (ie., 'Evidence of pneumonia') modifier: The spaCy Span covered in a resulting modifier (ie., 'no evidence of') span_between: The Span between the target and modifier in question. Should return either True or False. If returns False, then the modifier will not modify the target.

None
allowed_types Optional[Set[str]]

A collection of target labels to allow a modifier to modify. If None, will apply to any type not specifically excluded in excluded_types. Only one of allowed_types and excluded_types can be used. An error will be thrown if both are not None.

None
excluded_types Optional[Set[str]]

A collection of target labels which this modifier cannot modify. If None, will apply to all target types unless allowed_types is not None.

None
max_scope Optional[int]

A number of tokens to explicitly limit the size of the modifier's scope. If None, the scope will include the entire sentence in the direction of direction and the entire sentence for "BIDIRECTIONAL". This is useful for requiring modifiers be very close to a concept in the text or for preventing long modifier ranges caused by sentence splitting problems.

None
max_targets Optional[int]

The maximum number of targets which a modifier can modify. If None, will modify all targets in its scope.

None
terminated_by Optional[Set[str]]

An optional collection of other modifier categories which will terminate the scope of this modifier. If None, only "TERMINATE" will do this. Example: if a ConTextRule defining "positive for" has terminated_by={"NEGATED_EXISTENCE"}, then in the sentence "positive for flu, negative for RSV", the positive modifier will modify "flu" but will be terminated by "negative for" and will not modify "RSV". This helps prevent multiple conflicting modifiers from distributing too far across a sentence.

None
metadata Optional[Dict[Any, Any]]

Optional dictionary of any extra metadata.

None
Source code in medspacy/context/context_rule.py
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
def __init__(
    self,
    literal: str,
    category: str,
    pattern: Optional[Union[str, List[Dict[str, str]]]] = None,
    direction: str = "BIDIRECTIONAL",
    on_match: Optional[
        Callable[[Matcher, Doc, int, List[Tuple[int, int, int]]], Any]
    ] = None,
    on_modifies: Optional[Callable[[Span, Span, Span], bool]] = None,
    allowed_types: Optional[Set[str]] = None,
    excluded_types: Optional[Set[str]] = None,
    max_scope: Optional[int] = None,
    max_targets: Optional[int] = None,
    terminated_by: Optional[Set[str]] = None,
    metadata: Optional[Dict[Any, Any]] = None,
):
    """
    Creates a ConTextRule object.

    The primary arguments of `literal`, `category`, and `direction` define the span of text to be matched, the
    semantic category, and the direction within the sentence in which the modifier operates.
    Other arguments specify additional custom logic such as:
        - Additional control over what text can be matched as a modifier (pattern and on_match)
        - Which types of targets can be modified (allowed_types, excluded_types)
        - The scope size and number of targets that a modifier can modify (max_targets, max_scope)
        - Other logic for terminating a span or for allowing a modifier to modify a target (on_modifies,
        terminated_by)

    Args:
        literal: The string representation of a concept. If `pattern` is None, this string will be lower-cased and
            matched to the lower-case string. If `pattern` is not None, this argument will not be used for matching
            but can be used as a reference as the rule name.
        category: The semantic class of the matched span. This corresponds to the `label_` attribute of an entity.
        pattern: A list or string to use as a spaCy pattern rather than `literal`. If a list, will use spaCy
            token-based pattern matching to match using token attributes. If a string, will use medspaCy's
            RegexMatcher. If None, will use `literal` as the pattern for phrase matching. For more information, see
            https://spacy.io/usage/rule-based-matching.
        direction: The directionality or action of a modifier. This defines which part of a sentence a modifier will
            include as its scope. Entities within the scope will be considered to be modified.
            Valid values are:
            - "FORWARD": Scope will begin after the end of a modifier and move to the right
            - "BACKWARD": Scope will begin before the beginning of a modifier and move to the left
            - "BIDIRECTIONAL": Scope will expand on either side of a modifier
            - "TERMINATE": A special direction to limit any other modifiers if this phrase is in its scope. Example:
                "no evidence of chf but there is pneumonia": "but" will prevent "no evidence of" from modifying
                "pneumonia"
            - "PSEUDO": A special direction which will not modify any targets. This can be used for differentiating
                superstrings of modifiers. Example: A modifier with literal="negative attitude" will prevent the
                phrase "negative" in "She has a negative attitude about her treatment" from being extracted as a
                modifier.
        on_match: An optional callback function or other callable which takes 4 arguments: `(matcher, doc, i,
            matches)`. For more information, see https://spacy.io/usage/rule-based-matching#on_match
        on_modifies: Callback function to run when building an edge between a target and a modifier. This allows
            specifying custom logic for allowing or preventing certain modifiers from modifying certain targets. The
            callable should take 3 arguments:
                target: The spaCy Span from doc.ents (ie., 'Evidence of pneumonia')
                modifier: The spaCy Span covered in a resulting modifier (ie., 'no evidence of')
                span_between: The Span between the target and modifier in question.
            Should return either True or False. If returns False, then the modifier will not modify the target.
        allowed_types: A collection of target labels to allow a modifier to modify. If None, will apply to any type
            not specifically excluded in excluded_types. Only one of allowed_types and excluded_types can be used.
            An error will be thrown if both are not None.
        excluded_types: A collection of target labels which this modifier cannot modify. If None, will apply to all
            target types unless allowed_types is not None.
        max_scope: A number of tokens to explicitly limit the size of the modifier's scope. If None, the scope will
            include the entire sentence in the direction of `direction` and the entire sentence for "BIDIRECTIONAL".
            This is useful for requiring modifiers be very close to a concept in the text or for preventing long
            modifier ranges caused by sentence splitting problems.
        max_targets: The maximum number of targets which a modifier can modify. If None, will modify all targets in
            its scope.
        terminated_by: An optional collection of other modifier categories which will terminate the scope of this
            modifier. If None, only "TERMINATE" will do this. Example: if a ConTextRule defining "positive for" has
            terminated_by={"NEGATED_EXISTENCE"}, then in the sentence "positive for flu, negative for RSV", the
            positive modifier will modify "flu" but will be terminated by "negative for" and will not modify "RSV".
            This helps prevent multiple conflicting modifiers from distributing too far across a sentence.
        metadata: Optional dictionary of any extra metadata.

    Raises:
        ValueError: If both allowed_types and excluded_types are given, if max_targets or max_scope is not a
            positive number, if terminated_by is a plain string, or if direction is not one of
            `_ALLOWED_DIRECTIONS`.
    """
    super().__init__(literal, category.upper(), pattern, on_match, metadata)
    self.on_modifies = on_modifies

    if allowed_types is not None and excluded_types is not None:
        raise ValueError(
            "A ConTextRule was instantiated with non-null values for both allowed_types and excluded_types. "
            "Only one of these can be non-null."
        )
    # Labels are stored upper-cased.
    if allowed_types is not None:
        self.allowed_types = {label.upper() for label in allowed_types}
    else:
        self.allowed_types = None
    if excluded_types is not None:
        self.excluded_types = {label.upper() for label in excluded_types}
    else:
        self.excluded_types = None

    # Zero is rejected as well as negatives, so the messages say "> 0".
    if max_targets is not None and max_targets <= 0:
        raise ValueError("max_targets must be > 0 or None.")
    self.max_targets = max_targets
    if max_scope is not None and max_scope <= 0:
        raise ValueError("max_scope must be > 0 or None.")
    self.max_scope = max_scope

    if terminated_by is None:
        terminated_by = set()
    else:
        # A bare string would be iterated character by character below, so reject it explicitly.
        if isinstance(terminated_by, str):
            raise ValueError(
                f"terminated_by must be an iterable, such as a list or set, not {terminated_by}."
            )
        terminated_by = {string.upper() for string in terminated_by}

    self.terminated_by = terminated_by

    # NOTE(review): metadata is also passed to the parent __init__ above; kept here to preserve behavior.
    self.metadata = metadata

    normalized_direction = direction.upper()
    if normalized_direction not in self._ALLOWED_DIRECTIONS:
        raise ValueError(
            "Direction {0} not recognized. Must be one of: {1}".format(
                direction, self._ALLOWED_DIRECTIONS
            )
        )
    self.direction = normalized_direction

from_dict(rule_dict) classmethod

Reads a dictionary into a ConTextRule.

Parameters:

Name Type Description Default
rule_dict

The dictionary to convert.

required

Returns:

Type Description
ConTextRule

The ConTextRule created from the dictionary.

Source code in medspacy/context/context_rule.py
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
@classmethod
def from_dict(cls, rule_dict) -> ConTextRule:
    """
    Reads a dictionary into a ConTextRule.

    Args:
        rule_dict: The dictionary to convert. Its keys are passed to the constructor as keyword arguments and
            must all be in `_ALLOWED_KEYS`.

    Returns:
        The ConTextRule created from the dictionary (an instance of `cls`).

    Raises:
        ValueError: If `rule_dict` contains a key which is not in `_ALLOWED_KEYS`.
    """
    invalid_keys = set(rule_dict.keys()).difference(cls._ALLOWED_KEYS)
    if invalid_keys:
        msg = (
            "JSON object contains invalid keys: {0}.\n"
            "Must be one of: {1}".format(invalid_keys, cls._ALLOWED_KEYS)
        )
        raise ValueError(msg)
    # Build through cls rather than a hard-coded class name so subclasses round-trip correctly.
    return cls(**rule_dict)

from_json(filepath) classmethod

Reads in a lexicon of modifiers from a JSON file under the key context_rules.

Parameters:

Name Type Description Default
filepath

The .json file containing modifier rules. Must contain context_rules key containing the rule JSONs.

required

Returns:

Type Description
List[ConTextRule]

A list of ConTextRules objects read from the JSON.

Source code in medspacy/context/context_rule.py
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
@classmethod
def from_json(cls, filepath) -> List[ConTextRule]:
    """
    Reads in a lexicon of modifiers from a JSON file under the key `context_rules`.

    The file is read as UTF-8 so that rule literals containing non-ASCII characters load the same way on every
    platform. Rules are built through `cls.from_dict`, so calling this on a subclass yields subclass instances.

    Args:
        filepath: The .json file containing modifier rules. Must contain `context_rules` key containing the rule
            JSONs.

    Returns:
        A list of ConTextRules objects read from the JSON.

    Raises:
        KeyError: If the JSON document has no `context_rules` key.
    """
    with open(filepath, encoding="utf-8") as file:
        modifier_data = json.load(file)
    return [cls.from_dict(data) for data in modifier_data["context_rules"]]

to_dict()

Converts a ConTextRule to a Python dictionary. Used when writing context rules to a JSON file.

Returns:

Type Description

The dictionary containing the ConTextRule info.

Source code in medspacy/context/context_rule.py
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
def to_dict(self):
    """
    Converts this ConTextRule to a python dictionary. Used when writing context rules to a json file.

    Only the attributes named in `_ALLOWED_KEYS` are exported, and attributes whose value is None are left out.
    Keys are visited in sorted order and set values are converted to sorted lists, so the result is the same from
    run to run.

    Returns:
        The dictionary containing the ConTextRule info.
    """
    rule_dict = {}
    for key in sorted(self._ALLOWED_KEYS):
        value = self.__dict__.get(key)
        if value is None:
            continue
        if isinstance(value, set):
            # Assumes the set holds mutually comparable items (e.g. upper-cased label strings).
            value = sorted(value)
        rule_dict[key] = value
    return rule_dict

to_json(context_rules, filepath) classmethod

Writes ConTextRules to a JSON file.

Args: context_rules: a list of ContextRules that will be written to a file. filepath: the .json file to contain modifier rules

Source code in medspacy/context/context_rule.py
224
225
226
227
228
229
230
231
232
233
234
235
236
@classmethod
def to_json(cls, context_rules: List[ConTextRule], filepath: str):
    """
    Writes ConTextRules to a json file under the key `context_rules`.

    Each rule is serialized with its `to_dict` method and the result is written with an indent of 4.

    Args:
        context_rules: a list of ConTextRules that will be written to a file.
        filepath: the .json file to contain modifier rules
    """
    # `json` is imported at module level (from_json relies on it), so no local import is needed.
    data = {"context_rules": [rule.to_dict() for rule in context_rules]}
    with open(filepath, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4)

context

The ConText definition.

ConText

The ConText for spaCy processing.

This component matches modifiers in a Doc, defines their scope, and identifies edges between targets and modifiers. Sets two spaCy extensions: - Span._.modifiers: a list of ConTextModifier objects which modify a target Span - Doc._.context_graph: a ConText graph object which contains the targets, modifiers, and edges between them.

Source code in medspacy/context/context.py
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
@Language.factory("medspacy_context")
class ConText:
    """
    The ConText for spaCy processing.

    This component matches modifiers in a Doc, defines their scope, and identifies edges between targets and modifiers.
    Sets two spaCy extensions:
            - Span._.modifiers: a list of ConTextModifier objects which modify a target Span
            - Doc._.context_graph: a ConText graph object which contains the targets,
                modifiers, and edges between them.
    """

    def __init__(
        self,
        nlp: Language,
        name: str = "medspacy_context",
        rules: Optional[str] = "default",
        language_code: str = 'en',
        phrase_matcher_attr: str = "LOWER",
        allowed_types: Optional[Set[str]] = None,
        excluded_types: Optional[Set[str]] = None,
        terminating_types: Optional[Dict[str, Iterable[str]]] = None,
        max_scope: Optional[int] = None,
        max_targets: Optional[int] = None,
        prune_on_modifier_overlap: bool = True,
        prune_on_target_overlap: bool = False,
        span_attrs: Union[
            Literal["default"], Dict[str, Dict[str, Any]], None
        ] = "default",
        input_span_type: Union[Literal["ents", "group"]] = "ents",
        span_group_name: str = "medspacy_spans",
    ):
        """
        Creates a new ConText object.

        Args:
            nlp: A SpaCy Language object.
            name: The name of the component.
            rules: The rules to load. Default is "default", loads rules packaged with medspaCy that are derived from
                original ConText rules and years of practical applications at the US Department of Veterans Affairs.  If
                None, no rules are loaded. Otherwise, must be a path to a json file containing rules. Add ConTextRules
                directly through `ConText.add`.
            language_code: Language code to use (ISO code) as a default for loading resources.  See documentation
                and also the /resources directory to see which resources might be available in each language.
                Default is "en" for English.
            phrase_matcher_attr: The token attribute to use for PhraseMatcher for rules where `pattern` is None. Default
                is 'LOWER'.
            allowed_types: A global list of types included by context. Rules will operate on only spans with these
                labels.
            excluded_types: A global list of types excluded by context. Rules will not operate on spans with these
                labels.
            terminating_types: A global map of types to the types that can terminate them. This can be used to apply
                terminations to all rules of a particular type rather than adding to every rule individually in the
                ContextRule object. Keys are upper-cased before being stored.
            max_scope: The number of tokens around a modifier within which a target can be modified. Default value is
                None, Context will use the sentence boundaries. If a value greater than zero, applies the window
                globally. Both options will be overridden by a more specific value in a ContextRule.
            max_targets: The maximum number of targets a modifier can modify. Default value is None, context will modify
                all targets in its scope. If a value greater than zero, applies this value globally. Both options will
                be overridden by a more specific value in a ContextRule.
            prune_on_modifier_overlap: Whether to prune modifiers which are substrings of another modifier. If True,
                will drop substrings completely. For example, if "no history of" and "history of" are both
                ConTextRules, both will match the text "no history of afib", but only "no history of" should modify
                afib. Default True.
            prune_on_target_overlap: Whether to remove any matched modifiers which overlap with target entities. If
                False, any overlapping modifiers will not modify the overlapping entity but will still modify any other
                targets in its scope. Default False.
            span_attrs: The optional span attributes to modify. Default option "default" uses attributes in
                `DEFAULT_ATTRIBUTES`. If a dictionary, format is mapping context modifier categories to a dictionary
                containing the attribute name and the value to set the attribute to when a span is modified by a
                modifier of that category. If None, no attributes will be modified.
            input_span_type: "ents" or "group". Where to look for targets. "ents" will modify attributes of spans
                in doc.ents. "group" will modify attributes of spans in the span group specified by `span_group_name`.
            span_group_name: The name of the span group used when `input_span_type` is "group". Default is
                "medspacy_spans".

        Raises:
            ValueError: If `input_span_type` is not "ents" or "group", if `span_group_name` is empty or not a string,
                if a custom attribute named in `span_attrs` has not been registered on Span, or if `max_scope` is
                given but is not an int greater than zero.
        """
        self.nlp = nlp
        self.name = name
        self.prune_on_modifier_overlap = prune_on_modifier_overlap
        self.prune_on_target_overlap = prune_on_target_overlap
        # Both assignments go through the validating property setters defined below.
        self.input_span_type = input_span_type
        self.span_group_name = span_group_name
        self.context_attributes_mapping = None

        # Packaged rules live under <two levels above this file's directory>/resources/<language>/context_rules.json
        self.DEFAULT_RULES_FILEPATH = path.join(
            Path(__file__).resolve().parents[2], "resources", language_code.lower(), "context_rules.json"
        )

        self.__matcher = MedspacyMatcher(
            nlp,
            name=name,
            phrase_matcher_attr=phrase_matcher_attr,
            prune=prune_on_modifier_overlap,
        )

        if span_attrs == "default":
            self.context_attributes_mapping = DEFAULT_ATTRIBUTES
            self.register_default_attributes()
        elif span_attrs:
            # Custom attributes are never registered here; the caller must have done so already.
            for attr_dict in span_attrs.values():
                for attr_name in attr_dict:
                    if not Span.has_extension(attr_name):
                        raise ValueError(
                            f"Custom extension {attr_name} has not been set. Please ensure Span.set_extension is "
                            f"called for your pipeline's custom extensions."
                        )
            self.context_attributes_mapping = span_attrs

        self.register_graph_attributes()

        if max_scope is not None and not (isinstance(max_scope, int) and max_scope > 0):
            raise ValueError(
                f"If 'max_scope' must be a value greater than 0, not the current value: {max_scope}"
            )
        self.max_scope = max_scope

        self.allowed_types = allowed_types
        self.excluded_types = excluded_types
        self.max_targets = max_targets

        # Keys are normalized to upper case because `add` looks categories up with `rule.category.upper()`.
        self.terminating_types = dict()
        if terminating_types:
            self.terminating_types = {
                k.upper(): v for (k, v) in terminating_types.items()
            }

        # "default" selects the packaged rule file; any other truthy value is used as the path as-is.
        rule_path = self.DEFAULT_RULES_FILEPATH if rules == "default" else rules
        if rule_path:
            self.add(ConTextRule.from_json(rule_path))

    @property
    def rules(self):
        """
        Returns list of ConTextRules available to context.
        """
        return self.__matcher.rules

    @property
    def categories(self):
        """
        Returns list of categories available that Context might produce.
        """
        return self.__matcher.labels

    @property
    def input_span_type(self):
        """
        The input source of entities for the component. Must be either "ents" corresponding to doc.ents or "group" for
        a spaCy span group.

        Returns:
            The input type, "ents" or "group".
        """
        return self._input_span_type

    @input_span_type.setter
    def input_span_type(self, val):
        if not (val == "ents" or val == "group"):
            raise ValueError('input_type must be "ents" or "group".')
        self._input_span_type = val

    @property
    def span_group_name(self) -> str:
        """
        The name of the span group used by this component. If `input_span_type` is "group", calling this component
        will use spans in the span group with this name.

        Returns:
            The span group name.
        """
        return self._span_group_name

    @span_group_name.setter
    def span_group_name(self, name: str):
        if not name or not isinstance(name, str):
            raise ValueError("Span group name must be a string.")
        self._span_group_name = name

    def add(self, rules):
        """
        Adds ConTextRules to ConText.

        For each rule, any of `allowed_types`, `excluded_types`, `max_scope` and `max_targets` that the rule leaves as
        None is filled in from the component-level value when one is set. Component-level `terminating_types` for the
        rule's category are added to the rule's `terminated_by` set. The rules are then registered with the matcher.

        Args:
            rules: A single ConTextRule or an iterable of ConTextRules to add to this component.

        Raises:
            TypeError: If any element of `rules` is not a ConTextRule.
        """
        if isinstance(rules, ConTextRule):
            rules = [rules]
        else:
            # `rules` is walked here and then again by the matcher; materialize it once so that a one-shot
            # iterable (e.g. a generator) is not exhausted before the matcher sees it.
            rules = list(rules)

        for rule in rules:
            if not isinstance(rule, ConTextRule):
                raise TypeError(f"Rules must type ConTextRule, not {type(rule)}.")

            # If global attributes like allowed_types and max_scope are defined,
            # check if the ConTextRule has them defined. If not, set to the global
            for attr in (
                "allowed_types",
                "excluded_types",
                "max_scope",
                "max_targets",
            ):
                value = getattr(self, attr)
                # Only fill in when a global value exists and the rule itself leaves the attribute unset.
                if value is not None and getattr(rule, attr) is None:
                    setattr(rule, attr, value)

            # Check custom termination points
            category = rule.category.upper()
            if category in self.terminating_types:
                for other_modifier in self.terminating_types[category]:
                    rule.terminated_by.add(other_modifier.upper())

        self.__matcher.add(rules)

    @classmethod
    def register_graph_attributes(cls):
        """
        Registers spaCy attribute extensions: Span._.modifiers and Doc._.context_graph.
        """
        try:
            Span.set_extension("modifiers", default=(), force=True)
            Doc.set_extension("context_graph", default=None, force=True)
        except ValueError:  # Extension already set
            pass

    @classmethod
    def register_default_attributes(cls):
        """
        Registers the default values for the Span attributes defined in `DEFAULT_ATTRIBUTES`.

        Each attribute is registered with a default of False; attributes that are already registered are left as
        they are.
        """
        for attr_name in [
            "is_negated",
            "is_uncertain",
            "is_historical",
            "is_hypothetical",
            "is_family",
        ]:
            try:
                Span.set_extension(attr_name, default=False)
            except ValueError:  # Extension already set
                pass

    def set_context_attributes(self, edges):
        """
        Adds Span-level attributes to targets with modifiers.

        Args:
            edges: The edges of the ContextGraph to modify, as (target, modifier) pairs.
        """
        for (target, modifier) in edges:
            # Categories without an entry in the mapping leave the target untouched.
            if modifier.category in self.context_attributes_mapping:
                attr_dict = self.context_attributes_mapping[modifier.category]
                for attr_name, attr_value in attr_dict.items():
                    setattr(target._, attr_name, attr_value)

    def __call__(self, doc, targets: Optional[str] = None) -> Doc:
        """
        Applies the ConText algorithm to a Doc.

        Args:
            doc: The spaCy Doc to process.
            targets: The optional custom attribute extension on doc to run over. Must contain an iterable of Span
                objects. When omitted, targets come from doc.ents or from the configured span group, depending on
                `input_span_type`.

        Returns:
            The processed spaCy Doc.
        """
        if not targets and self.input_span_type == "ents":
            targets = doc.ents
        elif not targets and self.input_span_type == "group":
            targets = doc.spans[self.span_group_name]
        elif targets:
            # `targets` names an extension on doc._ that holds the spans to use.
            targets = getattr(doc._, targets)
        # Store data in ConTextGraph object
        # TODO: move some of this over to ConTextGraph
        # NOTE(review): the graph's keyword is called `prune_on_modifier_overlap` but is given this component's
        # `prune_on_target_overlap`; modifier-vs-modifier pruning is handed to the matcher via `prune` in __init__.
        # Confirm against ConTextGraph that this pairing is intended.
        context_graph = ConTextGraph(
            prune_on_modifier_overlap=self.prune_on_target_overlap
        )

        context_graph.targets = targets

        context_graph.modifiers = []
        matches = self.__matcher(doc)

        for (match_id, start, end) in matches:
            # Get the ConTextRule object defining this modifier
            rule = self.__matcher.rule_map[self.nlp.vocab[match_id].text]
            modifier = ConTextModifier(rule, start, end, doc, max_scope=self.max_scope)
            context_graph.modifiers.append(modifier)

        context_graph.update_scopes()
        context_graph.apply_modifiers()

        # Link targets to their modifiers
        for target, modifier in context_graph.edges:
            target._.modifiers += (modifier,)

        # If attributes need to be modified
        if self.context_attributes_mapping:
            self.set_context_attributes(context_graph.edges)

        doc._.context_graph = context_graph

        return doc
categories property

Returns list of categories available that Context might produce.

input_span_type property writable

The input source of entities for the component. Must be either "ents" corresponding to doc.ents or "group" for a spaCy span group.

Returns:

Type Description

The input type, "ents" or "group".

rules property

Returns list of ConTextRules available to context.

span_group_name property writable

The name of the span group used by this component. If input_span_type is "group", calling this component will use spans in the span group with this name.

Returns:

Type Description
str

The span group name.

__call__(doc, targets=None)

Applies the ConText algorithm to a Doc.

Parameters:

Name Type Description Default
doc

The spaCy Doc to process.

required
targets str

The optional custom attribute extension on doc to run over. Must contain an iterable of Span objects

None

Returns:

Type Description
Doc

The processed spaCy Doc.

Source code in medspacy/context/context.py
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
def __call__(self, doc, targets: str = None) -> Doc:
    """
    Run ConText over a Doc and attach the resulting graph to it.

    Args:
        doc: The spaCy Doc to process.
        targets: Optional name of a custom extension on `doc._` holding an iterable of Span objects to use as
            targets. When not given, targets are taken from `doc.ents` or from the configured span group,
            depending on `input_span_type`.

    Returns:
        The same Doc, with `doc._.context_graph` set and each modified target's `_.modifiers` extended.
    """
    # Resolve where the target spans come from; an explicit extension name wins over the configured source.
    if targets:
        targets = getattr(doc._, targets)
    elif self.input_span_type == "ents":
        targets = doc.ents
    elif self.input_span_type == "group":
        targets = doc.spans[self.span_group_name]

    # Store data in ConTextGraph object
    # TODO: move some of this over to ConTextGraph
    graph = ConTextGraph(prune_on_modifier_overlap=self.prune_on_target_overlap)
    graph.targets = targets

    # One ConTextModifier per match; the match id's vocab text keys the rule that produced the match.
    graph.modifiers = [
        ConTextModifier(
            self.__matcher.rule_map[self.nlp.vocab[match_id].text],
            start,
            end,
            doc,
            max_scope=self.max_scope,
        )
        for (match_id, start, end) in self.__matcher(doc)
    ]

    graph.update_scopes()
    graph.apply_modifiers()

    # Record on every target which modifiers ended up attached to it
    for target, modifier in graph.edges:
        target._.modifiers += (modifier,)

    # Span attributes are only written when a mapping was configured
    if self.context_attributes_mapping:
        self.set_context_attributes(graph.edges)

    doc._.context_graph = graph
    return doc
__init__(nlp, name='medspacy_context', rules='default', language_code='en', phrase_matcher_attr='LOWER', allowed_types=None, excluded_types=None, terminating_types=None, max_scope=None, max_targets=None, prune_on_modifier_overlap=True, prune_on_target_overlap=False, span_attrs='default', input_span_type='ents', span_group_name='medspacy_spans')

Creates a new ConText object.

Parameters:

Name Type Description Default
nlp Language

A SpaCy Language object.

required
name str

The name of the component.

'medspacy_context'
rules Optional[str]

The rules to load. Default is "default", loads rules packaged with medspaCy that are derived from original ConText rules and years of practical applications at the US Department of Veterans Affairs. If None, no rules are loaded. Otherwise, must be a path to a json file containing rules. Add ConTextRules directly through ConText.add.

'default'
language_code str

Language code to use (ISO code) as a default for loading resources. See documentation and also the /resources directory to see which resources might be available in each language. Default is "en" for English.

'en'
phrase_matcher_attr str

The token attribute to use for PhraseMatcher for rules where pattern is None. Default is 'LOWER'.

'LOWER'
allowed_types Optional[Set[str]]

A global list of types included by context. Rules will operate on only spans with these labels.

None
excluded_types Optional[Set[str]]

A global list of types excluded by context. Rules will not operate on spans with these labels.

None
terminating_types Optional[Dict[str, Iterable[str]]]

A global map of types to the types that can terminate them. This can be used to apply terminations to all rules of a particular type rather than adding to every rule individually in the ContextRule object.

None
max_scope Optional[int]

The number of tokens around a modifier within which a target can be modified. Default value is None, in which case Context will use the sentence boundaries. If a value greater than zero, applies the window globally. Both options will be overridden by a more specific value in a ContextRule.

None
max_targets Optional[int]

The maximum number of targets a modifier can modify. Default value is None, context will modify all targets in its scope. If a value greater than zero, applies this value globally. Both options will be overridden by a more specific value in a ContextRule.

None
prune_on_modifier_overlap bool

Whether to prune modifiers which are substrings of another modifier. If True, will drop substrings completely. For example, if "no history of" and "history of" are both ConTextRules, both will match the text "no history of afib", but only "no history of" should modify afib. Default True.

True
prune_on_target_overlap bool

Whether to remove any matched modifiers which overlap with target entities. If False, any overlapping modifiers will not modify the overlapping entity but will still modify any other targets in its scope. Default False.

False
span_attrs Union[Literal['default'], Dict[str, Dict[str, Any]], None]

The optional span attributes to modify. Default option "default" uses attributes in DEFAULT_ATTRIBUTES. If a dictionary, format is mapping context modifier categories to a dictionary containing the attribute name and the value to set the attribute to when a span is modified by a modifier of that category. If None, no attributes will be modified.

'default'
input_span_type Union[Literal['ents', 'group']]

"ents" or "group". Where to look for targets. "ents" will modify attributes of spans in doc.ents. "group" will modify attributes of spans in the span group specified by span_group_name.

'ents'
span_group_name str

The name of the span group used when input_span_type is "group". Default is "medspacy_spans".

'medspacy_spans'
Source code in medspacy/context/context.py
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
def __init__(
    self,
    nlp: Language,
    name: str = "medspacy_context",
    rules: Optional[str] = "default",
    language_code: str = 'en',
    phrase_matcher_attr: str = "LOWER",
    allowed_types: Optional[Set[str]] = None,
    excluded_types: Optional[Set[str]] = None,
    terminating_types: Optional[Dict[str, Iterable[str]]] = None,
    max_scope: Optional[int] = None,
    max_targets: Optional[int] = None,
    prune_on_modifier_overlap: bool = True,
    prune_on_target_overlap: bool = False,
    span_attrs: Union[
        Literal["default"], Dict[str, Dict[str, Any]], None
    ] = "default",
    input_span_type: Union[Literal["ents", "group"]] = "ents",
    span_group_name: str = "medspacy_spans",
):
    """
    Build a ConText component and load its initial rules.

    Args:
        nlp: A SpaCy Language object.
        name: The name of the component.
        rules: Which rules to load. "default" loads the rule file packaged for `language_code`; None loads
            nothing; any other value is treated as a path to a json rule file. More rules can be added later
            through `ConText.add`.
        language_code: ISO language code selecting the resources sub-directory the default rules are read from.
            Default is "en" for English.
        phrase_matcher_attr: The token attribute to use for PhraseMatcher for rules where `pattern` is None.
            Default is 'LOWER'.
        allowed_types: Global set of span labels that rules may operate on; applied to rules that do not define
            their own.
        excluded_types: Global set of span labels that rules must not operate on; applied to rules that do not
            define their own.
        terminating_types: Global map from a modifier category to the categories that can terminate it. Keys are
            stored upper-cased.
        max_scope: Number of tokens around a modifier within which a target can be modified. None means sentence
            boundaries are used. A rule-level value takes precedence.
        max_targets: Maximum number of targets a modifier can modify. None means every target in scope. A
            rule-level value takes precedence.
        prune_on_modifier_overlap: Whether modifiers that are substrings of another modifier are dropped, e.g.
            "history of" inside "no history of". Default True.
        prune_on_target_overlap: Whether matched modifiers overlapping a target entity are removed. If False,
            such a modifier does not modify the overlapping entity but may still modify others in its scope.
            Default False.
        span_attrs: Span attributes to set on modified targets. "default" uses `DEFAULT_ATTRIBUTES`; a dictionary
            maps modifier categories to {attribute name: value}; None disables attribute setting.
        input_span_type: "ents" to read targets from doc.ents, or "group" to read them from the span group named
            by `span_group_name`.
        span_group_name: The name of the span group used when `input_span_type` is "group". Default is
            "medspacy_spans".

    Raises:
        ValueError: If a custom attribute named in `span_attrs` is not registered on Span, or if `max_scope` is
            given but is not an int greater than zero.
    """
    self.nlp = nlp
    self.name = name
    self.prune_on_modifier_overlap = prune_on_modifier_overlap
    self.prune_on_target_overlap = prune_on_target_overlap
    self.input_span_type = input_span_type
    self.span_group_name = span_group_name
    self.context_attributes_mapping = None

    # The packaged rule file sits in resources/<language>/ two levels above this module's directory.
    resources_root = Path(__file__).resolve().parents[2]
    self.DEFAULT_RULES_FILEPATH = path.join(
        resources_root, "resources", language_code.lower(), "context_rules.json"
    )

    self.__matcher = MedspacyMatcher(
        nlp,
        name=name,
        phrase_matcher_attr=phrase_matcher_attr,
        prune=prune_on_modifier_overlap,
    )

    if span_attrs == "default":
        self.context_attributes_mapping = DEFAULT_ATTRIBUTES
        self.register_default_attributes()
    elif span_attrs:
        # Custom attributes must already be registered by the caller; refuse to continue otherwise.
        for category_attrs in span_attrs.values():
            for extension_name in category_attrs.keys():
                if Span.has_extension(extension_name):
                    continue
                raise ValueError(
                    f"Custom extension {extension_name} has not been set. Please ensure Span.set_extension is "
                    f"called for your pipeline's custom extensions."
                )
        self.context_attributes_mapping = span_attrs

    self.register_graph_attributes()

    # A given max_scope has to be a strictly positive int.
    if max_scope is not None and (not isinstance(max_scope, int) or max_scope <= 0):
        raise ValueError(
            f"If 'max_scope' must be a value greater than 0, not the current value: {max_scope}"
        )
    self.max_scope = max_scope

    self.allowed_types = allowed_types
    self.excluded_types = excluded_types
    self.max_targets = max_targets

    # Category keys are stored upper-cased.
    self.terminating_types = (
        {category.upper(): terminators for (category, terminators) in terminating_types.items()}
        if terminating_types
        else dict()
    )

    rule_path = self.DEFAULT_RULES_FILEPATH if rules == "default" else rules
    if rule_path:
        self.add(ConTextRule.from_json(rule_path))
add(rules)

Adds ConTextRules to Context.

Parameters:

Name Type Description Default
rules

A single ConTextRule or a collection of ConTextRules to add to ConText.

required
Source code in medspacy/context/context.py
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
def add(self, rules):
    """
    Adds ConTextRules to ConText.

    For each rule, any of `allowed_types`, `excluded_types`, `max_scope` and `max_targets` that the rule leaves as
    None is filled in from the component-level value when one is set. Component-level `terminating_types` for the
    rule's category are added to the rule's `terminated_by` set. The rules are then registered with the matcher.

    Args:
        rules: A single ConTextRule or an iterable of ConTextRules to add to this component.

    Raises:
        TypeError: If any element of `rules` is not a ConTextRule.
    """
    if isinstance(rules, ConTextRule):
        rules = [rules]
    else:
        # `rules` is walked here and then again by the matcher; materialize it once so that a one-shot
        # iterable (e.g. a generator) is not exhausted before the matcher sees it.
        rules = list(rules)

    for rule in rules:
        if not isinstance(rule, ConTextRule):
            raise TypeError(f"Rules must type ConTextRule, not {type(rule)}.")

        # If global attributes like allowed_types and max_scope are defined,
        # check if the ConTextRule has them defined. If not, set to the global
        for attr in (
            "allowed_types",
            "excluded_types",
            "max_scope",
            "max_targets",
        ):
            value = getattr(self, attr)
            # Only fill in when a global value exists and the rule itself leaves the attribute unset.
            if value is not None and getattr(rule, attr) is None:
                setattr(rule, attr, value)

        # Check custom termination points
        category = rule.category.upper()
        if category in self.terminating_types:
            for other_modifier in self.terminating_types[category]:
                rule.terminated_by.add(other_modifier.upper())

    self.__matcher.add(rules)
register_default_attributes() classmethod

Registers the default values for the Span attributes defined in DEFAULT_ATTRIBUTES.

Source code in medspacy/context/context.py
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
@classmethod
def register_default_attributes(cls):
    """
    Register the boolean Span extensions used by the default attribute mapping.

    Every extension is created with a default of False. An extension that is already registered is left as it
    is.
    """
    default_flags = (
        "is_negated",
        "is_uncertain",
        "is_historical",
        "is_hypothetical",
        "is_family",
    )
    for flag in default_flags:
        try:
            Span.set_extension(flag, default=False)
        except ValueError:
            # Raised when the extension already exists; keep the existing registration.
            continue
register_graph_attributes() classmethod

Registers spaCy attribute extensions: Span._.modifiers and Doc._.context_graph.

Source code in medspacy/context/context.py
245
246
247
248
249
250
251
252
253
254
@classmethod
def register_graph_attributes(cls):
    """
    Register the extensions ConText writes its results to: Span._.modifiers (default: empty tuple) and
    Doc._.context_graph (default: None). Both are registered with force=True.
    """
    graph_extensions = (
        (Span, "modifiers", ()),
        (Doc, "context_graph", None),
    )
    try:
        for owner, extension_name, default_value in graph_extensions:
            owner.set_extension(extension_name, default=default_value, force=True)
    except ValueError:  # Extension already set
        pass
set_context_attributes(edges)

Adds Span-level attributes to targets with modifiers.

Parameters:

Name Type Description Default
edges

The edges of the ContextGraph to modify.

required
Source code in medspacy/context/context.py
273
274
275
276
277
278
279
280
281
282
283
284
def set_context_attributes(self, edges):
    """
    Write the configured Span attributes onto every target that has a modifier.

    Args:
        edges: The edges of the ContextGraph to modify, as (target, modifier) pairs.
    """
    for target, modifier in edges:
        # Modifier categories without an entry in the mapping leave the target untouched.
        if modifier.category not in self.context_attributes_mapping:
            continue
        category_attrs = self.context_attributes_mapping[modifier.category]
        for name, value in category_attrs.items():
            setattr(target._, name, value)

context_graph

ConTextGraph

The ConTextGraph class defines the internal structure of the ConText algorithm. It stores a collection of modifiers, matched with ConTextRules, and targets from some other source such as the TargetMatcher or a spaCy NER model.

Each modifier can have some number of associated targets that it modifies. This relationship is stored as edges of the graph.

Source code in medspacy/context/context_graph.py
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
class ConTextGraph:
    """
    The ConTextGraph class defines the internal structure of the ConText algorithm. It stores a collection of modifiers,
    matched with ConTextRules, and targets from some other source such as the TargetMatcher or a spaCy NER model.

    Each modifier can have some number of associated targets that it modifies. This relationship is stored as edges
    of the graph, where each edge is a `(target, modifier)` tuple.
    """

    def __init__(
        self,
        targets: Optional[List[Span]] = None,
        modifiers: Optional[List[ConTextModifier]] = None,
        edges: Optional[List] = None,
        prune_on_modifier_overlap: bool = False,
    ):
        """
        Creates a new ConTextGraph object.

        Args:
            targets: A list of spans that context might modify. Defaults to a new empty list.
            modifiers: A list of ConTextModifiers that might modify the targets. Defaults to a new empty list.
            edges: A list of edges between targets and modifiers representing the modification relationship.
                Defaults to a new empty list.
            prune_on_modifier_overlap: When True, `apply_modifiers` first removes every modifier whose span
                overlaps the span of any target.
        """
        self.targets = targets if targets is not None else []
        self.modifiers = modifiers if modifiers is not None else []
        self.edges = edges if edges is not None else []
        self.prune_on_modifier_overlap = prune_on_modifier_overlap

    def update_scopes(self):
        """
        Update the scope of all ConTextModifier.

        For each modifier in a list of ConTextModifiers, check against each other
        modifier to see if one of the modifiers should update the other.
        This allows neighboring similar modifiers to extend each other's
        scope and allows "terminate" modifiers to end a modifier's scope.
        """
        # Visit every unordered pair exactly once and let each side limit the other.
        for i in range(len(self.modifiers) - 1):
            modifier1 = self.modifiers[i]
            for j in range(i + 1, len(self.modifiers)):
                modifier2 = self.modifiers[j]
                # TODO: Add modifier -> modifier edges
                modifier1.limit_scope(modifier2)
                modifier2.limit_scope(modifier1)

    def apply_modifiers(self):
        """
        Checks each target/modifier pair. If modifier modifies target,
        create an edge between them.

        The resulting list of `(target, modifier)` tuples replaces `self.edges`. When
        `self.prune_on_modifier_overlap` is set, modifiers overlapping a target are removed from
        `self.modifiers` (in place) before any pair is checked.
        """
        if self.prune_on_modifier_overlap:
            # Walk backwards so that popping an index never shifts one that is still to be visited.
            for i in range(len(self.modifiers) - 1, -1, -1):
                modifier = self.modifiers[i]
                for target in self.targets:
                    if tuple_overlaps(
                        (target.start, target.end), modifier.modifier_span
                    ):
                        self.modifiers.pop(i)
                        break

        edges = []
        for target in self.targets:
            for modifier in self.modifiers:
                if modifier.modifies(target):
                    modifier.modify(target)

        # Now do a second pass and reduce the number of targets
        # for any modifiers with a max_targets int
        for modifier in self.modifiers:
            modifier.reduce_targets()
            for target in modifier._targets:
                edges.append((target, modifier))

        self.edges = edges

    def __repr__(self):
        return f"<ConTextGraph> with {len(self.targets)} targets and {len(self.modifiers)} modifiers"

    def serialized_representation(self) -> Dict[str, Any]:
        """
        Returns the serialized representation of the ConTextGraph.

        The returned mapping is the instance's own attribute dictionary (not a copy), keyed by the
        constructor argument names.
        """
        return self.__dict__

    @classmethod
    def from_serialized_representation(cls, serialized_representation) -> ConTextGraph:
        """
        Creates the ConTextGraph from the serialized representation.

        Args:
            serialized_representation: A mapping of constructor keyword arguments, such as the one produced by
                `serialized_representation`.

        Returns:
            A new instance of the class this method is called on.
        """
        # Build through `cls` so that subclasses deserialize to their own type.
        return cls(**serialized_representation)
__init__(targets=None, modifiers=None, edges=None, prune_on_modifier_overlap=False)

Creates a new ConTextGraph object.

Parameters:

Name Type Description Default
targets Optional[List[Span]]

A list of spans that context might modify.

None
modifiers Optional[List[ConTextModifier]]

A list of ConTextModifiers that might modify the targets.

None
edges Optional[List]

A list of edges between targets and modifiers representing the modification relationship.

None
prune_on_modifier_overlap bool

Whether to prune modifiers when one modifier completely covers another.

False
Source code in medspacy/context/context_graph.py
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
def __init__(
    self,
    targets: Optional[List[Span]] = None,
    modifiers: Optional[List[ConTextModifier]] = None,
    edges: Optional[List] = None,
    prune_on_modifier_overlap: bool = False,
):
    """
    Builds a ConTextGraph from optional targets, modifiers and edges.

    Args:
        targets: The spans that context might modify. A new empty list is used when None.
        modifiers: The ConTextModifiers that might modify the targets. A new empty list is used when None.
        edges: The edges between targets and modifiers representing the modification relationship. A new
            empty list is used when None.
        prune_on_modifier_overlap: Flag stored on the graph unchanged.
    """
    # Replace each missing collection with its own fresh list so instances never share state.
    if targets is None:
        targets = []
    if modifiers is None:
        modifiers = []
    if edges is None:
        edges = []

    self.targets = targets
    self.modifiers = modifiers
    self.edges = edges
    self.prune_on_modifier_overlap = prune_on_modifier_overlap
apply_modifiers()

Checks each target/modifier pair. If modifier modifies target, create an edge between them.

Source code in medspacy/context/context_graph.py
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
def apply_modifiers(self):
    """
    Links every modifier to the targets it modifies and stores the result in `self.edges`.

    When `self.prune_on_modifier_overlap` is set, modifiers whose span overlaps any target are first removed
    from `self.modifiers` in place. Each remaining modifier is then offered every target, asked to reduce its
    targets, and finally contributes one `(target, modifier)` edge per retained target.
    """
    if self.prune_on_modifier_overlap:
        # Go from the last index to the first so deletions never shift an index still to be visited.
        for idx in reversed(range(len(self.modifiers))):
            candidate_span = self.modifiers[idx].modifier_span
            if any(
                tuple_overlaps((tgt.start, tgt.end), candidate_span)
                for tgt in self.targets
            ):
                del self.modifiers[idx]

    # First pass: targets in the outer loop, so each modifier collects targets in target order.
    for tgt in self.targets:
        for mod in self.modifiers:
            if mod.modifies(tgt):
                mod.modify(tgt)

    # Second pass: let each modifier trim its targets (max_targets), then record what is left.
    collected = []
    for mod in self.modifiers:
        mod.reduce_targets()
        collected.extend((tgt, mod) for tgt in mod._targets)

    self.edges = collected
from_serialized_representation(serialized_representation) classmethod

Creates the ConTextGraph from the serialized representation

Source code in medspacy/context/context_graph.py
 98
 99
100
101
102
103
104
105
@classmethod
def from_serialized_representation(cls, serialized_representation) -> ConTextGraph:
    """
    Creates the ConTextGraph from the serialized representation.

    Args:
        serialized_representation: A mapping of constructor keyword arguments.

    Returns:
        A new instance of the class this method is called on.
    """
    # Build through `cls` rather than a hard-coded class so subclasses deserialize to their own type.
    return cls(**serialized_representation)
serialized_representation()

Returns the serialized representation of the ConTextGraph

Source code in medspacy/context/context_graph.py
92
93
94
95
96
def serialized_representation(self) -> Dict[str, Any]:
    """
    Returns the serialized representation of the ConTextGraph.

    The result is the instance's own attribute dictionary, not a copy.
    """
    return vars(self)
update_scopes()

Update the scope of all ConTextModifier.

For each modifier in a list of ConTextModifiers, check against each other modifier to see if one of the modifiers should update the other. This allows neighboring similar modifiers to extend each other's scope and allows "terminate" modifiers to end a modifier's scope.

Source code in medspacy/context/context_graph.py
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
def update_scopes(self):
    """
    Lets every pair of modifiers limit each other's scope.

    Each unordered pair of entries in `self.modifiers` is visited once, in list order. The earlier modifier
    is limited by the later one first, then the later one by the earlier one. This allows neighboring similar
    modifiers to adjust each other's scope and allows "terminate" modifiers to end a modifier's scope.
    """
    modifiers = self.modifiers
    for position, earlier in enumerate(modifiers):
        # Only pair with modifiers further along the list so no pair is handled twice.
        for later in modifiers[position + 1 :]:
            # TODO: Add modifier -> modifier edges
            earlier.limit_scope(later)
            later.limit_scope(earlier)

context_modifier

ConTextModifier

Represents a concept found by ConText in a document. An instance of this class is the result of ConTextRule matching text in a Doc.

Source code in medspacy/context/context_modifier.py
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
class ConTextModifier:
    """
    Represents a concept found by ConText in a document. An instance of this class is the result of ConTextRule matching
    text in a Doc.
    """

    def __init__(
        self,
        context_rule: ConTextRule,
        start: int,
        end: int,
        doc: Doc,
        scope_start: Optional[int] = None,
        scope_end: Optional[int] = None,
        max_scope: Optional[int] = None,
    ):
        """
        Create a new ConTextModifier from a document span. Each modifier represents a span in the text and a surrounding
        window. Spans such as entities or other members of span groups that occur within this window can be modified by
        this ConTextModifier.

        Args:
            context_rule: The ConTextRule object which defines the modifier.
            start: The start token index.
            end: The end token index (non-inclusive).
            doc: The spaCy Doc which contains this span. This is needed to initialize the modifier but is not
                maintained. May be None, in which case no scope is computed.
            scope_start: The start token index of the scope.
            scope_end: The end index of the scope.
            max_scope: When truthy, the scope is derived from a token window around the modifier rather than from
                the sentence containing it.
        """
        self._context_rule = context_rule
        self._start = start
        self._end = end

        self._targets = []
        self._num_targets = 0

        self._max_scope = max_scope
        self._scope_start = scope_start
        self._scope_end = scope_end
        # Only compute the scope when it was not fully supplied and a doc is available to compute it from.
        if doc is not None and (self._scope_end is None or self._scope_start is None):
            self.__set_scope(doc)

    @property
    def modifier_span(self) -> Tuple[int, int]:
        """
        The (start, end) token indices covered by this match.
        """
        return self._start, self._end

    @property
    def rule(self) -> ConTextRule:
        """
        Returns the associated context rule.
        """
        return self._context_rule

    @property
    def direction(self) -> str:
        """
        Returns the associated direction.
        """
        return self.rule.direction

    @property
    def category(self) -> str:
        """
        Returns the associated category.
        """
        return self.rule.category

    @property
    def scope_span(self) -> Tuple[int, int]:
        """
        Returns the associated scope as (scope_start, scope_end) token indices.
        """
        return self._scope_start, self._scope_end

    @property
    def allowed_types(self) -> Set[str]:
        """
        Returns the associated allowed types.
        """
        return self.rule.allowed_types

    @property
    def excluded_types(self) -> Set[str]:
        """
        Returns the associated excluded types.
        """
        return self.rule.excluded_types

    @property
    def num_targets(self) -> int:
        """
        Returns the associated number of targets.
        """
        return self._num_targets

    @property
    def max_targets(self) -> Union[int, None]:
        """
        Returns the associated maximum number of targets.
        """
        return self.rule.max_targets

    @property
    def max_scope(self) -> Union[int, None]:
        """
        Returns the associated maximum scope, as defined on the rule.
        """
        return self.rule.max_scope

    def __set_scope(self, doc: Doc):
        """
        Applies the direction of the ConTextRule which generated this ConTextModifier to define a scope. If
        self._max_scope is None, then the default scope is the sentence which it occurs in whichever direction defined by
        self.direction. For example, if the direction is "forward", the scope will be [self.end: sentence.end]. If the
        direction is "backward", it will be [sentence.start: self.start].

        If self.max_scope is not None and the length of the default scope is longer than self.max_scope, it will be
        reduced to self.max_scope.

        Args:
            doc: The spaCy doc to use to set scope.

        Raises:
            ValueError: If no window is configured and the sentence of the modifier's first token is None.
        """
        # If ConText is set to use defined windows, do that instead of sentence splitting
        if self._max_scope:
            full_scope_span = doc[self._start : self._end]._.window(
                n=self.rule.max_scope
            )
        # Otherwise, use the sentence
        else:
            full_scope_span = doc[self._start].sent
            if full_scope_span is None:
                raise ValueError(
                    "ConText failed because sentence boundaries have not been set. Add an upstream component such as the "
                    "dependency parser, Sentencizer, or PyRuSH to detect sentence boundaries or initialize ConText with "
                    "`max_scope` set to a value greater than 0."
                )

        if self.direction.lower() == "forward":
            self._scope_start, self._scope_end = self._end, full_scope_span.end
            if (
                self.max_scope is not None
                and (self._scope_end - self._scope_start) > self.max_scope
            ):
                self._scope_end = self._end + self.max_scope

        elif self.direction.lower() == "backward":
            self._scope_start, self._scope_end = (
                full_scope_span.start,
                self._start,
            )
            if (
                self.max_scope is not None
                and (self._scope_end - self._scope_start) > self.max_scope
            ):
                self._scope_start = self._start - self.max_scope

        else:  # bidirectional
            self._scope_start, self._scope_end = (
                full_scope_span.start,
                full_scope_span.end,
            )

            # Set the max scope on either side
            # Backwards
            if (
                self.max_scope is not None
                and (self._start - self._scope_start) > self.max_scope
            ):
                self._scope_start = self._start - self.max_scope
            # Forwards
            if (
                self.max_scope is not None
                and (self._scope_end - self._end) > self.max_scope
            ):
                self._scope_end = self._end + self.max_scope

    def update_scope(self, span: Span):
        """
        Changes the scope of self to be the given spaCy span.

        Args:
            span: a spaCy Span which contains the scope which a modifier should cover.
        """
        self._scope_start = span.start
        self._scope_end = span.end

    def limit_scope(self, other: ConTextModifier) -> bool:
        """
        If self and other have the same category or if other has a directionality of 'terminate', use the span of other
        to update the scope of self. Limiting the scope of two modifiers of the same category reduces the number of
        modifiers. For example, in 'no evidence of CHF, no pneumonia', 'pneumonia' will only be modified by 'no', not
        'no evidence of'. 'terminate' modifiers limit the scope of a modifier like 'no evidence of' in 'no evidence of
        CHF, but there is pneumonia'

        Args:
            other: The modifier to check against.

        Returns:
            Whether the other modifier modified the scope of self.
        """
        if not tuple_overlaps(self.scope_span, other.scope_span):
            return False
        if self.direction.upper() == "TERMINATE":
            return False
        # Check if the other modifier is a type which can modify self
        # or if they are the same category. If not, don't reduce scope.
        if (
            (other.direction.upper() != "TERMINATE")
            and (other.category.upper() not in self.rule.terminated_by)
            and (other.category.upper() != self.category.upper())
        ):
            return False

        # If two modifiers have the same category but modify different target types,
        # don't limit scope.
        if self.category == other.category and (
            (self.allowed_types != other.allowed_types)
            or (self.excluded_types != other.excluded_types)
        ):
            return False

        orig_scope = self.scope_span
        if self.direction.lower() in ("forward", "bidirectional"):
            if other > self:
                self._scope_end = min(self._scope_end, other.modifier_span[0])
        if self.direction.lower() in ("backward", "bidirectional"):
            if other < self:
                self._scope_start = max(self._scope_start, other.modifier_span[1])
        return orig_scope != self.scope_span

    def modifies(self, target: Span) -> bool:
        """
        Checks whether the target is within the modifier scope and if self is allowed to modify target.

        Args:
            target: a spaCy span representing a target concept.

        Returns:
            Whether the target is within `modifier_scope` and if self is allowed to modify the target.
        """
        # If the target and modifier overlap, meaning at least one token
        # one extracted as both a target and modifier, return False
        # to avoid self-modifying concepts
        if tuple_overlaps(self.modifier_span, (target.start, target.end)):
            return False
        # Compare case-insensitively, like every other direction check in this class.
        if self.direction.upper() in ("TERMINATE", "PSEUDO"):
            return False
        if not self.allows(target.label_.upper()):
            return False

        if tuple_overlaps(self.scope_span, (target.start, target.end)):
            return bool(self.on_modifies(target))
        return False

    def allows(self, target_label: str) -> bool:
        """
        Returns whether a modifier is able to modify a target type.

        Args:
            target_label: The target type to check.

        Returns:
            Whether the modifier is allowed to modify a target of the specified type. When `self.allowed_types` is
            set, True only if `target_label` is in it. Otherwise, when `self.excluded_types` is set, True only if
            `target_label` is not in it. True when neither is set.
        """
        if self.allowed_types is not None:
            return target_label in self.allowed_types
        if self.excluded_types is not None:
            return target_label not in self.excluded_types
        return True

    def on_modifies(self, target: Span) -> bool:
        """
        If the ConTextRule used to define a ConTextModifier has an `on_modifies` callback function, evaluate and return
        either True or False.

        The callback receives the target, the modifier's span, and the span of tokens strictly between the two.

        Args:
            target: The spaCy span to evaluate.

        Returns:
            The result of the `on_modifies` callback for the rule. True if the callback is None.

        Raises:
            ValueError: If the callback returns something other than True or False.
        """
        if self.rule.on_modifies is None:
            return True
        # Find the span in between the target and modifier: it begins where the leftmost of the two ends and
        # stops where the rightmost of the two starts, so neither the target nor the modifier is included.
        start = min(target.end, self._end)
        end = max(target.start, self._start)
        span_between = target.doc[start:end]
        rslt = self.rule.on_modifies(
            target, target.doc[self._start : self._end], span_between
        )
        if rslt not in (True, False):
            raise ValueError(
                "The on_modifies function must return either True or False indicating "
                "whether a modify modifies a target. Actual value: {0}".format(rslt)
            )
        return rslt

    def modify(self, target: Span):
        """
        Add target to the list of self._targets and increment self._num_targets.

        Args:
            target: The spaCy span to add.
        """
        self._targets.append(target)
        self._num_targets += 1

    def reduce_targets(self):
        """
        Reduces the number of targets to the n-closest targets based on the value of `self.max_targets`. If
        `self.max_targets` is None, no pruning is done.

        Distance is measured in tokens between the nearest edges of the modifier and the target. Ties keep their
        original relative order.
        """
        if self.max_targets is None or self.num_targets <= self.max_targets:
            return

        def distance(target):
            return min(abs(self._start - target.end), abs(target.start - self._end))

        # `sorted` is stable and returns a list, so `_targets` stays appendable by `modify`.
        self._targets = sorted(self._targets, key=distance)[: self.max_targets]
        self._num_targets = len(self._targets)

    # Ordering helpers: ">" / ">=" compare start indices, "<" / "<=" compare end indices.
    def __gt__(self, other: ConTextModifier):
        return self._start > other.modifier_span[0]

    def __ge__(self, other):
        return self._start >= other.modifier_span[0]

    def __lt__(self, other):
        return self._end < other.modifier_span[1]

    def __le__(self, other):
        return self._end <= other.modifier_span[1]

    def __len__(self):
        return self._end - self._start

    def __repr__(self):
        return f"<ConTextModifier> [{self._start}, {self._end}, {self.category}]"

    def serialized_representation(self):
        """
        Serialized Representation of the modifier.

        Returns:
            A dict holding the rule (as a dict), the modifier's start/end, its max_scope and its scope bounds.
            Targets are not included.
        """
        dict_repr = dict()
        dict_repr["context_rule"] = self.rule.to_dict()
        dict_repr["start"] = self._start
        dict_repr["end"] = self._end
        dict_repr["max_scope"] = self._max_scope
        dict_repr["scope_start"] = self._scope_start
        dict_repr["scope_end"] = self._scope_end

        return dict_repr

    @classmethod
    def from_serialized_representation(
        cls, serialized_representation
    ) -> ConTextModifier:
        """
        Instantiates the class from the serialized representation.

        Args:
            serialized_representation: A mapping such as the one produced by `serialized_representation`. It is
                not modified.

        Returns:
            A new instance of the class this method is called on, built without a Doc.
        """
        # Work on a shallow copy: replacing the rule dict in the caller's mapping would make a second
        # deserialization of the same payload hand a rule object to ConTextRule.from_dict.
        kwargs = dict(serialized_representation)
        kwargs["context_rule"] = ConTextRule.from_dict(kwargs["context_rule"])
        kwargs["doc"] = None

        return cls(**kwargs)
allowed_types property

Returns the associated allowed types.

category property

Returns the associated category.

direction property

Returns the associated direction.

excluded_types property

Returns the associated excluded types.

max_scope property

Returns the associated maximum scope.

max_targets property

Returns the associated maximum number of targets.

modifier_span property

The (start, end) token indices covered by this match.

num_targets property

Returns the associated number of targets.

rule property

Returns the associated context rule.

scope_span property

Returns the associated scope.

__init__(context_rule, start, end, doc, scope_start=None, scope_end=None, max_scope=None)

Create a new ConTextModifier from a document span. Each modifier represents a span in the text and a surrounding window. Spans such as entities or other members of span groups that occur within this window can be modified by this ConTextModifier.

Parameters:

Name Type Description Default
context_rule ConTextRule

The ConTextRule object which defines the modifier.

required
start int

The start token index.

required
end int

The end token index (non-inclusive).

required
doc Doc

The spaCy Doc which contains this span. This is needed to initialize the modifier but is not maintained.

required
scope_start Optional[int]

The start token index of the scope.

None
scope_end Optional[int]

The end index of the scope.

None
max_scope Optional[int]

Whether to use scope values rather than sentence boundaries for modifications.

None
Source code in medspacy/context/context_modifier.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
def __init__(
    self,
    context_rule: ConTextRule,
    start: int,
    end: int,
    doc: Doc,
    scope_start: Optional[int] = None,
    scope_end: Optional[int] = None,
    max_scope: Optional[int] = None,
):
    """
    Builds a ConTextModifier for the tokens `[start, end)` matched by a rule. The modifier starts with no targets.

    Args:
        context_rule: The ConTextRule object which defines the modifier.
        start: The start token index.
        end: The end token index (non-inclusive).
        doc: The spaCy Doc which contains this span. It is only used to compute the scope and is not stored.
            When it is None, no scope is computed.
        scope_start: The start token index of the scope, if already known.
        scope_end: The end index of the scope, if already known.
        max_scope: Stored as given and consulted when the scope is computed.
    """
    self._context_rule = context_rule
    self._start, self._end = start, end

    # No targets have been attached yet.
    self._targets = []
    self._num_targets = 0

    self._max_scope = max_scope
    self._scope_start = scope_start
    self._scope_end = scope_end

    # A scope is computed only when one of the bounds is missing and there is a doc to compute it from.
    scope_is_complete = scope_start is not None and scope_end is not None
    if doc is None or scope_is_complete:
        return
    self.__set_scope(doc)
__set_scope(doc)

Applies the direction of the ConTextRule which generated this ConTextModifier to define a scope. If self._max_scope is None, then the default scope is the sentence which it occurs in whichever direction defined by self.direction. For example, if the direction is "forward", the scope will be [self.end: sentence.end]. If the direction is "backward", it will be [self.start: sentence.start].

If self.max_scope is not None and the length of the default scope is longer than self.max_scope, it will be reduced to self.max_scope.

Parameters:

Name Type Description Default
doc Doc

The spaCy doc to use to set scope.

required
Source code in medspacy/context/context_modifier.py
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
def __set_scope(self, doc: Doc):
    """
    Computes `self._scope_start` and `self._scope_end` from the rule's direction.

    The widest possible scope is either a token window around the modifier (when `self._max_scope` is truthy) or
    the sentence containing the modifier's first token. A "forward" modifier then covers the tokens from its own
    end to the right edge of that span, a "backward" modifier covers the tokens from the left edge of that span
    to its own start, and any other direction covers the whole span.

    When `self.max_scope` is not None, the scope reaches at most that many tokens before the modifier's start
    and after the modifier's end.

    Args:
        doc: The spaCy doc to use to set scope.

    Raises:
        ValueError: If no window is configured and the sentence of the modifier's first token is None.
    """
    if self._max_scope:
        # ConText is set to use defined windows rather than sentence splitting.
        matched_tokens = doc[self._start : self._end]
        full_scope_span = matched_tokens._.window(n=self.rule.max_scope)
    else:
        # Fall back to the sentence that holds the first token of the modifier.
        full_scope_span = doc[self._start].sent
        if full_scope_span is None:
            raise ValueError(
                "ConText failed because sentence boundaries have not been set. Add an upstream component such as the "
                "dependency parser, Sentencizer, or PyRuSH to detect sentence boundaries or initialize ConText with "
                "`max_scope` set to a value greater than 0."
            )

    direction = self.direction.lower()
    limit = self.max_scope

    # Outer edges of the scope on each side, pulled in towards the modifier when a limit applies.
    left_edge = full_scope_span.start
    right_edge = full_scope_span.end
    if limit is not None:
        left_edge = max(left_edge, self._start - limit)
        right_edge = min(right_edge, self._end + limit)

    if direction == "forward":
        self._scope_start, self._scope_end = self._end, right_edge
    elif direction == "backward":
        self._scope_start, self._scope_end = left_edge, self._start
    else:  # bidirectional
        self._scope_start, self._scope_end = left_edge, right_edge
allows(target_label)

Returns whether a modifier is able to modify a target type.

Parameters:

Name Type Description Default
target_label str

The target type to check.

required

Returns:

Type Description
bool

Whether the modifier is allowed to modify a target of the specified type. True if target_label in

bool

self.allowed_types or if target_label not in self.excluded_types. False otherwise.

Source code in medspacy/context/context_modifier.py
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
def allows(self, target_label: str) -> bool:
    """
    Tells whether this modifier may modify targets of the given type.

    Args:
        target_label: The target type to check.

    Returns:
        When `self.allowed_types` is set, True only if `target_label` is in it. Otherwise, when
        `self.excluded_types` is set, True only if `target_label` is not in it. True when neither is set.
    """
    allowed = self.allowed_types
    if allowed is not None:
        # An allow-list takes precedence; the exclude-list is not consulted.
        return target_label in allowed

    excluded = self.excluded_types
    return excluded is None or target_label not in excluded
from_serialized_representation(serialized_representation) classmethod

Instantiates the class from the serialized representation

Source code in medspacy/context/context_modifier.py
379
380
381
382
383
384
385
386
387
388
389
390
391
@classmethod
def from_serialized_representation(
    cls, serialized_representation
) -> ConTextModifier:
    """
    Instantiates the class from the serialized representation.

    Args:
        serialized_representation: A mapping of constructor keyword arguments in which "context_rule" holds the
            rule as a dict. It is not modified.

    Returns:
        A new instance of the class this method is called on, built without a Doc.
    """
    # Work on a shallow copy: replacing the rule dict in the caller's mapping would make a second
    # deserialization of the same payload hand a rule object to ConTextRule.from_dict.
    kwargs = dict(serialized_representation)
    kwargs["context_rule"] = ConTextRule.from_dict(kwargs["context_rule"])
    kwargs["doc"] = None

    # Build through `cls` so that subclasses deserialize to their own type.
    return cls(**kwargs)
limit_scope(other)

If self and other have the same category or if other has a directionality of 'terminate', use the span of other to update the scope of self. Limiting the scope of two modifiers of the same category reduces the number of modifiers. For example, in 'no evidence of CHF, no pneumonia', 'pneumonia' will only be modified by 'no', not 'no evidence of'. 'terminate' modifiers limit the scope of a modifier like 'no evidence of' in 'no evidence of CHF, but there is pneumonia'

Parameters:

Name Type Description Default
other ConTextModifier

The modifier to check against.

required

Returns:

Type Description
bool

Whether the other modifier modified the scope of self.

Source code in medspacy/context/context_modifier.py
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
def limit_scope(self, other: ConTextModifier) -> bool:
    """
    Uses the span of `other` to shrink the scope of self when `other` is entitled to do so.

    `other` can limit self when it has a directionality of 'terminate', when its category is listed in the
    `terminated_by` of self's rule, or when both share the same category. Limiting two modifiers of the same
    category reduces the number of modifiers: in 'no evidence of CHF, no pneumonia', 'pneumonia' will only be
    modified by 'no', not 'no evidence of'. A 'terminate' modifier limits the scope of a modifier like
    'no evidence of' in 'no evidence of CHF, but there is pneumonia'.

    Args:
        other: The modifier to check against.

    Returns:
        Whether the other modifier modified the scope of self.
    """
    # Scopes that never meet cannot limit one another.
    if not tuple_overlaps(self.scope_span, other.scope_span):
        return False
    # The scope of a 'terminate' modifier is never reduced.
    if self.direction.upper() == "TERMINATE":
        return False

    # `other` must be a terminator, a category that terminates self's rule, or the same category as self.
    other_may_limit = (
        other.direction.upper() == "TERMINATE"
        or other.category.upper() in self.rule.terminated_by
        or other.category.upper() == self.category.upper()
    )
    if not other_may_limit:
        return False

    # Same category but different target types: the two modifiers do not compete, so keep the scope.
    if self.category == other.category:
        if self.allowed_types != other.allowed_types:
            return False
        if self.excluded_types != other.excluded_types:
            return False

    scope_before = self.scope_span
    direction = self.direction.lower()
    # A later modifier caps the right edge of a forward-looking scope.
    if direction in ("forward", "bidirectional") and other > self:
        self._scope_end = min(self._scope_end, other.modifier_span[0])
    # An earlier modifier caps the left edge of a backward-looking scope.
    if direction in ("backward", "bidirectional") and other < self:
        self._scope_start = max(self._scope_start, other.modifier_span[1])
    return scope_before != self.scope_span
modifies(target)

Checks whether the target is within the modifier scope and if self is allowed to modify target.

Parameters:

Name Type Description Default
target Span

a spaCy span representing a target concept.

required

Returns:

Type Description
bool

Whether the target is within modifier_scope and if self is allowed to modify the target.

Source code in medspacy/context/context_modifier.py
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
def modifies(self, target: Span) -> bool:
    """
    Tells whether self modifies the target: the target must lie within the modifier scope and self must be
    allowed to modify it.

    Args:
        target: a spaCy span representing a target concept.

    Returns:
        Whether the target is within `modifier_scope` and if self is allowed to modify the target.
    """
    target_span = (target.start, target.end)

    # When target and modifier overlap, at least one token was extracted as both a target and a modifier;
    # refuse so that a concept never modifies itself.
    if tuple_overlaps(self.modifier_span, target_span):
        return False
    # These two directions never modify targets.
    if self.direction in ("TERMINATE", "PSEUDO"):
        return False
    if not self.allows(target.label_.upper()):
        return False
    if not tuple_overlaps(self.scope_span, target_span):
        return False

    # The rule's optional callback has the final say.
    return bool(self.on_modifies(target))
modify(target)

Add target to the list of self._targets and increment self._num_targets.

Parameters:

Name Type Description Default
target Span

The spaCy span to add.

required
Source code in medspacy/context/context_modifier.py
321
322
323
324
325
326
327
328
329
def modify(self, target: Span):
    """
    Registers a target as modified by self: appends it to `self._targets` and increases `self._num_targets`
    by one.

    Args:
        target: The spaCy span to add.
    """
    targets = self._targets
    targets.append(target)
    self._num_targets = self._num_targets + 1
on_modifies(target)

If the ConTextRule used to define a ConTextModifier has an on_modifies callback function, evaluate and return either True or False.

Parameters:

Name Type Description Default
target Span

The spaCy span to evaluate.

required

Returns:

Type Description
bool

The result of the on_modifies callback for the rule. True if the callback is None.

Source code in medspacy/context/context_modifier.py
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
def on_modifies(self, target: Span) -> bool:
    """
    If the ConTextRule used to define a ConTextModifier has an `on_modifies` callback function, evaluate and return
    either True or False.

    The callback is called with three arguments: the target, the modifier span
    (`target.doc[self._start : self._end]`), and the span lying between the two.

    Args:
        target: The spaCy span to evaluate.

    Returns:
        The result of the `on_modifies` callback for the rule. True if the callback is None.

    Raises:
        ValueError: If the callback returns something that is neither True nor False.
    """
    callback = self.rule.on_modifies
    if callback is None:
        return True
    # The span in between runs from whichever of the two ends first to whichever starts last. The upper
    # bound must be the modifier's *start*: using its end would include the modifier's own tokens whenever
    # the target precedes the modifier.
    start = min(target.end, self._end)
    end = max(target.start, self._start)
    span_between = target.doc[start:end]
    rslt = callback(target, target.doc[self._start : self._end], span_between)
    if rslt not in (True, False):
        raise ValueError(
            "The on_modifies function must return either True or False indicating "
            "whether a modify modifies a target. Actual value: {0}".format(rslt)
        )
    return rslt
reduce_targets()

Reduces the number of targets to the n-closest targets based on the value of self.max_targets. If self.max_targets is None, no pruning is done.

Source code in medspacy/context/context_modifier.py
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
def reduce_targets(self):
    """
    Reduces the number of targets to the n-closest targets based on the value of `self.max_targets`. If
    `self.max_targets` is None, or there are no more targets than that, no pruning is done.

    The distance of a target is the smaller of the gaps between the modifier's start and the target's end and
    between the target's start and the modifier's end. Equally distant targets keep their original order.
    """
    if self.max_targets is None or self.num_targets <= self.max_targets:
        return

    def distance(target):
        return min(abs(self._start - target.end), abs(target.start - self._end))

    # sorted() is stable and returns a list. Keeping `_targets` a list (rather than the tuple produced by
    # unzipping) matters because modify() appends to it.
    self._targets = sorted(self._targets, key=distance)[: self.max_targets]
    self._num_targets = len(self._targets)
serialized_representation()

Serialized Representation of the modifier

Source code in medspacy/context/context_modifier.py
365
366
367
368
369
370
371
372
373
374
375
376
377
def serialized_representation(self):
    """
    Builds the serialized representation of the modifier.

    Returns:
        A dictionary holding the rule (as produced by its `to_dict`), the modifier's start and end, its max
        scope, and the start and end of its scope.
    """
    return {
        "context_rule": self.rule.to_dict(),
        "start": self._start,
        "end": self._end,
        "max_scope": self._max_scope,
        "scope_start": self._scope_start,
        "scope_end": self._scope_end,
    }
update_scope(span)

Changes the scope of self to be the given spaCy span.

Parameters:

Name Type Description Default
span Span

a spaCy Span which contains the scope which a modifier should cover.

required
Source code in medspacy/context/context_modifier.py
193
194
195
196
197
198
199
200
201
def update_scope(self, span: Span):
    """
    Replaces the scope of self with the boundaries of the given spaCy span.

    Args:
        span: a spaCy Span which contains the scope which a modifier should cover.
    """
    self._scope_start, self._scope_end = span.start, span.end

context_rule

ConTextRule

Bases: BaseRule

A ConTextRule defines a ConText modifier. ConTextRules are rules which define which spans are extracted as modifiers and how they behave, such as the phrase to be matched, the category/semantic class, the direction of the modifier in the text, and what types of target spans can be modified.

Source code in medspacy/context/context_rule.py
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
class ConTextRule(BaseRule):
    """
    A ConTextRule defines a ConText modifier. ConTextRules are rules which define which spans are extracted as modifiers
    and how they behave, such as the phrase to be matched, the category/semantic class, the direction of the modifier in
    the text, and what types of target spans can be modified.
    """

    _ALLOWED_DIRECTIONS = (
        "FORWARD",
        "BACKWARD",
        "BIDIRECTIONAL",
        "TERMINATE",
        "PSEUDO",
    )
    # Keys accepted by `from_dict` and written by `to_dict`. The callbacks (`on_match`, `on_modifies`) are
    # not part of the dictionary form.
    _ALLOWED_KEYS = {
        "literal",
        "direction",
        "pattern",
        "category",
        "metadata",
        "allowed_types",
        "excluded_types",
        "max_targets",
        "max_scope",
        "terminated_by",
    }

    def __init__(
        self,
        literal: str,
        category: str,
        pattern: Optional[Union[str, List[Dict[str, str]]]] = None,
        direction: str = "BIDIRECTIONAL",
        on_match: Optional[
            Callable[[Matcher, Doc, int, List[Tuple[int, int, int]]], Any]
        ] = None,
        on_modifies: Optional[Callable[[Span, Span, Span], bool]] = None,
        allowed_types: Optional[Set[str]] = None,
        excluded_types: Optional[Set[str]] = None,
        max_scope: Optional[int] = None,
        max_targets: Optional[int] = None,
        terminated_by: Optional[Set[str]] = None,
        metadata: Optional[Dict[Any, Any]] = None,
    ):
        """
        Creates a ConTextRule object.

        The primary arguments of `literal`, `category`, and `direction` define the span of text to be matched, the
        semantic category, and the direction within the sentence in which the modifier operates.
        Other arguments specify additional custom logic such as:
            - Additional control over what text can be matched as a modifier (pattern and on_match)
            - Which types of targets can be modified (allowed_types, excluded_types)
            - The scope size and number of targets that a modifier can modify (max_targets, max_scope)
            - Other logic for terminating a span or for allowing a modifier to modify a target (on_modifies,
            terminated_by)

        Args:
            literal: The string representation of a concept. If `pattern` is None, this string will be lower-cased and
                matched to the lower-case string. If `pattern` is not None, this argument will not be used for matching
                but can be used as a reference as the rule name.
            category: The semantic class of the matched span. This corresponds to the `label_` attribute of an entity.
                It is stored upper-cased.
            pattern: A list or string to use as a spaCy pattern rather than `literal`. If a list, will use spaCy
                token-based pattern matching to match using token attributes. If a string, will use medspaCy's
                RegexMatcher. If None, will use `literal` as the pattern for phrase matching. For more information, see
                https://spacy.io/usage/rule-based-matching.
            direction: The directionality or action of a modifier. This defines which part of a sentence a modifier will
                include as its scope. Entities within the scope will be considered to be modified. The value is
                case-insensitive and stored upper-cased.
                Valid values are:
                - "FORWARD": Scope will begin after the end of a modifier and move to the right
                - "BACKWARD": Scope will begin before the beginning of a modifier and move to the left
                - "BIDIRECTIONAL": Scope will expand on either side of a modifier
                - "TERMINATE": A special direction to limit any other modifiers if this phrase is in its scope. Example:
                    "no evidence of chf but there is pneumonia": "but" will prevent "no evidence of" from modifying
                    "pneumonia"
                - "PSEUDO": A special direction which will not modify any targets. This can be used for differentiating
                    superstrings of modifiers. Example: A modifier with literal="negative attitude" will prevent the
                    phrase "negative" in "She has a negative attitude about her treatment" from being extracted as a
                    modifier.
            on_match: An optional callback function or other callable which takes 4 arguments: `(matcher, doc, i,
                matches)`. For more information, see https://spacy.io/usage/rule-based-matching#on_match
            on_modifies: Callback function to run when building an edge between a target and a modifier. This allows
                specifying custom logic for allowing or preventing certain modifiers from modifying certain targets. The
                callable should take 3 arguments:
                    target: The spaCy Span from doc.ents (i.e., 'Evidence of pneumonia')
                    modifier: The spaCy Span covered in a resulting modifier (i.e., 'no evidence of')
                    span_between: The Span between the target and modifier in question.
                Should return either True or False. If returns False, then the modifier will not modify the target.
            allowed_types: A collection of target labels to allow a modifier to modify. If None, will apply to any type
                not specifically excluded in excluded_types. Only one of allowed_types and excluded_types can be used.
                An error will be thrown if both are not None. Labels are stored upper-cased.
            excluded_types: A collection of target labels which this modifier cannot modify. If None, will apply to all
                target types unless allowed_types is not None. Labels are stored upper-cased.
            max_scope: A number of tokens to explicitly limit the size of the modifier's scope. If None, the scope will
                include the entire sentence in the direction of `direction` and the entire sentence for "BIDIRECTIONAL".
                This is useful for requiring modifiers be very close to a concept in the text or for preventing long
                modifier ranges caused by sentence splitting problems. Must be greater than 0 when given.
            max_targets: The maximum number of targets which a modifier can modify. If None, will modify all targets in
                its scope. Must be greater than 0 when given.
            terminated_by: An optional collection of other modifier categories which will terminate the scope of this
                modifier. If None, only "TERMINATE" will do this. Example: if a ConTextRule defining "positive for" has
                terminated_by={"NEGATED_EXISTENCE"}, then in the sentence "positive for flu, negative for RSV", the
                positive modifier will modify "flu" but will be terminated by "negative for" and will not modify "RSV".
                This helps prevent multiple conflicting modifiers from distributing too far across a sentence.
                Categories are stored upper-cased.
            metadata: Optional dictionary of any extra metadata.

        Raises:
            ValueError: If both `allowed_types` and `excluded_types` are given, if `max_targets` or `max_scope` is
                not greater than 0, if `terminated_by` is a single string, or if `direction` is not recognized.
        """
        super().__init__(literal, category.upper(), pattern, on_match, metadata)
        self.on_modifies = on_modifies

        if allowed_types is not None and excluded_types is not None:
            raise ValueError(
                "A ConTextRule was instantiated with non-null values for both allowed_types and excluded_types. "
                "Only one of these can be non-null."
            )
        if allowed_types is not None:
            self.allowed_types = {label.upper() for label in allowed_types}
        else:
            self.allowed_types = None
        if excluded_types is not None:
            self.excluded_types = {label.upper() for label in excluded_types}
        else:
            self.excluded_types = None

        # Zero is rejected as well as negatives, so the messages say "> 0".
        if max_targets is not None and max_targets <= 0:
            raise ValueError("max_targets must be > 0 or None.")
        self.max_targets = max_targets
        if max_scope is not None and max_scope <= 0:
            raise ValueError("max_scope must be > 0 or None.")
        self.max_scope = max_scope
        if terminated_by is None:
            terminated_by = set()
        else:
            # A bare string would be iterated character by character below, so reject it explicitly.
            if isinstance(terminated_by, str):
                raise ValueError(
                    f"terminated_by must be an iterable, such as a list or set, not {terminated_by}."
                )
            terminated_by = {string.upper() for string in terminated_by}

        self.terminated_by = terminated_by

        self.metadata = metadata

        if direction.upper() not in self._ALLOWED_DIRECTIONS:
            raise ValueError(
                "Direction {0} not recognized. Must be one of: {1}".format(
                    direction, self._ALLOWED_DIRECTIONS
                )
            )
        self.direction = direction.upper()

    @classmethod
    def from_json(cls, filepath) -> List[ConTextRule]:
        """
        Reads in a lexicon of modifiers from a JSON file under the key `context_rules`.

        Args:
            filepath: The .json file containing modifier rules. Must contain `context_rules` key containing the rule
                JSONs.

        Returns:
            A list of ConTextRules objects read from the JSON.
        """
        # Decode explicitly as UTF-8 instead of relying on the platform's default encoding.
        with open(filepath, encoding="utf-8") as file:
            modifier_data = json.load(file)
        return [ConTextRule.from_dict(data) for data in modifier_data["context_rules"]]

    @classmethod
    def from_dict(cls, rule_dict) -> ConTextRule:
        """
        Reads a dictionary into a ConTextRule.

        Args:
            rule_dict: The dictionary to convert. Its keys are passed to the constructor as keyword arguments.

        Returns:
            The ConTextRule created from the dictionary.

        Raises:
            ValueError: If the dictionary contains a key which is not in `_ALLOWED_KEYS`.
        """
        keys = set(rule_dict.keys())
        invalid_keys = keys.difference(cls._ALLOWED_KEYS)
        if invalid_keys:
            msg = (
                "JSON object contains invalid keys: {0}.\n"
                "Must be one of: {1}".format(invalid_keys, cls._ALLOWED_KEYS)
            )
            raise ValueError(msg)
        rule = ConTextRule(**rule_dict)
        return rule

    def to_dict(self):
        """
        Converts the ConTextRule to a python dictionary. Used when writing context rules to a json file.

        Only the keys in `_ALLOWED_KEYS` are written. Sets are converted to lists, and attributes whose value is
        None are left out.

        Returns:
            The dictionary containing the ConTextRule info.
        """

        rule_dict = {}
        for key in self._ALLOWED_KEYS:
            value = self.__dict__.get(key)
            if isinstance(value, set):
                value = list(value)
            if value is None:
                continue
            # An empty `terminated_by` is what the constructor uses when the argument is omitted, so leaving it
            # out keeps the output of rules without terminating categories unchanged.
            if key == "terminated_by" and not value:
                continue
            rule_dict[key] = value
        return rule_dict

    @classmethod
    def to_json(cls, context_rules: List[ConTextRule], filepath: str):
        """
        Writes ConTextRules to a json file under the key `context_rules`.

        Args:
            context_rules: a list of ContextRules that will be written to a file.
            filepath: the .json file to contain modifier rules
        """
        data = {"context_rules": [rule.to_dict() for rule in context_rules]}
        with open(filepath, "w", encoding="utf-8") as file:
            json.dump(data, file, indent=4)

    def __repr__(self):
        return (
            f"ConTextRule(literal='{self.literal}', category='{self.category}', pattern={self.pattern}, "
            f"direction='{self.direction}')"
        )
__init__(literal, category, pattern=None, direction='BIDIRECTIONAL', on_match=None, on_modifies=None, allowed_types=None, excluded_types=None, max_scope=None, max_targets=None, terminated_by=None, metadata=None)

Creates a ConTextRule object.

The primary arguments of literal, category, and direction define the span of text to be matched, the semantic category, and the direction within the sentence in which the modifier operates. Other arguments specify additional custom logic such as: - Additional control over what text can be matched as a modifier (pattern and on_match) - Which types of targets can be modified (allowed_types, excluded_types) - The scope size and number of targets that a modifier can modify (max_targets, max_scope) - Other logic for terminating a span or for allowing a modifier to modify a target (on_modifies, terminated_by)

Parameters:

Name Type Description Default
literal str

The string representation of a concept. If pattern is None, this string will be lower-cased and matched to the lower-case string. If pattern is not None, this argument will not be used for matching but can be used as a reference as the rule name.

required
category str

The semantic class of the matched span. This corresponds to the label_ attribute of an entity.

required
pattern Optional[Union[str, List[Dict[str, str]]]]

A list or string to use as a spaCy pattern rather than literal. If a list, will use spaCy token-based pattern matching to match using token attributes. If a string, will use medspaCy's RegexMatcher. If None, will use literal as the pattern for phrase matching. For more information, see https://spacy.io/usage/rule-based-matching.

None
direction str

The directionality or action of a modifier. This defines which part of a sentence a modifier will include as its scope. Entities within the scope will be considered to be modified. Valid values are: - "FORWARD": Scope will begin after the end of a modifier and move to the right - "BACKWARD": Scope will begin before the beginning of a modifier and move to the left - "BIDIRECTIONAL": Scope will expand on either side of a modifier - "TERMINATE": A special direction to limit any other modifiers if this phrase is in its scope. Example: "no evidence of chf but there is pneumonia": "but" will prevent "no evidence of" from modifying "pneumonia" - "PSEUDO": A special direction which will not modify any targets. This can be used for differentiating superstrings of modifiers. Example: A modifier with literal="negative attitude" will prevent the phrase "negative" in "She has a negative attitude about her treatment" from being extracted as a modifier.

'BIDIRECTIONAL'
on_match Optional[Callable[[Matcher, Doc, int, List[Tuple[int, int, int]]], Any]]

An optional callback function or other callable which takes 4 arguments: (matcher, doc, i, matches). For more information, see https://spacy.io/usage/rule-based-matching#on_match

None
on_modifies Optional[Callable[[Span, Span, Span], bool]]

Callback function to run when building an edge between a target and a modifier. This allows specifying custom logic for allowing or preventing certain modifiers from modifying certain targets. The callable should take 3 arguments: target: The spaCy Span from doc.ents (i.e., 'Evidence of pneumonia'); modifier: The spaCy Span covered in a resulting modifier (i.e., 'no evidence of'); span_between: The Span between the target and modifier in question. It should return either True or False. If it returns False, then the modifier will not modify the target.

None
allowed_types Optional[Set[str]]

A collection of target labels to allow a modifier to modify. If None, will apply to any type not specifically excluded in excluded_types. Only one of allowed_types and excluded_types can be used. An error will be thrown if both are not None.

None
excluded_types Optional[Set[str]]

A collection of target labels which this modifier cannot modify. If None, will apply to all target types unless allowed_types is not None.

None
max_scope Optional[int]

A number of tokens to explicitly limit the size of the modifier's scope. If None, the scope will include the entire sentence in the direction of direction and the entire sentence for "BIDIRECTIONAL". This is useful for requiring modifiers be very close to a concept in the text or for preventing long modifier ranges caused by sentence splitting problems.

None
max_targets Optional[int]

The maximum number of targets which a modifier can modify. If None, will modify all targets in its scope.

None
terminated_by Optional[Set[str]]

An optional collection of other modifier categories which will terminate the scope of this modifier. If None, only "TERMINATE" will do this. Example: if a ConTextRule defining "positive for" has terminated_by={"NEGATED_EXISTENCE"}, then in the sentence "positive for flu, negative for RSV", the positive modifier will modify "flu" but will be terminated by "negative for" and will not modify "RSV". This helps prevent multiple conflicting modifiers from distributing too far across a sentence.

None
metadata Optional[Dict[Any, Any]]

Optional dictionary of any extra metadata.

None
Source code in medspacy/context/context_rule.py
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
def __init__(
    self,
    literal: str,
    category: str,
    pattern: Optional[Union[str, List[Dict[str, str]]]] = None,
    direction: str = "BIDIRECTIONAL",
    on_match: Optional[
        Callable[[Matcher, Doc, int, List[Tuple[int, int, int]]], Any]
    ] = None,
    on_modifies: Optional[Callable[[Span, Span, Span], bool]] = None,
    allowed_types: Optional[Set[str]] = None,
    excluded_types: Optional[Set[str]] = None,
    max_scope: Optional[int] = None,
    max_targets: Optional[int] = None,
    terminated_by: Optional[Set[str]] = None,
    metadata: Optional[Dict[Any, Any]] = None,
):
    """
    Creates a ConTextRule object.

    The primary arguments of `literal`, `category`, and `direction` define the span of text to be matched, the
    semantic category, and the direction within the sentence in which the modifier operates.
    Other arguments specify additional custom logic such as:
        - Additional control over what text can be matched as a modifier (pattern and on_match)
        - Which types of targets can be modified (allowed_types, excluded_types)
        - The scope size and number of targets that a modifier can modify (max_targets, max_scope)
        - Other logic for terminating a span or for allowing a modifier to modify a target (on_modifies,
        terminated_by)

    Args:
        literal: The string representation of a concept. If `pattern` is None, this string will be lower-cased and
            matched to the lower-case string. If `pattern` is not None, this argument will not be used for matching
            but can be used as a reference as the rule name.
        category: The semantic class of the matched span. This corresponds to the `label_` attribute of an entity.
            It is upper-cased before being stored.
        pattern: A list or string to use as a spaCy pattern rather than `literal`. If a list, will use spaCy
            token-based pattern matching to match using token attributes. If a string, will use medspaCy's
            RegexMatcher. If None, will use `literal` as the pattern for phrase matching. For more information, see
            https://spacy.io/usage/rule-based-matching.
        direction: The directionality or action of a modifier. This defines which part of a sentence a modifier will
            include as its scope. Entities within the scope will be considered to be modified. The value is
            compared case-insensitively and stored upper-cased.
            Valid values are:
            - "FORWARD": Scope will begin after the end of a modifier and move to the right
            - "BACKWARD": Scope will begin before the beginning of a modifier and move to the left
            - "BIDIRECTIONAL": Scope will expand on either side of a modifier
            - "TERMINATE": A special direction to limit any other modifiers if this phrase is in its scope. Example:
                "no evidence of chf but there is pneumonia": "but" will prevent "no evidence of" from modifying
                "pneumonia"
            - "PSEUDO": A special direction which will not modify any targets. This can be used for differentiating
                superstrings of modifiers. Example: A modifier with literal="negative attitude" will prevent the
                phrase "negative" in "She has a negative attitude about her treatment" from being extracted as a
                modifier.
        on_match: An optional callback function or other callable which takes 4 arguments: `(matcher, doc, i,
            matches)`. For more information, see https://spacy.io/usage/rule-based-matching#on_match
        on_modifies: Callback function to run when building an edge between a target and a modifier. This allows
            specifying custom logic for allowing or preventing certain modifiers from modifying certain targets. The
            callable should take 3 arguments:
                target: The spaCy Span from doc.ents (ie., 'Evidence of pneumonia')
                modifier: The spaCy Span covered in a resulting modifier (ie., 'no evidence of')
                span_between: The Span between the target and modifier in question.
            Should return either True or False. If returns False, then the modifier will not modify the target.
        allowed_types: A collection of target labels to allow a modifier to modify. If None, will apply to any type
            not specifically excluded in excluded_types. Only one of allowed_types and excluded_types can be used.
            An error will be thrown if both are not None. Labels are stored upper-cased.
        excluded_types: A collection of target labels which this modifier cannot modify. If None, will apply to all
            target types unless allowed_types is not None. Labels are stored upper-cased.
        max_scope: A positive number of tokens to explicitly limit the size of the modifier's scope. If None, the
            scope will include the entire sentence in the direction of `direction` and the entire sentence for
            "BIDIRECTIONAL". This is useful for requiring modifiers be very close to a concept in the text or for
            preventing long modifier ranges caused by sentence splitting problems.
        max_targets: The maximum (positive) number of targets which a modifier can modify. If None, will modify all
            targets in its scope.
        terminated_by: An optional collection of other modifier categories which will terminate the scope of this
            modifier. If None, only "TERMINATE" will do this. Example: if a ConTextRule defining "positive for" has
            terminated_by={"NEGATED_EXISTENCE"}, then in the sentence "positive for flu, negative for RSV", the
            positive modifier will modify "flu" but will be terminated by "negative for" and will not modify "RSV".
            This helps prevent multiple conflicting modifiers from distributing too far across a sentence.
            Categories are stored upper-cased; None is stored as an empty set.
        metadata: Optional dictionary of any extra metadata.

    Raises:
        ValueError: If both `allowed_types` and `excluded_types` are given, if `max_targets` or `max_scope` is
            zero or negative, if `terminated_by` is a plain string, or if `direction` is not one of the allowed
            directions.
    """
    super().__init__(literal, category.upper(), pattern, on_match, metadata)
    self.on_modifies = on_modifies

    if allowed_types is not None and excluded_types is not None:
        raise ValueError(
            "A ConTextRule was instantiated with non-null values for both allowed_types and excluded_types. "
            "Only one of these can be non-null."
        )
    if allowed_types is not None:
        self.allowed_types = {label.upper() for label in allowed_types}
    else:
        self.allowed_types = None
    if excluded_types is not None:
        self.excluded_types = {label.upper() for label in excluded_types}
    else:
        self.excluded_types = None

    # Zero is rejected as well as negatives, so the messages must say "> 0".
    if max_targets is not None and max_targets <= 0:
        raise ValueError("max_targets must be > 0 or None.")
    self.max_targets = max_targets
    if max_scope is not None and max_scope <= 0:
        raise ValueError("max_scope must be > 0 or None.")
    self.max_scope = max_scope
    if terminated_by is None:
        terminated_by = set()
    else:
        # A bare string would be iterated character by character below, so reject it explicitly.
        if isinstance(terminated_by, str):
            raise ValueError(
                f"terminated_by must be an iterable, such as a list or set, not {terminated_by}."
            )
        terminated_by = {string.upper() for string in terminated_by}

    self.terminated_by = terminated_by

    self.metadata = metadata

    if direction.upper() not in self._ALLOWED_DIRECTIONS:
        raise ValueError(
            "Direction {0} not recognized. Must be one of: {1}".format(
                direction, self._ALLOWED_DIRECTIONS
            )
        )
    self.direction = direction.upper()
from_dict(rule_dict) classmethod

Reads a dictionary into a ConTextRule.

Parameters:

Name Type Description Default
rule_dict

The dictionary to convert.

required

Returns:

Type Description
ConTextRule

The ConTextRule created from the dictionary.

Source code in medspacy/context/context_rule.py
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
@classmethod
def from_dict(cls, rule_dict) -> ConTextRule:
    """
    Builds a ConTextRule from a dictionary of constructor arguments.

    Args:
        rule_dict: The dictionary to convert. Every key must be one of `cls._ALLOWED_KEYS`; the entries are
            passed to the ConTextRule constructor as keyword arguments.

    Returns:
        The ConTextRule created from the dictionary.

    Raises:
        ValueError: If `rule_dict` contains a key that is not in `cls._ALLOWED_KEYS`.
    """
    unexpected = set(rule_dict.keys()).difference(cls._ALLOWED_KEYS)
    if not unexpected:
        return ConTextRule(**rule_dict)
    # Report every offending key at once, together with the accepted ones.
    raise ValueError(
        "JSON object contains invalid keys: {0}.\n"
        "Must be one of: {1}".format(unexpected, cls._ALLOWED_KEYS)
    )
from_json(filepath) classmethod

Reads in a lexicon of modifiers from a JSON file under the key context_rules.

Parameters:

Name Type Description Default
filepath

The .json file containing modifier rules. Must contain context_rules key containing the rule JSONs.

required

Returns:

Type Description
List[ConTextRule]

A list of ConTextRules objects read from the JSON.

Source code in medspacy/context/context_rule.py
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
@classmethod
def from_json(cls, filepath) -> List[ConTextRule]:
    """
    Reads in a lexicon of modifiers from a JSON file under the key `context_rules`.

    The file is decoded as UTF-8 rather than with the platform's locale encoding, so rules containing
    non-ASCII text load the same way on every operating system.

    Args:
        filepath: The .json file containing modifier rules. Must contain `context_rules` key containing the rule
            JSONs.

    Returns:
        A list of ConTextRules objects read from the JSON.

    Raises:
        KeyError: If the JSON object has no `context_rules` key.
        ValueError: If a rule object contains a key that `from_dict` does not accept.
    """
    with open(filepath, encoding="utf-8") as file:
        modifier_data = json.load(file)
    return [ConTextRule.from_dict(data) for data in modifier_data["context_rules"]]
to_dict()

Converts a ConTextRule to a Python dictionary. Used when writing context rules to a JSON file.

Returns:

Type Description

The dictionary containing the ConTextRule info.

Source code in medspacy/context/context_rule.py
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
def to_dict(self):
    """
    Serializes this rule's attributes into a plain python dictionary, e.g. for writing rules to a json file.

    Only attributes named in `_ALLOWED_KEYS` are considered. Attributes that are missing or None are left out,
    and set values are converted to lists.

    Returns:
        The dictionary containing the ConTextRule info.
    """
    attributes = vars(self)
    serialized = {}
    for name in self._ALLOWED_KEYS:
        entry = attributes.get(name)
        if entry is None:
            continue
        # Sets are emitted as lists so the result can be dumped as JSON.
        serialized[name] = list(entry) if isinstance(entry, set) else entry
    return serialized
to_json(context_rules, filepath) classmethod

Writes ConTextRules to a JSON file.

Args:

context_rules: A list of ConTextRules that will be written to a file.

filepath: The .json file to contain the modifier rules.

Source code in medspacy/context/context_rule.py
224
225
226
227
228
229
230
231
232
233
234
235
236
@classmethod
def to_json(cls, context_rules: List[ConTextRule], filepath: str):
    """
    Writes ConTextRules to a json file under the key `context_rules`.

    Args:
        context_rules: A list of ConTextRules that will be written to a file. Each rule is serialized with its
            `to_dict()` method.
        filepath: The .json file to contain the modifier rules. It is opened in write mode.
    """
    import json

    # Serialize every rule before touching the file.
    serialized_rules = []
    for rule in context_rules:
        serialized_rules.append(rule.to_dict())
    payload = {"context_rules": serialized_rules}
    with open(filepath, "w") as handle:
        json.dump(payload, handle, indent=4)

util

This module will contain helper functions and classes for common clinical processing tasks which will be used in medspaCy's context implementation.

is_modified_by(span, modifier_label)

Check whether a span has a modifier of a specific type.

Parameters:

Name Type Description Default
span Span

The span to examine.

required
modifier_label str

The type of modifier to check for.

required

Returns:

Type Description
bool

Whether there is a modifier of modifier_label that modifies span.

Source code in medspacy/context/util.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
def is_modified_by(span: Span, modifier_label: str) -> bool:
    """
    Tell whether any modifier attached to a span belongs to a given category.

    The comparison between each modifier's `category` and `modifier_label` ignores case.

    Args:
        span: The span to examine. Its modifiers are read from `span._.modifiers`.
        modifier_label: The type of modifier to check for.

    Returns:
        Whether there is a modifier of `modifier_label` that modifies `span`.
    """
    return any(
        modifier.category.upper() == modifier_label.upper()
        for modifier in span._.modifiers
    )

custom_tokenizer

create_medspacy_tokenizer(nlp)

Generates a custom tokenizer to augment the default spaCy tokenizer for situations commonly seen in clinical text. This includes punctuation infixes, which allow the following examples to be tokenized more aggressively: "Patient complains of c/o" -> [..., 'c', '/', 'o'] and "chf+cp" -> ['chf', '+', 'cp']. The nlp parameter is the spaCy language model.

Source code in medspacy/custom_tokenizer.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
def create_medspacy_tokenizer(nlp):
    """Build a tokenizer that splits clinical text more aggressively than spaCy's default.

    The language defaults for prefixes, suffixes and tokenizer exceptions are reused unchanged. The infix
    rules are extended with one extra pattern so that punctuation inside a token acts as a split point, e.g.:
        "Patient complains of c/o" -> [..., 'c', '/', 'o']
        "chf+cp" -> ['chf', '+', 'cp']

    Args:
        nlp: The spaCy language model whose defaults, vocab and `token_match` are used.

    Returns:
        The configured Tokenizer.
    """
    # All of Python's ASCII punctuation except the period, so that '1.5 mg' is not
    # broken up into '1 . 5 mg'.
    infix_punctuation = string.punctuation.replace(".", "")
    # A single character class matching any of those characters, escaped for regex use.
    punctuation_class = r"""[{}]""".format(re.escape(infix_punctuation))

    defaults = nlp.Defaults
    infix_re = compile_infix_regex(defaults.infixes + [punctuation_class])
    prefix_re = compile_prefix_regex(defaults.prefixes)
    suffix_re = compile_suffix_regex(defaults.suffixes)

    # Work on a copy of the default exceptions so they could be extended later
    # without modifying the language defaults.
    exceptions = defaults.tokenizer_exceptions.copy()

    return Tokenizer(
        nlp.vocab,
        exceptions,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
        token_match=nlp.tokenizer.token_match,
    )

io

DbConnect

DbConnect is a wrapper for either a pyodbc or sqlite3 connection. It can then be passed into the DbReader and DbWriter classes to retrieve/store document data.

Source code in medspacy/io/db_connect.py
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
class DbConnect:
    """DbConnect is a wrapper for either a pyodbc or sqlite3 connection. It can then be
    passed into the DbReader and DbWriter classes to retrieve/store document data.
    """

    def __init__(
        self, driver=None, server=None, db=None, user=None, pwd=None, conn=None
    ):
        """Create a new DbConnect object. You can pass in either information for a pyodbc connection string
        or directly pass in a sqlite or pyodbc connection object.

        If conn is None, all other arguments must be supplied. If conn is passed in, the other arguments are
        not used to connect.

        Args:
            driver: Value for the DRIVER field of the pyodbc connection string.
            server: Value for the SERVER field of the pyodbc connection string.
            db: Value for the DATABASE field of the pyodbc connection string.
            user: Value for the USER field of the pyodbc connection string.
            pwd: Value for the PWD field of the pyodbc connection string.
            conn: An already-open sqlite3 or pyodbc connection to wrap.

        Raises:
            ValueError: If conn is None and any other argument is missing, or if conn is neither a sqlite3
                nor a pyodbc connection.
        """
        if conn is None:
            if not all([driver, server, db, user, pwd]):
                raise ValueError(
                    "If you are not passing in a connection object, "
                    "you must pass in all other arguments to create a DB connection."
                )
            import pyodbc

            self.conn = pyodbc.connect(
                "DRIVER={0};SERVER={1};DATABASE={2};USER={3};PWD={4}".format(
                    driver, server, db, user, pwd
                )
            )
        else:
            self.conn = conn
        self.cursor = self.conn.cursor()
        # according this thread, bulk insert for sqlserver need to set fast_executemany=True.
        # https://stackoverflow.com/questions/29638136/how-to-speed-up-bulk-insert-to-ms-sql-server-using-pyodbc
        if hasattr(self.cursor, "fast_executemany"):
            self.cursor.fast_executemany = True

        import sqlite3

        if isinstance(self.conn, sqlite3.Connection):
            self.db_lib = "sqlite3"
            self.database_exception = sqlite3.DatabaseError
        else:
            # pyodbc is optional. When it is not installed the object cannot be a pyodbc
            # connection, so report the unsupported type instead of leaking an ImportError.
            try:
                import pyodbc
            except ImportError:
                pyodbc = None
            if pyodbc is not None and isinstance(self.conn, pyodbc.Connection):
                self.db_lib = "pyodbc"
                self.database_exception = pyodbc.DatabaseError
            else:
                raise ValueError(
                    "conn must be either a sqlite3 or pyodbc Connection object, not {0}".format(
                        type(self.conn)
                    )
                )

        # NOTE: server and db are None here when an existing connection was passed in.
        print("Opened connection to {0}.{1}".format(server, db))

    def create_table(self, query, table_name, drop_existing):
        """Run a CREATE TABLE statement, optionally dropping the table first.

        Args:
            query: The full CREATE TABLE statement to execute.
            table_name: Name of the table; used for the optional drop and in the log message.
            drop_existing: If truthy, issue `drop table if exists` before creating.

        Raises:
            The driver's DatabaseError if `query` fails. The transaction is rolled back and the
            connection is closed before the error propagates.
        """
        if drop_existing:
            try:
                self.cursor.execute("drop table if exists {0}".format(table_name))
            except self.database_exception:
                # Best effort: a failed drop is ignored and the create below decides the outcome.
                pass
            else:
                self.conn.commit()
        try:
            self.cursor.execute(query)
        except self.database_exception:
            self.conn.rollback()
            self.conn.close()
            raise
        else:
            self.conn.commit()
            print("Created table {0} with query: {1}".format(table_name, query))

    def write(self, query, data):
        """Execute `query` once per row of `data` via `executemany` and commit.

        Args:
            query: A parameterized statement.
            data: A sequence of parameter rows.

        Raises:
            The driver's DatabaseError if the statement fails. The transaction is rolled back and the
            connection is closed before the error propagates.
        """
        try:
            self.cursor.executemany(query, data)
        except self.database_exception:
            self.conn.rollback()
            self.conn.close()
            raise
        else:
            self.conn.commit()

    def read(self, query):
        """Execute `query` and return every row from `fetchall()`."""
        self.cursor.execute(query)
        result = self.cursor.fetchall()
        return result

    def close(self):
        """Commit any pending work and close the underlying connection."""
        self.conn.commit()
        self.conn.close()
        print("Connection closed.")

__init__(driver=None, server=None, db=None, user=None, pwd=None, conn=None)

Create a new DbConnect object. You can pass in either information for a pyodbc connection string or directly pass in a sqlite or pyodbc connection object.

If conn is None, all other arguments must be supplied. If conn is passed in, all other arguments will be ignored.

Parameters:

Name Type Description Default
db
None
Source code in medspacy/io/db_connect.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
def __init__(
    self, driver=None, server=None, db=None, user=None, pwd=None, conn=None
):
    """Create a new DbConnect object. You can pass in either information for a pyodbc connection string
    or directly pass in a sqlite or pyodbc connection object.

    If conn is None, all other arguments must be supplied. If conn is passed in, the other arguments are
    not used to connect.

    Args:
        driver: Value for the DRIVER field of the pyodbc connection string.
        server: Value for the SERVER field of the pyodbc connection string.
        db: Value for the DATABASE field of the pyodbc connection string.
        user: Value for the USER field of the pyodbc connection string.
        pwd: Value for the PWD field of the pyodbc connection string.
        conn: An already-open sqlite3 or pyodbc connection to wrap.

    Raises:
        ValueError: If conn is None and any other argument is missing, or if conn is neither a sqlite3
            nor a pyodbc connection.
    """
    if conn is None:
        if not all([driver, server, db, user, pwd]):
            raise ValueError(
                "If you are not passing in a connection object, "
                "you must pass in all other arguments to create a DB connection."
            )
        import pyodbc

        self.conn = pyodbc.connect(
            "DRIVER={0};SERVER={1};DATABASE={2};USER={3};PWD={4}".format(
                driver, server, db, user, pwd
            )
        )
    else:
        self.conn = conn
    self.cursor = self.conn.cursor()
    # according this thread, bulk insert for sqlserver need to set fast_executemany=True.
    # https://stackoverflow.com/questions/29638136/how-to-speed-up-bulk-insert-to-ms-sql-server-using-pyodbc
    if hasattr(self.cursor, "fast_executemany"):
        self.cursor.fast_executemany = True

    import sqlite3

    if isinstance(self.conn, sqlite3.Connection):
        self.db_lib = "sqlite3"
        self.database_exception = sqlite3.DatabaseError
    else:
        # pyodbc is optional. When it is not installed the object cannot be a pyodbc
        # connection, so report the unsupported type instead of leaking an ImportError.
        try:
            import pyodbc
        except ImportError:
            pyodbc = None
        if pyodbc is not None and isinstance(self.conn, pyodbc.Connection):
            self.db_lib = "pyodbc"
            self.database_exception = pyodbc.DatabaseError
        else:
            raise ValueError(
                "conn must be either a sqlite3 or pyodbc Connection object, not {0}".format(
                    type(self.conn)
                )
            )

    # NOTE: server and db are None here when an existing connection was passed in.
    print("Opened connection to {0}.{1}".format(server, db))

DbWriter

DbWriter is a utility class for writing structured data back to a database.

Source code in medspacy/io/db_writer.py
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
class DbWriter:
    """DbWriter is a utility class for writing structured data back to a database."""

    def __init__(
        self,
        db_conn,
        destination_table,
        cols=None,
        col_types=None,
        doc_dtype="ents",
        create_table=False,
        drop_existing=False,
        write_batch_size=100,
    ):
        """Create a new DbWriter object.

        Args:
            db_conn: A medspacy.io.DbConnect object
            destination_table: The name of the table to write to
            cols (opt): The names of the columns of the destination table. These should align with attributes extracted
                by DocConsumer and stored in doc._.data. A set of default values can be accessed by:
                >>> DbWriter.get_default_cols()
            col_types (opt): The sql data types of the table columns. They should correspond 1:1 with cols.
                A set of default values can be accessed by:
                >>> DbWriter.get_default_col_types()
            doc_dtype: The type of data from DocConsumer to write from a doc.
                Either ("ents", "section", "context", or "doc")
            create_table (bool): Whether to create a table
            drop_existing (bool): Forwarded to `db_conn.create_table` when the table is created.
            write_batch_size (int): Stored as `self.batch_size`.

        Raises:
            ValueError: If col_types is given without cols.
        """
        self.db = db_conn
        self.destination_table = destination_table
        self._create_table = create_table
        self.drop_existing = drop_existing
        if cols is None and col_types is None:
            cols = DEFAULT_COLS[doc_dtype]
            col_types = [DEFAULT_COL_TYPES[doc_dtype][col] for col in cols]
        elif cols is None and col_types is not None:
            raise ValueError("cols must be specified if col_types is not None.")
        self.cols = cols
        self.col_types = col_types
        _validate_dtypes((doc_dtype,))
        self.doc_dtype = doc_dtype
        # NOTE(review): write_docs() takes its own batch_size argument and does not read this value.
        self.batch_size = write_batch_size

        self.insert_query = ""
        if create_table:
            self.create_table()
        self.make_insert_query()

    @classmethod
    def get_default_col_types(cls, dtypes=None):
        """Return the default sql column types for the requested data types.

        Args:
            dtypes: A data type name, a collection of names, or None for every data type that has defaults.

        Returns:
            A dict mapping each requested data type to its entry in DEFAULT_COL_TYPES.
        """
        if dtypes is None:
            dtypes = tuple(DEFAULT_COL_TYPES.keys())
        elif isinstance(dtypes, str):
            dtypes = (dtypes,)

        _validate_dtypes(dtypes)
        return {
            dtype: col_types
            for (dtype, col_types) in DEFAULT_COL_TYPES.items()
            if dtype in dtypes
        }

    @classmethod
    def get_default_cols(cls, dtypes=None):
        """Return the default column names for the requested data types.

        Args:
            dtypes: A data type name, a collection of names, or None for every data type that has defaults.

        Returns:
            A dict mapping each requested data type to its entry in DEFAULT_COLS, the same values the
            constructor uses when `cols` is not given.
        """
        if dtypes is None:
            dtypes = tuple(DEFAULT_COL_TYPES.keys())
        elif isinstance(dtypes, str):
            dtypes = (dtypes,)
        _validate_dtypes(dtypes)

        # Read from DEFAULT_COLS (column names), not DEFAULT_COL_TYPES (name -> sql type).
        return {
            dtype: cols
            for (dtype, cols) in DEFAULT_COLS.items()
            if dtype in dtypes
        }

    def create_table(self):
        """Build a CREATE TABLE statement from cols/col_types and run it through the connection."""
        column_defs = ", ".join(
            "{0} {1}".format(col, self.col_types[i]) for i, col in enumerate(self.cols)
        )
        query = "CREATE TABLE {0} ({1})".format(self.destination_table, column_defs)
        self.db.create_table(query, self.destination_table, self.drop_existing)

    def make_insert_query(self):
        """Store a parameterized INSERT statement for the destination table in `self.insert_query`."""
        col_list = ", ".join(self.cols)
        q_list = ", ".join("?" for _ in self.cols)
        self.insert_query = "INSERT INTO {0} ({1}) VALUES ({2})".format(
            self.destination_table, col_list, q_list
        )

    def write(self, docs: Union[Doc, List[Doc]]):
        """Write a list of docs or doc to a database."""
        if isinstance(docs, Doc):
            self.write_doc(docs)
        else:
            self.write_docs(docs)

    def write_doc(self, doc):
        """Write a doc to a database."""
        data = doc._.get_data(self.doc_dtype, attrs=self.cols, as_rows=True)
        self.write_data(data)

    def write_docs(self, docs, batch_size=800):
        """Write a list of docs to the database through bulk inserts.

        Rows are accumulated across docs and flushed whenever at least `batch_size` rows are pending;
        any remainder is flushed at the end.
        """
        data = []
        for doc in docs:
            data.extend(doc._.get_data(self.doc_dtype, attrs=self.cols, as_rows=True))
            if len(data) >= batch_size:
                self.write_data(data)
                data = []
        if data:
            self.write_data(data)

    def write_data(self, data):
        """Insert rows of data using the prepared insert query."""
        self.db.write(self.insert_query, data)

    def close(self):
        """Close the underlying database connection wrapper."""
        self.db.close()

__init__(db_conn, destination_table, cols=None, col_types=None, doc_dtype='ents', create_table=False, drop_existing=False, write_batch_size=100)

Create a new DbWriter object.

Parameters:

Name Type Description Default
db_conn

A medspacy.io.DbConnect object

required
destination_table

The name of the table to write to

required
cols opt

The names of the columns of the destination table. These should align with attributes extracted by DocConsumer and stored in doc._.data. A set of default values can be accessed by:

DbWriter.get_default_cols()

None
col_types opt

The sql data types of the table columns. They should correspond 1:1 with cols. A set of default values can be accessed by:

DbWriter.get_default_col_types()

None
doc_dtype

The type of data from DocConsumer to write from a doc. Either ("ents", "section", "context", or "doc")

'ents'
create_table bool

Whether to create a table

False
Source code in medspacy/io/db_writer.py
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
def __init__(
    self,
    db_conn,
    destination_table,
    cols=None,
    col_types=None,
    doc_dtype="ents",
    create_table=False,
    drop_existing=False,
    write_batch_size=100,
):
    """Set up a writer that inserts DocConsumer output into one database table.

    Args:
        db_conn: A medspacy.io.DbConnect object
        destination_table: The name of the table to write to
        cols (opt): The names of the columns of the destination table. These should align with attributes
            extracted by DocConsumer and stored in doc._.data. When both cols and col_types are omitted,
            the defaults for `doc_dtype` are used. A set of default values can be accessed by:
            >>> DbWriter.get_default_cols()
        col_types (opt): The sql data types of the table columns. They should correspond 1:1 with cols.
            A set of default values can be accessed by:
            >>> DbWriter.get_default_col_types()
        doc_dtype: The type of data from DocConsumer to write from a doc.
            Either ("ents", "section", "context", or "doc")
        create_table (bool): Whether to create a table
        drop_existing (bool): Stored as `self.drop_existing`.
        write_batch_size (int): Stored as `self.batch_size`.

    Raises:
        ValueError: If col_types is given without cols.
    """
    self.db = db_conn
    self.destination_table = destination_table
    self._create_table = create_table
    self.drop_existing = drop_existing

    if cols is None:
        if col_types is not None:
            raise ValueError("cols must be specified if col_types is not None.")
        # Neither was supplied: fall back to the defaults registered for this data type.
        cols = DEFAULT_COLS[doc_dtype]
        col_types = [DEFAULT_COL_TYPES[doc_dtype][col] for col in cols]
    self.cols = cols
    self.col_types = col_types

    _validate_dtypes((doc_dtype,))
    self.doc_dtype = doc_dtype
    self.batch_size = write_batch_size

    # The insert statement is prepared once, after the optional table creation.
    self.insert_query = ""
    if create_table:
        self.create_table()
    self.make_insert_query()

write(docs)

Write a list of docs or doc to a database.

Source code in medspacy/io/db_writer.py
160
161
162
163
164
165
def write(self, docs: Union[Doc, List[Doc]]):
    """Write either a single doc or a collection of docs to the database."""
    # Anything that is not a single Doc is treated as a collection of docs.
    if not isinstance(docs, Doc):
        self.write_docs(docs)
        return
    self.write_doc(docs)

write_doc(doc)

Write a doc to a database.

Source code in medspacy/io/db_writer.py
167
168
169
170
def write_doc(self, doc):
    """Extract one doc's rows for this writer's data type and columns, then insert them."""
    self.write_data(
        doc._.get_data(self.doc_dtype, attrs=self.cols, as_rows=True)
    )

write_docs(docs, batch_size=800)

write a list of docs to database through bulk insert

Source code in medspacy/io/db_writer.py
172
173
174
175
176
177
178
179
180
181
182
def write_docs(self, docs, batch_size=800):
    """Write a list of docs to the database through bulk inserts.

    Rows are accumulated across docs and flushed whenever at least `batch_size` rows are pending;
    whatever is left over after the last doc is flushed at the end.
    """
    pending = []
    for doc in docs:
        pending += doc._.get_data(self.doc_dtype, attrs=self.cols, as_rows=True)
        if len(pending) < batch_size:
            continue
        self.write_data(pending)
        # Start a fresh list so the batch just handed off is never mutated.
        pending = []
    if pending:
        self.write_data(pending)

DocConsumer

A DocConsumer object will consume a spacy doc and output rows based on a configuration provided by the user.

This component extracts structured information from a Doc. Information is stored in doc._.data, which is a nested dictionary. The outer keys represent the data type and can be one or more of: - "ents": data about the spans in doc.ents such as the text, label, context attributes, section information, or custom attributes - "group": data about spans in the span group named by span_group_name - "section": section text and category - "context": data about entity-modifier pairs extracted by ConText - "doc": a single doc-level representation. By default only doc.text is extracted, but other attributes may be specified

Once processed, a doc's data can be accessed either by:
    - doc._.data
    - doc._.get_data(dtype=...)
    - doc._.ent_data
    - doc._.to_dataframe(dtype=...)
Source code in medspacy/io/doc_consumer.py
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
@Language.factory("medspacy_doc_consumer")
class DocConsumer:
    """
    A DocConsumer object will consume a spacy doc and output rows based on a configuration provided by the user.

    This component extracts structured information from a Doc. Information is stored in doc._.data, which is a
    nested dictionary. The outer keys represent the data type and can be one or more of:
        - "ents": data about the spans in doc.ents such as the text, label,
            context attributes, section information, or custom attributes
        - "group": data about the spans in the span group named by `span_group_name`
        - "context": data about entity-modifier pairs extracted by ConText
        - "section": data about the sections in doc._.sections, such as the category, title, and body
        - "doc": a single doc-level representation. By default only doc.text is extracted, but other attributes may
            be specified

    Once processed, a doc's data can be accessed either by:
        - doc._.data
        - doc._.get_data(dtype=...)
        - doc._.ent_data
        - doc._.to_dataframe(dtype=...)
    """

    def __init__(
        self,
        nlp,
        name: str = "medspacy_doc_consumer",
        dtypes: Tuple = ("ents",),
        dtype_attrs: Dict = None,
        span_group_name: str = "medspacy_spans",
    ):
        """
        Creates a new DocConsumer.

        Args:
            nlp: A spaCy model
            name: The name of the component.
            dtypes: Either a tuple of data types to collect or the string "all". Default ("ents",). Valid options are:
                "ents", "group", "section", "context", "doc".
            dtype_attrs: An optional dictionary mapping the data types in dtypes to a list of attributes. If None, will
                set defaults for each dtype. Attributes for "ents", "group", and "doc" may be customized by adding
                either native or custom attributes (i.e., ent._....). "context" accepts the pre-defined edge
                attributes as well as native or custom attributes of the entity. "section" attributes are checked
                by `validate_section_attrs()`. Default values for each dtype can be retrieved by the class method
                `DocConsumer.get_default_attrs()`.
            span_group_name: the name of the span group used when dtypes contains "group". At this time, only one span
                group is supported.

        Raises:
            ValueError: If dtypes is neither a tuple nor "all", if it contains an unsupported data type, or if
                invalid "section" attributes are given.
        """
        self.nlp = nlp
        self.name = name
        self._span_group_name = span_group_name
        if not isinstance(dtypes, tuple):
            if dtypes == "all":
                dtypes = tuple(ALLOWED_DATA_TYPES)
            else:
                raise ValueError(
                    "dtypes must be either 'all' or a tuple, not {0}".format(dtypes)
                )
        for dtype in dtypes:
            if dtype not in ALLOWED_DATA_TYPES:
                raise ValueError(
                    "Invalid dtypes. Supported dtypes are {0}, not {1}".format(
                        ALLOWED_DATA_TYPES, dtype
                    )
                )
            if dtype == "section":
                self.validate_section_attrs(dtype_attrs)
        self.dtypes = dtypes
        self.dtype_attrs = dtype_attrs

        if self.dtype_attrs is None:
            self._set_default_attrs()

    @classmethod
    def get_default_attrs(cls, dtypes: Optional[Tuple] = None):
        """
        Gets the default attributes available to each type specified.

        Args:
            dtypes: Optional tuple containing "ents", "group", "context", "section", or "doc". A single string is
                treated as a one-element tuple. If None, all will be returned.

        Returns:
            A dictionary mapping each requested data type to a new list of its default attributes.

        Raises:
            ValueError: If a requested data type is not supported.
        """
        if dtypes is None:
            dtypes = ALLOWED_DATA_TYPES
        else:
            if isinstance(dtypes, str):
                dtypes = (dtypes,)
            for dtype in dtypes:
                if dtype not in ALLOWED_DATA_TYPES:
                    raise ValueError("Invalid dtype,", dtype)
        # Copy each default into a fresh list so callers can modify the result freely.
        dtype_attrs = {
            dtype: list(attrs)
            for (dtype, attrs) in DEFAULT_ATTRS.items()
            if dtype in dtypes
        }
        return dtype_attrs

    def _set_default_attrs(self):
        """
        Sets self.dtype_attrs to the default attributes for the data types in self.dtypes.
        """
        self.dtype_attrs = self.get_default_attrs(self.dtypes)

    def validate_section_attrs(self, attrs):
        """
        Validate that section attributes are either not specified or are valid attribute names.

        Args:
            attrs: The dtype_attrs dictionary passed to the constructor, or None.

        Returns:
            True if no invalid section attribute was found.

        Raises:
            ValueError: If attrs["section"] contains names that are not in ALLOWED_SECTION_ATTRS.
        """
        if attrs is None:
            return True
        if "section" not in attrs:
            return True
        diff = set(attrs["section"]).difference(ALLOWED_SECTION_ATTRS)
        if diff:
            raise ValueError("Invalid section dtype_attrs specified: {0}".format(diff))
        return True

    @staticmethod
    def _get_attr(obj, attr):
        """
        Return the attribute `attr` of `obj`, falling back to the custom extension namespace `obj._`.
        """
        try:
            return getattr(obj, attr)
        except AttributeError:
            return getattr(obj._, attr)

    def __call__(self, doc):
        """
        Call the doc consumer on a doc and assign the data.

        Args:
            doc: The Doc to process.

        Returns:
            The processed Doc, with the collected column-wise data stored in doc._.data.
        """
        # One ordered mapping of attribute name -> list of values per configured data type.
        data = dict()
        for dtype, attrs in self.dtype_attrs.items():
            data[dtype] = OrderedDict((attr, list()) for attr in attrs)
        if "ents" in self.dtypes:
            for ent in doc.ents:
                for attr in self.dtype_attrs["ents"]:
                    val = self._get_attr(ent, attr)
                    data["ents"][attr].append(val)
        if "group" in self.dtypes:
            for span in doc.spans[self._span_group_name]:
                for attr in self.dtype_attrs["group"]:
                    val = self._get_attr(span, attr)
                    data["group"][attr].append(val)
        if "context" in self.dtypes:
            for (ent, modifier) in doc._.context_graph.edges:
                self.add_context_edge_attributes(ent, modifier, data["context"], doc)
        if "section" in self.dtypes:
            for section in doc._.sections:
                self.add_section_attributes(section, data["section"], doc)
        if "doc" in self.dtypes:
            for attr in self.dtype_attrs["doc"]:
                val = self._get_attr(doc, attr)
                data["doc"][attr].append(val)

        doc._.data = data
        return doc

    def add_context_edge_attributes(
        self, ent: Span, modifier: ConTextModifier, context_data, doc
    ):
        """
        Append one value per configured "context" attribute for a single entity-modifier pair.

        Args:
            ent: The entity of the pair.
            modifier: The modifier of the pair. Its modifier_span and scope_span are used as
                (start, end) indices to slice `doc`.
            context_data: Mapping of attribute name to the list that receives the value.
            doc: The Doc that the entity and modifier belong to.

        Raises:
            ValueError: If a configured attribute is neither one of the pre-defined values nor an attribute
                of the entity or of ent._.
        """
        span_tup = modifier.modifier_span
        span = doc[span_tup[0] : span_tup[1]]
        scope_tup = modifier.scope_span
        scope = doc[scope_tup[0] : scope_tup[1]]
        # Pre-defined values are wrapped in callables so that only requested attributes are evaluated.
        standard_values = {
            "ent_text": lambda: ent.text,
            "ent_label_": lambda: ent.label_,
            "ent_start_char": lambda: ent.start_char,
            "ent_end_char": lambda: ent.end_char,
            "modifier_text": lambda: span.text,
            "modifier_category": lambda: modifier.category,
            "modifier_direction": lambda: modifier.direction,
            "modifier_start_char": lambda: span.start_char,
            "modifier_end_char": lambda: span.end_char,
            "modifier_scope_start_char": lambda: scope.start_char,
            "modifier_scope_end_char": lambda: scope.end_char,
        }
        for attr in self.dtype_attrs["context"]:
            getter = standard_values.get(attr)
            if getter is not None:
                val = getter()
            else:
                # if specified attribute is not one of these standard values, check the entity to see if
                # it's an entity value
                try:
                    val = self._get_attr(ent, attr)
                except AttributeError:
                    raise ValueError(f"Attributes for dtype 'context' must be either "
                                     f"a registered custom Span attribute (i.e., Span._.attr) or one of these pre-defined values: "
                                     f"{ALLOWED_CONTEXT_ATTRS}. \nYou passed in '{attr}'")
            context_data[attr].append(val)

    def add_section_attributes(self, section, section_data, doc):
        """
        Append one value per configured "section" attribute for a single section.

        Args:
            section: The section to describe. Its title_span and body_span are used as (start, end)
                indices to slice `doc`.
            section_data: Mapping of attribute name to the list that receives the value.
            doc: The Doc that the section belongs to.
        """
        section_title_tup = section.title_span
        section_body_tup = section.body_span
        section_title = doc[section_title_tup[0] : section_title_tup[1]]
        section_body = doc[section_body_tup[0] : section_body_tup[1]]
        requested = self.dtype_attrs["section"]
        # Allow for null sections: a section without a category gets placeholder title values.
        has_title = section.category is not None
        if "section_category" in requested:
            section_data["section_category"].append(section.category)
        if "section_title_text" in requested:
            section_data["section_title_text"].append(
                section_title.text if has_title else None
            )
        if "section_title_start_char" in requested:
            section_data["section_title_start_char"].append(
                section_title.start_char if has_title else 0
            )
        if "section_title_end_char" in requested:
            section_data["section_title_end_char"].append(
                section_title.end_char if has_title else 0
            )
        if "section_body" in requested:
            section_data["section_body"].append(section_body.text)
        if "section_body_start_char" in requested:
            section_data["section_body_start_char"].append(section_body.start_char)
        if "section_body_end_char" in requested:
            section_data["section_body_end_char"].append(section_body.end_char)
        if "section_parent" in requested:
            section_data["section_parent"].append(section.parent)

__call__(doc)

Call the doc consumer on a doc and assign the data.

Parameters:

Name Type Description Default
doc

The Doc to process.

required

Returns:

Type Description

The processed Doc.

Source code in medspacy/io/doc_consumer.py
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
def __call__(self, doc):
    """
    Extract the configured attributes from a doc and store them in ``doc._.data``.

    Args:
        doc: The Doc to process.

    Returns:
        The same Doc, with the collected column-wise data assigned to ``doc._.data``.
    """

    def lookup(obj, name):
        # Prefer a native attribute; otherwise read it from the extension namespace.
        try:
            return getattr(obj, name)
        except AttributeError:
            return getattr(obj._, name)

    wanted = self.dtypes
    # One ordered mapping of attribute name -> list of values per configured data type.
    collected = {
        dtype: OrderedDict((attr, []) for attr in attrs)
        for dtype, attrs in self.dtype_attrs.items()
    }

    if "ents" in wanted:
        for ent in doc.ents:
            for name in self.dtype_attrs["ents"]:
                value = lookup(ent, name)
                collected["ents"][name].append(value)

    if "group" in wanted:
        for span in doc.spans[self._span_group_name]:
            for name in self.dtype_attrs["group"]:
                value = lookup(span, name)
                collected["group"][name].append(value)

    if "context" in wanted:
        for edge in doc._.context_graph.edges:
            (ent, modifier) = edge
            self.add_context_edge_attributes(ent, modifier, collected["context"], doc)

    if "section" in wanted:
        for section in doc._.sections:
            self.add_section_attributes(section, collected["section"], doc)

    if "doc" in wanted:
        for name in self.dtype_attrs["doc"]:
            value = lookup(doc, name)
            collected["doc"][name].append(value)

    doc._.data = collected
    return doc

__init__(nlp, name='medspacy_doc_consumer', dtypes=('ents',), dtype_attrs=None, span_group_name='medspacy_spans')

Creates a new DocConsumer.

Parameters:

Name Type Description Default
nlp

A spaCy model

required
dtypes Tuple

Either a tuple of data types to collect or the string "all". Default ("ents",). Valid options are: "ents", "group", "section", "context", "doc".

('ents',)
dtype_attrs Dict

An optional dictionary mapping the data types in dtypes to a list of attributes. If None, will set defaults for each dtype. Attributes for "ents", "group", and "doc" may be customized by adding either native or custom attributes (i.e., ent._....). "context" and "section" are not customizable at this time. Default values for each dtype can be retrieved by the class method `DocConsumer.get_default_attrs()`.

None
span_group_name str

the name of the span group used when dtypes contains "group". At this time, only one span group is supported.

'medspacy_spans'
Source code in medspacy/io/doc_consumer.py
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
def __init__(
    self,
    nlp,
    name: str = "medspacy_doc_consumer",
    dtypes: Tuple = ("ents",),
    dtype_attrs: Dict = None,
    span_group_name: str = "medspacy_spans",
):
    """
    Set up a DocConsumer.

    Args:
        nlp: A spaCy model
        name: The name of the component.
        dtypes: A tuple of data types to collect, or the string "all" to collect every supported type.
            Default ("ents",).
        dtype_attrs: Optional dictionary mapping data types to the list of attributes to collect for each.
            When None, the defaults for the requested data types are used.
        span_group_name: The name of the span group read when dtypes contains "group".

    Raises:
        ValueError: If dtypes is neither a tuple nor "all", or if it contains an unsupported data type.
    """
    self.nlp = nlp
    self.name = name
    self._span_group_name = span_group_name

    if isinstance(dtypes, tuple):
        requested = dtypes
    elif dtypes == "all":
        requested = tuple(ALLOWED_DATA_TYPES)
    else:
        raise ValueError(
            "dtypes must be either 'all' or a tuple, not {0}".format(dtypes)
        )

    # Validate in order so the first offending entry determines the error raised.
    for requested_type in requested:
        if requested_type not in ALLOWED_DATA_TYPES:
            raise ValueError(
                "Invalid dtypes. Supported dtypes are {0}, not {1}".format(
                    ALLOWED_DATA_TYPES, requested_type
                )
            )
        if requested_type == "section":
            self.validate_section_attrs(dtype_attrs)

    self.dtypes = requested
    self.dtype_attrs = dtype_attrs
    if self.dtype_attrs is None:
        self._set_default_attrs()

_set_default_attrs()

Sets self.dtype_attrs to the default attributes for the data types in self.dtypes.

Source code in medspacy/io/doc_consumer.py
156
157
158
159
160
def _set_default_attrs(self):
    """
    Store the default attributes for the data types in ``self.dtypes`` on ``self.dtype_attrs``.
    """
    defaults = self.get_default_attrs(self.dtypes)
    self.dtype_attrs = defaults

get_default_attrs(dtypes=None) classmethod

Gets the default attributes available to each type specified.

Parameters:

Name Type Description Default
dtypes Optional[Tuple]

Optional tuple containing "ents", "group", "context", "section", or "doc". If None, all will be returned.

None

Returns:

Type Description

The attributes the doc consumer will output for each of the specified types in dtypes.

Source code in medspacy/io/doc_consumer.py
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
@classmethod
def get_default_attrs(cls, dtypes: Optional[Tuple] = None):
    """
    Look up the default attributes for the requested data types.

    Args:
        dtypes: Optional tuple containing "ents", "group", "context", "section", or "doc". A single string is
            treated as a one-element tuple. If None, the defaults of every supported data type are returned.

    Returns:
        A dictionary mapping each requested data type to a new list of its default attributes.

    Raises:
        ValueError: If a requested data type is not supported.
    """
    if dtypes is None:
        selected = ALLOWED_DATA_TYPES
    else:
        selected = (dtypes,) if isinstance(dtypes, str) else dtypes
        for candidate in selected:
            if candidate not in ALLOWED_DATA_TYPES:
                raise ValueError("Invalid dtype,", candidate)

    # Copy each default into a fresh list so callers can modify the result freely.
    defaults = {}
    for dtype, attrs in DEFAULT_ATTRS.items():
        if dtype in selected:
            defaults[dtype] = list(attrs)
    return defaults

validate_section_attrs(attrs)

Validate that section attributes are either not specified or are valid attribute names.

Source code in medspacy/io/doc_consumer.py
162
163
164
165
166
167
168
169
170
171
172
173
def validate_section_attrs(self, attrs):
    """
    Check that any "section" attributes in `attrs` are allowed attribute names.

    Args:
        attrs: A dictionary mapping data types to attribute lists, or None.

    Returns:
        True when `attrs` is None, has no "section" entry, or lists only allowed section attributes.

    Raises:
        ValueError: If attrs["section"] contains names that are not in ALLOWED_SECTION_ATTRS.
    """
    if attrs is not None and "section" in attrs:
        unknown = set(attrs["section"]).difference(ALLOWED_SECTION_ATTRS)
        if unknown:
            raise ValueError("Invalid section dtype_attrs specified: {0}".format(unknown))
    return True

Pipeline

The Pipeline class executes a batch process of reading texts, processing them with a spaCy model, and writing the results back to a database.

Source code in medspacy/io/pipeline.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
@Language.factory("medspacy_pipeline")
class Pipeline:
    """Batch job that reads texts, runs them through a spaCy model, and writes the extracted data to a database."""

    def __init__(self, nlp, reader, writer, name="medspacy_pipeline", dtype="ent"):
        """Create a new Pipeline object.

        Args:
            nlp: A spaCy model
            reader: A DbReader object
            writer: A DbWriter object
            name: The name of the component.
            dtype: The DocConsumer data type to write to a database. Must be one of ALLOWED_DATA_TYPES.

        Raises:
            ValueError: If dtype is not one of ALLOWED_DATA_TYPES.
        """
        # NOTE(review): confirm that the default "ent" is a member of ALLOWED_DATA_TYPES.
        self.reader = reader
        self.writer = writer
        self.name = name
        self.nlp = nlp
        self.dtype = dtype
        if dtype in ALLOWED_DATA_TYPES:
            return
        raise ValueError(
            "Invalid dtypes. Supported dtypes are {0}, not {1}".format(
                ALLOWED_DATA_TYPES, dtype
            )
        )

    def process(self):
        """Read texts batch by batch, process them with nlp, and write the rows of doc data to the writer.

        Rows are buffered until at least `self.writer.batch_size` of them are available. Whatever is left
        in the buffer is written once the reader returns no more results. The reader is closed afterwards,
        and the writer as well unless it shares the reader's connection.
        """
        pending = None
        batch = self.reader.read()
        while batch:
            if len(batch) > 0:
                # Each result row is expected to start with (id, text).
                columns = list(zip(*batch))
                ids, texts = columns[0], columns[1]

                for doc, text_id in zip(self.nlp.pipe(texts), ids):
                    # Get the data as rows of tuples and prepend the identifier column.
                    rows = [
                        (text_id,) + row
                        for row in doc._.get_data(self.dtype, as_rows=True)
                    ]
                    if pending is None:
                        pending = []
                    pending.extend(rows)
                    if len(pending) >= self.writer.batch_size:
                        self.writer.write_data(pending)
                        pending = None
            batch = self.reader.read()

        # Flush whatever did not fill a complete batch.
        if pending is not None:
            self.writer.write_data(pending)
            pending = None

        self.reader.close()
        # A shared connection was already closed together with the reader.
        if self.writer.db.conn != self.reader.db.conn:
            self.writer.close()

__init__(nlp, reader, writer, name='medspacy_pipeline', dtype='ent')

Create a new Pipeline object. Args: reader: A DbReader object. writer: A DbWriter object. nlp: A spaCy model. dtype: The DocConsumer data type to write to a database. Default "ent". Valid options are ("ent", "section", "context", "doc").

Source code in medspacy/io/pipeline.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
def __init__(self, nlp, reader, writer, name="medspacy_pipeline", dtype="ent"):
    """Create a new Pipeline object.

    Args:
        nlp: A spaCy model
        reader: A DbReader object
        writer: A DbWriter object
        name: The name of the component.
        dtype: The DocConsumer data type to write to a database. Must be one of ALLOWED_DATA_TYPES.

    Raises:
        ValueError: If dtype is not one of ALLOWED_DATA_TYPES.
    """
    # NOTE(review): confirm that the default "ent" is a member of ALLOWED_DATA_TYPES.
    self.reader = reader
    self.writer = writer
    self.name = name
    self.nlp = nlp
    self.dtype = dtype
    if dtype in ALLOWED_DATA_TYPES:
        return
    raise ValueError(
        "Invalid dtypes. Supported dtypes are {0}, not {1}".format(
            ALLOWED_DATA_TYPES, dtype
        )
    )

process()

Run a pipeline by reading a set of texts from a source table, processing them with nlp, and writing doc._.data back to the destination table.

Source code in medspacy/io/pipeline.py
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
def process(self):
    """Read texts batch by batch, process them with nlp, and write the rows of doc data to the writer.

    Rows are buffered until at least `self.writer.batch_size` of them are available. Whatever is left
    in the buffer is written once the reader returns no more results. The reader is closed afterwards,
    and the writer as well unless it shares the reader's connection.
    """
    pending = None
    batch = self.reader.read()
    while batch:
        if len(batch) > 0:
            # Each result row is expected to start with (id, text).
            columns = list(zip(*batch))
            ids, texts = columns[0], columns[1]

            for doc, text_id in zip(self.nlp.pipe(texts), ids):
                # Get the data as rows of tuples and prepend the identifier column.
                rows = [
                    (text_id,) + row
                    for row in doc._.get_data(self.dtype, as_rows=True)
                ]
                if pending is None:
                    pending = []
                pending.extend(rows)
                if len(pending) >= self.writer.batch_size:
                    self.writer.write_data(pending)
                    pending = None
        batch = self.reader.read()

    # Flush whatever did not fill a complete batch.
    if pending is not None:
        self.writer.write_data(pending)
        pending = None

    self.reader.close()
    # A shared connection was already closed together with the reader.
    if self.writer.db.conn != self.reader.db.conn:
        self.writer.close()

db_connect

DbConnect

DbConnect is a wrapper for either a pyodbc or sqlite3 connection. It can then be passed into the DbReader and DbWriter classes to retrieve/store document data.

Source code in medspacy/io/db_connect.py
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
class DbConnect:
    """DbConnect is a wrapper for either a pyodbc or sqlite3 connection. It can then be
    passed into the DbReader and DbWriter classes to retrieve/store document data.
    """

    def __init__(
        self, driver=None, server=None, db=None, user=None, pwd=None, conn=None
    ):
        """Create a new DbConnect object. You can pass in either information for a pyodbc connection string
        or directly pass in a sqlite or pyodbc connection object.

        If conn is None, all other arguments must be supplied. If conn is passed in, the other arguments are
        not used to open a connection.

        Args:
            driver: DRIVER value of the pyodbc connection string.
            server: SERVER value of the pyodbc connection string.
            db: DATABASE value of the pyodbc connection string.
            user: USER value of the pyodbc connection string.
            pwd: PWD value of the pyodbc connection string.
            conn: An already opened sqlite3 or pyodbc connection object.

        Raises:
            ValueError: If conn is None and any other argument is missing, or if the connection is neither
                a sqlite3 nor a pyodbc Connection object.
        """
        if conn is None:
            if not all([driver, server, db, user, pwd]):
                raise ValueError(
                    "If you are not passing in a connection object, "
                    "you must pass in all other arguments to create a DB connection."
                )
            import pyodbc

            self.conn = pyodbc.connect(
                "DRIVER={0};SERVER={1};DATABASE={2};USER={3};PWD={4}".format(
                    driver, server, db, user, pwd
                )
            )
        else:
            self.conn = conn
        self.cursor = self.conn.cursor()
        # according this thread, bulk insert for sqlserver need to set fast_executemany=True.
        # https://stackoverflow.com/questions/29638136/how-to-speed-up-bulk-insert-to-ms-sql-server-using-pyodbc
        if hasattr(self.cursor, 'fast_executemany'):
            self.cursor.fast_executemany = True

        import sqlite3

        if isinstance(self.conn, sqlite3.Connection):
            self.db_lib = "sqlite3"
            self.database_exception = sqlite3.DatabaseError
        else:
            # pyodbc is optional: without it, an unsupported connection must still be reported
            # with the ValueError below rather than an ImportError.
            try:
                import pyodbc
            except ImportError:
                pyodbc = None
            if pyodbc is not None and isinstance(self.conn, pyodbc.Connection):
                self.db_lib = "pyodbc"
                self.database_exception = pyodbc.DatabaseError
            else:
                raise ValueError(
                    "conn must be either a sqlite3 or pyodbc Connection object, not {0}".format(
                        type(self.conn)
                    )
                )

        # server and db are None here when an existing connection object was passed in.
        print("Opened connection to {0}.{1}".format(server, db))

    def create_table(self, query, table_name, drop_existing):
        """Create a table, optionally dropping an existing table of the same name first.

        Args:
            query: The SQL statement that creates the table.
            table_name: The name of the table; used for the drop statement and the log message.
            drop_existing: Whether to issue "drop table if exists" before creating the table.

        Raises:
            The connection's DatabaseError if the create statement fails. The transaction is rolled back
            and the connection is closed before the error is re-raised.
        """
        if drop_existing:
            try:
                self.cursor.execute("drop table if exists {0}".format(table_name))
            except self.database_exception:
                # Best effort: a failing drop is ignored and the create statement is still attempted.
                pass
            else:
                self.conn.commit()
        try:
            self.cursor.execute(query)
        except self.database_exception:
            self.conn.rollback()
            self.conn.close()
            raise
        else:
            self.conn.commit()
            print("Created table {0} with query: {1}".format(table_name, query))

    def write(self, query, data):
        """Execute `query` once per row of `data` and commit.

        Args:
            query: A parameterized SQL statement.
            data: A sequence of parameter rows passed to cursor.executemany.

        Raises:
            The connection's DatabaseError if the statement fails. The transaction is rolled back
            and the connection is closed before the error is re-raised.
        """
        try:
            self.cursor.executemany(query, data)
        except self.database_exception:
            self.conn.rollback()
            self.conn.close()
            raise
        else:
            self.conn.commit()

    def read(self, query):
        """Execute `query` and return all result rows as returned by cursor.fetchall()."""
        self.cursor.execute(query)
        result = self.cursor.fetchall()
        return result

    def close(self):
        """Commit any pending transaction and close the connection."""
        self.conn.commit()
        self.conn.close()
        print("Connection closed.")
__init__(driver=None, server=None, db=None, user=None, pwd=None, conn=None)

Create a new DbConnect object. You can pass in either information for a pyodbc connection string or directly pass in a sqlite or pyodbc connection object.

If conn is None, all other arguments must be supplied. If conn is passed in, all other arguments will be ignored.

Parameters:

Name Type Description Default
db
None
Source code in medspacy/io/db_connect.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
def __init__(
    self, driver=None, server=None, db=None, user=None, pwd=None, conn=None
):
    """Create a new DbConnect object. You can pass in either information for a pyodbc connection string
    or directly pass in a sqlite or pyodbc connection object.

    If conn is None, all other arguments must be supplied. If conn is passed in, the other arguments are
    not used to open a connection.

    Args:
        driver: DRIVER value of the pyodbc connection string.
        server: SERVER value of the pyodbc connection string.
        db: DATABASE value of the pyodbc connection string.
        user: USER value of the pyodbc connection string.
        pwd: PWD value of the pyodbc connection string.
        conn: An already opened sqlite3 or pyodbc connection object.

    Raises:
        ValueError: If conn is None and any other argument is missing, or if the connection is neither
            a sqlite3 nor a pyodbc Connection object.
    """
    if conn is None:
        if not all([driver, server, db, user, pwd]):
            raise ValueError(
                "If you are not passing in a connection object, "
                "you must pass in all other arguments to create a DB connection."
            )
        import pyodbc

        self.conn = pyodbc.connect(
            "DRIVER={0};SERVER={1};DATABASE={2};USER={3};PWD={4}".format(
                driver, server, db, user, pwd
            )
        )
    else:
        self.conn = conn
    self.cursor = self.conn.cursor()
    # according this thread, bulk insert for sqlserver need to set fast_executemany=True.
    # https://stackoverflow.com/questions/29638136/how-to-speed-up-bulk-insert-to-ms-sql-server-using-pyodbc
    if hasattr(self.cursor, 'fast_executemany'):
        self.cursor.fast_executemany = True

    import sqlite3

    if isinstance(self.conn, sqlite3.Connection):
        self.db_lib = "sqlite3"
        self.database_exception = sqlite3.DatabaseError
    else:
        # pyodbc is optional: without it, an unsupported connection must still be reported
        # with the ValueError below rather than an ImportError.
        try:
            import pyodbc
        except ImportError:
            pyodbc = None
        if pyodbc is not None and isinstance(self.conn, pyodbc.Connection):
            self.db_lib = "pyodbc"
            self.database_exception = pyodbc.DatabaseError
        else:
            raise ValueError(
                "conn must be either a sqlite3 or pyodbc Connection object, not {0}".format(
                    type(self.conn)
                )
            )

    # server and db are None here when an existing connection object was passed in.
    print("Opened connection to {0}.{1}".format(server, db))

db_writer

DbWriter

DbWriter is a utility class for writing structured data back to a database.

Source code in medspacy/io/db_writer.py
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
class DbWriter:
    """DbWriter is a utility class for writing structured data back to a database."""

    def __init__(
            self,
            db_conn,
            destination_table,
            cols=None,
            col_types=None,
            doc_dtype="ents",
            create_table=False,
            drop_existing=False,
            write_batch_size=100,
    ):
        """Create a new DbWriter object.

        Args:
            db_conn: A medspacy.io.DbConnect object
            destination_table: The name of the table to write to
            cols (opt): The names of the columns of the destination table. These should align with attributes extracted
                by DocConsumer and stored in doc._.data. A set of default values can be accessed by:
                >>> DbWriter.get_default_cols()
            col_types (opt): The sql data types of the table columns. They should correspond 1:1 with cols.
                A set of default values can be accessed by:
                >>> DbWriter.get_default_col_types()
                Needed whenever create_table is True, because create_table() indexes into it.
            doc_dtype: The type of data from DocConsumer to write from a doc.
                Either ("ents", "section", "context", or "doc")
            create_table (bool): Whether to create the destination table during construction
            drop_existing (bool): Forwarded to db_conn.create_table() when the table is created
            write_batch_size (int): Stored as self.batch_size. This class does not read it itself;
                Pipeline.process() uses writer.batch_size to decide when to flush.

        Raises:
            ValueError: If col_types is given without cols.
        """
        # Validate first so an unsupported doc_dtype is reported by _validate_dtypes
        # instead of surfacing as a bare KeyError from the DEFAULT_COLS lookup below.
        _validate_dtypes((doc_dtype,))

        self.db = db_conn
        self.destination_table = destination_table
        self._create_table = create_table
        self.drop_existing = drop_existing
        if cols is None and col_types is None:
            cols = DEFAULT_COLS[doc_dtype]
            col_types = [DEFAULT_COL_TYPES[doc_dtype][col] for col in cols]
        elif cols is None and col_types is not None:
            raise ValueError("cols must be specified if col_types is not None.")
        self.cols = cols
        self.col_types = col_types
        self.doc_dtype = doc_dtype
        self.batch_size = write_batch_size

        self.insert_query = ""
        if create_table:
            self.create_table()
        self.make_insert_query()

    @classmethod
    def get_default_col_types(cls, dtypes=None):
        """Return the default sql column types for the requested data types.

        Args:
            dtypes: A single dtype name, an iterable of dtype names, or None for every dtype
                present in DEFAULT_COL_TYPES.

        Returns:
            A dict mapping each requested dtype to its entry in DEFAULT_COL_TYPES.
        """
        if dtypes is None:
            dtypes = tuple(DEFAULT_COL_TYPES.keys())
        elif isinstance(dtypes, str):
            dtypes = (dtypes,)

        _validate_dtypes(dtypes)
        return {
            dtype: col_types
            for (dtype, col_types) in DEFAULT_COL_TYPES.items()
            if dtype in dtypes
        }

    @classmethod
    def get_default_cols(cls, dtypes=None):
        """Return the default column names for the requested data types.

        Args:
            dtypes: A single dtype name, an iterable of dtype names, or None for every dtype
                present in DEFAULT_COLS.

        Returns:
            A dict mapping each requested dtype to its entry in DEFAULT_COLS.
        """
        if dtypes is None:
            dtypes = tuple(DEFAULT_COLS.keys())
        elif isinstance(dtypes, str):
            dtypes = (dtypes,)
        _validate_dtypes(dtypes)

        # Read from DEFAULT_COLS (the column names), not DEFAULT_COL_TYPES, so this
        # returns the same values __init__ falls back to when cols is None.
        return {
            dtype: cols
            for (dtype, cols) in DEFAULT_COLS.items()
            if dtype in dtypes
        }

    def create_table(self):
        """Build a CREATE TABLE statement from cols/col_types and hand it to the connection."""
        # NOTE(review): table and column names are interpolated into the SQL text;
        # they must come from trusted configuration.
        column_defs = ", ".join(
            "{0} {1}".format(col, self.col_types[i])
            for i, col in enumerate(self.cols)
        )
        query = "CREATE TABLE {0} ({1})".format(self.destination_table, column_defs)
        self.db.create_table(query, self.destination_table, self.drop_existing)

    def make_insert_query(self):
        """Build the parameterised INSERT statement and store it in self.insert_query."""
        col_list = ", ".join(self.cols)
        q_list = ", ".join("?" for _ in self.cols)
        self.insert_query = "INSERT INTO {0} ({1}) VALUES ({2})".format(
            self.destination_table, col_list, q_list
        )

    def write(self, docs: Union[Doc, List[Doc]]):
        """Write a list of docs or doc to a database."""
        if isinstance(docs, Doc):
            self.write_doc(docs)
        else:
            self.write_docs(docs)

    def write_doc(self, doc):
        """Write a doc to a database."""
        data = doc._.get_data(self.doc_dtype, attrs=self.cols, as_rows=True)
        self.write_data(data)

    def write_docs(self, docs, batch_size=800):
        """Write a list of docs to the database through bulk insert.

        Rows are accumulated across docs and flushed each time at least
        batch_size rows are pending; any remainder is flushed at the end.
        """
        data = []
        for doc in docs:
            data.extend(doc._.get_data(self.doc_dtype, attrs=self.cols, as_rows=True))
            if len(data) >= batch_size:
                self.write_data(data)
                data = []
        if len(data) > 0:
            self.write_data(data)

    def write_data(self, data):
        """Send rows to the connection using the prepared insert query."""
        self.db.write(self.insert_query, data)

    def close(self):
        """Close the underlying database connection wrapper."""
        self.db.close()
__init__(db_conn, destination_table, cols=None, col_types=None, doc_dtype='ents', create_table=False, drop_existing=False, write_batch_size=100)

Create a new DbWriter object.

Parameters:

Name Type Description Default
db_conn

A medspacy.io.DbConnect object

required
destination_table

The name of the table to write to

required
cols opt

The names of the columns of the destination table. These should align with attributes extracted by DocConsumer and stored in doc._.data. A set of default values can be accessed by:

DbWriter.get_default_cols()

None
col_types opt

The sql data types of the table columns. They should correspond 1:1 with cols. A set of default values can be accessed by:

DbWriter.get_default_col_types()

None
doc_dtype

The type of data from DocConsumer to write from a doc. Either ("ents", "section", "context", or "doc")

'ents'
create_table bool

Whether to create a table

False
Source code in medspacy/io/db_writer.py
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
def __init__(
        self,
        db_conn,
        destination_table,
        cols=None,
        col_types=None,
        doc_dtype="ents",
        create_table=False,
        drop_existing=False,
        write_batch_size=100,
):
    """Create a new DbWriter object.

    Args:
        db_conn: A medspacy.io.DbConnect object
        destination_table: The name of the table to write to
        cols (opt): The names of the columns of the destination table. These should align with attributes extracted
            by DocConsumer and stored in doc._.data. A set of default values can be accessed by:
            >>> DbWriter.get_default_cols()
        col_types (opt): The sql data types of the table columns. They should correspond 1:1 with cols.
            A set of default values can be accessed by:
            >>> DbWriter.get_default_col_types()
        doc_dtype: The type of data from DocConsumer to write from a doc.
            Either ("ents", "section", "context", or "doc")
        create_table (bool): Whether to create the destination table during construction
        drop_existing (bool): Stored as self.drop_existing for use when the table is created
        write_batch_size (int): Stored as self.batch_size

    Raises:
        ValueError: If col_types is given without cols.
    """
    # Validate first so an unsupported doc_dtype is reported by _validate_dtypes
    # instead of surfacing as a bare KeyError from the DEFAULT_COLS lookup below.
    _validate_dtypes((doc_dtype,))

    self.db = db_conn
    self.destination_table = destination_table
    self._create_table = create_table
    self.drop_existing = drop_existing
    if cols is None and col_types is None:
        cols = DEFAULT_COLS[doc_dtype]
        col_types = [DEFAULT_COL_TYPES[doc_dtype][col] for col in cols]
    elif cols is None and col_types is not None:
        raise ValueError("cols must be specified if col_types is not None.")
    self.cols = cols
    self.col_types = col_types
    self.doc_dtype = doc_dtype
    self.batch_size = write_batch_size

    # The insert statement is always prepared; the table is only created on request.
    self.insert_query = ""
    if create_table:
        self.create_table()
    self.make_insert_query()
write(docs)

Write a list of docs or doc to a database.

Source code in medspacy/io/db_writer.py
160
161
162
163
164
165
def write(self, docs: Union[Doc, List[Doc]]):
    """Persist either one Doc or a collection of Docs.

    A single Doc is routed to write_doc(); anything else is treated as an
    iterable of docs and routed to write_docs().
    """
    handler = self.write_doc if isinstance(docs, Doc) else self.write_docs
    handler(docs)
write_doc(doc)

Write a doc to a database.

Source code in medspacy/io/db_writer.py
167
168
169
170
def write_doc(self, doc):
    """Extract the configured rows from one doc and send them to the database."""
    # Rows are requested for this writer's dtype, restricted to its columns.
    self.write_data(
        doc._.get_data(self.doc_dtype, attrs=self.cols, as_rows=True)
    )
write_docs(docs, batch_size=800)

write a list of docs to database through bulk insert

Source code in medspacy/io/db_writer.py
172
173
174
175
176
177
178
179
180
181
182
def write_docs(self, docs, batch_size=800):
    """Bulk-insert the rows of many docs, flushing in batches.

    Rows from successive docs are pooled; whenever the pool holds at least
    batch_size rows it is written and emptied. Leftover rows are written
    once all docs have been consumed.
    """
    pending = []
    for doc in docs:
        pending += doc._.get_data(self.doc_dtype, attrs=self.cols, as_rows=True)
        if len(pending) < batch_size:
            continue
        self.write_data(pending)
        pending = []
    if pending:
        self.write_data(pending)

doc_consumer

DocConsumer

A DocConsumer object will consume a spacy doc and output rows based on a configuration provided by the user.

This component extracts structured information from a Doc. Information is stored in doc._.data, which is a nested dictionary. The outer keys represent the data types and can be one or more of: - "ents": data about the spans in doc.ents such as the text, label, context attributes, section information, or custom attributes - "group": data about the spans in the span group named by span_group_name - "section": section text and category - "context": data about entity-modifier pairs extracted by ConText - "doc": a single doc-level representation. By default only doc.text is extracted, but other attributes may be specified

Once processed, a doc's data can be accessed either by:
    - doc._.data
    - doc._.get_data(dtype=...)
    - doc._.ent_data
    - doc._.to_dataframe(dtype=...)
Source code in medspacy/io/doc_consumer.py
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
@Language.factory("medspacy_doc_consumer")
class DocConsumer:
    """
    A DocConsumer object will consume a spacy doc and output rows based on a configuration provided by the user.

    This component extracts structured information from a Doc. Information is stored in doc._.data, which is a
        nested dictionary. The outer keys represent the data types and can be one or more of:
            - "ents": data about the spans in doc.ents such as the text, label,
                context attributes, section information, or custom attributes
            - "group": data about the spans in the span group named by `span_group_name`
            - "section": data about the sections in doc._.sections, such as the category, title and body
            - "context": data about entity-modifier pairs extracted by ConText
            - "doc": a single doc-level representation. By default only doc.text is extracted, but other attributes may
                be specified

        Once processed, a doc's data can be accessed either by:
            - doc._.data
            - doc._.get_data(dtype=...)
            - doc._.ent_data
            - doc._.to_dataframe(dtype=...)
    """

    def __init__(
        self,
        nlp,
        name: str = "medspacy_doc_consumer",
        dtypes: Tuple = ("ents",),
        dtype_attrs: Dict = None,
        span_group_name: str = "medspacy_spans",
    ):
        """
        Creates a new DocConsumer.

        Args:
            nlp: A spaCy model
            dtypes: Either a tuple of data types to collect or the string "all". Default ("ents",). Valid  options are:
                "ents", "group", "section", "context", "doc".
            dtype_attrs: An optional dictionary mapping the data types in dtypes to a list of attributes. If None, will
                set defaults for each dtype; if a requested dtype has no entry, the defaults for that dtype are
                filled in. Attributes for "ents", "group", and "doc" may be customized by adding either native or
                custom attributes (i.e., ent._....). "section" attributes must be allowed section attribute names.
                "context" attributes may be the pre-defined context values or attributes of the entity.
                Default values for each dtype can be retrieved by the class method `DocConsumer.get_default_attrs()`
            span_group_name: the name of the span group used when dtypes contains "group". At this time, only one span
                group is supported.

        Raises:
            ValueError: If dtypes is neither "all" nor a tuple, contains an unsupported dtype, or if invalid
                "section" attributes are specified.
        """
        self.nlp = nlp
        self.name = name
        self._span_group_name = span_group_name
        if not isinstance(dtypes, tuple):
            if dtypes != "all":
                raise ValueError(
                    "dtypes must be either 'all' or a tuple, not {0}".format(dtypes)
                )
            dtypes = tuple(ALLOWED_DATA_TYPES)
        for dtype in dtypes:
            if dtype not in ALLOWED_DATA_TYPES:
                raise ValueError(
                    "Invalid dtypes. Supported dtypes are {0}, not {1}".format(
                        ALLOWED_DATA_TYPES, dtype
                    )
                )
            if dtype == "section":
                self.validate_section_attrs(dtype_attrs)
        self.dtypes = dtypes
        self.dtype_attrs = dtype_attrs

        if self.dtype_attrs is None:
            self._set_default_attrs()
        else:
            # __call__ indexes self.dtype_attrs[dtype] for every requested dtype, so a
            # partial mapping would fail there with a KeyError. Fill the gaps with the
            # defaults; the caller's dict is copied only when something is missing.
            missing = tuple(d for d in self.dtypes if d not in self.dtype_attrs)
            if missing:
                self.dtype_attrs = {
                    **self.dtype_attrs,
                    **self.get_default_attrs(missing),
                }

    @classmethod
    def get_default_attrs(cls, dtypes: Optional[Tuple] = None):
        """
        Gets the default attributes available to each type specified.

        Args:
            dtypes: Optional tuple containing "ents", "group", "context", "section", or "doc". If None, all will be
                returned. A single string is treated as a one-element tuple.

        Returns:
            A dict mapping each requested dtype to a new list of its default attributes.

        Raises:
            ValueError: If an unsupported dtype is requested.
        """
        if dtypes is None:
            dtypes = ALLOWED_DATA_TYPES
        else:
            if isinstance(dtypes, str):
                dtypes = (dtypes,)
            for dtype in dtypes:
                if dtype not in ALLOWED_DATA_TYPES:
                    # Format the message; passing dtype as a second positional
                    # argument made the error render as a tuple.
                    raise ValueError("Invalid dtype, {0}".format(dtype))
        # Copy each default list so callers can modify the result freely.
        return {
            dtype: list(attrs)
            for (dtype, attrs) in DEFAULT_ATTRS.items()
            if dtype in dtypes
        }

    def _set_default_attrs(self):
        """
        Sets self.dtype_attrs to the default attributes for self.dtypes.
        """
        self.dtype_attrs = self.get_default_attrs(self.dtypes)

    def validate_section_attrs(self, attrs):
        """
        Validate that section attributes are either not specified or are valid attribute names.

        Returns:
            True when validation passes.

        Raises:
            ValueError: If attrs["section"] contains names outside ALLOWED_SECTION_ATTRS.
        """
        if attrs is None:
            return True
        if "section" not in attrs:
            return True
        diff = set(attrs["section"]).difference(ALLOWED_SECTION_ATTRS)
        if diff:
            raise ValueError("Invalid section dtype_attrs specified: {0}".format(diff))
        return True

    @staticmethod
    def _read_attr(obj, attr):
        """Return `attr` from `obj`, falling back to the custom extension namespace `obj._`."""
        try:
            return getattr(obj, attr)
        except AttributeError:
            return getattr(obj._, attr)

    def __call__(self, doc):
        """
        Call the doc consumer on a doc and assign the data.

        Args:
            doc: The Doc to process.

        Returns:
            The processed Doc, with the collected column-wise data stored in doc._.data.
        """
        # One ordered column -> values mapping per configured dtype.
        data = dict()
        for dtype, attrs in self.dtype_attrs.items():
            data.setdefault(dtype, OrderedDict())
            for attr in attrs:
                data[dtype][attr] = list()
        if "ents" in self.dtypes:
            for ent in doc.ents:
                for attr in self.dtype_attrs["ents"]:
                    val = self._read_attr(ent, attr)
                    data["ents"][attr].append(val)
        if "group" in self.dtypes:
            for span in doc.spans[self._span_group_name]:
                for attr in self.dtype_attrs["group"]:
                    val = self._read_attr(span, attr)
                    data["group"][attr].append(val)
        if "context" in self.dtypes:
            for (ent, modifier) in doc._.context_graph.edges:
                self.add_context_edge_attributes(ent, modifier, data["context"], doc)
        if "section" in self.dtypes:
            for section in doc._.sections:
                self.add_section_attributes(section, data["section"], doc)
        if "doc" in self.dtypes:
            for attr in self.dtype_attrs["doc"]:
                val = self._read_attr(doc, attr)
                data["doc"][attr].append(val)

        doc._.data = data
        return doc

    def add_context_edge_attributes(
        self, ent: Span, modifier: ConTextModifier, context_data, doc
    ):
        """
        Append one value per configured "context" attribute for an entity-modifier pair.

        Args:
            ent: The entity of the edge.
            modifier: The modifier of the edge; its modifier_span and scope_span are used to slice `doc`.
            context_data: The column mapping to append to (data["context"]).
            doc: The Doc containing both spans.

        Raises:
            ValueError: If an attribute is neither a pre-defined context value nor an attribute of the entity.
        """
        span_tup = modifier.modifier_span
        span = doc[span_tup[0] : span_tup[1]]
        scope_tup = modifier.scope_span
        scope = doc[scope_tup[0] : scope_tup[1]]
        # Pre-defined context attributes; each is evaluated only when requested.
        predefined = {
            "ent_text": lambda: ent.text,
            "ent_label_": lambda: ent.label_,
            "ent_start_char": lambda: ent.start_char,
            "ent_end_char": lambda: ent.end_char,
            "modifier_text": lambda: span.text,
            "modifier_category": lambda: modifier.category,
            "modifier_direction": lambda: modifier.direction,
            "modifier_start_char": lambda: span.start_char,
            "modifier_end_char": lambda: span.end_char,
            "modifier_scope_start_char": lambda: scope.start_char,
            "modifier_scope_end_char": lambda: scope.end_char,
        }
        for attr in self.dtype_attrs["context"]:
            getter = predefined.get(attr)
            if getter is not None:
                context_data[attr].append(getter())
                continue
            # Not a standard value: look for it on the entity, native attribute first,
            # then custom extension.
            try:
                val = getattr(ent, attr)
            except AttributeError:
                try:
                    val = getattr(ent._, attr)
                except AttributeError:
                    raise ValueError(f"Attributes for dtype 'context' must be either "
                                     f"a registered custom Span attribute (i.e., Span._.attr) or one of these pre-defined values: "
                                     f"{ALLOWED_CONTEXT_ATTRS}. \nYou passed in '{attr}'")
            context_data[f"{attr}"].append(val)

    def add_section_attributes(self, section, section_data, doc):
        """
        Append one value per configured "section" attribute for a section.

        Sections whose category is None get a title text of None and title
        offsets of 0.

        Args:
            section: The section to describe; its title_span and body_span are used to slice `doc`.
            section_data: The column mapping to append to (data["section"]).
            doc: The Doc containing the section.
        """
        wanted = self.dtype_attrs["section"]
        section_title_tup = section.title_span
        section_body_tup = section.body_span
        section_title = doc[section_title_tup[0] : section_title_tup[1]]
        section_body = doc[section_body_tup[0] : section_body_tup[1]]
        # Allow for null sections
        has_title = section.category is not None
        getters = (
            ("section_category", lambda: section.category),
            ("section_title_text", lambda: section_title.text if has_title else None),
            ("section_title_start_char", lambda: section_title.start_char if has_title else 0),
            ("section_title_end_char", lambda: section_title.end_char if has_title else 0),
            ("section_body", lambda: section_body.text),
            ("section_body_start_char", lambda: section_body.start_char),
            ("section_body_end_char", lambda: section_body.end_char),
            ("section_parent", lambda: section.parent),
        )
        for attr, getter in getters:
            if attr in wanted:
                section_data[attr].append(getter())
__call__(doc)

Call the doc consumer on a doc and assign the data.

Parameters:

Name Type Description Default
doc

The Doc to process.

required

Returns:

Type Description

The processed Doc.

Source code in medspacy/io/doc_consumer.py
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
def __call__(self, doc):
    """
    Collect the configured attributes from a doc and attach them as doc._.data.

    Args:
        doc: The Doc to process.

    Returns:
        The same Doc, after doc._.data has been assigned a dict that maps each
        configured dtype to an ordered mapping of attribute name -> list of values.
    """

    def read(obj, name):
        # Native attribute first; custom extensions under `obj._` second.
        try:
            return getattr(obj, name)
        except AttributeError:
            return getattr(obj._, name)

    # Start every configured column as an empty list.
    collected = {
        dtype: OrderedDict((name, []) for name in names)
        for dtype, names in self.dtype_attrs.items()
    }

    requested = self.dtypes
    if "ents" in requested:
        for ent in doc.ents:
            for name in self.dtype_attrs["ents"]:
                value = read(ent, name)
                collected["ents"][name].append(value)
    if "group" in requested:
        for member in doc.spans[self._span_group_name]:
            for name in self.dtype_attrs["group"]:
                value = read(member, name)
                collected["group"][name].append(value)
    if "context" in requested:
        for target, modifier in doc._.context_graph.edges:
            self.add_context_edge_attributes(
                target, modifier, collected["context"], doc
            )
    if "section" in requested:
        for section in doc._.sections:
            self.add_section_attributes(section, collected["section"], doc)
    if "doc" in requested:
        for name in self.dtype_attrs["doc"]:
            value = read(doc, name)
            collected["doc"][name].append(value)

    doc._.data = collected
    return doc
__init__(nlp, name='medspacy_doc_consumer', dtypes=('ents',), dtype_attrs=None, span_group_name='medspacy_spans')

Creates a new DocConsumer.

Parameters:

Name Type Description Default
nlp

A spaCy model

required
dtypes Tuple

Either a tuple of data types to collect or the string "all". Default ("ents",). Valid options are: "ents", "group", "section", "context", "doc".

('ents',)
dtype_attrs Dict

An optional dictionary mapping the data types in dtypes to a list of attributes. If None, will set defaults for each dtype. Attributes for "ents", "group", and "doc" may be customized by adding either native or custom attributes (i.e., ent._....). "context" and "section" are not customizable at this time. Default values for each dtype can be retrieved by the class method `DocConsumer.get_default_attrs()`.

None
span_group_name str

the name of the span group used when dtypes contains "group". At this time, only one span group is supported.

'medspacy_spans'
Source code in medspacy/io/doc_consumer.py
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
def __init__(
    self,
    nlp,
    name: str = "medspacy_doc_consumer",
    dtypes: Tuple = ("ents",),
    dtype_attrs: Dict = None,
    span_group_name: str = "medspacy_spans",
):
    """
    Creates a new DocConsumer.

    Args:
        nlp: A spaCy model
        dtypes: Either a tuple of data types to collect or the string "all". Default ("ents",). Valid  options are:
            "ents", "group", "section", "context", "doc".
        dtype_attrs: An optional dictionary mapping the data types in dtypes to a list of attributes. If None, will
            set defaults for each dtype; if a requested dtype has no entry, the defaults for that dtype are
            filled in. Attributes for "ents", "group", and "doc" may be customized by adding either
            native or custom attributes (i.e., ent._....).
            Default values for each dtype can be retrieved by the class method `DocConsumer.get_default_attrs()`
        span_group_name: the name of the span group used when dtypes contains "group". At this time, only one span
            group is supported.

    Raises:
        ValueError: If dtypes is neither "all" nor a tuple, or contains an unsupported dtype.
    """
    self.nlp = nlp
    self.name = name
    self._span_group_name = span_group_name
    if not isinstance(dtypes, tuple):
        if dtypes != "all":
            raise ValueError(
                "dtypes must be either 'all' or a tuple, not {0}".format(dtypes)
            )
        dtypes = tuple(ALLOWED_DATA_TYPES)
    for dtype in dtypes:
        if dtype not in ALLOWED_DATA_TYPES:
            raise ValueError(
                "Invalid dtypes. Supported dtypes are {0}, not {1}".format(
                    ALLOWED_DATA_TYPES, dtype
                )
            )
        if dtype == "section":
            self.validate_section_attrs(dtype_attrs)
    self.dtypes = dtypes
    self.dtype_attrs = dtype_attrs

    if self.dtype_attrs is None:
        self._set_default_attrs()
    else:
        # A requested dtype without an entry in dtype_attrs would otherwise only
        # fail later, when the attributes for that dtype are looked up by key.
        # Fill the gaps with defaults; the caller's dict is copied only when
        # something is actually missing.
        missing = tuple(d for d in self.dtypes if d not in self.dtype_attrs)
        if missing:
            self.dtype_attrs = {
                **self.dtype_attrs,
                **self.get_default_attrs(missing),
            }
_set_default_attrs()

Sets self.dtype_attrs to the default attributes for self.dtypes.

Source code in medspacy/io/doc_consumer.py
156
157
158
159
160
def _set_default_attrs(self):
    """
    Populate self.dtype_attrs with the defaults for the dtypes in self.dtypes.
    """
    defaults = self.get_default_attrs(self.dtypes)
    self.dtype_attrs = defaults
get_default_attrs(dtypes=None) classmethod

Gets the default attributes available to each type specified.

Parameters:

Name Type Description Default
dtypes Optional[Tuple]

Optional tuple containing "ents", "group", "context", "section", or "doc". If None, all will be returned.

None

Returns:

Type Description

The attributes the doc consumer will output for each of the specified types in dtypes.

Source code in medspacy/io/doc_consumer.py
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
@classmethod
def get_default_attrs(cls, dtypes: Optional[Tuple] = None):
    """
    Gets the default attributes available to each type specified.

    Args:
        dtypes: Optional tuple containing "ents", "group", "context", "section", or "doc". If None, all will be
            returned. A single string is treated as a one-element tuple.

    Returns:
        A dict mapping each requested dtype to a new list of its default attributes.

    Raises:
        ValueError: If an unsupported dtype is requested.
    """
    if dtypes is None:
        dtypes = ALLOWED_DATA_TYPES
    else:
        if isinstance(dtypes, str):
            dtypes = (dtypes,)
        for dtype in dtypes:
            if dtype not in ALLOWED_DATA_TYPES:
                # Format the message; passing dtype as a second positional
                # argument made the error render as a tuple.
                raise ValueError("Invalid dtype, {0}".format(dtype))
    # Copy each default list so callers can modify the result freely.
    return {
        dtype: list(attrs)
        for (dtype, attrs) in DEFAULT_ATTRS.items()
        if dtype in dtypes
    }
validate_section_attrs(attrs)

Validate that section attributes are either not specified or are valid attribute names.

Source code in medspacy/io/doc_consumer.py
162
163
164
165
166
167
168
169
170
171
172
173
def validate_section_attrs(self, attrs):
    """
    Check user-supplied "section" attribute names against ALLOWED_SECTION_ATTRS.

    Nothing is checked when attrs is None or has no "section" entry.

    Returns:
        True when validation passes.

    Raises:
        ValueError: If attrs["section"] holds names that are not allowed.
    """
    if attrs is not None and "section" in attrs:
        diff = set(attrs["section"]).difference(ALLOWED_SECTION_ATTRS)
        if diff:
            raise ValueError(
                "Invalid section dtype_attrs specified: {0}".format(diff)
            )
    return True

pipeline

Pipeline

The Pipeline class executes a batch process of reading texts, processing them with a spaCy model, and writing the results back to a database.

Source code in medspacy/io/pipeline.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
@Language.factory("medspacy_pipeline")
class Pipeline:
    """The Pipeline class executes a batch process of reading texts, processing them with a spaCy model, and writing
    the results back to a database.
    """

    def __init__(self, nlp, reader, writer, name="medspacy_pipeline", dtype="ent"):
        """Create a new Pipeline object.

        Args:
            nlp: A spaCy model
            reader: A DbReader object
            writer: A DbWriter object
            name: The component name
            dtype: The DocConsumer data type to write to a database.
                Default "ent", which is treated as "ents" when only the plural form is supported.
                Must otherwise be one of ALLOWED_DATA_TYPES.

        Raises:
            ValueError: If dtype is not a supported data type.
        """
        # The historical default "ent" does not match the plural "ents" used by
        # DocConsumer. Map it instead of rejecting the default outright.
        # NOTE(review): assumes ALLOWED_DATA_TYPES lists "ents" rather than "ent" - confirm.
        if (
            dtype == "ent"
            and dtype not in ALLOWED_DATA_TYPES
            and "ents" in ALLOWED_DATA_TYPES
        ):
            dtype = "ents"
        if dtype not in ALLOWED_DATA_TYPES:
            raise ValueError(
                "Invalid dtypes. Supported dtypes are {0}, not {1}".format(
                    ALLOWED_DATA_TYPES, dtype
                )
            )

        self.reader = reader
        self.writer = writer
        self.name = name
        self.nlp = nlp
        self.dtype = dtype

    def process(self):
        """Run a pipeline by reading a set of texts from a source table, processing them with nlp,
        and writing doc._.data back to the destination table.

        Each query result is expected to hold rows whose first two columns are
        the text identifier and the text. Rows are buffered across docs and
        flushed whenever at least writer.batch_size rows are pending. Empty
        batches are never sent to the writer.
        """
        pending = []
        query_result = self.reader.read()
        while query_result:
            # Transpose the rows into columns; only the first two are used.
            query_zip = list(zip(*query_result))
            ids = query_zip[0]
            texts = query_zip[1]

            for text_id, doc in zip(ids, self.nlp.pipe(texts)):
                # Get the data as rows of tuples and prepend the identifier column
                doc_data = doc._.get_data(self.dtype, as_rows=True)
                pending.extend((text_id,) + row_data for row_data in doc_data)

                if pending and len(pending) >= self.writer.batch_size:
                    self.writer.write_data(pending)
                    pending = []
            query_result = self.reader.read()

        # Flush the remainder, skipping the write when no rows were produced.
        if pending:
            self.writer.write_data(pending)

        self.reader.close()
        # Reader and writer may share one connection; close it only once.
        if self.writer.db.conn != self.reader.db.conn:
            self.writer.close()
__init__(nlp, reader, writer, name='medspacy_pipeline', dtype='ent')

Create a new Pipeline object. Args: reader: A DbReader object writer: A DbWriter object nlp: A spaCy model dtype: The DocConsumer data type to write to a database. Default "ent". Valid options are ("ent", "section", "context", "doc")

Source code in medspacy/io/pipeline.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
def __init__(self, nlp, reader, writer, name="medspacy_pipeline", dtype="ent"):
    """Set up a Pipeline that ties together an nlp model, a reader and a writer.

    Args:
        nlp: A spaCy model.
        reader: A DbReader object.
        writer: A DbWriter object.
        name: The name stored on the pipeline. Default "medspacy_pipeline".
        dtype: The DocConsumer data type to write to a database.
            Default "ent".
            Valid options are ("ent", "section", "context", "doc").

    Raises:
        ValueError: If dtype is not contained in ALLOWED_DATA_TYPES.
    """
    self.nlp = nlp
    self.reader = reader
    self.writer = writer
    self.name = name
    self.dtype = dtype

    # Attributes are assigned first, then the data type is validated.
    if dtype in ALLOWED_DATA_TYPES:
        return
    raise ValueError(
        f"Invalid dtypes. Supported dtypes are {ALLOWED_DATA_TYPES}, not {dtype}"
    )
process()

Run a pipeline by reading a set of texts from a source table, processing them with nlp, and writing doc._.data back to the destination table.

Source code in medspacy/io/pipeline.py
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
def process(self):
    """Run the pipeline: read texts, process them with nlp, and write the results.

    Batches of rows are pulled from ``self.reader.read()`` until it returns an
    empty (falsy) result. The first column of each row is used as the text
    identifier and the second as the text. Every text is processed with
    ``self.nlp.pipe`` and the rows returned by
    ``doc._.get_data(self.dtype, as_rows=True)`` are prefixed with the identifier
    and buffered. The buffer is handed to ``self.writer.write_data`` whenever it
    holds at least ``self.writer.batch_size`` rows, and once more at the end.

    The reader (and the writer, when it uses a different connection) is closed
    even if reading, processing or writing raises, so connections are not leaked.
    """
    buffered = None
    try:
        query_result = self.reader.read()
        while query_result:
            columns = list(zip(*query_result))
            ids, texts = columns[0], columns[1]

            for text_id, doc in zip(ids, self.nlp.pipe(texts)):
                # Get the data as rows of tuples and add the identifier column
                doc_rows = [
                    (text_id,) + row
                    for row in doc._.get_data(self.dtype, as_rows=True)
                ]

                if buffered is None:
                    buffered = doc_rows
                else:
                    buffered.extend(doc_rows)

                if len(buffered) >= self.writer.batch_size:
                    self.writer.write_data(buffered)
                    buffered = None
            query_result = self.reader.read()

        # Flush whatever is left after the last batch
        if buffered is not None:
            self.writer.write_data(buffered)
            buffered = None
    finally:
        self.reader.close()
        # Only close the writer separately when it does not share the reader's connection
        if self.writer.db.conn != self.reader.db.conn:
            self.writer.close()

postprocess

PostprocessingPattern

PostprocessingPatterns are callable functions and equality values wrapped together that will create triggers in the later Postprocessor as part of PostprocessingRules.

Source code in medspacy/postprocess/postprocessing_pattern.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
class PostprocessingPattern:
    """
    Bundles a callable condition with the value it is expected to produce. PostprocessingPatterns act as the
    triggers of PostprocessingRules, which are later run by the Postprocessor.
    """

    def __init__(self, condition: Callable, success_value: Any = True, **kwargs):
        """
        Store a single condition to evaluate against an entity.

        Args:
            condition: A function that is called with an entity. The pattern passes when the value it returns
                equals success_value.
            success_value: The value condition(ent) has to return for the pattern to pass. It must support
                the comparison condition(ent) == success_value.
            kwargs: Optional keyword arguments forwarded as condition(ent, **kwargs).
        """
        self.kwargs = kwargs
        self.success_value = success_value
        self.condition = condition

    def __call__(self, ent: Span) -> bool:
        """
        Evaluate the pattern on the given span.

        Args:
            ent: the span to process.

        Returns:
            The result of comparing `condition(ent, **kwargs)` with `success_value` using ==.
        """
        # Unpacking an empty mapping is the same as calling condition(ent) without extra arguments.
        extra = self.kwargs if self.kwargs else {}
        outcome = self.condition(ent, **extra)
        return outcome == self.success_value

__call__(ent)

Call the PostprocessingPattern on the span specified.

Parameters:

Name Type Description Default
ent Span

the span to process.

required

Returns:

Type Description
bool

Whether calling condition on the entity specified is success_value.

Source code in medspacy/postprocess/postprocessing_pattern.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
def __call__(self, ent: Span) -> bool:
    """
    Evaluate the pattern on the given span.

    Args:
        ent: the span to process.

    Returns:
        The result of comparing `condition(ent, **kwargs)` with `success_value` using ==.
    """
    # Unpacking an empty mapping is the same as calling condition(ent) without extra arguments.
    extra = self.kwargs if self.kwargs else {}
    outcome = self.condition(ent, **extra)
    return outcome == self.success_value

__init__(condition, success_value=True, **kwargs)

A PostprocessingPattern defines a single condition to check against an entity.

Parameters:

Name Type Description Default
condition Callable

A function to call on an entity. If the result of the function call equals success_value, then the pattern passes.

required
success_value Any

The value which should be returned by condition(ent) in order for the pattern to pass. Must have == defined for condition(ent) == success_value.

True
kwargs

Optional keyword arguments to call with condition(ent, **kwargs).

{}
Source code in medspacy/postprocess/postprocessing_pattern.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
def __init__(self, condition: Callable, success_value: Any = True, **kwargs):
    """
    Store a single condition to evaluate against an entity.

    Args:
        condition: A function that is called with an entity. The pattern passes when the value it returns
            equals success_value.
        success_value: The value condition(ent) has to return for the pattern to pass. It must support
            the comparison condition(ent) == success_value.
        kwargs: Optional keyword arguments forwarded as condition(ent, **kwargs).
    """
    self.kwargs = kwargs
    self.success_value = success_value
    self.condition = condition

PostprocessingRule

Source code in medspacy/postprocess/postprocessing_rule.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
class PostprocessingRule:
    """
    Pairs a collection of patterns with an action that is executed on an entity when every pattern passes.
    """

    def __init__(
        self,
        patterns: Iterable[PostprocessingPattern],
        action: Callable,
        name: str = None,
        description: str = None,
        span_group_name: str = "medspacy_spans",
        **kwargs,
    ):
        """
        A PostprocessingRule checks conditions of a spaCy Span entity and executes some action if all patterns
        are met.

        Args:
            patterns: A list of PostprocessingPatterns, each of which checks a condition of an entity. An element
                may also be a tuple of patterns, in which case at least one of them has to pass.
            action: A function to call with the entity as an argument. This function should take the following
                arguments:
                    ent: The spacy span
                    i: The index of ent
                    input_span_type: "ents" or "group". Describes where to look for spans.
                    span_group_name: The name of the span group used when `input_span_type` is "group".
                    kwargs: Any additional keyword arguments for action.
            name: Optional name of the rule.
            description: Optional description of the rule.
            span_group_name: The span group name passed on to `action`.
            kwargs: Optional keyword arguments to send to `action`.
        """
        self.patterns = patterns
        self.action = action
        self.name = name
        self.description = description
        # Starts as None; the Postprocessor's add() fills it in when it is still unset.
        self.input_span_type = None
        self.span_group_name = span_group_name
        self.kwargs = kwargs

    def __call__(self, ent, i, debug=False):
        """
        Check every pattern in self.patterns against `ent` and execute self.action if all of them pass.

        A tuple in self.patterns passes when at least one of its sub-patterns passes. Pattern results are
        interpreted by truthiness, so bool-like values (not only the `True`/`False` singletons) are handled.

        Args:
            ent: The span to check.
            i: The index of `ent`, forwarded to the action.
            debug: If True, print the rule and entity once all patterns have passed.

        Returns:
            False if any pattern does not pass (the action is not executed), True once the action has been
            executed.
        """
        for pattern in self.patterns:
            # If this is a tuple, at least one has to pass
            if isinstance(pattern, tuple):
                if not any(subpattern(ent) for subpattern in pattern):
                    return False
            # Otherwise just check a single value
            elif not pattern(ent):
                return False

        # Every pattern passed - do the action
        if debug:
            print("Passed:", self, "on ent:", ent, ent.sent)

        try:
            if self.kwargs:
                self.action(
                    ent, i, self.input_span_type, self.span_group_name, **self.kwargs
                )
            else:
                self.action(ent, i, self.input_span_type, self.span_group_name)
        except TypeError:
            # Delegate TypeErrors from the action call to the module's error helper.
            _raise_action_error(
                self.action,
                (ent, i, self.input_span_type, self.span_group_name, self.kwargs),
            )
        return True

    def __repr__(self):
        return f"PostprocessingRule: {self.name} - {self.description}"

__call__(ent, i, debug=False)

Iterate through all the patterns in self.patterns. If any pattern does not pass (i.e., does not return True), then returns False. If they all pass, execute self.action and return True.

Source code in medspacy/postprocess/postprocessing_rule.py
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
def __call__(self, ent, i, debug=False):
    """
    Check every pattern in self.patterns against `ent` and execute self.action if all of them pass.

    A tuple in self.patterns passes when at least one of its sub-patterns passes. Pattern results are
    interpreted by truthiness, so bool-like values (not only the `True`/`False` singletons) are handled.

    Args:
        ent: The span to check.
        i: The index of `ent`, forwarded to the action.
        debug: If True, print the rule and entity once all patterns have passed.

    Returns:
        False if any pattern does not pass (the action is not executed), True once the action has been
        executed.
    """
    for pattern in self.patterns:
        # If this is a tuple, at least one has to pass
        if isinstance(pattern, tuple):
            if not any(subpattern(ent) for subpattern in pattern):
                return False
        # Otherwise just check a single value
        elif not pattern(ent):
            return False

    # Every pattern passed - do the action
    if debug:
        print("Passed:", self, "on ent:", ent, ent.sent)

    try:
        if self.kwargs:
            self.action(
                ent, i, self.input_span_type, self.span_group_name, **self.kwargs
            )
        else:
            self.action(ent, i, self.input_span_type, self.span_group_name)
    except TypeError:
        # Delegate TypeErrors from the action call to the module's error helper.
        _raise_action_error(
            self.action,
            (ent, i, self.input_span_type, self.span_group_name, self.kwargs),
        )
    return True

__init__(patterns, action, name=None, description=None, span_group_name='medspacy_spans', **kwargs)

A PostprocessingRule checks conditions of a spaCy Span entity and executes some action if all rules are met.

patterns: A list of PostprocessingPatterns, each of which checks a condition of an entity. action: A function to call with the entity as an argument. This function should take the following arguments: ent: The spacy span; i: The index of ent; input_span_type: "ents" or "group", describes where to look for spans; span_group_name: The name of the span group used when input_span_type is "group"; kwargs: Any additional keyword arguments for action. name: Optional name of the rule. description: Optional description of the rule. kwargs: Optional keyword arguments to send to action.

Source code in medspacy/postprocess/postprocessing_rule.py
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
def __init__(
    self,
    patterns: Iterable[PostprocessingPattern],
    action: Callable,
    name: str = None,
    description: str = None,
    span_group_name: str = "medspacy_spans",
    **kwargs,
):
    """
    Create a rule that checks conditions of a spaCy Span entity and executes an action if all patterns are met.

    Args:
        patterns: A list of PostprocessingPatterns, each of which checks a condition of an entity.
        action: A function to call with the entity as an argument. This function should take the following
            arguments:
                ent: The spacy span
                i: The index of ent
                input_span_type: "ents" or "group". Describes where to look for spans.
                span_group_name: The name of the span group used when `input_span_type` is "group".
                kwargs: Any additional keyword arguments for action.
        name: Optional name of the rule.
        description: Optional description of the rule.
        span_group_name: The span group name stored on the rule. Default "medspacy_spans".
        kwargs: Optional keyword arguments to send to `action`.
    """
    # What to check and what to do
    self.patterns = patterns
    self.action = action
    self.kwargs = kwargs

    # Descriptive metadata
    self.name = name
    self.description = description

    # Where spans come from; the span type is left unset at construction time
    self.span_group_name = span_group_name
    self.input_span_type = None

Postprocessor

Source code in medspacy/postprocess/postprocessor.py
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
@Language.factory("medspacy_postprocessor")
class Postprocessor:
    """
    Pipeline component that runs a list of PostprocessingRules over the spans of a Doc. The spans are taken
    either from doc.ents or from a named span group, depending on `input_span_type`.
    """

    def __init__(
        self,
        nlp: Language,
        name: str = "medspacy_postprocessor",
        rules: Iterable[PostprocessingRule] = None,
        debug: bool = False,
        input_span_type: Literal["ents", "group"] = "ents",
        span_group_name: str = "medspacy_spans",
    ):
        """
        Creates a new Postprocessor.

        Args:
            nlp: The spaCy Language object, stored on the component.
            name: The name of the component.
            rules: Optional PostprocessingRules to add on construction.
            debug: If True, each span is printed as it is visited and `debug=True` is passed to every rule.
            input_span_type: "ents" to process doc.ents or "group" to process a span group.
            span_group_name: The name of the span group used when `input_span_type` is "group".
        """
        self.nlp = nlp
        self.name = name
        self._rules = []
        self.debug = debug
        self._input_span_type = input_span_type
        self._span_group_name = span_group_name

        if rules:
            self.add(rules)

    @property
    def rules(self) -> List[PostprocessingRule]:
        """
        Gets the rules.

        Returns:
            The list of PostprocessingRules available to the Postprocessor.
        """
        return self._rules

    @property
    def input_span_type(self):
        """
        The input source of entities for the component. Must be either "ents" corresponding to doc.ents or "group" for
        a spaCy span group.

        Returns:
            The input type, "ents" or "group".
        """
        return self._input_span_type

    @input_span_type.setter
    def input_span_type(self, val):
        if not (val == "ents" or val == "group"):
            raise ValueError('input_span_type must be "ents" or "group".')
        self._input_span_type = val

    @property
    def span_group_name(self) -> str:
        """
        The name of the span group used by this component. If `input_span_type` is "group", calling this component will
        use spans in the span group with this name.

        Returns:
            The span group name.
        """
        return self._span_group_name

    @span_group_name.setter
    def span_group_name(self, name: str):
        if not name or not isinstance(name, str):
            raise ValueError("Span group name must be a string.")
        self._span_group_name = name

    def add(self, rules: Union[PostprocessingRule, Iterable[PostprocessingRule]]):
        """
        Adds PostprocessingRules to the Postprocessor.

        Rules whose `input_span_type` is still None inherit this component's `input_span_type`. All rules are
        validated before any of them is modified or stored.

        Args:
            rules: A single PostprocessingRule or a collection of PostprocessingRules to add to the Postprocessor.

        Raises:
            TypeError: If any element is not a PostprocessingRule.
        """
        if isinstance(rules, PostprocessingRule):
            rules = [rules]
        else:
            # Materialize once so that one-shot iterables (e.g. generators) are not exhausted by
            # validation before they are stored.
            rules = list(rules)

        for rule in rules:
            if not isinstance(rule, PostprocessingRule):
                raise TypeError(
                    f"Rules must be type PostprocessingRule, not {type(rule)}."
                )
        for rule in rules:
            if rule.input_span_type is None:
                rule.input_span_type = self.input_span_type
        self._rules += rules

    def _current_spans(self, doc: Doc):
        """Returns the span container selected by `input_span_type`, read fresh from `doc`."""
        if self._input_span_type == "ents":
            return doc.ents
        return doc.spans[self._span_group_name]

    def __call__(self, doc: Doc):
        """
        Calls the Postprocessor on a spaCy doc. This will call each PostprocessingRule on the doc.

        Args:
            doc: The Doc to process.

        Returns:
            The processed Doc.
        """
        spans = self._current_spans(doc)

        # Iterate through the entities in reversed order
        for i in range(len(spans) - 1, -1, -1):
            ent = spans[i]
            if self.debug:
                print(ent)

            # Keep track of whether a rule makes a change to the spans
            span_count_before_rule = len(self._current_spans(doc))

            for rule in self.rules:
                rule(ent, i, debug=self.debug)
                # If the number of spans changed, the entity is assumed to have been removed:
                # skip the remaining rules and move on to the next entity.
                if len(self._current_spans(doc)) != span_count_before_rule:
                    break
        return doc

input_span_type property writable

The input source of entities for the component. Must be either "ents" corresponding to doc.ents or "group" for a spaCy span group.

Returns:

Type Description

The input type, "ents" or "group".

rules property

Gets the rules.

Returns:

Type Description
List[PostprocessingRule]

The list of PostprocessingRules available to the Postprocessor.

span_group_name property writable

The name of the span group used by this component. If input_span_type is "group", calling this component will use spans in the span group with this name.

Returns:

Type Description
str

The span group name.

__call__(doc)

Calls the Postprocessor on a spaCy doc. This will call each PostprocessingRule on the doc.

Parameters:

Name Type Description Default
doc Doc

The Doc to process.

required

Returns:

Type Description

The processed Doc.

Source code in medspacy/postprocess/postprocessor.py
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
def __call__(self, doc: Doc):
    """
    Runs every PostprocessingRule of the Postprocessor on the spans of a spaCy doc.

    Args:
        doc: The Doc to process.

    Returns:
        The processed Doc.
    """
    use_ents = self._input_span_type == "ents"

    def count_spans():
        # Read the container from the doc on every call so that changes made by a rule are seen.
        if use_ents:
            return len(doc.ents)
        return len(doc.spans[self.span_group_name])

    spans = doc.ents if use_ents else doc.spans[self._span_group_name]

    # Walk the spans from last to first
    for i in reversed(range(len(spans))):
        ent = spans[i]
        if self.debug:
            print(ent)

        count_before = count_spans()
        for rule in self.rules:
            rule(ent, i, debug=self.debug)
            # A changed span count is taken to mean the entity was removed: go to the next entity
            if count_spans() != count_before:
                break
    return doc

add(rules)

Adds PostprocessingRules to the Postprocessor.

Parameters:

Name Type Description Default
rules Union[PostprocessingRule, Iterable[PostprocessingRule]]

A single PostprocessingRule or a collection of PostprocessingRules to add to the Postprocessor.

required
Source code in medspacy/postprocess/postprocessor.py
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
def add(self, rules: Union[PostprocessingRule, Iterable[PostprocessingRule]]):
    """
    Adds PostprocessingRules to the Postprocessor.

    Rules whose `input_span_type` is still None inherit this component's `input_span_type`. All rules are
    validated before any of them is modified or stored.

    Args:
        rules: A single PostprocessingRule or a collection of PostprocessingRules to add to the Postprocessor.

    Raises:
        TypeError: If any element is not a PostprocessingRule.
    """
    if isinstance(rules, PostprocessingRule):
        rules = [rules]
    else:
        # Materialize once so that one-shot iterables (e.g. generators) are not exhausted by
        # validation before they are stored.
        rules = list(rules)

    for rule in rules:
        if not isinstance(rule, PostprocessingRule):
            raise TypeError(
                f"Rules must be type PostprocessingRule, not {type(rule)}."
            )
    for rule in rules:
        if rule.input_span_type is None:
            rule.input_span_type = self.input_span_type
    self._rules += rules

postprocessing_functions

This module contains some simple functions that can be used as action or condition functions for postprocessing rules.

ent_contains(ent, target, regex=True)

Check if an entity itself contains a target string or pattern. Case-insensitive.

Parameters:

Name Type Description Default
ent Span

The span to check.

required
target Union[str, Iterable[str]]

A string or a collection of strings that will be searched inside ent.

required
regex bool

If the target specified is a regex pattern. Default is True.

True

Returns:

Type Description
bool

Whether the target is contained in the ent.

Source code in medspacy/postprocess/postprocessing_functions.py
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
def ent_contains(
    ent: Span, target: Union[str, Iterable[str]], regex: bool = True
) -> bool:
    """
    Check if an entity itself contains a target. This forwards `ent`, `target` and `regex` to `span_contains`.

    Args:
        ent: The span to check.
        target: A string or a collection of strings that will be searched inside `ent`.
        regex: If the `target` specified is a regex pattern. Default is True.

    Returns:
        Whether the target is contained in the ent.
    """
    found = span_contains(ent, target, regex)
    return found

is_family(span)

Returns whether a span is marked as family.

Parameters:

Name Type Description Default
span Span

The span to check.

required

Returns:

Type Description
bool

Whether the specified span has span._.is_family set to True.

Source code in medspacy/postprocess/postprocessing_functions.py
63
64
65
66
67
68
69
70
71
72
73
def is_family(span: Span) -> bool:
    """
    Reads the family flag of a span.

    Args:
        span: The span to check.

    Returns:
        The value of span._.is_family.
    """
    extensions = span._
    return extensions.is_family

is_followed_by(ent, target, window=1)

Checks if an entity is followed by a target word within a certain window. If any phrases in target are more than one token long, this may not capture it if window is smaller than the number of tokens. Case-insensitive.

Parameters:

Name Type Description Default
ent Span

The span to check.

required
target Union[str, Iterable[str]]

A string or a collection of strings that will be searched for in the text following ent.

required
window int

The number of tokens to search for target following ent. Default is 1.

1

Returns:

Type Description
bool

Whether the entity specified is followed by a target.

Source code in medspacy/postprocess/postprocessing_functions.py
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
def is_followed_by(
    ent: Span, target: Union[str, Iterable[str]], window: int = 1
) -> bool:
    """
    Checks whether a target occurs in the `window` tokens right after an entity. The comparison is a
    case-insensitive substring search on the space-joined token texts, so a multi-token phrase in `target` can only
    be found when `window` covers enough tokens.

    Args:
        ent: The span to check.
        target: A string or a collection of strings that will be searched for in the text following `ent`.
        window: The number of tokens to search for `target` following `ent`. Default is 1.

    Returns:
        Whether the entity specified is followed by a target.
    """
    tail_tokens = ent.doc[ent.end : ent.end + window]
    following_string = " ".join(token.text.lower() for token in tail_tokens)
    # Treat a single string like a one-element collection
    candidates = [target] if isinstance(target, str) else target
    return any(candidate.lower() in following_string for candidate in candidates)

is_historical(span)

Returns whether a span is marked as historical.

Parameters:

Name Type Description Default
span Span

The span to check.

required

Returns:

Type Description
bool

Whether the specified span has span._.is_historical set to True.

Source code in medspacy/postprocess/postprocessing_functions.py
37
38
39
40
41
42
43
44
45
46
47
def is_historical(span: Span) -> bool:
    """
    Reads the historical flag of a span.

    Args:
        span: The span to check.

    Returns:
        The value of span._.is_historical.
    """
    extensions = span._
    return extensions.is_historical

is_hypothetical(span)

Returns whether a span is marked as hypothetical.

Parameters:

Name Type Description Default
span Span

The span to check.

required

Returns:

Type Description
bool

Whether the specified span has span._.is_hypothetical set to True.

Source code in medspacy/postprocess/postprocessing_functions.py
50
51
52
53
54
55
56
57
58
59
60
def is_hypothetical(span: Span) -> bool:
    """
    Reads the hypothetical flag of a span.

    Args:
        span: The span to check.

    Returns:
        The value of span._.is_hypothetical.
    """
    extensions = span._
    return extensions.is_hypothetical

is_modified_by_category(span, category)

Returns whether a span is modified by a ConTextModifier of that type.

Parameters:

Name Type Description Default
span Span

The span to check.

required
category str

The category to check whether a ConTextModifier of that type modifies the span.

required

Returns:

Type Description
bool

Whether the specified span has the specified modifier type.

Source code in medspacy/postprocess/postprocessing_functions.py
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
def is_modified_by_category(span: Span, category: str) -> bool:
    """
    Checks whether any modifier in span._.modifiers has the given category. The comparison ignores case.

    Args:
        span: The span to check.
        category: The category to look for among the modifiers of the span.

    Returns:
        True if at least one modifier's category matches `category`, otherwise False.
    """
    return any(
        modifier.category.upper() == category.upper()
        for modifier in span._.modifiers
    )

is_modified_by_text(span, target, regex=True)

Returns whether a span is modified by a ConTextModifier with the specified text.

Parameters:

Name Type Description Default
span Span

The span to check.

required
target Union[str, Iterable[str]]

The text (a string or a collection of strings) to look for in the spans of the ConTextModifiers that modify the span.

required
regex bool

If the target specified is a regex pattern. Default is True.

True

Returns:

Type Description
bool

Whether the specified span has the specified modifier type.

Source code in medspacy/postprocess/postprocessing_functions.py
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
def is_modified_by_text(
    span: Span, target: Union[str, Iterable[str]], regex: bool = True
) -> bool:
    """
    Checks whether any modifier in span._.modifiers contains the given text. Each modifier's span is tested with
    `span_contains(modifier.span, target, regex)`.

    Args:
        span: The span to check.
        target: The text (a string or a collection of strings) to look for in the spans of the modifiers.
        regex: If the `target` specified is a regex pattern. Default is True.

    Returns:
        True if at least one modifier's span contains the target, otherwise False.
    """
    return any(
        span_contains(modifier.span, target, regex)
        for modifier in span._.modifiers
    )

is_negated(span)

Returns whether a span is marked as negated.

Parameters:

Name Type Description Default
span Span

The span to check.

required

Returns:

Type Description
bool

Whether the specified span has span._.is_negated set to True.

Source code in medspacy/postprocess/postprocessing_functions.py
11
12
13
14
15
16
17
18
19
20
21
def is_negated(span: Span) -> bool:
    """
    Reads the negation flag of a span.

    Args:
        span: The span to check.

    Returns:
        The value of span._.is_negated.
    """
    extensions = span._
    return extensions.is_negated

is_preceded_by(ent, target, window=1)

Checks if an entity is preceded by a target word within a certain window. If any phrases in target are more than one token long, this may not capture it if window is smaller than the number of tokens. Case-insensitive.

Parameters:

Name Type Description Default
ent Span

The span to check.

required
target Union[str, Iterable[str]]

A string or a collection of strings that will be searched for in the text preceding ent.

required
window int

The number of tokens to search for target preceding ent. Default is 1.

1

Returns:

Type Description
bool

Whether the entity specified is preceded by a target.

Source code in medspacy/postprocess/postprocessing_functions.py
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
def is_preceded_by(
    ent: Span, target: Union[str, Iterable[str]], window: int = 1
) -> bool:
    """
    Checks if an entity is preceded by a target word within a certain window. If any phrases in target are more than one
    token long, this may not capture it if window is smaller than the number of tokens. Case-insensitive.

    When fewer than `window` tokens precede `ent`, all tokens from the beginning of the document up to `ent` are
    searched.

    Args:
        ent: The span to check.
        target: A string or a collection of strings that will be searched for in the text preceding `ent`.
        window: The number of tokens to search for `target` preceding `ent`. Default is 1.

    Returns:
        Whether the entity specified is preceded by a target.
    """
    # Clamp at 0: a negative slice start would be counted from the end of the doc and yield an
    # empty window instead of the tokens at the beginning of the doc.
    window_start = max(ent.start - window, 0)
    preceding_span = ent.doc[window_start : ent.start]
    preceding_string = " ".join(token.text.lower() for token in preceding_span)
    if isinstance(target, str):
        return target.lower() in preceding_string
    return any(string.lower() in preceding_string for string in target)

is_uncertain(span)

Returns whether a span is marked as uncertain.

Parameters:

Name Type Description Default
span Span

The span to check.

required

Returns:

Type Description
bool

Whether the specified span has span._.is_uncertain set to True.

Source code in medspacy/postprocess/postprocessing_functions.py
24
25
26
27
28
29
30
31
32
33
34
def is_uncertain(span: Span) -> bool:
    """
    Reports the uncertainty flag stored on a span.

    Args:
        span: The span whose `is_uncertain` extension attribute is read.

    Returns:
        The value of `span._.is_uncertain`.
    """
    uncertain = span._.is_uncertain
    return uncertain

remove_ent(ent, i, input_type='ents', span_group_name='medspacy_spans')

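Remove the entity at position i from its source list: doc.ents when input_type is "ents", or the span group named span_group_name when input_type is "group".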

Parameters:

Name Type Description Default
ent Span

The entity to remove.

required
i int

The index of ent in its source list.

required
input_type Literal['ents', 'group']

The source of the entity, either "ents" or "group".

'ents'
span_group_name str

If input_type is "group", the name of the span group.

'medspacy_spans'
Source code in medspacy/postprocess/postprocessing_functions.py
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
def remove_ent(
    ent: Span,
    i: int,
    input_type: Literal["ents", "group"] = "ents",
    span_group_name: str = "medspacy_spans",
):
    """
    Drops the span at index `i` from the collection that `ent` came from.

    With `input_type` "ents" the doc's entities are rebuilt without position `i`; with "group" the named span group
    is rebuilt the same way. Any other `input_type` leaves the doc untouched.

    Args:
        ent: The entity to remove. Only its parent doc is used; the position is taken from `i`.
        i: The index of `ent` in its source list.
        input_type: The source of the entity, either "ents" or "group".
        span_group_name: If `input_type` is "group", the name of the span group.
    """
    parent_doc = ent.doc
    if input_type == "group":
        # Work on a plain list copy of the group so it can be sliced and concatenated.
        group_spans = list(parent_doc.spans[span_group_name])
        remaining = group_spans[:i] + group_spans[i + 1 :]
        parent_doc.spans[span_group_name] = remaining
        return
    if input_type == "ents":
        before = parent_doc.ents[:i]
        after = parent_doc.ents[i + 1 :]
        parent_doc.ents = before + after

sentence_contains(ent, target, regex=True)

Check if an entity occurs in the same sentence as another span of text.

Parameters:

Name Type Description Default
ent Span

The span to check.

required
target Union[str, Iterable[str]]

A string or a collection of strings that will be searched for in the text of the sentence containing ent.

required
regex

If the target specified is a regex pattern. Default is True.

True
Source code in medspacy/postprocess/postprocessing_functions.py
180
181
182
183
184
185
186
187
188
189
190
def sentence_contains(ent: Span, target: Union[str, Iterable[str]], regex=True) -> bool:
    """
    Tests whether the sentence holding an entity also contains some other text.

    This delegates to `span_contains`, passing the entity's sentence (`ent.sent`) as the span to search.

    Args:
        ent: The span whose enclosing sentence is searched.
        target: A string or a collection of strings that will be searched for in the text of the sentence containing
            `ent`.
        regex: If the `target` specified is a regex pattern. Default is True.

    Returns:
        The result of `span_contains` for the sentence containing `ent`.
    """
    sentence = ent.sent
    return span_contains(sentence, target, regex)

set_family(ent, i, value=True)

Set the value of ent._.is_family to value.

Source code in medspacy/postprocess/postprocessing_functions.py
279
280
281
def set_family(ent, i, value=True):
    """Set the value of ent._.is_family to value.

    Args:
        ent: The entity to modify.
        i: The index of `ent` in its source list. Not used by this function.
        value: The value to assign to `ent._.is_family`. Default is True.

    NOTE(review): PostprocessingRule calls its action as action(ent, i, input_span_type, span_group_name, ...), which
    is one more positional argument than this signature accepts - confirm how this is meant to be used as an action.
    """
    # Previously this assigned to is_hypothetical, leaving is_family untouched.
    ent._.is_family = value

set_historical(ent, i, value=True)

Set the value of ent._.is_historical to value.

Source code in medspacy/postprocess/postprocessing_functions.py
269
270
271
def set_historical(ent, i, value=True):
    """Assign `value` to the entity's `is_historical` extension attribute.

    Args:
        ent: The entity to modify.
        i: The index of `ent` in its source list. Not used by this function.
        value: The value to store in `ent._.is_historical`. Default is True.

    NOTE(review): PostprocessingRule calls its action as action(ent, i, input_span_type, span_group_name, ...), which
    is one more positional argument than this signature accepts - confirm how this is meant to be used as an action.
    """
    extensions = ent._
    extensions.is_historical = value

set_hypothetical(ent, i, value=True)

Set the value of ent._.is_hypothetical to value.

Source code in medspacy/postprocess/postprocessing_functions.py
274
275
276
def set_hypothetical(ent, i, value=True):
    """Assign `value` to the entity's `is_hypothetical` extension attribute.

    Args:
        ent: The entity to modify.
        i: The index of `ent` in its source list. Not used by this function.
        value: The value to store in `ent._.is_hypothetical`. Default is True.

    NOTE(review): PostprocessingRule calls its action as action(ent, i, input_span_type, span_group_name, ...), which
    is one more positional argument than this signature accepts - confirm how this is meant to be used as an action.
    """
    extensions = ent._
    extensions.is_hypothetical = value

set_label(ent, i, input_type='ents', span_group_name='medspacy_spans', **kwargs)

Creates a copy of the entity with a new label.

WARNING: This is not fully safe, as spaCy does not allow modifying the label of a span. Instead, this creates a new copy and attempts to copy existing attributes, but this is not totally reliable.

Parameters:

Name Type Description Default
ent

The entity to MODIFY.

required
i

The index of ent in its source list.

required
input_type Literal['ents', 'group']

The source of the entity, either "ents" or "group".

'ents'
span_group_name str

If input_type is "group", the name of the span group.

'medspacy_spans'
Source code in medspacy/postprocess/postprocessing_functions.py
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
def set_label(
    ent,
    i,
    input_type: Literal["ents", "group"] = "ents",
    span_group_name: str = "medspacy_spans",
    **kwargs
):
    """
    Creates a copy of the entity with a new label.

    WARNING: This is not fully safe, as spaCy does not allow modifying the label of a span. Instead, this creates a new
    copy and attempts to copy existing attributes, but this is not totally reliable.

    Args:
        ent: The entity to MODIFY.
        i: The index of `ent` in its source list.
        input_type: The source of the entity, either "ents" or "group". Any value other than "ents" is treated as
            "group".
        span_group_name: If `input_type` is "group", the name of the span group.
        **kwargs: Must contain the key "label", the label to give the new span.

    Raises:
        KeyError: If "label" is not supplied in `kwargs`.
    """
    from spacy.tokens import Span

    doc = ent.doc
    new_ent = Span(doc, ent.start, ent.end, label=kwargs["label"])
    # Copy any additional attributes
    # NOTE: This may not be complete and should be used with caution
    # NOTE(review): values[0] is the first element of each registered extension tuple - presumably the extension's
    # default rather than the value currently stored on `ent`; confirm this is the intended behavior.
    for (attr, values) in ent._.__dict__["_extensions"].items():
        setattr(new_ent._, attr, values[0])
    if input_type == "ents":
        if len(doc.ents) == 1:
            doc.ents = (new_ent,)
        else:
            doc.ents = doc.ents[:i] + (new_ent,) + doc.ents[i + 1 :]
    else:
        # Materialize the span group as a list (as remove_ent does) so it can be measured, sliced and concatenated.
        # The previous code called len() on the result of `group == 1` and added a tuple to the sliced group, so
        # this branch could not succeed.
        group_spans = list(doc.spans[span_group_name])
        if len(group_spans) == 1:
            doc.spans[span_group_name] = [new_ent]
        else:
            doc.spans[span_group_name] = (
                group_spans[:i] + [new_ent] + group_spans[i + 1 :]
            )

set_negated(ent, i, value=True)

Set the value of ent._.is_negated to value.

Source code in medspacy/postprocess/postprocessing_functions.py
259
260
261
def set_negated(ent, i, value=True):
    """Assign `value` to the entity's `is_negated` extension attribute.

    Args:
        ent: The entity to modify.
        i: The index of `ent` in its source list. Not used by this function.
        value: The value to store in `ent._.is_negated`. Default is True.

    NOTE(review): PostprocessingRule calls its action as action(ent, i, input_span_type, span_group_name, ...), which
    is one more positional argument than this signature accepts - confirm how this is meant to be used as an action.
    """
    extensions = ent._
    extensions.is_negated = value

set_uncertain(ent, i, value=True)

Set the value of ent._.is_uncertain to value.

Source code in medspacy/postprocess/postprocessing_functions.py
264
265
266
def set_uncertain(ent, i, value=True):
    """Assign `value` to the entity's `is_uncertain` extension attribute.

    Args:
        ent: The entity to modify.
        i: The index of `ent` in its source list. Not used by this function.
        value: The value to store in `ent._.is_uncertain`. Default is True.

    NOTE(review): PostprocessingRule calls its action as action(ent, i, input_span_type, span_group_name, ...), which
    is one more positional argument than this signature accepts - confirm how this is meant to be used as an action.
    """
    extensions = ent._
    extensions.is_uncertain = value

postprocessing_pattern

PostprocessingPattern

PostprocessingPatterns are callable functions and equality values wrapped together that will create triggers in the later Postprocessor as part of PostprocessingRules.

Source code in medspacy/postprocess/postprocessing_pattern.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
class PostprocessingPattern:
    """
    Pairs a callable condition with the value that condition is expected to produce. PostprocessingRules in the
    Postprocessor evaluate these patterns to decide whether they are triggered.
    """

    def __init__(self, condition: Callable, success_value: Any = True, **kwargs):
        """
        Builds a pattern describing one condition to test on an entity.

        Args:
            condition: A function to call on an entity. The pattern passes when the function's result equals
                `success_value`.
            success_value: The value `condition(ent)` must return for the pattern to pass. It is compared with `==`,
                so that operator must be defined between the result and this value.
            kwargs: Optional keyword arguments forwarded as `condition(ent, **kwargs)`.
        """
        self.kwargs = kwargs
        self.success_value = success_value
        self.condition = condition

    def __call__(self, ent: Span) -> bool:
        """
        Evaluates the pattern against a span.

        Args:
            ent: the span to process.

        Returns:
            The result of comparing `condition(ent, **kwargs)` to `success_value` with `==`.
        """
        # Forward the stored keyword arguments only when there are any.
        extra_args = self.kwargs if self.kwargs else {}
        outcome = self.condition(ent, **extra_args)
        return outcome == self.success_value
__call__(ent)

Call the PostprocessingPattern on the span specified.

Parameters:

Name Type Description Default
ent Span

the span to process.

required

Returns:

Type Description
bool

Whether calling condition on the entity specified is success_value.

Source code in medspacy/postprocess/postprocessing_pattern.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
def __call__(self, ent: Span) -> bool:
    """
    Evaluates the pattern against a span.

    Args:
        ent: the span to process.

    Returns:
        The result of comparing `condition(ent, **kwargs)` to `success_value` with `==`.
    """
    # Forward the stored keyword arguments only when there are any.
    extra_args = self.kwargs if self.kwargs else {}
    outcome = self.condition(ent, **extra_args)
    return outcome == self.success_value
__init__(condition, success_value=True, **kwargs)

A PostprocessingPattern defines a single condition to check against an entity.

Parameters:

Name Type Description Default
condition Callable

A function to call on an entity. If the result of the function call equals success_value, then the pattern passes.

required
success_value Any

The value which should be returned by condition(ent) in order for the pattern to pass. Must have == defined for condition(ent) == success_value.

True
kwargs

Optional keyword arguments to call with condition(ent, **kwargs).

{}
Source code in medspacy/postprocess/postprocessing_pattern.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
def __init__(self, condition: Callable, success_value: Any = True, **kwargs):
    """
    Builds a pattern describing one condition to test on an entity.

    Args:
        condition: A function to call on an entity. The pattern passes when the function's result equals
            `success_value`.
        success_value: The value `condition(ent)` must return for the pattern to pass. It is compared with `==`,
            so that operator must be defined between the result and this value.
        kwargs: Optional keyword arguments forwarded as `condition(ent, **kwargs)`.
    """
    self.kwargs = kwargs
    self.success_value = success_value
    self.condition = condition

postprocessing_rule

PostprocessingRule

Source code in medspacy/postprocess/postprocessing_rule.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
class PostprocessingRule:
    """
    A rule made of one or more patterns and an action. When every pattern passes for an entity, the action is
    executed on that entity.
    """

    def __init__(
        self,
        patterns: Iterable[PostprocessingPattern],
        action: Callable,
        name: str = None,
        description: str = None,
        span_group_name: str = "medspacy_spans",
        **kwargs,
    ):
        """
        A PostprocessingRule checks conditions of a spaCy Span entity and executes some action if all rules are met.

        Args:
            patterns: A list of PostprocessingPatterns, each of which check a condition of an entity. An element may
                also be a tuple of patterns, in which case at least one pattern in the tuple has to pass.
            action: A function to call with the entity as an argument. This function should take the following
                arguments:
                    ent: The spacy span
                    i: The index of ent
                    input_span_type: "ents" or "group". Describes where to look for spans.
                    span_group_name: The name of the span group used when `input_span_type` is "group".
                    kwargs: Any additional keyword arguments for action.
            name: Optional name of the rule.
            description: Optional description of the rule.
            span_group_name: The span group name passed to `action`. Default is "medspacy_spans".
            kwargs: Optional keyword arguments to send to `action`.
        """
        self.patterns = patterns
        self.action = action
        self.name = name
        self.description = description
        # Left unset here; Postprocessor.add fills it in for rules that still have None.
        self.input_span_type = None
        self.span_group_name = span_group_name
        self.kwargs = kwargs

    def __call__(self, ent, i, debug=False):
        """
        Check every pattern in self.patterns against `ent` and run self.action if all of them pass.

        Args:
            ent: The span to check.
            i: The index of `ent` in its source list; forwarded to the action.
            debug: If True, print a message when every pattern passed.

        Returns:
            False if any pattern does not pass (the action is not executed). True once the action has been executed.
            If the action raises a TypeError, handling is delegated to `_raise_action_error`.
        """
        for pattern in self.patterns:
            # If this is a tuple, at least one has to pass. Results are compared by identity with True/False, so
            # only genuine booleans count as a pass (in a tuple) or a failure (for a single pattern).
            if isinstance(pattern, tuple):
                if not any(subpattern(ent) is True for subpattern in pattern):
                    return False
            # Otherwise just check a single value
            elif pattern(ent) is False:
                return False

        # Every pattern passed - do the action
        if debug:
            print("Passed:", self, "on ent:", ent, ent.sent)

        try:
            if self.kwargs:
                self.action(
                    ent, i, self.input_span_type, self.span_group_name, **self.kwargs
                )
            else:
                self.action(ent, i, self.input_span_type, self.span_group_name)
        except TypeError:
            _raise_action_error(
                self.action,
                (ent, i, self.input_span_type, self.span_group_name, self.kwargs),
            )
        else:
            # Report success as documented; previously the method fell through and returned None.
            return True

    def __repr__(self):
        return f"PostprocessingRule: {self.name} - {self.description}"
__call__(ent, i, debug=False)

Iterate through all the patterns in self.patterns. If any pattern does not pass (i.e., does not return True), return False. If they all pass, execute self.action and return True.

Source code in medspacy/postprocess/postprocessing_rule.py
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
def __call__(self, ent, i, debug=False):
    """
    Check every pattern in self.patterns against `ent` and run self.action if all of them pass.

    Args:
        ent: The span to check.
        i: The index of `ent` in its source list; forwarded to the action.
        debug: If True, print a message when every pattern passed.

    Returns:
        False if any pattern does not pass (the action is not executed). True once the action has been executed.
        If the action raises a TypeError, handling is delegated to `_raise_action_error`.
    """
    for pattern in self.patterns:
        # If this is a tuple, at least one has to pass. Results are compared by identity with True/False, so
        # only genuine booleans count as a pass (in a tuple) or a failure (for a single pattern).
        if isinstance(pattern, tuple):
            if not any(subpattern(ent) is True for subpattern in pattern):
                return False
        # Otherwise just check a single value
        elif pattern(ent) is False:
            return False

    # Every pattern passed - do the action
    if debug:
        print("Passed:", self, "on ent:", ent, ent.sent)

    try:
        if self.kwargs:
            self.action(
                ent, i, self.input_span_type, self.span_group_name, **self.kwargs
            )
        else:
            self.action(ent, i, self.input_span_type, self.span_group_name)
    except TypeError:
        _raise_action_error(
            self.action,
            (ent, i, self.input_span_type, self.span_group_name, self.kwargs),
        )
    else:
        # Report success as documented; previously the method fell through and returned None.
        return True
__init__(patterns, action, name=None, description=None, span_group_name='medspacy_spans', **kwargs)

A PostprocessingRule checks conditions of a spaCy Span entity and executes some action if all rules are met.

Parameters: patterns — A list of PostprocessingPatterns, each of which checks a condition of an entity. action — A function to call with the entity as an argument. This function should take the following arguments: ent (the spaCy span), i (the index of ent), input_span_type ("ents" or "group"; describes where to look for spans), span_group_name (the name of the span group used when input_span_type is "group"), and kwargs (any additional keyword arguments for action). name — Optional name of the rule. description — Optional description of the rule. kwargs — Optional keyword arguments to send to action.

Source code in medspacy/postprocess/postprocessing_rule.py
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
def __init__(
    self,
    patterns: Iterable[PostprocessingPattern],
    action: Callable,
    name: str = None,
    description: str = None,
    span_group_name: str = "medspacy_spans",
    **kwargs,
):
    """
    Stores the patterns to check on a spaCy Span entity and the action to execute when all of them are met.

    Args:
        patterns: A list of PostprocessingPatterns, each of which check a condition of an entity.
        action: A function to call with the entity as an argument. This function should take the following
            arguments:
                ent: The spacy span
                i: The index of ent
                input_span_type: "ents" or "group". Describes where to look for spans.
                span_group_name: The name of the span group used when `input_span_type` is "group".
                kwargs: Any additional keyword arguments for action.
        name: Optional name of the rule.
        description: Optional description of the rule.
        span_group_name: The span group name stored on the rule. Default is "medspacy_spans".
        kwargs: Optional keyword arguments to send to `action`.
    """
    # What to check and what to do.
    self.patterns, self.action = patterns, action
    # Human-readable identification.
    self.name, self.description = name, description
    # The input span type always starts out as None.
    self.input_span_type = None
    self.span_group_name = span_group_name
    self.kwargs = kwargs

postprocessor

Postprocessor

Source code in medspacy/postprocess/postprocessor.py
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
@Language.factory("medspacy_postprocessor")
class Postprocessor:
    """
    Pipeline component that runs a list of PostprocessingRules over the spans of a doc, taken either from doc.ents or
    from a named span group.
    """

    def __init__(
        self,
        nlp: Language,
        name: str = "medspacy_postprocessor",
        rules: Iterable[PostprocessingRule] = None,
        debug: bool = False,
        input_span_type: Literal["ents", "group"] = "ents",
        span_group_name: str = "medspacy_spans",
    ):
        """
        Creates a new Postprocessor.

        Args:
            nlp: The spaCy Language object the component belongs to.
            name: The name of the component. Default is "medspacy_postprocessor".
            rules: Optional PostprocessingRules to add on construction.
            debug: If True, each entity is printed as it is processed and the flag is forwarded to every rule.
            input_span_type: The source of spans, "ents" for doc.ents or "group" for a span group.
            span_group_name: The name of the span group used when `input_span_type` is "group".
        """
        self.nlp = nlp
        self.name = name
        self._rules = []
        self.debug = debug
        self._input_span_type = input_span_type
        self._span_group_name = span_group_name

        if rules:
            self.add(rules)

    @property
    def rules(self) -> List[PostprocessingRule]:
        """
        Gets the rules.

        Returns:
            The list of PostprocessingRules available to the Postprocessor.
        """
        return self._rules

    @property
    def input_span_type(self):
        """
        The input source of entities for the component. Must be either "ents" corresponding to doc.ents or "group" for
        a spaCy span group.

        Returns:
            The input type, "ents" or "group".
        """
        return self._input_span_type

    @input_span_type.setter
    def input_span_type(self, val):
        if not (val == "ents" or val == "group"):
            raise ValueError('input_span_type must be "ents" or "group".')
        self._input_span_type = val

    @property
    def span_group_name(self) -> str:
        """
        The name of the span group used by this component. If `input_span_type` is "group", calling this component will
        use spans in the span group with this name.

        Returns:
            The span group name.
        """
        return self._span_group_name

    @span_group_name.setter
    def span_group_name(self, name: str):
        if not name or not isinstance(name, str):
            raise ValueError("Span group name must be a string.")
        self._span_group_name = name

    def add(self, rules: Union[PostprocessingRule, Iterable[PostprocessingRule]]):
        """
        Adds PostprocessingRules to the Postprocessor.

        Rules whose `input_span_type` is still None inherit this component's `input_span_type`.

        Args:
            rules: A single PostprocessingRule or a collection of PostprocessingRules to add to the Postprocessor.

        Raises:
            TypeError: If any element is not a PostprocessingRule. In that case no rule is added or modified.
        """
        if isinstance(rules, PostprocessingRule):
            rules = [rules]
        else:
            # Materialize once: the collection is traversed more than once below, and a one-shot iterable (e.g. a
            # generator) would already be exhausted by the time it is appended to self._rules.
            rules = list(rules)
        for rule in rules:
            if not isinstance(rule, PostprocessingRule):
                raise TypeError(
                    f"Rules must be type PostprocessingRule, not {type(rule)}."
                )
        for rule in rules:
            if rule.input_span_type is None:
                rule.input_span_type = self.input_span_type
        self._rules += rules

    def __call__(self, doc: Doc):
        """
        Calls the Postprocessor on a spaCy doc. This will call each PostprocessingRule on the doc.

        Args:
            doc: The Doc to process.

        Returns:
            The processed Doc.
        """
        # Iterate through the entities in reversed order
        if self._input_span_type == "ents":
            spans = doc.ents
        else:
            spans = doc.spans[self._span_group_name]

        for i in range(len(spans) - 1, -1, -1):
            ent = spans[i]
            if self.debug:
                print(ent)

            # let's keep track of whether the rule makes a change to spans
            if self._input_span_type == "ents":
                span_count_before_rule = len(doc.ents)
            else:
                span_count_before_rule = len(doc.spans[self.span_group_name])

            for rule in self.rules:
                rule(ent, i, debug=self.debug)
                # Check if the entity was removed based on span counts before and after rule execution
                # if it was, skip to the next entity
                try:
                    if self._input_span_type == "ents":
                        if len(doc.ents) != span_count_before_rule:
                            break
                    else:
                        if len(doc.spans[self.span_group_name]) != span_count_before_rule:
                            break
                except IndexError:
                    break
        return doc
input_span_type property writable

The input source of entities for the component. Must be either "ents" corresponding to doc.ents or "group" for a spaCy span group.

Returns:

Type Description

The input type, "ents" or "group".

rules property

Gets the rules.

Returns:

Type Description
List[PostprocessingRule]

The list of PostprocessingRules available to the Postprocessor.

span_group_name property writable

The name of the span group used by this component. If input_span_type is "group", calling this component will use spans in the span group with this name.

Returns:

Type Description
str

The span group name.

__call__(doc)

Calls the Postprocessor on a spaCy doc. This will call each PostprocessingRule on the doc.

Parameters:

Name Type Description Default
doc Doc

The Doc to process.

required

Returns:

Type Description

The processed Doc.

Source code in medspacy/postprocess/postprocessor.py
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
def __call__(self, doc: Doc):
    """
    Runs every PostprocessingRule of this component over the spans of a spaCy doc.

    Spans are visited from last to first. For each span the rules are applied in order; as soon as the number of
    spans in the source collection differs from what it was before the rules ran, the remaining rules are skipped
    for that span.

    Args:
        doc: The Doc to process.

    Returns:
        The processed Doc.
    """
    from_ents = self._input_span_type == "ents"
    spans = doc.ents if from_ents else doc.spans[self._span_group_name]

    # Walk the spans back to front.
    for i in reversed(range(len(spans))):
        ent = spans[i]
        if self.debug:
            print(ent)

        # Remember how many spans exist so a change made by a rule can be detected.
        if from_ents:
            count_before = len(doc.ents)
        else:
            count_before = len(doc.spans[self.span_group_name])

        for rule in self.rules:
            rule(ent, i, debug=self.debug)
            try:
                if from_ents:
                    count_after = len(doc.ents)
                else:
                    count_after = len(doc.spans[self.span_group_name])
            except IndexError:
                break
            # The span count changed, so move on to the next span.
            if count_after != count_before:
                break
    return doc
add(rules)

Adds PostprocessingRules to the Postprocessor.

Parameters:

Name Type Description Default
rules Union[PostprocessingRule, Iterable[PostprocessingRule]]

A single PostprocessingRule or a collection of PostprocessingRules to add to the Postprocessor.

required
Source code in medspacy/postprocess/postprocessor.py
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
def add(self, rules: Union[PostprocessingRule, Iterable[PostprocessingRule]]):
    """
    Adds PostprocessingRules to the Postprocessor.

    Rules whose `input_span_type` is still None inherit this component's `input_span_type`.

    Args:
        rules: A single PostprocessingRule or a collection of PostprocessingRules to add to the Postprocessor.

    Raises:
        TypeError: If any element is not a PostprocessingRule. In that case no rule is added or modified.
    """
    if isinstance(rules, PostprocessingRule):
        rules = [rules]
    else:
        # Materialize once: the collection is traversed more than once below, and a one-shot iterable (e.g. a
        # generator) would already be exhausted by the time it is appended to self._rules.
        rules = list(rules)
    for rule in rules:
        if not isinstance(rule, PostprocessingRule):
            raise TypeError(
                f"Rules must be type PostprocessingRule, not {type(rule)}."
            )
    for rule in rules:
        if rule.input_span_type is None:
            rule.input_span_type = self.input_span_type
    self._rules += rules

preprocess

PreprocessingRule

This is a rule for handling preprocessing in the medspaCy Preprocessor. This class does not inherit from BaseRule, as it cannot be used in a spaCy pipeline. The Preprocessor and PreprocessingRules are designed to preprocess text before entering a spaCy pipeline to allow for destructive preprocessing, such as stripping or replacing text.

Source code in medspacy/preprocess/preprocessing_rule.py
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
class PreprocessingRule:
    """
    This is a rule for handling preprocessing in the medspaCy Preprocessor. This class does not inherit from BaseRule,
    as it cannot be used in a spaCy pipeline. The Preprocessor and PreprocessingRules are designed to preprocess text
    before entering a spaCy pipeline to allow for destructive preprocessing, such as stripping or replacing text.
    """

    # Keys produced by to_dict / read by from_dict. "pattern" used to be listed twice and "callback" was missing.
    _ALLOWED_KEYS = {"pattern", "repl", "desc", "callback", "flags"}

    def __init__(
        self,
        pattern: str,
        repl: Union[str, Callable[[re.Match], Any]] = "",
        flags: re.RegexFlag = re.IGNORECASE,
        callback: Optional[Callable[[str, re.Match], str]] = None,
        desc: Optional[str] = None,
    ):
        """
        Creates a new PreprocessingRule. Preprocessing rules define spans of text to be removed and optionally
        replaced from the text underneath a doc.

        Args:
            pattern: The text pattern to match and replace in a doc. Must be a string, which will be compiled as
                a regular expression. The patterns will lead to re.Match objects.
            repl: The text to replace a matched string with. By default, repl is an empty string. If repl is a function,
                sends function to re.sub and it will be called on each Match object. More info here
                https://docs.python.org/3/library/re.html#re.sub
            flags: A regex compilation flag. Default is re.IGNORECASE.
            callback: An optional callable which takes the raw text and a Match and returns the new copy of the text,
                rather than just replacing strings for the matched text. This can allow larger text manipulation, such
                as stripping out an entire section based on a header.
            desc: An optional description.
        """
        self.pattern = re.compile(pattern, flags=flags)
        self.repl = repl
        self.callback = callback
        self.desc = desc

    @classmethod
    def from_dict(cls, d: Dict) -> "PreprocessingRule":
        """
        Creates a PreprocessingRule from a dictionary.

        Only "pattern" is required. "repl", "flags", "callback" and "desc" fall back to the same defaults as the
        constructor when they are absent.

        Args:
            d: The dict to read.

        Returns:
            A PreprocessingRule from the dictionary.

        Raises:
            KeyError: If `d` has no "pattern" key.
        """
        return cls(
            d["pattern"],
            repl=d.get("repl", ""),
            flags=d.get("flags", re.IGNORECASE),
            callback=d.get("callback", None),
            desc=d.get("desc", None),
        )

    def to_dict(self):
        """
        Writes a preprocessing rule to a dictionary. Useful for writing all rules to a json later.

        Returns:
            A dictionary containing the PreprocessingRule's data. "flags" holds the flags of the compiled pattern
            and "callback" holds the callable itself (or None).
        """
        d = {
            "pattern": self.pattern.pattern,
            "repl": self.repl,
            "callback": self.callback,
            "desc": self.desc,
            "flags": self.pattern.flags,
        }
        return d

    @classmethod
    def from_json(cls, filepath):
        """
        Read a JSON file containing PreprocessingRule data at the key "preprocessing_rules".

        Args:
            filepath: The filepath of the JSON to read.

        Returns:
            A list of PreprocessingRules from the JSON file.
        """
        import json

        with open(filepath) as f:
            data = json.load(f)
        return [cls.from_dict(rule) for rule in data["preprocessing_rules"]]

    def __call__(self, text):
        """
        Apply the preprocessing rule to a text.

        If `callback` is None, every match of the pattern is replaced with `repl` and the new string is returned.
        Otherwise the pattern is searched once: when nothing matches the text is returned unchanged, and when there
        is a match the result of `callback(text, match)` for that first match is returned.

        Args:
            text: The text to preprocess.

        Returns:
            The preprocessed text.
        """
        # If the rule just has a repl attribute,
        # Just return a simple re.sub
        if self.callback is None:
            return self.pattern.sub(self.repl, text)

        match = self.pattern.search(text)
        if match is None:
            return text
        return self.callback(text, match)

    def __repr__(self):
        return (
            f"PreprocessingRule(pattern={self.pattern.pattern}, flags={self.pattern.flags}, repl={self.repl}, "
            f"callback={self.callback}, desc={self.desc})"
        )

__call__(text)

Apply a preprocessing rule to the text. If the rule's callback attribute is None, the rule returns the string produced by its pattern's sub method. If callback is not None, the callback function is executed with the text and the resulting match as arguments.

Source code in medspacy/preprocess/preprocessing_rule.py
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
def __call__(self, text):
    """
    Apply this preprocessing rule to a text.

    Without a callback, every match of the pattern is replaced with `repl`. With a callback, the pattern is
    searched once and the callback receives the text and the first match; if nothing matches, the text is
    returned unchanged.

    Args:
        text: The text to preprocess.

    Returns:
        The result of the substitution, the callback's return value, or the original text when a callback is
        set but the pattern does not match.
    """
    handler = self.callback
    if handler is not None:
        found = self.pattern.search(text)
        return text if found is None else handler(text, found)

    # No callback configured: plain substitution over all matches
    return self.pattern.sub(self.repl, text)

__init__(pattern, repl='', flags=re.IGNORECASE, callback=None, desc=None)

Creates a new PreprocessingRule. Preprocessing rules define spans of text to be removed and optionally replaced from the text underneath a doc.

Parameters:

Name Type Description Default
pattern str

The text pattern to match and replace in a doc. Must be a string, which will be compiled as a regular expression. The patterns will lead to re.Match objects.

required
repl Union[str, Callable[[Match], Any]]

The text to replace a matched string with. By default, repl is an empty string. If repl is a function, sends function to re.sub and it will be called on each Match object. More info here https://docs.python.org/3/library/re.html#re.sub

''
flags RegexFlag

A regex compilation flag. Default is re.IGNORECASE.

IGNORECASE
callback Optional[Callable[[str, Match], str]]

An optional callable which takes the raw text and a Match and returns the new copy of the text, rather than just replacing strings for the matched text. This can allow larger text manipulation, such as stripping out an entire section based on a header.

None
desc Optional[str]

An optional description.

None
Source code in medspacy/preprocess/preprocessing_rule.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
def __init__(
    self,
    pattern: str,
    repl: Union[str, Callable[[re.Match], Any]] = "",
    flags: re.RegexFlag = re.IGNORECASE,
    callback: Optional[Callable[[str, re.Match], str]] = None,
    desc: Optional[str] = None,
):
    """
    Build a PreprocessingRule: a compiled regular expression plus what to do with its matches.

    Args:
        pattern: Regular expression source; compiled here with `flags`.
        repl: Replacement for matched text, either a string (empty by default) or a function that is handed to
            the pattern's `sub` and called per match. See https://docs.python.org/3/library/re.html#re.sub
        flags: Regex compilation flags. Default is re.IGNORECASE.
        callback: Optional callable taking the raw text and a Match and returning the new text, for edits
            larger than replacing the matched span (e.g. stripping a whole section based on its header).
        desc: An optional description.
    """
    # Compile first so an invalid pattern fails before any attribute is set
    compiled = re.compile(pattern, flags=flags)
    self.pattern = compiled
    self.repl, self.callback, self.desc = repl, callback, desc

from_dict(d) classmethod

Creates a PreprocessingRule from a dictionary.

Parameters:

Name Type Description Default
d Dict

The dict to read.

required

Returns:

Type Description
PreprocessingRule

A PreprocessingRule from the dictionary.

Source code in medspacy/preprocess/preprocessing_rule.py
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
@classmethod
def from_dict(cls, d: Dict) -> PreprocessingRule:
    """
    Creates a PreprocessingRule from a dictionary.

    Only "pattern" is required. "repl", "flags", "callback" and "desc" are optional and fall back to the
    constructor defaults, so dictionaries loaded from JSON (which cannot carry a callable) can be read.

    Args:
        d: The dict to read.

    Returns:
        A PreprocessingRule (or the subclass this is called on) from the dictionary.

    Raises:
        KeyError: If "pattern" is missing.
    """
    # Construct through `cls` so subclasses round-trip to their own type.
    return cls(
        d["pattern"],
        repl=d.get("repl", ""),
        flags=d.get("flags", re.IGNORECASE),
        callback=d.get("callback"),
        desc=d.get("desc"),
    )

from_json(filepath) classmethod

Read a JSON file containing PreprocessingRule data at the key "preprocessing_rules".

Parameters:

Name Type Description Default
filepath

The filepath of the JSON to read.

required

Returns:

Type Description

A list of PreprocessingRules from the JSON file.

Source code in medspacy/preprocess/preprocessing_rule.py
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
@classmethod
def from_json(cls, filepath):
    """
    Read a JSON file containing PreprocessingRule data at the key "preprocessing_rules".

    Args:
        filepath: The filepath of the JSON to read.

    Returns:
        A list of rules, one per entry under "preprocessing_rules", each built with `cls.from_dict`.

    Raises:
        KeyError: If the JSON document has no "preprocessing_rules" key.
    """
    import json

    # JSON is UTF-8 by specification; do not depend on the platform's default locale encoding.
    with open(filepath, encoding="utf-8") as f:
        data = json.load(f)
    # Build through `cls` so that subclasses get instances of their own type.
    return [cls.from_dict(rule) for rule in data["preprocessing_rules"]]

to_dict()

Writes a preprocessing rule to a dictionary. Useful for writing all rules to a json later.

Returns:

Type Description

A dictionary containing the PreprocessingRule's data.

Source code in medspacy/preprocess/preprocessing_rule.py
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
def to_dict(self):
    """
    Export this rule's data as a plain dictionary, e.g. as a step towards writing rules to JSON.

    The callback is included as-is (the callable object or None), and "flags" is the flag value reported by
    the compiled pattern.

    Returns:
        A dictionary with the keys "pattern", "repl", "callback", "desc" and "flags".
    """
    compiled = self.pattern
    return {
        "pattern": compiled.pattern,
        "repl": self.repl,
        "callback": self.callback,
        "desc": self.desc,
        "flags": compiled.flags,
    }

Preprocessor

This is the medspacy Preprocessor class. It is designed as a wrapper for destructive preprocessing rules such as stripping or replacing text in a document before the text enters a spaCy pipeline.

This is NOT a spaCy component and cannot be added to a spaCy pipeline. Please use the preprocessor before calling nlp("your text here"). SpaCy only allows for non-destructive processing on the text, but that is not always advisable for every project, so this enables destructive preprocessing when required.

Source code in medspacy/preprocess/preprocessor.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
class Preprocessor:
    """
    This is the medspacy Preprocessor class. It is designed as a wrapper for destructive preprocessing rules such as
    stripping or replacing text in a document before the text enters a spaCy pipeline.

    This is NOT a spaCy component and cannot be added to a spaCy pipeline. Please use the preprocessor before
    calling `nlp("your text here")`. SpaCy only allows for non-destructive processing on the text, but that is not
    always advisable for every project, so this enables destructive preprocessing when required.
    """

    def __init__(self, tokenizer):
        """
        Creates a new Preprocessor with no rules.

        Args:
            tokenizer: A callable applied to the preprocessed text when the preprocessor is called with
                `tokenize=True` (presumably a spaCy tokenizer such as `nlp.tokenizer` -- confirm with callers).
        """
        self.tokenizer = tokenizer
        self._rules = []

    def add(self, rules: Union[PreprocessingRule, Iterable[PreprocessingRule]]):
        """
        Adds a PreprocessingRule or collection of PreprocessingRules to the Preprocessor.

        All rules are validated before any is added, so a failed call leaves the preprocessor unchanged.

        Args:
            rules: A single PreprocessingRule or a collection of PreprocessingRules to add.

        Raises:
            TypeError: If any element is not a PreprocessingRule.
        """
        if isinstance(rules, PreprocessingRule):
            rules = [rules]
        else:
            # Materialize once: a one-shot iterable (e.g. a generator) would otherwise be exhausted by the
            # validation loop below and nothing would be left to add.
            rules = list(rules)
        for rule in rules:
            if not isinstance(rule, PreprocessingRule):
                raise TypeError(
                    f"Each rule must be an instance of PreprocessingRule, not {type(rule)}."
                )
        self._rules.extend(rules)

    def __call__(self, text, tokenize=True) -> Union[str, Doc]:
        """
        Applies every added rule to the text, in the order the rules were added.

        Args:
            text: The raw text to preprocess.
            tokenize: Whether to pass the preprocessed text through the tokenizer. Default is True.

        Returns:
            The tokenizer's result for the preprocessed text if `tokenize` is true, otherwise the preprocessed
            text itself.
        """
        for rule in self._rules:
            text = rule(text)

        if not tokenize:
            return text

        return self.tokenizer(text)

__call__(text, tokenize=True)

Parameters:

Name Type Description Default
text
required
tokenize
True

Returns:

Source code in medspacy/preprocess/preprocessor.py
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
def __call__(self, text, tokenize=True) -> Union[str, Doc]:
    """
    Run the text through every stored rule in order, then optionally tokenize it.

    Args:
        text: The raw text to preprocess.
        tokenize: Whether to pass the preprocessed text through `self.tokenizer`. Default is True.

    Returns:
        The tokenizer's result for the preprocessed text if `tokenize` is true, otherwise the preprocessed
        text itself.
    """
    processed = text
    for rule in self._rules:
        # Each rule receives the output of the previous one
        processed = rule(processed)

    return self.tokenizer(processed) if tokenize else processed

__init__(tokenizer)

Parameters:

Name Type Description Default
tokenizer
required
Source code in medspacy/preprocess/preprocessor.py
18
19
20
21
22
23
24
25
def __init__(self, tokenizer):
    """
    Set up a Preprocessor that holds the given tokenizer and starts with an empty rule list.

    Args:
        tokenizer: The object stored as `self.tokenizer`.
    """
    self.tokenizer, self._rules = tokenizer, []

add(rules)

Adds a PreprocessingRule or collection of PreprocessingRules to the Preprocessor.

Parameters:

Name Type Description Default
rules Union[PreprocessingRule, Iterable[PreprocessingRule]]

A single PreprocessingRule or a collection of PreprocessingRules to add.

required
Source code in medspacy/preprocess/preprocessor.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
def add(self, rules: Union[PreprocessingRule, Iterable[PreprocessingRule]]):
    """
    Adds a PreprocessingRule or collection of PreprocessingRules to the Preprocessor.

    All rules are validated before any is added, so a failed call leaves the preprocessor unchanged.

    Args:
        rules: A single PreprocessingRule or a collection of PreprocessingRules to add.

    Raises:
        TypeError: If any element is not a PreprocessingRule.
    """
    if isinstance(rules, PreprocessingRule):
        rules = [rules]
    else:
        # Materialize once: a one-shot iterable (e.g. a generator) would otherwise be exhausted by the
        # validation loop below and nothing would be left to add.
        rules = list(rules)
    for rule in rules:
        if not isinstance(rule, PreprocessingRule):
            raise TypeError(
                f"Each rule must be an instance of PreprocessingRule, not {type(rule)}."
            )
    self._rules.extend(rules)

preprocessing_rule

PreprocessingRule

This is a rule for handling preprocessing in the medspaCy Preprocessor. This class does not inherit from BaseRule, as it cannot be used in a spaCy pipeline. The Preprocessor and PreprocessingRules are designed to preprocess text before entering a spaCy pipeline to allow for destructive preprocessing, such as stripping or replacing text.

Source code in medspacy/preprocess/preprocessing_rule.py
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
class PreprocessingRule:
    """
    This is a rule for handling preprocessing in the medspaCy Preprocessor. This class does not inherit from BaseRule,
    as it cannot be used in a spaCy pipeline. The Preprocessor and PreprocessingRules are designed to preprocess text
    before entering a spaCy pipeline to allow for destructive preprocessing, such as stripping or replacing text.
    """

    # Keys produced by `to_dict` and read by `from_dict`.
    _ALLOWED_KEYS = {"pattern", "repl", "callback", "desc", "flags"}

    def __init__(
        self,
        pattern: str,
        repl: Union[str, Callable[[re.Match], Any]] = "",
        flags: re.RegexFlag = re.IGNORECASE,
        callback: Optional[Callable[[str, re.Match], str]] = None,
        desc: Optional[str] = None,
    ):
        """
        Creates a new PreprocessingRule. Preprocessing rules define spans of text to be removed and optionally
        replaced from the text underneath a doc.

        Args:
            pattern: The text pattern to match and replace in a doc. Must be a string, which will be compiled as
                a regular expression. The patterns will lead to re.Match objects.
            repl: The text to replace a matched string with. By default, repl is an empty string. If repl is a function,
                sends function to re.sub and it will be called on each Match object. More info here
                https://docs.python.org/3/library/re.html#re.sub
            flags: A regex compilation flag. Default is re.IGNORECASE.
            callback: An optional callable which takes the raw text and a Match and returns the new copy of the text,
                rather than just replacing strings for the matched text. This can allow larger text manipulation, such
                as stripping out an entire section based on a header.
            desc: An optional description.
        """
        self.pattern = re.compile(pattern, flags=flags)
        self.repl = repl
        self.callback = callback
        self.desc = desc

    @classmethod
    def from_dict(cls, d: Dict) -> PreprocessingRule:
        """
        Creates a PreprocessingRule from a dictionary.

        Only "pattern" is required. "repl", "flags", "callback" and "desc" are optional and fall back to the
        constructor defaults, so dictionaries loaded from JSON (which cannot carry a callable) can be read.

        Args:
            d: The dict to read.

        Returns:
            A PreprocessingRule (or the subclass this is called on) from the dictionary.

        Raises:
            KeyError: If "pattern" is missing.
        """
        # Construct through `cls` so subclasses round-trip to their own type.
        return cls(
            d["pattern"],
            repl=d.get("repl", ""),
            flags=d.get("flags", re.IGNORECASE),
            callback=d.get("callback"),
            desc=d.get("desc"),
        )

    def to_dict(self):
        """
        Writes a preprocessing rule to a dictionary. Useful for writing all rules to a json later.

        The callback is included as-is (the callable object or None), and "flags" is the flag value reported by
        the compiled pattern.

        Returns:
            A dictionary containing the PreprocessingRule's data.
        """
        d = {
            "pattern": self.pattern.pattern,
            "repl": self.repl,
            "callback": self.callback,
            "desc": self.desc,
            "flags": self.pattern.flags,
        }
        return d

    @classmethod
    def from_json(cls, filepath):
        """
        Read a JSON file containing PreprocessingRule data at the key "preprocessing_rules".

        Args:
            filepath: The filepath of the JSON to read.

        Returns:
            A list of PreprocessingRules from the JSON file.

        Raises:
            KeyError: If the JSON document has no "preprocessing_rules" key.
        """
        import json

        # JSON is UTF-8 by specification; do not depend on the platform's default locale encoding.
        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)
        return [cls.from_dict(rule) for rule in data["preprocessing_rules"]]

    def __call__(self, text):
        """
        Apply this preprocessing rule to a text.

        If the callback attribute is None, every match of the pattern is replaced with `repl`. Otherwise the
        pattern is searched once and the callback is called with the text and the first match.

        Args:
            text: The text to preprocess.

        Returns:
            The processed text. When a callback is set but the pattern does not match, the text is returned
            unchanged.
        """
        # With no callback the rule is a simple re.sub
        if self.callback is None:
            return self.pattern.sub(self.repl, text)

        match = self.pattern.search(text)
        if match is None:
            return text
        return self.callback(text, match)

    def __repr__(self):
        return (
            f"PreprocessingRule(pattern={self.pattern.pattern}, flags={self.pattern.flags}, repl={self.repl}, "
            f"callback={self.callback}, desc={self.desc})"
        )
__call__(text)

Apply a preprocessing rule. If the rule's callback attribute is None, the text is returned with every match of the pattern replaced via the pattern's sub method. If callback is not None, the callback function is executed with the text and the first match as arguments; if nothing matches, the text is returned unchanged.

Source code in medspacy/preprocess/preprocessing_rule.py
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
def __call__(self, text):
    """
    Apply this preprocessing rule to a text.

    Without a callback, every match of the pattern is replaced with `repl`. With a callback, the pattern is
    searched once and the callback receives the text and the first match; if nothing matches, the text is
    returned unchanged.

    Args:
        text: The text to preprocess.

    Returns:
        The result of the substitution, the callback's return value, or the original text when a callback is
        set but the pattern does not match.
    """
    handler = self.callback
    if handler is not None:
        found = self.pattern.search(text)
        return text if found is None else handler(text, found)

    # No callback configured: plain substitution over all matches
    return self.pattern.sub(self.repl, text)
__init__(pattern, repl='', flags=re.IGNORECASE, callback=None, desc=None)

Creates a new PreprocessingRule. Preprocessing rules define spans of text to be removed and optionally replaced from the text underneath a doc.

Parameters:

Name Type Description Default
pattern str

The text pattern to match and replace in a doc. Must be a string, which will be compiled as a regular expression. The patterns will lead to re.Match objects.

required
repl Union[str, Callable[[Match], Any]]

The text to replace a matched string with. By default, repl is an empty string. If repl is a function, sends function to re.sub and it will be called on each Match object. More info here https://docs.python.org/3/library/re.html#re.sub

''
flags RegexFlag

A regex compilation flag. Default is re.IGNORECASE.

IGNORECASE
callback Optional[Callable[[str, Match], str]]

An optional callable which takes the raw text and a Match and returns the new copy of the text, rather than just replacing strings for the matched text. This can allow larger text manipulation, such as stripping out an entire section based on a header.

None
desc Optional[str]

An optional description.

None
Source code in medspacy/preprocess/preprocessing_rule.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
def __init__(
    self,
    pattern: str,
    repl: Union[str, Callable[[re.Match], Any]] = "",
    flags: re.RegexFlag = re.IGNORECASE,
    callback: Optional[Callable[[str, re.Match], str]] = None,
    desc: Optional[str] = None,
):
    """
    Build a PreprocessingRule: a compiled regular expression plus what to do with its matches.

    Args:
        pattern: Regular expression source; compiled here with `flags`.
        repl: Replacement for matched text, either a string (empty by default) or a function that is handed to
            the pattern's `sub` and called per match. See https://docs.python.org/3/library/re.html#re.sub
        flags: Regex compilation flags. Default is re.IGNORECASE.
        callback: Optional callable taking the raw text and a Match and returning the new text, for edits
            larger than replacing the matched span (e.g. stripping a whole section based on its header).
        desc: An optional description.
    """
    # Compile first so an invalid pattern fails before any attribute is set
    compiled = re.compile(pattern, flags=flags)
    self.pattern = compiled
    self.repl, self.callback, self.desc = repl, callback, desc
from_dict(d) classmethod

Creates a PreprocessingRule from a dictionary.

Parameters:

Name Type Description Default
d Dict

The dict to read.

required

Returns:

Type Description
PreprocessingRule

A PreprocessingRule from the dictionary.

Source code in medspacy/preprocess/preprocessing_rule.py
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
@classmethod
def from_dict(cls, d: Dict) -> PreprocessingRule:
    """
    Creates a PreprocessingRule from a dictionary.

    Only "pattern" is required. "repl", "flags", "callback" and "desc" are optional and fall back to the
    constructor defaults, so dictionaries loaded from JSON (which cannot carry a callable) can be read.

    Args:
        d: The dict to read.

    Returns:
        A PreprocessingRule (or the subclass this is called on) from the dictionary.

    Raises:
        KeyError: If "pattern" is missing.
    """
    # Construct through `cls` so subclasses round-trip to their own type.
    return cls(
        d["pattern"],
        repl=d.get("repl", ""),
        flags=d.get("flags", re.IGNORECASE),
        callback=d.get("callback"),
        desc=d.get("desc"),
    )
from_json(filepath) classmethod

Read a JSON file containing PreprocessingRule data at the key "preprocessing_rules".

Parameters:

Name Type Description Default
filepath

The filepath of the JSON to read.

required

Returns:

Type Description

A list of PreprocessingRules from the JSON file.

Source code in medspacy/preprocess/preprocessing_rule.py
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
@classmethod
def from_json(cls, filepath):
    """
    Read a JSON file containing PreprocessingRule data at the key "preprocessing_rules".

    Args:
        filepath: The filepath of the JSON to read.

    Returns:
        A list of rules, one per entry under "preprocessing_rules", each built with `cls.from_dict`.

    Raises:
        KeyError: If the JSON document has no "preprocessing_rules" key.
    """
    import json

    # JSON is UTF-8 by specification; do not depend on the platform's default locale encoding.
    with open(filepath, encoding="utf-8") as f:
        data = json.load(f)
    # Build through `cls` so that subclasses get instances of their own type.
    return [cls.from_dict(rule) for rule in data["preprocessing_rules"]]
to_dict()

Writes a preprocessing rule to a dictionary. Useful for writing all rules to a json later.

Returns:

Type Description

A dictionary containing the PreprocessingRule's data.

Source code in medspacy/preprocess/preprocessing_rule.py
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
def to_dict(self):
    """
    Export this rule's data as a plain dictionary, e.g. as a step towards writing rules to JSON.

    The callback is included as-is (the callable object or None), and "flags" is the flag value reported by
    the compiled pattern.

    Returns:
        A dictionary with the keys "pattern", "repl", "callback", "desc" and "flags".
    """
    compiled = self.pattern
    return {
        "pattern": compiled.pattern,
        "repl": self.repl,
        "callback": self.callback,
        "desc": self.desc,
        "flags": compiled.flags,
    }

preprocessor

Preprocessor

This is the medspacy Preprocessor class. It is designed as a wrapper for destructive preprocessing rules such as stripping or replacing text in a document before the text enters a spaCy pipeline.

This is NOT a spaCy component and cannot be added to a spaCy pipeline. Please use the preprocessor before calling nlp("your text here"). SpaCy only allows for non-destructive processing on the text, but that is not always advisable for every project, so this enables destructive preprocessing when required.

Source code in medspacy/preprocess/preprocessor.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
class Preprocessor:
    """
    This is the medspacy Preprocessor class. It is designed as a wrapper for destructive preprocessing rules such as
    stripping or replacing text in a document before the text enters a spaCy pipeline.

    This is NOT a spaCy component and cannot be added to a spaCy pipeline. Please use the preprocessor before
    calling `nlp("your text here")`. SpaCy only allows for non-destructive processing on the text, but that is not
    always advisable for every project, so this enables destructive preprocessing when required.
    """

    def __init__(self, tokenizer):
        """
        Creates a new Preprocessor with no rules.

        Args:
            tokenizer: A callable applied to the preprocessed text when the preprocessor is called with
                `tokenize=True` (presumably a spaCy tokenizer such as `nlp.tokenizer` -- confirm with callers).
        """
        self.tokenizer = tokenizer
        self._rules = []

    def add(self, rules: Union[PreprocessingRule, Iterable[PreprocessingRule]]):
        """
        Adds a PreprocessingRule or collection of PreprocessingRules to the Preprocessor.

        All rules are validated before any is added, so a failed call leaves the preprocessor unchanged.

        Args:
            rules: A single PreprocessingRule or a collection of PreprocessingRules to add.

        Raises:
            TypeError: If any element is not a PreprocessingRule.
        """
        if isinstance(rules, PreprocessingRule):
            rules = [rules]
        else:
            # Materialize once: a one-shot iterable (e.g. a generator) would otherwise be exhausted by the
            # validation loop below and nothing would be left to add.
            rules = list(rules)
        for rule in rules:
            if not isinstance(rule, PreprocessingRule):
                raise TypeError(
                    f"Each rule must be an instance of PreprocessingRule, not {type(rule)}."
                )
        self._rules.extend(rules)

    def __call__(self, text, tokenize=True) -> Union[str, Doc]:
        """
        Applies every added rule to the text, in the order the rules were added.

        Args:
            text: The raw text to preprocess.
            tokenize: Whether to pass the preprocessed text through the tokenizer. Default is True.

        Returns:
            The tokenizer's result for the preprocessed text if `tokenize` is true, otherwise the preprocessed
            text itself.
        """
        for rule in self._rules:
            text = rule(text)

        if not tokenize:
            return text

        return self.tokenizer(text)
__call__(text, tokenize=True)

Parameters:

Name Type Description Default
text
required
tokenize
True

Returns:

Source code in medspacy/preprocess/preprocessor.py
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
def __call__(self, text, tokenize=True) -> Union[str, Doc]:
    """
    Run the text through every stored rule in order, then optionally tokenize it.

    Args:
        text: The raw text to preprocess.
        tokenize: Whether to pass the preprocessed text through `self.tokenizer`. Default is True.

    Returns:
        The tokenizer's result for the preprocessed text if `tokenize` is true, otherwise the preprocessed
        text itself.
    """
    processed = text
    for rule in self._rules:
        # Each rule receives the output of the previous one
        processed = rule(processed)

    return self.tokenizer(processed) if tokenize else processed
__init__(tokenizer)

Parameters:

Name Type Description Default
tokenizer
required
Source code in medspacy/preprocess/preprocessor.py
18
19
20
21
22
23
24
25
def __init__(self, tokenizer):
    """
    Set up a Preprocessor that holds the given tokenizer and starts with an empty rule list.

    Args:
        tokenizer: The object stored as `self.tokenizer`.
    """
    self.tokenizer, self._rules = tokenizer, []
add(rules)

Adds a PreprocessingRule or collection of PreprocessingRules to the Preprocessor.

Parameters:

Name Type Description Default
rules Union[PreprocessingRule, Iterable[PreprocessingRule]]

A single PreprocessingRule or a collection of PreprocessingRules to add.

required
Source code in medspacy/preprocess/preprocessor.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
def add(self, rules: Union[PreprocessingRule, Iterable[PreprocessingRule]]):
    """
    Adds a PreprocessingRule or collection of PreprocessingRules to the Preprocessor.

    All rules are validated before any is added, so a failed call leaves the preprocessor unchanged.

    Args:
        rules: A single PreprocessingRule or a collection of PreprocessingRules to add.

    Raises:
        TypeError: If any element is not a PreprocessingRule.
    """
    if isinstance(rules, PreprocessingRule):
        rules = [rules]
    else:
        # Materialize once: a one-shot iterable (e.g. a generator) would otherwise be exhausted by the
        # validation loop below and nothing would be left to add.
        rules = list(rules)
    for rule in rules:
        if not isinstance(rule, PreprocessingRule):
            raise TypeError(
                f"Each rule must be an instance of PreprocessingRule, not {type(rule)}."
            )
    self._rules.extend(rules)

section_detection

Section

Bases: object

Section is the object that stores the result of processing by the Sectionizer class. A Section contains information describing the section's category, title span, body span, parent, and the rule that created it.

Section category is equivalent to label_ in a basic spaCy entity. It is a normalized name for the section type determined on initialization, either created manually or through the Sectionizer pipeline component.

Section title, defined with title_start, title_end, and title_span represents the section title or header matched with the rule. In the text "Past medical history: stroke and high blood pressure", "Past medical history:" would be the title.

Section body is defined with body_start, body_end, and body_span. It represents the text between the end of the current section's title and the start of the title for the next Section or when scope is set in the rule or by the Sectionizer. In the text "Past medical history: stroke and high blood pressure", "stroke and high blood pressure" would be the body.

Parent is a string that represents the conceptual "parent" section in a section->subsection->subsubsection hierarchy. Candidates are determined by category in the rule and matched at runtime.

Source code in medspacy/section_detection/section.py
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
class Section(object):
    """
    Section is the object that stores the result of processing by the Sectionizer class. A Section contains information
    describing the section's category, title span, body span, parent, and the rule that created it.

    Section `category` is equivalent to `label_` in a basic spaCy entity. It is a normalized name for the section type
    determined on initialization, either created manually or through the Sectionizer pipeline component.

    Section title, defined with `title_start`, `title_end`, and `title_span` represents the section title or header
    matched with the rule. In the text "Past medical history: stroke and high blood pressure", "Past medical history:"
    would be the title.

    Section body is defined with `body_start`, `body_end`, and `body_span`. It represents the text between the end of
    the current section's title and the start of the title for the next Section or when scope is set in the rule or by
    the Sectionizer. In the text "Past medical history: stroke and high blood pressure", "stroke and high blood
    pressure" would be the body.

    Parent is a string that represents the conceptual "parent" section in a section->subsection->subsubsection
    hierarchy. Candidates are determined by category in the rule and matched at runtime.
    """

    def __init__(
        self,
        category: Union[str, None],
        title_start: int,
        title_end: int,
        body_start: int,
        body_end: int,
        parent: Optional[str] = None,
        rule: Optional[SectionRule] = None,
    ):
        """
        Create a new Section object.

        Args:
            category: A normalized name for the section. Equivalent to `label_` for basic spaCy entities.
            title_start: Index of the first token of the section title.
            title_end: Index of the last token of the section title.
            body_start: Index of the first token of the section body.
            body_end: Index of the last token of the section body.
            parent: The category of the parent section.
            rule: The SectionRule that generated the section.
        """
        self.category = category
        self.title_start = title_start
        self.title_end = title_end
        self.body_start = body_start
        self.body_end = body_end
        self.parent = parent
        self.rule = rule

    def __repr__(self):
        # The trailing ")" closes the "Section(" opened at the start of the string.
        return (
            f"Section(category={self.category} at {self.title_start} : {self.title_end} in the doc with a body at "
            f"{self.body_start} : {self.body_end} based on the rule {self.rule})"
        )

    @property
    def title_span(self):
        """
        Gets the span of the section title.

        Returns:
            A tuple (int,int) containing the start and end indexes of the section title.
        """
        return self.title_start, self.title_end

    @property
    def body_span(self):
        """
        Gets the span of the section body.

        Returns:
            A tuple (int,int) containing the start and end indexes of the section body.
        """
        return self.body_start, self.body_end

    @property
    def section_span(self):
        """
        Gets the span of the entire section, from title start to body end.

        Returns:
            A tuple (int,int) containing the start index of the section title and the end index of the section body.
        """
        return self.title_start, self.body_end

    def serialized_representation(self):
        """
        Serialize the Section.

        Returns:
            A dictionary with the section's category, title/body indexes and parent, plus the rule converted
            with its `to_dict` method ("rule" is None when the section has no rule).
        """
        rule = self.rule

        return {
            "category": self.category,
            "title_start": self.title_start,
            "title_end": self.title_end,
            "body_start": self.body_start,
            "body_end": self.body_end,
            "parent": self.parent,
            "rule": rule.to_dict() if rule is not None else None,
        }

    @classmethod
    def from_serialized_representation(cls, serialized_representation: Dict[str, str]):
        """
        Load the section from a json-serialized form.

        Args:
            serialized_representation: The dictionary form of the section object to load, as produced by
                `serialized_representation`.

        Returns:
            A Section object containing the data from the dictionary provided.
        """
        # `serialized_representation` writes "rule": None for rule-less sections, so None (or a missing key)
        # must load back as "no rule" instead of being passed to SectionRule.from_dict.
        rule_data = serialized_representation.get("rule")
        rule = SectionRule.from_dict(rule_data) if rule_data is not None else None
        section = cls(
            **{k: v for k, v in serialized_representation.items() if k != "rule"}
        )
        section.rule = rule

        return section

body_span property

Gets the span of the section body.

Returns:

Type Description

A tuple (int,int) containing the start and end indexes of the section body.

section_span property

Gets the span of the entire section, from title start to body end.

Returns:

Type Description

A tuple (int,int) containing the start index of the section title and the end index of the section body.

title_span property

Gets the span of the section title.

Returns:

Type Description

A tuple (int,int) containing the start and end indexes of the section title.

__init__(category, title_start, title_end, body_start, body_end, parent=None, rule=None)

Create a new Section object.

Parameters:

Name Type Description Default
category Union[str, None]

A normalized name for the section. Equivalent to label_ for basic spaCy entities.

required
title_start int

Index of the first token of the section title.

required
title_end int

Index of the last token of the section title.

required
body_start int

Index of the first token of the section body.

required
body_end int

Index of the last token of the section body.

required
parent Optional[str]

The category of the parent section.

None
rule Optional[SectionRule]

The SectionRule that generated the section.

None
Source code in medspacy/section_detection/section.py
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
def __init__(
    self,
    category: Union[str, None],
    title_start: int,
    title_end: int,
    body_start: int,
    body_end: int,
    parent: Optional[str] = None,
    rule: Optional[SectionRule] = None,
):
    """
    Initialize a Section by storing every argument on the instance unchanged.

    Args:
        category: A normalized name for the section. Equivalent to `label_` for basic spaCy entities.
        title_start: Index of the first token of the section title.
        title_end: Index of the last token of the section title.
        body_start: Index of the first token of the section body.
        body_end: Index of the last token of the section body.
        parent: The category of the parent section, if any.
        rule: The SectionRule that generated the section, if any.
    """
    self.category = category
    # Title and body boundaries are stored as given; no validation is performed here.
    self.title_start, self.title_end = title_start, title_end
    self.body_start, self.body_end = body_start, body_end
    self.parent, self.rule = parent, rule

from_serialized_representation(serialized_representation) classmethod

Load the section from a json-serialized form.

Parameters:

Name Type Description Default
serialized_representation Dict[str, str]

The dictionary form of the section object to load.

required

Returns:

Type Description

A Section object containing the data from the dictionary provided.

Source code in medspacy/section_detection/section.py
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
@classmethod
def from_serialized_representation(cls, serialized_representation: Dict[str, str]):
    """
    Load the section from a json-serialized form.

    This is the inverse of `serialized_representation()`. That method stores `None` under the "rule" key when
    the section has no rule, so a `None` (or absent) "rule" entry is loaded as a section without a rule instead
    of being handed to `SectionRule.from_dict`.

    Args:
        serialized_representation: The dictionary form of the section object to load. Every key other than
            "rule" is passed to the constructor as a keyword argument. The dictionary is not modified.

    Returns:
        A Section object containing the data from the dictionary provided.
    """
    rule_data = serialized_representation.get("rule")
    # Rebuild the rule first so that an invalid rule dictionary fails before the section is constructed.
    rule = SectionRule.from_dict(rule_data) if rule_data is not None else None

    # Use `cls` rather than a hard-coded class so subclasses deserialize to their own type.
    section = cls(
        **{k: v for k, v in serialized_representation.items() if k != "rule"}
    )
    section.rule = rule

    return section

serialized_representation()

Serialize the Section.

Returns:

Type Description

A json-serialized representation of the section.

Source code in medspacy/section_detection/section.py
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
def serialized_representation(self):
    """
    Build a plain-dictionary snapshot of this Section.

    Returns:
        A dictionary with the keys "category", "title_start", "title_end", "body_start", "body_end", "parent"
        and "rule". The "rule" entry is the result of the rule's `to_dict()`, or None when the section has no
        rule.
    """
    rule = self.rule
    plain_fields = (
        "category",
        "title_start",
        "title_end",
        "body_start",
        "body_end",
        "parent",
    )
    serialized = {name: getattr(self, name) for name in plain_fields}
    # The rule is an object, so it is stored in its own dictionary form (or None when absent).
    serialized["rule"] = None if rule is None else rule.to_dict()
    return serialized

SectionRule

Bases: BaseRule

SectionRule defines rules for extracting entities from text using the Sectionizer.

Source code in medspacy/section_detection/section_rule.py
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
class SectionRule(BaseRule):
    """
    SectionRule defines rules for extracting entities from text using the Sectionizer.
    """

    # Keys accepted by `from_dict` and emitted by `to_dict`. `on_match` is deliberately not among them, so it is
    # neither read from nor written to the dictionary form.
    _ALLOWED_KEYS = {
        "literal",
        "pattern",
        "category",
        "metadata",
        "parents",
        "parent_required",
        "max_scope",
    }

    def __init__(
        self,
        literal: str,
        category: str,
        pattern: Optional[Union[List[Dict[str, str]], str]] = None,
        on_match: Optional[
            Callable[[Matcher, Doc, int, List[Tuple[int, int, int]]], Any]
        ] = None,
        max_scope: Optional[int] = None,
        parents: Optional[List[str]] = None,
        parent_required: bool = False,
        metadata: Optional[Dict[Any, Any]] = None,
    ):
        """
        Class for defining rules for identifying sections in text using the Sectionizer.

        Args:
            literal: The string representation of a concept. If `pattern` is None, this string will be lower-cased and
                matched to the lower-case string. If `pattern` is not None, this argument will not be used for matching
                but can be used as a reference as the rule name.
            category: The semantic class of the matched span. This corresponds to the `label_` attribute of an entity.
            pattern: A list or string to use as a spaCy pattern rather than `literal`. If a list, will use spaCy
                token-based pattern matching to match using token attributes. If a string, will use medspaCy's
                RegexMatcher. If None, will use `literal` as the pattern for phrase matching. For more information, see
                https://spacy.io/usage/rule-based-matching.
            on_match: An optional callback function or other callable which takes 4 arguments: `(matcher, doc, i,
                matches)`. For more information, see https://spacy.io/usage/rule-based-matching#on_match
            max_scope: A number of tokens to explicitly limit the size of a section body. If None, the scope will
                include the entire doc up until either the next section header or the end of the doc. This variable can
                also be set at a global level as `Sectionizer(nlp, max_scope=...)`, but if the attribute is set here,
                the rule scope will take precedence. If not None, this will be the number of tokens following the
                matched section header.
                    Example:
                        In the text "Past Medical History: Pt has hx of pneumonia",
                        SectionRule("Past Medical History:", "pmh", max_scope=None) will include the entire doc, but
                        SectionRule("Past Medical History:", "pmh", max_scope=2) will limit the section
                            to be "Past Medical History: Pt has"
                This can be useful for limiting certain sections which are known to be short or allowing others to be
                longer than the regular global max_scope.
            parents: A list of candidate parents for determining subsections
            parent_required: Whether a parent is required for the section to exist in the final output. If true and no
                parent is identified, the section will be removed.
            metadata: Optional dictionary of any extra metadata.

        Raises:
            ValueError: If `parent_required` is true but `parents` is None or empty.
        """
        super().__init__(literal, category, pattern, on_match, metadata)
        self.max_scope = max_scope
        self.parents = parents
        # A required parent can never be satisfied without at least one candidate parent.
        if parent_required and not parents:
            raise ValueError(
                f"Jsonl file incorrectly formatted for pattern name {category}. "
                f"If parents are required, then at least one parent must be specified."
            )
        self.parent_required = parent_required

    @classmethod
    def from_json(cls, filepath) -> List[SectionRule]:
        """
        Read in a list of section rules from a JSON file.

        Args:
            filepath: the .json file containing section rules. Its top-level object must have a "section_rules"
                key holding a list of rule dictionaries.

        Returns:
            section_rules: a list of SectionRule objects, one per entry, each built with `from_dict`
        """
        import json

        # JSON text is UTF-8 by specification, so do not depend on the platform's default encoding.
        with open(filepath, encoding="utf-8") as file:
            section_data = json.load(file)
        # Go through `cls` so that subclasses load instances of their own type.
        return [cls.from_dict(data) for data in section_data["section_rules"]]

    @classmethod
    def from_dict(cls, rule_dict):
        """
        Reads a dictionary into a SectionRule. Used when reading from a json file.

        Args:
            rule_dict: the dictionary to convert. Its keys are passed to the constructor as keyword arguments.

        Returns:
            item: the SectionRule created from the dictionary

        Raises:
            ValueError: If the dictionary contains a key that is not in `_ALLOWED_KEYS`.
        """
        keys = set(rule_dict.keys())
        invalid_keys = keys.difference(cls._ALLOWED_KEYS)
        if invalid_keys:
            msg = (
                f"JSON object contains invalid keys: {invalid_keys}. "
                f"Must be one of: {cls._ALLOWED_KEYS}"
            )
            raise ValueError(msg)
        # Build through `cls` so that subclasses get instances of their own type.
        return cls(**rule_dict)

    def to_dict(self):
        """
        Converts the SectionRule to a python dictionary. Used when writing section rules to a json file.

        Returns:
            rule_dict: the dictionary containing the SectionRule info. Only keys from `_ALLOWED_KEYS` whose value
                is not None are included.
        """
        rule_dict = {}
        # `_ALLOWED_KEYS` is a set; iterate it sorted so the key order of the output is the same on every run.
        for key in sorted(self._ALLOWED_KEYS):
            value = self.__dict__.get(key)
            if value is not None:
                rule_dict[key] = value
        return rule_dict

    def __repr__(self):
        return f"""SectionRule(literal="{self.literal}", category="{self.category}", pattern={self.pattern}, on_match={self.on_match}, parents={self.parents}, parent_required={self.parent_required})"""

__init__(literal, category, pattern=None, on_match=None, max_scope=None, parents=None, parent_required=False, metadata=None)

Class for defining rules for identifying sections in text using the Sectionizer.

Parameters:

Name Type Description Default
literal str

The string representation of a concept. If pattern is None, this string will be lower-cased and matched to the lower-case string. If pattern is not None, this argument will not be used for matching but can be used as a reference as the rule name.

required
category str

The semantic class of the matched span. This corresponds to the label_ attribute of an entity.

required
pattern Optional[Union[List[Dict[str, str]], str]]

A list or string to use as a spaCy pattern rather than literal. If a list, will use spaCy token-based pattern matching to match using token attributes. If a string, will use medspaCy's RegexMatcher. If None, will use literal as the pattern for phrase matching. For more information, see https://spacy.io/usage/rule-based-matching.

None
on_match Optional[Callable[[Matcher, Doc, int, List[Tuple[int, int, int]]], Any]]

An optional callback function or other callable which takes 4 arguments: (matcher, doc, i, matches). For more information, see https://spacy.io/usage/rule-based-matching#on_match

None
max_scope Optional[int]

A number of tokens to explicitly limit the size of a section body. If None, the scope will include the entire doc up until either the next section header or the end of the doc. This variable can also be set at a global level as `Sectionizer(nlp, max_scope=...)`, but if the attribute is set here, the rule scope will take precedence. If not None, this will be the number of tokens following the matched section header. Example: In the text "Past Medical History: Pt has hx of pneumonia", `SectionRule("Past Medical History:", "pmh", max_scope=None)` will include the entire doc, but `SectionRule("Past Medical History:", "pmh", max_scope=2)` will limit the section to be "Past Medical History: Pt has". This can be useful for limiting certain sections which are known to be short or allowing others to be longer than the regular global max_scope.

None
parents Optional[List[str]]

A list of candidate parents for determining subsections

None
parent_required bool

Whether a parent is required for the section to exist in the final output. If true and no parent is identified, the section will be removed.

False
metadata Optional[Dict[Any, Any]]

Optional dictionary of any extra metadata.

None
Source code in medspacy/section_detection/section_rule.py
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
def __init__(
    self,
    literal: str,
    category: str,
    pattern: Optional[Union[List[Dict[str, str]], str]] = None,
    on_match: Optional[
        Callable[[Matcher, Doc, int, List[Tuple[int, int, int]]], Any]
    ] = None,
    max_scope: Optional[int] = None,
    parents: Optional[List[str]] = None,
    parent_required: bool = False,
    metadata: Optional[Dict[Any, Any]] = None,
):
    """
    Create a rule that the Sectionizer uses to identify a section.

    `literal`, `category`, `pattern`, `on_match` and `metadata` are forwarded to the base rule constructor;
    `max_scope`, `parents` and `parent_required` are stored on this instance.

    Args:
        literal: The string representation of a concept. When `pattern` is None, this string is lower-cased and
            matched against the lower-cased text. When `pattern` is given, it is not used for matching but can
            still serve as a reference name for the rule.
        category: The semantic class of the matched span, corresponding to the `label_` attribute of an entity.
        pattern: A list or string to use as a spaCy pattern instead of `literal`. A list uses spaCy token-based
            pattern matching on token attributes; a string uses medspaCy's RegexMatcher; None falls back to
            phrase matching on `literal`. See https://spacy.io/usage/rule-based-matching.
        on_match: Optional callable taking the 4 arguments `(matcher, doc, i, matches)`. See
            https://spacy.io/usage/rule-based-matching#on_match
        max_scope: Number of tokens following the matched section header that the section body is limited to.
            If None, the body extends to the next section header or the end of the doc. It can also be set
            globally as `Sectionizer(nlp, max_scope=...)`, but a value set here takes precedence.
                Example:
                    In the text "Past Medical History: Pt has hx of pneumonia",
                    SectionRule("Past Medical History:", "pmh", max_scope=None) will include the entire doc, but
                    SectionRule("Past Medical History:", "pmh", max_scope=2) will limit the section
                        to be "Past Medical History: Pt has"
            Useful for capping sections known to be short, or for letting others exceed the global max_scope.
        parents: A list of candidate parents for determining subsections.
        parent_required: Whether a parent is required for the section to exist in the final output. If true and
            no parent is identified, the section will be removed.
        metadata: Optional dictionary of any extra metadata.

    Raises:
        ValueError: If `parent_required` is true but `parents` is None or empty.
    """
    super().__init__(literal, category, pattern, on_match, metadata)
    self.max_scope = max_scope
    self.parents = parents
    # Requiring a parent only makes sense when at least one candidate parent was supplied.
    if parent_required and not parents:
        message = (
            f"Jsonl file incorrectly formatted for pattern name {category}. "
            f"If parents are required, then at least one parent must be specified."
        )
        raise ValueError(message)
    self.parent_required = parent_required

from_dict(rule_dict) classmethod

Reads a dictionary into a SectionRule. Used when reading from a json file.

Parameters:

Name Type Description Default
rule_dict

the dictionary to convert

required

Returns:

Name Type Description
item

the SectionRule created from the dictionary

Source code in medspacy/section_detection/section_rule.py
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
@classmethod
def from_dict(cls, rule_dict):
    """
    Reads a dictionary into a SectionRule. Used when reading from a json file.

    Args:
        rule_dict: the dictionary to convert. Its keys are passed to the constructor as keyword arguments.

    Returns:
        item: the SectionRule created from the dictionary

    Raises:
        ValueError: If the dictionary contains a key that is not in `_ALLOWED_KEYS`.
    """
    keys = set(rule_dict.keys())
    invalid_keys = keys.difference(cls._ALLOWED_KEYS)
    if invalid_keys:
        msg = (
            f"JSON object contains invalid keys: {invalid_keys}. "
            f"Must be one of: {cls._ALLOWED_KEYS}"
        )
        raise ValueError(msg)
    # Build through `cls` (not a hard-coded class) so that subclasses get instances of their own type.
    return cls(**rule_dict)

from_json(filepath) classmethod

Read in a list of section rules from a JSON file.

Parameters:

Name Type Description Default
filepath

the .json file containing section rules

required

Returns:

Name Type Description
section_rules List[SectionRule]

a list of SectionRule objects

Source code in medspacy/section_detection/section_rule.py
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
@classmethod
def from_json(cls, filepath) -> List[SectionRule]:
    """
    Read in a list of section rules from a JSON file.

    Args:
        filepath: the .json file containing section rules. Its top-level object must have a "section_rules" key
            holding a list of rule dictionaries.

    Returns:
        section_rules: a list of SectionRule objects, one per entry, each built with `from_dict`
    """
    import json

    # JSON text is UTF-8 by specification, so do not depend on the platform's default encoding.
    with open(filepath, encoding="utf-8") as file:
        section_data = json.load(file)
    # Go through `cls` so that subclasses load instances of their own type.
    return [cls.from_dict(data) for data in section_data["section_rules"]]

to_dict()

Converts the SectionRule to a Python dictionary. Used when writing section rules to a json file.

Returns:

Name Type Description
rule_dict

the dictionary containing the SectionRule info.

Source code in medspacy/section_detection/section_rule.py
123
124
125
126
127
128
129
130
131
132
133
134
135
def to_dict(self):
    """
    Converts the SectionRule to a python dictionary. Used when writing section rules to a json file.

    Returns:
        rule_dict: the dictionary containing the SectionRule info. Only keys from `_ALLOWED_KEYS` whose value is
            not None are included.
    """
    rule_dict = {}
    # `_ALLOWED_KEYS` is a set; iterate it sorted so the key order of the output is the same on every run.
    for key in sorted(self._ALLOWED_KEYS):
        value = self.__dict__.get(key)
        if value is not None:
            rule_dict[key] = value
    return rule_dict

Sectionizer

The Sectionizer will search for spans in the text which match section header rules, such as 'Past Medical History:'. Sections will be represented in custom attributes as: category: A normalized title of the section. Example: 'past_medical_history'. section_title: The Span of the doc which was matched as a section header. Example: 'Past Medical History:'. section_span: The entire section of the note, starting with section_header and up until the end of the section, which will be either the start of the next section header or some pre-specified scope. Example: 'Past Medical History: Type II DM'

Section attributes will be registered for each Doc, Span, and Token in the following attributes: `Doc._.sections`: A list of namedtuples of type Section with 4 elements: - section_title - section_header - section_parent - section_span. A Doc will also have attributes corresponding to lists of each (i.e., `Doc._.section_titles`, `Doc._.section_headers`, `Doc._.section_parents`, `Doc._.section_list`), `(Span|Token)._.section_title`, `(Span|Token)._.section_header`, `(Span|Token)._.section_parent`, `(Span|Token)._.section_span`

Source code in medspacy/section_detection/sectionizer.py
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
@Language.factory("medspacy_sectionizer")
class Sectionizer:
    """
    The Sectionizer will search for spans in the text which match section header rules, such as 'Past Medical History:'.
    Sections will be represented in custom attributes as:
        category: A normalized title of the section. Example: 'past_medical_history'
        section_title: The Span of the doc which was matched as a section header.
            Example: 'Past Medical History:'
        section_span: The entire section of the note, starting with section_header and up until the end
            of the section, which will be either the start of the next section header or some pre-specified
            scope. Example: 'Past Medical History: Type II DM'

    Section attributes will be registered for each Doc, Span, and Token in the following attributes:
        Doc._.sections: A list of namedtuples of type Section with 4 elements:
            - section_title
            - section_header
            - section_parent
            - section_span.
        A Doc will also have attributes corresponding to lists of each
            (ie., Doc._.section_titles, Doc._.section_headers, Doc._.section_parents, Doc._.section_list)
        (Span|Token)._.section_title
        (Span|Token)._.section_header
        (Span|Token)._.section_parent
        (Span|Token)._.section_span
    """

    def __init__(
        self,
        nlp: Language,
        name: str = "medspacy_sectionizer",
        rules: Optional[str] = "default",
        language_code: str = 'en',
        max_section_length: Optional[int] = None,
        phrase_matcher_attr: str = "LOWER",
        require_start_line: bool = False,
        require_end_line: bool = False,
        newline_pattern: str = r"[\n\r]+[\s]*$",
        input_span_type: Union[Literal["ents", "group"], None] = "ents",
        span_group_name: str = "medspacy_spans",
        span_attrs: Union[
            Literal["default"], Dict[str, Dict[str, Any]], None
        ] = "default",
        apply_sentence_boundary: bool = False,
    ):
        """
        Create a new Sectionizer component.

        Args:
            nlp: A SpaCy Language object.
            name: The name of the component.
            rules: The rules to load. Default is "default", loads rules packaged with medspaCy that are derived from
                SecTag, MIMIC-III, and practical refinement at the US Department of Veterans Affairs. If None, no rules
                are loaded. Otherwise, must be a path to a json file containing rules. Add SectionRules directly through
                `Sectionizer.add`.
            language_code: Language code to use (ISO code) as a default for loading resources.  See documentation
                and also the /resources directory to see which resources might be available in each language.
                Default is "en" for English.
            max_section_length: Optional argument specifying the maximum number of tokens following a section header
                which can be included in a section body. This can be useful if you think your section rules are
                incomplete and want to prevent sections from running too long in the note. Default is None, meaning that
                the scope of a section will be until either the next section header or the end of the document.
            phrase_matcher_attr: The token attribute to use for PhraseMatcher for rules where `pattern` is None. Default
                is 'LOWER'.
            require_start_line: Optionally require a section header to start on a new line. Default False.
            require_end_line: Optionally require a section header to end with a new line. Default False.
            newline_pattern: Regular expression to match the new line either preceding or following a header
                if either require_start_line or require_end_line are True. Default is r"[\\n\\r]+[\\s]*$"
            span_attrs: The optional span attributes to modify. Default option "default" uses attributes in
                `DEFAULT_ATTRS`. If a dictionary of custom attributes, format is a dictionary mapping section
                categories to a dictionary containing the attribute name and the value to set the attribute to when a
                span is contained in a section of that category. Custom attributes must be assigned with
                `Span.set_extension` before creating the Sectionizer. If None, sectionizer will not modify span
                attributes.
            input_span_type: "ents" or "group". Where to look for spans when modifying attributes of spans
                contained in a section if `span_attrs` is not None. "ents" will modify attributes of spans in doc.ents.
                "group" will modify attributes of spans in the span group specified by `span_group_name`.
            span_group_name: The name of the span group used when `input_span_type` is "group". Default is
                "medspacy_spans".
            apply_sentence_boundary: Optionally end sentence before and after section header boundary. This ensures
                the section header is considered its own sentence.

        Raises:
            ValueError: If `span_attrs` is a dictionary naming a Span extension that has not been registered.
        """
        self.nlp = nlp
        self.name = name
        self.max_section_length = max_section_length
        self.require_start_line = require_start_line
        self.require_end_line = require_end_line
        self.newline_pattern = re.compile(newline_pattern)
        self.assertion_attributes_mapping = None
        # category -> set of categories that may act as its parent
        self._parent_sections = {}
        # category -> whether a section of that category is dropped when no parent is found
        self._parent_required = {}
        self._input_span_type = input_span_type
        self._span_group_name = span_group_name
        self._apply_sentence_boundary = apply_sentence_boundary

        self.__matcher = MedspacyMatcher(
            nlp, name=name, phrase_matcher_attr=phrase_matcher_attr
        )

        self.DEFAULT_RULES_FILEPATH = path.join(
            Path(__file__).resolve().parents[2],
            "resources",
            language_code.lower(),
            "section_patterns.json",
        )

        # "default" selects the packaged rule file; any other truthy value is used as a path;
        # None (or any falsy value) loads no rules.
        rule_path = self.DEFAULT_RULES_FILEPATH if rules == "default" else rules
        if rule_path:
            self.add(SectionRule.from_json(rule_path))

        if span_attrs == "default":
            self.assertion_attributes_mapping = DEFAULT_ATTRS
            self.register_default_attributes()
        elif span_attrs:
            # Fail early if a custom mapping names an extension that was never registered.
            for attr_dict in span_attrs.values():
                for attr_name in attr_dict:
                    if not Span.has_extension(attr_name):
                        raise ValueError(
                            f"Custom extension {attr_name} has not been set. Please ensure Span.set_extension is "
                            f"called for your pipeline's custom extensions."
                        )
            self.assertion_attributes_mapping = span_attrs

    @property
    def rules(self) -> List[SectionRule]:
        """
        Gets list of rules associated with the Sectionizer.

        Returns:
            The list of SectionRules associated with the Sectionizer.
        """
        return self.__matcher.rules

    @property
    def section_categories(self) -> Set[str]:
        """
        Gets the categories used in the Sectionizer.

        Returns:
            The section categories (the matcher's labels) available to the Sectionizer.
        """
        return self.__matcher.labels

    @property
    def input_span_type(self):
        """
        The input source of entities for the component. Must be either "ents" corresponding to doc.ents or "group" for
        a spaCy span group.

        Returns:
            The input type, "ents" or "group".
        """
        return self._input_span_type

    @input_span_type.setter
    def input_span_type(self, val):
        if not (val == "ents" or val == "group"):
            raise ValueError('input_type must be "ents" or "group".')
        self._input_span_type = val

    @property
    def span_group_name(self) -> str:
        """
        The name of the span group used by this component. If `input_span_type` is "group", calling this component
        will use spans in the span group with this name.

        Returns:
            The span group name.
        """
        return self._span_group_name

    @span_group_name.setter
    def span_group_name(self, name: str):
        if not name or not isinstance(name, str):
            raise ValueError("Span group name must be a string.")
        self._span_group_name = name

    @classmethod
    def register_default_attributes(cls):
        """
        Register the default Span extensions (`is_negated`, `is_uncertain`, `is_historical`, `is_hypothetical`,
        `is_family`) with a default value of False. Extensions that are already registered are left untouched.
        """
        for attr_name in [
            "is_negated",
            "is_uncertain",
            "is_historical",
            "is_hypothetical",
            "is_family",
        ]:
            try:
                Span.set_extension(attr_name, default=False)
            except ValueError:  # Extension already set
                pass

    def add(self, rules):
        """
        Adds SectionRules to the Sectionizer.

        Args:
            rules: A single SectionRule or a collection of SectionRules to add to the Sectionizer.

        Raises:
            TypeError: If any element of `rules` is not a SectionRule.
        """
        if isinstance(rules, SectionRule):
            rules = [rules]
        else:
            # Materialize once: the rules are iterated several times below, which would
            # silently exhaust a generator before it reaches the matcher.
            rules = list(rules)

        for rule in rules:
            if not isinstance(rule, SectionRule):
                raise TypeError(
                    f"Rules must be type SectionRule, not {type(rule)}"
                )

        self.__matcher.add(rules)

        for rule in rules:
            name = rule.category
            parents = rule.parents
            parent_required = rule.parent_required
            if parents:
                if name in self._parent_sections:
                    warnings.warn(
                        f"Duplicate section title {name}. Merging parents. "
                        f"If this is not intended, please specify distinct titles.",
                        RuntimeWarning,
                    )
                    self._parent_sections[name].update(parents)
                else:
                    self._parent_sections[name] = set(parents)

            if (
                name in self._parent_required
                and self._parent_required[name] != parent_required
            ):
                # Conflicting settings for the same category fall back to the permissive option.
                warnings.warn(
                    f"Duplicate section title {name} has different parent_required option. "
                    f"Setting parent_required to False.",
                    RuntimeWarning,
                )
                self._parent_required[name] = False
            else:
                self._parent_required[name] = parent_required

    def set_parent_sections(
        self, sections: List[Tuple[int, int, int]]
    ) -> List[Tuple[int, int, int, int]]:
        """
        Determine the legal parent-child section relationships from the list
        of in-order sections of a document and the possible parents of each
        section as specified during rule creation.

        Args:
            sections: a list of spacy match tuples found in the doc

        Returns:
            A list of tuples (match_id, start, end, parent_idx) where the first three indices are the same as the input
            and the added parent_idx represents the index in the returned list that corresponds to the parent section
            (or None when the section has no parent). Might be a smaller list than the input due to pruning with
            `parent_required`.
        """
        sections_final = []
        removed_sections = 0
        for i, (match_id, start, end) in enumerate(sections):
            name = self.__matcher.rule_map[self.nlp.vocab.strings[match_id]].category
            required = self._parent_required[name]
            i_a = i - removed_sections  # adjusted index for removed values
            if required and i_a == 0:
                # nothing precedes this section, so a required parent can never be found
                removed_sections += 1
                continue
            elif i_a == 0 or name not in self._parent_sections.keys():
                sections_final.append((match_id, start, end, None))
            else:
                parents = self._parent_sections[name]
                identified_parent = None
                for parent in parents:
                    # go backwards through the section "tree" until you hit a root or the start of the list
                    candidate = self.__matcher.rule_map[
                        self.nlp.vocab.strings[sections_final[i_a - 1][0]]
                    ].category
                    candidates_parent_idx = sections_final[i_a - 1][3]
                    if candidates_parent_idx is not None:
                        candidates_parent = self.__matcher.rule_map[
                            self.nlp.vocab.strings[
                                sections_final[candidates_parent_idx][0]
                            ]
                        ].category
                    else:
                        candidates_parent = None
                    candidate_i = i_a - 1
                    while candidate:
                        if candidate == parent:
                            identified_parent = candidate_i
                            candidate = None
                        else:
                            # if you are at the end of the list... no parent
                            if candidate_i < 1:
                                candidate = None
                                continue
                            # if the current candidate has no parent... no parent exists
                            if not candidates_parent:
                                candidate = None
                                continue
                            # otherwise get the previous item in the list
                            temp = self.__matcher.rule_map[
                                self.nlp.vocab.strings[
                                    sections_final[candidate_i - 1][0]
                                ]
                            ].category
                            temp_parent_idx = sections_final[candidate_i - 1][3]
                            if temp_parent_idx is not None:
                                temp_parent = self.__matcher.rule_map[
                                    self.nlp.vocab.strings[
                                        sections_final[temp_parent_idx][0]
                                    ]
                                ].category
                            else:
                                temp_parent = None
                            # if the previous item is the parent of the current item
                            # OR if the previous item is a sibling of the current item
                            # continue to search
                            if (
                                temp == candidates_parent
                                or temp_parent == candidates_parent
                            ):
                                candidate = temp
                                candidates_parent = temp_parent
                                candidate_i -= 1
                            # otherwise, there is no further tree traversal
                            else:
                                candidate = None

                # if the parent is identified, add section
                # if the parent is not required, add section
                # if parent is not identified and required, do not add the section
                if identified_parent is not None or not required:
                    sections_final.append((match_id, start, end, identified_parent))
                else:
                    removed_sections += 1
        return sections_final

    def set_assertion_attributes(self, spans: Iterable[Span]):
        """
        Add Span-level attributes to entities based on which section they occur in.

        For every span whose `span._.section` is truthy and whose section category is a key of
        `self.assertion_attributes_mapping`, each mapped attribute is assigned on `span._`.

        Args:
            spans: the spans to modify.
        """
        for span in spans:
            if (
                span._.section
                and span._.section.category in self.assertion_attributes_mapping
            ):
                attr_dict = self.assertion_attributes_mapping[span._.section.category]
                for (attr_name, attr_value) in attr_dict.items():
                    setattr(span._, attr_name, attr_value)

    def __call__(self, doc: Doc) -> Doc:
        """
        Call the Sectionizer on a spaCy doc. Sectionizer will identify sections using provided rules, then evaluate any
        section hierarchy as needed, create section spans, and modify attributes on existing spans based on the sections
        the spans are in.

        Args:
            doc: The Doc to process.

        Returns:
            The processed spaCy Doc.
        """
        matches = self.__matcher(doc)
        if self.require_start_line:
            matches = self.filter_start_lines(doc, matches)
        if self.require_end_line:
            matches = self.filter_end_lines(doc, matches)
        if self._parent_sections:
            matches = self.set_parent_sections(matches)

        # If this has already been processed by the sectionizer, reset the sections
        doc._.sections = []
        # if there were no matches, return the doc as one section
        if len(matches) == 0:
            doc._.sections.append(Section(None, 0, 0, 0, len(doc)))
            return doc

        section_list = []
        # if the first match does not begin at token 0, handle the first section
        first_match = matches[0]
        if first_match[1] != 0:
            section_list.append(Section(None, 0, 0, 0, first_match[1]))

        # parent_idx values index into `matches`, while `section_list` may carry one extra
        # leading preamble section; this offset translates between the two.
        parent_offset = len(section_list)
        last_idx = len(matches) - 1

        # handle section spans
        for i, match in enumerate(matches):
            parent = None
            if len(match) == 4:
                (match_id, start, end, parent_idx) = match
                if parent_idx is not None:
                    parent = section_list[parent_idx + parent_offset]
            else:
                # IDEs will warn here about match shape disagreeing w/ type hinting, but this branch is only used if
                # parent sections were never set, so parent_idx does not exist
                (match_id, start, end) = match

            # Make section header its own sentence
            if self._apply_sentence_boundary:
                # Section headers should be considered the start of a sentence
                doc[start].sent_start = True
                # Text following the header should also be considered a new sentence
                if end < len(doc):
                    doc[end].sent_start = True

            rule = self.__matcher.rule_map[self.nlp.vocab.strings[match_id]]
            category = rule.category

            # The body can extend at most to the next header, or to the end of the doc for the last match
            body_limit = len(doc) if i == last_idx else matches[i + 1][1]
            # A rule-level max_scope takes precedence over the component-wide max_section_length
            scope = rule.max_scope if rule.max_scope is not None else self.max_section_length
            body_end = body_limit if scope is None else min(end + scope, body_limit)

            section_list.append(
                Section(category, start, end, end, body_end, parent, rule)
            )

        for section in section_list:
            doc._.sections.append(section)
            start, end = section.section_span
            for token in doc[start:end]:
                token._.section = section

        # If it is specified to add assertion attributes,
        # iterate through the entities in doc and add them
        if self.assertion_attributes_mapping:
            if self._input_span_type.lower() == "ents":
                self.set_assertion_attributes(doc.ents)
            elif self._input_span_type.lower() == "group":
                self.set_assertion_attributes(doc.spans[self._span_group_name])

        return doc

    def filter_start_lines(
        self, doc: Doc, matches: List[Tuple[int, int, int]]
    ) -> List[Tuple[int, int, int]]:
        """
        Filter a list of matches to only contain spans where the start token is the beginning of a new line.

        Args:
            doc: The Doc the matches were found in.
            matches: Match tuples (match_id, start, end) to filter.

        Returns:
            A list of match tuples (match_id, start, end) that meet the filter criteria.
        """
        return [
            m for m in matches if util.is_start_line(m[1], doc, self.newline_pattern)
        ]

    def filter_end_lines(
        self, doc: Doc, matches: List[Tuple[int, int, int]]
    ) -> List[Tuple[int, int, int]]:
        """
        Filter a list of matches to only contain spans where the end token is followed by a new line.

        Args:
            doc: The Doc the matches were found in.
            matches: Match tuples (match_id, start, end) to filter.

        Returns:
            A list of match tuples (match_id, start, end) that meet the filter criteria.
        """
        return [
            # m[2] is exclusive, so m[2] - 1 is the last token of the header
            m for m in matches if util.is_end_line(m[2] - 1, doc, self.newline_pattern)
        ]

input_span_type property writable

The input source of entities for the component. Must be either "ents" corresponding to doc.ents or "group" for a spaCy span group.

Returns:

Type Description

The input type, "ents" or "group".

rules property

Gets list of rules associated with the Sectionizer.

Returns:

Type Description
List[SectionRule]

The list of SectionRules associated with the Sectionizer.

section_categories property

Gets a list of categories used in the Sectionizer.

Returns:

Type Description
Set[str]

The list of all section categories available to the Sectionizer.

span_group_name property writable

The name of the span group used by this component. If input_span_type is "group", calling this component will use spans in the span group with this name.

Returns:

Type Description
str

The span group name.

__call__(doc)

Call the Sectionizer on a spaCy doc. Sectionizer will identify sections using provided rules, then evaluate any section hierarchy as needed, create section spans, and modify attributes on existing spans based on the sections the spans are in.

Parameters:

Name Type Description Default
doc Doc

The Doc to process.

required

Returns:

Type Description
Doc

The processed spaCy Doc.

Source code in medspacy/section_detection/sectionizer.py
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
def __call__(self, doc: Doc) -> Doc:
    """
    Call the Sectionizer on a spaCy doc. Sectionizer will identify sections using provided rules, then evaluate any
    section hierarchy as needed, create section spans, and modify attributes on existing spans based on the sections
    the spans are in.

    Args:
        doc: The Doc to process.

    Returns:
        The processed spaCy Doc.
    """
    matches = self.__matcher(doc)
    if self.require_start_line:
        matches = self.filter_start_lines(doc, matches)
    if self.require_end_line:
        matches = self.filter_end_lines(doc, matches)
    if self._parent_sections:
        matches = self.set_parent_sections(matches)

    # If this has already been processed by the sectionizer, reset the sections
    doc._.sections = []
    # if there were no matches, return the doc as one section
    if len(matches) == 0:
        doc._.sections.append(Section(None, 0, 0, 0, len(doc)))
        return doc

    section_list = []
    # if the first match does not begin at token 0, handle the first section
    first_match = matches[0]
    if first_match[1] != 0:
        section_list.append(Section(None, 0, 0, 0, first_match[1]))

    # parent_idx values index into `matches`, while `section_list` may carry one extra
    # leading preamble section; this offset translates between the two.
    parent_offset = len(section_list)
    last_idx = len(matches) - 1

    # handle section spans
    for i, match in enumerate(matches):
        parent = None
        if len(match) == 4:
            (match_id, start, end, parent_idx) = match
            if parent_idx is not None:
                parent = section_list[parent_idx + parent_offset]
        else:
            # IDEs will warn here about match shape disagreeing w/ type hinting, but this branch is only used if
            # parent sections were never set, so parent_idx does not exist
            (match_id, start, end) = match

        # Make section header its own sentence
        if self._apply_sentence_boundary:
            # Section headers should be considered the start of a sentence
            doc[start].sent_start = True
            # Text following the header should also be considered a new sentence
            if end < len(doc):
                doc[end].sent_start = True

        rule = self.__matcher.rule_map[self.nlp.vocab.strings[match_id]]
        category = rule.category

        # The body can extend at most to the next header, or to the end of the doc for the last match
        body_limit = len(doc) if i == last_idx else matches[i + 1][1]
        # A rule-level max_scope takes precedence over the component-wide max_section_length
        scope = rule.max_scope if rule.max_scope is not None else self.max_section_length
        body_end = body_limit if scope is None else min(end + scope, body_limit)

        section_list.append(
            Section(category, start, end, end, body_end, parent, rule)
        )

    for section in section_list:
        doc._.sections.append(section)
        start, end = section.section_span
        for token in doc[start:end]:
            token._.section = section

    # If it is specified to add assertion attributes,
    # iterate through the entities in doc and add them
    if self.assertion_attributes_mapping:
        if self._input_span_type.lower() == "ents":
            self.set_assertion_attributes(doc.ents)
        elif self._input_span_type.lower() == "group":
            self.set_assertion_attributes(doc.spans[self._span_group_name])

    return doc

__init__(nlp, name='medspacy_sectionizer', rules='default', language_code='en', max_section_length=None, phrase_matcher_attr='LOWER', require_start_line=False, require_end_line=False, newline_pattern='[\\n\\r]+[\\s]*$', input_span_type='ents', span_group_name='medspacy_spans', span_attrs='default', apply_sentence_boundary=False)

   Create a new Sectionizer component.

   Args:
       nlp: A SpaCy Language object.
       name: The name of the component.
       rules: The rules to load. Default is "default", loads rules packaged with medspaCy that are derived from
           SecTag, MIMIC-III, and practical refinement at the US Department of Veterans Affairs. If None, no rules
           are loaded. Otherwise, must be a path to a json file containing rules. Add SectionRules directly through
           `Sectionizer.add`.
       language_code: Language code to use (ISO code) as a default for loading resources.  See documentation
           and also the /resources directory to see which resources might be available in each language.
           Default is "en" for English.
       max_section_length: Optional argument specifying the maximum number of tokens following a section header
           which can be included in a section body. This can be useful if you think your section rules are
           incomplete and want to prevent sections from running too long in the note. Default is None, meaning that
           the scope of a section will be until either the next section header or the end of the document.
       phrase_matcher_attr: The token attribute to use for PhraseMatcher for rules where `pattern` is None. Default
           is 'LOWER'.
       require_start_line: Optionally require a section header to start on a new line. Default False.
       require_end_line: Optionally require a section header to end with a new line. Default False.
       newline_pattern: Regular expression to match the new line either preceding or following a header
           if either require_start_line or require_end_line are True. Default is r"[\n\r]+[\s]*$"
       span_attrs: The optional span attributes to modify. Default option "default" uses attributes in
           DEFAULT_ATTRIBUTES. If a dictionary of custom attributes, format is a dictionary mapping section
           categories to a dictionary containing the attribute name and the value to set the attribute to when a
           span is contained in a section of that category. Custom attributes must be assigned with
           Span.set_extension before creating the Sectionizer. If None, sectionizer will not modify span
           attributes.
       input_span_type: "ents" or "group". Where to look for spans when modifying attributes of spans
           contained in a section if span_attrs is not None. "ents" will modify attributes of spans in doc.ents.
           "group" will modify attributes of spans in the span group specified by span_group_name.
       span_group_name: The name of the span group used when input_span_type is "group". Default is
           "medspacy_spans".
       apply_sentence_boundary: Optionally end sentence before and after section header boundary. This ensures
           the section header is considered its own sentence.

Source code in medspacy/section_detection/sectionizer.py
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
def __init__(
    self,
    nlp: Language,
    name: str = "medspacy_sectionizer",
    rules: Optional[str] = "default",
    language_code: str = 'en',
    max_section_length: Optional[int] = None,
    phrase_matcher_attr: str = "LOWER",
    require_start_line: bool = False,
    require_end_line: bool = False,
    newline_pattern: str = r"[\n\r]+[\s]*$",
    input_span_type: Union[Literal["ents", "group"], None] = "ents",
    span_group_name: str = "medspacy_spans",
    span_attrs: Union[
        Literal["default"], Dict[str, Dict[str, Any]], None
    ] = "default",
    apply_sentence_boundary: bool = False,
):
    """
    Build a Sectionizer: store its configuration, create its matcher, load rules and set up span attributes.

    Args:
        nlp: A spaCy Language object.
        name: The name of the component. Also passed to the underlying MedspacyMatcher.
        rules: Which rules to load. "default" loads the `section_patterns.json` file packaged under
            `resources/<language_code>`. None (or any other falsy value) loads nothing. Any other value is
            treated as the path of a json file of rules. Further SectionRules can be added later through
            `Sectionizer.add`.
        language_code: Language code (ISO code), lower-cased and used as the resource sub-directory when the
            default rules are located. Default is "en" for English.
        max_section_length: Optional maximum number of tokens following a section header that may be included
            in a section body. Default None means a section runs until the next section header or the end of
            the document.
        phrase_matcher_attr: The token attribute the PhraseMatcher uses for rules where `pattern` is None.
            Default is 'LOWER'.
        require_start_line: Optionally require a section header to start on a new line. Default False.
        require_end_line: Optionally require a section header to end with a new line. Default False.
        newline_pattern: Regular expression (compiled here) used to match the new line preceding or following
            a header when `require_start_line` or `require_end_line` is True. Default is r"[\n\r]+[\s]*$"
        input_span_type: "ents" or "group". Where to look for spans whose attributes are modified when
            `span_attrs` is not None: "ents" uses doc.ents, "group" uses the span group named by
            `span_group_name`.
        span_group_name: The name of the span group used when `input_span_type` is "group". Default is
            "medspacy_spans".
        span_attrs: The span attributes to modify. "default" uses `DEFAULT_ATTRS` and registers the default
            extensions. A dictionary must map section categories to a dictionary of attribute name -> value
            to set on spans contained in a section of that category; every attribute must already have been
            registered with `Span.set_extension`. If None, span attributes are not modified.
        apply_sentence_boundary: Optionally end sentence before and after section header boundary, so that the
            section header is considered its own sentence.

    Raises:
        ValueError: If a custom `span_attrs` dictionary names a Span extension that has not been registered.
    """
    # Configuration taken directly from the arguments.
    self.nlp = nlp
    self.name = name
    self.max_section_length = max_section_length
    self.require_start_line = require_start_line
    self.require_end_line = require_end_line
    self.newline_pattern = re.compile(newline_pattern)
    self._input_span_type = input_span_type
    self._span_group_name = span_group_name
    self._apply_sentence_boundary = apply_sentence_boundary

    # State that is filled in further down (attribute mapping) or by `add` (parent bookkeeping).
    self.assertion_attributes_mapping = None
    self._parent_sections = {}
    self._parent_required = {}

    self.__matcher = MedspacyMatcher(
        nlp, name=name, phrase_matcher_attr=phrase_matcher_attr
    )

    # Packaged rules live in <root>/resources/<language>/section_patterns.json.
    package_root = Path(__file__).resolve().parents[2]
    self.DEFAULT_RULES_FILEPATH = path.join(
        package_root, "resources", language_code.lower(), "section_patterns.json"
    )

    # "default" resolves to the packaged file; anything else is used as given (None loads nothing).
    rule_path = self.DEFAULT_RULES_FILEPATH if rules == "default" else rules
    if rule_path:
        self.add(SectionRule.from_json(rule_path))

    if span_attrs == "default":
        self.assertion_attributes_mapping = DEFAULT_ATTRS
        self.register_default_attributes()
    elif span_attrs:
        # Custom mapping: refuse attribute names that spaCy does not know as Span extensions yet.
        for category_attrs in span_attrs.values():
            for attr_name in category_attrs:
                if Span.has_extension(attr_name):
                    continue
                raise ValueError(
                    f"Custom extension {attr_name} has not been set. Please ensure Span.set_extension is "
                    f"called for your pipeline's custom extensions."
                )
        self.assertion_attributes_mapping = span_attrs

add(rules)

Adds SectionRules to the Sectionizer.

Parameters:

Name Type Description Default
rules

A single SectionRule or a collection of SectionRules to add to the Sectionizer.

required
Source code in medspacy/section_detection/sectionizer.py
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
def add(self, rules):
    """
    Adds SectionRules to the Sectionizer.

    Every rule is handed to the underlying matcher, and the parent bookkeeping (`_parent_sections`,
    `_parent_required`) is updated per rule category.

    Args:
        rules: A single SectionRule or a collection of SectionRules to add to the Sectionizer.

    Raises:
        TypeError: If any element of `rules` is not a SectionRule.
    """
    if isinstance(rules, SectionRule):
        rules = [rules]
    else:
        # `rules` is walked several times below; materialize it so a one-shot iterator (e.g. a generator)
        # is not exhausted by the validation pass before the rules are actually registered.
        rules = list(rules)

    for rule in rules:
        if not isinstance(rule, SectionRule):
            raise TypeError(f"Rules must be type SectionRule, not {type(rule)}")

    self.__matcher.add(rules)

    for rule in rules:
        name = rule.category
        parents = rule.parents
        parent_required = rule.parent_required
        if parents:
            if name in self._parent_sections:
                warnings.warn(
                    f"Duplicate section title {name}. Merging parents. "
                    f"If this is not intended, please specify distinct titles.",
                    RuntimeWarning,
                )
                self._parent_sections[name].update(parents)
            else:
                self._parent_sections[name] = set(parents)

        if (
            name in self._parent_required
            and self._parent_required[name] != parent_required
        ):
            # Conflicting settings for the same category: fall back to the permissive option.
            warnings.warn(
                f"Duplicate section title {name} has different parent_required option. "
                f"Setting parent_required to False.",
                RuntimeWarning,
            )
            self._parent_required[name] = False
        else:
            self._parent_required[name] = parent_required

filter_end_lines(doc, matches)

Filter a list of matches to only contain spans where the end token is followed by a new line.

Returns:

Type Description
List[Tuple[int, int, int]]

A list of match tuples (match_id, start, end) that meet the filter criteria.

Source code in medspacy/section_detection/sectionizer.py
503
504
505
506
507
508
509
510
511
512
513
514
def filter_end_lines(
    self, doc: Doc, matches: List[Tuple[int, int, int]]
) -> List[Tuple[int, int, int]]:
    """
    Keep only the matches whose last token passes the end-of-line check.

    The check is delegated to `util.is_end_line`, called with the index of the final token of the match
    (`end - 1`), the doc and this component's compiled `newline_pattern`.

    Args:
        doc: The Doc the matches were found in.
        matches: Match tuples (match_id, start, end) to filter.

    Returns:
        A list of match tuples (match_id, start, end) that meet the filter criteria.
    """
    kept = []
    for match in matches:
        last_token_idx = match[2] - 1
        if util.is_end_line(last_token_idx, doc, self.newline_pattern):
            kept.append(match)
    return kept

filter_start_lines(doc, matches)

Filter a list of matches to only contain spans where the start token is the beginning of a new line.

Returns:

Type Description
List[Tuple[int, int, int]]

A list of match tuples (match_id, start, end) that meet the filter criteria.

Source code in medspacy/section_detection/sectionizer.py
490
491
492
493
494
495
496
497
498
499
500
501
def filter_start_lines(
    self, doc: Doc, matches: List[Tuple[int, int, int]]
) -> List[Tuple[int, int, int]]:
    """
    Keep only the matches whose first token passes the start-of-line check.

    The check is delegated to `util.is_start_line`, called with the index of the first token of the match,
    the doc and this component's compiled `newline_pattern`.

    Args:
        doc: The Doc the matches were found in.
        matches: Match tuples (match_id, start, end) to filter.

    Returns:
        A list of match tuples (match_id, start, end) that meet the filter criteria.
    """
    kept = []
    for match in matches:
        first_token_idx = match[1]
        if util.is_start_line(first_token_idx, doc, self.newline_pattern):
            kept.append(match)
    return kept

register_default_attributes() classmethod

Register the default values for the Span attributes defined in DEFAULT_ATTRIBUTES.

Source code in medspacy/section_detection/sectionizer.py
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
@classmethod
def register_default_attributes(cls):
    """
    Make sure the default assertion attributes exist as Span extensions.

    Each of the five attribute names below is registered with a default value of False. Names for which
    `Span.set_extension` raises ValueError are left untouched.
    """
    default_flags = (
        "is_negated",
        "is_uncertain",
        "is_historical",
        "is_hypothetical",
        "is_family",
    )
    for flag in default_flags:
        try:
            Span.set_extension(flag, default=False)
        except ValueError:
            # Extension already set; keep the existing registration and move on.
            continue

set_assertion_attributes(spans)

Add Span-level attributes to entities based on which section they occur in.

Parameters:

Name Type Description Default
spans Iterable[Span]

the spans to modify.

required
Source code in medspacy/section_detection/sectionizer.py
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
def set_assertion_attributes(self, spans: Iterable[Span]):
    """
    Apply the section-based attribute values to each span.

    For every span that has a section whose category is a key of `self.assertion_attributes_mapping`, each
    attribute name/value pair configured for that category is written onto `span._`. Spans without a
    section, or in a category that is not mapped, are left unchanged.

    Args:
        spans: the spans to modify.
    """
    for span in spans:
        section = span._.section
        if not section:
            continue
        category = section.category
        if category not in self.assertion_attributes_mapping:
            continue
        for attr_name, attr_value in self.assertion_attributes_mapping[category].items():
            setattr(span._, attr_name, attr_value)

set_parent_sections(sections)

Determine the legal parent-child section relationships from the list of in-order sections of a document and the possible parents of each section as specified in the section rules.

Parameters:

Name Type Description Default
sections List[Tuple[int, int, int]]

a list of spacy match tuples found in the doc

required

Returns:

Type Description
List[Tuple[int, int, int, int]]

A list of tuples (match_id, start, end, parent_idx) where the first three indices are the same as the input and the added parent_idx represents the index in the list that corresponds to the parent section. Might be a smaller list than the input due to pruning with parent_required.

Source code in medspacy/section_detection/sectionizer.py
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
def set_parent_sections(
    self, sections: List[Tuple[int, int, int]]
) -> List[Tuple[int, int, int, int]]:
    """
    Determine the legal parent-child section relationships from the list
    of in-order sections of a document and the possible parents of each
    section as recorded in `self._parent_sections`.

    A section whose category has `parent_required` set is dropped when no parent can be identified for it
    (including when it would be the first section in the output).

    Args:
        sections: a list of spacy match tuples (match_id, start, end) found in the doc, in document order

    Returns:
        A list of tuples (match_id, start, end, parent_idx) where the first three indices are the same as the input
        and the added parent_idx represents the index in the returned list that corresponds to the parent section,
        or None when the section has no parent. Might be a smaller list than the input due to pruning with
        `parent_required`.
    """
    sections_final = []
    removed_sections = 0
    for i, (match_id, start, end) in enumerate(sections):
        # Category of the rule that produced this match.
        name = self.__matcher.rule_map[self.nlp.vocab.strings[match_id]].category
        required = self._parent_required[name]
        # Every earlier iteration either appended to sections_final or incremented removed_sections,
        # so i_a is the index this section would take in sections_final.
        i_a = i - removed_sections  # adjusted index for removed values
        if required and i_a == 0:
            # Nothing precedes this section, so a required parent cannot exist: drop it.
            removed_sections += 1
            continue
        elif i_a == 0 or name not in self._parent_sections.keys():
            # First kept section, or a category with no configured parents: it is a root.
            sections_final.append((match_id, start, end, None))
        else:
            parents = self._parent_sections[name]
            identified_parent = None
            # NOTE(review): there is no break once a parent is identified; if more than one allowed parent
            # is found, the last one iterated determines identified_parent.
            for parent in parents:
                # go backwards through the section "tree" until you hit a root or the start of the list
                # Start from the section kept immediately before this one.
                candidate = self.__matcher.rule_map[
                    self.nlp.vocab.strings[sections_final[i_a - 1][0]]
                ].category
                candidates_parent_idx = sections_final[i_a - 1][3]
                if candidates_parent_idx is not None:
                    candidates_parent = self.__matcher.rule_map[
                        self.nlp.vocab.strings[
                            sections_final[candidates_parent_idx][0]
                        ]
                    ].category
                else:
                    candidates_parent = None
                candidate_i = i_a - 1
                # Setting candidate to None is how every branch below ends the walk.
                while candidate:
                    if candidate == parent:
                        # Found an allowed parent: remember its index in sections_final.
                        identified_parent = candidate_i
                        candidate = None
                    else:
                        # if you are at the end of the list... no parent
                        if candidate_i < 1:
                            candidate = None
                            continue
                        # if the current candidate has no parent... no parent exists
                        if not candidates_parent:
                            candidate = None
                            continue
                        # otherwise get the previous item in the list
                        temp = self.__matcher.rule_map[
                            self.nlp.vocab.strings[
                                sections_final[candidate_i - 1][0]
                            ]
                        ].category
                        temp_parent_idx = sections_final[candidate_i - 1][3]
                        if temp_parent_idx is not None:
                            temp_parent = self.__matcher.rule_map[
                                self.nlp.vocab.strings[
                                    sections_final[temp_parent_idx][0]
                                ]
                            ].category
                        else:
                            temp_parent = None
                        # if the previous item is the parent of the current item
                        # OR if the previous item is a sibling of the current item
                        # continue to search
                        if (
                            temp == candidates_parent
                            or temp_parent == candidates_parent
                        ):
                            candidate = temp
                            candidates_parent = temp_parent
                            candidate_i -= 1
                        # otherwise, there is no further tree traversal
                        else:
                            candidate = None

            # keep the section unless its parent is required but was not found
            if identified_parent is not None or not required:
                # if the parent is identified, add section
                # if the parent is not required, add section
                # if parent is not identified and required, do not add the section
                sections_final.append((match_id, start, end, identified_parent))
            else:
                removed_sections += 1
    return sections_final

section

Section

Bases: object

Section is the object that stores the result of processing by the Sectionizer class. A Section contains information describing the section's category, title span, body span, parent, and the rule that created it.

Section category is equivalent to label_ in a basic spaCy entity. It is a normalized name for the section type determined on initialization, either created manually or through the Sectionizer pipeline component.

Section title, defined with title_start, title_end, and title_span represents the section title or header matched with the rule. In the text "Past medical history: stroke and high blood pressure", "Past medical history:" would be the title.

Section body is defined with body_start, body_end, and body_span. It represents the text between the end of the current section's title and the start of the title for the next Section or when scope is set in the rule or by the Sectionizer. In the text "Past medical history: stroke and high blood pressure", "stroke and high blood pressure" would be the body.

Parent is a string that represents the conceptual "parent" section in a section->subsection->subsubsection hierarchy. Candidates are determined by category in the rule and matched at runtime.

Source code in medspacy/section_detection/section.py
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
class Section(object):
    """
    Section is the object that stores the result of processing by the Sectionizer class. A Section contains information
    describing the section's category, title span, body span, parent, and the rule that created it.

    Section `category` is equivalent to `label_` in a basic spaCy entity. It is a normalized name for the section type
    determined on initialization, either created manually or through the Sectionizer pipeline component.

    Section title, defined with `title_start`, `title_end`, and `title_span` represents the section title or header
    matched with the rule. In the text "Past medical history: stroke and high blood pressure", "Past medical history:"
    would be the title.

    Section body is defined with `body_start`, `body_end`, and `body_span`. It represents the text between the end of
    the current section's title and the start of the title for the next Section or when scope is set in the rule or by
    the Sectionizer. In the text "Past medical history: stroke and high blood pressure", "stroke and high blood
    pressure" would be the body.

    Parent is a string that represents the conceptual "parent" section in a section->subsection->subsubsection
    hierarchy. Candidates are determined by category in the rule and matched at runtime.
    """

    def __init__(
        self,
        category: Union[str, None],
        title_start: int,
        title_end: int,
        body_start: int,
        body_end: int,
        parent: Optional[str] = None,
        rule: Optional[SectionRule] = None,
    ):
        """
        Create a new Section object.

        Args:
            category: A normalized name for the section. Equivalent to `label_` for basic spaCy entities.
            title_start: Index of the first token of the section title.
            title_end: Index of the last token of the section title.
            body_start: Index of the first token of the section body.
            body_end: Index of the last token of the section body.
            parent: The category of the parent section.
            rule: The SectionRule that generated the section.
        """
        self.category = category
        self.title_start = title_start
        self.title_end = title_end
        self.body_start = body_start
        self.body_end = body_end
        self.parent = parent
        self.rule = rule

    def __repr__(self):
        """Return a readable one-line description of the section."""
        return (
            f"Section(category={self.category} at {self.title_start} : {self.title_end} in the doc with a body at "
            f"{self.body_start} : {self.body_end} based on the rule {self.rule})"
        )

    @property
    def title_span(self):
        """
        Gets the span of the section title.

        Returns:
            A tuple (int,int) containing the start and end indexes of the section title.
        """
        return self.title_start, self.title_end

    @property
    def body_span(self):
        """
        Gets the span of the section body.

        Returns:
            A tuple (int,int) containing the start and end indexes of the section body.
        """
        return self.body_start, self.body_end

    @property
    def section_span(self):
        """
        Gets the span of the entire section, from title start to body end.

        Returns:
            A tuple (int,int) containing the start index of the section title and the end index of the section body.
        """
        return self.title_start, self.body_end

    def serialized_representation(self):
        """
        Serialize the Section.

        Returns:
            A dictionary with the section's category, title/body indexes and parent, plus the rule converted
            with `to_dict()` under "rule" (None when the section has no rule).
        """
        rule = self.rule

        return {
            "category": self.category,
            "title_start": self.title_start,
            "title_end": self.title_end,
            "body_start": self.body_start,
            "body_end": self.body_end,
            "parent": self.parent,
            "rule": rule.to_dict() if rule is not None else None,
        }

    @classmethod
    def from_serialized_representation(cls, serialized_representation: Dict[str, str]):
        """
        Load the section from its serialized (dictionary) form.

        Args:
            serialized_representation: The dictionary form of the section object to load, as produced by
                `serialized_representation()`.

        Returns:
            A Section object (an instance of `cls`) containing the data from the dictionary provided.
        """
        # `serialized_representation()` stores None under "rule" for a section without a rule; keep that as
        # None instead of handing it to SectionRule.from_dict, so serialization round-trips.
        rule_data = serialized_representation.get("rule")
        rule = SectionRule.from_dict(rule_data) if rule_data is not None else None

        section = cls(
            **{k: v for k, v in serialized_representation.items() if k != "rule"}
        )
        section.rule = rule

        return section
body_span property

Gets the span of the section body.

Returns:

Type Description

A tuple (int,int) containing the start and end indexes of the section body.

section_span property

Gets the span of the entire section, from title start to body end.

Returns:

Type Description

A tuple (int,int) containing the start index of the section title and the end index of the section body.

title_span property

Gets the span of the section title.

Returns:

Type Description

A tuple (int,int) containing the start and end indexes of the section title.

__init__(category, title_start, title_end, body_start, body_end, parent=None, rule=None)

Create a new Section object.

Parameters:

Name Type Description Default
category Union[str, None]

A normalized name for the section. Equivalent to label_ for basic spaCy entities.

required
title_start int

Index of the first token of the section title.

required
title_end int

Index of the last token of the section title.

required
body_start int

Index of the first token of the section body.

required
body_end int

Index of the last token of the section body.

required
parent Optional[str]

The category of the parent section.

None
rule Optional[SectionRule]

The SectionRule that generated the section.

None
Source code in medspacy/section_detection/section.py
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
def __init__(
    self,
    category: Union[str, None],
    title_start: int,
    title_end: int,
    body_start: int,
    body_end: int,
    parent: Optional[str] = None,
    rule: Optional[SectionRule] = None,
):
    """
    Store the values that describe one section.

    Args:
        category: A normalized name for the section. Equivalent to `label_` for basic spaCy entities.
        title_start: Index of the first token of the section title.
        title_end: Index of the last token of the section title.
        body_start: Index of the first token of the section body.
        body_end: Index of the last token of the section body.
        parent: The category of the parent section.
        rule: The SectionRule that generated the section.
    """
    # Identity of the section and where it came from.
    self.category, self.parent, self.rule = category, parent, rule
    # Token boundaries of the header and of the body.
    self.title_start, self.title_end = title_start, title_end
    self.body_start, self.body_end = body_start, body_end
from_serialized_representation(serialized_representation) classmethod

Load the section from a json-serialized form.

Parameters:

Name Type Description Default
serialized_representation Dict[str, str]

The dictionary form of the section object to load.

required

Returns:

Type Description

A Section object containing the data from the dictionary provided.

Source code in medspacy/section_detection/section.py
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
@classmethod
def from_serialized_representation(cls, serialized_representation: Dict[str, str]):
    """
    Load the section from its serialized (dictionary) form.

    Args:
        serialized_representation: The dictionary form of the section object to load. Every key other than
            "rule" is passed to the constructor as a keyword argument.

    Returns:
        A Section object (an instance of `cls`) containing the data from the dictionary provided.
    """
    # A section serialized without a rule carries None under "rule"; keep it as None instead of handing
    # None to SectionRule.from_dict.
    rule_data = serialized_representation.get("rule")
    rule = SectionRule.from_dict(rule_data) if rule_data is not None else None

    section = cls(
        **{k: v for k, v in serialized_representation.items() if k != "rule"}
    )
    section.rule = rule

    return section
serialized_representation()

Serialize the Section.

Returns:

Type Description

A json-serialized representation of the section.

Source code in medspacy/section_detection/section.py
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
def serialized_representation(self):
    """
    Convert the Section into a plain dictionary.

    Returns:
        A dictionary holding the category, the title and body indexes, the parent and, under "rule", the
        result of the rule's `to_dict()` (None when the section has no rule).
    """
    rule = self.rule
    plain_fields = (
        "category",
        "title_start",
        "title_end",
        "body_start",
        "body_end",
        "parent",
    )
    payload = {field: getattr(self, field) for field in plain_fields}
    # The rule is the only nested object; a missing rule is stored as None.
    payload["rule"] = None if rule is None else rule.to_dict()
    return payload

section_rule

SectionRule

Bases: BaseRule

SectionRule defines rules for extracting entities from text using the Sectionizer.

Source code in medspacy/section_detection/section_rule.py
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
class SectionRule(BaseRule):
    """
    SectionRule defines a rule the Sectionizer uses to match section headers in text.
    """

    # Keys accepted by `from_dict` and written by `to_dict`. `on_match` is deliberately absent:
    # a callable cannot be represented in a JSON rule file.
    _ALLOWED_KEYS = {
        "literal",
        "pattern",
        "category",
        "metadata",
        "parents",
        "parent_required",
        "max_scope",
    }

    def __init__(
        self,
        literal: str,
        category: str,
        pattern: Optional[Union[List[Dict[str, str]], str]] = None,
        on_match: Optional[
            Callable[[Matcher, Doc, int, List[Tuple[int, int, int]]], Any]
        ] = None,
        max_scope: Optional[int] = None,
        parents: Optional[List[str]] = None,
        parent_required: bool = False,
        metadata: Optional[Dict[Any, Any]] = None,
    ):
        """
        Class for defining rules for matching section headers in text using the Sectionizer.

        Args:
            literal: The string representation of a concept. If `pattern` is None, this string will be lower-cased and
                matched to the lower-case string. If `pattern` is not None, this argument will not be used for matching
                but can be used as a reference as the rule name.
            category: The normalized name of the section that the matched header introduces.
            pattern: A list or string to use as a spaCy pattern rather than `literal`. If a list, will use spaCy
                token-based pattern matching to match using token attributes. If a string, will use medspaCy's
                RegexMatcher. If None, will use `literal` as the pattern for phrase matching. For more information, see
                https://spacy.io/usage/rule-based-matching.
            on_match: An optional callback function or other callable which takes 4 arguments: `(matcher, doc, i,
                matches)`. For more information, see https://spacy.io/usage/rule-based-matching#on_match
            max_scope: A number of tokens to explicitly limit the size of a section body. If None, the scope will
                include the entire doc up until either the next section header or the end of the doc. This variable can
                also be set at a global level as `Sectionizer(nlp, max_scope=...)`, but if the attribute is set here,
                the rule scope will take precedence. If not None, this will be the number of tokens following the
                matched section header.
                    Example:
                        In the text "Past Medical History: Pt has hx of pneumonia",
                        SectionRule("Past Medical History:", "pmh", max_scope=None) will include the entire doc, but
                        SectionRule("Past Medical History:", "pmh", max_scope=2) will limit the section
                            to be "Past Medical History: Pt has"
                This can be useful for limiting certain sections which are known to be short or allowing others to be
                longer than the regular global max_scope.
            parents: A list of candidate parents for determining subsections.
            parent_required: Whether a parent is required for the section to exist in the final output. If true and no
                parent is identified, the section will be removed.
            metadata: Optional dictionary of any extra metadata.

        Raises:
            ValueError: If `parent_required` is true but `parents` is empty or None.
        """
        super().__init__(literal, category, pattern, on_match, metadata)
        self.max_scope = max_scope
        self.parents = parents
        # A required parent can never be found if no candidate parents were named.
        if parent_required and not parents:
            raise ValueError(
                f"Jsonl file incorrectly formatted for pattern name {category}. "
                f"If parents are required, then at least one parent must be specified."
            )
        self.parent_required = parent_required

    @classmethod
    def from_json(cls, filepath) -> List[SectionRule]:
        """
        Read in a list of section rules from a JSON file.

        Args:
            filepath: the .json file containing section rules under a top-level "section_rules" key

        Returns:
            section_rules: a list of rule objects of the class this method was called on
        """
        import json

        # JSON is UTF-8 by specification; do not depend on the platform's locale encoding,
        # which corrupts non-ASCII patterns on systems whose default is not UTF-8.
        with open(filepath, encoding="utf-8") as file:
            section_data = json.load(file)
        # `cls` (not a hard-coded SectionRule) so subclasses load instances of themselves.
        return [cls.from_dict(data) for data in section_data["section_rules"]]

    @classmethod
    def from_dict(cls, rule_dict):
        """
        Reads a dictionary into a single SectionRule. Used when reading from a json file.

        Args:
            rule_dict: the dictionary to convert; its keys must all be in `_ALLOWED_KEYS`

        Returns:
            item: the rule created from the dictionary

        Raises:
            ValueError: If the dictionary contains a key that is not in `_ALLOWED_KEYS`.
        """
        keys = set(rule_dict.keys())
        invalid_keys = keys.difference(cls._ALLOWED_KEYS)
        if invalid_keys:
            msg = (
                f"JSON object contains invalid keys: {invalid_keys}. "
                f"Must be one of: {cls._ALLOWED_KEYS}"
            )
            raise ValueError(msg)
        # `cls` (not a hard-coded SectionRule) so subclasses build instances of themselves.
        return cls(**rule_dict)

    def to_dict(self):
        """
        Converts the SectionRule to a python dictionary. Used when writing section rules to a json file.

        Returns:
            rule_dict: the dictionary containing every `_ALLOWED_KEYS` attribute whose value is not None.
        """
        rule_dict = {}
        for key in self._ALLOWED_KEYS:
            value = self.__dict__.get(key)
            if value is not None:
                rule_dict[key] = value
        return rule_dict

    def __repr__(self):
        return f"""SectionRule(literal="{self.literal}", category="{self.category}", pattern={self.pattern}, on_match={self.on_match}, parents={self.parents}, parent_required={self.parent_required})"""
__init__(literal, category, pattern=None, on_match=None, max_scope=None, parents=None, parent_required=False, metadata=None)

Class for defining rules for matching section headers in text using the Sectionizer.

Parameters:

Name Type Description Default
literal str

The string representation of a concept. If pattern is None, this string will be lower-cased and matched to the lower-case string. If pattern is not None, this argument will not be used for matching but can be used as a reference as the rule name.

required
category str

The semantic class of the matched span. This corresponds to the label_ attribute of an entity.

required
pattern Optional[Union[List[Dict[str, str]], str]]

A list or string to use as a spaCy pattern rather than literal. If a list, will use spaCy token-based pattern matching to match using token attributes. If a string, will use medspaCy's RegexMatcher. If None, will use literal as the pattern for phrase matching. For more information, see https://spacy.io/usage/rule-based-matching.

None
on_match Optional[Callable[[Matcher, Doc, int, List[Tuple[int, int, int]]], Any]]

An optional callback function or other callable which takes 4 arguments: (matcher, doc, i, matches). For more information, see https://spacy.io/usage/rule-based-matching#on_match

None
max_scope Optional[int]

A number of tokens to explicitly limit the size of a section body. If None, the scope will include the entire doc up until either the next section header or the end of the doc. This variable can also be set at a global level as `Sectionizer(nlp, max_scope=...)`, but if the attribute is set here, the rule scope will take precedence. If not None, this will be the number of tokens following the matched section header. Example: In the text "Past Medical History: Pt has hx of pneumonia", SectionRule("Past Medical History:", "pmh", max_scope=None) will include the entire doc, but SectionRule("Past Medical History:", "pmh", max_scope=2) will limit the section to be "Past Medical History: Pt has". This can be useful for limiting certain sections which are known to be short or allowing others to be longer than the regular global max_scope.

None
parents Optional[List[str]]

A list of candidate parents for determining subsections

None
parent_required bool

Whether a parent is required for the section to exist in the final output. If true and no parent is identified, the section will be removed.

False
metadata Optional[Dict[Any, Any]]

Optional dictionary of any extra metadata.

None
Source code in medspacy/section_detection/section_rule.py
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
def __init__(
    self,
    literal: str,
    category: str,
    pattern: Optional[Union[List[Dict[str, str]], str]] = None,
    on_match: Optional[
        Callable[[Matcher, Doc, int, List[Tuple[int, int, int]]], Any]
    ] = None,
    max_scope: Optional[int] = None,
    parents: Optional[List[str]] = None,
    parent_required: bool = False,
    metadata: Optional[Dict[Any, Any]] = None,
):
    """
    Create a rule describing one section header and how its section is scoped and nested.

    Args:
        literal: The string representation of a concept. When `pattern` is None it is lower-cased and matched
            against the lower-cased text; otherwise it is not used for matching and only serves as a readable
            name for the rule.
        category: The normalized name of the section that the matched header introduces.
        pattern: A spaCy pattern to use instead of `literal`. A list is matched with spaCy token-based
            matching, a string with medspaCy's RegexMatcher, and None falls back to phrase matching on
            `literal`. See https://spacy.io/usage/rule-based-matching.
        on_match: Optional callable taking `(matcher, doc, i, matches)`. See
            https://spacy.io/usage/rule-based-matching#on_match
        max_scope: Number of tokens after the matched header that the section body may cover. None means
            the body runs until the next section header or the end of the doc. A value set here takes
            precedence over the global `Sectionizer(nlp, max_scope=...)` setting.
                Example:
                    In the text "Past Medical History: Pt has hx of pneumonia",
                    SectionRule("Past Medical History:", "pmh", max_scope=None) will include the entire doc, but
                    SectionRule("Past Medical History:", "pmh", max_scope=2) will limit the section
                        to be "Past Medical History: Pt has"
            Useful for sections known to be short, or to let a section run longer than the global limit.
        parents: Candidate parent categories used to decide whether this section is a subsection.
        parent_required: When true, the section is dropped from the final output unless one of `parents` is
            identified.
        metadata: Optional dictionary of any extra metadata.

    Raises:
        ValueError: If `parent_required` is true but `parents` is empty or None.
    """
    super().__init__(literal, category, pattern, on_match, metadata)
    self.max_scope, self.parents = max_scope, parents

    # Requiring a parent without naming any candidate could never be satisfied.
    if parent_required and not parents:
        raise ValueError(
            f"Jsonl file incorrectly formatted for pattern name {category}. "
            f"If parents are required, then at least one parent must be specified."
        )

    self.parent_required = parent_required
from_dict(rule_dict) classmethod

Reads a dictionary into a single SectionRule. Used when reading from a json file.

Parameters:

Name Type Description Default
rule_dict

the dictionary to convert

required

Returns:

Name Type Description
item

the SectionRule created from the dictionary

Source code in medspacy/section_detection/section_rule.py
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
@classmethod
def from_dict(cls, rule_dict):
    """
    Reads a dictionary into a single rule object. Used when reading from a json file.

    Args:
        rule_dict: the dictionary to convert; its keys must all be in `_ALLOWED_KEYS` and are passed to the
            constructor as keyword arguments

    Returns:
        item: the rule created from the dictionary, an instance of the class this method was called on

    Raises:
        ValueError: If the dictionary contains a key that is not in `_ALLOWED_KEYS`.
    """
    keys = set(rule_dict.keys())
    invalid_keys = keys.difference(cls._ALLOWED_KEYS)
    if invalid_keys:
        msg = (
            f"JSON object contains invalid keys: {invalid_keys}. "
            f"Must be one of: {cls._ALLOWED_KEYS}"
        )
        raise ValueError(msg)
    # Build through `cls` rather than a hard-coded class name so subclasses get instances of themselves.
    return cls(**rule_dict)
from_json(filepath) classmethod

Read in a list of section rules from a JSON file.

Parameters:

Name Type Description Default
filepath

the .json file containing modifier rules

required

Returns:

Name Type Description
section_rules List[SectionRule]

a list of SectionRule objects

Source code in medspacy/section_detection/section_rule.py
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
@classmethod
def from_json(cls, filepath) -> List[SectionRule]:
    """
    Read in a list of section rules from a JSON file.

    Args:
        filepath: the .json file containing section rules under a top-level "section_rules" key

    Returns:
        section_rules: a list of rule objects, one per entry, built with `cls.from_dict`
    """
    import json

    # JSON is UTF-8 by specification; do not depend on the platform's locale encoding,
    # which corrupts non-ASCII patterns on systems whose default is not UTF-8.
    with open(filepath, encoding="utf-8") as file:
        section_data = json.load(file)
    # Go through `cls` rather than a hard-coded class name so subclasses load instances of themselves.
    return [cls.from_dict(data) for data in section_data["section_rules"]]
to_dict()

Converts the SectionRule to a python dictionary. Used when writing section rules to a json file.

Returns:

Name Type Description
rule_dict

the dictionary containing the SectionRule info.

Source code in medspacy/section_detection/section_rule.py
123
124
125
126
127
128
129
130
131
132
133
134
135
def to_dict(self):
    """
    Export this rule as a plain python dictionary. Used when writing section rules to a json file.

    Returns:
        rule_dict: a dictionary with one entry per `_ALLOWED_KEYS` name whose instance attribute is set to
            something other than None.
    """
    attributes = vars(self)
    return {
        key: attributes[key]
        for key in self._ALLOWED_KEYS
        if attributes.get(key) is not None
    }

sectionizer

Sectionizer

The Sectionizer will search for spans in the text which match section header rules, such as 'Past Medical History:'. Sections will be represented in custom attributes as: category: A normalized title of the section. Example: 'past_medical_history' section_title: The Span of the doc which was matched as a section header. Example: 'Past Medical History:' section_span: The entire section of the note, starting with section_header and up until the end of the section, which will be either the start of the next section header or some pre-specified scope. Example: 'Past Medical History: Type II DM'

Section attributes will be registered for each Doc, Span, and Token in the following attributes: `Doc._.sections`: A list of namedtuples of type Section with 4 elements: - section_title - section_header - section_parent - section_span. A Doc will also have attributes corresponding to lists of each (i.e., `Doc._.section_titles`, `Doc._.section_headers`, `Doc._.section_parents`, `Doc._.section_list`), as well as `(Span|Token)._.section_title`, `(Span|Token)._.section_header`, `(Span|Token)._.section_parent` and `(Span|Token)._.section_span`.

Source code in medspacy/section_detection/sectionizer.py
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
@Language.factory("medspacy_sectionizer")
class Sectionizer:
    """
    The Sectionizer will search for spans in the text which match section header rules, such as 'Past Medical History:'.
    Sections will be represented in custom attributes as:
        category: A normalized title of the section. Example: 'past_medical_history'
        section_title: The Span of the doc which was matched as a section header.
            Example: 'Past Medical History:'
        section_span: The entire section of the note, starting with section_header and up until the end
            of the section, which will be either the start of the next section header of some pre-specified
            scope. Example: 'Past Medical History: Type II DM'

    Section attributes will be registered for each Doc, Span, and Token in the following attributes:
        Doc._.sections: A list of namedtuples of type Section with 4 elements:
            - section_title
            - section_header
            - section_parent
            - section_span.
        A Doc will also have attributes corresponding to lists of each
            (ie., Doc._.section_titles, Doc._.section_headers, Doc._.section_parents, Doc._.section_list)
        (Span|Token)._.section_title
        (Span|Token)._.section_header
        (Span|Token)._.section_parent
        (Span|Token)._.section_span
    """

    def __init__(
        self,
        nlp: Language,
        name: str = "medspacy_sectionizer",
        rules: Optional[str] = "default",
        language_code: str = 'en',
        max_section_length: Optional[int] = None,
        phrase_matcher_attr: str = "LOWER",
        require_start_line: bool = False,
        require_end_line: bool = False,
        newline_pattern: str = r"[\n\r]+[\s]*$",
        input_span_type: Union[Literal["ents", "group"], None] = "ents",
        span_group_name: str = "medspacy_spans",
        span_attrs: Union[
            Literal["default"], Dict[str, Dict[str, Any]], None
        ] = "default",
        apply_sentence_boundary: bool = False,
    ):
        r"""
        Create a new Sectionizer component.

        Args:
            nlp: A SpaCy Language object.
            name: The name of the component.
            rules: The rules to load. Default is "default", loads rules packaged with medspaCy that are derived from
                SecTag, MIMIC-III, and practical refinement at the US Department of Veterans Affairs. If None, no rules
                are loaded. Otherwise, must be a path to a json file containing rules. Add SectionRules directly through
                `Sectionizer.add`.
            language_code: Language code to use (ISO code) as a default for loading resources.  See documentation
                and also the /resources directory to see which resources might be available in each language.
                Default is "en" for English.
            max_section_length: Optional argument specifying the maximum number of tokens following a section header
                which can be included in a section body. This can be useful if you think your section rules are
                incomplete and want to prevent sections from running too long in the note. Default is None, meaning that
                the scope of a section will be until either the next section header or the end of the document.
            phrase_matcher_attr: The token attribute to use for PhraseMatcher for rules where `pattern` is None. Default
                is 'LOWER'.
            require_start_line: Optionally require a section header to start on a new line. Default False.
            require_end_line: Optionally require a section header to end with a new line. Default False.
            newline_pattern: Regular expression to match the new line either preceding or following a header
                if either require_start_line or require_end_line are True. Default is r"[\n\r]+[\s]*$"
            input_span_type: "ents" or "group". Where to look for spans when modifying attributes of spans
                contained in a section if `span_attrs` is not None. "ents" will modify attributes of spans in doc.ents.
                "group" will modify attributes of spans in the span group specified by `span_group_name`.
            span_group_name: The name of the span group used when `input_span_type` is "group". Default is
                "medspacy_spans".
            span_attrs: The optional span attributes to modify. Default option "default" uses attributes in
                `DEFAULT_ATTRS`. If a dictionary of custom attributes, format is a dictionary mapping section
                categories to a dictionary containing the attribute name and the value to set the attribute to when a
                span is contained in a section of that category. Custom attributes must be assigned with
                `Span.set_extension` before creating the Sectionizer. If None, sectionizer will not modify span
                attributes.
            apply_sentence_boundary: Optionally end sentence before and after section header boundary. This ensures
                the section header is considered its own sentence.

        Raises:
            ValueError: If `span_attrs` is a dictionary naming a Span extension that has not been registered.
        """
        # NOTE: the docstring above is raw so the regex default is shown verbatim; in a normal
        # string "\s" is an invalid escape and "\n"/"\r" would become real control characters.
        self.nlp = nlp
        self.name = name
        self.max_section_length = max_section_length
        self.require_start_line = require_start_line
        self.require_end_line = require_end_line
        self.newline_pattern = re.compile(newline_pattern)
        self.assertion_attributes_mapping = None
        # category -> set of candidate parent categories / whether a parent is mandatory; filled by `add`.
        self._parent_sections = {}
        self._parent_required = {}
        self._input_span_type = input_span_type
        self._span_group_name = span_group_name
        self._apply_sentence_boundary = apply_sentence_boundary

        self.__matcher = MedspacyMatcher(
            nlp, name=name, phrase_matcher_attr=phrase_matcher_attr
        )

        # Packaged rules live under <root>/resources/<language_code>/section_patterns.json,
        # where <root> is two directories above the one containing this file.
        self.DEFAULT_RULES_FILEPATH = path.join(
            Path(__file__).resolve().parents[2],
            "resources",
            language_code.lower(),
            "section_patterns.json",
        )

        rule_path = self.DEFAULT_RULES_FILEPATH if rules == "default" else rules

        # A falsy `rules` (None or "") means: start with no rules at all.
        if rule_path:
            self.add(SectionRule.from_json(rule_path))

        if span_attrs == "default":
            self.assertion_attributes_mapping = DEFAULT_ATTRS
            self.register_default_attributes()
        elif span_attrs:
            # Fail early if a custom mapping refers to a Span extension nobody registered.
            for attr_dict in span_attrs.values():
                for attr_name in attr_dict.keys():
                    if not Span.has_extension(attr_name):
                        raise ValueError(
                            f"Custom extension {attr_name} has not been set. Please ensure Span.set_extension is "
                            f"called for your pipeline's custom extensions."
                        )
            self.assertion_attributes_mapping = span_attrs

    @property
    def rules(self) -> List[SectionRule]:
        """
        Gets list of rules associated with the Sectionizer.

        The value is read straight from the `rules` attribute of the internal matcher; no setter is defined
        for this property.

        Returns:
            The list of SectionRules associated with the Sectionizer.
        """
        return self.__matcher.rules

    @property
    def section_categories(self) -> Set[str]:
        """
        Gets the categories used in the Sectionizer.

        The value is read straight from the `labels` attribute of the internal matcher.

        Returns:
            The section categories available to the Sectionizer (a set of strings, per the return annotation).
        """
        return self.__matcher.labels

    @property
    def input_span_type(self):
        """
        Where the component looks for the spans whose attributes it modifies: "ents" for doc.ents or "group"
        for a spaCy span group.

        Returns:
            The input type, "ents" or "group".
        """
        return self._input_span_type

    @input_span_type.setter
    def input_span_type(self, val):
        # Only the two supported sources may be assigned through the property.
        if val not in ("ents", "group"):
            raise ValueError('input_type must be "ents" or "group".')
        self._input_span_type = val

    @property
    def span_group_name(self) -> str:
        """
        Name of the span group this component reads from. It is only relevant when `input_span_type` is
        "group", in which case calling the component uses the spans stored under this name.

        Returns:
            The span group name.
        """
        return self._span_group_name

    @span_group_name.setter
    def span_group_name(self, name: str):
        # Reject empty values and anything that is not a string.
        if not (name and isinstance(name, str)):
            raise ValueError("Span group name must be a string.")
        self._span_group_name = name

    @classmethod
    def register_default_attributes(cls):
        """
        Register the default Span extensions that the default attribute mapping modifies.

        Each of `is_negated`, `is_uncertain`, `is_historical`, `is_hypothetical` and `is_family` is registered
        with a default value of False. Extensions that are already registered are left untouched.
        """
        for attr_name in (
            "is_negated",
            "is_uncertain",
            "is_historical",
            "is_hypothetical",
            "is_family",
        ):
            # Check explicitly instead of swallowing ValueError, so an unrelated
            # registration failure is not silently hidden.
            if not Span.has_extension(attr_name):
                Span.set_extension(attr_name, default=False)

    def add(self, rules):
        """
        Adds SectionRules to the Sectionizer.

        Besides handing the rules to the internal matcher, this records each rule's candidate parents and its
        `parent_required` flag per category. Rules sharing a category have their parents merged (with a
        RuntimeWarning); if they disagree on `parent_required`, the flag for that category is set to False
        (with a RuntimeWarning).

        Args:
            rules: A single SectionRule or a collection of SectionRules to add to the Sectionizer.

        Raises:
            TypeError: If any element of `rules` is not a SectionRule.
        """
        if isinstance(rules, SectionRule):
            rules = [rules]
        else:
            # The collection is traversed three times below; materialize it so a one-shot
            # iterable (e.g. a generator) is not exhausted by the validation pass.
            rules = list(rules)

        for rule in rules:
            if not isinstance(rule, SectionRule):
                # Single formatted message: passing the type as a second positional argument
                # would make the exception render as a tuple.
                raise TypeError(f"Rules must be type SectionRule, not {type(rule)}")

        self.__matcher.add(rules)

        for rule in rules:
            name = rule.category
            parents = rule.parents
            parent_required = rule.parent_required
            if parents:
                if name in self._parent_sections:
                    warnings.warn(
                        f"Duplicate section title {name}. Merging parents. "
                        f"If this is not intended, please specify distinct titles.",
                        RuntimeWarning,
                    )
                    self._parent_sections[name].update(parents)
                else:
                    self._parent_sections[name] = set(parents)

            if (
                name in self._parent_required
                and self._parent_required[name] != parent_required
            ):
                warnings.warn(
                    f"Duplicate section title {name} has different parent_required option. "
                    f"Setting parent_required to False.",
                    RuntimeWarning,
                )
                # Conflicting rules: fall back to the permissive setting.
                self._parent_required[name] = False
            else:
                self._parent_required[name] = parent_required

    def set_parent_sections(
        self, sections: List[Tuple[int, int, int]]
    ) -> List[Tuple[int, int, int, int]]:
        """
        Determine the legal parent-child section relationships from the list
        of in-order sections of a document and the possible parents of each
        section, as recorded in `self._parent_sections` and `self._parent_required`.

        Args:
            sections: a list of spacy match tuples found in the doc

        Returns:
            A list of tuples (match_id, start, end, parent_idx) where the first three indices are the same as the input
            and the added parent_idx represents the index in the returned list that corresponds to the parent section,
            or None when the section has no parent. Might be a smaller list than the input due to pruning with
            `parent_required`.
        """
        sections_final = []
        removed_sections = 0
        for i, (match_id, start, end) in enumerate(sections):
            # Resolve the match id to its rule, then to the rule's section category.
            name = self.__matcher.rule_map[self.nlp.vocab.strings[match_id]].category
            required = self._parent_required[name]
            # Every earlier iteration either appended to sections_final or bumped removed_sections,
            # so i_a is the position this section would take in sections_final.
            i_a = i - removed_sections  # adjusted index for removed values
            if required and i_a == 0:
                # Nothing precedes this section, so a required parent cannot exist: drop it.
                removed_sections += 1
                continue
            elif i_a == 0 or name not in self._parent_sections.keys():
                # First kept section, or a category with no candidate parents: keep it as a root.
                sections_final.append((match_id, start, end, None))
            else:
                parents = self._parent_sections[name]
                # NOTE(review): this is not reset between candidate parents, so a match found for one
                # candidate is kept unless a later candidate also matches and overwrites it.
                identified_parent = None
                for parent in parents:
                    # go backwards through the section "tree" until you hit a root or the start of the list
                    candidate = self.__matcher.rule_map[
                        self.nlp.vocab.strings[sections_final[i_a - 1][0]]
                    ].category
                    candidates_parent_idx = sections_final[i_a - 1][3]
                    if candidates_parent_idx is not None:
                        candidates_parent = self.__matcher.rule_map[
                            self.nlp.vocab.strings[
                                sections_final[candidates_parent_idx][0]
                            ]
                        ].category
                    else:
                        candidates_parent = None
                    candidate_i = i_a - 1
                    # `candidate` doubles as the loop flag: setting it to None ends the walk.
                    while candidate:
                        if candidate == parent:
                            identified_parent = candidate_i
                            candidate = None
                        else:
                            # if you are at the end of the list... no parent
                            if candidate_i < 1:
                                candidate = None
                                continue
                            # if the current candidate has no parent... no parent exists
                            if not candidates_parent:
                                candidate = None
                                continue
                            # otherwise get the previous item in the list
                            temp = self.__matcher.rule_map[
                                self.nlp.vocab.strings[
                                    sections_final[candidate_i - 1][0]
                                ]
                            ].category
                            temp_parent_idx = sections_final[candidate_i - 1][3]
                            if temp_parent_idx is not None:
                                temp_parent = self.__matcher.rule_map[
                                    self.nlp.vocab.strings[
                                        sections_final[temp_parent_idx][0]
                                    ]
                                ].category
                            else:
                                temp_parent = None
                            # if the previous item is the parent of the current item
                            # OR if the previous item is a sibling of the current item
                            # continue to search
                            if (
                                temp == candidates_parent
                                or temp_parent == candidates_parent
                            ):
                                candidate = temp
                                candidates_parent = temp_parent
                                candidate_i -= 1
                            # otherwise, there is no further tree traversal
                            else:
                                candidate = None

                # if a parent is required, then add
                if identified_parent is not None or not required:
                    # if the parent is identified, add section
                    # if the parent is not required, add section
                    # if parent is not identified and required, do not add the section
                    sections_final.append((match_id, start, end, identified_parent))
                else:
                    removed_sections += 1
        return sections_final

    def set_assertion_attributes(self, spans: Iterable[Span]):
        """
        Copy section-driven attribute values onto each span's extension namespace.

        For every span whose `span._.section` is truthy and whose section category is a key of
        `self.assertion_attributes_mapping`, each (attribute name, value) pair registered for that
        category is written to `span._`. Other spans are left untouched.

        Args:
            spans: the spans to modify.
        """
        mapping = self.assertion_attributes_mapping
        for span in spans:
            section = span._.section
            # Spans without a section cannot inherit anything
            if not section:
                continue
            category = section.category
            # Only categories listed in the mapping carry attribute values
            if category not in mapping:
                continue
            for attr_name, attr_value in mapping[category].items():
                setattr(span._, attr_name, attr_value)

    def __call__(self, doc: Doc) -> Doc:
        """
        Call the Sectionizer on a spaCy doc. Sectionizer will identify sections using provided rules, then evaluate any
        section hierarchy as needed, create section spans, and modify attributes on existing spans based on the
        sections the entity spans occur in.

        The resulting sections are stored in `doc._.sections` (any previous value is replaced) and every token
        covered by a section gets `token._.section` set to it. If no header matches, the whole doc becomes a single
        section with category None.

        Args:
            doc: The Doc to process.

        Returns:
            The processed spaCy Doc.
        """
        matches = self.__matcher(doc)
        if self.require_start_line:
            matches = self.filter_start_lines(doc, matches)
        if self.require_end_line:
            matches = self.filter_end_lines(doc, matches)
        if self._parent_sections:
            matches = self.set_parent_sections(matches)

        # If this has already been processed by the sectionizer, reset the sections
        doc._.sections = []
        # if there were no matches, return the doc as one section
        if len(matches) == 0:
            doc._.sections.append(Section(None, 0, 0, 0, len(doc)))
            return doc

        section_list = []
        # if the first match does not begin at token 0, handle the first section
        first_match = matches[0]
        if first_match[1] != 0:
            section_list.append(Section(None, 0, 0, 0, first_match[1]))

        # parent_idx values produced by set_parent_sections index into `matches`, while section_list may start
        # with the untitled leading section added above. Shift by that offset so the lookup hits the real parent.
        parent_offset = len(section_list)
        last_index = len(matches) - 1

        # handle section spans
        for i, match in enumerate(matches):
            match_id, start, end = match[:3]
            parent = None
            # A fourth element only exists when parent sections were evaluated
            if len(match) == 4 and match[3] is not None:
                parent = section_list[match[3] + parent_offset]

            # Make section header its own sentence
            if self._apply_sentence_boundary:
                # Section headers should be considered the start of a sentence
                doc[start].sent_start = True
                # Text following the header should also be considered a new sentence
                if end < len(doc):
                    doc[end].sent_start = True

            rule = self.__matcher.rule_map[self.nlp.vocab.strings[match_id]]
            category = rule.category

            # A section can extend at most to the next header, or to the end of the doc for the last match
            if i == last_index:
                section_limit = len(doc)
            else:
                section_limit = matches[i + 1][1]

            # The rule's own max_scope takes precedence over the component-wide max_section_length
            if rule.max_scope is not None:
                scope_end = min(end + rule.max_scope, section_limit)
            elif self.max_section_length is not None:
                scope_end = min(end + self.max_section_length, section_limit)
            else:
                scope_end = section_limit

            section_list.append(
                Section(category, start, end, end, scope_end, parent, rule)
            )

        for section in section_list:
            doc._.sections.append(section)
            start, end = section.section_span
            for token in doc[start:end]:
                token._.section = section

        # If it is specified to add assertion attributes,
        # iterate through the entities in doc and add them
        if self.assertion_attributes_mapping:
            if self._input_span_type.lower() == "ents":
                self.set_assertion_attributes(doc.ents)
            elif self._input_span_type.lower() == "group":
                self.set_assertion_attributes(doc.spans[self._span_group_name])

        return doc

    def filter_start_lines(
        self, doc: Doc, matches: List[Tuple[int, int, int]]
    ) -> List[Tuple[int, int, int]]:
        """
        Keep only the matches whose first token begins a new line.

        The start index of each match (position 1 of the tuple) is checked with `util.is_start_line` against the
        component's compiled `newline_pattern`.

        Args:
            doc: The Doc the matches were found in.
            matches: Match tuples (match_id, start, end).

        Returns:
            The match tuples (match_id, start, end), in their original order, that meet the filter criteria.
        """
        pattern = self.newline_pattern
        return [
            match for match in matches if util.is_start_line(match[1], doc, pattern)
        ]

    def filter_end_lines(
        self, doc: Doc, matches: List[Tuple[int, int, int]]
    ) -> List[Tuple[int, int, int]]:
        """
        Keep only the matches whose last token is followed by a new line.

        The index of each match's final token (`end - 1`) is checked with `util.is_end_line` against the
        component's compiled `newline_pattern`.

        Args:
            doc: The Doc the matches were found in.
            matches: Match tuples (match_id, start, end).

        Returns:
            The match tuples (match_id, start, end), in their original order, that meet the filter criteria.
        """
        pattern = self.newline_pattern
        return [
            match
            for match in matches
            if util.is_end_line(match[2] - 1, doc, pattern)
        ]
input_span_type property writable

The input source of entities for the component. Must be either "ents" corresponding to doc.ents or "group" for a spaCy span group.

Returns:

Type Description

The input type, "ents" or "group".

rules property

Gets list of rules associated with the Sectionizer.

Returns:

Type Description
List[SectionRule]

The list of SectionRules associated with the Sectionizer.

section_categories property

Gets a list of categories used in the Sectionizer.

Returns:

Type Description
Set[str]

The list of all section categories available to the Sectionizer.

span_group_name property writable

The name of the span group used by this component. If input_span_type is "group", calling this component will use spans in the span group with this name.

Returns:

Type Description
str

The span group name.

__call__(doc)

Call the Sectionizer on a spaCy doc. Sectionizer will identify sections using provided rules, then evaluate any section hierarchy as needed, create section spans, and modify attributes on existing spans based on the sections the entity spans occur in.

Parameters:

Name Type Description Default
doc Doc

The Doc to process.

required

Returns:

Type Description
Doc

The processed spaCy Doc.

Source code in medspacy/section_detection/sectionizer.py
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
def __call__(self, doc: Doc) -> Doc:
    """
    Call the Sectionizer on a spaCy doc. Sectionizer will identify sections using provided rules, then evaluate any
    section hierarchy as needed, create section spans, and modify attributes on existing spans based on the
    sections the entity spans occur in.

    The resulting sections are stored in `doc._.sections` (any previous value is replaced) and every token
    covered by a section gets `token._.section` set to it. If no header matches, the whole doc becomes a single
    section with category None.

    Args:
        doc: The Doc to process.

    Returns:
        The processed spaCy Doc.
    """
    matches = self.__matcher(doc)
    if self.require_start_line:
        matches = self.filter_start_lines(doc, matches)
    if self.require_end_line:
        matches = self.filter_end_lines(doc, matches)
    if self._parent_sections:
        matches = self.set_parent_sections(matches)

    # If this has already been processed by the sectionizer, reset the sections
    doc._.sections = []
    # if there were no matches, return the doc as one section
    if len(matches) == 0:
        doc._.sections.append(Section(None, 0, 0, 0, len(doc)))
        return doc

    section_list = []
    # if the first match does not begin at token 0, handle the first section
    first_match = matches[0]
    if first_match[1] != 0:
        section_list.append(Section(None, 0, 0, 0, first_match[1]))

    # parent_idx values produced by set_parent_sections index into `matches`, while section_list may start
    # with the untitled leading section added above. Shift by that offset so the lookup hits the real parent.
    parent_offset = len(section_list)
    last_index = len(matches) - 1

    # handle section spans
    for i, match in enumerate(matches):
        match_id, start, end = match[:3]
        parent = None
        # A fourth element only exists when parent sections were evaluated
        if len(match) == 4 and match[3] is not None:
            parent = section_list[match[3] + parent_offset]

        # Make section header its own sentence
        if self._apply_sentence_boundary:
            # Section headers should be considered the start of a sentence
            doc[start].sent_start = True
            # Text following the header should also be considered a new sentence
            if end < len(doc):
                doc[end].sent_start = True

        rule = self.__matcher.rule_map[self.nlp.vocab.strings[match_id]]
        category = rule.category

        # A section can extend at most to the next header, or to the end of the doc for the last match
        if i == last_index:
            section_limit = len(doc)
        else:
            section_limit = matches[i + 1][1]

        # The rule's own max_scope takes precedence over the component-wide max_section_length
        if rule.max_scope is not None:
            scope_end = min(end + rule.max_scope, section_limit)
        elif self.max_section_length is not None:
            scope_end = min(end + self.max_section_length, section_limit)
        else:
            scope_end = section_limit

        section_list.append(
            Section(category, start, end, end, scope_end, parent, rule)
        )

    for section in section_list:
        doc._.sections.append(section)
        start, end = section.section_span
        for token in doc[start:end]:
            token._.section = section

    # If it is specified to add assertion attributes,
    # iterate through the entities in doc and add them
    if self.assertion_attributes_mapping:
        if self._input_span_type.lower() == "ents":
            self.set_assertion_attributes(doc.ents)
        elif self._input_span_type.lower() == "group":
            self.set_assertion_attributes(doc.spans[self._span_group_name])

    return doc
__init__(nlp, name='medspacy_sectionizer', rules='default', language_code='en', max_section_length=None, phrase_matcher_attr='LOWER', require_start_line=False, require_end_line=False, newline_pattern='[\\n\\r]+[\\s]*$', input_span_type='ents', span_group_name='medspacy_spans', span_attrs='default', apply_sentence_boundary=False)
   Create a new Sectionizer component.

   Args:
       nlp: A SpaCy Language object.
       name: The name of the component.
       rules: The rules to load. Default is "default", loads rules packaged with medspaCy that are derived from
           SecTag, MIMIC-III, and practical refinement at the US Department of Veterans Affairs. If None, no rules
           are loaded. Otherwise, must be a path to a json file containing rules. Add SectionRules directly through
           `Sectionizer.add`.
       language_code: Language code to use (ISO code) as a default for loading resources.  See documentation
           and also the /resources directory to see which resources might be available in each language.
           Default is "en" for English.
       max_section_length: Optional argument specifying the maximum number of tokens following a section header
           which can be included in a section body. This can be useful if you think your section rules are
           incomplete and want to prevent sections from running too long in the note. Default is None, meaning that
           the scope of a section will be until either the next section header or the end of the document.
       phrase_matcher_attr: The token attribute to use for PhraseMatcher for rules where `pattern` is None. Default
           is 'LOWER'.
       require_start_line: Optionally require a section header to start on a new line. Default False.
       require_end_line: Optionally require a section header to end with a new line. Default False.
       newline_pattern: Regular expression to match the new line either preceding or following a header
           if either require_start_line or require_end_line are True. Default is r"[\n\r]+[\s]*$".
       span_attrs: The optional span attributes to modify. Default option "default" uses attributes in
           DEFAULT_ATTRIBUTES. If a dictionary of custom attributes, format is a dictionary mapping section
           categories to a dictionary containing the attribute name and the value to set the attribute to when a
           span is contained in a section of that category. Custom attributes must be assigned with
           Span.set_extension before creating the Sectionizer. If None, sectionizer will not modify span
           attributes.
       input_span_type: "ents" or "group". Where to look for spans when modifying attributes of spans
           contained in a section if span_attrs is not None. "ents" will modify attributes of spans in doc.ents.
           "group" will modify attributes of spans in the span group specified by span_group_name.
       span_group_name: The name of the span group used when input_span_type is "group". Default is
           "medspacy_spans".
       apply_sentence_boundary: Optionally end sentence before and after section header boundary. This ensures
           the section header is considered its own sentence.

Source code in medspacy/section_detection/sectionizer.py
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
def __init__(
    self,
    nlp: Language,
    name: str = "medspacy_sectionizer",
    rules: Optional[str] = "default",
    language_code: str = 'en',
    max_section_length: Optional[int] = None,
    phrase_matcher_attr: str = "LOWER",
    require_start_line: bool = False,
    require_end_line: bool = False,
    newline_pattern: str = r"[\n\r]+[\s]*$",
    input_span_type: Union[Literal["ents", "group"], None] = "ents",
    span_group_name: str = "medspacy_spans",
    span_attrs: Union[
        Literal["default"], Dict[str, Dict[str, Any]], None
    ] = "default",
    apply_sentence_boundary: bool = False,
):
    """
    Create a new Sectionizer component.

    Args:
        nlp: A SpaCy Language object.
        name: The name of the component.
        rules: The rules to load. The default, "default", loads the rules packaged with medspaCy (derived from
            SecTag, MIMIC-III, and practical refinement at the US Department of Veterans Affairs). If None, no
            rules are loaded. Any other value must be a path to a json file containing rules. SectionRules can
            also be added directly through `Sectionizer.add`.
        language_code: Language code (ISO code) selecting the resource directory the default rules are loaded
            from. Default is "en" for English.
        max_section_length: Optional maximum number of tokens following a section header which can be included
            in a section body. Useful when the section rules may be incomplete and sections should not run too
            long in the note. Default is None, meaning a section extends to the next section header or the end
            of the document.
        phrase_matcher_attr: The token attribute to use for PhraseMatcher for rules where `pattern` is None.
            Default is 'LOWER'.
        require_start_line: Optionally require a section header to start on a new line. Default False.
        require_end_line: Optionally require a section header to end with a new line. Default False.
        newline_pattern: Regular expression to match the new line either preceding or following a header
            if either require_start_line or require_end_line are True. Default is r"[\n\r]+[\s]*$"
        input_span_type: "ents" or "group". Where to look for spans when modifying attributes of spans
            contained in a section if `span_attrs` is not None. "ents" uses doc.ents; "group" uses the span
            group named by `span_group_name`.
        span_group_name: The name of the span group used when `input_span_type` is "group". Default is
            "medspacy_spans".
        span_attrs: The optional span attributes to modify. "default" uses the packaged default mapping and
            registers the default Span extensions. A dictionary maps section categories to a dictionary of
            attribute name -> value to set on spans contained in a section of that category; those attributes
            must already be registered with `Span.set_extension`. If None, span attributes are not modified.
        apply_sentence_boundary: Optionally end sentence before and after section header boundary. This ensures
            the section header is considered its own sentence.

    Raises:
        ValueError: If a custom `span_attrs` dictionary names an attribute that is not a registered Span
            extension.
    """
    # Plain configuration copied from the arguments
    self.nlp = nlp
    self.name = name
    self.max_section_length = max_section_length
    self.require_start_line = require_start_line
    self.require_end_line = require_end_line
    self.newline_pattern = re.compile(newline_pattern)
    self.assertion_attributes_mapping = None
    # Parent bookkeeping, filled in by `add` as rules are registered
    self._parent_sections = {}
    self._parent_required = {}
    self._input_span_type = input_span_type
    self._span_group_name = span_group_name
    self._apply_sentence_boundary = apply_sentence_boundary

    self.__matcher = MedspacyMatcher(
        nlp, name=name, phrase_matcher_attr=phrase_matcher_attr
    )

    # Packaged rules live under <root>/resources/<language>/section_patterns.json
    resources_root = Path(__file__).resolve().parents[2]
    self.DEFAULT_RULES_FILEPATH = path.join(
        resources_root,
        "resources",
        language_code.lower(),
        "section_patterns.json",
    )

    # "default" selects the packaged file; anything else is used as given (None/empty loads nothing)
    rule_path = self.DEFAULT_RULES_FILEPATH if rules == "default" else rules
    if rule_path:
        self.add(SectionRule.from_json(rule_path))

    if span_attrs == "default":
        self.assertion_attributes_mapping = DEFAULT_ATTRS
        self.register_default_attributes()
    elif span_attrs:
        # Every custom attribute must already exist as a Span extension
        for attr_dict in span_attrs.values():
            for attr_name in attr_dict:
                if Span.has_extension(attr_name):
                    continue
                raise ValueError(
                    f"Custom extension {attr_name} has not been set. Please ensure Span.set_extension is "
                    f"called for your pipeline's custom extensions."
                )
        self.assertion_attributes_mapping = span_attrs
add(rules)

Adds SectionRules to the Sectionizer.

Parameters:

Name Type Description Default
rules

A single SectionRule or a collection of SectionRules to add to the Sectionizer.

required
Source code in medspacy/section_detection/sectionizer.py
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
def add(self, rules):
    """
    Adds SectionRules to the Sectionizer.

    Besides registering the rules with the matcher, this records each rule category's allowed parents and its
    `parent_required` flag. When a category is added more than once, its parents are merged and, if the
    `parent_required` values disagree, the flag is reset to False; both cases emit a RuntimeWarning.

    Args:
        rules: A single SectionRule or a collection of SectionRules to add to the Sectionizer.

    Raises:
        TypeError: If any element of `rules` is not a SectionRule.
    """
    if isinstance(rules, SectionRule):
        rules = [rules]
    else:
        # The rules are iterated several times below; materialize them so a one-shot iterable
        # (e.g. a generator) is not exhausted by validation before reaching the matcher.
        rules = list(rules)

    for rule in rules:
        if not isinstance(rule, SectionRule):
            # Build one message string; passing type(rule) as a second argument renders as a tuple
            raise TypeError(f"Rules must be type SectionRule, not {type(rule)}")

    self.__matcher.add(rules)

    for rule in rules:
        name = rule.category
        parents = rule.parents
        parent_required = rule.parent_required
        if parents:
            if name in self._parent_sections:
                warnings.warn(
                    f"Duplicate section title {name}. Merging parents. "
                    f"If this is not intended, please specify distinct titles.",
                    RuntimeWarning,
                )
                self._parent_sections[name].update(parents)
            else:
                self._parent_sections[name] = set(parents)

        if (
            name in self._parent_required
            and self._parent_required[name] != parent_required
        ):
            warnings.warn(
                f"Duplicate section title {name} has different parent_required option. "
                f"Setting parent_required to False.",
                RuntimeWarning,
            )
            self._parent_required[name] = False
        else:
            self._parent_required[name] = parent_required
filter_end_lines(doc, matches)

Filter a list of matches to only contain spans where the end token is followed by a new line.

Returns:

Type Description
List[Tuple[int, int, int]]

A list of match tuples (match_id, start, end) that meet the filter criteria.

Source code in medspacy/section_detection/sectionizer.py
503
504
505
506
507
508
509
510
511
512
513
514
def filter_end_lines(
    self, doc: Doc, matches: List[Tuple[int, int, int]]
) -> List[Tuple[int, int, int]]:
    """
    Keep only the matches whose last token is followed by a new line.

    The index of each match's final token (`end - 1`) is checked with `util.is_end_line` against the
    component's compiled `newline_pattern`.

    Args:
        doc: The Doc the matches were found in.
        matches: Match tuples (match_id, start, end).

    Returns:
        The match tuples (match_id, start, end), in their original order, that meet the filter criteria.
    """
    pattern = self.newline_pattern
    return [
        match
        for match in matches
        if util.is_end_line(match[2] - 1, doc, pattern)
    ]
filter_start_lines(doc, matches)

Filter a list of matches to only contain spans where the start token is the beginning of a new line.

Returns:

Type Description
List[Tuple[int, int, int]]

A list of match tuples (match_id, start, end) that meet the filter criteria.

Source code in medspacy/section_detection/sectionizer.py
490
491
492
493
494
495
496
497
498
499
500
501
def filter_start_lines(
    self, doc: Doc, matches: List[Tuple[int, int, int]]
) -> List[Tuple[int, int, int]]:
    """
    Keep only the matches whose first token begins a new line.

    The start index of each match (position 1 of the tuple) is checked with `util.is_start_line` against the
    component's compiled `newline_pattern`.

    Args:
        doc: The Doc the matches were found in.
        matches: Match tuples (match_id, start, end).

    Returns:
        The match tuples (match_id, start, end), in their original order, that meet the filter criteria.
    """
    pattern = self.newline_pattern
    return [
        match for match in matches if util.is_start_line(match[1], doc, pattern)
    ]
register_default_attributes() classmethod

Register the default values for the Span attributes defined in DEFAULT_ATTRIBUTES.

Source code in medspacy/section_detection/sectionizer.py
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
@classmethod
def register_default_attributes(cls):
    """
    Make sure the default assertion attributes exist as Span extensions.

    Each of `is_negated`, `is_uncertain`, `is_historical`, `is_hypothetical` and `is_family` is registered with
    a default value of False. Attributes that are already registered are left as they are.
    """
    default_attr_names = (
        "is_negated",
        "is_uncertain",
        "is_historical",
        "is_hypothetical",
        "is_family",
    )
    for attr_name in default_attr_names:
        # Registering an existing extension raises, so skip the ones already set
        if Span.has_extension(attr_name):
            continue
        Span.set_extension(attr_name, default=False)
set_assertion_attributes(spans)

Add Span-level attributes to entities based on which section they occur in.

Parameters:

Name Type Description Default
spans Iterable[Span]

the spans to modify.

required
Source code in medspacy/section_detection/sectionizer.py
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
def set_assertion_attributes(self, spans: Iterable[Span]):
    """
    Copy section-driven attribute values onto each span's extension namespace.

    For every span whose `span._.section` is truthy and whose section category is a key of
    `self.assertion_attributes_mapping`, each (attribute name, value) pair registered for that
    category is written to `span._`. Other spans are left untouched.

    Args:
        spans: the spans to modify.
    """
    mapping = self.assertion_attributes_mapping
    for span in spans:
        section = span._.section
        # Spans without a section cannot inherit anything
        if not section:
            continue
        category = section.category
        # Only categories listed in the mapping carry attribute values
        if category not in mapping:
            continue
        for attr_name, attr_value in mapping[category].items():
            setattr(span._, attr_name, attr_value)
set_parent_sections(sections)

Determine the legal parent-child section relationships from the list of in-order sections of a document and the possible parents of each section as specified when the section rules were created.

Parameters:

Name Type Description Default
sections List[Tuple[int, int, int]]

a list of spacy match tuples found in the doc

required

Returns:

Type Description
List[Tuple[int, int, int, int]]

A list of tuples (match_id, start, end, parent_idx) where the first three indices are the same as the input and the added parent_idx represents the index in the list that corresponds to the parent section. Might be a smaller list than the input due to pruning with parent_required.

Source code in medspacy/section_detection/sectionizer.py
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
def set_parent_sections(
    self, sections: List[Tuple[int, int, int]]
) -> List[Tuple[int, int, int, int]]:
    """
    Determine the legal parent-child section relationships from the list
    of in-order sections of a document and the possible parents of each
    section category (read from `self._parent_sections`).

    Each match is resolved to its section category through the matcher's rule map. A section whose category
    is flagged in `self._parent_required` is dropped when no parent can be identified for it (including when
    it would be the first kept section).

    Args:
        sections: a list of spacy match tuples found in the doc

    Returns:
        A list of tuples (match_id, start, end, parent_idx) where the first three indices are the same as the input
        and the added parent_idx represents the index in the returned list that corresponds to the parent section,
        or None when the section has no parent. Might be a smaller list than the input due to pruning with
        `parent_required`.
    """
    sections_final = []
    removed_sections = 0
    for i, (match_id, start, end) in enumerate(sections):
        # Resolve the match id to the category of the rule that produced it.
        name = self.__matcher.rule_map[self.nlp.vocab.strings[match_id]].category
        required = self._parent_required[name]
        # Every earlier iteration either appended to sections_final or incremented removed_sections,
        # so i_a is the index this section would occupy in sections_final.
        i_a = i - removed_sections  # adjusted index for removed values
        if required and i_a == 0:
            # Nothing precedes this section, so a required parent cannot exist: drop it.
            removed_sections += 1
            continue
        elif i_a == 0 or name not in self._parent_sections.keys():
            # First kept section, or a category with no configured parents: it is a root.
            sections_final.append((match_id, start, end, None))
        else:
            parents = self._parent_sections[name]
            identified_parent = None
            # NOTE: there is no break after a match, so a hit for a later entry in `parents`
            # overwrites the index found for an earlier one.
            for parent in parents:
                # go backwards through the section "tree" until you hit a root or the start of the list
                # Start from the most recently kept section.
                candidate = self.__matcher.rule_map[
                    self.nlp.vocab.strings[sections_final[i_a - 1][0]]
                ].category
                candidates_parent_idx = sections_final[i_a - 1][3]
                if candidates_parent_idx is not None:
                    candidates_parent = self.__matcher.rule_map[
                        self.nlp.vocab.strings[
                            sections_final[candidates_parent_idx][0]
                        ]
                    ].category
                else:
                    candidates_parent = None
                candidate_i = i_a - 1
                # `candidate` doubles as the loop flag: setting it to None ends the walk.
                while candidate:
                    if candidate == parent:
                        identified_parent = candidate_i
                        candidate = None
                    else:
                        # if you are at the end of the list... no parent
                        if candidate_i < 1:
                            candidate = None
                            continue
                        # if the current candidate has no parent... no parent exists
                        if not candidates_parent:
                            candidate = None
                            continue
                        # otherwise get the previous item in the list
                        temp = self.__matcher.rule_map[
                            self.nlp.vocab.strings[
                                sections_final[candidate_i - 1][0]
                            ]
                        ].category
                        temp_parent_idx = sections_final[candidate_i - 1][3]
                        if temp_parent_idx is not None:
                            temp_parent = self.__matcher.rule_map[
                                self.nlp.vocab.strings[
                                    sections_final[temp_parent_idx][0]
                                ]
                            ].category
                        else:
                            temp_parent = None
                        # if the previous item is the parent of the current item
                        # OR if the previous item is a sibling of the current item
                        # continue to search
                        if (
                            temp == candidates_parent
                            or temp_parent == candidates_parent
                        ):
                            candidate = temp
                            candidates_parent = temp_parent
                            candidate_i -= 1
                        # otherwise, there is no further tree traversal
                        else:
                            candidate = None

            # Keep the section unless it requires a parent and none was found.
            if identified_parent is not None or not required:
                # if the parent is identified, add section
                # if the parent is not required, add section
                # if parent is not identified and required, do not add the section
                sections_final.append((match_id, start, end, identified_parent))
            else:
                removed_sections += 1
    return sections_final

util

This module will contain helper functions and classes for common clinical processing tasks which will be used in medspaCy's sectionizer.

is_end_line(idx, doc, pattern)

Check whether the token at idx occurs at the end of the line.

Parameters:

Name Type Description Default
idx int

The token index to check.

required
doc Doc

The doc to check in.

required
pattern Pattern

The newline pattern to check with.

required

Returns:

Type Description
bool

Whether the token occurs at the end of a line.

Source code in medspacy/section_detection/util.py
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
def is_end_line(idx: int, doc: Doc, pattern: re.Pattern) -> bool:
    """
    Tell whether the token at position `idx` is the last one on its line.

    The final token of the doc always counts as a line end. Any other token counts when `pattern` matches
    either its own text (including trailing whitespace) or the text of the token right after it.

    Args:
        idx: The token index to check.
        doc: The doc to check in.
        pattern: The newline pattern to check with.

    Returns:
        Whether the token occurs at the end of a line.
    """
    final_index = len(doc) - 1
    if idx == final_index:
        return True

    # Look at the token itself first, then its successor; the generator keeps the
    # lookups lazy so the successor is only read when the token itself does not match.
    return any(
        pattern.search(doc[position].text_with_ws) is not None
        for position in (idx, idx + 1)
    )

is_start_line(idx, doc, pattern)

Check whether the token at idx occurs at the start of the line.

Parameters:

Name Type Description Default
idx int

The token index to check.

required
doc Doc

The doc to check in.

required
pattern Pattern

The newline pattern to check with.

required

Returns:

Type Description
bool

Whether the token occurs at the start of a line.

Source code in medspacy/section_detection/util.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
def is_start_line(idx: int, doc: Doc, pattern: re.Pattern) -> bool:
    """
    Tell whether the token at position `idx` is the first one on its line.

    The first token of the doc always counts as a line start. Any other token counts when `pattern`
    matches the text (including trailing whitespace) of the token immediately before it.

    Args:
        idx: The token index to check.
        doc: The doc to check in.
        pattern: The newline pattern to check with.

    Returns:
        Whether the token occurs at the start of a line.
    """
    # Short-circuit keeps the preceding token from being read when idx is 0.
    return idx == 0 or pattern.search(doc[idx - 1].text_with_ws) is not None

sentence_splitting

PySBDSentenceSplitter

Source code in medspacy/sentence_splitting.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
@Language.factory("medspacy_pysbd")
class PySBDSentenceSplitter:
    """
    spaCy pipeline component that sets sentence boundaries using a pySBD segmenter.

    The component is registered under the factory name "medspacy_pysbd".
    """

    def __init__(self, name, nlp, clean=False):
        """
        Creates a new PySBDSentenceSplitter.

        Args:
            name: The name of the component.
            nlp: A spaCy Language model.
            clean: Forwarded to `pysbd.Segmenter(clean=...)`. Default False.
                NOTE(review): the segmenter is always built with `char_span=True`; pySBD may reject
                `clean=True` in that combination -- confirm before enabling.
        """
        self.name = name
        self.nlp = nlp
        self.seg = pysbd.Segmenter(language="en", clean=clean, char_span=True)

    def __call__(self, doc):
        """
        Spacy component based on: https://github.com/nipunsadvilkar/pySBD improved to work with spacy 3.0

        Segments the doc text and marks as a sentence start every token whose character offset (`token.idx`)
        equals the `start` of a segment returned by the segmenter; every other token is marked as not
        starting a sentence.

        Args:
            doc: The spaCy Doc to process.

        Returns:
            The same doc, with `is_sent_start` set on every token.
        """
        sents_char_spans = self.seg.segment(doc.text_with_ws)
        # A set gives constant-time membership tests, so the pass over the tokens stays linear.
        sent_start_offsets = {sent.start for sent in sents_char_spans}
        for token in doc:
            token.is_sent_start = token.idx in sent_start_offsets
        return doc

__call__(doc)

Spacy component based on: https://github.com/nipunsadvilkar/pySBD improved to work with spacy 3.0

Source code in medspacy/sentence_splitting.py
13
14
15
16
17
18
19
20
21
def __call__(self, doc):
    """
    Spacy component based on: https://github.com/nipunsadvilkar/pySBD improved to work with spacy 3.0

    Segments the doc text and marks as a sentence start every token whose character offset (`token.idx`)
    equals the `start` of a segment returned by the segmenter; every other token is marked as not
    starting a sentence.

    Args:
        doc: The spaCy Doc to process.

    Returns:
        The same doc, with `is_sent_start` set on every token.
    """
    sents_char_spans = self.seg.segment(doc.text_with_ws)
    # A set gives constant-time membership tests, so the pass over the tokens stays linear.
    sent_start_offsets = {sent.start for sent in sents_char_spans}
    for token in doc:
        token.is_sent_start = token.idx in sent_start_offsets
    return doc

target_matcher

concept_tagger

ConceptTagger

ConceptTagger is a component for setting an attribute on tokens contained in spans extracted by TargetRules. This can be used for tasks such as semantic labeling or for normalizing tokens, making downstream extraction simpler.

A common use case is when a single concept can have many synonyms or variants and downstream rules would be simplified by matching on a unified token tag for those synonyms rather than including the entire synonym list in each downstream rule.

Source code in medspacy/target_matcher/concept_tagger.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
@Language.factory("medspacy_concept_tagger")
class ConceptTagger:
    """
    Pipeline component that writes a rule's category onto every token covered by a TargetRule match.

    Typical uses are semantic labeling and token normalization: when one concept has many synonyms or
    variants, downstream rules can match on a single shared token tag instead of repeating the full
    synonym list in every rule.
    """

    def __init__(
        self,
        nlp: Language,
        name: str = "medspacy_concept_tagger",
        attr_name: str = "concept_tag",
    ):
        """
        Creates a new ConceptTagger.

        Args:
            nlp: A spaCy Language model.
            name: The name of the ConceptTagger component. Must be a valid python variable name.
            attr_name: The name of the attribute to set to tokens.
        """
        self.nlp, self.name, self._attr_name = nlp, name, attr_name
        self.__matcher = MedspacyMatcher(nlp, name=name)

        # Register the token extension only once; an earlier ConceptTagger instance
        # may already have registered the same attribute name.
        already_registered = Token.has_extension(attr_name)
        if not already_registered:
            Token.set_extension(attr_name, default="")

    @property
    def attr_name(self) -> str:
        """
        The name of the attribute that will be set on each matched token.

        Returns:
            The attribute name.
        """
        return self._attr_name

    def add(self, rules: Union[TargetRule, List[TargetRule]]):
        """
        Adds a single TargetRule or a list of TargetRules to the ConceptTagger.

        Args:
            rules: A single TargetRule or a collection of TargetRules.
        """
        self.__matcher.add(rules)

    def __call__(self, doc: Doc) -> Doc:
        """
        Call ConceptTagger on a doc. Matches spans and assigns attributes to all tokens contained in those spans, but
        does not preserve the spans themselves.

        Args:
            doc: The spaCy Doc to process.

        Returns:
            The spaCy Doc processed.
        """
        for match_id, first, stop in self.__matcher(doc):
            # Map the match id back to the rule that produced it.
            rule_key = self.nlp.vocab.strings[match_id]
            matched_rule = self.__matcher.rule_map[rule_key]
            for position in range(first, stop):
                setattr(doc[position]._, self.attr_name, matched_rule.category)

        return doc
attr_name property

The name of the attribute that will be set on each matched token.

Returns:

Type Description
str

The attribute name.

__call__(doc)

Call ConceptTagger on a doc. Matches spans and assigns attributes to all tokens contained in those spans, but does not preserve the spans themselves.

Parameters:

Name Type Description Default
doc Doc

The spaCy Doc to process.

required

Returns:

Type Description
Doc

The spaCy Doc processed.

Source code in medspacy/target_matcher/concept_tagger.py
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
def __call__(self, doc: Doc) -> Doc:
    """
    Run the ConceptTagger over a doc.

    Every token inside a matched span receives the category of the matching rule under the configured
    attribute name. The matched spans themselves are not stored anywhere.

    Args:
        doc: The spaCy Doc to process.

    Returns:
        The spaCy Doc processed.
    """
    for match_id, first, stop in self.__matcher(doc):
        # Map the match id back to the rule that produced it.
        rule_key = self.nlp.vocab.strings[match_id]
        matched_rule = self.__matcher.rule_map[rule_key]
        for position in range(first, stop):
            setattr(doc[position]._, self.attr_name, matched_rule.category)

    return doc
__init__(nlp, name='medspacy_concept_tagger', attr_name='concept_tag')

Creates a new ConceptTagger.

Parameters:

Name Type Description Default
nlp Language

A spaCy Language model.

required
name str

The name of the ConceptTagger component. Must be a valid python variable name.

'medspacy_concept_tagger'
attr_name str

The name of the attribute to set to tokens.

'concept_tag'
Source code in medspacy/target_matcher/concept_tagger.py
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
def __init__(
    self,
    nlp: Language,
    name: str = "medspacy_concept_tagger",
    attr_name: str = "concept_tag",
):
    """
    Build a ConceptTagger and make sure its token attribute exists.

    Args:
        nlp: A spaCy Language model.
        name: The name of the ConceptTagger component. Must be a valid python variable name.
        attr_name: The name of the attribute to set to tokens.
    """
    self.nlp, self.name, self._attr_name = nlp, name, attr_name
    self.__matcher = MedspacyMatcher(nlp, name=name)

    # Register the token extension only once; an earlier ConceptTagger instance
    # may already have registered the same attribute name.
    already_registered = Token.has_extension(attr_name)
    if not already_registered:
        Token.set_extension(attr_name, default="")
add(rules)

Adds a single TargetRule or a list of TargetRules to the ConceptTagger.

Parameters:

Name Type Description Default
rules Union[TargetRule, List[TargetRule]]

A single TargetRule or a collection of TargetRules.

required
Source code in medspacy/target_matcher/concept_tagger.py
56
57
58
59
60
61
62
63
def add(self, rules: Union[TargetRule, List[TargetRule]]):
    """
    Adds a single TargetRule or a list of TargetRules to the ConceptTagger.

    The rules are handed unchanged to the underlying MedspacyMatcher; no validation is done here.

    Args:
        rules: A single TargetRule or a collection of TargetRules.
    """
    # Pure delegation: the matcher owns rule storage.
    self.__matcher.add(rules)

target_matcher

TargetMatcher

TargetMatcher is a component for advanced rule-based text extraction. Rules are defined using medspacy.target_matcher.TargetRule.

A TargetMatcher will use the added TargetRule objects to identify matches in the text and apply labels or modify attributes. It will either modify the input spaCy Doc with the result or return the spans as a list.

In addition to extracting spans of text and setting labels, TargetRules can also define setting custom attributes and metadata. Additionally, each resulting span has an attribute span._.target_rule which maps a span to the TargetRule which set it.

Source code in medspacy/target_matcher/target_matcher.py
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
@Language.factory("medspacy_target_matcher")
class TargetMatcher:
    """
    TargetMatcher is a component for advanced rule-based text extraction. Rules are defined using
    `medspacy.target_matcher.TargetRule`.

    A `TargetMatcher` will use the added `TargetRule` objects to identify matches in the text and apply labels or modify
    attributes. It will either modify the input spaCy `Doc` with the result or return the spans as a list.

    In addition to extracting spans of text and setting labels, TargetRules can also define setting custom attributes
    and metadata. Additionally, each resulting span has an attribute span._.target_rule which maps a span to the
    TargetRule which set it.
    """

    def __init__(
        self,
        nlp: Language,
        name: str = "medspacy_target_matcher",
        rules: Optional[str] = None,
        phrase_matcher_attr: str = "LOWER",
        result_type: Union[Literal["ents", "group"], None] = "ents",
        span_group_name: str = "medspacy_spans",
        prune: bool = True
    ):
        """
        Creates a new TargetMatcher.

        Args:
            nlp: A spaCy Language model.
            name: The name of the TargetMatcher component
            rules: An optional filepath containing a JSON of TargetRules. If None, then no rules will be added. Default
                None.
            phrase_matcher_attr: The token attribute to use for PhraseMatcher for rules where `pattern` is None. Default
                is 'LOWER'.
            result_type: "ents" (default), "group", or None. Determines where TargetMatcher will put the matched spans.
                "ents" will add spans to doc.ents and add to any existing entities. If conflicts appear, existing
                entities will take precedence. "group" will add spans to doc.spans under the specified group name. None
                will return the list of spans rather than saving to the Doc.
            span_group_name: The name of the span group used to store results when result_type is "group". Default is
                "medspacy_spans".
            prune: Forwarded to the underlying MedspacyMatcher as its `prune` argument. Default is True.
        """
        self.nlp = nlp
        self.name = name
        self._result_type = result_type
        self._span_group_name = span_group_name
        self._prune = prune

        # The matcher must exist before any rules are loaded, because `add` forwards to it.
        self.__matcher = MedspacyMatcher(
            nlp, name=name, phrase_matcher_attr=phrase_matcher_attr, prune=self._prune
        )

        if rules:
            self.add(TargetRule.from_json(rules))

    @property
    def rules(self) -> List[TargetRule]:
        """
        Gets the list of TargetRules for the TargetMatcher.

        Returns:
            A list of TargetRules.
        """
        return self.__matcher.rules

    @property
    def labels(self) -> Set[str]:
        """
        Gets the set of labels for the TargetMatcher. Based on rules added to the TargetMatcher.

        Returns:
            A set of all labels that the TargetMatcher can produce.
        """
        return self.__matcher.labels

    @property
    def result_type(self) -> Union[str, None]:
        """
        The result type of the TargetMatcher. "ents" indicates that calling TargetMatcher will store the results in
        doc.ents, "group" indicates that the results will be stored in the span group indicated by `span_group_name`,
        and None indicates that spans will be returned in a list.

        Returns:
            The result type string.
        """
        return self._result_type

    @result_type.setter
    def result_type(self, result_type: Optional[Literal["ents", "group"]]):
        # Any falsy value (None, "") means "return the spans"; otherwise only the two named modes are valid.
        if result_type and result_type not in ("ents", "group"):
            raise ValueError('result_type must be "ents", "group" or None.')
        self._result_type = result_type

    @property
    def span_group_name(self) -> str:
        """
        The name of the span group used by this component. If `result_type` is "group", calling this component will
        place results in the span group with this name.

        Returns:
            The span group name.
        """
        return self._span_group_name

    @span_group_name.setter
    def span_group_name(self, name: str):
        if not name or not isinstance(name, str):
            raise ValueError("Span group name must be a string.")
        self._span_group_name = name

    def add(self, rules: Union[TargetRule, Iterable[TargetRule]]):
        """
        Adds a single TargetRule or a list of TargetRules to the TargetMatcher.

        Args:
            rules: A single TargetRule or a collection of TargetRules.

        Raises:
            TypeError: If any element of `rules` is not a TargetRule.
        """
        if isinstance(rules, TargetRule):
            rules = [rules]
        else:
            # Materialize once so one-shot iterables (e.g. generators) survive validation
            # and can still be handed to the matcher afterwards.
            rules = list(rules)
        for rule in rules:
            if not isinstance(rule, TargetRule):
                raise TypeError(f"Rules must be TargetRule, not {type(rule)}")
        self.__matcher.add(rules)

    def __call__(self, doc: Doc) -> Union[Doc, List[Span]]:
        """
        Calls TargetMatcher on a Doc. By default and when `result_type` is "ents", adds results to doc.ents. If
        `result_type` is "group", adds results to the span group specified by `span_group_name`. If `result_type` is
        None, then returns a list of the matched Spans.

        Args:
            doc: The spaCy Doc to process.

        Returns:
            Returns a modified `doc` when `TargetMatcher.result_type` is "ents" or "group". Returns a list of
            `Span` objects if `TargetMatcher.result_type` is None.

        Raises:
            AttributeError: Propagated from setting a rule attribute on `span._` (the original code caught and
                re-raised exactly this).
            ValueError: If `result_type` is neither falsy, "ents" nor "group" (case-insensitive).
        """
        matches = self.__matcher(doc)
        spans = []
        for rule_id, start, end in matches:
            rule = self.__matcher.rule_map[self.nlp.vocab.strings[rule_id]]
            span = Span(doc, start=start, end=end, label=rule.category)
            span._.target_rule = rule
            if rule.attributes is not None:
                for attribute, value in rule.attributes.items():
                    # Errors from the extension namespace propagate unchanged.
                    setattr(span._, attribute, value)
            spans.append(span)

        if not self.result_type:
            return spans

        result_type = self.result_type.lower()
        if result_type == "ents":
            # Spans are added one at a time so a single conflict only skips that span.
            for span in spans:
                try:
                    doc.ents += (span,)
                except ValueError:
                    # spaCy will raise a value error if the token in span are already part of an entity (i.e., as part
                    # of an upstream component). In that case, let the existing span supersede this one.
                    warnings.warn(
                        f'The result ""{span}"" conflicts with a pre-existing entity in doc.ents. This result has been '
                        f"skipped.",
                        RuntimeWarning,
                    )
            return doc
        if result_type == "group":
            if self.span_group_name in doc.spans.keys():
                doc.spans[self.span_group_name] += spans
            else:
                doc.spans[self.span_group_name] = spans
            return doc

        # Previously this fell through and returned None, violating the declared return type.
        raise ValueError('result_type must be "ents", "group" or None.')
labels property

Gets the list of labels for the TargetMatcher. Based on rules added to the TargetMatcher.

Returns:

Type Description
Set[str]

A set of all labels that the TargetMatcher can produce.

result_type property writable

The result type of the TargetMatcher. "ents" indicates that calling TargetMatcher will store the results in doc.ents, "group" indicates that the results will be stored in the span group indicated by span_group_name, and None indicates that spans will be returned in a list.

Returns:

Type Description
Union[str, None]

The result type string.

rules property

Gets the list of TargetRules for the TargetMatcher.

Returns:

Type Description
List[TargetRule]

A list of TargetRules.

span_group_name property writable

The name of the span group used by this component. If result_type is "group", calling this component will place results in the span group with this name.

Returns:

Type Description
str

The span group name.

__call__(doc)

Calls TargetMatcher on a Doc. By default and when result_type is "ents", adds results to doc.ents. If result_type is "group", adds results to the span group specified by span_group_name. If result_type is None, then returns a list of the matched Spans.

Parameters:

Name Type Description Default
doc Doc

The spaCy Doc to process.

required

Returns:

Type Description
Union[Doc, List[Span]]

Returns a modified doc when TargetMatcher.result_type is "ents" or "group". Returns a list of

Union[Doc, List[Span]]

Span objects if TargetMatcher.result_type is None.

Source code in medspacy/target_matcher/target_matcher.py
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
def __call__(self, doc: Doc) -> Union[Doc, List[Span]]:
    """
    Calls TargetMatcher on a Doc. By default and when `result_type` is "ents", adds results to doc.ents. If
    `result_type` is "group", adds results to the span group specified by `span_group_name`. If `result_type` is
    None, then returns a list of the matched Spans.

    Args:
        doc: The spaCy Doc to process.

    Returns:
        Returns a modified `doc` when `TargetMatcher.result_type` is "ents" or "group". Returns a list of
        `Span` objects if `TargetMatcher.result_type` is None.

    Raises:
        AttributeError: Propagated from setting a rule attribute on `span._` (the original code caught and
            re-raised exactly this).
        ValueError: If `result_type` is neither falsy, "ents" nor "group" (case-insensitive).
    """
    matches = self.__matcher(doc)
    spans = []
    for rule_id, start, end in matches:
        rule = self.__matcher.rule_map[self.nlp.vocab.strings[rule_id]]
        span = Span(doc, start=start, end=end, label=rule.category)
        span._.target_rule = rule
        if rule.attributes is not None:
            for attribute, value in rule.attributes.items():
                # Errors from the extension namespace propagate unchanged.
                setattr(span._, attribute, value)
        spans.append(span)

    if not self.result_type:
        return spans

    result_type = self.result_type.lower()
    if result_type == "ents":
        # Spans are added one at a time so a single conflict only skips that span.
        for span in spans:
            try:
                doc.ents += (span,)
            except ValueError:
                # spaCy will raise a value error if the token in span are already part of an entity (i.e., as part
                # of an upstream component). In that case, let the existing span supersede this one.
                warnings.warn(
                    f'The result ""{span}"" conflicts with a pre-existing entity in doc.ents. This result has been '
                    f"skipped.",
                    RuntimeWarning,
                )
        return doc
    if result_type == "group":
        if self.span_group_name in doc.spans.keys():
            doc.spans[self.span_group_name] += spans
        else:
            doc.spans[self.span_group_name] = spans
        return doc

    # Previously this fell through and returned None, violating the declared return type.
    raise ValueError('result_type must be "ents", "group" or None.')
__init__(nlp, name='medspacy_target_matcher', rules=None, phrase_matcher_attr='LOWER', result_type='ents', span_group_name='medspacy_spans', prune=True)

Creates a new TargetMatcher.

Parameters:

Name Type Description Default
nlp Language

A spaCy Language model.

required
name str

The name of the TargetMatcher component

'medspacy_target_matcher'
rules Optional[str]

An optional filepath containing a JSON of TargetRules. If None, then no rules will be added. Default None.

None
phrase_matcher_attr str

The token attribute to use for PhraseMatcher for rules where pattern is None. Default is 'LOWER'.

'LOWER'
result_type Union[Literal['ents', 'group'], None]

"ents" (default), "group", or None. Determines where TargetMatcher will put the matched spans. "ents" will add spans to doc.ents and add to any existing entities. If conflicts appear, existing entities will take precedence. "group" will add spans to doc.spans under the specified group name. None will return the list of spans rather than saving to the Doc.

'ents'
span_group_name str

The name of the span group used to store results when result_type is "group". Default is "medspacy_spans".

'medspacy_spans'
Source code in medspacy/target_matcher/target_matcher.py
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
def __init__(
    self,
    nlp: Language,
    name: str = "medspacy_target_matcher",
    rules: Optional[str] = None,
    phrase_matcher_attr: str = "LOWER",
    result_type: Union[Literal["ents", "group"], None] = "ents",
    span_group_name: str = "medspacy_spans",
    prune: bool = True
):
    """
    Creates a new TargetMatcher.

    Args:
        nlp: A spaCy Language model.
        name: The name of the TargetMatcher component
        rules: An optional filepath containing a JSON of TargetRules. If None, then no rules will be added. Default
            None.
        phrase_matcher_attr: The token attribute to use for PhraseMatcher for rules where `pattern` is None. Default
            is 'LOWER'.
        result_type: "ents" (default), "group", or None. Determines where TargetMatcher will put the matched spans.
            "ents" will add spans to doc.ents and add to any existing entities. If conflicts appear, existing
            entities will take precedence. "group" will add spans to doc.spans under the specified group name. None
            will return the list of spans rather than saving to the Doc.
        span_group_name: The name of the span group used to store results when result_type is "group". Default is
            "medspacy_spans".
        prune: Forwarded to the underlying MedspacyMatcher as its `prune` argument. Default is True.
    """
    self.nlp = nlp
    self.name = name
    self._result_type = result_type
    self._span_group_name = span_group_name
    self._prune = prune

    # The matcher must exist before any rules are loaded, because `add` forwards to it.
    # Loading the rules first raised AttributeError whenever `rules` was given.
    self.__matcher = MedspacyMatcher(
        nlp, name=name, phrase_matcher_attr=phrase_matcher_attr, prune=self._prune
    )

    if rules:
        self.add(TargetRule.from_json(rules))
add(rules)

Adds a single TargetRule or a list of TargetRules to the TargetMatcher.

Parameters:

Name Type Description Default
rules Union[TargetRule, Iterable[TargetRule]]

A single TargetRule or a collection of TargetRules.

required
Source code in medspacy/target_matcher/target_matcher.py
121
122
123
124
125
126
127
128
129
130
131
132
133
def add(self, rules: Union[TargetRule, Iterable[TargetRule]]):
    """
    Adds a single TargetRule or a list of TargetRules to the TargetMatcher.

    Args:
        rules: A single TargetRule or a collection of TargetRules.

    Raises:
        TypeError: If any element of `rules` is not a TargetRule.
    """
    if isinstance(rules, TargetRule):
        rules = [rules]
    else:
        # Materialize once so one-shot iterables (e.g. generators) survive validation
        # and can still be handed to the matcher afterwards.
        rules = list(rules)
    for rule in rules:
        if not isinstance(rule, TargetRule):
            # Single formatted message; the old two-argument form rendered as a tuple.
            raise TypeError(f"Rules must be TargetRule, not {type(rule)}")
    self.__matcher.add(rules)

target_rule

TargetRule

Bases: BaseRule

TargetRule defines rules for extracting entities from text using the TargetMatcher.

Source code in medspacy/target_matcher/target_rule.py
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
class TargetRule(BaseRule):
    """
    TargetRule defines rules for extracting entities from text using the TargetMatcher.
    """

    # Keys accepted by `from_dict` and exported by `to_dict`. `on_match` is not listed, so it is rejected
    # by `from_dict` and never written by `to_dict`.
    _ALLOWED_KEYS = {
        "literal",
        "pattern",
        "category",
        "metadata",
        "attributes",
    }

    def __init__(
        self,
        literal: str,
        category: str,
        pattern: Optional[Union[List[Dict[str, str]], str]] = None,
        on_match: Optional[
            Callable[[Matcher, Doc, int, List[Tuple[int, int, int]]], Any]
        ] = None,
        attributes: Optional[Dict[str, Any]] = None,
        metadata: Optional[Dict[Any, Any]] = None,
    ):
        """
        Creates a new TargetRule.

        Args:
            literal: The string representation of a concept. If `pattern` is None, this string will be lower-cased and
                matched to the lower-case string. If `pattern` is not None, this argument will not be used for matching
                but can be used as a reference as the rule name.
            category: The semantic class of the matched span. This corresponds to the `label_` attribute of an entity.
            pattern: A list or string to use as a spaCy pattern rather than `literal`. If a list, will use spaCy
                token-based pattern matching to match using token attributes. If a string, will use medspaCy's
                RegexMatcher. If None, will use `literal` as the pattern for phrase matching. For more information, see
                https://spacy.io/usage/rule-based-matching.
            on_match: An optional callback function or other callable which takes 4 arguments: `(matcher, doc, i,
                matches)`. For more information, see https://spacy.io/usage/rule-based-matching#on_match
            attributes: Optional custom attribute names to set for a Span matched by the rule. These attribute
                names are stored under Span._.[attribute_name]. For example, if `attributes={'is_historical':True}`,
                then any spans matched by this rule will have span._.is_historical = True
            metadata: Optional dictionary of any extra metadata.
        """
        super().__init__(literal, category, pattern, on_match, metadata)
        self.attributes = attributes
        # Starts unset; NOTE(review): presumably assigned when the rule is registered with a matcher - confirm.
        self._rule_id = None

    @classmethod
    def from_json(cls, filepath: str) -> List[TargetRule]:
        """Reads a list of TargetRules from a JSON file.

        The file must contain a JSON object whose "target_rules" key holds a list of rule dictionaries.

        Args:
            filepath: Path of the .json file containing the target rules.

        Returns:
            A list of TargetRule objects, one per entry under "target_rules".

        Raises:
            KeyError: If the JSON object has no "target_rules" key.
            ValueError: If a rule dictionary contains keys that `from_dict` does not accept.
        """
        import json

        # JSON is exchanged as UTF-8; do not depend on the platform's default encoding.
        with open(filepath, encoding="utf-8") as file:
            target_data = json.load(file)
        return [cls.from_dict(data) for data in target_data["target_rules"]]

    @classmethod
    def from_dict(cls, rule_dict: Dict) -> TargetRule:
        """Creates a TargetRule from a dictionary. Used when reading from a json file.

        Args:
            rule_dict: The dictionary to convert. Its items are passed as keyword arguments to the constructor.

        Returns:
            The TargetRule created from the dictionary.

        Raises:
            ValueError: If the dictionary contains any key that is not in `_ALLOWED_KEYS`.
        """
        invalid_keys = set(rule_dict).difference(cls._ALLOWED_KEYS)
        if invalid_keys:
            msg = (
                "JSON object contains invalid keys: {0}.\n"
                "Must be one of: {1}".format(invalid_keys, cls._ALLOWED_KEYS)
            )
            raise ValueError(msg)
        # Instantiate through `cls` so that subclasses receive instances of their own type.
        return cls(**rule_dict)

    @classmethod
    def to_json(cls, target_rules: List[TargetRule], filepath: str):
        """Writes TargetRules to a json file.

        The rules are serialized with `to_dict` and stored as a list under the "target_rules" key.

        Args:
            target_rules: A list of TargetRules that will be written to a file.
            filepath: Path of the .json file to write the target rules to.
        """
        import json

        data = {"target_rules": [rule.to_dict() for rule in target_rules]}
        with open(filepath, "w") as file:
            json.dump(data, file, indent=4)

    def to_dict(self):
        """Converts the TargetRule to a python dictionary. Used when writing target rules to a json file.

        Only the attributes named in `_ALLOWED_KEYS` are exported, and attributes whose value is None are left
        out. Keys are emitted in sorted order so the serialized output is stable from run to run.

        Returns:
            The dictionary containing the TargetRule info.
        """
        rule_dict = {}
        # `_ALLOWED_KEYS` is a set, so its iteration order is not stable between interpreter runs;
        # sorting keeps the key order (and the JSON written by `to_json`) reproducible.
        for key in sorted(self._ALLOWED_KEYS):
            value = self.__dict__.get(key)
            if value is not None:
                rule_dict[key] = value
        return rule_dict

    def __repr__(self):
        return f"""TargetRule(literal="{self.literal}", category="{self.category}", pattern={self.pattern}, attributes={self.attributes}, on_match={self.on_match})"""
__init__(literal, category, pattern=None, on_match=None, attributes=None, metadata=None)

Creates a new TargetRule.

Parameters:

Name Type Description Default
literal str

The string representation of a concept. If pattern is None, this string will be lower-cased and matched to the lower-case string. If pattern is not None, this argument will not be used for matching but can be used as a reference as the rule name.

required
category str

The semantic class of the matched span. This corresponds to the label_ attribute of an entity.

required
pattern Optional[Union[List[Dict[str, str]], str]]

A list or string to use as a spaCy pattern rather than literal. If a list, will use spaCy token-based pattern matching to match using token attributes. If a string, will use medspaCy's RegexMatcher. If None, will use literal as the pattern for phrase matching. For more information, see https://spacy.io/usage/rule-based-matching.

None
on_match Optional[Callable[[Matcher, Doc, int, List[Tuple[int, int, int]]], Any]]

An optional callback function or other callable which takes 4 arguments: (matcher, doc, i, matches). For more information, see https://spacy.io/usage/rule-based-matching#on_match

None
attributes Optional[Dict[str, Any]]

Optional custom attribute names to set for a Span matched by the rule. These attribute names are stored under Span._.[attribute_name]. For example, if attributes={'is_historical': True}, then any span matched by this rule will have span._.is_historical = True.

None
metadata Optional[Dict[Any, Any]]

Optional dictionary of any extra metadata.

None
Source code in medspacy/target_matcher/target_rule.py
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
def __init__(
    self,
    literal: str,
    category: str,
    pattern: Optional[Union[List[Dict[str, str]], str]] = None,
    on_match: Optional[
        Callable[[Matcher, Doc, int, List[Tuple[int, int, int]]], Any]
    ] = None,
    attributes: Optional[Dict[str, Any]] = None,
    metadata: Optional[Dict[Any, Any]] = None,
):
    """
    Creates a new TargetRule.

    Args:
        literal: The string representation of a concept. When `pattern` is None, this string is lower-cased and
            matched against the lower-cased text. When `pattern` is given, `literal` is not used for matching but
            can still serve as a readable name for the rule.
        category: The semantic class of the matched span, corresponding to the `label_` attribute of an entity.
        pattern: A list or string to use as a spaCy pattern instead of `literal`. A list is used for spaCy
            token-based pattern matching on token attributes; a string is used with medspaCy's RegexMatcher;
            None means `literal` is used for phrase matching. For more information, see
            https://spacy.io/usage/rule-based-matching.
        on_match: An optional callback function or other callable which takes 4 arguments: `(matcher, doc, i,
            matches)`. For more information, see https://spacy.io/usage/rule-based-matching#on_match
        attributes: Optional custom attribute names to set for a Span matched by this rule. These attribute
            names are stored under Span._.[attribute_name]. For example, if `attributes={'is_historical':True}`,
            then any span matched by this rule will have span._.is_historical = True
        metadata: Optional dictionary of any extra metadata.
    """
    # literal, category, pattern, on_match and metadata are forwarded to the base rule's initializer.
    super().__init__(literal, category, pattern, on_match, metadata)
    # Starts unset; NOTE(review): presumably assigned when the rule is registered with a matcher - confirm.
    self._rule_id = None
    self.attributes = attributes
from_dict(rule_dict) classmethod

Reads a dictionary into a TargetRule. Used when reading from a JSON file.

Parameters:

Name Type Description Default
rule_dict Dict

the dictionary to convert

required

Returns:

Type Description
TargetRule

The TargetRule created from the dictionary.

Raises:

Type Description
ValueError

If the dictionary contains keys other than those listed in TargetRule._ALLOWED_KEYS.

Source code in medspacy/target_matcher/target_rule.py
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
@classmethod
def from_dict(cls, rule_dict: Dict) -> TargetRule:
    """Creates a TargetRule from a dictionary. Used when reading from a json file.

    Args:
        rule_dict: The dictionary to convert. Its items are passed as keyword arguments to the constructor.

    Returns:
        The TargetRule created from the dictionary.

    Raises:
        ValueError: If the dictionary contains any key that is not in `_ALLOWED_KEYS`.
    """
    invalid_keys = set(rule_dict).difference(cls._ALLOWED_KEYS)
    if invalid_keys:
        msg = (
            "JSON object contains invalid keys: {0}.\n"
            "Must be one of: {1}".format(invalid_keys, cls._ALLOWED_KEYS)
        )
        raise ValueError(msg)
    # Instantiate through `cls` so that subclasses receive instances of their own type.
    return cls(**rule_dict)
from_json(filepath) classmethod

Read in a list of target rules from a JSON file.

Parameters:

Name Type Description Default
filepath str

The .json file containing the target rules.

required

Returns:

Name Type Description
context_item List[TargetRule]

A list of TargetRule objects.

Raises:

Type Description
KeyError

If the JSON object has no "target_rules" key. (A rule containing keys other than those accepted by TargetRule.from_dict raises ValueError instead.)

Source code in medspacy/target_matcher/target_rule.py
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
@classmethod
def from_json(cls, filepath: str) -> List[TargetRule]:
    """Reads a list of TargetRules from a JSON file.

    The file must contain a JSON object whose "target_rules" key holds a list of rule dictionaries.

    Args:
        filepath: Path of the .json file containing the target rules.

    Returns:
        A list of TargetRule objects, one per entry under "target_rules".

    Raises:
        KeyError: If the JSON object has no "target_rules" key.
        ValueError: If a rule dictionary contains keys that `from_dict` does not accept.
    """
    import json

    # JSON is exchanged as UTF-8; do not depend on the platform's default encoding.
    with open(filepath, encoding="utf-8") as file:
        target_data = json.load(file)
    # Build through `cls` so that subclasses produce instances of their own type.
    return [cls.from_dict(data) for data in target_data["target_rules"]]
to_dict()

Converts TargetRules to a python dictionary. Used when writing target rules to a json file.

Returns:

Type Description

The dictionary containing the TargetRule info.

Source code in medspacy/target_matcher/target_rule.py
119
120
121
122
123
124
125
126
127
128
129
130
def to_dict(self):
    """Converts the TargetRule to a python dictionary. Used when writing target rules to a json file.

    Only the attributes named in `_ALLOWED_KEYS` are exported, and attributes whose value is None are left
    out. Keys are emitted in sorted order so the serialized output is stable from run to run.

    Returns:
        The dictionary containing the TargetRule info.
    """
    rule_dict = {}
    # `_ALLOWED_KEYS` is a set, so its iteration order is not stable between interpreter runs;
    # sorting keeps the key order (and any JSON produced from this dict) reproducible.
    for key in sorted(self._ALLOWED_KEYS):
        value = self.__dict__.get(key)
        if value is not None:
            rule_dict[key] = value
    return rule_dict
to_json(target_rules, filepath) classmethod

Writes TargetRules to a JSON file.

Parameters:

Name Type Description Default
target_rules List[TargetRule]

a list of TargetRules that will be written to a file.

required
filepath str

The .json file to write the target rules to.

required
Source code in medspacy/target_matcher/target_rule.py
105
106
107
108
109
110
111
112
113
114
115
116
117
@classmethod
def to_json(cls, target_rules: List[TargetRule], filepath: str):
    """Writes TargetRules to a json file.

    Each rule is converted with its `to_dict` method and the resulting dictionaries are stored as a list
    under the "target_rules" key.

    Args:
        target_rules: The list of TargetRules to write.
        filepath: Path of the .json file to write the target rules to.
    """
    import json

    # Serialize every rule before the file is opened, so a failing rule does not leave a partial file.
    serialized = []
    for rule in target_rules:
        serialized.append(rule.to_dict())

    with open(filepath, "w") as out_file:
        json.dump({"target_rules": serialized}, out_file, indent=4)

util

This module will contain helper functions and classes for common clinical processing tasks which will be used in many medspaCy components.

_build_pipe_names(enable, disable=None)

Implement logic based on the pipenames defined in 'enable' and 'disable'. If enable and disable are both None, then it will load the default pipenames. Otherwise, will allow custom selection of components.

Parameters:

Name Type Description Default
enable Union[str, Iterable[str]]

"all" loads components from ALL_PIPE_NAMES. "default" loads components from DEFAULT_PIPE_NAMES. Otherwise, loads the given list of component names.

required
disable Optional[Iterable[str]]

The optional list of components to disable. Set difference of enable.

None

Returns:

Type Description
Tuple[Set[str], Set[str]]

A complete list of enabled and disabled components, with all components listed and empty intersection.

Source code in medspacy/util.py
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
def _build_pipe_names(
    enable: Union[str, Iterable[str]], disable: Optional[Iterable[str]] = None
) -> Tuple[Set[str], Set[str]]:
    """
    Implement logic based on the pipenames defined in 'enable' and 'disable'. If enable and disable are both None,
    then it will load the default pipenames. Otherwise, will allow custom selection of components.

    Args:
        enable: "all" loads components from ALL_PIPE_NAMES. "default" loads components from DEFAULT_PIPE_NAMES.
            Otherwise, loads he list of components as components.
        disable: The optional list of components to disable. Set difference of enable.

    Returns:
        A complete list of enabled and disabled components, with all components listed and empty intersection.
    """
    if not enable:
        raise ValueError(
            "Enable cannot be none, please specify 'all', 'default' or a list of components."
        )

    # cannot allow lists of enabled and disabled components, what happens if "context" is both enabled and disabled?
    if (not isinstance(enable, str) and isinstance(enable, Iterable)) and isinstance(
        disable, Iterable
    ):
        raise ValueError("Both enable and disable cannot be collections of components.")

    # set which components are enabled first
    if enable == "all":
        enable = ALL_PIPE_NAMES
    elif enable == "default":
        enable = DEFAULT_PIPE_NAMES
    else:
        enable = set(enable)

    # then find the difference with deactivated components
    if disable is not None:
        enable = enable.difference(set(disable))
    else:
        disable = set()  # otherwise disable is empty

    return enable, disable

load(model='default', medspacy_enable='default', medspacy_disable=None, language_code='en', load_rules=True, quickumls_path=None, **model_kwargs)

Load a spaCy language object with medSpaCy pipeline components. By default, the base model will be a blank 'en' model with the following components:

- "medspacy_tokenizer": A customized, more aggressive tokenizer than the default spaCy tokenizer. This is set to nlp.tokenizer and is not loaded as a pipeline component.
- "medspacy_pyrush": PyRuSH Sentencizer for sentence splitting
- "medspacy_target_matcher": TargetMatcher for extended pattern matching
- "medspacy_context": ConText for attribute assertion
- "medspacy_quickumls": QuickUMLS for UMLS concept mapping

Parameters:

- model: The base spaCy model to load. If 'default', will instantiate from a blank 'en' model. If it is a spaCy language model, then it will simply add medspaCy components to the existing pipeline. If it is a string other than 'default', passes the string to spacy.load(model, **model_kwargs).
- medspacy_enable: Specifies which components to enable in the medspacy pipeline. If "default", will load all components found in DEFAULT_PIPE_NAMES. These represent the simplest components used in a clinical NLP pipeline: tokenization, sentence detection, concept identification, and ConText. If "all", all components in medspaCy will be loaded. If a collection of strings, the components specified will be loaded.
- medspacy_disable: A collection of component names to exclude. Can only be used when medspacy_enable is a string such as "all", not a collection of component names.
- language_code: Language code to use (ISO code) as a default for loading additional resources. See documentation and also the /resources directory to see which resources might be available in each language. Default is "en" for English.
- load_rules: Whether to include default rules for available components. If True, sectionizer and context will both be loaded with default rules. Default is True.
- quickumls_path: Path to QuickUMLS dictionaries if it is included in the pipeline.
- model_kwargs: Optional model keyword arguments to pass to spacy.load().

Returns:

Type Description

A spaCy Language object containing the specified medspacy components.

Source code in medspacy/util.py
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
def load(
    model: Union[Literal["default"], str, Language] = "default",
    medspacy_enable: Union[Literal["all", "default"], Iterable[str]] = "default",
    medspacy_disable: Optional[Iterable[str]] = None,
    language_code: str = "en",
    load_rules: bool = True,
    quickumls_path: Optional[str] = None,
    **model_kwargs,
):
    """Load a spaCy language object with medSpaCy pipeline components.

    By default, the base model will be a blank 'en' model with the
    following components:
        - "medspacy_tokenizer": A customized, more aggressive tokenizer than the default spaCy tokenizer. This is set to
            `nlp.tokenizer` and is not loaded as a pipeline component.
        - "medspacy_pyrush": PyRuSH Sentencizer for sentence splitting
        - "medspacy_target_matcher": TargetMatcher for extended pattern matching
        - "medspacy_context": ConText for attribute assertion
        - "medspacy_quickumls": QuickUMLS for UMLS concept mapping

    Args:
        model: The base spaCy model to load. If 'default', will instantiate from a blank 'en' model. If it is a spaCy
            language model, then it will simply add medspaCy components to the existing pipeline. If it is a string
            other than 'default', passes the string to spacy.load(model, **model_kwargs).
        medspacy_enable: Specifies which components to enable in the medspacy pipeline. If "default", will load all components
            found in `DEFAULT_PIPE_NAMES`. These represent the simplest components used in a clinical NLP pipeline:
            tokenization, sentence detection, concept identification, and ConText. If "all", all components in medspaCy
            will be loaded. If a collection of strings, the components specified will be loaded.
        medspacy_disable: A collection of component names to exclude. Can only be combined with a string value of
            `medspacy_enable` (such as "all"), not with a collection of component names.
        language_code: Language code to use (ISO code) as a default for loading additional resources.  See documentation
            and also the /resources directory to see which resources might be available in each language.
            Default is "en" for English.
        load_rules: Whether to include default rules for available components. If True, sectionizer and context will
            both be loaded with default rules. Default is True.
        quickumls_path: Path to QuickUMLS dictionaries if it is included in the pipeline. If None, the demo directory
            returned by `get_quickumls_demo_dir(language_code)` is used.
        model_kwargs: Optional model keyword arguments to pass to spacy.load().

    Returns:
        A spaCy Language object containing the specified medspacy components.

    Raises:
        ValueError: If `model` is neither a string nor a spaCy Language object, or if the enable/disable
            selection is rejected by `_build_pipe_names`.
    """

    medspacy_enable, medspacy_disable = _build_pipe_names(
        medspacy_enable, medspacy_disable
    )

    if model == "default":
        nlp = spacy.blank("en")
    elif isinstance(model, Language):
        nlp = model
    elif isinstance(model, str):
        nlp = spacy.load(model, **model_kwargs)
    else:
        # A single formatted message: passing the type as a second argument would make the
        # exception render as a tuple.
        raise ValueError(
            "model must be 'default', the name of a spaCy model, or a spaCy Language object, "
            f"not {type(model)}"
        )

    if "medspacy_tokenizer" in medspacy_enable:
        from .custom_tokenizer import create_medspacy_tokenizer

        medspacy_tokenizer = create_medspacy_tokenizer(nlp)
        nlp.tokenizer = medspacy_tokenizer

    if "medspacy_preprocessor" in medspacy_enable:
        from .preprocess import Preprocessor

        # The preprocessor wraps whichever tokenizer is active and takes its place on the pipeline.
        preprocessor = Preprocessor(nlp.tokenizer)
        nlp.tokenizer = preprocessor

    if "medspacy_pyrush" in medspacy_enable:
        pyrush_path = path.join(
            Path(__file__).resolve().parents[1], "resources", language_code.lower(), "rush_rules.tsv"
        )
        nlp.add_pipe("medspacy_pyrush", config={"rules_path": pyrush_path})

    if "medspacy_target_matcher" in medspacy_enable:
        nlp.add_pipe("medspacy_target_matcher")

    if "medspacy_quickumls" in medspacy_enable:
        if quickumls_path is None:
            quickumls_path = get_quickumls_demo_dir(language_code)

            print(
                "Loading QuickUMLS resources from a Medspacy-distributed SAMPLE of UMLS data from here: {}".format(
                    quickumls_path
                )
            )

        nlp.add_pipe("medspacy_quickumls", config={"quickumls_fp": quickumls_path})

    # Shared config for the rule-based components. When default rules are not requested,
    # "rules": None is added to the component config.
    rule_config = {"language_code": language_code}
    if load_rules is not True:
        rule_config["rules"] = None

    if "medspacy_context" in medspacy_enable:
        # Each component receives its own copy of the config.
        nlp.add_pipe("medspacy_context", config=dict(rule_config))

    if "medspacy_sectionizer" in medspacy_enable:
        nlp.add_pipe("medspacy_sectionizer", config=dict(rule_config))

    if "medspacy_postprocessor" in medspacy_enable:
        nlp.add_pipe("medspacy_postprocessor")

    if "medspacy_doc_consumer" in medspacy_enable:
        nlp.add_pipe("medspacy_doc_consumer")

    return nlp

tuple_overlaps(a, b)

Calculates whether two tuples overlap. Assumes tuples are sorted to be like spans (start, end)

Parameters:

Name Type Description Default
a Tuple[int, int]

A tuple representing a span (start, end).

required
b Tuple[int, int]

A tuple representing a span (start, end).

required

Returns:

Type Description

Whether the tuples overlap.

Source code in medspacy/util.py
192
193
194
195
196
197
198
199
200
201
202
203
def tuple_overlaps(a: Tuple[int, int], b: Tuple[int, int]) -> bool:
    """
    Calculates whether two tuples overlap. Assumes tuples are sorted to be like spans (start, end)

    The check is symmetric: `tuple_overlaps(a, b)` and `tuple_overlaps(b, a)` always agree, including when one
    span lies strictly inside the other. Spans that merely touch (the end of one equals the start of the other)
    do not overlap.

    Args:
        a: A tuple representing a span (start, end).
        b: A tuple representing a span (start, end).

    Returns:
        Whether the tuples overlap.
    """

    def _has_edge_inside(outer: Tuple[int, int], inner: Tuple[int, int]) -> bool:
        # True when the start or the end of `inner` falls within `outer`.
        return outer[0] <= inner[0] < outer[1] or outer[0] < inner[1] <= outer[1]

    # Testing one direction only misses the case where `b` strictly contains `a`
    # (neither edge of `b` lies inside `a`), so test both directions.
    return _has_edge_inside(a, b) or _has_edge_inside(b, a)

visualization

MedspaCyVisualizerWidget

Source code in medspacy/visualization.py
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
class MedspaCyVisualizerWidget:
    """IPython widget for browsing medspaCy visualizations of a list of docs."""

    def __init__(self, docs, target_span_type: str = "ents", span_group_name: str = "medspacy_spans"):
        """Create an IPython Widget Box displaying medspaCy's visualizers.
        The widget allows selecting visualization style ("Ent", "Dep", or "Both")
        and a slider for selecting the index of docs.

        For more information on IPython widgets, see:
            https://ipywidgets.readthedocs.io/en/latest/index.html

        Parameters:
            docs: A list of docs processed by a medspaCy pipeline
            target_span_type: Passed through to `visualize_ent` as `target_span_type`. Default is "ents".
            span_group_name: Passed through to `visualize_ent` as `span_group_name`. Default is
                "medspacy_spans".

        """

        import ipywidgets as widgets

        self.docs = docs
        self.target_span_type = target_span_type
        self.span_group_name = span_group_name
        # The slider value is the index into `docs` of the document being shown.
        self.slider = widgets.IntSlider(
            value=0,
            min=0,
            max=len(docs) - 1,
            step=1,
            description="Doc:",
            disabled=False,
            continuous_update=False,
            orientation="horizontal",
            readout=True,
            readout_format="d",
        )
        self.radio = widgets.RadioButtons(options=["Ent", "Dep", "Both"])
        self.layout = widgets.Layout(
            display="flex", flex_flow="column", align_items="stretch", width="100%"
        )
        # Re-render whenever the visualization style or the selected document changes.
        self.radio.observe(self._change_handler)
        self.slider.observe(self._change_handler)
        self.next_button = widgets.Button(description="Next")
        self.next_button.on_click(self._on_click_next)
        self.previous_button = widgets.Button(description="Previous")
        self.previous_button.on_click(self._on_click_prev)
        self.output = widgets.Output()
        self.box = widgets.Box(
            [
                widgets.HBox([self.radio, self.previous_button, self.next_button]),
                self.slider,
                self.output,
            ],
            layout=self.layout,
        )

        self.display()
        with self.output:
            self._visualize_doc()

    def display(self):
        """Display the Box widget in the current IPython cell."""
        from IPython.display import display as ipydisplay

        ipydisplay(self.box)

    def _change_handler(self, change):
        """Re-render the current doc inside the output widget when an observed widget changes."""
        with self.output:
            self._visualize_doc()

    def _visualize_doc(self):
        """Clear the output and render the doc selected by the slider in the style selected by the radio."""
        self.output.clear_output()
        doc = self.docs[self.slider.value]
        if self.radio.value.lower() in ("dep", "both"):
            visualize_dep(doc)
        if self.radio.value.lower() in ("ent", "both"):
            visualize_ent(doc, target_span_type=self.target_span_type, span_group_name=self.span_group_name)

    def _on_click_next(self, b):
        """Advance the slider by one doc, stopping at the last doc."""
        if self.slider.value < len(self.docs) - 1:
            self.slider.value += 1

    def _on_click_prev(self, b):
        """Move the slider back by one doc, stopping at the first doc."""
        if self.slider.value > 0:
            self.slider.value -= 1

    def set_docs(self, docs):
        """Replace the list of docs to be visualized and show the first one.

        Parameters:
            docs: The new list of docs processed by a medspaCy pipeline.
        """
        self.docs = docs
        # Go back to the first doc before shrinking the range, so the selected index is
        # always valid for the new list.
        self.slider.value = 0
        self.slider.max = len(docs) - 1
        # `_visualize_doc` takes no arguments; it renders the doc selected by the slider.
        with self.output:
            self._visualize_doc()

__init__(docs, target_span_type='ents', span_group_name='medspacy_spans')

Create an IPython Widget Box displaying medspaCy's visualizers. The widget allows selecting visualization style ("Ent", "Dep", or "Both") and a slider for selecting the index of docs.

For more information on IPython widgets, see: https://ipywidgets.readthedocs.io/en/latest/index.html

Parameters:

Name Type Description Default
docs

A list of docs processed by a medspaCy pipeline

required
Source code in medspacy/visualization.py
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
def __init__(self, docs, target_span_type: str = "ents", span_group_name: str = "medspacy_spans"):
    """Build and show an IPython Widget Box around medspaCy's visualizers.

    The box contains radio buttons for the visualization style ("Ent", "Dep", or "Both"),
    "Previous"/"Next" buttons, a slider selecting the index into docs, and an output area.

    For more information on IPython widgets, see:
        https://ipywidgets.readthedocs.io/en/latest/index.html

    Parameters:
        docs: A list of docs processed by a medspaCy pipeline
        target_span_type: Stored on the widget and passed to the entity visualizer when a doc is
            rendered. Default is "ents".
        span_group_name: Stored on the widget and passed to the entity visualizer when a doc is
            rendered. Default is "medspacy_spans".

    """
    import ipywidgets as widgets

    self.docs = docs
    self.target_span_type = target_span_type
    self.span_group_name = span_group_name

    # Document selector: the slider value is the index of the doc being shown.
    last_index = len(docs) - 1
    self.slider = widgets.IntSlider(
        value=0,
        min=0,
        max=last_index,
        step=1,
        description="Doc:",
        disabled=False,
        continuous_update=False,
        orientation="horizontal",
        readout=True,
        readout_format="d",
    )
    self.radio = widgets.RadioButtons(options=["Ent", "Dep", "Both"])
    self.layout = widgets.Layout(
        display="flex", flex_flow="column", align_items="stretch", width="100%"
    )

    # Any change to the style selector or the slider triggers a re-render.
    for control in (self.radio, self.slider):
        control.observe(self._change_handler)

    # Navigation buttons.
    self.next_button = widgets.Button(description="Next")
    self.next_button.on_click(self._on_click_next)
    self.previous_button = widgets.Button(description="Previous")
    self.previous_button.on_click(self._on_click_prev)

    self.output = widgets.Output()
    controls_row = widgets.HBox([self.radio, self.previous_button, self.next_button])
    self.box = widgets.Box([controls_row, self.slider, self.output], layout=self.layout)

    # Show the box first, then render the initial doc into its output area.
    self.display()
    with self.output:
        self._visualize_doc()

display()

Display the Box widget in the current IPython cell.

Source code in medspacy/visualization.py
308
309
310
311
312
def display(self):
    """Render this widget's Box in the current IPython cell."""
    # Imported locally and referenced through the module so the name does not clash with this method.
    import IPython.display

    IPython.display.display(self.box)

set_docs(docs)

Replace the list of docs to be visualized.

Source code in medspacy/visualization.py
335
336
337
338
def set_docs(self, docs):
    """Replace the list of docs to be visualized and show the first one.

    Parameters:
        docs: The new list of docs processed by a medspaCy pipeline.
    """
    self.docs = docs
    # Go back to the first doc before shrinking the range, so the selected index is
    # always valid for the new list.
    self.slider.value = 0
    self.slider.max = len(docs) - 1
    # `_visualize_doc` takes no arguments; it renders the doc selected by the slider.
    with self.output:
        self._visualize_doc()

_create_color_generator()

Create a generator which will cycle through a list of default matplotlib colors

Source code in medspacy/visualization.py
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
def _create_color_generator():
    """Create a generator which will cycle through a list of
    default matplotlib colors"""
    from itertools import cycle

    colors = [
        "#1f77b4",
        "#ff7f0e",
        "#2ca02c",
        "#d62728",
        "#9467bd",
        "#8c564b",
        "#e377c2",
        "#7f7f7f",
        "#bcbd22",
        "#17becf",
    ]
    return cycle(colors)

visualize_dep(doc, jupyter=True)

Create a dependency-style visualization for ConText targets and modifiers in doc. This will show the relationships between entities in doc and contextual modifiers.

Parameters:

Name Type Description Default
doc Doc

The spacy Doc to visualize.

required
jupyter bool

Whether it is being rendered in a jupyter notebook.

True

Returns:

Type Description
str

The visualization.

Source code in medspacy/visualization.py
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
def visualize_dep(doc: Doc, jupyter: bool = True) -> str:
    """
    Render the ConText targets and modifiers of a doc as a displaCy dependency-style diagram.

    Every target and modifier is collapsed into a single tagged "word", and each edge of the doc's context graph is
    drawn as an arc between a target and the modifier linked to it.

    Args:
        doc: The spacy Doc to visualize.
        jupyter: Whether it is being rendered in a jupyter notebook.

    Returns:
        The visualization.
    """
    # One display record per token. The mapping keeps every record reachable by its token even after the list
    # below has been compacted by merging multi-token phrases.
    token_data = []
    token_data_mapping = {}
    for token in doc:
        record = {"text": token.text, "tag": "", "index": token.i}
        token_data.append(record)
        token_data_mapping[token] = record

    context_graph = doc._.context_graph

    # Keep only the first target/modifier that claims a given token. The same tokens can show up more than once
    # (e.g. spans from a span group, or a modifier that appears twice), which would otherwise be merged twice.
    claimed_tokens = set()
    targets_and_modifiers = []
    for candidate in list(context_graph.targets) + context_graph.modifiers:
        if isinstance(candidate, Span):
            covered = candidate
        else:
            covered = doc[candidate._start : candidate._end]
        if any(token in claimed_tokens for token in covered):
            continue
        targets_and_modifiers.append(candidate)
        claimed_tokens.update(covered)

    # Tag the first token of every phrase and fold the remaining tokens of the phrase into it.
    for obj in targets_and_modifiers:
        if isinstance(obj, Span):
            head_record = token_data_mapping[obj[0]]
            head_record["tag"] = obj.label_
            phrase = obj if len(obj) > 1 else None
        else:
            span_tup = obj.modifier_span
            head_record = token_data_mapping[doc[span_tup[0]]]
            head_record["tag"] = obj.category
            if span_tup[1] - span_tup[0] > 1:
                phrase = doc[span_tup[0] : span_tup[1]]
            else:
                phrase = None

        if phrase is None:
            # Single-token phrase: nothing to merge.
            continue

        position = head_record["index"]
        for trailing_token in phrase[1:]:
            # Append the text to the first word's record and drop the record that follows it in the list.
            head_record["text"] += " " + trailing_token.text
            token_data.pop(position + 1)
        # Everything after the merged phrase moved left by the number of removed records.
        shift = len(phrase) - 1
        for later_record in token_data[position + 1 :]:
            later_record["index"] -= shift

    # Gather the edges between targets and modifiers
    arcs = []
    for target, modifier in context_graph.edges:
        target_data = token_data_mapping[target[0]]
        modifier_bounds = modifier.modifier_span
        modifier_data = token_data_mapping[doc[modifier_bounds[0]]]
        left_index, right_index = sorted((target_data["index"], modifier_data["index"]))
        if target > doc[modifier_bounds[0] : modifier_bounds[1]]:
            direction = "right"
        else:
            direction = "left"
        arcs.append(
            {
                "start": left_index,
                "end": right_index,
                "label": modifier.category,
                "dir": direction,
            }
        )

    dep_data = {"words": token_data, "arcs": arcs}
    return displacy.render(dep_data, manual=True, jupyter=jupyter)

visualize_ent(doc, context=True, sections=True, jupyter=True, colors=None, target_span_type='ents', span_group_name='medspacy_spans')

Creates a NER-style visualization for targets and modifiers in Doc.

Parameters:

Name Type Description Default
doc Doc

A spacy doc to visualize.

required
context bool

Whether to display the modifiers generated by medSpaCy's cycontext. If the doc has not been processed by context, this will be automatically changed to False. Default True.

True
sections bool

Whether to display the section titles generated by medSpaCy's sectionizer (still in development). If the doc has not been processed by sectionizer, this will be automatically changed to False. This may also have some overlap with cycontext, in which case duplicate spans will be displayed. Default True.

True
jupyter bool

If True, will render directly in a Jupyter notebook. If False, will return the HTML. Default True.

True
colors Dict[str, str]

An optional dictionary which maps labels of targets and modifiers to color strings to be rendered. If None, will create a generator which cycles through the default matplotlib colors for ent and modifier labels and uses a light gray for section headers. Default None.

None

Returns:

Type Description
str

The visualization.

Source code in medspacy/visualization.py
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
def visualize_ent(
    doc: Doc,
    context: bool = True,
    sections: bool = True,
    jupyter: bool = True,
    colors: Dict[str, str] = None,
    target_span_type: str = "ents",
    span_group_name: str = "medspacy_spans"
) -> str:
    """
    Creates a NER-style visualization for targets and modifiers in Doc.

    Args:
        doc: A spacy doc to visualize.
        context: Whether to display the modifiers generated by medSpaCy's cycontext. If the doc has not been processed
            by context, this will be automatically changed to False. Default True.
        sections: Whether to display the section titles generated by medSpaCy's sectionizer (still in development). If
            the doc has not been processed by sectionizer, this will be automatically changed to False. This may also
            have some overlap with cycontext, in which case duplicate spans will be displayed. Default True.
        jupyter: If True, will render directly in a Jupyter notebook. If False, will return the HTML. Default True.
        colors: An optional dictionary which maps labels of targets and modifiers to color strings to be rendered. If
            None, will create a generator which cycles through the default matplotlib colors for ent and modifier labels
            and uses a light gray for section headers. Default None.
        target_span_type: Where the target spans are read from. "ents" uses `doc.ents`; "group" uses the span group
            `doc.spans[span_group_name]`. Default "ents".
        span_group_name: The name of the span group holding the targets when `target_span_type` is "group". Default
            "medspacy_spans".

    Returns:
        The visualization.

    Raises:
        ValueError: If `target_span_type` is neither "ents" nor "group".
    """
    # Make sure that doc has the custom medSpaCy attributes registered
    if not hasattr(doc._, "context_graph"):
        context = False
    if not hasattr(doc._, "sections"):
        sections = False

    # Each entry is a (displacy entity dict, kind) pair, kind being "ent", "modifier" or "section".
    ents_data = []

    if target_span_type == "ents":
        targets = doc.ents
    elif target_span_type == "group":
        targets = doc.spans[span_group_name]
    else:
        raise ValueError("Target span type must be either ents or group.")

    for target in targets:
        ent_data = {
            "start": target.start_char,
            "end": target.end_char,
            "label": target.label_.upper(),
        }
        ents_data.append((ent_data, "ent"))

    if context:
        # A modifier can be attached to several targets; only draw it once.
        visualized_modifiers = set()
        # Walk the selected targets (not always doc.ents) so that modifiers of span-group targets are shown too.
        for target in targets:
            for modifier in target._.modifiers:
                if modifier in visualized_modifiers:
                    continue
                span = doc[modifier.modifier_span[0]: modifier.modifier_span[1]]
                ent_data = {
                    "start": span.start_char,
                    "end": span.end_char,
                    "label": modifier.category,
                }
                ents_data.append((ent_data, "modifier"))
                visualized_modifiers.add(modifier)
    if sections:
        for section in doc._.sections:
            category = section.category
            if category is None:
                # Sections without a category have no title label to display.
                continue
            span = doc[section.title_span[0]: section.title_span[1]]
            ent_data = {
                "start": span.start_char,
                "end": span.end_char,
                "label": f"<< {category.upper()} >>",
            }
            ents_data.append((ent_data, "section"))
    if not ents_data:  # No data to display
        viz_data = [{"text": doc.text, "ents": []}]
        options = dict()
    else:
        # Order the collected spans by their start character offset.
        ents_data = sorted(ents_data, key=lambda x: x[0]["start"])

        # If colors aren't defined, generate color mappings for each entity
        # and modifier label and set all section titles to a light gray
        if colors is None:
            labels = set()
            section_titles = set()
            for (ent_data, ent_type) in ents_data:
                if ent_type in ("ent", "modifier"):
                    labels.add(ent_data["label"])
                elif ent_type == "section":
                    section_titles.add(ent_data["label"])
            colors = _create_color_mapping(labels)
            for title in section_titles:
                colors[title] = "#dee0e3"
        # Drop the kind markers; only the entity dicts are handed to displacy.
        ents_display_data, _ = zip(*ents_data)
        viz_data = [
            {
                "text": doc.text,
                "ents": ents_display_data,
            }
        ]

        options = {
            "colors": colors,
        }
    return displacy.render(
        viz_data, style="ent", manual=True, options=options, jupyter=jupyter
    )