Skip to content

medspacy.common.util

This module will contain helper functions and classes for common clinical processing tasks which will be used in medspaCy's matcher objects.

get_token_for_char(doc, char_idx, resolve='left')

Get the token index that best matches a particular character index. Because regex find returns a character index and spaCy matches must align with token boundaries, each character index must be converted into a token index.

Parameters:

Name Type Description Default
doc Doc

The spaCy Doc to search in.

required
char_idx int

The character index to find the corresponding token for.

required
resolve str

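The resolution type. "left" will snap the character to the token on the left, which precedes the `char_idx`. "right" will snap the character to the token on the right, which follows the `char_idx`.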

'left'

Returns:

Type Description
Union[Token, None]

The token that best fits the character index based on the resolution type.

Source code in medspacy/common/util.py
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
def get_token_for_char(
    doc: Doc, char_idx: int, resolve: str = "left"
) -> Union[Token, None]:
    """
    Get the token that best matches a particular character index. Because regex find returns a character index and
    spaCy matches must align with token boundaries, each character index must be converted into a token.

    Args:
        doc: The spaCy Doc to search in.
        char_idx: The character index to find the corresponding token for.
        resolve: The resolution type. "left" will snap the character to the token on the left, which precedes the
            `char_idx`. "right" will snap the character to the token on the right, which follows the `char_idx`.

    Returns:
        The token that best fits the character index based on the resolution type. A `char_idx` that is exactly a
        token's start offset always yields that token. When `char_idx` lies past the start of the final token,
        "left" yields the final token and "right" yields None, meaning the match should run to the end of the doc.

    Raises:
        ValueError: If `char_idx` is negative, if `char_idx` is greater than the length of the doc text, or if
            `resolve` is neither "left" nor "right".
    """
    if char_idx < 0:
        raise ValueError("char_idx must be >= 0")
    text_length = len(doc.text_with_ws)
    if char_idx > text_length:
        raise ValueError(
            "char_idx {0} is out of range for text with length {1}".format(
                char_idx, text_length
            )
        )
    # Validate up front so that a bad value is rejected on every path, not only
    # when the index happens to fall strictly between two token starts.
    if resolve not in ("left", "right"):
        raise ValueError("resolve must be either 'left' or 'right'")

    for i, token in enumerate(doc):
        if token.idx == char_idx:
            return token
        if token.idx > char_idx:
            # First token starting after char_idx: the index lies inside (or in
            # the trailing whitespace of) the previous token.
            return doc[i - 1] if resolve == "left" else token

    # No token starts at or after char_idx, so the index belongs to the final token.
    # Resolving left returns that token; resolving right returns None, meaning the
    # match should extend to the end of the doc.
    return doc[-1] if resolve == "left" else None

matches_to_spans(doc, matches, set_label=True)

Converts all identified matches to spans.

Parameters:

Name Type Description Default
doc Doc

The spaCy doc corresponding to the matches.

required
matches List[Tuple[int, int, int]]

The list of match Tuples (match_id, start, end).

required
set_label bool

Whether to assign a label to the span based off the source rule. Default is True.

True

Returns:

Type Description
List[Span]

A list of spacy spans corresponding to the input matches.

Source code in medspacy/common/util.py
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
def matches_to_spans(
    doc: Doc, matches: List[Tuple[int, int, int]], set_label: bool = True
) -> List[Span]:
    """
    Build one spaCy Span per match tuple.

    Args:
        doc: The spaCy doc the matches were found in.
        matches: Match tuples of the form (match_id, start, end), where start and end are passed to Span as its
            `start` and `end` arguments.
        set_label: When True, each span's label is the string looked up in `doc.vocab.strings` for the match id.
            When False, None is passed as the label. Default is True.

    Returns:
        A list of spans, in the same order as the input matches.
    """
    return [
        Span(
            doc,
            start=begin,
            end=stop,
            # Resolve the match id back to its string form only when labels are requested.
            label=doc.vocab.strings[match_id] if set_label else None,
        )
        for match_id, begin, stop in matches
    ]

overlaps(a, b)

Checks whether two match Tuples out of spacy matchers overlap.

Parameters:

Name Type Description Default
a Tuple[int, int, int]

A match Tuple (match_id, start, end).

required
b Tuple[int, int, int]

A match Tuple (match_id, start, end).

required

Returns:

Type Description
bool

Whether the tuples overlap.

Source code in medspacy/common/util.py
147
148
149
150
151
152
153
154
155
156
157
158
159
160
def overlaps(a: Tuple[int, int, int], b: Tuple[int, int, int]) -> bool:
    """
    Tell whether two spaCy matcher results overlap.

    The match ids are ignored; only the (start, end) parts of each tuple are compared, by delegating to
    `tuple_overlaps`.

    Args:
        a: A match Tuple (match_id, start, end).
        b: A match Tuple (match_id, start, end).

    Returns:
        Whether the tuples overlap.
    """
    # Drop the leading match id from each tuple and keep only the boundaries.
    (_, start_a, end_a), (_, start_b, end_b) = a, b
    first_bounds = (start_a, end_a)
    second_bounds = (start_b, end_b)
    return tuple_overlaps(first_bounds, second_bounds)

prune_overlapping_matches(matches, strategy='longest')

Prunes overlapping matches from a list of spaCy match tuples (match_id, start, end).

Parameters:

Name Type Description Default
matches List[Tuple[int, int, int]]

A list of match tuples of form (match_id, start, end).

required
strategy str

The pruning strategy to use. At this time, the only available option is "longest" and will keep the longest of any two overlapping spans. Other behavior will be added in a future update.

'longest'

Returns:

Type Description
List[Tuple[int, int, int]]

The pruned list of matches.

Source code in medspacy/common/util.py
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
def prune_overlapping_matches(
    matches: List[Tuple[int, int, int]], strategy: str = "longest"
) -> List[Tuple[int, int, int]]:
    """
    Prunes overlapping matches from a list of spaCy match tuples (match_id, start, end).

    The matches are sorted by (start, end) and then pruned in repeated passes. Each pass walks the sorted list and
    compares neighbouring matches: when two neighbours overlap, only the longer one is kept (the earlier one wins a
    tie) and the scan resumes after the pair. Passes repeat until a pass removes nothing.

    The passes run in a loop rather than through recursion, so a long match that contains a very large number of
    short matches cannot exhaust the interpreter's recursion limit.

    Args:
        matches: A list of match tuples of form (match_id, start, end). The input is not modified.
        strategy: The pruning strategy to use. At this time, the only available option is "longest" and will keep the
            longest of any two overlapping spans. Other behavior will be added in a future update.

    Returns:
        The pruned list of matches, as a new list sorted by (start, end).

    Raises:
        NotImplementedError: If `strategy` is anything other than "longest".
    """
    if strategy != "longest":
        raise NotImplementedError(
            "No other filtering strategy has been implemented. Coming in a future update."
        )

    # sorted() returns a new list, so the caller's list is never mutated.
    ordered = sorted(matches, key=lambda m: (m[1], m[2]))

    while True:
        kept = []
        i = 0
        count = len(ordered)
        while i < count:
            curr_match = ordered[i]
            if i + 1 < count and overlaps(curr_match, ordered[i + 1]):
                # Keep the longer of the pair; max() returns the first argument on a tie.
                kept.append(
                    max(curr_match, ordered[i + 1], key=lambda m: m[2] - m[1])
                )
                # Both members of the pair are consumed; the survivor is only
                # compared with its new neighbour on the next pass.
                i += 2
            else:
                kept.append(curr_match)
                i += 1

        # A pass that removed nothing means no neighbouring matches overlap any more.
        if len(kept) == count:
            return kept
        # Survivors keep their relative sorted order, so no re-sort is needed.
        ordered = kept

span_contains(span, target, regex=True, case_insensitive=True)

Return True if a Span object contains a target phrase.

Parameters:

Name Type Description Default
span Union[Doc, Span]

A spaCy Doc or Span, such as an entity in doc.ents

required
target str

A target phrase or iterable of phrases to check in span.text.lower().

required
regex bool

Whether to search the span using a regular expression rather than a literal string. Default is True.

True
case_insensitive bool

Whether the matching is case-insensitive. Default is True.

True
Source code in medspacy/common/util.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
def span_contains(
    span: Union[Doc, Span],
    target: str,
    regex: bool = True,
    case_insensitive: bool = True,
) -> bool:
    """
    Check whether the text of a Doc or Span contains a target phrase.

    Args:
        span: A spaCy Doc or Span, such as an entity in doc.ents. Only its `text` attribute is read.
        target: A single phrase, or an iterable of phrases. With an iterable, the check succeeds as soon as any one
            of the phrases is found.
        regex: When exactly True, each phrase is used as a regular expression pattern and searched for with
            `re.search`. Otherwise each phrase is treated as a literal substring. Default is True.
        case_insensitive: Whether the matching is case-insensitive. Default is True.

    Returns:
        True if the target (or at least one of the targets) is found in the span text, False otherwise.
    """
    if regex is True:
        search_flags = re.IGNORECASE if case_insensitive else 0

        def is_present(phrase):
            return re.search(phrase, span.text, flags=search_flags) is not None

    elif case_insensitive:

        def is_present(phrase):
            # Literal comparison with both sides lower-cased.
            return phrase.lower() in span.text.lower()

    else:

        def is_present(phrase):
            return phrase in span.text

    # A lone string is checked as one phrase rather than iterated character by character.
    candidates = (target,) if isinstance(target, str) else target
    return any(is_present(candidate) for candidate in candidates)