Skip to content

medspacy.section_detection.section_rule

SectionRule

Bases: BaseRule

SectionRule defines rules for extracting entities from text using the Sectionizer.

Source code in medspacy/section_detection/section_rule.py
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
class SectionRule(BaseRule):
    """
    SectionRule defines a rule used by the Sectionizer to identify a section in text.
    """

    # Keys accepted by `from_dict` and written by `to_dict`. `on_match` is not listed, so it is neither read from
    # nor written to the dictionary/JSON representation of a rule.
    _ALLOWED_KEYS = {
        "literal",
        "pattern",
        "category",
        "metadata",
        "parents",
        "parent_required",
        "max_scope",
    }

    def __init__(
        self,
        literal: str,
        category: str,
        pattern: Optional[Union[List[Dict[str, str]], str]] = None,
        on_match: Optional[
            Callable[[Matcher, Doc, int, List[Tuple[int, int, int]]], Any]
        ] = None,
        max_scope: Optional[int] = None,
        parents: Optional[List[str]] = None,
        parent_required: bool = False,
        metadata: Optional[Dict[Any, Any]] = None,
    ):
        """
        Creates a rule describing a section for use with the Sectionizer.

        Args:
            literal: The string representation of a concept. If `pattern` is None, this string will be lower-cased and
                matched to the lower-case string. If `pattern` is not None, this argument will not be used for matching
                but can be used as a reference as the rule name.
            category: The semantic class of the matched span. This corresponds to the `label_` attribute of an entity.
            pattern: A list or string to use as a spaCy pattern rather than `literal`. If a list, will use spaCy
                token-based pattern matching to match using token attributes. If a string, will use medspaCy's
                RegexMatcher. If None, will use `literal` as the pattern for phrase matching. For more information, see
                https://spacy.io/usage/rule-based-matching.
            on_match: An optional callback function or other callable which takes 4 arguments: `(matcher, doc, i,
                matches)`. For more information, see https://spacy.io/usage/rule-based-matching#on_match
            max_scope: A number of tokens to explicitly limit the size of a section body. If None, the scope will
                include the entire doc up until either the next section header or the end of the doc. This variable can
                also be set at a global level as `Sectionizer(nlp, max_scope=...)`, but if the attribute is set here,
                the rule scope will take precedence. If not None, this will be the number of tokens following the
                matched section header.
                    Example:
                        In the text "Past Medical History: Pt has hx of pneumonia",
                        SectionRule("Past Medical History:", "pmh", max_scope=None) will include the entire doc, but
                        SectionRule("Past Medical History:", "pmh", max_scope=2) will limit the section
                            to be "Past Medical History: Pt has"
                This can be useful for limiting certain sections which are known to be short or allowing others to be
                longer than the regular global max_scope.
            parents: A list of candidate parents for determining subsections
            parent_required: Whether a parent is required for the section to exist in the final output. If true and no
                parent is identified, the section will be removed.
            metadata: Optional dictionary of any extra metadata.

        Raises:
            ValueError: If `parent_required` is true but `parents` is None or empty.
        """
        super().__init__(literal, category, pattern, on_match, metadata)
        self.max_scope = max_scope
        self.parents = parents
        # Requiring a parent only makes sense when at least one candidate parent was supplied.
        if parent_required and not parents:
            raise ValueError(
                f"Jsonl file incorrectly formatted for pattern name {category}. "
                f"If parents are required, then at least one parent must be specified."
            )
        self.parent_required = parent_required

    @classmethod
    def from_json(cls, filepath) -> "List[SectionRule]":
        """
        Reads a list of section rules from a JSON file.

        The file must hold a JSON object whose "section_rules" entry is a list of rule dictionaries; each
        dictionary is converted with `from_dict`.

        Args:
            filepath: the .json file containing section rules

        Returns:
            section_rules: a list of rule objects. Rules are built through `cls`, so calling this on a subclass
                yields instances of that subclass.

        Raises:
            KeyError: If the JSON object has no "section_rules" entry.
            ValueError: If a rule dictionary contains a key outside `_ALLOWED_KEYS` (raised by `from_dict`).
        """
        import json

        # JSON text is UTF-8 by specification; do not depend on the platform's default encoding.
        with open(filepath, encoding="utf-8") as file:
            section_data = json.load(file)
        return [cls.from_dict(data) for data in section_data["section_rules"]]

    @classmethod
    def from_dict(cls, rule_dict):
        """
        Reads a dictionary into a single SectionRule. Used when reading from a json file.

        Args:
            rule_dict: the dictionary to convert. Its keys are passed to the constructor as keyword arguments and
                must all be members of `_ALLOWED_KEYS`.

        Returns:
            item: the rule created from the dictionary (an instance of `cls`).

        Raises:
            ValueError: If `rule_dict` contains a key that is not in `_ALLOWED_KEYS`.
        """
        invalid_keys = set(rule_dict).difference(cls._ALLOWED_KEYS)
        if invalid_keys:
            msg = (
                f"JSON object contains invalid keys: {invalid_keys}. "
                f"Must be one of: {cls._ALLOWED_KEYS}"
            )
            raise ValueError(msg)
        # Build through `cls` so that subclasses are reconstructed as themselves.
        return cls(**rule_dict)

    def to_dict(self):
        """
        Converts this SectionRule to a python dictionary. Used when writing section rules to a json file.

        Only keys listed in `_ALLOWED_KEYS` are considered, and keys whose instance attribute is missing or None
        are left out.

        Returns:
            rule_dict: the dictionary containing the SectionRule info.
        """
        rule_dict = {}
        for key in self._ALLOWED_KEYS:
            value = self.__dict__.get(key)
            if value is not None:
                rule_dict[key] = value
        return rule_dict

    def __repr__(self):
        return f"""SectionRule(literal="{self.literal}", category="{self.category}", pattern={self.pattern}, on_match={self.on_match}, parents={self.parents}, parent_required={self.parent_required})"""

__init__(literal, category, pattern=None, on_match=None, max_scope=None, parents=None, parent_required=False, metadata=None)

Creates a rule describing a section for use with the Sectionizer.

Parameters:

Name Type Description Default
literal str

The string representation of a concept. If pattern is None, this string will be lower-cased and matched to the lower-case string. If pattern is not None, this argument will not be used for matching but can be used as a reference as the rule name.

required
category str

The semantic class of the matched span. This corresponds to the label_ attribute of an entity.

required
pattern Optional[Union[List[Dict[str, str]], str]]

A list or string to use as a spaCy pattern rather than literal. If a list, will use spaCy token-based pattern matching to match using token attributes. If a string, will use medspaCy's RegexMatcher. If None, will use literal as the pattern for phrase matching. For more information, see https://spacy.io/usage/rule-based-matching.

None
on_match Optional[Callable[[Matcher, Doc, int, List[Tuple[int, int, int]]], Any]]

An optional callback function or other callable which takes 4 arguments: (matcher, doc, i, matches). For more information, see https://spacy.io/usage/rule-based-matching#on_match

None
max_scope Optional[int]

A number of tokens to explicitly limit the size of a section body. If None, the scope will include the entire doc up until either the next section header or the end of the doc. This variable can also be set at a global level as `Sectionizer(nlp, max_scope=...)`, but if the attribute is set here, the rule scope will take precedence. If not None, this will be the number of tokens following the matched section header. Example: In the text "Past Medical History: Pt has hx of pneumonia", SectionRule("Past Medical History:", "pmh", max_scope=None) will include the entire doc, but SectionRule("Past Medical History:", "pmh", max_scope=2) will limit the section to be "Past Medical History: Pt has". This can be useful for limiting certain sections which are known to be short or allowing others to be longer than the regular global max_scope.

None
parents Optional[List[str]]

A list of candidate parents for determining subsections

None
parent_required bool

Whether a parent is required for the section to exist in the final output. If true and no parent is identified, the section will be removed.

False
metadata Optional[Dict[Any, Any]]

Optional dictionary of any extra metadata.

None
Source code in medspacy/section_detection/section_rule.py
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
def __init__(
    self,
    literal: str,
    category: str,
    pattern: Optional[Union[List[Dict[str, str]], str]] = None,
    on_match: Optional[
        Callable[[Matcher, Doc, int, List[Tuple[int, int, int]]], Any]
    ] = None,
    max_scope: Optional[int] = None,
    parents: Optional[List[str]] = None,
    parent_required: bool = False,
    metadata: Optional[Dict[Any, Any]] = None,
):
    """
    Creates a rule describing a section for use with the Sectionizer.

    `literal`, `category`, `pattern`, `on_match` and `metadata` are forwarded to the base-class constructor;
    `max_scope`, `parents` and `parent_required` are stored on the rule itself.

    Args:
        literal: The string representation of a concept. When `pattern` is None, this string is lower-cased and
            matched against the lower-cased text. When `pattern` is given, `literal` is not used for matching
            but can still serve as a reference name for the rule.
        category: The semantic class of the matched span. This corresponds to the `label_` attribute of an entity.
        pattern: A list or string to use as a spaCy pattern instead of `literal`. A list is used for spaCy
            token-based pattern matching on token attributes; a string is used with medspaCy's RegexMatcher;
            None means `literal` is used as the pattern for phrase matching. For more information, see
            https://spacy.io/usage/rule-based-matching.
        on_match: An optional callback function or other callable which takes 4 arguments: `(matcher, doc, i,
            matches)`. For more information, see https://spacy.io/usage/rule-based-matching#on_match
        max_scope: A number of tokens to explicitly limit the size of a section body. If None, the scope
            includes the entire doc up until either the next section header or the end of the doc. The limit can
            also be set globally as `Sectionizer(nlp, max_scope=...)`, but a value set here takes precedence
            for this rule. If not None, this is the number of tokens following the matched section header.
                Example:
                    In the text "Past Medical History: Pt has hx of pneumonia",
                    SectionRule("Past Medical History:", "pmh", max_scope=None) will include the entire doc, but
                    SectionRule("Past Medical History:", "pmh", max_scope=2) will limit the section
                        to be "Past Medical History: Pt has"
            This can be useful for limiting certain sections which are known to be short or allowing others to
            be longer than the regular global max_scope.
        parents: A list of candidate parents for determining subsections.
        parent_required: Whether a parent is required for the section to exist in the final output. If true and
            no parent is identified, the section will be removed.
        metadata: Optional dictionary of any extra metadata.

    Raises:
        ValueError: If `parent_required` is true but `parents` is None or empty.
    """
    super().__init__(literal, category, pattern, on_match, metadata)
    self.parents = parents
    self.max_scope = max_scope
    # Requiring a parent only makes sense when at least one candidate parent was supplied.
    if parent_required and not parents:
        raise ValueError(
            f"Jsonl file incorrectly formatted for pattern name {category}. "
            f"If parents are required, then at least one parent must be specified."
        )
    self.parent_required = parent_required

from_dict(rule_dict) classmethod

Reads a dictionary into a single SectionRule. Used when reading from a json file.

Parameters:

Name Type Description Default
rule_dict

the dictionary to convert

required

Returns:

Name Type Description
item

the SectionRule created from the dictionary

Source code in medspacy/section_detection/section_rule.py
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
@classmethod
def from_dict(cls, rule_dict):
    """
    Reads a dictionary into a single SectionRule. Used when reading from a json file.

    Args:
        rule_dict: the dictionary to convert. Its keys are passed to the constructor as keyword arguments and
            must all be members of `_ALLOWED_KEYS`.

    Returns:
        item: the rule created from the dictionary (an instance of `cls`).

    Raises:
        ValueError: If `rule_dict` contains a key that is not in `_ALLOWED_KEYS`.
    """
    invalid_keys = set(rule_dict).difference(cls._ALLOWED_KEYS)
    if invalid_keys:
        msg = (
            f"JSON object contains invalid keys: {invalid_keys}. "
            f"Must be one of: {cls._ALLOWED_KEYS}"
        )
        raise ValueError(msg)
    # Build through `cls` so that subclasses are reconstructed as themselves.
    return cls(**rule_dict)

from_json(filepath) classmethod

Read in a list of section rules from a JSON file.

Parameters:

Name Type Description Default
filepath

the .json file containing section rules

required

Returns:

Name Type Description
section_rules List[SectionRule]

a list of SectionRule objects

Source code in medspacy/section_detection/section_rule.py
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
@classmethod
def from_json(cls, filepath) -> "List[SectionRule]":
    """
    Reads a list of section rules from a JSON file.

    The file must hold a JSON object whose "section_rules" entry is a list of rule dictionaries; each
    dictionary is converted with `cls.from_dict`.

    Args:
        filepath: the .json file containing section rules

    Returns:
        section_rules: a list of rule objects, in the order they appear in the file. Rules are built through
            `cls`, so calling this on a subclass yields instances of that subclass.

    Raises:
        KeyError: If the JSON object has no "section_rules" entry.
    """
    import json

    # JSON text is UTF-8 by specification; do not depend on the platform's default encoding.
    with open(filepath, encoding="utf-8") as file:
        section_data = json.load(file)
    return [cls.from_dict(data) for data in section_data["section_rules"]]

to_dict()

Converts the SectionRule to a python dictionary. Used when writing section rules to a json file.

Returns:

Name Type Description
rule_dict

the dictionary containing the SectionRule info.

Source code in medspacy/section_detection/section_rule.py
123
124
125
126
127
128
129
130
131
132
133
134
135
def to_dict(self):
    """
    Converts this rule to a python dictionary. Used when writing section rules to a json file.

    Only keys listed in `_ALLOWED_KEYS` are considered, and keys whose instance attribute is missing or None
    are left out.

    Returns:
        rule_dict: the dictionary containing the rule info.
    """
    attributes = self.__dict__
    return {
        name: attributes[name]
        for name in self._ALLOWED_KEYS
        if attributes.get(name) is not None
    }