medspacy.section_detection

`Section`

Bases: object

Section is the object that stores the result of processing by the Sectionizer class. A Section contains information describing the section's category, title span, body span, parent, and the rule that created it.

Section category is equivalent to label_ in a basic spaCy entity. It is a normalized name for the section type determined on initialization, either created manually or through the Sectionizer pipeline component.

Section title, defined with title_start, title_end, and title_span represents the section title or header matched with the rule. In the text "Past medical history: stroke and high blood pressure", "Past medical history:" would be the title.

Section body is defined with body_start, body_end, and body_span. It represents the text between the end of the current section's title and either the start of the title for the next Section or, when a scope is set in the rule or by the Sectionizer, the end of that scope. In the text "Past medical history: stroke and high blood pressure", "stroke and high blood pressure" would be the body.

Parent is a string that represents the conceptual "parent" section in a section->subsection->subsubsection hierarchy. Candidates are determined by category in the rule and matched at runtime.

Source code in medspacy/section_detection/section.py

class Section(object):
    """
    Section is the object that stores the result of processing by the Sectionizer class. A Section contains information
    describing the section's category, title span, body span, parent, and the rule that created it.

    Section `category` is equivalent to `label_` in a basic spaCy entity. It is a normalized name for the section type
    determined on initialization, either created manually or through the Sectionizer pipeline component.

    Section title, defined with `title_start`, `title_end`, and `title_span` represents the section title or header
    matched with the rule. In the text "Past medical history: stroke and high blood pressure", "Past medical history:"
    would be the title.

    Section body is defined with `body_start`, `body_end`, and `body_span`. It represents the text between the end of
    the current section's title and either the start of the title for the next Section or the end of the scope set in
    the rule or by the Sectionizer. In the text "Past medical history: stroke and high blood pressure", "stroke and
    high blood pressure" would be the body.

    Parent is a string that represents the conceptual "parent" section in a section->subsection->subsubsection
    hierarchy. Candidates are determined by category in the rule and matched at runtime.
    """

    def __init__(
        self,
        category: Union[str, None],
        title_start: int,
        title_end: int,
        body_start: int,
        body_end: int,
        parent: Optional[str] = None,
        rule: Optional[SectionRule] = None,
    ):
        """
        Create a new Section object.

        Args:
            category: A normalized name for the section. Equivalent to `label_` for basic spaCy entities.
            title_start: Index of the first token of the section title.
            title_end: Index of the last token of the section title.
            body_start: Index of the first token of the section body.
            body_end: Index of the last token of the section body.
            parent: The category of the parent section.
            rule: The SectionRule that generated the section.
        """
        self.category = category
        self.title_start = title_start
        self.title_end = title_end
        self.body_start = body_start
        self.body_end = body_end
        self.parent = parent
        self.rule = rule

    def __repr__(self):
        # The trailing ")" closes the "Section(" opened at the start of the string.
        return (
            f"Section(category={self.category} at {self.title_start} : {self.title_end} in the doc with a body at "
            f"{self.body_start} : {self.body_end} based on the rule {self.rule})"
        )

    @property
    def title_span(self):
        """
        Gets the span of the section title.

        Returns:
            A tuple (int,int) containing the start and end indexes of the section title.
        """
        return self.title_start, self.title_end

    @property
    def body_span(self):
        """
        Gets the span of the section body.

        Returns:
            A tuple (int,int) containing the start and end indexes of the section body.
        """
        return self.body_start, self.body_end

    @property
    def section_span(self):
        """
        Gets the span of the entire section, from title start to body end.

        Returns:
            A tuple (int,int) containing the start index of the section title and the end index of the section body.
        """
        return self.title_start, self.body_end

    def serialized_representation(self):
        """
        Serialize the Section into a plain dictionary.

        Returns:
            A dictionary with the keys "category", "title_start", "title_end", "body_start", "body_end", "parent",
            and "rule". The rule is stored as the result of its `to_dict()` method, or as None when the section has
            no rule.
        """
        rule = self.rule

        return {
            "category": self.category,
            "title_start": self.title_start,
            "title_end": self.title_end,
            "body_start": self.body_start,
            "body_end": self.body_end,
            "parent": self.parent,
            "rule": rule.to_dict() if rule is not None else None,
        }

    @classmethod
    def from_serialized_representation(cls, serialized_representation: Dict[str, str]):
        """
        Load the section from the dictionary form produced by `serialized_representation`.

        Args:
            serialized_representation: The dictionary form of the section object to load. Every key other than
                "rule" is passed to the constructor as a keyword argument. The "rule" entry may be a rule dictionary,
                None, or absent.

        Returns:
            An instance of the class this method is called on, containing the data from the dictionary provided.
        """
        # `serialized_representation()` writes None for a section without a rule, so None (or a missing key) must
        # round-trip to `rule=None` instead of being handed to SectionRule.from_dict.
        rule_data = serialized_representation.get("rule")
        rule = SectionRule.from_dict(rule_data) if rule_data is not None else None

        section = cls(
            **{k: v for k, v in serialized_representation.items() if k != "rule"}
        )
        section.rule = rule

        return section

`body_span` `property`

Gets the span of the section body.

Returns:

Type	Description
	A tuple (int,int) containing the start and end indexes of the section body.

`section_span` `property`

Gets the span of the entire section, from title start to body end.

Returns:

Type	Description
	A tuple (int,int) containing the start index of the section title and the end index of the section body.

`title_span` `property`

Gets the span of the section title.

Returns:

Type	Description
	A tuple (int,int) containing the start and end indexes of the section title.

`__init__(category, title_start, title_end, body_start, body_end, parent=None, rule=None)`

Create a new Section object.

Parameters:

Name	Type	Description	Default
`category`	`Union[str, None]`	A normalized name for the section. Equivalent to `label_` for basic spaCy entities.	required
`title_start`	`int`	Index of the first token of the section title.	required
`title_end`	`int`	Index of the last token of the section title.	required
`body_start`	`int`	Index of the first token of the section body.	required
`body_end`	`int`	Index of the last token of the section body.	required
`parent`	`Optional[str]`	The category of the parent section.	`None`
`rule`	`Optional[SectionRule]`	The SectionRule that generated the section.	`None`

Source code in medspacy/section_detection/section.py

def __init__(
    self,
    category: Union[str, None],
    title_start: int,
    title_end: int,
    body_start: int,
    body_end: int,
    parent: Optional[str] = None,
    rule: Optional[SectionRule] = None,
):
    """
    Build a Section from its category, title/body offsets, and provenance.

    Args:
        category: Normalized section name; plays the role `label_` plays for basic spaCy entities.
        title_start: Index of the first token of the section title.
        title_end: Index of the last token of the section title.
        body_start: Index of the first token of the section body.
        body_end: Index of the last token of the section body.
        parent: Category of the parent section, if any.
        rule: The SectionRule that produced this section, if any.
    """
    # Every argument is stored unchanged under an attribute of the same name.
    self.category = category
    # Header offsets, then body offsets.
    self.title_start, self.title_end = title_start, title_end
    self.body_start, self.body_end = body_start, body_end
    # Hierarchy and provenance information.
    self.parent, self.rule = parent, rule

`from_serialized_representation(serialized_representation)` `classmethod`

Load the section from a json-serialized form.

Parameters:

Name	Type	Description	Default
`serialized_representation`	`Dict[str, str]`	The dictionary form of the section object to load.	required

Returns:

Type	Description
	A Section object containing the data from the dictionary provided.

Source code in medspacy/section_detection/section.py

@classmethod
def from_serialized_representation(cls, serialized_representation: Dict[str, str]):
    """
    Load the section from the dictionary form produced by `serialized_representation`.

    Args:
        serialized_representation: The dictionary form of the section object to load. Every key other than "rule"
            is passed to the constructor as a keyword argument. The "rule" entry may be a rule dictionary, None, or
            absent.

    Returns:
        An instance of the class this method is called on, containing the data from the dictionary provided.
    """
    # A section serialized without a rule carries "rule": None; handing that to SectionRule.from_dict would fail,
    # so None (or a missing key) is mapped straight back to `rule=None`.
    rule_data = serialized_representation.get("rule")
    rule = SectionRule.from_dict(rule_data) if rule_data is not None else None

    # Build through `cls` so that subclasses get instances of their own type.
    section = cls(
        **{k: v for k, v in serialized_representation.items() if k != "rule"}
    )
    section.rule = rule

    return section

`serialized_representation()`

Serialize the Section.

Returns:

Type	Description
	A json-serialized representation of the section.

Source code in medspacy/section_detection/section.py

def serialized_representation(self):
    """
    Convert the Section into a plain dictionary.

    Returns:
        A dictionary with the section's category, title and body offsets, parent, and rule. The rule is stored as
        the output of its `to_dict()` method, or as None when the section has no rule.
    """
    source_rule = self.rule
    plain_fields = (
        "category",
        "title_start",
        "title_end",
        "body_start",
        "body_end",
        "parent",
    )

    # Scalar attributes are copied as-is; the rule needs its own conversion, so it is added last.
    payload = {name: getattr(self, name) for name in plain_fields}
    if source_rule is None:
        payload["rule"] = None
    else:
        payload["rule"] = source_rule.to_dict()
    return payload

`SectionRule`

Bases: BaseRule

SectionRule defines rules for extracting entities from text using the Sectionizer.

Source code in medspacy/section_detection/section_rule.py

class SectionRule(BaseRule):
    """
    SectionRule defines rules for extracting entities from text using the Sectionizer.
    """

    # Keys accepted by `from_dict` and written by `to_dict`. `on_match` is deliberately absent, so callbacks are
    # neither read from nor written to dictionaries.
    _ALLOWED_KEYS = {
        "literal",
        "pattern",
        "category",
        "metadata",
        "parents",
        "parent_required",
        "max_scope",
    }

    def __init__(
        self,
        literal: str,
        category: str,
        pattern: Optional[Union[List[Dict[str, str]], str]] = None,
        on_match: Optional[
            Callable[[Matcher, Doc, int, List[Tuple[int, int, int]]], Any]
        ] = None,
        max_scope: Optional[int] = None,
        parents: Optional[List[str]] = None,
        parent_required: bool = False,
        metadata: Optional[Dict[Any, Any]] = None,
    ):
        """
        Create a rule describing a section header for the Sectionizer.

        Args:
            literal: The string representation of a concept. If `pattern` is None, this string will be lower-cased and
                matched to the lower-case string. If `pattern` is not None, this argument will not be used for matching
                but can be used as a reference as the rule name.
            category: The semantic class of the matched span. This corresponds to the `label_` attribute of an entity.
            pattern: A list or string to use as a spaCy pattern rather than `literal`. If a list, will use spaCy
                token-based pattern matching to match using token attributes. If a string, will use medspaCy's
                RegexMatcher. If None, will use `literal` as the pattern for phrase matching. For more information, see
                https://spacy.io/usage/rule-based-matching.
            on_match: An optional callback function or other callable which takes 4 arguments: `(matcher, doc, i,
                matches)`. For more information, see https://spacy.io/usage/rule-based-matching#on_match
            max_scope: A number of tokens to explicitly limit the size of a section body. If None, the scope will
                include the entire doc up until either the next section header or the end of the doc. This variable can
                also be set at a global level as `Sectionizer(nlp, max_scope=...)`, but if the attribute is set here,
                the rule scope will take precedence. If not None, this will be the number of tokens following the
                matched section header.
                    Example:
                        In the text "Past Medical History: Pt has hx of pneumonia",
                        SectionRule("Past Medical History:", "pmh", max_scope=None) will include the entire doc, but
                        SectionRule("Past Medical History:", "pmh", max_scope=2) will limit the section
                            to be "Past Medical History: Pt has"
                This can be useful for limiting certain sections which are known to be short or allowing others to be
                longer than the regular global max_scope.
            parents: A list of candidate parents for determining subsections.
            parent_required: Whether a parent is required for the section to exist in the final output. If true and no
                parent is identified, the section will be removed.
            metadata: Optional dictionary of any extra metadata.

        Raises:
            ValueError: If `parent_required` is true but `parents` is None or empty.
        """
        super().__init__(literal, category, pattern, on_match, metadata)
        self.max_scope = max_scope
        self.parents = parents
        # A rule that demands a parent cannot be satisfied without at least one candidate parent.
        if parent_required and not parents:
            raise ValueError(
                f"Jsonl file incorrectly formatted for pattern name {category}. "
                f"If parents are required, then at least one parent must be specified."
            )
        self.parent_required = parent_required

    @classmethod
    def from_json(cls, filepath) -> List[SectionRule]:
        """
        Read section rules from a JSON file.

        The file must contain a JSON object whose "section_rules" entry is a list of rule dictionaries; each one is
        converted with `from_dict`.

        Args:
            filepath: the .json file containing section rules

        Returns:
            section_rules: a list of rule objects, one per entry, in file order

        Raises:
            ValueError: If any rule dictionary contains a key that is not in `_ALLOWED_KEYS`.
        """
        import json

        # JSON text is UTF-8 by specification; do not depend on the platform's default encoding.
        with open(filepath, encoding="utf-8") as file:
            section_data = json.load(file)
        # Go through `cls` so a subclass loads instances of itself.
        return [cls.from_dict(data) for data in section_data["section_rules"]]

    @classmethod
    def from_dict(cls, rule_dict):
        """
        Reads a dictionary into a single rule. Used when reading from a json file.

        Args:
            rule_dict: the dictionary to convert; its keys are passed to the constructor as keyword arguments

        Returns:
            item: the rule created from the dictionary

        Raises:
            ValueError: If the dictionary contains a key that is not in `_ALLOWED_KEYS`.
        """
        invalid_keys = set(rule_dict).difference(cls._ALLOWED_KEYS)
        if invalid_keys:
            msg = (
                f"JSON object contains invalid keys: {invalid_keys}. "
                f"Must be one of: {cls._ALLOWED_KEYS}"
            )
            raise ValueError(msg)
        # Validation uses `cls._ALLOWED_KEYS`, so construction uses `cls` as well.
        return cls(**rule_dict)

    def to_dict(self):
        """
        Converts the rule to a python dictionary. Used when writing section rules to a json file.

        Only keys listed in `_ALLOWED_KEYS` are considered, and attributes whose value is None are left out.

        Returns:
            rule_dict: the dictionary containing the rule info, with keys in alphabetical order
        """
        instance_attrs = vars(self)
        rule_dict = {}
        # `_ALLOWED_KEYS` is a set; sorting makes the key order identical from run to run.
        for key in sorted(self._ALLOWED_KEYS):
            value = instance_attrs.get(key)
            if value is not None:
                rule_dict[key] = value
        return rule_dict

    def __repr__(self):
        return f"""SectionRule(literal="{self.literal}", category="{self.category}", pattern={self.pattern}, on_match={self.on_match}, parents={self.parents}, parent_required={self.parent_required})"""

`__init__(literal, category, pattern=None, on_match=None, max_scope=None, parents=None, parent_required=False, metadata=None)`

Creates a rule describing a section header to be matched in text by the Sectionizer.

Parameters:

Name	Type	Description	Default
`literal`	`str`	The string representation of a concept. If `pattern` is None, this string will be lower-cased and matched to the lower-case string. If `pattern` is not None, this argument will not be used for matching but can be used as a reference as the rule name.	required
`category`	`str`	The semantic class of the matched span. This corresponds to the `label_` attribute of an entity.	required
`pattern`	`Optional[Union[List[Dict[str, str]], str]]`	A list or string to use as a spaCy pattern rather than `literal`. If a list, will use spaCy token-based pattern matching to match using token attributes. If a string, will use medspaCy's RegexMatcher. If None, will use `literal` as the pattern for phrase matching. For more information, see https://spacy.io/usage/rule-based-matching.	`None`
`on_match`	`Optional[Callable[[Matcher, Doc, int, List[Tuple[int, int, int]]], Any]]`	An optional callback function or other callable which takes 4 arguments: `(matcher, doc, i, matches)`. For more information, see https://spacy.io/usage/rule-based-matching#on_match	`None`
`max_scope`	`Optional[int]`	A number of tokens to explicitly limit the size of a section body. If None, the scope will include the entire doc up until either the next section header or the end of the doc. This variable can also be set at a global level as `Sectionizer(nlp, max_scope=...)`, but if the attribute is set here, the rule scope will take precedence. If not None, this will be the number of tokens following the matched section header. Example: In the text "Past Medical History: Pt has hx of pneumonia", `SectionRule("Past Medical History:", "pmh", max_scope=None)` will include the entire doc, but `SectionRule("Past Medical History:", "pmh", max_scope=2)` will limit the section to be "Past Medical History: Pt has". This can be useful for limiting certain sections which are known to be short or allowing others to be longer than the regular global max_scope.	`None`
`parents`	`Optional[List[str]]`	A list of candidate parents for determining subsections	`None`
`parent_required`	`bool`	Whether a parent is required for the section to exist in the final output. If true and no parent is identified, the section will be removed.	`False`
`metadata`	`Optional[Dict[Any, Any]]`	Optional dictionary of any extra metadata.	`None`

Source code in medspacy/section_detection/section_rule.py

def __init__(
    self,
    literal: str,
    category: str,
    pattern: Optional[Union[List[Dict[str, str]], str]] = None,
    on_match: Optional[
        Callable[[Matcher, Doc, int, List[Tuple[int, int, int]]], Any]
    ] = None,
    max_scope: Optional[int] = None,
    parents: Optional[List[str]] = None,
    parent_required: bool = False,
    metadata: Optional[Dict[Any, Any]] = None,
):
    """
    Set up a rule describing a section header.

    Args:
        literal: The string representation of a concept. When `pattern` is None this string is lower-cased and
            matched against the lower-cased text; otherwise it is not used for matching and only serves as a
            reference name for the rule.
        category: The semantic class of the matched span, corresponding to the `label_` attribute of an entity.
        pattern: A list or string used as a spaCy pattern in place of `literal`. A list is matched with spaCy's
            token-based matching on token attributes, a string with medspaCy's RegexMatcher, and None falls back
            to phrase matching on `literal`. See https://spacy.io/usage/rule-based-matching.
        on_match: An optional callable taking the 4 arguments `(matcher, doc, i, matches)`. See
            https://spacy.io/usage/rule-based-matching#on_match
        max_scope: A number of tokens that explicitly limits the size of the section body, counted from the
            matched section header. With None, the section runs until the next section header or the end of the
            doc. The limit can also be set globally as `Sectionizer(nlp, max_scope=...)`; a value given here takes
            precedence over the global one.
                Example:
                    In the text "Past Medical History: Pt has hx of pneumonia",
                    SectionRule("Past Medical History:", "pmh", max_scope=None) will include the entire doc, but
                    SectionRule("Past Medical History:", "pmh", max_scope=2) will limit the section
                        to be "Past Medical History: Pt has"
            Useful for capping sections known to be short, or for letting some run longer than the global limit.
        parents: A list of candidate parents for determining subsections.
        parent_required: Whether a parent is required for the section to exist in the final output. If true and
            no parent is identified, the section will be removed.
        metadata: Optional dictionary of any extra metadata.

    Raises:
        ValueError: If `parent_required` is true while `parents` is None or empty.
    """
    super().__init__(literal, category, pattern, on_match, metadata)
    self.max_scope = max_scope
    self.parents = parents

    # Requiring a parent without naming any candidate can never be satisfied, so fail early.
    lacks_candidates = not parents
    if parent_required and lacks_candidates:
        raise ValueError(
            f"Jsonl file incorrectly formatted for pattern name {category}. "
            f"If parents are required, then at least one parent must be specified."
        )

    self.parent_required = parent_required

`from_dict(rule_dict)` `classmethod`

Reads a dictionary into a single SectionRule. Used when reading from a json file.

Parameters:

Name	Type	Description	Default
`rule_dict`		the dictionary to convert	required

Returns:

Name	Type	Description
`item`		the SectionRule created from the dictionary

Source code in medspacy/section_detection/section_rule.py

@classmethod
def from_dict(cls, rule_dict):
    """
    Reads a dictionary into a single rule. Used when reading from a json file.

    Args:
        rule_dict: the dictionary to convert; its keys are passed to the constructor as keyword arguments

    Returns:
        item: the rule created from the dictionary

    Raises:
        ValueError: If the dictionary contains a key that is not in `_ALLOWED_KEYS`.
    """
    invalid_keys = set(rule_dict).difference(cls._ALLOWED_KEYS)
    if invalid_keys:
        msg = (
            f"JSON object contains invalid keys: {invalid_keys}. "
            f"Must be one of: {cls._ALLOWED_KEYS}"
        )
        raise ValueError(msg)
    # The keys were validated against `cls._ALLOWED_KEYS`, so the object is built from `cls` too; a subclass
    # that extends the allowed keys then receives them in its own constructor.
    return cls(**rule_dict)

`from_json(filepath)` `classmethod`

Read in a list of section rules from a JSON file.

Parameters:

Name	Type	Description	Default
`filepath`		the .json file containing section rules	required

Returns:

Name	Type	Description
`section_rules`	`List[SectionRule]`	a list of SectionRule objects

Source code in medspacy/section_detection/section_rule.py

@classmethod
def from_json(cls, filepath) -> List[SectionRule]:
    """
    Read section rules from a JSON file.

    The file must contain a JSON object whose "section_rules" entry is a list of rule dictionaries; each one is
    converted with `from_dict`.

    Args:
        filepath: the .json file containing section rules

    Returns:
        section_rules: a list of rule objects, one per entry, in file order
    """
    import json

    # JSON text is UTF-8 by specification; do not depend on the platform's default encoding.
    with open(filepath, encoding="utf-8") as file:
        section_data = json.load(file)
    # Dispatch through `cls` so a subclass loads instances of itself.
    return [cls.from_dict(data) for data in section_data["section_rules"]]

`to_dict()`

Converts the SectionRule to a python dictionary. Used when writing section rules to a json file.

Returns:

Name	Type	Description
`rule_dict`		the dictionary containing the SectionRule info.

Source code in medspacy/section_detection/section_rule.py

def to_dict(self):
    """
    Converts the rule to a python dictionary. Used when writing section rules to a json file.

    Only keys listed in `_ALLOWED_KEYS` are considered, and attributes whose value is None are left out.

    Returns:
        rule_dict: the dictionary containing the rule info, with keys in alphabetical order
    """
    instance_attrs = vars(self)
    rule_dict = {}
    # `_ALLOWED_KEYS` is a set, whose iteration order can differ between interpreter runs; sorting keeps the
    # resulting dictionary (and any file written from it) stable.
    for key in sorted(self._ALLOWED_KEYS):
        value = instance_attrs.get(key)
        if value is not None:
            rule_dict[key] = value
    return rule_dict

`Sectionizer`

The Sectionizer will search for spans in the text which match section header rules, such as 'Past Medical History:'. Sections will be represented in custom attributes as: category: A normalized title of the section. Example: 'past_medical_history'. section_title: The Span of the doc which was matched as a section header. Example: 'Past Medical History:'. section_span: The entire section of the note, starting with section_header and up until the end of the section, which will be either the start of the next section header or some pre-specified scope. Example: 'Past Medical History: Type II DM'.

Section attributes will be registered for each Doc, Span, and Token in the following attributes: Doc._.sections: A list of namedtuples of type Section with 4 elements: - section_title - section_header - section_parent - section_span. A Doc will also have attributes corresponding to lists of each (i.e., Doc._.section_titles, Doc._.section_headers, Doc._.section_parents, Doc._.section_list) (Span|Token)._.section_title (Span|Token)._.section_header (Span|Token)._.section_parent (Span|Token)._.section_span

Source code in medspacy/section_detection/sectionizer.py

@Language.factory("medspacy_sectionizer")
class Sectionizer:
    """
    The Sectionizer will search for spans in the text which match section header rules, such as 'Past Medical History:'.
    Sections will be represented in custom attributes as:
        category: A normalized title of the section. Example: 'past_medical_history'
        section_title: The Span of the doc which was matched as a section header.
            Example: 'Past Medical History:'
        section_span: The entire section of the note, starting with section_header and up until the end
            of the section, which will be either the start of the next section header of some pre-specified
            scope. Example: 'Past Medical History: Type II DM'

    Section attributes will be registered for each Doc, Span, and Token in the following attributes:
        Doc._.sections: A list of namedtuples of type Section with 4 elements:
            - section_title
            - section_header
            - section_parent
            - section_span.
        A Doc will also have attributes corresponding to lists of each
            (ie., Doc._.section_titles, Doc._.section_headers, Doc._.section_parents, Doc._.section_list)
        (Span|Token)._.section_title
        (Span|Token)._.section_header
        (Span|Token)._.section_parent
        (Span|Token)._.section_span
    """

    def __init__(
        self,
        nlp: Language,
        name: str = "medspacy_sectionizer",
        rules: Optional[str] = "default",
        language_code: str = 'en',
        max_section_length: Optional[int] = None,
        phrase_matcher_attr: str = "LOWER",
        require_start_line: bool = False,
        require_end_line: bool = False,
        newline_pattern: str = r"[\n\r]+[\s]*$",
        input_span_type: Union[Literal["ents", "group"], None] = "ents",
        span_group_name: str = "medspacy_spans",
        span_attrs: Union[
            Literal["default"], Dict[str, Dict[str, Any]], None
        ] = "default",
        apply_sentence_boundary: bool = False,
    ):
        """
        Build a Sectionizer and, optionally, load section rules and a span attribute mapping.

        Args:
            nlp: A SpaCy Language object.
            name: The name of the component.
            rules: Which rules to load. The default, "default", loads the rules packaged with medspaCy (derived from
                SecTag, MIMIC-III, and practical refinement at the US Department of Veterans Affairs). None loads no
                rules. Any other value must be a path to a json file containing rules. SectionRules can also be added
                directly through `Sectionizer.add`.
            language_code: Language code (ISO code) used as a default for loading resources; it is lower-cased and
                used as the directory name under /resources from which the default rules are read. See the
                documentation and the /resources directory for the resources available in each language. Default is
                "en" for English.
            max_section_length: Optional maximum number of tokens following a section header which can be included in
                a section body. Useful when the section rules may be incomplete and sections should not run too long
                in the note. Default is None, meaning a section extends until either the next section header or the
                end of the document.
            phrase_matcher_attr: The token attribute to use for PhraseMatcher for rules where `pattern` is None.
                Default is 'LOWER'.
            require_start_line: Optionally require a section header to start on a new line. Default False.
            require_end_line: Optionally require a section header to end with a new line. Default False.
            newline_pattern: Regular expression to match the new line either preceding or following a header if
                either require_start_line or require_end_line are True. Default is r"[\n\r]+[\s]*$"
            input_span_type: "ents" or "group". Where to look for spans when modifying attributes of spans contained
                in a section if `span_attrs` is not None. "ents" will modify attributes of spans in doc.ents. "group"
                will modify attributes of spans in the span group specified by `span_group_name`.
            span_group_name: The name of the span group used when `input_span_type` is "group". Default is
                "medspacy_spans".
            span_attrs: The optional span attributes to modify. The default option "default" uses the attributes in
                `DEFAULT_ATTRS`. A dictionary of custom attributes maps section categories to a dictionary of
                attribute name -> value to set when a span is contained in a section of that category. Custom
                attributes must be assigned with `Span.set_extension` before creating the Sectionizer. If None, the
                sectionizer will not modify span attributes.
            apply_sentence_boundary: Optionally end sentence before and after section header boundary. This ensures
                the section header is considered its own sentence.

        Raises:
            ValueError: If a custom `span_attrs` dictionary names an attribute that has not been registered as a
                Span extension.
        """
        # Plain configuration values
        self.nlp = nlp
        self.name = name
        self.max_section_length = max_section_length
        self.require_start_line = require_start_line
        self.require_end_line = require_end_line
        self.newline_pattern = re.compile(newline_pattern)
        self._input_span_type = input_span_type
        self._span_group_name = span_group_name
        self._apply_sentence_boundary = apply_sentence_boundary

        # State populated later by `add` and by the span_attrs handling below
        self.assertion_attributes_mapping = None
        self._parent_sections = {}
        self._parent_required = {}

        self.__matcher = MedspacyMatcher(
            nlp, name=name, phrase_matcher_attr=phrase_matcher_attr
        )

        package_root = Path(__file__).resolve().parents[2]
        self.DEFAULT_RULES_FILEPATH = path.join(
            package_root,
            "resources",
            language_code.lower(),
            "section_patterns.json",
        )

        # "default" selects the packaged rules; anything else is taken as a path (or nothing when falsy)
        rule_path = self.DEFAULT_RULES_FILEPATH if rules == "default" else rules
        if rule_path:
            self.add(SectionRule.from_json(rule_path))

        if span_attrs == "default":
            self.assertion_attributes_mapping = DEFAULT_ATTRS
            self.register_default_attributes()
        elif span_attrs:
            # Every custom attribute must already exist as a Span extension before it can be set at call time
            for attr_dict in span_attrs.values():
                for attr_name in attr_dict:
                    if Span.has_extension(attr_name):
                        continue
                    raise ValueError(
                        f"Custom extension {attr_name} has not been set. Please ensure Span.set_extension is "
                        f"called for your pipeline's custom extensions."
                    )
            self.assertion_attributes_mapping = span_attrs

    @property
    def rules(self) -> List[SectionRule]:
        """
        Gets list of rules associated with the Sectionizer.

        Returns:
            The list of SectionRules associated with the Sectionizer.
        """
        return self.__matcher.rules

    @property
    def section_categories(self) -> Set[str]:
        """
        Gets a list of categories used in the Sectionizer.

        Returns:
                The list of all section categories available to the Sectionizer.
        """
        return self.__matcher.labels

    @property
    def input_span_type(self):
        """
        The input source of entities for the component. Must be either "ents" corresponding to doc.ents or "group" for
        a spaCy span group.

        Returns:
            The input type, "ents" or "group".
        """
        return self._input_span_type

    @input_span_type.setter
    def input_span_type(self, val):
        if not (val == "ents" or val == "group"):
            raise ValueError('input_type must be "ents" or "group".')
        self._input_span_type = val

    @property
    def span_group_name(self) -> str:
        """
        The name of the span group used by this component. If `input_type` is "group", calling this component will
        use spans in the span group with this name.

        Returns:
            The span group name.
        """
        return self._span_group_name

    @span_group_name.setter
    def span_group_name(self, name: str):
        if not name or not isinstance(name, str):
            raise ValueError("Span group name must be a string.")
        self._span_group_name = name

    @classmethod
    def register_default_attributes(cls):
        """
        Register the default Span extensions used by the default attribute mapping, each with a default of False.

        A ValueError raised while registering an extension is taken to mean it is already set and is ignored.
        """
        default_attr_names = (
            "is_negated",
            "is_uncertain",
            "is_historical",
            "is_hypothetical",
            "is_family",
        )
        for attr_name in default_attr_names:
            try:
                Span.set_extension(attr_name, default=False)
            except ValueError:
                # Extension already set
                continue

    def add(self, rules):
        """
        Adds SectionRules to the Sectionizer.

        The rules are registered with the matcher, and for each rule's category the candidate parent categories and
        the `parent_required` flag are recorded for later use when resolving the section hierarchy.

        Args:
            rules: A single SectionRule or an iterable of SectionRules to add to the Sectionizer.

        Raises:
            TypeError: If any of the supplied items is not a SectionRule.
        """
        if isinstance(rules, SectionRule):
            rules = [rules]
        else:
            # The rules are iterated several times below; materialize them once so that a one-shot iterable
            # (e.g. a generator) is not exhausted by the validation pass before it reaches the matcher.
            rules = list(rules)

        for rule in rules:
            if not isinstance(rule, SectionRule):
                raise TypeError(f"Rules must be type SectionRule, not {type(rule)}")

        self.__matcher.add(rules)

        for rule in rules:
            name = rule.category
            parents = rule.parents
            parent_required = rule.parent_required
            if parents:
                if name in self._parent_sections:
                    warnings.warn(
                        f"Duplicate section title {name}. Merging parents. "
                        f"If this is not intended, please specify distinct titles.",
                        RuntimeWarning,
                    )
                    self._parent_sections[name].update(parents)
                else:
                    self._parent_sections[name] = set(parents)

            # Rules sharing a category must agree on parent_required; on disagreement fall back to False
            if (
                name in self._parent_required
                and self._parent_required[name] != parent_required
            ):
                warnings.warn(
                    f"Duplicate section title {name} has different parent_required option. "
                    f"Setting parent_required to False.",
                    RuntimeWarning,
                )
                self._parent_required[name] = False
            else:
                self._parent_required[name] = parent_required

    def set_parent_sections(
        self, sections: List[Tuple[int, int, int]]
    ) -> List[Tuple[int, int, int, Optional[int]]]:
        """
        Determine the legal parent-child section relationships from the list
        of in-order sections of a document and the possible parents of each
        section as recorded when the rules were added (see `add`).

        Sections whose category has `parent_required` set are dropped when no parent can be identified for them.

        Args:
            sections: a list of spacy match tuples (match_id, start, end) found in the doc

        Returns:
            A list of tuples (match_id, start, end, parent_idx) where the first three indices are the same as the input
            and the added parent_idx represents the index in the returned list that corresponds to the parent section,
            or None when the section has no parent. Might be a smaller list than the input due to pruning with
            `parent_required`.
        """
        sections_final = []
        # number of input matches dropped so far because a required parent was missing
        removed_sections = 0
        for i, (match_id, start, end) in enumerate(sections):
            name = self.__matcher.rule_map[self.nlp.vocab.strings[match_id]].category
            required = self._parent_required[name]
            # index this match would take in `sections_final` (input index minus the matches dropped so far)
            i_a = i - removed_sections  # adjusted index for removed values
            if required and i_a == 0:
                # nothing precedes this section, so a required parent cannot exist: drop it
                removed_sections += 1
                continue
            elif i_a == 0 or name not in self._parent_sections.keys():
                # first retained section, or a category with no configured parents: keep it without a parent
                sections_final.append((match_id, start, end, None))
            else:
                parents = self._parent_sections[name]
                identified_parent = None
                # NOTE(review): the loop does not stop once a parent is found, so if several candidate categories
                # match, the last one iterated wins; `parents` is a set, so that order is not defined - confirm
                # whether this is intended.
                for parent in parents:
                    # go backwards through the section "tree" until you hit a root or the start of the list
                    # start from the section immediately preceding this one in the retained list
                    candidate = self.__matcher.rule_map[
                        self.nlp.vocab.strings[sections_final[i_a - 1][0]]
                    ].category
                    candidates_parent_idx = sections_final[i_a - 1][3]
                    if candidates_parent_idx is not None:
                        candidates_parent = self.__matcher.rule_map[
                            self.nlp.vocab.strings[
                                sections_final[candidates_parent_idx][0]
                            ]
                        ].category
                    else:
                        candidates_parent = None
                    candidate_i = i_a - 1
                    # `candidate` is set to None to terminate the walk
                    while candidate:
                        if candidate == parent:
                            identified_parent = candidate_i
                            candidate = None
                        else:
                            # if you are at the end of the list... no parent
                            if candidate_i < 1:
                                candidate = None
                                continue
                            # if the current candidate has no parent... no parent exists
                            if not candidates_parent:
                                candidate = None
                                continue
                            # otherwise get the previous item in the list
                            temp = self.__matcher.rule_map[
                                self.nlp.vocab.strings[
                                    sections_final[candidate_i - 1][0]
                                ]
                            ].category
                            temp_parent_idx = sections_final[candidate_i - 1][3]
                            if temp_parent_idx is not None:
                                temp_parent = self.__matcher.rule_map[
                                    self.nlp.vocab.strings[
                                        sections_final[temp_parent_idx][0]
                                    ]
                                ].category
                            else:
                                temp_parent = None
                            # if the previous item is the parent of the current item
                            # OR if the previous item is a sibling of the current item
                            # continue to search
                            if (
                                temp == candidates_parent
                                or temp_parent == candidates_parent
                            ):
                                candidate = temp
                                candidates_parent = temp_parent
                                candidate_i -= 1
                            # otherwise, there is no further tree traversal
                            else:
                                candidate = None

                # keep the section unless a parent is required and none was found
                if identified_parent is not None or not required:
                    # if the parent is identified, add section
                    # if the parent is not required, add section
                    # if parent is not identified and required, do not add the section
                    sections_final.append((match_id, start, end, identified_parent))
                else:
                    removed_sections += 1
        return sections_final

    def set_assertion_attributes(self, spans: Iterable[Span]):
        """
        Add Span-level attributes to entities based on which section they occur in.

        Args:
            spans: the spans to modify.
        """
        for span in spans:
            if (
                span._.section
                and span._.section.category in self.assertion_attributes_mapping
            ):
                attr_dict = self.assertion_attributes_mapping[span._.section.category]
                for (attr_name, attr_value) in attr_dict.items():
                    setattr(span._, attr_name, attr_value)

    def __call__(self, doc: Doc) -> Doc:
        """
        Call the Sectionizer on a spaCy doc. Sectionizer will identify sections using provided rules, then evaluate any
        section hierarchy as needed, create section spans, and modify attributes on existing spans based on the sections
        the spans occur in.

        Text preceding the first section header becomes a section without a category. The body of each section runs
        until the next section header or the end of the doc, shortened by the rule's `max_scope` when set, otherwise
        by `max_section_length` when set.

        Args:
            doc: The Doc to process.

        Returns:
            The processed spaCy Doc.
        """
        matches = self.__matcher(doc)
        if self.require_start_line:
            matches = self.filter_start_lines(doc, matches)
        if self.require_end_line:
            matches = self.filter_end_lines(doc, matches)
        if self._parent_sections:
            matches = self.set_parent_sections(matches)

        # If this has already been processed by the sectionizer, reset the sections
        doc._.sections = []
        # if there were no matches, return the doc as one section
        if not matches:
            doc._.sections.append(Section(None, 0, 0, 0, len(doc)))
            return doc

        section_list = []
        # if the first match does not begin at token 0, the leading text becomes an uncategorized section
        first_start = matches[0][1]
        if first_start != 0:
            section_list.append(Section(None, 0, 0, 0, first_start))
        # parent_idx values index into `matches`, while `section_list` may hold the extra leading section added
        # above; shift by that amount so a parent index resolves to the matching Section
        parent_offset = len(section_list)

        doc_length = len(doc)
        last_index = len(matches) - 1
        # handle section spans
        for i, match in enumerate(matches):
            # matches only carry a fourth element (parent_idx) when set_parent_sections was applied
            match_id, start, end = match[:3]
            parent_idx = match[3] if len(match) == 4 else None
            parent = None
            if parent_idx is not None:
                parent = section_list[parent_offset + parent_idx]

            # Make section header its own sentence
            if self._apply_sentence_boundary:
                # Section headers should be considered the start of a sentence
                doc[start].sent_start = True
                # Text following the header should also be considered a new sentence
                if end < doc_length:
                    doc[end].sent_start = True

            rule = self.__matcher.rule_map[self.nlp.vocab.strings[match_id]]

            # The body can extend at most to the next section header, or to the end of the doc for the last match
            body_limit = doc_length if i == last_index else matches[i + 1][1]
            # The rule's own max_scope takes precedence over the component-wide max_section_length
            scope = rule.max_scope if rule.max_scope is not None else self.max_section_length
            body_end = body_limit if scope is None else min(end + scope, body_limit)

            section_list.append(
                Section(rule.category, start, end, end, body_end, parent, rule)
            )

        for section in section_list:
            doc._.sections.append(section)
            start, end = section.section_span
            for token in doc[start:end]:
                token._.section = section

        # If it is specified to add assertion attributes,
        # iterate through the entities in doc and add them
        if self.assertion_attributes_mapping:
            # input_span_type may be None per the constructor's type hint; treat that as "no spans to modify"
            span_source = (self._input_span_type or "").lower()
            if span_source == "ents":
                self.set_assertion_attributes(doc.ents)
            elif span_source == "group":
                self.set_assertion_attributes(doc.spans[self._span_group_name])

        return doc

    def filter_start_lines(
        self, doc: Doc, matches: List[Tuple[int, int, int]]
    ) -> List[Tuple[int, int, int]]:
        """
        Keep only the matches whose first token is at the beginning of a new line, as judged by
        `util.is_start_line` with this component's `newline_pattern`.

        Args:
            doc: The Doc the matches were found in.
            matches: The match tuples (match_id, start, end) to filter.

        Returns:
            A list of match tuples (match_id, start, end) that meet the filter criteria.
        """
        kept = []
        for match in matches:
            if util.is_start_line(match[1], doc, self.newline_pattern):
                kept.append(match)
        return kept

    def filter_end_lines(
        self, doc: Doc, matches: List[Tuple[int, int, int]]
    ) -> List[Tuple[int, int, int]]:
        """
        Keep only the matches whose last token is followed by a new line, as judged by `util.is_end_line` with this
        component's `newline_pattern`.

        Args:
            doc: The Doc the matches were found in.
            matches: The match tuples (match_id, start, end) to filter.

        Returns:
            A list of match tuples (match_id, start, end) that meet the filter criteria.
        """
        kept = []
        for match in matches:
            # `end` is exclusive, so the last token of the header is at end - 1
            if util.is_end_line(match[2] - 1, doc, self.newline_pattern):
                kept.append(match)
        return kept

`input_span_type` `property` `writable`

The input source of entities for the component. Must be either "ents" corresponding to doc.ents or "group" for a spaCy span group.

Returns:

Type	Description
	The input type, "ents" or "group".

`rules` `property`

Gets list of rules associated with the Sectionizer.

Returns:

Type	Description
`List[SectionRule]`	The list of SectionRules associated with the Sectionizer.

`section_categories` `property`

Gets the set of categories used in the Sectionizer.

Returns:

Type	Description
`Set[str]`	The set of all section categories available to the Sectionizer.

`span_group_name` `property` `writable`

The name of the span group used by this component. If `input_span_type` is "group", calling this component will use spans in the span group with this name.

Returns:

Type	Description
`str`	The span group name.

`__call__(doc)`

Call the Sectionizer on a spaCy doc. Sectionizer will identify sections using provided rules, then evaluate any section hierarchy as needed, create section spans, and modify attributes on existing spans based on the sections the spans occur in.

Parameters:

Name	Type	Description	Default
`doc`	`Doc`	The Doc to process.	required

Returns:

Type	Description
`Doc`	The processed spaCy Doc.

Source code in medspacy/section_detection/sectionizer.py

def __call__(self, doc: Doc) -> Doc:
    """
    Call the Sectionizer on a spaCy doc. Sectionizer will identify sections using provided rules, then evaluate any
    section hierarchy as needed, create section spans, and modify attributes on existing spans based on the sections
    the spans occur in.

    Text preceding the first section header becomes a section without a category. The body of each section runs
    until the next section header or the end of the doc, shortened by the rule's `max_scope` when set, otherwise
    by `max_section_length` when set.

    Args:
        doc: The Doc to process.

    Returns:
        The processed spaCy Doc.
    """
    matches = self.__matcher(doc)
    if self.require_start_line:
        matches = self.filter_start_lines(doc, matches)
    if self.require_end_line:
        matches = self.filter_end_lines(doc, matches)
    if self._parent_sections:
        matches = self.set_parent_sections(matches)

    # If this has already been processed by the sectionizer, reset the sections
    doc._.sections = []
    # if there were no matches, return the doc as one section
    if not matches:
        doc._.sections.append(Section(None, 0, 0, 0, len(doc)))
        return doc

    section_list = []
    # if the first match does not begin at token 0, the leading text becomes an uncategorized section
    first_start = matches[0][1]
    if first_start != 0:
        section_list.append(Section(None, 0, 0, 0, first_start))
    # parent_idx values index into `matches`, while `section_list` may hold the extra leading section added
    # above; shift by that amount so a parent index resolves to the matching Section
    parent_offset = len(section_list)

    doc_length = len(doc)
    last_index = len(matches) - 1
    # handle section spans
    for i, match in enumerate(matches):
        # matches only carry a fourth element (parent_idx) when set_parent_sections was applied
        match_id, start, end = match[:3]
        parent_idx = match[3] if len(match) == 4 else None
        parent = None
        if parent_idx is not None:
            parent = section_list[parent_offset + parent_idx]

        # Make section header its own sentence
        if self._apply_sentence_boundary:
            # Section headers should be considered the start of a sentence
            doc[start].sent_start = True
            # Text following the header should also be considered a new sentence
            if end < doc_length:
                doc[end].sent_start = True

        rule = self.__matcher.rule_map[self.nlp.vocab.strings[match_id]]

        # The body can extend at most to the next section header, or to the end of the doc for the last match
        body_limit = doc_length if i == last_index else matches[i + 1][1]
        # The rule's own max_scope takes precedence over the component-wide max_section_length
        scope = rule.max_scope if rule.max_scope is not None else self.max_section_length
        body_end = body_limit if scope is None else min(end + scope, body_limit)

        section_list.append(
            Section(rule.category, start, end, end, body_end, parent, rule)
        )

    for section in section_list:
        doc._.sections.append(section)
        start, end = section.section_span
        for token in doc[start:end]:
            token._.section = section

    # If it is specified to add assertion attributes,
    # iterate through the entities in doc and add them
    if self.assertion_attributes_mapping:
        # input_span_type may be None per the constructor's type hint; treat that as "no spans to modify"
        span_source = (self._input_span_type or "").lower()
        if span_source == "ents":
            self.set_assertion_attributes(doc.ents)
        elif span_source == "group":
            self.set_assertion_attributes(doc.spans[self._span_group_name])

    return doc

`__init__(nlp, name='medspacy_sectionizer', rules='default', language_code='en', max_section_length=None, phrase_matcher_attr='LOWER', require_start_line=False, require_end_line=False, newline_pattern='[\\n\\r]+[\\s]*$', input_span_type='ents', span_group_name='medspacy_spans', span_attrs='default', apply_sentence_boundary=False)`

   Create a new Sectionizer component.

   Args:
       nlp: A SpaCy Language object.
       name: The name of the component.
       rules: The rules to load. Default is "default", loads rules packaged with medspaCy that are derived from
           SecTag, MIMIC-III, and practical refinement at the US Department of Veterans Affairs. If None, no rules
           are loaded. Otherwise, must be a path to a json file containing rules. Add SectionRules directly through
           `Sectionizer.add`.
       language_code: Language code to use (ISO code) as a default for loading resources.  See documentation
           and also the /resources directory to see which resources might be available in each language.
           Default is "en" for English.
       max_section_length: Optional argument specifying the maximum number of tokens following a section header
           which can be included in a section body. This can be useful if you think your section rules are
           incomplete and want to prevent sections from running too long in the note. Default is None, meaning that
           the scope of a section will be until either the next section header or the end of the document.
       phrase_matcher_attr: The token attribute to use for PhraseMatcher for rules where `pattern` is None. Default
           is 'LOWER'.
       require_start_line: Optionally require a section header to start on a new line. Default False.
       require_end_line: Optionally require a section header to end with a new line. Default False.
       newline_pattern: Regular expression to match the new line either preceding or following a header
           if either require_start_line or require_end_line are True. Default is r"[\n\r]+[\s]*$"
       span_attrs: The optional span attributes to modify. Default option "default" uses attributes in
           `DEFAULT_ATTRIBUTES`. If a dictionary of custom attributes, format is a dictionary mapping section
           categories to a dictionary containing the attribute name and the value to set the attribute to when a
           span is contained in a section of that category. Custom attributes must be assigned with
           `Span.set_extension` before creating the Sectionizer. If None, sectionizer will not modify span
           attributes.
       input_span_type: "ents" or "group". Where to look for spans when modifying attributes of spans
           contained in a section if `span_attrs` is not None. "ents" will modify attributes of spans in doc.ents.
           "group" will modify attributes of spans in the span group specified by `span_group_name`.
       span_group_name: The name of the span group used when `input_span_type` is "group". Default is
           "medspacy_spans".
       apply_sentence_boundary: Optionally end sentence before and after section header boundary. This ensures
           the section header is considered its own sentence.

Source code in medspacy/section_detection/sectionizer.py

def __init__(
    self,
    nlp: Language,
    name: str = "medspacy_sectionizer",
    rules: Optional[str] = "default",
    language_code: str = 'en',
    max_section_length: Optional[int] = None,
    phrase_matcher_attr: str = "LOWER",
    require_start_line: bool = False,
    require_end_line: bool = False,
    newline_pattern: str = r"[\n\r]+[\s]*$",
    input_span_type: Union[Literal["ents", "group"], None] = "ents",
    span_group_name: str = "medspacy_spans",
    span_attrs: Union[
        Literal["default"], Dict[str, Dict[str, Any]], None
    ] = "default",
    apply_sentence_boundary: bool = False,
):
    """
    Build a Sectionizer: store its settings, create its matcher, load rules and configure span attributes.

    Args:
        nlp: A spaCy Language object.
        name: The name of the component. Also passed to the internal MedspacyMatcher.
        rules: Which rules to load. "default" loads the packaged rules file for `language_code` (derived from
            SecTag, MIMIC-III, and practical refinement at the US Department of Veterans Affairs). Any other
            truthy value is treated as a path to a json file of rules. If None, no rules are loaded; rules can
            still be added later through `Sectionizer.add`.
        language_code: ISO language code selecting the resources sub-directory used for the default rules file.
            It is lower-cased before use. Default is "en" for English.
        max_section_length: Optional maximum number of tokens following a section header which can be included
            in a section body. Useful when section rules may be incomplete and sections would otherwise run too
            long. Default is None, meaning a section extends to the next section header or the end of the
            document.
        phrase_matcher_attr: The token attribute to use for PhraseMatcher for rules where `pattern` is None.
            Default is 'LOWER'.
        require_start_line: Optionally require a section header to start on a new line. Default False.
        require_end_line: Optionally require a section header to end with a new line. Default False.
        newline_pattern: Regular expression (compiled here) used to match the new line preceding or following a
            header when `require_start_line` or `require_end_line` is True. Default is r"[\n\r]+[\s]*$"
        input_span_type: "ents" or "group". Where to look for spans when modifying attributes of spans contained
            in a section if `span_attrs` is not None. "ents" uses doc.ents; "group" uses the span group named by
            `span_group_name`.
        span_group_name: The name of the span group used when `input_span_type` is "group". Default is
            "medspacy_spans".
        span_attrs: The span attributes to modify. "default" uses the packaged default mapping and registers the
            default Span extensions. A dictionary maps section categories to a dictionary of attribute name ->
            value to set on spans contained in a section of that category; every attribute named must already be
            registered with `Span.set_extension`. If None (or empty), no span attributes are modified.
        apply_sentence_boundary: Optionally end sentence before and after section header boundary. This ensures
            the section header is considered its own sentence.

    Raises:
        ValueError: If a custom `span_attrs` dictionary names an attribute that is not a registered Span
            extension.
    """
    # Plain configuration.
    self.nlp = nlp
    self.name = name
    self.max_section_length = max_section_length
    self.require_start_line = require_start_line
    self.require_end_line = require_end_line
    self.newline_pattern = re.compile(newline_pattern)
    self._input_span_type = input_span_type
    self._span_group_name = span_group_name
    self._apply_sentence_boundary = apply_sentence_boundary

    # State that is filled in as rules are added / span attributes are configured.
    self.assertion_attributes_mapping = None
    self._parent_sections = {}
    self._parent_required = {}

    self.__matcher = MedspacyMatcher(
        nlp, name=name, phrase_matcher_attr=phrase_matcher_attr
    )

    # The default rules file lives under <two directories up>/resources/<language>/.
    resources_root = Path(__file__).resolve().parents[2]
    self.DEFAULT_RULES_FILEPATH = path.join(
        resources_root,
        "resources",
        language_code.lower(),
        "section_patterns.json",
    )

    rule_path = self.DEFAULT_RULES_FILEPATH if rules == "default" else rules
    if rule_path:
        self.add(SectionRule.from_json(rule_path))

    if span_attrs == "default":
        self.assertion_attributes_mapping = DEFAULT_ATTRS
        self.register_default_attributes()
    elif span_attrs:
        # Fail early if a custom attribute has not been registered on Span.
        for attr_dict in span_attrs.values():
            for attr_name in attr_dict:
                if Span.has_extension(attr_name):
                    continue
                raise ValueError(
                    f"Custom extension {attr_name} has not been set. Please ensure Span.set_extension is "
                    f"called for your pipeline's custom extensions."
                )
        self.assertion_attributes_mapping = span_attrs

`add(rules)`

Adds SectionRules to the Sectionizer.

Parameters:

Name	Type	Description	Default
`rules`		A single SectionRule or a collection of SectionRules to add to the Sectionizer.	required

Source code in medspacy/section_detection/sectionizer.py

def add(self, rules):
    """
    Adds SectionRules to the Sectionizer.

    Every rule is handed to the internal matcher, and each rule's `parents` and `parent_required` settings are
    recorded per section category. When several rules share a category, their parents are merged, and
    conflicting `parent_required` values resolve to False; a RuntimeWarning is issued in both cases.

    Args:
        rules: A single SectionRule or a collection of SectionRules to add to the Sectionizer. Any iterable is
            accepted, including one-shot iterators such as generators.

    Raises:
        TypeError: If any item in `rules` is not a SectionRule.
    """
    if isinstance(rules, SectionRule):
        rules = [rules]
    else:
        # Materialize once: `rules` is traversed three times below (validation, matcher, parent bookkeeping),
        # which would silently drop everything after the first pass for a generator.
        rules = list(rules)

    for rule in rules:
        if not isinstance(rule, SectionRule):
            # Single formatted message; passing the type as a second positional argument would make the
            # exception text render as a tuple.
            raise TypeError(f"Rules must be type SectionRule, not {type(rule)}")

    self.__matcher.add(rules)

    for rule in rules:
        name = rule.category
        parents = rule.parents
        parent_required = rule.parent_required
        if parents:
            if name in self._parent_sections:
                warnings.warn(
                    f"Duplicate section title {name}. Merging parents. "
                    f"If this is not intended, please specify distinct titles.",
                    RuntimeWarning,
                )
                self._parent_sections[name].update(parents)
            else:
                self._parent_sections[name] = set(parents)

        if (
            name in self._parent_required
            and self._parent_required[name] != parent_required
        ):
            # Two rules for the same category disagree; fall back to the permissive setting.
            warnings.warn(
                f"Duplicate section title {name} has different parent_required option. "
                f"Setting parent_required to False.",
                RuntimeWarning,
            )
            self._parent_required[name] = False
        else:
            self._parent_required[name] = parent_required

`filter_end_lines(doc, matches)`

Filter a list of matches to only contain spans where the end token is followed by a new line.

Returns:

Type	Description
`List[Tuple[int, int, int]]`	A list of match tuples (match_id, start, end) that meet the filter criteria.

Source code in medspacy/section_detection/sectionizer.py

def filter_end_lines(
    self, doc: Doc, matches: List[Tuple[int, int, int]]
) -> List[Tuple[int, int, int]]:
    """
    Keep only the matches whose last token passes the end-of-line check.

    For each match tuple (match_id, start, end), the token index `end - 1` is tested with `util.is_end_line`
    against this component's `newline_pattern`. Input order is preserved.

    Args:
        doc: The Doc the matches refer to.
        matches: A list of match tuples (match_id, start, end).

    Returns:
        A list of match tuples (match_id, start, end) that meet the filter criteria.
    """
    kept = []
    for match in matches:
        last_token_index = match[2] - 1
        if util.is_end_line(last_token_index, doc, self.newline_pattern):
            kept.append(match)
    return kept

`filter_start_lines(doc, matches)`

Filter a list of matches to only contain spans where the start token is the beginning of a new line.

Returns:

Type	Description
`List[Tuple[int, int, int]]`	A list of match tuples (match_id, start, end) that meet the filter criteria.

Source code in medspacy/section_detection/sectionizer.py

def filter_start_lines(
    self, doc: Doc, matches: List[Tuple[int, int, int]]
) -> List[Tuple[int, int, int]]:
    """
    Keep only the matches whose first token passes the start-of-line check.

    For each match tuple (match_id, start, end), the token index `start` is tested with `util.is_start_line`
    against this component's `newline_pattern`. Input order is preserved.

    Args:
        doc: The Doc the matches refer to.
        matches: A list of match tuples (match_id, start, end).

    Returns:
        A list of match tuples (match_id, start, end) that meet the filter criteria.
    """
    kept = []
    for match in matches:
        first_token_index = match[1]
        if util.is_start_line(first_token_index, doc, self.newline_pattern):
            kept.append(match)
    return kept

`register_default_attributes()` `classmethod`

Register the default values for the Span attributes defined in DEFAULT_ATTRIBUTES.

Source code in medspacy/section_detection/sectionizer.py

@classmethod
def register_default_attributes(cls):
    """
    Register the default Span extensions, each with a default value of False.

    The extensions are `is_negated`, `is_uncertain`, `is_historical`, `is_hypothetical` and `is_family`.
    A ValueError raised while registering one of them is treated as "already registered" and ignored, so
    calling this more than once is harmless.
    """
    default_flags = (
        "is_negated",
        "is_uncertain",
        "is_historical",
        "is_hypothetical",
        "is_family",
    )
    for flag_name in default_flags:
        try:
            Span.set_extension(flag_name, default=False)
        except ValueError:
            # Extension already set; leave the existing registration untouched.
            continue

`set_assertion_attributes(spans)`

Add Span-level attributes to entities based on which section they occur in.

Parameters:

Name	Type	Description	Default
`spans`	`Iterable[Span]`	the spans to modify.	required

Source code in medspacy/section_detection/sectionizer.py

def set_assertion_attributes(self, spans: Iterable[Span]):
    """
    Set Span-level extension attributes on each span according to the category of its section.

    A span is modified only when `span._.section` is truthy and that section's category is a key of
    `self.assertion_attributes_mapping`; every attribute/value pair mapped to that category is then written
    onto `span._`. All other spans are left untouched.

    Args:
        spans: the spans to modify.
    """
    for span in spans:
        section = span._.section
        if not section:
            # Span is not inside any detected section.
            continue
        if section.category not in self.assertion_attributes_mapping:
            continue
        overrides = self.assertion_attributes_mapping[section.category]
        for attr_name, attr_value in overrides.items():
            setattr(span._, attr_name, attr_value)

`set_parent_sections(sections)`

Determine the legal parent-child section relationships from the list of in-order sections of a document and the possible parents of each section as specified in the section rules.

Parameters:

Name	Type	Description	Default
`sections`	`List[Tuple[int, int, int]]`	a list of spacy match tuples found in the doc	required

Returns:

Type	Description
`List[Tuple[int, int, int, int]]`	A list of tuples (match_id, start, end, parent_idx) where the first three indices are the same as the input
`List[Tuple[int, int, int, int]]`	and the added parent_idx represents the index in the list that corresponds to the parent section. Might be a
`List[Tuple[int, int, int, int]]`	smaller list than the input due to pruning with `parent_required`.

Source code in medspacy/section_detection/sectionizer.py

def set_parent_sections(
    self, sections: List[Tuple[int, int, int]]
) -> List[Tuple[int, int, int, int]]:
    """
    Determine the legal parent-child section relationships from the list
    of in-order sections of a document and the possible parents of each
    section as recorded from the section rules (`_parent_sections` and
    `_parent_required`, keyed by section category).

    For every match, the previously kept sections are walked backwards, starting at the one immediately
    before it and following only entries that are the parent or a sibling of the entry currently being
    inspected, until an entry whose category is one of the allowed parents is found or the walk cannot
    continue. Matches whose category requires a parent but for which none is found are dropped.

    Args:
        sections: a list of spacy match tuples (match_id, start, end) found in the doc, in document order

    Returns:
        A list of tuples (match_id, start, end, parent_idx) where the first three indices are the same as the input
        and the added parent_idx represents the index in the returned list that corresponds to the parent section,
        or None when the section has no parent. Might be a smaller list than the input due to pruning with
        `parent_required`.
    """
    sections_final = []
    removed_sections = 0
    for i, (match_id, start, end) in enumerate(sections):
        # Resolve the section category of this match through the rule that produced it.
        name = self.__matcher.rule_map[self.nlp.vocab.strings[match_id]].category
        required = self._parent_required[name]
        # Index this section would take in `sections_final` (input index minus sections dropped so far).
        i_a = i - removed_sections  # adjusted index for removed values
        if required and i_a == 0:
            # Nothing has been kept before this section, so a required parent cannot exist: drop it.
            removed_sections += 1
            continue
        elif i_a == 0 or name not in self._parent_sections.keys():
            # First kept section, or a category with no registered parents: keep it as a root.
            sections_final.append((match_id, start, end, None))
        else:
            parents = self._parent_sections[name]
            # NOTE(review): `identified_parent` is shared across iterations of the `for parent` loop and is
            # only overwritten when a walk succeeds, so if several allowed parents are found the last one
            # iterated wins. `parents` appears to be a set, so that order may be unspecified - confirm.
            identified_parent = None
            for parent in parents:
                # go backwards through the section "tree" until you hit a root or the start of the list
                # Start at the most recently kept section ...
                candidate = self.__matcher.rule_map[
                    self.nlp.vocab.strings[sections_final[i_a - 1][0]]
                ].category
                # ... and look up the category of that section's own parent, if it has one.
                candidates_parent_idx = sections_final[i_a - 1][3]
                if candidates_parent_idx is not None:
                    candidates_parent = self.__matcher.rule_map[
                        self.nlp.vocab.strings[
                            sections_final[candidates_parent_idx][0]
                        ]
                    ].category
                else:
                    candidates_parent = None
                candidate_i = i_a - 1
                # `candidate` doubles as the loop flag: setting it to None ends the walk.
                while candidate:
                    if candidate == parent:
                        # Found an allowed parent; remember its index in `sections_final` and stop.
                        identified_parent = candidate_i
                        candidate = None
                    else:
                        # if you are at the first kept section there is nothing earlier to inspect... no parent
                        if candidate_i < 1:
                            candidate = None
                            continue
                        # if the current candidate has no parent... no parent exists
                        if not candidates_parent:
                            candidate = None
                            continue
                        # otherwise get the previous item in the list
                        temp = self.__matcher.rule_map[
                            self.nlp.vocab.strings[
                                sections_final[candidate_i - 1][0]
                            ]
                        ].category
                        temp_parent_idx = sections_final[candidate_i - 1][3]
                        if temp_parent_idx is not None:
                            temp_parent = self.__matcher.rule_map[
                                self.nlp.vocab.strings[
                                    sections_final[temp_parent_idx][0]
                                ]
                            ].category
                        else:
                            temp_parent = None
                        # if the previous item is the parent of the current item
                        # OR if the previous item is a sibling of the current item
                        # continue to search
                        # (both comparisons are made on category names, not on list indices)
                        if (
                            temp == candidates_parent
                            or temp_parent == candidates_parent
                        ):
                            candidate = temp
                            candidates_parent = temp_parent
                            candidate_i -= 1
                        # otherwise, there is no further tree traversal
                        else:
                            candidate = None

            # keep the section unless a parent is required and none was identified
            if identified_parent is not None or not required:
                # if the parent is identified, add section
                # if the parent is not required, add section
                # if parent is not identified and required, do not add the section
                sections_final.append((match_id, start, end, identified_parent))
            else:
                # Dropped sections shift the adjusted index of every later section.
                removed_sections += 1
    return sections_final

medspacy.section_detection

Section

body_span property

section_span property

title_span property

__init__(category, title_start, title_end, body_start, body_end, parent=None, rule=None)

from_serialized_representation(serialized_representation) classmethod

serialized_representation()

SectionRule

__init__(literal, category, pattern=None, on_match=None, max_scope=None, parents=None, parent_required=False, metadata=None)

from_dict(rule_dict) classmethod

from_json(filepath) classmethod

to_dict()

Sectionizer

input_span_type property writable

rules property

section_categories property

span_group_name property writable

__call__(doc)

add(rules)

filter_end_lines(doc, matches)

filter_start_lines(doc, matches)

register_default_attributes() classmethod

set_assertion_attributes(spans)

set_parent_sections(sections)

`Section`

`body_span` `property`

`section_span` `property`

`title_span` `property`

`__init__(category, title_start, title_end, body_start, body_end, parent=None, rule=None)`

`from_serialized_representation(serialized_representation)` `classmethod`

`serialized_representation()`

`SectionRule`

`__init__(literal, category, pattern=None, on_match=None, max_scope=None, parents=None, parent_required=False, metadata=None)`

`from_dict(rule_dict)` `classmethod`

`from_json(filepath)` `classmethod`

`to_dict()`

`Sectionizer`

`input_span_type` `property` `writable`

`rules` `property`

`section_categories` `property`

`span_group_name` `property` `writable`

`__call__(doc)`

`add(rules)`

`filter_end_lines(doc, matches)`

`filter_start_lines(doc, matches)`

`register_default_attributes()` `classmethod`

`set_assertion_attributes(spans)`

`set_parent_sections(sections)`