Skip to content

medspacy.preprocess

PreprocessingRule

This is a rule for handling preprocessing in the medspaCy Preprocessor. This class does not inherit from BaseRule, as it cannot be used in a spaCy pipeline. The Preprocessor and PreprocessingRules are designed to preprocess text before entering a spaCy pipeline to allow for destructive preprocessing, such as stripping or replacing text.

Source code in medspacy/preprocess/preprocessing_rule.py
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
class PreprocessingRule:
    """
    This is a rule for handling preprocessing in the medspaCy Preprocessor. This class does not inherit from BaseRule,
    as it cannot be used in a spaCy pipeline. The Preprocessor and PreprocessingRules are designed to preprocess text
    before entering a spaCy pipeline to allow for destructive preprocessing, such as stripping or replacing text.
    """

    # Keys used by the dictionary form of a rule; these mirror exactly what `to_dict` writes.
    _ALLOWED_KEYS = {"pattern", "repl", "flags", "callback", "desc"}

    def __init__(
        self,
        pattern: str,
        repl: Union[str, Callable[[re.Match], Any]] = "",
        flags: re.RegexFlag = re.IGNORECASE,
        callback: Optional[Callable[[str, re.Match], str]] = None,
        desc: Optional[str] = None,
    ):
        """
        Creates a new PreprocessingRule. Preprocessing rules define spans of text to be removed and optionally
        replaced from the text underneath a doc.

        Args:
            pattern: The text pattern to match and replace in a doc. Must be a string, which will be compiled as
                a regular expression. The patterns will lead to re.Match objects.
            repl: The text to replace a matched string with. By default, repl is an empty string. If repl is a function,
                sends function to re.sub and it will be called on each Match object. More info here
                https://docs.python.org/3/library/re.html#re.sub
            flags: A regex compilation flag. Default is re.IGNORECASE.
            callback: An optional callable which takes the raw text and a Match and returns the new copy of the text,
                rather than just replacing strings for the matched text. This can allow larger text manipulation, such
                as stripping out an entire section based on a header.
            desc: An optional description.
        """
        self.pattern = re.compile(pattern, flags=flags)
        self.repl = repl
        self.callback = callback
        self.desc = desc

    @classmethod
    def from_dict(cls, d: Dict) -> "PreprocessingRule":
        """
        Creates a PreprocessingRule from a dictionary.

        Only the "pattern" key is required. Any of "repl", "flags", "callback" and "desc" that are absent fall
        back to the same defaults as the constructor, so a rule loaded from JSON (which cannot hold a callable)
        does not need to spell out every key.

        Args:
            d: The dict to read.

        Returns:
            A PreprocessingRule from the dictionary.

        Raises:
            KeyError: If `d` has no "pattern" key.
        """
        return cls(
            d["pattern"],
            repl=d.get("repl", ""),
            flags=d.get("flags", re.IGNORECASE),
            callback=d.get("callback"),
            desc=d.get("desc"),
        )

    def to_dict(self) -> Dict[str, Any]:
        """
        Writes a preprocessing rule to a dictionary. Useful for writing all rules to a json later.

        Returns:
            A dictionary containing the PreprocessingRule's data: the pattern source string, repl, callback, desc
            and the flags of the compiled pattern.
        """
        d = {
            "pattern": self.pattern.pattern,
            "repl": self.repl,
            "callback": self.callback,
            "desc": self.desc,
            "flags": self.pattern.flags,
        }
        return d

    @classmethod
    def from_json(cls, filepath):
        """
        Read a JSON file containing PreprocessingRule data at the key "preprocessing_rules".

        Args:
            filepath: The filepath of the JSON to read.

        Returns:
            A list of PreprocessingRules from the JSON file.
        """
        import json

        # Decode explicitly as UTF-8 so the result does not depend on the platform's default encoding.
        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)
        return [cls.from_dict(rule) for rule in data["preprocessing_rules"]]

    def __call__(self, text: str) -> str:
        """
        Apply this preprocessing rule to a text.

        If the rule has no callback, every match of the pattern is substituted with `repl` and the resulting
        string is returned. If a callback is set, the pattern is searched once; when nothing matches the text is
        returned unchanged, otherwise the callback is called with the text and that first match and its return
        value is returned.

        Args:
            text: The text to preprocess.

        Returns:
            The preprocessed text.
        """
        # Without a callback the rule is a plain regex substitution.
        if self.callback is None:
            return self.pattern.sub(self.repl, text)

        match = self.pattern.search(text)
        if match is None:
            return text
        return self.callback(text, match)

    def __repr__(self):
        return (
            f"PreprocessingRule(pattern={self.pattern.pattern}, flags={self.pattern.flags}, repl={self.repl}, "
            f"callback={self.callback}, desc={self.desc})"
        )

__call__(text)

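Apply this preprocessing rule to a text. If the rule's callback attribute is None, every match of the pattern is replaced using the pattern's sub method and the resulting string is returned. If callback is not None, the pattern is searched once: when nothing matches, the text is returned unchanged; otherwise the callback is called with the text and the first match, and its return value is returned.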

Source code in medspacy/preprocess/preprocessing_rule.py
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
def __call__(self, text):
    """
    Run this preprocessing rule over a text and return the result.

    With a callback set, the pattern is searched once and the callback receives the text together with that
    first match; its return value is the result. If the search finds nothing, the text is handed back untouched.
    Without a callback, every match of the pattern is substituted with `repl`.
    """
    if self.callback is not None:
        found = self.pattern.search(text)
        # No match means there is nothing for the callback to act on.
        return text if found is None else self.callback(text, found)

    # Plain substitution path: the rule only carries a repl.
    return self.pattern.sub(self.repl, text)

__init__(pattern, repl='', flags=re.IGNORECASE, callback=None, desc=None)

Creates a new PreprocessingRule. Preprocessing rules define spans of text to be removed and optionally replaced from the text underneath a doc.

Parameters:

Name Type Description Default
pattern str

The text pattern to match and replace in a doc. Must be a string, which will be compiled as a regular expression. The patterns will lead to re.Match objects.

required
repl Union[str, Callable[[Match], Any]]

The text to replace a matched string with. By default, repl is an empty string. If repl is a function, sends function to re.sub and it will be called on each Match object. More info here https://docs.python.org/3/library/re.html#re.sub

''
flags RegexFlag

A regex compilation flag. Default is re.IGNORECASE.

IGNORECASE
callback Optional[Callable[[str, Match], str]]

An optional callable which takes the raw text and a Match and returns the new copy of the text, rather than just replacing strings for the matched text. This can allow larger text manipulation, such as stripping out an entire section based on a header.

None
desc Optional[str]

An optional description.

None
Source code in medspacy/preprocess/preprocessing_rule.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
def __init__(
    self,
    pattern: str,
    repl: Union[str, Callable[[re.Match], Any]] = "",
    flags: re.RegexFlag = re.IGNORECASE,
    callback: Optional[Callable[[str, re.Match], str]] = None,
    desc: Optional[str] = None,
):
    """
    Build a PreprocessingRule, which describes text to strip out of, or replace in, the raw text of a document.

    Args:
        pattern: Regular expression source, given as a string. It is compiled here with `flags`.
        repl: Replacement for each match; an empty string by default. A function is also accepted, in which case
            it is passed to re.sub and called on each Match object. See
            https://docs.python.org/3/library/re.html#re.sub
        flags: Regex compilation flag(s). Defaults to re.IGNORECASE.
        callback: Optional callable taking the raw text and a Match and returning the new text. Use it for
            manipulations larger than replacing the matched string, such as removing a whole section found by
            its header.
        desc: Optional free-text description of the rule.
    """
    # Compile first so an invalid pattern fails before any attribute is set.
    compiled = re.compile(pattern, flags)
    self.pattern = compiled
    self.repl, self.callback, self.desc = repl, callback, desc

from_dict(d) classmethod

Creates a PreprocessingRule from a dictionary.

Parameters:

Name Type Description Default
d Dict

The dict to read.

required

Returns:

Type Description
PreprocessingRule

A PreprocessingRule from the dictionary.

Source code in medspacy/preprocess/preprocessing_rule.py
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
@classmethod
def from_dict(cls, d: Dict) -> "PreprocessingRule":
    """
    Creates a PreprocessingRule from a dictionary.

    Only the "pattern" key is required. Any of "repl", "flags", "callback" and "desc" that are absent fall back
    to the constructor defaults ("" / re.IGNORECASE / None / None), so a dictionary loaded from JSON does not
    have to carry every key.

    Args:
        d: The dict to read.

    Returns:
        A PreprocessingRule from the dictionary.

    Raises:
        KeyError: If `d` has no "pattern" key.
    """
    return cls(
        d["pattern"],
        repl=d.get("repl", ""),
        flags=d.get("flags", re.IGNORECASE),
        callback=d.get("callback"),
        desc=d.get("desc"),
    )

from_json(filepath) classmethod

Read a JSON file containing PreprocessingRule data at the key "preprocessing_rules".

Parameters:

Name Type Description Default
filepath

The filepath of the JSON to read.

required

Returns:

Type Description

A list of PreprocessingRules from the JSON file.

Source code in medspacy/preprocess/preprocessing_rule.py
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
@classmethod
def from_json(cls, filepath):
    """
    Read a JSON file containing PreprocessingRule data at the key "preprocessing_rules".

    Each entry under that key is passed to `from_dict`.

    Args:
        filepath: The filepath of the JSON to read.

    Returns:
        A list of PreprocessingRules from the JSON file.
    """
    import json

    # Decode explicitly as UTF-8 so the result does not depend on the platform's default encoding.
    with open(filepath, encoding="utf-8") as f:
        data = json.load(f)
    return [cls.from_dict(rule) for rule in data["preprocessing_rules"]]

to_dict()

Writes a preprocessing rule to a dictionary. Useful for writing all rules to a json later.

Returns:

Type Description

A dictionary containing the PreprocessingRule's data.

Source code in medspacy/preprocess/preprocessing_rule.py
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
def to_dict(self):
    """
    Serialize this preprocessing rule into a plain dictionary, e.g. as a step towards writing rules to json.

    Returns:
        A dictionary with the keys "pattern" (the regex source string), "repl", "callback", "desc" and "flags"
        (the flags of the compiled pattern).
    """
    regex = self.pattern
    return dict(
        pattern=regex.pattern,
        repl=self.repl,
        callback=self.callback,
        desc=self.desc,
        flags=regex.flags,
    )

Preprocessor

This is the medspacy Preprocessor class. It is designed as a wrapper for destructive preprocessing rules such as stripping or replacing text in a document before the text enters a spaCy pipeline.

This is NOT a spaCy component and cannot be added to a spaCy pipeline. Please use the preprocessor before calling nlp("your text here"). SpaCy only allows for non-destructive processing on the text, but that is not always advisable for every project, so this enables destructive preprocessing when required.

Source code in medspacy/preprocess/preprocessor.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
class Preprocessor:
    """
    This is the medspacy Preprocessor class. It is designed as a wrapper for destructive preprocessing rules such as
    stripping or replacing text in a document before the text enters a spaCy pipeline.

    This is NOT a spaCy component and cannot be added to a spaCy pipeline. Please use the preprocessor before
    calling `nlp("your text here")`. SpaCy only allows for non-destructive processing on the text, but that is not
    always advisable for every project, so this enables destructive preprocessing when required.
    """

    def __init__(self, tokenizer):
        """
        Creates a new Preprocessor with no rules.

        Args:
            tokenizer: A callable that is applied to the preprocessed text when the Preprocessor is called with
                `tokenize=True`.
        """
        self.tokenizer = tokenizer
        self._rules = []

    def add(self, rules: Union[PreprocessingRule, Iterable[PreprocessingRule]]):
        """
        Adds a PreprocessingRule or collection of PreprocessingRules to the Preprocessor.

        The rules are validated before any of them is stored, so a failed call leaves the Preprocessor unchanged.

        Args:
            rules: A single PreprocessingRule or a collection of PreprocessingRules to add.

        Raises:
            TypeError: If any element is not a PreprocessingRule.
        """
        if isinstance(rules, PreprocessingRule):
            rules = [rules]
        else:
            # Materialize once: a generator or other one-shot iterable would be exhausted by the validation
            # loop below and then contribute nothing when extending the rule list.
            rules = list(rules)
        for rule in rules:
            if not isinstance(rule, PreprocessingRule):
                raise TypeError(
                    f"Each rule must be an instance of PreprocessingRule, not {type(rule)}."
                )
        self._rules += rules

    def __call__(self, text, tokenize=True) -> Union[str, Doc]:
        """
        Applies every added rule to the text, in the order the rules were added.

        Args:
            text: The text to preprocess.
            tokenize: Whether to pass the preprocessed text through the tokenizer. Default is True.

        Returns:
            The preprocessed text itself if `tokenize` is false, otherwise whatever the tokenizer returns for it.
        """
        for rule in self._rules:
            text = rule(text)

        if not tokenize:
            return text

        return self.tokenizer(text)

__call__(text, tokenize=True)

Parameters:

Name Type Description Default
text
required
tokenize
True

Returns:

Source code in medspacy/preprocess/preprocessor.py
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
def __call__(self, text, tokenize=True) -> Union[str, Doc]:
    """
    Run the text through each stored rule in turn, then optionally tokenize it.

    Args:
        text: The text to preprocess.
        tokenize: When true (the default), the preprocessed text is passed to the tokenizer.

    Returns:
        The tokenizer's result for the preprocessed text, or the preprocessed text itself when `tokenize`
        is false.
    """
    processed = text
    for apply_rule in self._rules:
        # Each rule receives the output of the previous one.
        processed = apply_rule(processed)

    return self.tokenizer(processed) if tokenize else processed

__init__(tokenizer)

Parameters:

Name Type Description Default
tokenizer
required
Source code in medspacy/preprocess/preprocessor.py
18
19
20
21
22
23
24
25
def __init__(self, tokenizer):
    """
    Set up a Preprocessor that starts with an empty rule list.

    Args:
        tokenizer: Stored as-is on the instance as `tokenizer`.
    """
    self.tokenizer, self._rules = tokenizer, []

add(rules)

Adds a PreprocessingRule or collection of PreprocessingRules to the Preprocessor.

Parameters:

Name Type Description Default
rules Union[PreprocessingRule, Iterable[PreprocessingRule]]

A single PreprocessingRule or a collection of PreprocessingRules to add.

required
Source code in medspacy/preprocess/preprocessor.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
def add(self, rules: Union[PreprocessingRule, Iterable[PreprocessingRule]]):
    """
    Adds a PreprocessingRule or collection of PreprocessingRules to the Preprocessor.

    The rules are validated before any of them is stored, so a failed call leaves the Preprocessor unchanged.

    Args:
        rules: A single PreprocessingRule or a collection of PreprocessingRules to add.

    Raises:
        TypeError: If any element is not a PreprocessingRule.
    """
    if isinstance(rules, PreprocessingRule):
        rules = [rules]
    else:
        # Materialize once: a generator or other one-shot iterable would be exhausted by the validation
        # loop below and then contribute nothing when extending the rule list.
        rules = list(rules)
    for rule in rules:
        if not isinstance(rule, PreprocessingRule):
            raise TypeError(
                f"Each rule must be an instance of PreprocessingRule, not {type(rule)}."
            )
    self._rules += rules