Skip to content

medspacy.io.doc_consumer

DocConsumer

A DocConsumer object will consume a spacy doc and output rows based on a configuration provided by the user.

This component extracts structured information from a Doc. Information is stored in doc._.data, which is a nested dictionary. The outer keys represent the data type of can one or more of: - "ents": data about the spans in doc.ents such as the text, label, context attributes, section information, or custom attributes - "group": data about spans in a span group with the name span_group_attrs section text and category - "context": data about entity-modifier pairs extracted by ConText - "doc": a single doc-level representation. By default only doc.text is extracted, but other attributes may be specified

Once processed, a doc's data can be accessed either by:
    - doc._.data
    - doc._.get_data(dtype=...)
    - doc._.ent_data
    - doc._.to_dataframe(dtype=...)
Source code in medspacy/io/doc_consumer.py
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
@Language.factory("medspacy_doc_consumer")
class DocConsumer:
    """
    A DocConsumer object will consume a spacy doc and output rows based on a configuration provided by the user.

    This component extracts structured information from a Doc. Information is stored in doc._.data, which is a
        nested dictionary. The outer keys represent the data type of can one or more of:
            - "ents": data about the spans in doc.ents such as the text, label,
                context attributes, section information, or custom attributes
            - "group": data about spans in a span group with the name `span_group_attrs` section text and category
            - "context": data about entity-modifier pairs extracted by ConText
            - "doc": a single doc-level representation. By default only doc.text is extracted, but other attributes may
                be specified

        Once processed, a doc's data can be accessed either by:
            - doc._.data
            - doc._.get_data(dtype=...)
            - doc._.ent_data
            - doc._.to_dataframe(dtype=...)
    """

    def __init__(
        self,
        nlp,
        name: str = "medspacy_doc_consumer",
        dtypes: Tuple = ("ents",),
        dtype_attrs: Dict = None,
        span_group_name: str = "medspacy_spans",
    ):
        """
        Creates a new DocConsumer.

        Args:
            nlp: A spaCy model
            dtypes: Either a tuple of data types to collect or the string "all". Default ("ents",). Valid  options are:
                "ents", "group", "section", "context", "doc".
            dtype_attrs: An optional dictionary mapping the data types in dtypes to a list of attributes. If None, will
                set defaults for each dtype. Attributes for "ents", "group", and "doc" may be customized be adding either
                native or custom attributes (i.e., ent._....) "context" and "section" are not customizable at this time.
                Default values for each dtype can be retrieved by the class method `DocConsumer.get_default_attrs()
            span_group_name: the name of the span group used when dtypes contains "group". At this time, only one span
                group is supported.
        """
        self.nlp = nlp
        self.name = name
        self._span_group_name = span_group_name
        if not isinstance(dtypes, tuple):
            if dtypes == "all":
                dtypes = tuple(ALLOWED_DATA_TYPES)
            else:
                raise ValueError(
                    "dtypes must be either 'all' or a tuple, not {0}".format(dtypes)
                )
        for dtype in dtypes:
            if dtype not in ALLOWED_DATA_TYPES:
                raise ValueError(
                    "Invalid dtypes. Supported dtypes are {0}, not {1}".format(
                        ALLOWED_DATA_TYPES, dtype
                    )
                )
            if dtype == "section":
                self.validate_section_attrs(dtype_attrs)
        self.dtypes = dtypes
        self.dtype_attrs = dtype_attrs

        if self.dtype_attrs is None:
            self._set_default_attrs()

    @classmethod
    def get_default_attrs(cls, dtypes: Optional[Tuple] = None):
        """
        Gets the default attributes available to each type specified.

        Args:
            dtypes: Optional tuple containing "ents", "group", "context", "section", or "doc". If None, all will be
                returned.

        Returns:
            The attributes the doc consumer will output for each of the specified types in `dtypes`.
        """
        if dtypes is None:
            dtypes = ALLOWED_DATA_TYPES
        else:
            if isinstance(dtypes, str):
                dtypes = (dtypes,)
            for dtype in dtypes:
                if dtype not in ALLOWED_DATA_TYPES:
                    raise ValueError("Invalid dtype,", dtype)
        dtype_attrs = {
            dtype: list(attrs)
            for (dtype, attrs) in DEFAULT_ATTRS.items()
            if dtype in dtypes
        }
        return dtype_attrs

    def _set_default_attrs(self):
        """
        Gets the default attributes.
        """
        self.dtype_attrs = self.get_default_attrs(self.dtypes)

    def validate_section_attrs(self, attrs):
        """
        Validate that section attributes are either not specified or are valid attribute names.
        """
        if attrs is None:
            return True
        if "section" not in attrs:
            return True
        diff = set(attrs["section"]).difference(ALLOWED_SECTION_ATTRS)
        if diff:
            raise ValueError("Invalid section dtype_attrs specified: {0}".format(diff))
        return True

    def __call__(self, doc):
        """
        Call the doc consumer on a doc and assign the data.

        Args:
            doc: The Doc to process.

        Returns:
            The processed Doc.
        """
        data = dict()
        for dtype, attrs in self.dtype_attrs.items():
            data.setdefault(dtype, OrderedDict())
            for attr in attrs:
                data[dtype][attr] = list()
        if "ents" in self.dtypes:
            for ent in doc.ents:
                for attr in self.dtype_attrs["ents"]:
                    try:
                        val = getattr(ent, attr)
                    except AttributeError:
                        val = getattr(ent._, attr)
                    data["ents"][attr].append(val)
        if "group" in self.dtypes:
            for span in doc.spans[self._span_group_name]:
                for attr in self.dtype_attrs["group"]:
                    try:
                        val = getattr(span, attr)
                    except AttributeError:
                        val = getattr(span._, attr)
                    data["group"][attr].append(val)
        if "context" in self.dtypes:
            for (ent, modifier) in doc._.context_graph.edges:
                self.add_context_edge_attributes(ent, modifier, data["context"], doc)
        if "section" in self.dtypes:
            for section in doc._.sections:
                self.add_section_attributes(section, data["section"], doc)
        if "doc" in self.dtypes:
            for attr in self.dtype_attrs["doc"]:
                try:
                    val = getattr(doc, attr)
                except AttributeError:
                    val = getattr(doc._, attr)
                data["doc"][attr].append(val)

        doc._.data = data
        return doc

    def add_context_edge_attributes(
        self, ent: Span, modifier: ConTextModifier, context_data, doc
    ):
        span_tup = modifier.modifier_span
        span = doc[span_tup[0] : span_tup[1]]
        scope_tup = modifier.scope_span
        scope = doc[scope_tup[0] : scope_tup[1]]
        for attr in self.dtype_attrs["context"]:
            if attr == "ent_text":
                context_data["ent_text"].append(ent.text)
            elif attr == "ent_label_":
                context_data["ent_label_"].append(ent.label_)
            elif attr == "ent_start_char":
                context_data["ent_start_char"].append(ent.start_char)
            elif attr == "ent_end_char":
                context_data["ent_end_char"].append(ent.end_char)
            elif attr == "modifier_text":
                context_data["modifier_text"].append(span.text)
            elif attr == "modifier_category":
                context_data["modifier_category"].append(modifier.category)
            elif attr == "modifier_direction":
               context_data["modifier_direction"].append(modifier.direction)
            elif attr == "modifier_start_char":
                context_data["modifier_start_char"].append(span.start_char)
            elif attr == "modifier_end_char":
                context_data["modifier_end_char"].append(span.end_char)
            elif attr == "modifier_scope_start_char":
                context_data["modifier_scope_start_char"].append(scope.start_char)
            elif attr == "modifier_scope_end_char":
                context_data["modifier_scope_end_char"].append(scope.end_char)
            else:
            # if specified attribute is not one of these standard values, check the entity to see if it's an entity value
                try:
                    val = getattr(ent, attr)
                except AttributeError:
                    try:
                        val = getattr(ent._, attr)
                    except AttributeError:
                        raise ValueError(f"Attributes for dtype 'context' must be either "
                                         f"a registered custom Span attribute (i.e., Span._.attr) or one of these pre-defined values: "
                                          f"{ALLOWED_CONTEXT_ATTRS}. \nYou passed in '{attr}'")
                context_data[f"{attr}"].append(val)

    def add_section_attributes(self, section, section_data, doc):
        # Allow for null sections
        section_title_tup = section.title_span
        section_body_tup = section.body_span
        section_title = doc[section_title_tup[0] : section_title_tup[1]]
        section_body = doc[section_body_tup[0] : section_body_tup[1]]
        if "section_category" in self.dtype_attrs["section"]:
            section_data["section_category"].append(section.category)
        if section.category is not None:
            if "section_title_text" in self.dtype_attrs["section"]:
                section_data["section_title_text"].append(section_title.text)
            if "section_title_start_char" in self.dtype_attrs["section"]:
                section_data["section_title_start_char"].append(
                    section_title.start_char
                )
            if "section_title_end_char" in self.dtype_attrs["section"]:
                section_data["section_title_end_char"].append(section_title.end_char)
        else:
            if "section_title_text" in self.dtype_attrs["section"]:
                section_data["section_title_text"].append(None)
            if "section_title_start_char" in self.dtype_attrs["section"]:
                section_data["section_title_start_char"].append(0)
            if "section_title_end_char" in self.dtype_attrs["section"]:
                section_data["section_title_end_char"].append(0)
        if "section_body" in self.dtype_attrs["section"]:
            section_data["section_body"].append(section_body.text)
        if "section_body_start_char" in self.dtype_attrs["section"]:
            section_data["section_body_start_char"].append(section_body.start_char)
        if "section_body_end_char" in self.dtype_attrs["section"]:
            section_data["section_body_end_char"].append(section_body.end_char)
        if "section_parent" in self.dtype_attrs["section"]:
            section_data["section_parent"].append(section.parent)

__call__(doc)

Call the doc consumer on a doc and assign the data.

Parameters:

Name Type Description Default
doc

The Doc to process.

required

Returns:

Type Description

The processed Doc.

Source code in medspacy/io/doc_consumer.py
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
def __call__(self, doc):
    """
    Call the doc consumer on a doc and assign the data.

    Args:
        doc: The Doc to process.

    Returns:
        The processed Doc.
    """
    data = dict()
    for dtype, attrs in self.dtype_attrs.items():
        data.setdefault(dtype, OrderedDict())
        for attr in attrs:
            data[dtype][attr] = list()
    if "ents" in self.dtypes:
        for ent in doc.ents:
            for attr in self.dtype_attrs["ents"]:
                try:
                    val = getattr(ent, attr)
                except AttributeError:
                    val = getattr(ent._, attr)
                data["ents"][attr].append(val)
    if "group" in self.dtypes:
        for span in doc.spans[self._span_group_name]:
            for attr in self.dtype_attrs["group"]:
                try:
                    val = getattr(span, attr)
                except AttributeError:
                    val = getattr(span._, attr)
                data["group"][attr].append(val)
    if "context" in self.dtypes:
        for (ent, modifier) in doc._.context_graph.edges:
            self.add_context_edge_attributes(ent, modifier, data["context"], doc)
    if "section" in self.dtypes:
        for section in doc._.sections:
            self.add_section_attributes(section, data["section"], doc)
    if "doc" in self.dtypes:
        for attr in self.dtype_attrs["doc"]:
            try:
                val = getattr(doc, attr)
            except AttributeError:
                val = getattr(doc._, attr)
            data["doc"][attr].append(val)

    doc._.data = data
    return doc

__init__(nlp, name='medspacy_doc_consumer', dtypes=('ents',), dtype_attrs=None, span_group_name='medspacy_spans')

Creates a new DocConsumer.

Parameters:

Name Type Description Default
nlp

A spaCy model

required
dtypes Tuple

Either a tuple of data types to collect or the string "all". Default ("ents",). Valid options are: "ents", "group", "section", "context", "doc".

('ents',)
dtype_attrs Dict

An optional dictionary mapping the data types in dtypes to a list of attributes. If None, will set defaults for each dtype. Attributes for "ents", "group", and "doc" may be customized be adding either native or custom attributes (i.e., ent._....) "context" and "section" are not customizable at this time. Default values for each dtype can be retrieved by the class method `DocConsumer.get_default_attrs()

None
span_group_name str

the name of the span group used when dtypes contains "group". At this time, only one span group is supported.

'medspacy_spans'
Source code in medspacy/io/doc_consumer.py
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
def __init__(
    self,
    nlp,
    name: str = "medspacy_doc_consumer",
    dtypes: Tuple = ("ents",),
    dtype_attrs: Dict = None,
    span_group_name: str = "medspacy_spans",
):
    """
    Creates a new DocConsumer.

    Args:
        nlp: A spaCy model
        dtypes: Either a tuple of data types to collect or the string "all". Default ("ents",). Valid  options are:
            "ents", "group", "section", "context", "doc".
        dtype_attrs: An optional dictionary mapping the data types in dtypes to a list of attributes. If None, will
            set defaults for each dtype. Attributes for "ents", "group", and "doc" may be customized be adding either
            native or custom attributes (i.e., ent._....) "context" and "section" are not customizable at this time.
            Default values for each dtype can be retrieved by the class method `DocConsumer.get_default_attrs()
        span_group_name: the name of the span group used when dtypes contains "group". At this time, only one span
            group is supported.
    """
    self.nlp = nlp
    self.name = name
    self._span_group_name = span_group_name
    if not isinstance(dtypes, tuple):
        if dtypes == "all":
            dtypes = tuple(ALLOWED_DATA_TYPES)
        else:
            raise ValueError(
                "dtypes must be either 'all' or a tuple, not {0}".format(dtypes)
            )
    for dtype in dtypes:
        if dtype not in ALLOWED_DATA_TYPES:
            raise ValueError(
                "Invalid dtypes. Supported dtypes are {0}, not {1}".format(
                    ALLOWED_DATA_TYPES, dtype
                )
            )
        if dtype == "section":
            self.validate_section_attrs(dtype_attrs)
    self.dtypes = dtypes
    self.dtype_attrs = dtype_attrs

    if self.dtype_attrs is None:
        self._set_default_attrs()

_set_default_attrs()

Gets the default attributes.

Source code in medspacy/io/doc_consumer.py
156
157
158
159
160
def _set_default_attrs(self):
    """
    Gets the default attributes.
    """
    self.dtype_attrs = self.get_default_attrs(self.dtypes)

get_default_attrs(dtypes=None) classmethod

Gets the default attributes available to each type specified.

Parameters:

Name Type Description Default
dtypes Optional[Tuple]

Optional tuple containing "ents", "group", "context", "section", or "doc". If None, all will be returned.

None

Returns:

Type Description

The attributes the doc consumer will output for each of the specified types in dtypes.

Source code in medspacy/io/doc_consumer.py
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
@classmethod
def get_default_attrs(cls, dtypes: Optional[Tuple] = None):
    """
    Gets the default attributes available to each type specified.

    Args:
        dtypes: Optional tuple containing "ents", "group", "context", "section", or "doc". If None, all will be
            returned.

    Returns:
        The attributes the doc consumer will output for each of the specified types in `dtypes`.
    """
    if dtypes is None:
        dtypes = ALLOWED_DATA_TYPES
    else:
        if isinstance(dtypes, str):
            dtypes = (dtypes,)
        for dtype in dtypes:
            if dtype not in ALLOWED_DATA_TYPES:
                raise ValueError("Invalid dtype,", dtype)
    dtype_attrs = {
        dtype: list(attrs)
        for (dtype, attrs) in DEFAULT_ATTRS.items()
        if dtype in dtypes
    }
    return dtype_attrs

validate_section_attrs(attrs)

Validate that section attributes are either not specified or are valid attribute names.

Source code in medspacy/io/doc_consumer.py
162
163
164
165
166
167
168
169
170
171
172
173
def validate_section_attrs(self, attrs):
    """
    Validate that section attributes are either not specified or are valid attribute names.
    """
    if attrs is None:
        return True
    if "section" not in attrs:
        return True
    diff = set(attrs["section"]).difference(ALLOWED_SECTION_ATTRS)
    if diff:
        raise ValueError("Invalid section dtype_attrs specified: {0}".format(diff))
    return True