Module gatenlp.lib_spacy
Support for using spacy: convert from spacy to gatenlp documents and annotations.
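A minimal usage sketch (assuming the "en_core_web_sm" model is installed):

import spacy
from gatenlp import Document
from gatenlp.lib_spacy import AnnSpacy

# run a spacy pipeline over a gatenlp document via the AnnSpacy annotator
nlp = spacy.load("en_core_web_sm")
doc = Document("Barack Obama visited Berlin.")
doc = AnnSpacy(pipeline=nlp)(doc)
for ann in doc.annset():
    print(ann.type, ann.start, ann.end)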
Source code
"""
Support for using spacy: convert from spacy to gatenlp documents and annotations.
"""
from gatenlp import Document, AnnotationSet
from gatenlp.processing.annotator import Annotator
import spacy
class AnnSpacy(Annotator):
""" """
def __init__(
self,
pipeline=None,
outsetname="",
token_type="Token",
space_token_type="SpaceToken",
sentence_type="Sentence",
nounchunk_type="NounChunk",
add_tokens=True,
# add_spacetokens=True, # not sure how to do this yet
add_entities=True,
add_sentences=True,
add_nounchunks=True,
add_deps=True,
ent_prefix=None,
):
"""
Create an annotator for running a spacy pipeline on documents.
Args:
pipeline: a pre-configured spacy pipeline to use; if not given, the
"en_core_web_sm" pipeline is loaded (Default value = None)
outsetname: the annotation set name where to put the annotations
token_type: the annotation type for the token annotations
space_token_type: type of any space token annotations
sentence_type: the annotation type for the sentence annotations
nounchunk_type: annotation type for noun chunks
add_tokens: if token annotations should be added
add_entities: if true, add entity annotations
add_sentences: if sentence annotations should be added
add_nounchunks: if nounchunks should be added
add_deps: if dependencies should be added
ent_prefix: the prefix to add to all entity annotation types
"""
self.outsetname = outsetname
self.token_type = token_type
self.sentence_type = sentence_type
self.add_entities = add_entities
self.ent_prefix = ent_prefix
self.space_token_type = space_token_type
self.nounchunk_type = nounchunk_type
self.add_tokens = add_tokens
self.add_sentences = add_sentences
self.add_nounchunks = add_nounchunks
self.add_deps = add_deps
if pipeline:
self.pipeline = pipeline
else:
self.pipeline = spacy.load("en_core_web_sm")
def __call__(self, doc, **kwargs):
spacy_doc = self.pipeline(doc.text)
spacy2gatenlp(
spacy_doc,
doc,
setname=self.outsetname,
token_type=self.token_type,
space_token_type=self.space_token_type,
sentence_type=self.sentence_type,
nounchunk_type=self.nounchunk_type,
add_tokens=self.add_tokens,
add_ents=self.add_entities,
add_nounchunks=self.add_nounchunks,
add_sents=self.add_sentences,
add_dep=self.add_deps,
ent_prefix=self.ent_prefix,
)
return doc
def apply_spacy(nlp, gatenlpdoc, setname="", containing_anns=None,
component_cfg=None, retrieve_spans=None):
"""Run the spacy nlp pipeline on the gatenlp document and transfer the annotations.
This modifies the gatenlp document in place.
Args:
nlp: spacy pipeline
gatenlpdoc: gatenlp document
setname: annotation set to receive the annotations (Default value = "")
containing_anns: annotation set or iterable of annotations. If not None, only the text covered by each
of the annotations is analyzed. The annotations should not overlap.
component_cfg: the name of a spacy pipeline component; if given, the features of each containing
annotation are passed to that component as its config (only used with containing_anns)
retrieve_spans: if not None, a list of additional span types to retrieve from the SpaCy document
Returns:
The modified document.
"""
if containing_anns:
component_config = None
if isinstance(containing_anns, AnnotationSet):
annsiter = containing_anns.fast_iter()
else:
annsiter = containing_anns
for ann in annsiter:
if component_cfg:
component_config = {component_cfg: ann.features.to_dict()}
covered = gatenlpdoc[ann.start:ann.end]
spacydoc = nlp(covered, component_cfg=component_config)
spacy2gatenlp(spacydoc, gatenlpdoc=gatenlpdoc, setname=setname,
start_offset=ann.start, retrieve_spans=retrieve_spans)
elems = dir(spacydoc._)
for elem in elems:
if elem not in ['get', 'set', 'has']:
ann.features[elem] = spacydoc._.get(elem)
return gatenlpdoc
else:
spacydoc = nlp(gatenlpdoc.text)
return spacy2gatenlp(spacydoc, gatenlpdoc=gatenlpdoc, setname=setname)
def spacy2gatenlp(
spacydoc,
gatenlpdoc=None,
setname="",
token_type="Token",
space_token_type="SpaceToken",
sentence_type="Sentence",
nounchunk_type="NounChunk",
add_tokens=True,
# add_spacetokens=True, # not sure how to do this yet
add_ents=True,
add_sents=True,
add_nounchunks=True,
add_dep=True,
ent_prefix=None,
start_offset=0,
retrieve_spans=None
):
"""Convert a spacy document to a gatenlp document. If a gatenlp document is already
provided, add the annotations from the spacy document to it. In this case the
original gatenlpdoc is used and gets modified.
Args:
spacydoc: a spacy document
gatenlpdoc: if None, a new gatenlp document is created otherwise this
document is added to. (Default value = None)
setname: the annotation set name to which the annotations get added, empty string
for the default annotation set.
token_type: the annotation type to use for tokens (Default value = "Token")
space_token_type: the annotation type to use for space tokens (Default value = "SpaceToken")
sentence_type: the annotation type to use for sentence annotations (Default value = "Sentence")
nounchunk_type: the annotation type to use for noun chunk annotations (Default value = "NounChunk")
add_tokens: should annotations for tokens get added? If not, dependency parser
info cannot be added either. (Default value = True)
add_ents: should annotations for entities get added (Default value = True)
add_sents: should sentence annotations get added (Default value = True)
add_nounchunks: should noun chunk annotations get added (Default value = True)
add_dep: should dependency parser information get added (Default value = True)
ent_prefix: the prefix to add to all entity annotation types (Default value = None)
start_offset: if a gatenlp document is given, the offset where the spacy-annotated text starts
can be specified. This allows processing part of a document with spacy and then adding the
annotations back to the document at the corresponding position. (Default value = 0)
retrieve_spans: if not None, a list of additional Spacy span types to retrieve
Returns:
the new or modified Document
"""
# add_spacetokens: (Default value = True)
# not sure how to do this yet
if retrieve_spans is None:
retrieve_spans = []
if gatenlpdoc is None:
retdoc = Document(spacydoc.text)
start_offset = 0
else:
retdoc = gatenlpdoc
toki2annid = {}
annset = retdoc.annset(setname)
for tok in spacydoc:
from_off = tok.idx
to_off = tok.idx + len(tok)
# is_space = tok.is_space
fm = {
"_i": tok.i,
"is_alpha": tok.is_alpha,
"is_bracket": tok.is_bracket,
"is_currency": tok.is_currency,
"is_digit": tok.is_digit,
"is_left_punct": tok.is_left_punct,
"is_lower": tok.is_lower,
"is_oov": tok.is_oov,
"is_punct": tok.is_punct,
"is_quote": tok.is_quote,
"is_right_punct": tok.is_right_punct,
"is_sent_start": tok.is_sent_start,
"is_space": tok.is_space,
"is_stop": tok.is_stop,
"is_title": tok.is_title,
"is_upper": tok.is_upper,
"lang": tok.lang_,
"lemma": tok.lemma_,
"like_email": tok.like_email,
"like_num": tok.like_num,
"like_url": tok.like_url,
"orth": tok.orth,
"pos": tok.pos_,
"prefix": tok.prefix_,
"prob": tok.prob,
"rank": tok.rank,
"sentiment": tok.sentiment,
"tag": tok.tag_,
"shape": tok.shape_,
"suffix": tok.suffix_,
}
if spacydoc.has_annotation("ENT_IOB") and add_ents:
fm["ent_type"] = tok.ent_type_
if spacydoc.has_annotation("DEP") and add_dep:
fm["dep"] = tok.dep_
if tok.is_space:
anntype = space_token_type
else:
anntype = token_type
annid = annset.add(from_off+start_offset, to_off+start_offset, anntype, fm).id
toki2annid[tok.i] = annid
# print("Added annotation with id: {} for token {}".format(annid, tok.i))
ws = tok.whitespace_
if len(ws) > 0:
annset.add(to_off+start_offset, to_off + len(ws)+start_offset, space_token_type, {"is_space": True})
# if we have a dependency parse, now also add the parse edges
if spacydoc.has_annotation("DEP") and add_tokens and add_dep:
for tok in spacydoc:
ann = annset.get(toki2annid[tok.i])
ann.features["head"] = toki2annid[tok.head.i]
ann.features["left_edge"] = toki2annid[tok.left_edge.i]
ann.features["right_edge"] = toki2annid[tok.right_edge.i]
if spacydoc.ents and add_ents:
for ent in spacydoc.ents:
if ent_prefix:
entname = ent_prefix + ent.label_
else:
entname = ent.label_
annset.add(ent.start_char+start_offset, ent.end_char+start_offset, entname, {"lemma": ent.lemma_})
if spacydoc.sents and add_sents:
for sent in spacydoc.sents:
annset.add(sent.start_char+start_offset, sent.end_char+start_offset, sentence_type, {})
if spacydoc.noun_chunks and add_nounchunks:
for chunk in spacydoc.noun_chunks:
annset.add(chunk.start_char+start_offset, chunk.end_char+start_offset, nounchunk_type, {})
for spanType in retrieve_spans:
for span in spacydoc.spans[spanType]:
annset.add(span.start_char+start_offset, span.end_char+start_offset, spanType, {})
return retdoc
Functions
def apply_spacy(nlp, gatenlpdoc, setname='', containing_anns=None, component_cfg=None, retrieve_spans=None)
-
Run the spacy nlp pipeline on the gatenlp document and transfer the annotations. This modifies the gatenlp document in place.
Args
nlp
- spacy pipeline
gatenlpdoc
- gatenlp document
setname
- annotation set to receive the annotations (Default value = "")
containing_anns
- annotation set or iterable of annotations. If not None, only the text covered by each of the annotations is analyzed. The annotations should not overlap.
component_cfg
- the name of a spacy pipeline component; if given, the features of each containing annotation are passed to that component as its config (only used with containing_anns)
retrieve_spans
- if not None, a list of additional span types to retrieve from the SpaCy document
Returns
The modified document.
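A minimal sketch of restricting analysis to pre-annotated regions (the "Paragraph" annotations and the set names are illustrative, not part of the API):

import spacy
from gatenlp import Document
from gatenlp.lib_spacy import apply_spacy

nlp = spacy.load("en_core_web_sm")
doc = Document("First paragraph text. Second paragraph text.")
# hypothetical containing annotations marking the regions to analyze
paras = doc.annset("Original")
paras.add(0, 21, "Paragraph")
paras.add(22, 44, "Paragraph")
# each Paragraph region is run through spacy separately; the resulting
# annotations are added to set "Spacy" at the correct document offsets
apply_spacy(nlp, doc, setname="Spacy", containing_anns=paras)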
def spacy2gatenlp(spacydoc, gatenlpdoc=None, setname='', token_type='Token', space_token_type='SpaceToken', sentence_type='Sentence', nounchunk_type='NounChunk', add_tokens=True, add_ents=True, add_sents=True, add_nounchunks=True, add_dep=True, ent_prefix=None, start_offset=0, retrieve_spans=None)
-
Convert a spacy document to a gatenlp document. If a gatenlp document is already provided, add the annotations from the spacy document to it. In this case the original gatenlpdoc is used and gets modified.
Args
spacydoc
- a spacy document
gatenlpdoc
- if None, a new gatenlp document is created otherwise this document is added to. (Default value = None)
setname
- the annotation set name to which the annotations get added, empty string for the default annotation set.
token_type
- the annotation type to use for tokens (Default value = "Token")
space_token_type
- the annotation type to use for space tokens (Default value = "SpaceToken")
sentence_type
- the annotation type to use for sentence annotations (Default value = "Sentence")
nounchunk_type
- the annotation type to use for noun chunk annotations (Default value = "NounChunk")
add_tokens
- should annotations for tokens get added? If not, dependency parser info cannot be added either. (Default value = True)
add_ents
- should annotations for entities get added (Default value = True)
add_sents
- should sentence annotations get added (Default value = True)
add_nounchunks
- should noun chunk annotations get added (Default value = True)
add_dep
- should dependency parser information get added (Default value = True)
ent_prefix
- the prefix to add to all entity annotation types (Default value = None)
start_offset
- if a gatenlp document is given, the offset where the spacy-annotated text starts can be specified. This allows processing part of a document with spacy and then adding the annotations back to the document at the corresponding position. (Default value = 0)
retrieve_spans
- if not None, a list of additional Spacy span types to retrieve
Returns
the new or modified Document
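A minimal sketch of converting a standalone spacy Doc into a new gatenlp Document (assuming "en_core_web_sm" is installed):

import spacy
from gatenlp.lib_spacy import spacy2gatenlp

nlp = spacy.load("en_core_web_sm")
sdoc = nlp("The quick brown fox jumps over the lazy dog.")
# no gatenlpdoc is passed, so a new Document is created from sdoc.text
gdoc = spacy2gatenlp(sdoc)
tokens = gdoc.annset().with_type("Token")
print(len(tokens))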
Classes
class AnnSpacy (pipeline=None, outsetname='', token_type='Token', space_token_type='SpaceToken', sentence_type='Sentence', nounchunk_type='NounChunk', add_tokens=True, add_entities=True, add_sentences=True, add_nounchunks=True, add_deps=True, ent_prefix=None)
-
Create an annotator for running a spacy pipeline on documents.
Args
pipeline
- if this is specified, a pre-configured spacy pipeline (default: "en_core_web_sm" pipeline)
outsetname
- the annotation set name where to put the annotations
token_type
- the annotation type for the token annotations
space_token_type
- type of any space token annotations
sentence_type
- the annotation type for the sentence annotations
nounchunk_type
- annotation type for noun chunks
add_tokens
- if token annotations should be added
add_entities
- if true, add entity annotations
add_sentences
- if sentence annotations should be added
add_nounchunks
- if nounchunks should be added
add_deps
- if dependencies should be added
ent_prefix
- the prefix to add to all entity annotation types
Ancestors
- Annotator
- abc.ABC
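A short sketch of a customized annotator (the set name and entity prefix are illustrative choices):

from gatenlp import Document
from gatenlp.lib_spacy import AnnSpacy

# no pipeline is passed, so "en_core_web_sm" is loaded; entity types get
# the prefix, e.g. PERSON becomes "Ent_PERSON" in annotation set "Spacy"
annotator = AnnSpacy(outsetname="Spacy", ent_prefix="Ent_", add_nounchunks=False)
doc = annotator(Document("Angela Merkel met Emmanuel Macron in Paris."))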